Source code for comicgeeks.extract

"""
This module contains functions for extracting and parsing details out
of comic book filenames.

@author: cbanack
"""

import re

# The most recent regular expression that raised an error inside regex().
# Remembered so the same broken pattern is not retried on every call.
__failed_regex = None

# ==============================================================================


def regex(filename_s, regex_s):
    """
    Extract series name, issue number and year from a comic book filename
    using a caller-supplied regular expression.

    The expression is applied with re.match() (i.e. anchored at the start of
    the filename) and is expected to define named groups "series", "num" and
    "year".  A group only counts as found if it matched non-blank text.

    Returns a tuple (series, num, year), e.g. ("batman", "344", "2004"), as
    long as AT LEAST the series was found; a missing issue number or year is
    returned as "".  Returns None if the expression does not match, finds no
    series, raises an error, or is the same expression that raised an error
    on an earlier call.
    """
    global __failed_regex

    # don't bother retrying an expression that is already known to be broken
    if regex_s == __failed_regex:
        return None

    try:
        match = re.match(regex_s, filename_s)
        if not match:
            return None

        # keep only the named groups that captured something non-blank
        found = {
            name: text
            for name, text in match.groupdict().items()
            if text and text.strip()
        }
        if "series" not in found:
            return None

        return (
            match.group("series"),
            match.group("num") if "num" in found else "",
            match.group("year") if "year" in found else "",
        )
    except Exception:
        # best effort: a bad pattern (or bad argument type) must not crash the
        # caller.  'Exception' rather than a bare 'except' so that
        # KeyboardInterrupt and SystemExit still propagate.
        __failed_regex = regex_s
        return None
# ==============================================================================
def extract(name_s):
    """
    Takes the filename of a comic book, and extracts three strings out of it:
    the series name, the issue number, and the issue year.

    The three values are returned as a three element list, i.e.
    ["batman", "344", "2004"].  This function never returns None.  The issue
    number and year are "" if they couldn't be determined, and the series
    name can also be "" if nothing is left of the filename once all the
    non-series details have been stripped out of it.
    """

    # 1. 's' is the name of our 'working' series name. we'll slowly strip the
    #    'non-series name' data out of it, til what's left is the series name
    s = name_s

    # 2. but first, see if there's a volume/year in there.
    volume_year_s = __extract_year(s)

    # 3. strip out all bracketed data from the name.  repeated until nothing
    #    matches, so that nested brackets are removed from the inside out.
    def recurse_sub(pattern, s):
        while re.search(pattern, s):
            s = re.sub(pattern, "", s)
        return s

    s = recurse_sub(r"\([^\(]*?\)", s)
    s = recurse_sub(r"\{[^\{]*?\}", s)
    s = recurse_sub(r"\[[^\[]*?\]", s)

    # 4. clean out underscores
    s = re.sub(r"_", " ", s)

    # 5. remove all trace of volume from the name (like "vol. 2a" and "vol -3.1")
    s = re.sub(r"(?i)(\b((v|vol)\.?|volume))\s*-?\s*[0-9]+[.0-9a-z]*", "", s)

    # 6. remove all page counts, ie. "245p" or "50 pages"
    s = re.sub(r"(?i)\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b[.,]?", "", s)

    # 7. remove anything following a similar pattern to "02 of 02 covers"
    s = re.sub(r"(?i)(\d+\s*of\s*\d+\s*covers)", "", s)

    # 8. if the name has things like "4 of 5", remove the " of 5" part
    #    also, if the name has 3-6, remove the -6 part.  note that we'll
    #    try to handle the word "of" in a few common languages, like french/
    #    spanish (de), italian (di), german (von), dutch (van) or polish (z)
    s = re.sub(r"(?i)(?<=\d)(\s*(of|de|di|von|van|z)\s*#*\d+)", "", s)
    s = re.sub(r"(?<=\d)(-\d+)", "", s)

    # 9. iff this is one of those comic books that replaces all spaces with
    #    dashes, then strip the dashes out.  otherwise leave them in (because
    #    they might be important, like minus signs or something.)
    if "-" in s and " " not in s:
        s = re.sub(r"(?<![-_# ])-", " ", s)

    # 10. get an ordered list of issue number-like strings in the filename
    #     for example:  3, #4, 5a, 6.00, 10.0b, .5, -1.0
    #     (numbers that look like years are filtered out by the helper)
    matches = __extract_numbers(s)

    # 11. if there's multiple numbers in the filename, and it starts with
    #     something like "05. " or "12 - " we assume these files are part of
    #     a reading list, and we strip out that first part.
    pattern = r"^\s*\d+(\.\s+|\s*-\s*(?=\D))"
    if len(matches) > 1 and re.match(pattern, s):
        # 'count' is passed by keyword; passing it positionally is deprecated
        s = re.sub(pattern, "", s, count=1)
        matches = __extract_numbers(s)

    # 12. if we parsed out some potential issue numbers, designate the LAST
    #     (rightmost) one as the actual issue number, and remove it (and
    #     everything after it) from the name
    if matches:
        last = matches[-1]

        # drop the leading delimiter (group 1) from the matched text
        issue_num_s = last.group().replace(last.group(1), "")
        series_s = s[: last.start(0)]

        # 12a. strip off leading zeroes ("007" -> "7", but "0" stays "0")
        zero_match = re.match("^(0+)([0-9].*)$", issue_num_s)
        if zero_match:
            issue_num_s = zero_match.group(2)

        # 12b. normalize purely numeric issue numbers ("4.00" -> "4.0").
        #      anything that can't be converted (like "5a", or "1e5", which
        #      float() accepts but int() does not) is left exactly as found,
        #      and the result is always a string.
        if re.match(r"^-?[.0-9]+.?\w+$", issue_num_s):
            try:
                number = (
                    float(issue_num_s) if "." in issue_num_s else int(issue_num_s)
                )
            except ValueError:
                pass
            else:
                issue_num_s = str(number)
    else:
        issue_num_s = ""
        series_s = s

    # 13. contract repeating whitespace, and strip bad chars off the ends
    series_s = re.sub(r"\s{2,}", " ", series_s).strip(" ,-_")

    return [series_s, issue_num_s, volume_year_s]
# ==============================================================================
def __extract_year(s):
    """
    Searches through the given string, seeing if an intelligible publication
    year can be extracted.  If it can, it will be returned as a four digit
    string, otherwise "" will be returned.

    Two forms are recognized: a single "V2003" style token, or a year (or
    year range, whose first year is used) that is the entire contents of a
    set of brackets, i.e. [2003], (2004-6), {2000-2010}.
    """

    # type one years appear exactly as "V2003".  there's a popular comicrack
    # script that creates dates that look like this, so parse em if we can
    # NOTE(review): inside the character class, " -_" is a range (space
    # through underscore), so more delimiters than comma/space/dash/underscore
    # are accepted -- confirm what is intended before tightening it.
    v_years = [
        x[1]
        for x in re.findall(r"(?i)(^|[, -_])v(\d{4})($|[, -_])", s)
        if __isYear(x[1])
    ]
    if len(v_years) == 1:
        return v_years[0]

    # roughly, we're looking for a year or year range inside brackets
    # so:  [2003], (2004-6), {2000-2010}, etc.

    # 1. get every substring that is strictly inside only one set of brackets
    #    (all round ones first, then all square ones, then all curly ones)
    candidates = re.findall(r"\([^[\](){}]*?\)", s)
    candidates += re.findall(r"\[[^[\](){}]*?\]", s)
    candidates += re.findall(r"\{[^[\](){}]*?\}", s)

    # 2. strip off the outer brackets and spaces
    candidates = [x.strip(r"()[]{}").strip() for x in candidates]

    # 3. if there is a year range, strip off the second half
    #    "2006-2009" -> "2006"
    candidates = [
        re.sub(r"(\d{4})\s*-\s*\d{1,4}", r"\1", x) for x in candidates
    ]

    # 4. only keep strings that are valid 4 digit years, and use the last one
    #    (last in the order built in step 1, not necessarily the rightmost)
    years = [x for x in candidates if __isYear(x)]
    return years[-1] if years else ""


# ==============================================================================
def __extract_numbers(s):
    """
    Searches through the given string left-to-right, building an ordered list
    of "issue number-like" re match objects.  For example, this method
    matches substrings like:  3, #4, 5a, 6.00, 10.0b, .5, -1.0

    In each match, group 1 is the delimiter before the number (start of
    string, underscore, whitespace or '#') and group 2 is the number itself.
    """
    matches = list(re.finditer(r"(?u)(^|[_\s#])(-?\d*\.?\d\w*)(\.?\w*$|)", s))

    # remove matches that look like years, EXCEPT on the "2000AD" series,
    # the "The Beano" series, and any year that starts with '#' (i.e. #1950)
    # NOTE(review): ".-_" inside these character classes is a range ('.'
    # through '_'), not the three literal characters -- confirm intent
    # before changing, since some names may only match because of it.
    is2000AD = re.match(r"(?i)\s*2000[\s\.-_]*a[\s.-_]*d.*", s)
    isBeano = re.match(r"(?i)\s*the[\s\.-_]+beano[\s.-_]+#?\d{4}", s)
    if not is2000AD and not isBeano:
        matches = [
            x
            for x in matches
            if not __isYear(x.group(2))
            or (x.start(2) > 0 and s[x.start(2) - 1] == "#")
        ]
    return matches


# ==============================================================================
def __isYear(d):
    """
    Returns True iff the given string appears to be a valid 4 digit year,
    i.e. exactly four digits with a value strictly between 1900 and 2100.
    """
    return bool(re.match(r"^\d{4}$", d)) and 1900 < int(d) < 2100


# ==============================================================================
def __sstr(object):
    """
    Safely converts the given object into a string (sstr = safestr).

    None becomes "<None>", strings are returned unchanged, and everything
    else is converted with str().
    """
    if object is None:
        return "<None>"
    if __is_string(object):
        return object
    return str(object)


# ==============================================================================
def __is_number(s):
    """
    Returns a boolean indicating whether the given object is a number, or a
    string that can be converted to a number, according to float().
    """
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True


# ==============================================================================
def __is_string(object):
    """Returns a boolean indicating whether the given object is a string."""
    # this used to answer True for every non-None object, which meant that
    # __sstr() handed numbers back unconverted instead of stringifying them
    return isinstance(object, str)