# Source code for comicgeeks.extract

"""
This module contains functions for extracting and parsing details out
of comic book filenames.

@author: cbanack
"""

import re

# the most recent regular expression that caused regex() to raise an error.
# regex() remembers it so that the same broken pattern is not retried (and
# does not fail again) on every subsequent call.
__failed_regex = None

# ==============================================================================


def regex(filename_s, regex_s):
    """
    Takes the filename of a comic book, and extracts three strings out of it
    using the given regular expression, which must match the filename and create
    regex groups called "series", "num", and "year".  The extracted details
    will be the series name, the issue number, and the issue year.  These three
    details are returned as a triple, i.e. ("batman", "344", "2004").

    The expression is applied with re.match, so it must match starting at the
    beginning of the filename.  A group that is missing from the expression,
    that didn't participate in the match, or that only matched whitespace is
    treated as "not found".

    As long as AT LEAST a series name is found, this function will return the
    triple (missing values will be "").  Otherwise, it returns None.

    None is also returned if applying the expression raises an error (i.e. it
    is not a valid regular expression).  In that case the expression is
    remembered, and is rejected immediately if it is passed in again.
    """
    global __failed_regex

    # this expression already failed once; don't bother trying it again
    if regex_s == __failed_regex:
        return None

    try:
        match = re.match(regex_s, filename_s)
    except Exception:
        # deliberately best-effort: a bad expression means "no result", not a
        # crash.  (unlike a bare 'except:', this still lets KeyboardInterrupt
        # and SystemExit through.)
        __failed_regex = regex_s
        return None

    if not match:
        return None

    # keep only the named groups that captured something other than whitespace
    found = {
        name: value
        for name, value in match.groupdict().items()
        if value and value.strip()
    }
    if "series" not in found:
        return None
    return (found["series"], found.get("num", ""), found.get("year", ""))


# ==============================================================================


def extract(name_s):
    """
    Takes the filename of a comic book, and extracts three strings out of it: the
    series name, the issue number, and the issue year. These three pieces
    of information are returned as a three element list, i.e.
    ["batman", "344", "2004"].

    This function never returns None.  The issue number and year will be "" if
    they couldn't be determined.  The series name is whatever is left of the
    filename once everything else has been stripped out of it, so it can also
    end up as "" if the filename contains nothing usable.

    NOTE(review): nothing in here removes a file extension, so presumably
    callers pass in the filename without one -- confirm against the callers.
    """

    # 1. 's' is the name of our 'working' series name.  we'll slowly strip the
    #    'non-series name' data out of it, til what's left is the series name
    s = name_s

    # 2. but first, see if there's a volume/year in there.
    volume_year_s = __extract_year(s)

    # 3. strip out all bracketed data from the name.  the substitution is
    #    repeated until nothing matches, so that nested brackets are removed
    #    from the inside out.
    def recurse_sub(pattern, s):
        while re.search(pattern, s):
            s = re.sub(pattern, "", s)
        return s

    s = recurse_sub(r"\([^\(]*?\)", s)
    s = recurse_sub(r"\{[^\{]*?\}", s)
    s = recurse_sub(r"\[[^\[]*?\]", s)

    # 4. clean out underscores
    s = re.sub(r"_", " ", s)

    # 5. remove all trace of volume from the name (like "vol. 2a" and "vol -3.1")
    s = re.sub(r"(?i)(\b((v|vol)\.?|volume))\s*-?\s*[0-9]+[.0-9a-z]*", "", s)

    # 6. remove all page counts, ie. "245p" or "50 pages"
    s = re.sub(r"(?i)\b[.,]?\s*\d+\s*(p|pg|pgs|pages)\b[.,]?", "", s)

    # 7. remove anything following a similar pattern to "02 of 02 covers"
    s = re.sub(r"(?i)(\d+\s*of\s*\d+\s*covers)", "", s)

    # 8. if the name has things like "4 of 5", remove the " of 5" part
    #    also, if the name has 3-6, remove the -6 part.  note that we'll
    #    try to handle the word "of" in a few common languages, like french/
    #    spanish (de), italian (di), german (von), dutch (van) or polish (z)
    s = re.sub(r"(?i)(?<=\d)(\s*(of|de|di|von|van|z)\s*#*\d+)", "", s)
    s = re.sub(r"(?<=\d)(-\d+)", "", s)

    # 9. iff this is one of those comic books that replaces all spaces with
    #    dashes, then strip the dashes out.  otherwise leave them in (because
    #    they might be important, like minus signs or something.)
    if "-" in s and " " not in s:
        s = re.sub(r"(?<![-_# ])-", " ", s)

    # 10. get an ordered list of issue number-like strings in the filename
    #    for example:  3, #4, 5a, 6.00, 10.0b, .5, -1.0
    #    also, remove numbers that look like years, EXCEPT on the "2000AD" series
    matches = __extract_numbers(s)

    # 11. if there's multiple numbers in the filename, and it starts with
    #    something like "05. " or "12 - " we assume these files are part of
    #    a reading list, and we strip out that first part.
    pattern = r"^\s*\d+(\.\s+|\s*-\s*(?=\D))"
    if len(matches) > 1 and re.match(pattern, s):
        # 'count' is passed by keyword; passing it positionally is deprecated
        s = re.sub(pattern, "", s, count=1)
        matches = __extract_numbers(s)

    # 12. if we parsed out some potential issue numbers, designate the LAST
    #    (rightmost) one as the actual issue number, and remove it (and
    #    everything after it) from the name
    if matches:
        last = matches[-1]
        # group(1) is the separator in front of the number (space, '#', etc.)
        issue_num_s = last.group().replace(last.group(1), "")
        series_s = s[: last.start(0)]

        # 12a. strip off leading zeroes
        zero_match = re.match("^(0+)([0-9].*)$", issue_num_s)
        if zero_match:
            issue_num_s = zero_match.group(2)

        # 12b. if what's left is a plain number, normalize how it is written
        if re.match(r"^-?[.0-9]+.?\w+$", issue_num_s) and __is_number(issue_num_s):
            try:
                number = float(issue_num_s) if "." in issue_num_s else int(issue_num_s)
            except ValueError:
                # float() accepts things that int() does not (i.e. "1e5");
                # leave those issue numbers exactly as they were written
                pass
            else:
                # str() keeps the issue number a string, as documented
                issue_num_s = str(number)
    else:
        issue_num_s = ""
        series_s = s

    # 13. contract repeating whitespace, and strip bad chars off the ends
    series_s = re.sub(r"\s{2,}", " ", series_s).strip(" ,-_")

    return [series_s, issue_num_s, volume_year_s]


# ==============================================================================
def __extract_year(s):
    """
    Tries to pull an intelligible publication year out of the given string.
    The year is returned as a four digit string, or "" if none was found.

    Two notations are recognized: a "V2003" style tag, and a year (or year
    range) that sits by itself inside one set of brackets, like "(2004)".
    """

    # notation one: years that appear exactly as "V2003".  a popular comicrack
    # script writes dates this way.  it only counts if there is exactly ONE
    # such tag holding a valid year; otherwise we fall through to the brackets.
    # NOTE(review): "[, -_]" contains the range ' ' to '_', so it accepts many
    # more characters than comma, space, dash and underscore -- confirm whether
    # that is intended before tightening it.
    tagged = []
    for hit in re.findall(r"(?i)(^|[, -_])v(\d{4})($|[, -_])", s):
        if __isYear(hit[1]):
            tagged.append(hit[1])
    if len(tagged) == 1:
        return tagged[0]

    # notation two: a year or year range inside brackets,
    # so: [2003], (2004-6), {2000-2010}, etc.

    # collect every substring that is strictly inside only one set of
    # brackets: all the (...) ones first, then [...], then {...}
    chunks = []
    for bracket_pattern in (
        r"\([^[\](){}]*?\)",
        r"\[[^[\](){}]*?\]",
        r"\{[^[\](){}]*?\}",
    ):
        chunks.extend(re.findall(bracket_pattern, s))

    years = []
    for chunk in chunks:
        # drop the outer brackets and any padding inside them
        inner = chunk.strip(r"()[]{}").strip()
        # a year range keeps only its first half: "2006-2009" -> "2006"
        inner = re.sub(r"(\d{4})\s*-\s*\d{1,4}", r"\1", inner)
        if __isYear(inner):
            years.append(inner)

    # when several candidates survive, the last one collected wins
    return years[-1] if years else ""


# ==============================================================================
def __extract_numbers(s):
    """
    Scans the given string left-to-right and returns an ordered list of re
    match objects, one for each "issue number-like" substring, like:
    3, #4, 5a, 6.00, 10.0b, .5, -1.0

    In each match, group(1) is the separator in front of the number (or "" at
    the start of the string) and group(2) is the number itself.  Numbers that
    look like years are left out of the list, with the exceptions noted below.
    """
    candidates = list(re.finditer(r"(?u)(^|[_\s#])(-?\d*\.?\d\w*)(\.?\w*$|)", s))

    # on the "2000AD" and "The Beano" series, a number that looks like a year
    # is a real issue number, so nothing gets filtered out.
    # NOTE(review): inside these character classes ".-_" is a range running
    # from '.' to '_' (it includes digits and letters) -- confirm it is intended.
    if re.match(r"(?i)\s*2000[\s\.-_]*a[\s.-_]*d.*", s):
        return candidates
    if re.match(r"(?i)\s*the[\s\.-_]+beano[\s.-_]+#?\d{4}", s):
        return candidates

    kept = []
    for candidate in candidates:
        number_start = candidate.start(2)
        # a year that is written with a leading '#' (i.e. #1950) is also kept
        has_hash = number_start > 0 and s[number_start - 1] == "#"
        if has_hash or not __isYear(candidate.group(2)):
            kept.append(candidate)
    return kept


# ==============================================================================
def __isYear(d):
    """Returns a true value iff the given string appears to be a valid 4 digit
    year, i.e. exactly four digits with a value from 1901 to 2099."""
    # anything that isn't exactly four digits is rejected (as None, not False)
    if not re.match(r"^\d{4}$", d):
        return None
    year = int(d)
    return 1900 < year < 2100


# ==============================================================================
def __sstr(object):
    """safestr: safely converts the given object into a string.  None becomes
    "<None>", anything that __is_string() accepts is handed back untouched, and
    everything else goes through str()."""
    if object is not None:
        # strings are returned as-is instead of being passed to str(), because
        # str() breaks on some strings that have unicode characters, due to a
        # python bug.  (all strings in python are unicode.)
        return object if __is_string(object) else str(object)
    return "<None>"


# ==============================================================================
def __is_number(s):
    """returns a boolean indicating whether the given object is a number, or
    a string that can be converted to a number (that is, whether float(s)
    succeeds)."""
    try:
        float(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError:     not a number or a string at all (i.e. None)
        # ValueError:    a string that doesn't parse as a number
        # OverflowError: an int that is too large to be a float
        return False
    return True


# ==============================================================================
def __is_string(object):
    """returns a boolean indicating whether the given object is a string"""

    # only real str instances count; None, numbers and any other kind of
    # object are not strings.
    return isinstance(object, str)