Source code for clique

# :coding: utf-8
# :copyright: Copyright (c) 2013 Martin Pengelly-Phillips
# :license: See LICENSE.txt.

import re
from collections import defaultdict

from ._version import __version__
from .collection import Collection
from .error import CollectionError


#: Pattern for matching an index with optional padding.
DIGITS_PATTERN = r'(?P<index>(?P<padding>0*)\d+)'

#: Common patterns that can be passed to :py:func:`~clique.assemble`.
PATTERNS = {
    'frames': r'\.{0}\.\D+\d?$'.format(DIGITS_PATTERN),
    'versions': r'v{0}'.format(DIGITS_PATTERN)
}


[docs]def assemble(
    iterable, patterns=None, minimum_items=2, case_sensitive=True,
    assume_padded_when_ambiguous=False
):
    '''Assemble items in *iterable* into discreet collections.

    *patterns* may be specified as a list of regular expressions to limit
    the returned collection possibilities. Use this when interested in
    collections that only match specific patterns. Each pattern must contain
    the expression from :py:data:`DIGITS_PATTERN` exactly once.

    A selection of common expressions are available in :py:data:`PATTERNS`.

    .. note::

        If a pattern is supplied as a string it will be automatically compiled
        to a :py:class:`re.RegexObject` instance for convenience.

    When *patterns* is not specified, collections are formed by examining all
    possible groupings of the items in *iterable* based around common numerical
    components.

    *minimum_items* dictates the minimum number of items a collection must have
    in order to be included in the result. The default is 2, filtering out
    single item collections.

    If *case_sensitive* is False, then items will be treated as part of the same
    collection when they only differ in casing. To avoid ambiguity, the
    resulting collection will always be lowercase. For example, "item.0001.dpx"
    and "Item.0002.dpx" would be part of the same collection, "item.%04d.dpx".

    .. note::

        Any compiled *patterns* will also respect the set case sensitivity.

    For certain collections it may be ambiguous whether they are padded or not.
    For example, 1000-1010 can be considered either an unpadded collection or a
    four padded collection. By default, Clique is conservative and assumes that
    the collection is unpadded. To change this behaviour, set
    *assume_padded_when_ambiguous* to True and any ambiguous collection will
    have a relevant padding set.

    .. note::

        *assume_padded_when_ambiguous* has no effect on collections that are
        unambiguous. For example, 1-100 will always be considered unpadded
        regardless of the *assume_padded_when_ambiguous* setting.

    Return tuple of two lists (collections, remainder) where 'collections' is a
    list of assembled :py:class:`~clique.collection.Collection` instances and
    'remainder' is a list of items that did not belong to any collection.

    '''
    collection_map = defaultdict(set)
    collections = []
    remainder = []

    # Compile patterns.
    flags = 0
    if not case_sensitive:
        flags |= re.IGNORECASE

    compiled_patterns = []

    if patterns is not None:
        if not patterns:
            return collections, list(iterable)

        for pattern in patterns:
            if isinstance(pattern, str):
                compiled_patterns.append(re.compile(pattern, flags=flags))
            else:
                compiled_patterns.append(pattern)

    else:
        compiled_patterns.append(re.compile(DIGITS_PATTERN, flags=flags))

    # Process iterable.
    for item in iterable:
        matched = False

        for pattern in compiled_patterns:
            for match in pattern.finditer(item):
                index = match.group('index')

                head = item[:match.start('index')]
                tail = item[match.end('index'):]

                if not case_sensitive:
                    head = head.lower()
                    tail = tail.lower()

                padding = match.group('padding')
                if padding:
                    padding = len(index)
                else:
                    padding = 0

                key = (head, tail, padding)
                collection_map[key].add(int(index))
                matched = True

        if not matched:
            remainder.append(item)

    # Form collections.
    merge_candidates = []
    for (head, tail, padding), indexes in collection_map.items():
        collection = Collection(head, tail, padding, indexes)
        collections.append(collection)

        if collection.padding == 0:
            merge_candidates.append(collection)

    # Merge together collections that align on padding boundaries. For example,
    # 0998-0999 and 1000-1001 can be merged into 0998-1001. Note that only
    # indexes within the padding width limit are merged. If a collection is
    # entirely merged into another then it will not be included as a separate
    # collection in the results.
    fully_merged = []
    for collection in collections:
        if collection.padding == 0:
            continue

        for candidate in merge_candidates:
            if (
                candidate.head == collection.head and
                candidate.tail == collection.tail
            ):
                merged_index_count = 0
                for index in candidate.indexes:
                    if len(str(abs(index))) == collection.padding:
                        collection.indexes.add(index)
                        merged_index_count += 1

                if merged_index_count == len(candidate.indexes):
                    fully_merged.append(candidate)

    # Filter out fully merged collections.
    collections = [collection for collection in collections
                   if collection not in fully_merged]

    # Filter out collections that do not have at least as many indexes as
    # minimum_items. In addition, add any members of a filtered collection,
    # which are not members of an unfiltered collection, to the remainder.
    filtered = []
    remainder_candidates = []
    for collection in collections:
        if len(collection.indexes) >= minimum_items:
            filtered.append(collection)
        else:
            for member in collection:
                remainder_candidates.append(member)

    for candidate in remainder_candidates:
        # Check if candidate has already been added to remainder to avoid
        # duplicate entries.
        if candidate in remainder:
            continue

        has_membership = False

        for collection in filtered:
            if candidate in collection:
                has_membership = True
                break

        if not has_membership:
            remainder.append(candidate)

    # Set padding for all ambiguous collections according to the
    # assume_padded_when_ambiguous setting.
    if assume_padded_when_ambiguous:
        for collection in filtered:
            if (
                not collection.padding and collection.indexes
            ):
                indexes = list(collection.indexes)
                first_index_width = len(str(indexes[0]))
                last_index_width = len(str(indexes[-1]))
                if first_index_width == last_index_width:
                    collection.padding = first_index_width

    return filtered, remainder


[docs]def parse(value, pattern='{head}{padding}{tail} [{ranges}]'):
    '''Parse *value* into a :py:class:`~clique.collection.Collection`.

    Use *pattern* to extract information from *value*. It may make use of the
    following keys:

        * *head* - Common leading part of the collection.
        * *tail* - Common trailing part of the collection.
        * *padding* - Padding value in ``%0d`` format.
        * *range* - Total range in the form ``start-end``.
        * *ranges* - Comma separated ranges of indexes.
        * *holes* - Comma separated ranges of missing indexes.

    .. note::

        *holes* only makes sense if *range* or *ranges* is also present.

    '''
    # Construct regular expression for given pattern.
    expressions = {
        'head': r'(?P<head>.*)',
        'tail': r'(?P<tail>.*)',
        'padding': r'%(?P<padding>\d*)d',
        'range': r'(?P<range>\d+-\d+)?',
        'ranges': r'(?P<ranges>[\d ,\-]+)?',
        'holes': r'(?P<holes>[\d ,\-]+)'
    }

    pattern_regex = re.escape(pattern)
    for key, expression in expressions.items():
        pattern_regex = pattern_regex.replace(
            r'\{{{0}\}}'.format(key),
            expression
        )
    pattern_regex = r'^{0}$'.format(pattern_regex)

    # Match pattern against value and use results to construct collection.
    match = re.search(pattern_regex, value)
    if match is None:
        raise ValueError('Value did not match pattern.')

    groups = match.groupdict()
    if 'padding' in groups and groups['padding']:
        groups['padding'] = int(groups['padding'])
    else:
        groups['padding'] = 0

    # Create collection and then add indexes.
    collection = Collection(
        groups.get('head', ''),
        groups.get('tail', ''),
        groups['padding']
    )

    if groups.get('range', None) is not None:
        start, end = [int(part) for part in groups['range'].split('-')]
        collection.indexes.update(range(start, end + 1))

    if groups.get('ranges', None) is not None:
        parts = [part.strip() for part in groups['ranges'].split(',')]
        for part in parts:
            index_range = [int(range_part) for range_part in part.split('-', 2)]

            if len(index_range) > 1:
                # Index range.
                for index in range(index_range[0], index_range[1] + 1):
                    collection.indexes.add(index)
            else:
                # Single index.
                collection.indexes.add(index_range[0])

    if 'holes' in groups:
        parts = [part.strip() for part in groups['holes'].split(',')]
        for part in parts:
            index_range = [int(hole_part) for hole_part in part.split('-', 2)]

            if len(index_range) > 1:
                # Index range.
                for index in range(index_range[0], index_range[1] + 1):
                    collection.indexes.remove(index)
            else:
                # Single index.
                collection.indexes.remove(index_range[0])

    return collection