Source code for clique

# :coding: utf-8
# :copyright: Copyright (c) 2013 Martin Pengelly-Phillips
# :license: See LICENSE.txt.

import re
from collections import defaultdict

from ._version import __version__
from .collection import Collection
from .error import CollectionError


#: Pattern for matching an index with optional padding. The named group
#: ``index`` captures the whole run of digits and the nested ``padding`` group
#: captures any leading zeros within it.
DIGITS_PATTERN = r'(?P<index>(?P<padding>0*)\d+)'

# Compiled form of DIGITS_PATTERN; the default pattern used by assemble when
# no explicit patterns are supplied.
_DIGITS_REGEX = re.compile(DIGITS_PATTERN)

#: Common patterns that can be passed to :py:func:`~clique.assemble`.
#:
#: * ``frames`` - index enclosed in dots and followed by non-digit characters
#:   with at most one trailing digit, anchored at the end of the item
#:   (for example ``.0001.exr`` or ``.0001.mp4``).
#: * ``versions`` - index immediately preceded by a ``v``.
PATTERNS = {
    'frames': r'\.{0}\.\D+\d?$'.format(DIGITS_PATTERN),
    'versions': 'v{0}'.format(DIGITS_PATTERN)
}


def assemble(iterable, patterns=None, minimum_items=2):
    '''Assemble items in *iterable* into discrete collections.

    Each item in *iterable* must be a string.

    *patterns* may be specified as a list of regular expressions to limit
    the returned collection possibilities. Use this when interested in
    collections that only match specific patterns. Each pattern must contain
    the expression from :py:data:`DIGITS_PATTERN` exactly once.

    A selection of common expressions are available in :py:data:`PATTERNS`.

    .. note::

        If a pattern is supplied as a string it will be automatically compiled
        to a compiled regular expression object for convenience. Any object
        that already provides a ``finditer`` method is used as is.

    When *patterns* is not specified, collections are formed by examining all
    possible groupings of the items in *iterable* based around common numerical
    components. When *patterns* is an empty list nothing can match, so no
    collections are formed and every item is returned in the remainder.

    *minimum_items* dictates the minimum number of items a collection must have
    in order to be included in the result. The default is 2, filtering out
    single item collections.

    Return tuple of two lists (collections, remainder) where 'collections' is a
    list of assembled :py:class:`~clique.collection.Collection` instances and
    'remainder' is a list of items that did not belong to any collection.

    '''
    collection_map = defaultdict(set)
    collections = []
    remainder = []

    # Compile patterns.
    compiled_patterns = []

    if patterns is not None:
        if not patterns:
            return collections, list(iterable)

        for pattern in patterns:
            # Detect already compiled patterns by capability rather than
            # testing against a string type, so that the check behaves the
            # same on Python 2 (str / unicode) and Python 3 (str).
            if hasattr(pattern, 'finditer'):
                compiled_patterns.append(pattern)
            else:
                compiled_patterns.append(re.compile(pattern))

    else:
        compiled_patterns.append(_DIGITS_REGEX)

    # Process iterable.
    for item in iterable:
        matched = False

        for pattern in compiled_patterns:
            for match in pattern.finditer(item):
                index = match.group('index')

                head = item[:match.start('index')]
                tail = item[match.end('index'):]

                # An index with leading zeros is treated as padded to its own
                # width; otherwise it is considered unpadded (0).
                padding = match.group('padding')
                if padding:
                    padding = len(index)
                else:
                    padding = 0

                # Every numeric component of an item contributes to its own
                # candidate grouping, so one item may appear under several
                # keys.
                key = (head, tail, padding)
                collection_map[key].add(int(index))
                matched = True

        if not matched:
            remainder.append(item)

    # Form collections.
    merge_candidates = []
    for (head, tail, padding), indexes in collection_map.items():
        collection = Collection(head, tail, padding, indexes)
        collections.append(collection)

        # Only unpadded collections can be folded into padded ones.
        if collection.padding == 0:
            merge_candidates.append(collection)

    # Merge together collections that align on padding boundaries. For example,
    # 0998-0999 and 1000-1001 can be merged into 0998-1001. Note that only
    # indexes within the padding width limit are merged. If a collection is
    # entirely merged into another then it will not be included as a separate
    # collection in the results.
    fully_merged = []
    for collection in collections:
        if collection.padding == 0:
            continue

        for candidate in merge_candidates:
            if (candidate.head == collection.head and
                    candidate.tail == collection.tail):

                merged_index_count = 0
                for index in candidate.indexes:
                    # An unpadded index fits only when its digit count equals
                    # the padded collection's width.
                    if len(str(abs(index))) == collection.padding:
                        collection.indexes.add(index)
                        merged_index_count += 1

                if merged_index_count == len(candidate.indexes):
                    fully_merged.append(candidate)

    # Filter out fully merged collections.
    collections = [collection for collection in collections
                   if collection not in fully_merged]

    # Filter out collections that do not have at least as many indexes as
    # minimum_items. In addition, add any members of a filtered collection,
    # which are not members of an unfiltered collection, to the remainder.
    filtered = []
    remainder_candidates = []
    for collection in collections:
        if len(collection.indexes) >= minimum_items:
            filtered.append(collection)
        else:
            for member in collection:
                remainder_candidates.append(member)

    for candidate in remainder_candidates:
        # Check if candidate has already been added to remainder to avoid
        # duplicate entries.
        if candidate in remainder:
            continue

        has_membership = False

        for collection in filtered:
            if candidate in collection:
                has_membership = True
                break

        if not has_membership:
            remainder.append(candidate)

    return filtered, remainder


def _expand_index_ranges(ranges):
    '''Yield each index described by the comma separated *ranges* string.

    Every comma separated part is either a single index (``5``) or an
    inclusive range in the form ``start-end`` (``1-10``). Whitespace around
    parts is ignored.

    Raise :py:exc:`ValueError` if a part cannot be converted to integers.

    '''
    for part in ranges.split(','):
        # Materialise as a list so that it can be measured and indexed; on
        # Python 3 ``map`` would return an iterator supporting neither.
        bounds = [int(bound) for bound in part.strip().split('-', 2)]

        if len(bounds) > 1:
            # Index range (end inclusive).
            for index in range(bounds[0], bounds[1] + 1):
                yield index
        else:
            # Single index.
            yield bounds[0]


def parse(value, pattern='{head}{padding}{tail} [{ranges}]'):
    '''Parse *value* into a :py:class:`~clique.collection.Collection`.

    Use *pattern* to extract information from *value*. It may make use of the
    following keys:

        * *head* - Common leading part of the collection.
        * *tail* - Common trailing part of the collection.
        * *padding* - Padding value in ``%0d`` format.
        * *range* - Total range in the form ``start-end``
        * *ranges* - Comma separated ranges of indexes.
        * *holes* - Comma separated ranges of missing indexes.

    All other text in *pattern* is matched literally and the whole of *value*
    must match.

    .. note::

        *holes* only makes sense if *range* or *ranges* is also present.

    Raise :py:exc:`ValueError` if *value* does not match *pattern*.

    '''
    # Construct regular expression for given pattern.
    expressions = {
        'head': r'(?P<head>.*)',
        'tail': r'(?P<tail>.*)',
        'padding': r'%(?P<padding>\d*)d',
        'range': r'(?P<range>\d+-\d+)',
        'ranges': r'(?P<ranges>[\d ,\-]+)',
        'holes': r'(?P<holes>[\d ,\-]+)'
    }

    # Escape the pattern so its literal text is matched verbatim, then swap
    # each (now escaped) ``\{key\}`` placeholder for its capturing expression.
    pattern_regex = re.escape(pattern)
    for key, expression in expressions.items():
        pattern_regex = pattern_regex.replace(
            r'\{{{0}\}}'.format(key),
            expression
        )
    pattern_regex = '^{0}$'.format(pattern_regex)

    # Match pattern against value and use results to construct collection.
    match = re.search(pattern_regex, value)
    if match is None:
        raise ValueError('Value did not match pattern.')

    # Only keys that were present in *pattern* appear in groups.
    groups = match.groupdict()

    # A bare ``%d`` captures an empty width, which means no padding.
    if groups.get('padding'):
        groups['padding'] = int(groups['padding'])
    else:
        groups['padding'] = 0

    # Create collection and then add indexes.
    collection = Collection(
        groups.get('head', ''),
        groups.get('tail', ''),
        groups['padding']
    )

    if 'range' in groups:
        start, end = map(int, groups['range'].split('-'))
        collection.indexes.update(range(start, end + 1))

    if 'ranges' in groups:
        for index in _expand_index_ranges(groups['ranges']):
            collection.indexes.add(index)

    # Holes are removed last so that they punch gaps in the indexes added
    # from *range* / *ranges* above.
    if 'holes' in groups:
        for index in _expand_index_ranges(groups['holes']):
            collection.indexes.remove(index)

    return collection