# Source code for micromet.format.file_compile

#!/usr/bin/env python3
"""
Compile files by substring into a single directory.

Key logic:
- Group by exact filename (case-sensitive match on the filename itself).
- Within each group, deduplicate items that have the *same* (creation_time, size).
- If >1 unique items remain *and* both creation_time and size differ across them,
  copy all, labeled sequentially: name_1.ext, name_2.ext, ...
- Else (effectively duplicates), copy only one.
"""

from __future__ import annotations
import argparse
import os
import re
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Tuple
import shutil
import sys
import time



# [docs]
@dataclass(frozen=True)
class FileInfo:
    """
    Immutable snapshot of the metadata used to compare candidate files.

    Instances are frozen, so they cannot be modified after construction.

    Attributes
    ----------
    path : Path
        Location of the file on disk.
    size : int
        File size in bytes.
    create_ts : float
        Timestamp treated as the file's creation time. What it actually
        measures depends on the platform and on how the caller filled it in.
    mtime_ts : float
        Last-modification timestamp of the file.
    """

    path: Path
    size: int
    # Stands in for the "creation" time; its real meaning varies by platform
    # (see _get_creation_time).
    create_ts: float
    mtime_ts: float



def _get_creation_time(p: Path) -> float:
    """
    Return the best available "creation" timestamp for a file.

    The stat result's ``st_birthtime`` is preferred when the platform
    exposes it; otherwise ``st_ctime`` is returned as a fallback.

    Parameters
    ----------
    p : Path
        File to inspect.

    Returns
    -------
    float
        ``st_birthtime`` if the stat result has it, else ``st_ctime``.
    """
    stat_result = p.stat()
    try:
        # Present on macOS and some BSDs.
        return stat_result.st_birthtime
    except AttributeError:
        # Fallback: creation time on Windows, metadata-change time on Linux.
        return stat_result.st_ctime


def _gather_files(root: Path, pattern: str, case_sensitive: bool = True) -> List[Path]:
    """
    Collect every file under a directory tree whose name matches a pattern.

    The pattern is compiled as a regular expression and applied with
    ``re.search`` to the bare filename (not the full path), so an ordinary
    substring without regex metacharacters behaves as a "contains" test.
    If the pattern is not a valid regular expression (for example the
    substring ``"notes("``), it is escaped and matched literally instead of
    producing no results.

    Parameters
    ----------
    root : Path
        The directory at which the recursive walk starts.
    pattern : str
        Regular expression (or plain substring) to look for in filenames.
    case_sensitive : bool, default True
        When False, matching ignores letter case.

    Returns
    -------
    List[Path]
        Paths of the matching files, in the order ``os.walk`` yields them.
    """
    flags = 0 if case_sensitive else re.IGNORECASE
    try:
        matcher = re.compile(pattern, flags=flags)
    except re.error:
        # The pattern cannot be a regex, so the only sensible reading is a
        # literal substring; honour that instead of silently finding nothing.
        print(
            f"Invalid regex pattern {pattern!r}; "
            "matching it as a literal substring instead"
        )
        matcher = re.compile(re.escape(pattern), flags=flags)

    matches: List[Path] = []
    for dirpath, _, filenames in os.walk(root):
        base = Path(dirpath)
        matches.extend(base / fn for fn in filenames if matcher.search(fn))
    return matches


def _to_fileinfo(paths: List[Path], use_mtime: bool) -> List[FileInfo]:
    """
    Build a FileInfo record for each path that can still be stat'ed.

    Parameters
    ----------
    paths : List[Path]
        Files to describe.
    use_mtime : bool
        When True, the modification time is stored in ``create_ts`` instead
        of the value returned by ``_get_creation_time``.

    Returns
    -------
    List[FileInfo]
        One record per input path, in input order, omitting any path that
        raised ``FileNotFoundError`` while being inspected.
    """
    infos: List[FileInfo] = []
    for path in paths:
        try:
            stat_result = path.stat()
            if use_mtime:
                created = stat_result.st_mtime
            else:
                created = _get_creation_time(path)
        except FileNotFoundError:
            # The file vanished after it was listed; leave it out.
            continue
        infos.append(
            FileInfo(
                path=path,
                size=stat_result.st_size,
                create_ts=created,
                mtime_ts=stat_result.st_mtime,
            )
        )
    return infos


def _group_by_filename(infos: List[FileInfo]) -> Dict[str, List[FileInfo]]:
    """
    Bucket FileInfo records by the final component of their path.

    Names are compared exactly (case-sensitively), and each bucket keeps
    its records in the order they appeared in the input.

    Parameters
    ----------
    infos : List[FileInfo]
        Records to group.

    Returns
    -------
    Dict[str, List[FileInfo]]
        Mapping from filename to every record whose ``path.name`` equals it.
    """
    groups: Dict[str, List[FileInfo]] = {}
    for info in infos:
        name = info.path.name
        if name in groups:
            groups[name].append(info)
        else:
            groups[name] = [info]
    return groups


def _unique_by_ctime_size(items: List[FileInfo]) -> List[FileInfo]:
    """
    Drop records that repeat an earlier (creation second, size) pair.

    Parameters
    ----------
    items : List[FileInfo]
        Records to deduplicate.

    Returns
    -------
    List[FileInfo]
        The first record seen for each distinct pair, in input order.
    """
    # Dicts keep insertion order, and setdefault never replaces an existing
    # entry, so the values end up as the first occurrence of each key.
    first_seen: Dict[Tuple[int, int], FileInfo] = {}
    for info in items:
        # Timestamps are truncated to whole seconds before comparison.
        first_seen.setdefault((int(info.create_ts), info.size), info)
    return list(first_seen.values())


def _all_differ_in_both_ctime_and_size(items: List[FileInfo]) -> bool:
    """
    Tell whether every item is distinct from every other in time AND size.

    Creation times are compared after truncation to whole seconds. The
    answer is True only when no two items share a creation second and no
    two items share a size.

    Parameters
    ----------
    items : List[FileInfo]
        Records to compare.

    Returns
    -------
    bool
        True if there are at least two items and they are pairwise
        different in both creation time and size; False otherwise
        (including for empty and single-item lists).
    """
    count = len(items)
    if count <= 1:
        return False
    # "No pair collides" is the same as "all values are distinct", which a
    # set-size check answers in linear time instead of comparing every pair.
    distinct_ctimes = {int(fi.create_ts) for fi in items}
    distinct_sizes = {fi.size for fi in items}
    return len(distinct_ctimes) == count and len(distinct_sizes) == count


def _ensure_outdir(p: Path):
    """
    Create a directory, and any missing parents, if it is not already there.

    An existing directory is left untouched.

    Parameters
    ----------
    p : Path
        Directory that must exist once this function returns.
    """
    os.makedirs(p, exist_ok=True)


def _format_time(ts: float) -> str:
    """
    Render a timestamp as a filename-friendly local date-time string.

    Parameters
    ----------
    ts : float
        Timestamp to render; it is converted with ``time.localtime``.

    Returns
    -------
    str
        Local time formatted as 'YYYY-MM-DD_HH-MM-SS'.
    """
    local_struct = time.localtime(ts)
    layout = "%Y-%m-%d_%H-%M-%S"
    return time.strftime(layout, local_struct)



# [docs]
def compile_files(
    root: Path,
    outdir: Path,
    contains: str,
    case_sensitive: bool = False,
    dry_run: bool = False,
    use_mtime: bool = False,
    sequential_zero_pad: int = 1,
) -> None:
    """
    Copy matching files from a directory tree into one folder, resolving name clashes.

    Files under `root` whose names match `contains` are grouped by exact
    filename. Within each group, entries sharing the same
    (creation second, size) pair are collapsed to one. Then:

    - If the remaining entries are pairwise different in BOTH creation time
      and size, all of them are copied, oldest first, as ``name_1.ext``,
      ``name_2.ext``, ...
    - Otherwise a single entry (the one with the smallest creation
      timestamp) is copied under the original filename. If a file of the
      same size is already at that destination the copy is skipped; if the
      destination holds a file of a different size, the copy goes to the
      first free numbered name instead.

    No existing file in `outdir` is ever overwritten: numbered names are
    advanced until an unused one is found. A summary line is printed at the
    end.

    Parameters
    ----------
    root : Path
        The root directory to search for files.
    outdir : Path
        The directory where the compiled files will be saved. It is created
        if needed, except during a dry run.
    contains : str
        Text that filenames must contain. It is handed to ``_gather_files``,
        which searches filenames with it as a regular expression.
    case_sensitive : bool, optional
        If True, the search for `contains` is case-sensitive.
        Defaults to False.
    dry_run : bool, optional
        If True, only print the copies that would be made; nothing is
        created or copied. Defaults to False.
    use_mtime : bool, optional
        If True, use the file's modification time instead of its creation
        time for comparisons. Defaults to False.
    sequential_zero_pad : int, optional
        Minimum number of digits in numbered suffixes (zero-padded).
        Defaults to 1.
    """
    # A dry run should leave the filesystem untouched, so the output
    # directory is only created when files will really be copied.
    if not dry_run:
        _ensure_outdir(outdir)

    paths = _gather_files(root, contains, case_sensitive)
    infos = _to_fileinfo(paths, use_mtime=use_mtime)
    groups = _group_by_filename(infos)

    copied = 0
    skipped_dup = 0
    made_sequential = 0
    # Destinations handed out during this run. In a real run these also exist
    # on disk; in a dry run this is the only record of planned copies.
    claimed: set[Path] = set()

    def is_taken(candidate: Path) -> bool:
        """Return True if `candidate` is already used on disk or in this run."""
        return candidate in claimed or candidate.exists()

    def next_free(stem: str, ext: str, start: int) -> Path:
        """Return the first unused ``stem_<n>ext`` in outdir with n >= start."""
        number = start
        while True:
            label = str(number).zfill(sequential_zero_pad)
            candidate = outdir / f"{stem}_{label}{ext}"
            if not is_taken(candidate):
                return candidate
            number += 1

    def copy_to(src: Path, dst: Path) -> None:
        """Copy `src` to `dst`, or just report it when in dry-run mode."""
        claimed.add(dst)
        if dry_run:
            print(f"[DRY-RUN] COPY {src} -> {dst}")
        else:
            shutil.copy2(src, dst)

    for filename, items in sorted(groups.items()):
        uniques = _unique_by_ctime_size(items)
        stem, ext = Path(filename).stem, Path(filename).suffix

        # Genuinely different versions: keep them all, numbered oldest first.
        if _all_differ_in_both_ctime_and_size(uniques):
            ordered = sorted(uniques, key=lambda fi: fi.create_ts)
            for idx, fi in enumerate(ordered, start=1):
                copy_to(fi.path, next_free(stem, ext, idx))
                made_sequential += 1
            continue

        # Either a single version or versions considered duplicates: keep
        # only the one with the earliest creation timestamp.
        choice = min(uniques, key=lambda fi: fi.create_ts)
        dst = outdir / filename
        if dst.exists() and dst.stat().st_size == choice.size:
            # Same name and size already in place (e.g. from an earlier pass).
            skipped_dup += 1
            continue
        if is_taken(dst):
            # Name is occupied by different content: pick a free numbered
            # name rather than overwriting anything.
            dst = next_free(stem, ext, 1)
        copy_to(choice.path, dst)
        copied += 1

    print(
        f"Done. Copied: {copied}, Sequentially labeled: {made_sequential}, Skipped duplicates: {skipped_dup}"
    )