API Reference¶

This page uses mkdocstrings to generate an API reference from the src/filoma package.

filoma: filesystem profiling and directory analysis.

A modular Python tool for profiling files, analyzing directory structures, and inspecting image data.

This module exposes a small, ergonomic public surface while importing heavy optional dependencies (Polars, Pillow, the Rust extension, etc.) lazily. Accessing convenience classes such as `DataFrame`, or subpackages such as `filoma.directories`, imports the underlying modules on demand.

`__getattr__(name)` ¶

Lazy import and attribute resolution for top-level names.

Implements PEP 562: import submodules or attributes on demand.

Source code in filoma/__init__.py

def __getattr__(name: str):
    """Resolve a top-level package attribute on first access (PEP 562).

    Known names are imported lazily, stored in the module globals so later
    lookups bypass this hook, and returned. Unknown names raise
    ``AttributeError``.
    """
    # ``Dataset`` is resolved through a package-relative import.
    if name == "Dataset":
        from .dataset import Dataset

        globals()["Dataset"] = Dataset
        return Dataset

    # Values are either a dotted module path, or "module:attr" when a single
    # attribute of that module should be exposed.
    lazy_targets = {
        # top-level subpackages
        "core": "filoma.core",
        "directories": "filoma.directories",
        "files": "filoma.files",
        "images": "filoma.images",
        "filaraki": "filoma.filaraki",
        # common classes placed in submodules (module:attr)
        "DataFrame": "filoma.dataframe:DataFrame",
        "DirectoryProfiler": "filoma.directories.directory_profiler:DirectoryProfiler",
        "FileProfiler": "filoma.files.file_profiler:FileProfiler",
        "ImageProfiler": "filoma.images.image_profiler:ImageProfiler",
    }

    target = lazy_targets.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_name, separator, attr = target.partition(":")
    resolved = importlib.import_module(module_name)
    if separator:
        # "module:attr" form: expose the attribute rather than the module.
        resolved = getattr(resolved, attr)

    # Cache the result so this hook is not consulted again for ``name``.
    globals()[name] = resolved
    return resolved

`probe(path, **kwargs)` ¶

Quick helper: probe a directory path and return a DirectoryAnalysis.

This wrapper accepts probe-specific keyword arguments such as `max_depth` and `threads` and forwards them to `DirectoryProfiler.probe`. All other keyword arguments are used to configure the `DirectoryProfiler` constructor.

Source code in filoma/__init__.py

def probe(path: str, **kwargs: Any) -> Any:
    """Quick helper: probe a path and return the profiler's result.

    If ``path`` points to an existing regular file the call is dispatched to
    :class:`FileProfiler.probe` with the remaining keyword arguments (for
    example ``compute_hash``). Otherwise the path is probed as a directory:
    ``max_depth`` and ``threads`` are forwarded to
    :class:`DirectoryProfiler.probe` and every other keyword argument is used
    to build the :class:`DirectoryProfilerConfig`.

    Args:
        path: File or directory path to probe.
        **kwargs: ``max_depth`` / ``threads`` (directory probing only) plus
            profiler-specific options.

    Returns:
        Whatever the selected profiler's ``probe`` returns.

    """
    from pathlib import Path

    # Extract probe-only parameters so they are not passed to the
    # DirectoryProfilerConfig constructor (which doesn't accept them).
    max_depth = kwargs.pop("max_depth", None)
    threads = kwargs.pop("threads", None)

    # Only the file-vs-directory check is best-effort. Errors raised by the
    # file profiler itself must propagate: swallowing them and re-probing the
    # file as a directory would hide the real failure behind an unrelated one.
    try:
        is_file = Path(path).is_file()
    except (TypeError, ValueError, OSError):
        # Unusable path: fall back to directory probing and let the
        # underlying profiler raise the appropriate error.
        is_file = False

    if is_file:
        from .files.file_profiler import FileProfiler

        return FileProfiler().probe(path, **kwargs)

    # Local import to ensure the class is available without forcing it at
    # module import time.
    from .directories import DirectoryProfiler, DirectoryProfilerConfig

    # Build a typed config from remaining kwargs and instantiate the profiler
    config = DirectoryProfilerConfig(**kwargs)
    profiler = DirectoryProfiler(config)
    return profiler.probe(path, max_depth=max_depth, threads=threads)

`probe_file(path, **kwargs)` ¶

Quick helper: probe a single file and return a Filo dataclass.

Source code in filoma/__init__.py

def probe_file(path: str, **kwargs: Any) -> Any:
    """Probe a single file with a fresh ``FileProfiler`` and return its result.

    All keyword arguments are passed through to ``FileProfiler.probe``.
    """
    # Imported lazily so the profiler is only loaded when actually needed.
    from .files.file_profiler import FileProfiler

    file_profiler = FileProfiler()
    return file_profiler.probe(path, **kwargs)

`probe_image(arg, **kwargs)` ¶

Analyze an image.

If `arg` is a numpy array, `ImageProfiler.probe` is used; if it is path-like, the function tries an image-format-specific profiler first and otherwise loads the file into a numpy array and analyzes that.

This wrapper favors simplicity for interactive use; for advanced control instantiate profilers directly.

Source code in filoma/__init__.py

def probe_image(arg: Any, **kwargs: Any) -> Any:
    """Analyze an image.

    If ``arg`` is a numpy array, :class:`ImageProfiler.probe` is used; if
    it's path-like, a format-specific profiler is tried first (``.png``,
    ``.npy``, ``.tif``/``.tiff``, ``.zarr``) and otherwise the file is loaded
    through Pillow into a numpy array and analyzed.

    This wrapper favors simplicity for interactive use; for advanced
    control instantiate profilers directly.

    Args:
        arg: A numpy array or a path-like object pointing to an image.
        **kwargs: Accepted for call-site convenience; currently not forwarded
            to any profiler.

    Returns:
        The profiler's result, or an ``ImageReport`` whose status is
        ``"failed_to_load_or_unsupported_format"`` when nothing could load
        the input.

    """
    # Local imports; keep them inside the function to avoid heavy deps at
    # module import time.
    from pathlib import Path

    try:
        import numpy as _np
    except Exception:
        _np = None

    # If it's a numpy array, use ImageProfiler directly
    if _np is not None and hasattr(_np, "ndarray") and isinstance(arg, _np.ndarray):
        from .images.image_profiler import ImageProfiler

        return ImageProfiler().probe(arg)

    # Treat as path-like
    p = Path(arg)
    suffix = p.suffix.lower() if p.suffix else ""

    try:
        # Use images package specializers when available
        from .images import NpyProfiler, PngProfiler, TifProfiler, ZarrProfiler

        if suffix == ".png":
            return PngProfiler().probe(p)
        if suffix == ".npy":
            return NpyProfiler().probe(p)
        if suffix in (".tif", ".tiff"):
            return TifProfiler().probe(p)
        if suffix == ".zarr":
            return ZarrProfiler().probe(p)
    except Exception:
        # If specialist creation fails, fall back to generic loader below
        pass

    # Generic fallback: Pillow + numpy loader. Without numpy there is nothing
    # to convert the image into, so the file is not opened at all.
    if _np is not None:
        try:
            # Third-party import
            from PIL import Image as _PILImage

            # Local import
            from .images.image_profiler import ImageProfiler

            # The context manager closes the underlying file handle; the
            # pixel data is copied into ``arr`` before the image is closed.
            with _PILImage.open(p) as img:
                arr = _np.array(img)
            return ImageProfiler().probe(arr)
        except Exception:
            # Best-effort: report the failure through the ImageReport below.
            pass

    # Last resort: return an ImageReport with status explaining failure
    from .images.image_profiler import ImageReport

    return ImageReport(path=str(p), status="failed_to_load_or_unsupported_format")

`probe_to_df(path, to_pandas=False, enrich=True, **kwargs)` ¶

Return a filoma.DataFrame wrapper around a Polars DataFrame (or a pandas DataFrame if to_pandas=True).

Force DataFrame building on the profiler and optionally run a small enrichment chain: .add_depth_col(path).add_path_components().add_file_stats_cols().

Source code in filoma/__init__.py

def probe_to_df(path: str, to_pandas: bool = False, enrich: bool = True, **kwargs: Any) -> Any:
    """Probe a directory and return the result as a DataFrame.

    DataFrame building is forced on the profiler. When ``enrich`` is true a
    small enrichment chain is applied on a best-effort basis:
    ``.add_depth_col(path).add_path_components().add_file_stats_cols()``.

    Args:
        path: Directory to probe.
        to_pandas: If True, return a pandas DataFrame converted from the
            underlying Polars frame instead of the wrapper.
        enrich: Whether to run the enrichment chain. If it fails, the
            un-enriched wrapper is returned instead.
        **kwargs: ``max_depth`` / ``threads`` are forwarded to
            ``DirectoryProfiler.probe``; everything else configures
            ``DirectoryProfilerConfig``.

    Returns:
        The ``filoma.DataFrame`` wrapper by default, or a pandas DataFrame
        when ``to_pandas=True``.

    Raises:
        RuntimeError: If no DataFrame was built, or if the pandas conversion
            fails (the original error is attached as ``__cause__``).

    """
    # Extract probe-only parameters
    max_depth = kwargs.pop("max_depth", None)
    threads = kwargs.pop("threads", None)

    # Lazy import to avoid heavy deps at module import time
    from .directories import DirectoryProfiler, DirectoryProfilerConfig

    # Force DataFrame building and construct a typed config
    kwargs["build_dataframe"] = True
    config = DirectoryProfilerConfig(**kwargs)
    profiler = DirectoryProfiler(config)
    analysis = profiler.probe(path, max_depth=max_depth, threads=threads)

    df_wrapper = analysis.to_df()
    if df_wrapper is None:
        raise RuntimeError("DataFrame was not built. Ensure 'polars' is installed and that DataFrame building is enabled (build_dataframe=True).")

    # Initialize lineage
    df_wrapper.add_lineage_entry("probe", path=path, **kwargs)

    # Optionally enrich the DataFrame wrapper with useful columns/stats
    df_enriched = df_wrapper
    if enrich:
        try:
            df_enriched = df_enriched.add_depth_col(path).add_path_components().add_file_stats_cols()
        except Exception:
            # If enrichment fails for any reason, fall back to the raw DataFrame
            pass

    # Return requested format: filoma.DataFrame wrapper (default) or pandas
    if to_pandas:
        try:
            return df_enriched.df.to_pandas()
        except Exception as e:
            # Chain explicitly so the original traceback is kept as the cause.
            raise RuntimeError(f"Failed to convert Polars DataFrame to pandas: {e}") from e

    return df_enriched

`snapshot(path, mode='fast', export=None, include_hidden=False, pattern=None, metadata=None)` ¶

Create a snapshot of a dataset with configurable integrity checking.

Three integrity levels:

- "fast": Hash of filename + size + mtime (99% effective for accidental changes)
- "deep": Fast + hash of first/last 4KB (detects header/corruption changes)
- "full": Complete SHA-256 hash (audit mode, slow for large files)

Parameters:

Name	Type	Description	Default
`path`	`str`	Path to the dataset directory to snapshot	required
`mode`	`str`	Integrity level - "fast", "deep", or "full"	`'fast'`
`export`	`Optional[str]`	Optional path to save the snapshot JSON file	`None`
`include_hidden`	`bool`	Whether to include hidden files/directories	`False`
`pattern`	`Optional[str]`	Optional glob pattern to filter files (e.g., "*.txt")	`None`
`metadata`	`Optional[Dict[str, Any]]`	Optional metadata dictionary to include in snapshot	`None`

Returns:

Type	Description
`Any`	DatasetSnapshot object containing all file entries and hashes

Source code in filoma/__init__.py

def snapshot(
    path: str,
    mode: str = "fast",
    export: Optional[str] = None,
    include_hidden: bool = False,
    pattern: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Any:
    """Snapshot a dataset directory at the requested integrity level.

    Integrity levels:
    - "fast": Hash of filename + size + mtime (99% effective for accidental changes)
    - "deep": Fast + hash of first/last 4KB (detects header/corruption changes)
    - "full": Complete SHA-256 hash (audit mode, slow for large files)

    Args:
        path: Path to the dataset directory to snapshot
        mode: Integrity level - "fast", "deep", or "full"
        export: Optional path to save the snapshot JSON file
        include_hidden: Whether to include hidden files/directories
        pattern: Optional glob pattern to filter files (e.g., "*.txt")
        metadata: Optional metadata dictionary to include in snapshot

    Returns:
        DatasetSnapshot object containing all file entries and hashes

    """
    # Imported lazily; this function only forwards to the core implementation.
    from .core.snapshot import snapshot as _snapshot

    options = {
        "path": path,
        "mode": mode,
        "export": export,
        "include_hidden": include_hidden,
        "pattern": pattern,
        "metadata": metadata,
    }
    return _snapshot(**options)

`verify_snapshot(snapshot_path, target_path=None, mode=None)` ¶

Verify a directory against a saved snapshot.

Parameters:

Name	Type	Description	Default
`snapshot_path`	`str`	Path to the saved snapshot JSON file	required
`target_path`	`Optional[str]`	Optional path to verify (defaults to snapshot's root_path)	`None`
`mode`	`Optional[str]`	Verification mode (defaults to snapshot's mode)	`None`

Returns:

Type	Description
`Dict[str, Any]`	Dictionary with verification results

Source code in filoma/__init__.py

def verify_snapshot(
    snapshot_path: str,
    target_path: Optional[str] = None,
    mode: Optional[str] = None,
) -> Dict[str, Any]:
    """Check a directory against a previously saved snapshot.

    Args:
        snapshot_path: Path to the saved snapshot JSON file
        target_path: Optional path to verify (defaults to snapshot's root_path)
        mode: Verification mode (defaults to snapshot's mode)

    Returns:
        Dictionary with verification results

    """
    # Imported lazily; this function only forwards to the core implementation.
    from .core.snapshot import verify as _verify

    options = {
        "snapshot_path": snapshot_path,
        "target_path": target_path,
        "mode": mode,
    }
    return _verify(**options)

Package overview¶

The top-level package docstring is rendered above. Below are some focused sections for important modules and classes.

DataFrame wrapper¶

The filoma.DataFrame wrapper provides convenience enrichers and helpers that operate on a Polars DataFrame internally.

A wrapper around Polars DataFrame for enhanced file and directory analysis.

This class provides a specialized interface for working with file path data, allowing for easy manipulation and analysis of filesystem information.

All standard Polars DataFrame methods and properties are available through attribute delegation, so you can use this like a regular Polars DataFrame with additional file-specific functionality.

Source code in filoma/dataframe.py

class DataFrame:
    """A wrapper around Polars DataFrame for enhanced file and directory analysis.

    This class provides a specialized interface for working with file path data,
    allowing for easy manipulation and analysis of filesystem information.

    All standard Polars DataFrame methods and properties are available through
    attribute delegation, so you can use this like a regular Polars DataFrame
    with additional file-specific functionality.
    """

    def __init__(
        self,
        data: Optional[Union[pl.DataFrame, List[str], List[Path], Dict[str, Any]]] = None,
        lineage: Optional[List[Dict[str, Any]]] = None,
    ):
        """Build the wrapper from one of the supported input shapes.

        Args:
        ----
            data: Initial data. One of:
                - ``None``: an empty frame with a single string ``path`` column
                - a Polars DataFrame: stored as-is
                - a dict mapping column names to equal-length lists/tuples
                  (``Path`` values are converted to ``str``)
                - a list of dicts: converted with ``pl.from_dicts``
                - any other list: each item is stringified into a ``path`` column
            lineage: Optional list of lineage entries.

        Raises:
        ------
            ValueError: For unsupported ``data`` types, dict values that are
                not lists/tuples, or dict columns of differing lengths.

        """
        if data is None:
            frame = pl.DataFrame({"path": []}, schema={"path": pl.String})
        elif isinstance(data, pl.DataFrame):
            frame = data
        elif isinstance(data, dict):
            columns: Dict[str, List[Any]] = {}
            n_rows: Optional[int] = None
            for col_name, col_values in data.items():
                if not isinstance(col_values, (list, tuple)):
                    raise ValueError("Dictionary values must be list or tuple sequences")
                normalized = [str(item) if isinstance(item, Path) else item for item in col_values]
                if n_rows is None:
                    # The first column fixes the expected row count.
                    n_rows = len(normalized)
                elif len(normalized) != n_rows:
                    raise ValueError("All dictionary value sequences must have the same length")
                columns[col_name] = normalized
            # An empty dict yields a completely empty frame.
            frame = pl.DataFrame(columns) if columns else pl.DataFrame()
        elif isinstance(data, list):
            if data and isinstance(data[0], dict):
                # List of row dictionaries (from manifest or to_dicts())
                frame = pl.from_dicts(data)
            else:
                frame = pl.DataFrame({"path": [str(entry) for entry in data]})
        else:
            raise ValueError("data must be a Polars DataFrame, dict of columns, list of paths, or None")

        self._df = frame
        self._pd_cache = None
        self.with_enrich = False
        self.with_filename_features = False
        self._lineage = lineage or []

    def _ensure_polars(self) -> pl.DataFrame:
        """Return the internal frame, coercing it to Polars first if needed.

        A Polars frame is returned untouched. Otherwise a pandas DataFrame
        (when pandas is importable) is converted with ``pl.from_pandas``, and
        anything else is passed to ``pl.DataFrame(...)``. Each successful
        conversion replaces ``_df`` and clears the cached pandas view.

        Raises
        ------
            RuntimeError: If the generic conversion fails.

        """
        frame = self._df
        if isinstance(frame, pl.DataFrame):
            # Already Polars: nothing to do.
            return frame

        # pandas-specific conversion; any failure falls through to the
        # generic attempt below.
        try:
            looks_like_pandas = pd is not None and isinstance(frame, pd.DataFrame)
            if looks_like_pandas:
                self._df = pl.from_pandas(frame)
                self.invalidate_pandas_cache()
                return self._df
        except Exception:
            pass

        # Generic coercion of whatever ``_df`` currently holds.
        try:
            self._df = pl.DataFrame(self._df)
            self.invalidate_pandas_cache()
            return self._df
        except Exception as exc:
            raise RuntimeError(f"Unable to coerce internal DataFrame to polars.DataFrame: {exc}")

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute access to the underlying Polars DataFrame.

        This allows direct access to all Polars DataFrame methods and properties
        like columns, dtypes, shape, select, filter, group_by, etc.
        """
        # Directly return the attribute from the underlying Polars DataFrame.
        # NOTE: We intentionally do NOT wrap returned Polars DataFrames anymore.
        # This makes filoma.DataFrame behave like a Polars DataFrame by default
        # (calls like df.head(), df.select(...), etc. return native Polars
        # objects). This is a breaking change compared to previously wrapping
        # Polars results in filoma.DataFrame.
        try:
            attr = getattr(self._df, name)
        except AttributeError:
            # Preserve the original error semantics
            raise

        # If the attribute is callable, return a wrapper that conditionally
        # wraps returned polars.DataFrame objects into filoma.DataFrame
        if callable(attr):

            def wrapper(*args, **kwargs):
                result = attr(*args, **kwargs)
                # If the underlying call mutated the Polars DataFrame in-place,
                # Polars often returns None or the same object reference. In
                # that case invalidate the cached pandas conversion so future
                # .pandas/.pandas_cached calls reflect the mutation.
                if result is None or result is self._df:
                    try:
                        self.invalidate_pandas_cache()
                    except Exception:
                        # Best-effort: do not let cache invalidation break calls
                        pass
                    return result

                # If wrapping is enabled and result is a Polars DataFrame,
                # wrap it back into filoma.DataFrame for compatibility.
                # Propagate lineage to the new wrapper.
                if get_default_wrap_polars() and isinstance(result, pl.DataFrame):
                    return DataFrame(result, lineage=list(self._lineage))

                return result

            return wrapper

        # Non-callable attributes (properties) — if it's a Polars DataFrame and
        # wrapping is requested, wrap it; otherwise return as-is.
        if get_default_wrap_polars() and isinstance(attr, pl.DataFrame):
            return DataFrame(attr, lineage=list(self._lineage))

        return attr

    def __dir__(self) -> List[str]:
        """Expose both wrapper and underlying Polars attributes in interactive help."""
        attrs = set(super().__dir__())
        try:
            attrs.update(dir(self._df))
        except Exception:
            pass
        return sorted(list(attrs))

    def __getitem__(self, key):
        """Forward subscription (e.g., df['path']) to the underlying Polars DataFrame.

        Returns native Polars objects (Series or DataFrame) to match the default
        Polars-first behavior of this wrapper.
        """
        return self._df.__getitem__(key)

    def __setitem__(self, key, value):
        """Assign ``value`` under ``key``, adding or replacing a column for string keys.

        For a string ``key`` the value is turned into a Polars Series and
        attached with ``with_columns`` (aliased to ``key``), which rebinds
        ``_df`` to the new frame:

        - a ``pl.Series`` is used as-is;
        - when pandas is importable, any non-list/tuple object exposing
          ``__array__`` is passed to ``pl.Series(value)``;
        - a list or tuple becomes a Series of its items;
        - anything else is treated as a scalar and repeated ``len(self._df)``
          times.

        If building the Series fails, or ``key`` is not a string, the
        assignment is delegated to the wrapped frame's own ``__setitem__``.
        The cached pandas view is cleared after a successful assignment.

        Raises
        ------
            TypeError: Any ``TypeError`` raised inside this method is replaced
                by a fixed message pointing to ``DataFrame.with_columns``.

        """
        # Polars DataFrame supports column assignment via df[key] = value
        # Try to support common user-friendly patterns: assigning a Python
        # sequence or a Series to create/replace a column. Polars' native
        # __setitem__ may raise TypeError in some versions, so handle that
        # explicitly and fall back to with_columns.
        try:
            if isinstance(key, str):
                # Accept polars Series, pandas Series, or Python sequences
                if isinstance(value, pl.Series):
                    series = value
                else:
                    try:
                        # Array-like (e.g. pandas Series) -> polars Series; only
                        # taken when the module-level ``pd`` is not None.
                        if pd is not None and hasattr(value, "__array__") and not isinstance(value, (list, tuple)):
                            series = pl.Series(value)
                        elif isinstance(value, (list, tuple)):
                            series = pl.Series(key, list(value))
                        else:
                            # Scalar value: repeat across rows
                            series = pl.Series(key, [value] * len(self._df))
                    except Exception:
                        # Conversion failed: signal the fallback path below.
                        series = None

                if "series" in locals() and series is not None:
                    # Use with_columns to add/replace the column; alias() makes
                    # the column name match ``key`` regardless of the Series name.
                    self._df = self._df.with_columns(series.alias(key))
                    self.invalidate_pandas_cache()
                    return

            # Fallback to delegating to Polars __setitem__ for other patterns
            self._df.__setitem__(key, value)
            # Underlying data has changed; invalidate any cached pandas conversion
            self.invalidate_pandas_cache()
        except TypeError:
            # Polars raises TypeError for some unsupported assignment forms
            # (e.g., assigning a Series by index). Re-raise a clearer message.
            # NOTE(review): this also replaces TypeErrors raised by the
            # with_columns path above - confirm that is intended.
            msg = "DataFrame object does not support `Series` assignment by index\n\nUse `DataFrame.with_columns`."
            raise TypeError(msg)

    def invalidate_pandas_cache(self) -> None:
        """Clear the cached pandas conversion created by `to_pandas()`.

        Call this after mutating the underlying Polars DataFrame to ensure
        subsequent `pandas` accesses reflect the latest data.
        """
        self._pd_cache = None

    def add_lineage_entry(self, operation: str, **kwargs: Any) -> None:
        """Add a lineage entry to track the history of this DataFrame.

        Args:
        ----
            operation: Name of the operation performed.
            **kwargs: Parameters used for the operation.

        """
        self._lineage.append(
            {
                "operation": operation,
                "parameters": {k: str(v) if isinstance(v, Path) else v for k, v in kwargs.items()},
                "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            }
        )

    @property
    def lineage(self) -> List[Dict[str, Any]]:
        """Return the lineage history of this DataFrame."""
        return self._lineage

    @property
    def df(self) -> pl.DataFrame:
        """Get the underlying Polars DataFrame."""
        return self._df

    def __len__(self) -> int:
        """Get the number of rows in the DataFrame."""
        # polars.DataFrame supports len(), but some wrapped/native objects
        # (for example older PyArrow-backed objects) may not implement __len__.
        # Try common fallbacks in order of preference.
        try:
            return len(self._df)
        except Exception:
            # polars exposes `.height` as row count and `.shape[0]` as rows
            try:
                return int(getattr(self._df, "height"))
            except Exception:
                try:
                    return int(self._df.shape[0])
                except Exception:
                    # Last resort: convert to pandas if available (cheap for small frames)
                    if pd is not None:
                        try:
                            return int(self._df.to_pandas().shape[0])
                        except Exception:
                            return 0
                    return 0

    def __repr__(self) -> str:
        """Return the string representation of the DataFrame."""
        # Avoid calling the underlying object's __str__/__repr__ if it may
        # raise TypeError (observed with some PyDataFrame wrappers). Use
        # safe fallbacks for a short textual preview.
        row_count = len(self)
        # Try polars' to_string-like rendering if available
        try:
            # Polars DataFrame implements __str__/__repr__; prefer repr()
            df_preview = repr(self._df)
        except Exception:
            try:
                # Try to convert to pandas for a safer repr
                if pd is not None:
                    df_preview = repr(self._df.to_pandas())
                else:
                    df_preview = "<unrepresentable DataFrame>"
            except Exception:
                df_preview = "<unrepresentable DataFrame>"

        return f"filoma.DataFrame with {row_count} rows\n{df_preview}"

    def __str__(self) -> str:
        """Return the string representation of the DataFrame."""
        return self.__repr__()

    def head(self, n: int = 5) -> "DataFrame":
        """Return a new wrapper holding the first ``n`` rows.

        The result gets a copy of this frame's lineage plus a ``head`` entry.
        """
        first_rows = self._df.head(n)
        wrapped = DataFrame(first_rows, lineage=list(self._lineage))
        wrapped.add_lineage_entry("head", n=n)
        return wrapped

    def tail(self, n: int = 5) -> "DataFrame":
        """Return a new wrapper holding the last ``n`` rows.

        The result gets a copy of this frame's lineage plus a ``tail`` entry.
        """
        last_rows = self._df.tail(n)
        wrapped = DataFrame(last_rows, lineage=list(self._lineage))
        wrapped.add_lineage_entry("tail", n=n)
        return wrapped

    def add_path_components(self, inplace: bool = False) -> "DataFrame":
        """Add columns for path components (parent, name, stem, suffix).

        Each component is derived from the ``path`` column with ``pathlib``.
        Columns that already exist are left untouched.

        Args:
        ----
            inplace: If True, modify this DataFrame in-place and return ``self``.

        Returns:
        -------
            New DataFrame with the missing path component columns added, or
            ``self`` when ``inplace=True``. The lineage is carried over in
            both cases; an ``add_path_components`` entry is recorded only
            when at least one column was actually added.

        """
        # Column name -> function deriving it from a single path string.
        extractors = {
            "parent": lambda x: str(Path(x).parent),
            "name": lambda x: Path(x).name,
            "stem": lambda x: Path(x).stem,
            "suffix": lambda x: Path(x).suffix,
        }
        existing = set(self._df.columns)
        cols_to_add = [
            pl.col("path").map_elements(extract, return_dtype=pl.String).alias(col)
            for col, extract in extractors.items()
            if col not in existing
        ]

        if not cols_to_add:
            # Nothing to add. Keep the lineage on the returned copy, as
            # add_file_stats_cols does on its no-op path.
            return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

        df_with_components = self._df.with_columns(cols_to_add)
        if inplace:
            self._df = df_with_components
            self.invalidate_pandas_cache()
            self.add_lineage_entry("add_path_components")
            return self

        res = DataFrame(df_with_components, lineage=list(self._lineage))
        res.add_lineage_entry("add_path_components")
        return res

    def add_file_stats_cols(
        self,
        path: str = "path",
        base_path: Optional[Union[str, Path]] = None,
        compute_hash: bool = False,
        inplace: bool = False,
    ) -> "DataFrame":
        """Append filesystem metadata columns for the paths held in one column.

        Every value of the ``path`` column is probed with ``FileProfiler``.
        Rows whose path does not exist, or whose probing raises, get nulls
        (and ``"{}"`` for ``xattrs``). Existing columns with the same names
        are replaced by the freshly computed ones.

        Args:
        ----
            path: Name of the column containing file system paths.
            base_path: Optional base path. If provided, any non-absolute paths in the
                path column are resolved relative to this base.
            compute_hash: Whether to compute SHA256 hashes (slow for large files).
            inplace: If True, modify this DataFrame in-place and return ``self``.

        Returns:
        -------
            New DataFrame with file statistics columns added, or ``self`` when
            ``inplace=True``. When all columns are already present (and no
            hashes are needed) nothing is recomputed.

        Raises:
        ------
            ValueError: If the specified path column does not exist.

        """
        if path not in self._df.columns:
            raise ValueError(f"Column '{path}' not found in DataFrame")

        # Output columns and their dtypes, in the order they are appended.
        stats_schema = {
            "size_bytes": pl.Int64,
            "modified_time": pl.String,
            "created_time": pl.String,
            "is_file": pl.Boolean,
            "is_dir": pl.Boolean,
            "owner": pl.String,
            "group": pl.String,
            "mode_str": pl.String,
            "inode": pl.Int64,
            "nlink": pl.Int64,
            "sha256": pl.String,
            "xattrs": pl.String,
        }
        # Output columns whose key in the profiler's dict differs.
        profiler_key = {
            "size_bytes": "size",
            "modified_time": "modified",
            "created_time": "created",
        }

        # Work is needed when a hash is requested but absent/incomplete, or
        # when any output column is missing.
        needs_hashes = compute_hash and ("sha256" not in self._df.columns or self._df["sha256"].null_count() > 0)
        missing_any = any(col not in self._df.columns for col in stats_schema)
        if not (missing_any or needs_hashes):
            return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

        base = None if base_path is None else Path(base_path)

        # Use filoma's FileProfiler to collect rich file metadata
        profiler = FileProfiler()

        def blank_row() -> Dict[str, Any]:
            # Schema-preserving row for paths that cannot be probed.
            return {**dict.fromkeys(stats_schema), "xattrs": "{}"}

        def get_file_stats(path_str: str) -> Dict[str, Any]:
            try:
                p = Path(path_str)
                if base is not None and not p.is_absolute():
                    p = base / p
                full_path = str(p)
                if not p.exists():
                    logger.warning(f"Path does not exist: {full_path}")
                    return blank_row()

                # Use the profiler; let it handle symlinks and permissions
                row = profiler.probe(full_path, compute_hash=compute_hash).as_dict()

                # Normalize keys to the stable schema used by this helper
                stats = {col: row.get(profiler_key.get(col, col)) for col in stats_schema if col != "xattrs"}
                stats["xattrs"] = json.dumps(row.get("xattrs") or {})
                return stats
            except Exception:
                # On any error, return a row of Nones/empties preserving schema
                return blank_row()

        stats_rows = [get_file_stats(value) for value in self._df[path].to_list()]
        stats_df = pl.DataFrame(stats_rows, schema=stats_schema)

        # Drop same-named columns first so the horizontal concat does not
        # produce duplicates.
        df_base = self._df
        overlapping_cols = [col for col in stats_df.columns if col in df_base.columns]
        if overlapping_cols:
            df_base = df_base.drop(overlapping_cols)
        df_with_stats = pl.concat([df_base, stats_df], how="horizontal")

        if not inplace:
            res = DataFrame(df_with_stats, lineage=list(self._lineage))
            res.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
            return res

        self._df = df_with_stats
        self.invalidate_pandas_cache()
        self.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
        return self

    def add_depth_col(self, path: Optional[Union[str, Path]] = None, inplace: bool = False) -> "DataFrame":
        """Add a ``depth`` column showing the nesting level of each path.

        Depth is the number of path components below a reference root. Rows
        whose path is not located under that root fall back to their total
        number of components.

        Args:
        ----
            path: Root to measure depth from. If None, the longest leading run
                of components shared by every entry of the ``path`` column is used.
            inplace: If True, modify this DataFrame in-place and return ``self``.

        Returns:
        -------
            ``self`` when ``inplace`` is True, otherwise a new DataFrame with the
            ``depth`` column. If a ``depth`` column already exists the data is
            returned unchanged.

        """
        if "depth" in self._df.columns:
            # Idempotent no-op. Keep the lineage on the returned wrapper, as
            # every other non-inplace code path of this class does.
            return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

        if path is None:
            # Split every path once instead of re-deriving ``.parts`` for each comparison.
            all_parts = [Path(p).parts for p in self._df["path"].to_list()]
            common_parts = []
            if all_parts:
                for i, part in enumerate(all_parts[0]):
                    if all(len(parts) > i and parts[i] == part for parts in all_parts):
                        common_parts.append(part)
                    else:
                        break
            path = Path(*common_parts) if common_parts else Path()
        else:
            path = Path(path)

        # Use a different local name to avoid shadowing the parameter inside calculate_depth
        path_root = path

        def calculate_depth(path_str: str) -> int:
            """Calculate the depth of a path relative to the provided root path."""
            try:
                return len(Path(path_str).relative_to(path_root).parts)
            except ValueError:
                # Path is not relative to the provided root path
                return len(Path(path_str).parts)

        df_with_depth = self._df.with_columns([pl.col("path").map_elements(calculate_depth, return_dtype=pl.Int64).alias("depth")])
        if inplace:
            self._df = df_with_depth
            self.invalidate_pandas_cache()
            self.add_lineage_entry("add_depth_col", reference_path=path)
            return self

        res = DataFrame(df_with_depth, lineage=list(self._lineage))
        res.add_lineage_entry("add_depth_col", reference_path=path)
        return res

    def filter_by_extension(self, extensions: Union[str, List[str]]) -> "DataFrame":
        """Keep only the rows whose path ends in one of the given extensions.

        Matching is case-insensitive and compares against ``Path(...).suffix``
        of each entry in the ``path`` column.

        Args:
        ----
            extensions: A single extension or a list of extensions; the leading
                dot is optional.

        Returns:
        -------
            New DataFrame containing only the matching rows.

        """
        if isinstance(extensions, str):
            extensions = [extensions]

        # Canonical form: lower-case with a leading dot.
        wanted = [(ext if ext.startswith(".") else "." + ext).lower() for ext in extensions]

        def has_wanted_suffix(path_str):
            return Path(path_str).suffix.lower() in wanted

        keep_mask = pl.col("path").map_elements(has_wanted_suffix, return_dtype=pl.Boolean)
        out = DataFrame(self._df.filter(keep_mask), lineage=list(self._lineage))
        out.add_lineage_entry("filter_by_extension", extensions=extensions)
        return out

    def filter_by_pattern(self, pattern: str) -> "DataFrame":
        """Keep only the rows whose ``path`` value contains ``pattern``.

        Args:
        ----
            pattern: Pattern handed to Polars ``str.contains`` (which treats it
                as a regular expression unless told otherwise).

        Returns:
        -------
            New DataFrame containing only the matching rows.

        """
        matches = pl.col("path").str.contains(pattern)
        out = DataFrame(self._df.filter(matches), lineage=list(self._lineage))
        out.add_lineage_entry("filter_by_pattern", pattern=pattern)
        return out

    def extension_counts(self) -> "DataFrame":
        """Group files by extension and count them.

        The extension is the lower-cased ``Path.suffix`` of each entry in the
        ``path`` column; paths without a suffix are grouped under
        ``"<no extension>"``.

        Returns
        -------
            filoma.DataFrame with the columns ``extension`` and ``len``, sorted
            by ``len`` in descending order.

        """

        def extension_of(path_str: str) -> str:
            # An empty suffix is falsy, so ``or`` supplies the placeholder.
            return Path(path_str).suffix.lower() or "<no extension>"

        # underlying `_df` is expected to be a Polars DataFrame
        df_with_ext = self._df.with_columns([pl.col("path").map_elements(extension_of, return_dtype=pl.String).alias("extension")])
        result = df_with_ext.group_by("extension").len().sort("len", descending=True)
        return DataFrame(result)

    def directory_counts(self) -> "DataFrame":
        """Group files by their parent directory and count them.

        The parent directory is ``str(Path(p).parent)`` for each entry ``p`` of
        the ``path`` column.

        Returns
        -------
            filoma.DataFrame with the columns ``parent_dir`` and ``len``, sorted
            by ``len`` in descending order.

        """

        def parent_of(path_str: str) -> str:
            return str(Path(path_str).parent)

        # underlying `_df` is expected to be a Polars DataFrame
        df_with_parent = self._df.with_columns([pl.col("path").map_elements(parent_of, return_dtype=pl.String).alias("parent_dir")])
        result = df_with_parent.group_by("parent_dir").len().sort("len", descending=True)
        return DataFrame(result)

    def to_polars(self) -> pl.DataFrame:
        """Get the underlying Polars DataFrame.

        The wrapped object itself is returned, not a copy.
        """
        return self._df

    def to_pandas(self, force: bool = False) -> Any:
        """Return the data as a pandas DataFrame, reusing a cached conversion.

        The first call converts the Polars DataFrame and stores the result;
        later calls hand back that stored object. Pass ``force=True`` to
        convert again from the current Polars DataFrame and replace the cache.

        Raises
        ------
            ImportError: if pandas is not installed.

        """
        if pd is None:
            raise ImportError("pandas is not installed. Please install it to use to_pandas().")

        cache_is_usable = self._pd_cache is not None and not force
        if not cache_is_usable:
            # Polars performs the conversion so the result matches ``.pandas``.
            self._pd_cache = self._df.to_pandas()
        return self._pd_cache

    @property
    def polars(self) -> pl.DataFrame:
        """Property access for the underlying Polars DataFrame (convenience).

        Equivalent to calling ``to_polars()``.
        """
        return self.to_polars()

    @property
    def pandas(self) -> Any:
        """Return a fresh pandas DataFrame conversion (not the cached object).

        This is intentionally a fresh conversion so callers who expect an
        up-to-date pandas view can access it directly. The cached conversion
        is neither read nor updated here. Use ``pandas_cached`` or
        ``to_pandas(force=False)`` to access the cached conversion for repeated
        reads, or ``to_pandas(force=True)`` to reconvert and update the cache.

        Raises
        ------
            ImportError: if pandas is not installed.

        """
        if pd is None:
            raise ImportError("pandas is not installed. Please install it to use pandas property.")
        return self._df.to_pandas()

    @property
    def pandas_cached(self) -> Any:
        """Return a cached pandas DataFrame, converting once if needed.

        This is useful when repeated conversions would be expensive and the
        caller is comfortable with an explicit cache that can be invalidated
        with ``invalidate_pandas_cache()`` or by calling ``to_pandas(force=True)``.
        Equivalent to ``to_pandas(force=False)``, including the ``ImportError``
        it raises when pandas is not installed.
        """
        return self.to_pandas(force=False)

    @property
    def native(self):
        """Return the data in the module-wide default dataframe backend.

        A Polars DataFrame is returned when ``get_default_dataframe_backend()``
        reports ``'polars'``; for any other value a pandas DataFrame is
        returned (via the ``pandas`` property).
        """
        backend = get_default_dataframe_backend()
        return self.polars if backend == "polars" else self.pandas

    @classmethod
    def from_pandas(cls, df: Any) -> "DataFrame":
        """Construct a filoma.DataFrame from a pandas DataFrame.

        This is a convenience wrapper that converts the pandas DataFrame into
        a Polars DataFrame and wraps it. Requires pandas to be installed.

        Args:
        ----
            df: The pandas DataFrame to convert.

        Raises:
        ------
            RuntimeError: if the module-level ``pd`` is None (pandas unavailable).
                Note this differs from ``to_pandas()``, which raises ``ImportError``.

        """
        if pd is None:
            raise RuntimeError("pandas is not available in this environment")
        # Convert via Polars for internal consistency
        pl_df = pl.from_pandas(df)
        return cls(pl_df)

    def to_dict(self) -> Dict[str, List]:
        """Convert to a dictionary keyed by column name.

        Polars is called with ``as_series=False``, so the values are plain
        Python lists rather than Polars Series.
        """
        return self._df.to_dict(as_series=False)

    def save_csv(self, path: Union[str, Path]) -> None:
        """Save the DataFrame to CSV.

        Args:
        ----
            path: Destination file; converted with ``str()`` before being
                handed to Polars ``write_csv``.

        """
        self._df.write_csv(str(path))

    def save_parquet(self, path: Union[str, Path]) -> None:
        """Save the DataFrame to Parquet format.

        Args:
        ----
            path: Destination file; converted with ``str()`` before being
                handed to Polars ``write_parquet``.

        """
        self._df.write_parquet(str(path))

    # Convenience methods for common Polars operations that users expect
    @property
    def columns(self) -> List[str]:
        """Get the column names of the underlying Polars DataFrame."""
        return self._df.columns

    @property
    def dtypes(self) -> List[pl.DataType]:
        """Get the column data types of the underlying Polars DataFrame."""
        return self._df.dtypes

    @property
    def shape(self) -> tuple:
        """Get DataFrame shape as a ``(rows, columns)`` tuple of ints.

        The underlying object's ``.shape`` is preferred. If that fails, rows
        come from ``len(self)`` and columns from ``.columns`` (or, as a last
        resort, a pandas conversion); any part that cannot be determined is
        reported as 0.
        """
        try:
            n_rows, n_cols = self._df.shape
            return (int(n_rows), int(n_cols))
        except Exception:
            pass

        # Row count: same fallbacks as __len__, 0 if even that fails.
        try:
            n_rows = len(self)
        except Exception:
            n_rows = 0

        # Column count: inspect .columns first, then a pandas conversion.
        try:
            n_cols = len(self._df.columns)
        except Exception:
            n_cols = 0
            if pd is not None:
                try:
                    n_cols = self._df.to_pandas().shape[1]
                except Exception:
                    n_cols = 0

        return (int(n_rows), int(n_cols))

    def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
        """Generate descriptive statistics.

        Args:
        ----
            percentiles: List of percentiles to include (default: [0.25, 0.5, 0.75])

        Returns:
        -------
            filoma.DataFrame wrapping the summary produced by Polars ``describe``.

        """
        if percentiles is None:
            # Forwarding None to Polars does not select its default percentiles,
            # so apply the documented default explicitly.
            percentiles = [0.25, 0.5, 0.75]
        # Polars' describe returns a new DataFrame summarizing columns; wrap it
        return DataFrame(self._df.describe(percentiles=percentiles))

    def info(self) -> None:
        """Print a short, human-readable summary of the DataFrame to stdout.

        The report lists the shape, the number of columns, one line per column
        (index, name, dtype and null count) and the summed estimated size of
        all columns in megabytes.
        """
        print("filoma.DataFrame")
        print(f"Shape: {self.shape}")
        print(f"Columns: {len(self.columns)}")
        print()

        # One line per column
        print("Column details:")
        position = 0
        for col_name, col_dtype in zip(self.columns, self.dtypes):
            n_null = self._df[col_name].null_count()
            print(f"  {position:2d}  {col_name:15s} {str(col_dtype):15s} {n_null:8d} nulls")
            position += 1

        # Approximate footprint: per-column estimates added up
        total_mb = 0
        for col_name in self.columns:
            total_mb += self._df[col_name].estimated_size("mb")
        print(f"\nEstimated memory usage: {total_mb:.2f} MB")

    def unique(self, subset: Optional[Union[str, List[str]]] = None) -> "DataFrame":
        """Return a new DataFrame with duplicate rows removed.

        Args:
        ----
            subset: Column name(s) that decide whether two rows are duplicates.
                When None, Polars ``unique()`` is called without a subset.

        """
        deduped = self._df.unique() if subset is None else self._df.unique(subset=subset)
        out = DataFrame(deduped, lineage=list(self._lineage))
        out.add_lineage_entry("unique", subset=subset)
        return out

    def sort(self, by: Union[str, List[str]], descending: bool = False) -> "DataFrame":
        """Return a new DataFrame with the rows sorted.

        Args:
        ----
            by: Column name(s) to order the rows by
            descending: If True, sort from largest to smallest

        """
        out = DataFrame(self._df.sort(by, descending=descending), lineage=list(self._lineage))
        out.add_lineage_entry("sort", by=by, descending=descending)
        return out

    def enrich(self, inplace: bool = False):
        """Add path components, file stats and a depth column in one step.

        Args:
        ----
            inplace: If True, update this DataFrame and return ``self``.
                     If False (default), leave it untouched and return a new,
                     enriched DataFrame.

        """
        # Run the three enrichment steps on temporary wrappers and keep only the
        # resulting Polars frame, so the caller's lineage records a single
        # 'enrich' entry instead of one entry per inner step.
        with_components = self.add_path_components()
        with_stats = with_components.add_file_stats_cols()
        enriched = with_stats.add_depth_col()._df

        if not inplace:
            fresh = DataFrame(enriched, lineage=list(self._lineage))
            fresh.with_enrich = True
            fresh.add_lineage_entry("enrich")
            return fresh

        # In-place: swap the data and drop any stale pandas conversion
        self._df = enriched
        self.with_enrich = True
        self.invalidate_pandas_cache()
        self.add_lineage_entry("enrich")
        return self

    def evaluate_duplicates(
        self,
        path_col: str = "path",
        text_threshold: float = 0.8,
        image_max_distance: int = 5,
        text_k: int = 3,
        show_table: bool = True,
        cross_dir_paths: Optional[List[str]] = None,
    ) -> dict:
        """Evaluate duplicates among files in the DataFrame.

        Scans the `path_col` column, runs exact, text and image duplicate
        detectors. Optionally filters to show only duplicates that cross
        directory boundaries (requires `cross_dir_paths` to define boundaries).

        Args:
        ----
            path_col: Column holding the paths to inspect.
            text_threshold: Forwarded to the duplicate finder as ``text_threshold``.
            image_max_distance: Forwarded to the duplicate finder as ``image_max_distance``.
            text_k: Forwarded to the duplicate finder as ``text_k``.
            show_table: If True, print a summary table to the console.
            cross_dir_paths: Directories defining the boundaries. When given, a
                duplicate group is kept only if its files lie under at least two
                different entries of this list.

        Returns:
        -------
            The result dict of the duplicate finder; the ``exact``, ``text`` and
            ``image`` entries are replaced by their filtered lists when
            ``cross_dir_paths`` is given.

        Raises:
        ------
            ValueError: if ``path_col`` is not a column of the DataFrame.

        """
        if path_col not in self._df.columns:
            raise ValueError(f"Column '{path_col}' not found in DataFrame")

        # filter for files only
        paths = [str(p) for p in self._df[path_col].to_list() if Path(p).is_file()]
        res = _dedup.find_duplicates(
            paths,
            text_k=text_k,
            text_threshold=text_threshold,
            image_max_distance=image_max_distance,
        )

        # Filter for cross-directory duplicates if requested
        if cross_dir_paths:

            def is_under(file_path, root) -> bool:
                # Compare whole path components. A plain string prefix test would
                # treat '/data/foobar/x' as living under '/data/foo'.
                try:
                    Path(file_path).relative_to(root)
                except ValueError:
                    return False
                return True

            for category in ("exact", "text", "image"):
                res[category] = [
                    group
                    for group in res.get(category, [])
                    # Keep groups whose files span more than one boundary directory
                    if len({cp for p in group for cp in cross_dir_paths if is_under(p, cp)}) > 1
                ]

        # Summarize counts as (type, number of groups, number of files in groups)
        summary = []
        for kind in ("exact", "text", "image"):
            groups = res.get(kind, [])
            summary.append((kind, len(groups), sum(len(g) for g in groups)))

        if show_table:
            table = Table(title="Duplicate Summary (Cross-Dir)" if cross_dir_paths else "Duplicate Summary")
            table.add_column("Type", style="bold cyan")
            table.add_column("Groups", style="white")
            table.add_column("Files In Groups", style="white")
            for kind, n_groups, n_files in summary:
                table.add_row(kind, str(n_groups), str(n_files))
            Console().print(table)

        logger.info("Duplicate summary: " + ", ".join(f"{kind}={n_groups} groups ({n_files} files)" for kind, n_groups, n_files in summary))

        return res

    def add_filename_features(
        self,
        path_col: str = "path",
        sep: str = "_",
        prefix: Optional[str] = "feat",
        max_tokens: Optional[int] = None,
        include_parent: bool = False,
        include_all_parts: bool = False,
        token_names: Optional[Union[str, Sequence[str]]] = None,
        enrich: bool = False,
        inplace: bool = False,
    ) -> "DataFrame":
        """Discover filename features and add them as columns on this DataFrame.

        This instance method discovers separator-based tokens from filename
        stems and adds columns (e.g., `feat1`, `feat2` or `token1`, ...).
        Rows with fewer tokens than columns get an empty string in the
        surplus columns.

        Args:
        ----
            path_col: Column containing path strings to analyze (default: 'path').
            sep: Separator used to split filename stems (default: '_').
            prefix: Column name prefix for discovered tokens (default: 'feat');
                when empty or None, 'token' is used.
            max_tokens: Optional cap on extracted tokens; by default uses observed max.
            include_parent: If True, add a `parent` column containing immediate parent folder name.
            include_all_parts: If True, add `path_part0`, `path_part1`, ... for all Path.parts.
            token_names: Optional list/tuple of token column names. Missing or empty
                entries, the string 'auto' and None all fall back to
                ``<prefix><n>`` names.
            enrich: If True, automatically enrich the DataFrame with path components and file stats before discovery.
            inplace: If True, perform the operation in-place and return self. Otherwise returns a new `filoma.DataFrame`.

        Returns:
        -------
            A new or modified `filoma.DataFrame` with discovered filename features.

        Raises:
        ------
            ValueError: if ``path_col`` is not a column of the DataFrame.

        """
        # Determine the base DataFrame for feature discovery
        base_df = self
        if enrich and not self.with_enrich:
            logger.info("Enriching DataFrame before discovering filename features")
            base_df = self.enrich(inplace=False)

        pl_df = base_df._df
        if path_col not in pl_df.columns:
            raise ValueError(f"DataFrame must have a '{path_col}' column")

        path_values = pl_df[path_col].to_list()
        observed_max = max((len(Path(s).stem.split(sep)) for s in path_values), default=0)
        eff_max = observed_max if max_tokens is None else max_tokens

        # Explicit names are honored only for list/tuple input; everything else
        # (including "auto") uses prefix-based names.
        token_names_seq = list(token_names) if isinstance(token_names, (list, tuple)) else None
        base_name = prefix if prefix else "token"

        new_cols = []
        for i in range(eff_max):
            if token_names_seq is not None and i < len(token_names_seq) and token_names_seq[i]:
                col_name = token_names_seq[i]
            else:
                col_name = f"{base_name}{i + 1}"

            # ``idx=i`` binds the current index; a plain closure would see the last ``i``.
            def pick_token(s: str, idx: int = i) -> str:
                parts = Path(s).stem.split(sep)
                return parts[idx] if idx < len(parts) else ""

            new_cols.append(pl.col(path_col).map_elements(pick_token, return_dtype=pl.Utf8).alias(col_name))

        if include_parent:
            new_cols.append(pl.col(path_col).map_elements(lambda s: Path(s).parent.name, return_dtype=pl.Utf8).alias("parent"))

        if include_all_parts:
            max_parts = max((len(Path(s).parts) for s in path_values), default=0)
            for i in range(max_parts):

                def pick_part(s: str, idx: int = i) -> str:
                    parts = Path(s).parts
                    return parts[idx] if idx < len(parts) else ""

                new_cols.append(pl.col(path_col).map_elements(pick_part, return_dtype=pl.Utf8).alias(f"path_part{i}"))

        pl_result = pl_df.with_columns(new_cols)

        # Build the result from ``base_df`` rather than ``self`` so that an
        # enrichment performed above keeps its lineage entry and its flag.
        enriched_wrapper = DataFrame(pl_result, lineage=list(base_df._lineage))
        enriched_wrapper.with_enrich = base_df.with_enrich
        enriched_wrapper.with_filename_features = True
        enriched_wrapper.add_lineage_entry(
            "add_filename_features",
            sep=sep,
            prefix=prefix,
            max_tokens=max_tokens,
            include_parent=include_parent,
            include_all_parts=include_all_parts,
            token_names=token_names,
        )

        if inplace:
            self._df = enriched_wrapper._df
            self.with_filename_features = True
            self.with_enrich = enriched_wrapper.with_enrich
            self.invalidate_pandas_cache()
            self._lineage = enriched_wrapper._lineage
            return self

        return enriched_wrapper

`columns` `property` ¶

Get column names.

`df` `property` ¶

Get the underlying Polars DataFrame.

`dtypes` `property` ¶

Get column data types.

`lineage` `property` ¶

Return the lineage history of this DataFrame.

`native` `property` ¶

Return the dataframe in the module-wide default backend.

If get_default_dataframe_backend() is 'polars' this returns a Polars DataFrame, otherwise it returns a pandas DataFrame.

`pandas` `property` ¶

Return a fresh pandas DataFrame conversion (not the cached object).

This is intentionally a fresh conversion so callers who expect an up-to-date pandas view can access it directly. Use pandas_cached or to_pandas(force=False) to access the cached conversion for repeated reads, or to_pandas(force=True) to reconvert and update the cache.

Raises¶

ImportError: if pandas is not installed.

`pandas_cached` `property` ¶

Return a cached pandas DataFrame, converting once if needed.

This is useful when repeated conversions would be expensive and the caller is comfortable with an explicit cache that can be invalidated with invalidate_pandas_cache() or by calling to_pandas(force=True).

`polars` `property` ¶

Property access for the underlying Polars DataFrame (convenience).

`shape` `property` ¶

Get DataFrame shape (rows, columns).

`__dir__()` ¶

Expose both wrapper and underlying Polars attributes in interactive help.

Source code in filoma/dataframe.py

def __dir__(self) -> List[str]:
    """List the wrapper's own attributes together with those of the wrapped Polars DataFrame.

    The merged, de-duplicated names are returned in sorted order so that
    interactive help and tab completion show both sets.
    """
    names = set(super().__dir__())
    try:
        names |= set(dir(self._df))
    except Exception:
        # Best-effort: fall back to the wrapper's own attributes only
        pass
    return sorted(names)

`__getattr__(name)` ¶

Delegate attribute access to the underlying Polars DataFrame.

This allows direct access to all Polars DataFrame methods and properties like columns, dtypes, shape, select, filter, group_by, etc.

Source code in filoma/dataframe.py

def __getattr__(self, name: str) -> Any:
    """Delegate attribute access to the underlying Polars DataFrame.

    This allows direct access to all Polars DataFrame methods and properties
    like columns, dtypes, shape, select, filter, group_by, etc.

    Callable attributes are returned behind a thin wrapper that invalidates
    the cached pandas conversion when the call returns None or the wrapped
    frame itself, and that re-wraps Polars DataFrame results into
    filoma.DataFrame (with this object's lineage) when
    ``get_default_wrap_polars()`` is enabled.

    Raises
    ------
        AttributeError: if the wrapped DataFrame has no such attribute, or if
            this instance has no ``_df`` yet.

    """
    # This hook runs only after normal lookup failed. Read ``_df`` without
    # re-entering it: on an instance created without ``__init__`` (as copy and
    # pickle do) ``self._df`` would call ``__getattr__('_df')`` again and
    # recurse until RecursionError instead of raising AttributeError.
    try:
        df = object.__getattribute__(self, "_df")
    except AttributeError:
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None

    # A missing attribute raises AttributeError from Polars unchanged.
    attr = getattr(df, name)

    # If the attribute is callable, return a wrapper that conditionally
    # wraps returned polars.DataFrame objects into filoma.DataFrame
    if callable(attr):

        def wrapper(*args, **kwargs):
            result = attr(*args, **kwargs)
            # If the underlying call mutated the Polars DataFrame in-place,
            # Polars often returns None or the same object reference. In
            # that case invalidate the cached pandas conversion so future
            # .pandas/.pandas_cached calls reflect the mutation.
            if result is None or result is self._df:
                try:
                    self.invalidate_pandas_cache()
                except Exception:
                    # Best-effort: do not let cache invalidation break calls
                    pass
                return result

            # If wrapping is enabled and result is a Polars DataFrame,
            # wrap it back into filoma.DataFrame for compatibility.
            # Propagate lineage to the new wrapper.
            if get_default_wrap_polars() and isinstance(result, pl.DataFrame):
                return DataFrame(result, lineage=list(self._lineage))

            return result

        return wrapper

    # Non-callable attributes (properties) — if it's a Polars DataFrame and
    # wrapping is requested, wrap it; otherwise return as-is.
    if get_default_wrap_polars() and isinstance(attr, pl.DataFrame):
        return DataFrame(attr, lineage=list(self._lineage))

    return attr

`__getitem__(key)` ¶

Forward subscription (e.g., df['path']) to the underlying Polars DataFrame.

Returns native Polars objects (Series or DataFrame) to match the default Polars-first behavior of this wrapper.

Source code in filoma/dataframe.py

def __getitem__(self, key):
    """Forward subscription (e.g., df['path']) to the underlying Polars DataFrame.

    Returns native Polars objects (Series or DataFrame) to match the default
    Polars-first behavior of this wrapper. The result is passed through
    as-is; it is never re-wrapped into filoma.DataFrame here.
    """
    return self._df.__getitem__(key)

`__init__(data=None, lineage=None)` ¶

Initialize a DataFrame.

data: Initial data. Can be:
    - A Polars DataFrame
    - A dictionary mapping column names to sequences (all same length)
    - A list of string paths
    - A list of Path objects
    - None for an empty DataFrame
lineage: Optional list of lineage entries.

Source code in filoma/dataframe.py

def __init__(
    self,
    data: Optional[Union[pl.DataFrame, List[str], List[Path], Dict[str, Any]]] = None,
    lineage: Optional[List[Dict[str, Any]]] = None,
):
    """Initialize a DataFrame.

    Args:
    ----
        data: Initial data. Can be:
            - A Polars DataFrame
            - A dictionary mapping column names to sequences (all same length)
            - A list of string paths
            - A list of Path objects
            - A list of dictionaries (one per row)
            - None for an empty DataFrame
        lineage: Optional list of lineage entries.

    Raises:
    ------
        ValueError: if ``data`` has an unsupported type, a dictionary value is
            not a list or tuple, or the dictionary sequences differ in length.

    """
    if data is None:
        self._df = pl.DataFrame({"path": []}, schema={"path": pl.String})
    elif isinstance(data, pl.DataFrame):
        self._df = data
    elif isinstance(data, dict):
        if not data:
            self._df = pl.DataFrame()
        else:
            processed: Dict[str, List[Any]] = {}
            expected_len: Optional[int] = None
            for col, values in data.items():
                if not isinstance(values, (list, tuple)):
                    raise ValueError("Dictionary values must be list or tuple sequences")
                # Path objects are stored as plain strings
                seq = [str(x) if isinstance(x, Path) else x for x in values]
                if expected_len is None:
                    expected_len = len(seq)
                elif len(seq) != expected_len:
                    raise ValueError("All dictionary value sequences must have the same length")
                processed[col] = seq
            self._df = pl.DataFrame(processed)
    elif isinstance(data, list):
        if data and isinstance(data[0], dict):
            # Handle list of dictionaries (from manifest or to_dicts())
            self._df = pl.from_dicts(data)
        else:
            paths = [str(path) for path in data]
            # Pin the dtype: without a schema an empty list yields a non-String
            # ``path`` column, unlike the ``data is None`` case above.
            self._df = pl.DataFrame({"path": paths}, schema={"path": pl.String})
    else:
        raise ValueError("data must be a Polars DataFrame, dict of columns, list of paths, or None")
    self._pd_cache = None
    self.with_enrich = False
    self.with_filename_features = False
    self._lineage = lineage or []

`__len__()` ¶

Get the number of rows in the DataFrame.

Source code in filoma/dataframe.py

def __len__(self) -> int:
    """Return the number of rows, or 0 if it cannot be determined.

    Several strategies are tried in order of preference, because some
    wrapped/native objects may not implement ``__len__``.
    """
    row_probes = (
        lambda: len(self._df),
        # polars exposes `.height` as row count and `.shape[0]` as rows
        lambda: int(getattr(self._df, "height")),
        lambda: int(self._df.shape[0]),
    )
    for probe in row_probes:
        try:
            return probe()
        except Exception:
            continue

    # Last resort: count the rows of a pandas conversion, if pandas is available
    if pd is not None:
        try:
            return int(self._df.to_pandas().shape[0])
        except Exception:
            return 0
    return 0

`__repr__()` ¶

Return the string representation of the DataFrame.

Source code in filoma/dataframe.py

def __repr__(self) -> str:
    """Render a row-count header followed by a preview of the wrapped frame.

    The preview is the wrapped object's own ``repr()``. If that raises, the
    pandas repr is tried (when pandas is importable); if that is impossible
    too, a fixed placeholder is shown instead of propagating the error.
    """
    placeholder = "<unrepresentable DataFrame>"
    n_rows = len(self)

    def render_preview() -> str:
        # Preferred: whatever the wrapped object prints for itself.
        try:
            return repr(self._df)
        except Exception:
            pass
        # Fallback: pandas' rendering, only if pandas is around.
        if pd is None:
            return placeholder
        try:
            return repr(self._df.to_pandas())
        except Exception:
            return placeholder

    return f"filoma.DataFrame with {n_rows} rows\n{render_preview()}"

`__setitem__(key, value)` ¶

Forward item assignment to the underlying Polars DataFrame.

Source code in filoma/dataframe.py

def __setitem__(self, key, value):
    """Assign a column (``df[key] = value``) on the wrapped Polars DataFrame.

    When ``key`` is a string, ``value`` is converted to a Polars Series and
    added (or replaced) through ``with_columns``:

    * a ``pl.Series`` is used as-is;
    * a list or tuple becomes a Series built from its items;
    * any other object exposing ``__array__`` (for example a pandas Series)
      is handed to ``pl.Series`` directly;
    * anything else is treated as a scalar and repeated for every row.

    If that conversion fails, or ``key`` is not a string, the assignment is
    delegated to the wrapped frame's own ``__setitem__``. Any cached pandas
    conversion is invalidated after a successful assignment.

    Raises
    ------
        TypeError: If a TypeError is raised while performing the assignment.
            The original error is attached as ``__cause__``.

    """
    try:
        if isinstance(key, str):
            series = None
            try:
                if isinstance(value, pl.Series):
                    series = value
                elif isinstance(value, (list, tuple)):
                    series = pl.Series(key, list(value))
                elif hasattr(value, "__array__"):
                    # Array-likes are converted regardless of whether this
                    # module managed to import pandas; the conversion is done
                    # by Polars, not by pandas.
                    series = pl.Series(value)
                else:
                    # Scalar value: repeat across rows
                    series = pl.Series(key, [value] * len(self._df))
            except Exception:
                # Conversion is best-effort; fall through to the delegate below.
                series = None

            if series is not None:
                # alias() guarantees the column is named after `key` no matter
                # what name the incoming Series carried.
                self._df = self._df.with_columns(series.alias(key))
                self.invalidate_pandas_cache()
                return

        # Fallback to delegating to Polars __setitem__ for other patterns
        self._df.__setitem__(key, value)
        # Underlying data has changed; invalidate any cached pandas conversion
        self.invalidate_pandas_cache()
    except TypeError as exc:
        # Keep the exception type callers already catch, but chain the cause
        # so the real failure stays visible in the traceback.
        msg = "DataFrame object does not support `Series` assignment by index\n\nUse `DataFrame.with_columns`."
        raise TypeError(msg) from exc

`__str__()` ¶

Return the string representation of the DataFrame.

Source code in filoma/dataframe.py

def __str__(self) -> str:
    """Return the same text as ``repr()``; both views share one rendering."""
    return repr(self)

`add_depth_col(path=None, inplace=False)` ¶

Add a depth column showing the nesting level of each path.

path: The path to calculate depth from. If None, uses the common root.
inplace: If True, modify this DataFrame in-place and return ``self``.

New DataFrame with depth column, or ``self`` when ``inplace=True``.

Source code in filoma/dataframe.py

def add_depth_col(self, path: Optional[Union[str, Path]] = None, inplace: bool = False) -> "DataFrame":
    """Add a ``depth`` column holding the nesting level of each path.

    Depth is the number of path components left after stripping the
    reference root. A path that is not located under the root falls back to
    its total component count. If a ``depth`` column already exists, nothing
    is recomputed.

    Args:
    ----
        path: The path to calculate depth from. If None, the longest leading
            run of components shared by every entry of the ``path`` column
            is used as the root.
        inplace: If True, modify this DataFrame in-place and return ``self``.

    Returns:
    -------
        New DataFrame with depth column, or ``self`` when ``inplace=True``.

    """
    if "depth" in self._df.columns:
        # Nothing to compute. The copy keeps the lineage so the recorded
        # history is not lost on the non-inplace path.
        return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

    if path is None:
        # Null entries carry no components, so leave them out of the root search.
        parts_per_path = [Path(p).parts for p in self._df["path"].to_list() if p is not None]
        common_parts = []
        if parts_per_path:
            # Walk the first path's components and stop at the first one that
            # is not shared by every other path.
            for i, part in enumerate(parts_per_path[0]):
                if all(len(parts) > i and parts[i] == part for parts in parts_per_path):
                    common_parts.append(part)
                else:
                    break
        path = Path(*common_parts) if common_parts else Path()
    else:
        path = Path(path)

    # Use a different local name to avoid shadowing the parameter inside calculate_depth
    path_root = path

    def calculate_depth(path_str: str) -> int:
        """Calculate the depth of a path relative to the provided root path."""
        p = Path(path_str)
        try:
            return len(p.relative_to(path_root).parts)
        except ValueError:
            # Path is not relative to the provided root path
            return len(p.parts)

    df_with_depth = self._df.with_columns([pl.col("path").map_elements(calculate_depth, return_dtype=pl.Int64).alias("depth")])
    if inplace:
        self._df = df_with_depth
        self.invalidate_pandas_cache()
        self.add_lineage_entry("add_depth_col", reference_path=path)
        return self

    res = DataFrame(df_with_depth, lineage=list(self._lineage))
    res.add_lineage_entry("add_depth_col", reference_path=path)
    return res

`add_file_stats_cols(path='path', base_path=None, compute_hash=False, inplace=False)` ¶

Add file statistics columns (size, modified time, etc.) based on a column containing filesystem paths.

path: Name of the column containing file system paths.
base_path: Optional base path. If provided, any non-absolute paths in the
    path column are resolved relative to this base.
compute_hash: Whether to compute SHA256 hashes (slow for large files).
inplace: If True, modify this DataFrame in-place and return ``self``.

New DataFrame with file statistics columns added, or ``self`` when
``inplace=True``.

ValueError: If the specified path column does not exist.

Source code in filoma/dataframe.py

def add_file_stats_cols(
    self,
    path: str = "path",
    base_path: Optional[Union[str, Path]] = None,
    compute_hash: bool = False,
    inplace: bool = False,
) -> "DataFrame":
    """Add file statistics columns (size, modified time, etc.) based on a column containing filesystem paths.

    Each entry of the path column is probed with ``FileProfiler``. Entries
    that do not exist, or whose probing fails for any reason, produce a row
    of nulls (with ``xattrs`` set to ``"{}"``) so the output schema stays
    stable. The call is a no-op when every statistics column is already
    present and no hashes need to be computed.

    Args:
    ----
        path: Name of the column containing file system paths.
        base_path: Optional base path. If provided, any non-absolute paths in the
            path column are resolved relative to this base.
        compute_hash: Whether to compute SHA256 hashes (slow for large files).
        inplace: If True, modify this DataFrame in-place and return ``self``.

    Returns:
    -------
        New DataFrame with file statistics columns added, or ``self`` when
        ``inplace=True``.

    Raises:
    ------
        ValueError: If the specified path column does not exist.

    """
    if path not in self._df.columns:
        raise ValueError(f"Column '{path}' not found in DataFrame")

    # Output column -> key under which the profiler reports that value.
    # This single table drives the "is anything missing" check, the null row
    # and the key normalization, so the three cannot drift apart.
    source_keys = {
        "size_bytes": "size",
        "modified_time": "modified",
        "created_time": "created",
        "is_file": "is_file",
        "is_dir": "is_dir",
        "owner": "owner",
        "group": "group",
        "mode_str": "mode_str",
        "inode": "inode",
        "nlink": "nlink",
        "sha256": "sha256",
        "xattrs": "xattrs",
    }

    # Decide if we need to proceed. Proceed if any target column is missing,
    # OR if we need to compute hashes and the column is missing or has nulls.
    existing = self._df.columns
    needs_hashes = compute_hash and ("sha256" not in existing or self._df["sha256"].null_count() > 0)
    missing_any = any(col not in existing for col in source_keys)

    if not missing_any and not needs_hashes:
        return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

    # Resolve base path if provided
    base = Path(base_path) if base_path is not None else None

    # Use filoma's FileProfiler to collect rich file metadata
    profiler = FileProfiler()

    def empty_row() -> Dict[str, Any]:
        """Return an all-null row that still matches the output schema."""
        row = dict.fromkeys(source_keys)
        row["xattrs"] = "{}"
        return row

    def get_file_stats(path_str: str) -> Dict[str, Any]:
        try:
            p = Path(path_str)
            if base is not None and not p.is_absolute():
                p = base / p
            full_path = str(p)
            if not p.exists():
                logger.warning(f"Path does not exist: {full_path}")
                return empty_row()

            # Use the profiler; let it handle symlinks and permissions
            filo = profiler.probe(full_path, compute_hash=compute_hash)
            row = filo.as_dict()

            # Normalize keys to a stable schema used by this helper
            stats = {col: row.get(key) for col, key in source_keys.items()}
            # xattrs is stored as a JSON string so it fits a String column
            stats["xattrs"] = json.dumps(row.get("xattrs") or {})
            return stats
        except Exception as exc:
            # Best-effort: keep going with a null row, but leave a trace of
            # what went wrong instead of swallowing the error silently.
            logger.warning(f"Could not collect file stats for {path_str!r}: {exc}")
            return empty_row()

    stats_data = [get_file_stats(p) for p in self._df[path].to_list()]

    stats_df = pl.DataFrame(
        stats_data,
        schema={
            "size_bytes": pl.Int64,
            "modified_time": pl.String,
            "created_time": pl.String,
            "is_file": pl.Boolean,
            "is_dir": pl.Boolean,
            "owner": pl.String,
            "group": pl.String,
            "mode_str": pl.String,
            "inode": pl.Int64,
            "nlink": pl.Int64,
            "sha256": pl.String,
            "xattrs": pl.String,
        },
    )

    # If columns already exist, we need to drop them before joining to avoid duplicates
    df_base = self._df
    overlapping_cols = [c for c in stats_df.columns if c in df_base.columns]
    if overlapping_cols:
        df_base = df_base.drop(overlapping_cols)

    df_with_stats = pl.concat([df_base, stats_df], how="horizontal")
    if inplace:
        self._df = df_with_stats
        self.invalidate_pandas_cache()
        self.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
        return self

    res = DataFrame(df_with_stats, lineage=list(self._lineage))
    res.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
    return res

`add_filename_features(path_col='path', sep='_', prefix='feat', max_tokens=None, include_parent=False, include_all_parts=False, token_names=None, enrich=False, inplace=False)` ¶

Discover filename features and add them as columns on this DataFrame.

This instance method discovers separator-based tokens from filename stems and adds columns (e.g., feat1, feat2 or token1, ...).

path_col: Column containing path strings to analyze (default: 'path').
sep: Separator used to split filename stems (default: '_').
prefix: Column name prefix for discovered tokens (default: 'feat').
max_tokens: Optional cap on extracted tokens; by default uses observed max.
include_parent: If True, add a `parent` column containing immediate parent folder name.
include_all_parts: If True, add `path_part0`, `path_part1`, ... for all Path.parts.
token_names: Optional list of token column names or 'auto' to generate readable names.
enrich: If True, automatically enrich the DataFrame with path components and file stats before discovery.
inplace: If True, perform the operation in-place and return self. Otherwise returns a new `filoma.DataFrame`.

A new or modified `filoma.DataFrame` with discovered filename features.

Source code in filoma/dataframe.py

def add_filename_features(
    self,
    path_col: str = "path",
    sep: str = "_",
    prefix: Optional[str] = "feat",
    max_tokens: Optional[int] = None,
    include_parent: bool = False,
    include_all_parts: bool = False,
    token_names: Optional[Union[str, Sequence[str]]] = None,
    enrich: bool = False,
    inplace: bool = False,
) -> "DataFrame":
    """Discover filename features and add them as columns on this DataFrame.

    This instance method discovers separator-based tokens from filename
    stems and adds columns (e.g., `feat1`, `feat2` or `token1`, ...). Rows
    whose stem has fewer tokens than the number of token columns get an
    empty string in the extra columns.

    Args:
    ----
        path_col: Column containing path strings to analyze (default: 'path').
        sep: Separator used to split filename stems (default: '_').
        prefix: Column name prefix for discovered tokens (default: 'feat').
            When empty or None, ``token`` is used.
        max_tokens: Number of token columns to create; by default uses the observed max.
        include_parent: If True, add a `parent` column containing immediate parent folder name.
        include_all_parts: If True, add `path_part0`, `path_part1`, ... for all Path.parts.
        token_names: Optional list/tuple of token column names. Missing or empty
            entries fall back to ``<prefix><n>``. The string 'auto' currently
            yields the same ``<prefix><n>`` names as leaving this unset.
        enrich: If True, automatically enrich the DataFrame with path components and file stats before discovery.
        inplace: If True, perform the operation in-place and return self. Otherwise returns a new `filoma.DataFrame`.

    Returns:
    -------
        A new or modified `filoma.DataFrame` with discovered filename features.

    Raises:
    ------
        ValueError: If ``path_col`` is not a column of the DataFrame.

    """
    # Determine the base DataFrame for feature discovery
    base_df = self
    if enrich and not self.with_enrich:
        logger.info("Enriching DataFrame before discovering filename features")
        base_df = self.enrich(inplace=False)

    pl_df = base_df._df
    if path_col not in pl_df.columns:
        raise ValueError(f"DataFrame must have a '{path_col}' column")

    # Null entries have no filename to tokenize; keep them out of discovery.
    path_values = [s for s in pl_df[path_col].to_list() if s is not None]
    token_counts = [len(Path(s).stem.split(sep)) for s in path_values]
    eff_max = max(token_counts, default=0) if max_tokens is None else max_tokens

    # Only list/tuple values supply explicit names; anything else
    # (None, 'auto', ...) means "generate <prefix><n>".
    explicit_names = list(token_names) if isinstance(token_names, (list, tuple)) else []
    default_base = prefix if prefix else "token"

    new_cols = []
    for i in range(eff_max):
        explicit = explicit_names[i] if i < len(explicit_names) else None
        col_name = explicit if explicit else f"{default_base}{i + 1}"

        # `idx=i` binds the current index now; a plain closure would see the
        # loop variable's final value when Polars calls the function later.
        def pick_token(s: str, idx=i):
            parts = Path(s).stem.split(sep)
            return parts[idx] if idx < len(parts) else ""

        new_cols.append(pl.col(path_col).map_elements(pick_token, return_dtype=pl.Utf8).alias(col_name))

    if include_parent:
        new_cols.append(pl.col(path_col).map_elements(lambda s: Path(s).parent.name, return_dtype=pl.Utf8).alias("parent"))

    if include_all_parts:
        max_parts = max((len(Path(s).parts) for s in path_values), default=0)
        for i in range(max_parts):
            col_name = f"path_part{i}"

            def pick_part(s: str, idx=i):
                try:
                    parts = list(Path(s).parts)
                    return parts[idx]
                except Exception:
                    return ""

            new_cols.append(pl.col(path_col).map_elements(pick_part, return_dtype=pl.Utf8).alias(col_name))

    pl_result = pl_df.with_columns(new_cols)

    # Wrap the result. Lineage and the enrich flag come from `base_df` so that
    # an enrichment performed above is not lost on the returned wrapper.
    enriched_wrapper = DataFrame(pl_result, lineage=list(base_df._lineage))
    enriched_wrapper.with_filename_features = True
    enriched_wrapper.with_enrich = base_df.with_enrich
    enriched_wrapper.add_lineage_entry(
        "add_filename_features",
        path_col=path_col,
        sep=sep,
        prefix=prefix,
        max_tokens=max_tokens,
        include_parent=include_parent,
        include_all_parts=include_all_parts,
        token_names=token_names,
    )

    if inplace:
        self._df = enriched_wrapper._df
        self.with_filename_features = True
        self.with_enrich = enriched_wrapper.with_enrich
        self.invalidate_pandas_cache()
        self._lineage = enriched_wrapper._lineage
        return self

    return enriched_wrapper

`add_lineage_entry(operation, **kwargs)` ¶

Add a lineage entry to track the history of this DataFrame.

operation: Name of the operation performed.
**kwargs: Parameters used for the operation.

Source code in filoma/dataframe.py

def add_lineage_entry(self, operation: str, **kwargs: Any) -> None:
    """Record one step in this DataFrame's lineage history.

    The entry stores the operation name, its parameters and a local
    wall-clock timestamp formatted as ``YYYY-MM-DD HH:MM:SS``.

    Args:
    ----
        operation: Name of the operation performed.
        **kwargs: Parameters used for the operation. ``Path`` values are
            stored as plain strings; everything else is stored unchanged.

    """
    recorded = {}
    for key, val in kwargs.items():
        recorded[key] = str(val) if isinstance(val, Path) else val

    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = {"operation": operation, "parameters": recorded, "timestamp": stamp}
    self._lineage.append(entry)

`add_path_components(inplace=False)` ¶

Add columns for path components (parent, name, stem, suffix).

Returns¶

New DataFrame with additional path component columns, or ``self`` when ``inplace=True``.

Source code in filoma/dataframe.py

def add_path_components(self, inplace: bool = False) -> "DataFrame":
    """Add columns for path components (parent, name, stem, suffix).

    Each column is derived from the ``path`` column. Columns that already
    exist are left untouched, so calling this repeatedly is safe.

    Args:
    ----
        inplace: If True, modify this DataFrame in-place and return ``self``.

    Returns:
    -------
        New DataFrame with additional path component columns, or ``self``
        when ``inplace=True``.

    """
    # Column name -> function deriving that component from one path string.
    extractors = {
        "parent": lambda x: str(Path(x).parent),
        "name": lambda x: Path(x).name,
        "stem": lambda x: Path(x).stem,
        "suffix": lambda x: Path(x).suffix,
    }
    existing = self._df.columns
    cols_to_add = [
        pl.col("path").map_elements(extract, return_dtype=pl.String).alias(col_name)
        for col_name, extract in extractors.items()
        if col_name not in existing
    ]

    if not cols_to_add:
        # Nothing to add. The copy keeps the lineage so the recorded history
        # is not lost on the non-inplace path.
        return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

    df_with_components = self._df.with_columns(cols_to_add)
    if inplace:
        self._df = df_with_components
        self.invalidate_pandas_cache()
        self.add_lineage_entry("add_path_components")
        return self

    res = DataFrame(df_with_components, lineage=list(self._lineage))
    res.add_lineage_entry("add_path_components")
    return res

`describe(percentiles=None)` ¶

Generate descriptive statistics.

percentiles: List of percentiles to include, each in the range [0, 1] (default when None: [0.25, 0.5, 0.75])

Source code in filoma/dataframe.py

def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
    """Generate descriptive statistics.

    Args:
    ----
        percentiles: List of percentiles to include. When None, Polars' own
            default is used (default: [0.25, 0.5, 0.75]).

    Returns:
    -------
        filoma DataFrame wrapping the summary produced by Polars.

    """
    if percentiles is None:
        # Do not forward None: Polars reads an explicit None as "no
        # percentiles", which would contradict the documented default.
        summary = self._df.describe()
    else:
        summary = self._df.describe(percentiles=percentiles)
    # Polars' describe returns a new DataFrame summarizing columns; wrap it
    return DataFrame(summary)

`directory_counts()` ¶

Group files by their parent directory and count them.

Returns¶

filoma DataFrame with directory counts (columns ``parent_dir`` and ``len``, sorted by count in descending order)

Source code in filoma/dataframe.py

def directory_counts(self) -> "DataFrame":
    """Group files by their parent directory and count them.

    Returns
    -------
        filoma DataFrame with one row per parent directory (columns
        ``parent_dir`` and ``len``), sorted by count in descending order.

    """
    # underlying `_df` is expected to be a Polars DataFrame
    parent_dir = pl.col("path").map_elements(lambda x: str(Path(x).parent), return_dtype=pl.String).alias("parent_dir")
    df_with_parent = self._df.with_columns([parent_dir])
    result = df_with_parent.group_by("parent_dir").len().sort("len", descending=True)
    return DataFrame(result)

`enrich(inplace=False)` ¶

Enrich the DataFrame by adding features like path components, file stats, and depth.

inplace: If True, perform the operation in-place and return self.
         If False (default), return a new DataFrame with the changes.

Source code in filoma/dataframe.py

def enrich(self, inplace: bool = False):
    """Enrich the DataFrame with path components, file stats and a depth column.

    Args:
    ----
        inplace: If True, perform the operation in-place and return self.
                 If False (default), return a new DataFrame with the changes.

    Returns:
    -------
        ``self`` when ``inplace=True``, otherwise a new DataFrame. In both
        cases ``with_enrich`` is set and one 'enrich' lineage entry is added.

    """
    # Run the individual steps on intermediate wrappers and keep only the
    # final frame: whatever lineage those wrappers record is discarded, so
    # the caller sees a single 'enrich' entry.
    with_components = self.add_path_components()
    with_stats = with_components.add_file_stats_cols()
    enriched_df = with_stats.add_depth_col()._df

    if not inplace:
        fresh = DataFrame(enriched_df, lineage=list(self._lineage))
        fresh.with_enrich = True
        fresh.add_lineage_entry("enrich")
        return fresh

    # In-place: swap the frame, flag it, and drop the stale pandas cache.
    self._df = enriched_df
    self.with_enrich = True
    self.invalidate_pandas_cache()
    self.add_lineage_entry("enrich")
    return self

`evaluate_duplicates(path_col='path', text_threshold=0.8, image_max_distance=5, text_k=3, show_table=True, cross_dir_paths=None)` ¶

Evaluate duplicates among files in the DataFrame.

Scans the path_col column, runs exact, text and image duplicate detectors. Optionally filters to show only duplicates that cross directory boundaries (requires cross_dir_paths to define boundaries).

Source code in filoma/dataframe.py

def evaluate_duplicates(
    self,
    path_col: str = "path",
    text_threshold: float = 0.8,
    image_max_distance: int = 5,
    text_k: int = 3,
    show_table: bool = True,
    cross_dir_paths: Optional[List[str]] = None,
) -> dict:
    """Evaluate duplicates among files in the DataFrame.

    Scans the `path_col` column, runs exact, text and image duplicate
    detectors. Optionally filters to show only duplicates that cross
    directory boundaries (requires `cross_dir_paths` to define boundaries).

    Args:
    ----
        path_col: Column containing the paths to scan. Only entries that are
            existing regular files are considered.
        text_threshold: Forwarded to the duplicate finder.
        image_max_distance: Forwarded to the duplicate finder.
        text_k: Forwarded to the duplicate finder.
        show_table: If True, print a summary table to the console.
        cross_dir_paths: Directories acting as boundaries. When given, a
            duplicate group is kept only if its files are located under at
            least two of these directories. Matching is done on whole path
            components, so ``/data/a`` does not match ``/data/ab/file``.

    Returns:
    -------
        The duplicate finder's result dictionary; its 'exact', 'text' and
        'image' entries are filtered when ``cross_dir_paths`` is given.

    Raises:
    ------
        ValueError: If ``path_col`` is not a column of the DataFrame.

    """
    if path_col not in self._df.columns:
        raise ValueError(f"Column '{path_col}' not found in DataFrame")

    # filter for files only (null entries cannot be files)
    paths = [str(p) for p in self._df[path_col].to_list() if p is not None and Path(p).is_file()]
    res = _dedup.find_duplicates(
        paths,
        text_k=text_k,
        text_threshold=text_threshold,
        image_max_distance=image_max_distance,
    )

    categories = ("exact", "text", "image")

    # Filter for cross-directory duplicates if requested
    if cross_dir_paths:
        boundaries = [Path(str(cp)) for cp in cross_dir_paths]

        def spans_multiple_dirs(group) -> bool:
            """Tell whether the group's files live under more than one boundary."""
            matched = set()
            for p in group:
                member = Path(str(p))
                for boundary in boundaries:
                    # Component-wise containment; a plain string prefix test
                    # would also accept sibling directories such as
                    # "/data/ab" for the boundary "/data/a".
                    if member == boundary or boundary in member.parents:
                        matched.add(boundary)
            return len(matched) > 1

        for category in categories:
            res[category] = [group for group in res.get(category, []) if spans_multiple_dirs(group)]

    # Summarize counts: category -> (number of groups, files in those groups)
    summary = {}
    for category in categories:
        groups = res.get(category, [])
        summary[category] = (len(groups), sum(len(g) for g in groups))

    if show_table:
        table = Table(title="Duplicate Summary (Cross-Dir)" if cross_dir_paths else "Duplicate Summary")
        table.add_column("Type", style="bold cyan")
        table.add_column("Groups", style="white")
        table.add_column("Files In Groups", style="white")
        for category in categories:
            n_groups, n_files = summary[category]
            table.add_row(category, str(n_groups), str(n_files))
        # The console is only needed when something is actually printed.
        Console().print(table)

    logger.info(
        f"Duplicate summary: exact={summary['exact'][0]} groups "
        f"({summary['exact'][1]} files), "
        f"text={summary['text'][0]} groups "
        f"({summary['text'][1]} files), "
        f"image={summary['image'][0]} groups "
        f"({summary['image'][1]} files)"
    )

    return res

`extension_counts()` ¶

Group files by extension and count them.

Returns¶

filoma DataFrame with extension counts (columns ``extension`` and ``len``, sorted by count in descending order)

Source code in filoma/dataframe.py

def extension_counts(self) -> "DataFrame":
    """Group files by extension and count them.

    Extensions are lower-cased before grouping; paths without a suffix are
    grouped under ``<no extension>``.

    Returns
    -------
        filoma DataFrame with one row per extension (columns ``extension``
        and ``len``), sorted by count in descending order.

    """
    # underlying `_df` is expected to be a Polars DataFrame
    extension = (
        pl.col("path")
        .map_elements(
            # An empty suffix is falsy, so `or` supplies the placeholder.
            lambda x: Path(x).suffix.lower() or "<no extension>",
            return_dtype=pl.String,
        )
        .alias("extension")
    )
    df_with_ext = self._df.with_columns([extension])
    result = df_with_ext.group_by("extension").len().sort("len", descending=True)
    return DataFrame(result)

`filter_by_extension(extensions)` ¶

Filter the DataFrame to only include files with specific extensions.

extensions: File extension(s) to filter by (with or without leading dot)

Filtered DataFrame

Source code in filoma/dataframe.py

def filter_by_extension(self, extensions: Union[str, List[str]]) -> "DataFrame":
    """Keep only the rows whose path ends in one of the given extensions.

    Matching is case-insensitive and uses the final suffix of each entry in
    the ``path`` column.

    Args:
    ----
        extensions: File extension(s) to filter by (with or without leading dot)

    Returns:
    -------
        Filtered DataFrame

    """
    # A single string is shorthand for a one-element list.
    wanted = [extensions] if isinstance(extensions, str) else extensions

    # Normalize: leading dot, lower case.
    normalized_extensions = [(ext if ext.startswith(".") else "." + ext).lower() for ext in wanted]

    def has_wanted_suffix(path_str):
        return Path(path_str).suffix.lower() in normalized_extensions

    keep = pl.col("path").map_elements(has_wanted_suffix, return_dtype=pl.Boolean)
    res = DataFrame(self._df.filter(keep), lineage=list(self._lineage))
    res.add_lineage_entry("filter_by_extension", extensions=wanted)
    return res

`filter_by_pattern(pattern)` ¶

Filter the DataFrame by path pattern.

pattern: Pattern to match against the ``path`` column (passed to Polars ``str.contains``, which interprets it as a regular expression by default)

Filtered DataFrame

Source code in filoma/dataframe.py

def filter_by_pattern(self, pattern: str) -> "DataFrame":
    """Keep only the rows whose ``path`` contains a match for ``pattern``.

    Args:
    ----
        pattern: Pattern to match (uses Polars string contains, which
            interprets the pattern as a regular expression by default)

    Returns:
    -------
        Filtered DataFrame

    """
    matches = pl.col("path").str.contains(pattern)
    res = DataFrame(self._df.filter(matches), lineage=list(self._lineage))
    res.add_lineage_entry("filter_by_pattern", pattern=pattern)
    return res

`from_pandas(df)` `classmethod` ¶

Construct a filoma.DataFrame from a pandas DataFrame.

This is a convenience wrapper that converts the pandas DataFrame into a Polars DataFrame and wraps it. Requires pandas to be installed.

Source code in filoma/dataframe.py

@classmethod
def from_pandas(cls, df: Any) -> "DataFrame":
    """Build a filoma.DataFrame from a pandas DataFrame.

    The pandas frame is converted to a Polars DataFrame first and the result
    is wrapped, so the internal representation is always Polars.

    Raises
    ------
        RuntimeError: If pandas is not available in this environment.

    """
    if pd is None:
        raise RuntimeError("pandas is not available in this environment")
    return cls(pl.from_pandas(df))

`head(n=5)` ¶

Get the first n rows.

Source code in filoma/dataframe.py

def head(self, n: int = 5) -> "DataFrame":
    """Return a new DataFrame holding the first ``n`` rows.

    The result inherits a copy of this DataFrame's lineage plus a ``head``
    entry recording ``n``.
    """
    first_rows = self._df.head(n)
    res = DataFrame(first_rows, lineage=list(self._lineage))
    res.add_lineage_entry("head", n=n)
    return res

`info()` ¶

Print concise summary of the DataFrame.

Source code in filoma/dataframe.py

def info(self) -> None:
    """Print a short summary: shape, per-column dtype and null count, memory estimate."""
    print("filoma.DataFrame")
    print(f"Shape: {self.shape}")
    print(f"Columns: {len(self.columns)}")
    print()

    # One line per column: position, name, dtype, null count.
    print("Column details:")
    for position, (name, dtype) in enumerate(zip(self.columns, self.dtypes)):
        nulls = self._df[name].null_count()
        print(f"  {position:2d}  {name:15s} {str(dtype):15s} {nulls:8d} nulls")

    # Approximate footprint: add up the per-column size estimates (in MB).
    total_mb = sum(self._df[name].estimated_size("mb") for name in self.columns)
    print(f"\nEstimated memory usage: {total_mb:.2f} MB")

`invalidate_pandas_cache()` ¶

Clear the cached pandas conversion created by to_pandas().

Call this after mutating the underlying Polars DataFrame to ensure subsequent pandas accesses reflect the latest data.

Source code in filoma/dataframe.py

def invalidate_pandas_cache(self) -> None:
    """Drop the pandas copy cached by `to_pandas()`.

    Call this whenever the underlying Polars DataFrame has been replaced or
    mutated, so the next `to_pandas()` call converts the current data
    instead of returning a stale copy.
    """
    self._pd_cache = None

`save_csv(path)` ¶

Save the DataFrame to CSV.

Source code in filoma/dataframe.py

def save_csv(self, path: Union[str, Path]) -> None:
    """Write the DataFrame to ``path`` as CSV.

    Args:
    ----
        path: Destination file; converted to a string before being handed to Polars.

    """
    destination = str(path)
    self._df.write_csv(destination)

`save_parquet(path)` ¶

Save the DataFrame to Parquet format.

Source code in filoma/dataframe.py

def save_parquet(self, path: Union[str, Path]) -> None:
    """Write the DataFrame to ``path`` in Parquet format.

    Args:
    ----
        path: Destination file; converted to a string before being handed to Polars.

    """
    destination = str(path)
    self._df.write_parquet(destination)

`sort(by, descending=False)` ¶

Sort the DataFrame.

by: Column name(s) to sort by
descending: Sort in descending order

Source code in filoma/dataframe.py

def sort(self, by: Union[str, List[str]], descending: bool = False) -> "DataFrame":
    """Return a new wrapper with the rows ordered by the given column(s).

    Args:
    ----
        by: Column name(s) to sort by
        descending: Sort in descending order

    Returns:
    -------
        A new ``DataFrame`` built from the sorted frame and a list copy of
        this frame's lineage, with a ``"sort"`` lineage entry added.

    """
    ordered = self._df.sort(by, descending=descending)
    inherited_lineage = list(self._lineage)
    wrapped = DataFrame(ordered, lineage=inherited_lineage)
    wrapped.add_lineage_entry("sort", by=by, descending=descending)
    return wrapped

`tail(n=5)` ¶

Get the last n rows.

Source code in filoma/dataframe.py

def tail(self, n: int = 5) -> "DataFrame":
    """Return a new wrapper around the trailing ``n`` rows.

    The result is constructed with a list copy of this frame's lineage and
    then has a ``"tail"`` lineage entry (recording ``n``) added to it.
    """
    trailing_rows = self._df.tail(n)
    inherited_lineage = list(self._lineage)
    truncated = DataFrame(trailing_rows, lineage=inherited_lineage)
    truncated.add_lineage_entry("tail", n=n)
    return truncated

`to_dict()` ¶

Convert to a dictionary.

Source code in filoma/dataframe.py

def to_dict(self) -> Dict[str, List]:
    """Export the frame as a plain dictionary.

    Delegates to the wrapped frame's ``to_dict`` with ``as_series=False``.
    """
    exported = self._df.to_dict(as_series=False)
    return exported

`to_pandas(force=False)` ¶

Convert to a pandas DataFrame.

By default this method will return a cached pandas conversion if one exists (for performance). Set force=True to reconvert from the current Polars DataFrame and update the cache.

Source code in filoma/dataframe.py

def to_pandas(self, force: bool = False) -> Any:
    """Return the frame as a pandas DataFrame, reusing a cached conversion.

    Args:
    ----
        force: When true, ignore any cached conversion, convert the current
            Polars DataFrame again and store the fresh result in the cache.

    Returns:
    -------
        The cached (or freshly converted) pandas object.

    Raises:
    ------
        ImportError: If the module-level ``pd`` is ``None`` (pandas missing).

    """
    if pd is None:
        raise ImportError("pandas is not installed. Please install it to use to_pandas().")

    needs_conversion = force or self._pd_cache is None
    if needs_conversion:
        # Delegate to Polars' own converter and remember the result.
        self._pd_cache = self._df.to_pandas()
    return self._pd_cache

`to_polars()` ¶

Get the underlying Polars DataFrame.

Source code in filoma/dataframe.py

def to_polars(self) -> pl.DataFrame:
    """Return the wrapped Polars DataFrame itself (no copy is made here)."""
    underlying = self._df
    return underlying

`unique(subset=None)` ¶

Get unique rows.

subset: Column name(s) to consider for uniqueness

Source code in filoma/dataframe.py

def unique(self, subset: Optional[Union[str, List[str]]] = None) -> "DataFrame":
    """Return a new wrapper containing only unique rows.

    Args:
    ----
        subset: Column name(s) to consider for uniqueness; when ``None`` the
            wrapped frame's ``unique()`` is called without arguments.

    Returns:
    -------
        A new ``DataFrame`` built from the de-duplicated frame and a list
        copy of this frame's lineage, with a ``"unique"`` lineage entry added.

    """
    deduplicated = self._df.unique() if subset is None else self._df.unique(subset=subset)
    inherited_lineage = list(self._lineage)
    wrapped = DataFrame(deduplicated, lineage=inherited_lineage)
    wrapped.add_lineage_entry("unique", subset=subset)
    return wrapped

handler: python

Directory profiler¶

The directory profiling API and configuration helpers.

Directory profiling utilities.

This module provides :class:`DirectoryProfiler`, which analyzes directory trees and returns a :class:`DirectoryAnalysis` dataclass with summary statistics and optional DataFrame support.

`DirectoryAnalysis` `dataclass` ¶

Bases: Mapping

Structured container for directory analysis results.

This is the canonical, dataclass-first return value for directory probes. Use :meth:`to_dict` to convert to a plain dict and :meth:`to_df` to access the optional DataFrame. The class exists to provide a typed, ergonomic API for programmatic consumption.

Source code in filoma/directories/directory_profiler.py

@dataclass
class DirectoryAnalysis(Mapping):
    """Structured container for directory analysis results.

    This is the canonical, dataclass-first return value for directory probes.
    Use :meth:`to_dict` to convert to a plain dict and :meth:`to_df`
    to access the optional DataFrame. The class exists to provide a typed,
    ergonomic API for programmatic consumption.

    Instances also implement the read-only ``Mapping`` protocol on top of
    :meth:`to_dict` (``analysis["summary"]``), and attribute names that are
    not found on the instance are looked up on a ``pathlib.Path`` built from
    ``path`` (``analysis.name``).
    """

    path: str
    summary: Dict
    file_extensions: Dict
    common_folder_names: Dict
    empty_folders: List[str]
    top_folders_by_file_count: List
    depth_distribution: Dict
    dataframe: Optional["DataFrame"] = None
    timing: Optional[Dict] = None
    dataframe_note: Optional[str] = None
    # Path view of ``path``; not an __init__ argument, filled in by __post_init__.
    _path_obj: Path = field(init=False, repr=False)

    def __post_init__(self):
        """Initialize the path object."""
        self._path_obj = Path(self.path)

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute access to the path object.

        Only invoked when normal attribute lookup fails. A lookup of
        ``_path_obj`` itself reaching this method means the instance has not
        been through ``__post_init__`` (for example while ``copy`` or
        ``pickle`` is rebuilding it); delegating in that state would re-enter
        this method until the interpreter raises ``RecursionError``.

        Raises
        ------
        AttributeError
            If ``_path_obj`` is not set yet, or the path object does not
            provide ``name``.

        """
        if name == "_path_obj":
            # Break the self-referential lookup instead of recursing forever.
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
        return getattr(self._path_obj, name)

    @property
    def path_obj(self) -> Path:
        """Return the path object."""
        return self._path_obj

    @classmethod
    def from_dict(cls, d: Dict) -> "DirectoryAnalysis":
        """Create a :class:`DirectoryAnalysis` from a plain dict.

        Parameters
        ----------
        d : dict
            Dictionary in the shape produced by :meth:`DirectoryProfiler.probe`.
            Missing collection keys fall back to empty containers, a missing
            or falsy ``"path"`` becomes ``""``, and the optional entries
            (``dataframe``, ``timing``, ``dataframe_note``) default to ``None``.

        Returns
        -------
        DirectoryAnalysis
            Constructed dataclass instance.

        """
        return cls(
            path=d.get("path") or "",
            summary=d.get("summary", {}),
            file_extensions=d.get("file_extensions", {}),
            common_folder_names=d.get("common_folder_names", {}),
            empty_folders=d.get("empty_folders", []),
            top_folders_by_file_count=d.get("top_folders_by_file_count", []),
            depth_distribution=d.get("depth_distribution", {}),
            dataframe=d.get("dataframe"),
            timing=d.get("timing"),
            dataframe_note=d.get("dataframe_note"),
        )

    def to_dict(self) -> Dict:
        """Return a plain ``dict`` representation of this analysis.

        The seven core fields are always present; ``dataframe``, ``timing``
        and ``dataframe_note`` are included only when they are not ``None``.
        """
        d = {
            "path": self.path,
            "summary": self.summary,
            "file_extensions": self.file_extensions,
            "common_folder_names": self.common_folder_names,
            "empty_folders": self.empty_folders,
            "top_folders_by_file_count": self.top_folders_by_file_count,
            "depth_distribution": self.depth_distribution,
        }
        if self.dataframe is not None:
            d["dataframe"] = self.dataframe
        if self.timing is not None:
            d["timing"] = self.timing
        if self.dataframe_note is not None:
            d["dataframe_note"] = self.dataframe_note
        return d

    def to_df(self) -> Optional["DataFrame"]:
        """Return the attached DataFrame wrapper or log a helpful warning when absent.

        This method used to silently return None when no DataFrame was built which
        often confused interactive users calling ``analysis.to_df()``. We now log a
        warning explaining the likely causes (DataFrame building disabled or polars
        not installed) to surface actionable next steps.
        """
        if self.dataframe is None:
            # Emit a helpful, actionable warning rather than silently returning None
            logger.warning(
                "No DataFrame available for analysis at path {path!s}. "
                "DataFrame building is disabled by default or 'polars' is not installed. "
                "Call DirectoryProfiler(build_dataframe=True) or use filoma.probe_to_df(...) to obtain a DataFrame.",
                path=self.path,
            )
        return self.dataframe

    def as_dict(self) -> Dict:
        """Alias for :meth:`to_dict`.

        Provided for backward compatibility with dict-based APIs.
        """
        return self.to_dict()

    # Convenience printing helpers so callers can write `analysis.print_summary()`
    # or `analysis.print_report()` without importing DirectoryProfiler. These
    # delegate to the existing DirectoryProfiler rich printers for consistency.
    def print_summary(self, profiler: "DirectoryProfiler | None" = None):
        """Pretty-print a short summary using the rich-based DirectoryProfiler printer.

        If `profiler` is provided it will be used (useful to customize show_progress,
        console, or other profiler settings); otherwise a default profiler is created.
        """
        # DirectoryProfiler is resolved when this method runs, not at class
        # definition time.
        if profiler is None:
            profiler = DirectoryProfiler(DirectoryProfilerConfig())
        profiler.print_summary(self)

    def print_report(self, profiler: "DirectoryProfiler | None" = None):
        """Pretty-print the full report (summary + extras) via DirectoryProfiler.

        This is an alias for `print_summary` + additional report sections; kept
        as a separate method name for discoverability and symmetry with other
        profilers in the project.
        """
        if profiler is None:
            profiler = DirectoryProfiler(DirectoryProfilerConfig())
        profiler.print_report(self)

    # Mapping protocol implementations so callers can still use dict-like access
    # (e.g., result['summary']) even though the canonical return type is a dataclass.
    def _as_dict(self) -> Dict:
        """Return the dict view that backs the Mapping protocol methods."""
        return self.to_dict()

    def __getitem__(self, key):
        """Mapping-style access to analysis fields by key."""
        return self._as_dict()[key]

    def __iter__(self):
        """Iterate over analysis mapping keys."""
        return iter(self._as_dict())

    def __len__(self):
        """Return number of top-level fields in the analysis mapping."""
        return len(self._as_dict())

`path_obj` `property` ¶

Return the path object.

`__getattr__(name)` ¶

Delegate attribute access to the path object.

Source code in filoma/directories/directory_profiler.py

def __getattr__(self, name: str) -> Any:
    """Delegate attribute access to the path object.

    Only invoked when normal attribute lookup fails. A lookup of
    ``_path_obj`` itself reaching this method means the instance has not
    been through ``__post_init__`` (for example while ``copy`` or ``pickle``
    is rebuilding it); delegating in that state would re-enter this method
    until the interpreter raises ``RecursionError``.

    Raises
    ------
    AttributeError
        If ``_path_obj`` is not set yet, or the path object does not
        provide ``name``.

    """
    if name == "_path_obj":
        # Break the self-referential lookup instead of recursing forever.
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
    return getattr(self._path_obj, name)

`__getitem__(key)` ¶

Mapping-style access to analysis fields by key.

Source code in filoma/directories/directory_profiler.py

def __getitem__(self, key):
    """Look up ``key`` in the dict view produced by ``_as_dict()``."""
    fields_view = self._as_dict()
    return fields_view[key]

`__iter__()` ¶

Iterate over analysis mapping keys.

Source code in filoma/directories/directory_profiler.py

def __iter__(self):
    """Return an iterator over the keys of the ``_as_dict()`` view."""
    fields_view = self._as_dict()
    return iter(fields_view)

`__len__()` ¶

Return number of top-level fields in the analysis mapping.

Source code in filoma/directories/directory_profiler.py

def __len__(self):
    """Return how many keys the ``_as_dict()`` view currently contains."""
    fields_view = self._as_dict()
    return len(fields_view)

`__post_init__()` ¶

Initialize the path object.

Source code in filoma/directories/directory_profiler.py

def __post_init__(self):
    """Build the ``Path`` view of ``path`` and store it as ``_path_obj``."""
    as_path = Path(self.path)
    self._path_obj = as_path

`as_dict()` ¶

Alias for :meth:to_dict.

Provided for backward compatibility with dict-based APIs.

Source code in filoma/directories/directory_profiler.py

def as_dict(self) -> Dict:
    """Return the same plain dict as :meth:`to_dict`.

    Kept as an alternative spelling for backward compatibility with
    dict-based APIs.
    """
    plain = self.to_dict()
    return plain

`from_dict(d)` `classmethod` ¶

Create a :class:DirectoryAnalysis from a plain dict.

Parameters¶

d : dict Dictionary in the shape produced by :meth:DirectoryProfiler.probe.

Returns¶

DirectoryAnalysis Constructed dataclass instance.

Source code in filoma/directories/directory_profiler.py

@classmethod
def from_dict(cls, d: Dict) -> "DirectoryAnalysis":
    """Build a :class:`DirectoryAnalysis` from a plain dict.

    Parameters
    ----------
    d : dict
        Dictionary in the shape produced by :meth:`DirectoryProfiler.probe`.
        Missing collection keys fall back to empty containers, a missing or
        falsy ``"path"`` becomes ``""``, and the optional entries default
        to ``None``.

    Returns
    -------
    DirectoryAnalysis
        Constructed dataclass instance.

    """
    field_values = {
        "path": d.get("path") or "",
        "summary": d.get("summary", {}),
        "file_extensions": d.get("file_extensions", {}),
        "common_folder_names": d.get("common_folder_names", {}),
        "empty_folders": d.get("empty_folders", []),
        "top_folders_by_file_count": d.get("top_folders_by_file_count", []),
        "depth_distribution": d.get("depth_distribution", {}),
        # Optional extras: absent keys simply become None.
        "dataframe": d.get("dataframe"),
        "timing": d.get("timing"),
        "dataframe_note": d.get("dataframe_note"),
    }
    return cls(**field_values)

`print_report(profiler=None)` ¶

Pretty-print the full report (summary + extras) via DirectoryProfiler.

This is an alias for print_summary + additional report sections; kept as a separate method name for discoverability and symmetry with other profilers in the project.

Source code in filoma/directories/directory_profiler.py

def print_report(self, profiler: "DirectoryProfiler | None" = None):
    """Pretty-print the full report for this analysis via DirectoryProfiler.

    When `profiler` is given, its `print_report` is used; otherwise a
    `DirectoryProfiler` built from a default `DirectoryProfilerConfig` is
    created for the call. Kept as a separate method name from
    `print_summary` for discoverability and symmetry with other profilers
    in the project.
    """
    printer = profiler if profiler is not None else DirectoryProfiler(DirectoryProfilerConfig())
    printer.print_report(self)

`print_summary(profiler=None)` ¶

Pretty-print a short summary using the rich-based DirectoryProfiler printer.

If profiler is provided it will be used (useful to customize show_progress, console, or other profiler settings); otherwise a default profiler is created.

Source code in filoma/directories/directory_profiler.py

def print_summary(self, profiler: "DirectoryProfiler | None" = None):
    """Pretty-print a short summary of this analysis via DirectoryProfiler.

    When `profiler` is given, its `print_summary` is used (useful to
    customize show_progress, console, or other profiler settings); otherwise
    a `DirectoryProfiler` built from a default `DirectoryProfilerConfig` is
    created for the call.
    """
    printer = profiler if profiler is not None else DirectoryProfiler(DirectoryProfilerConfig())
    printer.print_summary(self)

`to_df()` ¶

Return the attached DataFrame wrapper or log a helpful warning when absent.

This method used to silently return None when no DataFrame was built which often confused interactive users calling analysis.to_df(). We now log a warning explaining the likely causes (DataFrame building disabled or polars not installed) to surface actionable next steps.

Source code in filoma/directories/directory_profiler.py

def to_df(self) -> Optional["DataFrame"]:
    """Return the attached DataFrame wrapper, warning when there is none.

    If no DataFrame is attached, a warning naming the analysis path and the
    likely causes (DataFrame building disabled, or polars not installed) is
    logged and ``None`` is returned, so interactive callers get an
    actionable hint instead of a silent ``None``.
    """
    attached = self.dataframe
    if attached is None:
        # Surface next steps to the caller rather than failing silently.
        logger.warning(
            "No DataFrame available for analysis at path {path!s}. "
            "DataFrame building is disabled by default or 'polars' is not installed. "
            "Call DirectoryProfiler(build_dataframe=True) or use filoma.probe_to_df(...) to obtain a DataFrame.",
            path=self.path,
        )
    return attached

`to_dict()` ¶

Return a plain dict representation of this analysis.

Source code in filoma/directories/directory_profiler.py

def to_dict(self) -> Dict:
    """Return a plain ``dict`` representation of this analysis.

    The seven core fields are always present, in declaration order;
    ``dataframe``, ``timing`` and ``dataframe_note`` are appended only when
    they are not ``None``.
    """
    always_present = (
        "path",
        "summary",
        "file_extensions",
        "common_folder_names",
        "empty_folders",
        "top_folders_by_file_count",
        "depth_distribution",
    )
    payload = {key: getattr(self, key) for key in always_present}

    # Optional extras are omitted entirely rather than stored as None.
    for optional_key in ("dataframe", "timing", "dataframe_note"):
        value = getattr(self, optional_key)
        if value is not None:
            payload[optional_key] = value
    return payload

`DirectoryProfiler` ¶

Analyzes directory structures for basic statistics and patterns.

Provides file counts, folder patterns, empty directories, and extension analysis.

Can use either a pure Python implementation or a faster Rust implementation when available. Supports both sequential and parallel Rust processing.

Source code in filoma/directories/directory_profiler.py

class DirectoryProfiler:
    """Analyzes directory structures for basic statistics and patterns.

    Provides file counts, folder patterns, empty directories, and extension analysis.

    Can use either a pure Python implementation or a faster Rust implementation
    when available. Supports both sequential and parallel Rust processing.

    """

    def __init__(self, config: "DirectoryProfilerConfig") -> None:
        """Initialize the directory profiler.

        The profiler is configured with a `DirectoryProfilerConfig` instance which
        holds options such as whether to use Rust acceleration, parallel processing,
        fd integration, thresholding for parallelism, DataFrame building, and progress
        reporting callbacks. Pass a `DirectoryProfilerConfig` object as the single
        `config` argument. See `DirectoryProfilerConfig` for descriptions of each
        configurable field.

        Construction validates the config against the module-level availability
        flags, resolves ``search_backend`` (including ``"auto"``) into the
        instance flags ``use_rust`` / ``use_fd``, and copies the remaining
        options onto the instance.

        Raises
        ------
        TypeError
            If ``config``'s class is not named ``DirectoryProfilerConfig``.
        RuntimeError
            If Rust, parallel Rust, async Rust, fd or DataFrame support is
            requested but the matching availability flag is false.
        ValueError
            If network tuning values differ from their defaults while
            ``use_async`` is false, or ``threads`` is set while fd is neither
            requested via ``use_fd`` nor via ``search_backend='fd'``.

        """
        # Expect a DirectoryProfilerConfig object — no legacy kwargs supported.
        # The check compares the class *name*, not the class object itself.
        if not hasattr(config, "__class__") or config.__class__.__name__ != "DirectoryProfilerConfig":
            raise TypeError("DirectoryProfiler requires a DirectoryProfilerConfig instance as the sole argument")

        self.console = Console()
        self.config = config

        # Set simple aliases for common flags to preserve prior attribute names
        # Internal availability checks are still performed below.
        self.search_backend = config.search_backend
        self.parallel_threshold = config.parallel_threshold
        self._fast_path_only = config.fast_path_only
        self.progress_callback = config.progress_callback

        # Validate availability and enforce clear relationships
        # Use explicit booleans from the config. The checks run in this order,
        # so the first unmet requirement determines the error that is raised.
        if config.use_rust and not RUST_AVAILABLE:
            raise RuntimeError("Rust implementation requested but not available in this build")
        if config.use_parallel and not RUST_PARALLEL_AVAILABLE:
            raise RuntimeError("Parallel Rust requested but not available")
        if config.use_async and not RUST_ASYNC_AVAILABLE:
            raise RuntimeError("Async Rust prober requested but not available in this build")
        if config.use_fd and not FD_AVAILABLE:
            raise RuntimeError("fd integration requested but not available in this environment")
        if config.build_dataframe and not DATAFRAME_AVAILABLE:
            raise RuntimeError("DataFrame building requested but Polars/DataFrame support is not available")

        # Network args only apply when use_async is True (explicit)
        # Only validate if user has set custom network params (not using defaults).
        # "Custom" means any value differing from the literals compared below
        # (192 / 20000 / 0) — presumably the config's defaults; keep in sync.
        has_custom_network_params = config.network_concurrency != 192 or config.network_timeout_ms != 20000 or config.network_retries != 0
        if not config.use_async and has_custom_network_params:
            raise ValueError("Network tuning parameters only apply when use_async=True")

        # Threads only applies when use_fd is True or search_backend='fd'
        is_using_fd = config.use_fd or config.search_backend == "fd"
        if config.threads is not None and not is_using_fd:
            raise ValueError("'threads' setting only applies when use_fd=True or search_backend='fd'")

        # Decide which implementation to use based on search_backend and availability
        backend_choice = config.search_backend
        if backend_choice == "auto":
            # Honor explicit user preferences when provided.
            # If both backends are explicitly requested and available, prefer fd
            if config.use_fd and config.use_rust and FD_AVAILABLE and RUST_AVAILABLE:
                backend_choice = "fd"
            # If user explicitly requested Rust and it's available, use it
            elif config.use_rust and RUST_AVAILABLE:
                backend_choice = "rust"
            # If user explicitly requested fd and it's available, use it
            elif config.use_fd and FD_AVAILABLE:
                backend_choice = "fd"
            else:
                # No explicit preference from user -> auto-detect best available
                # For pure file discovery (fast_path_only), prefer python/os.walk
                if config.fast_path_only:
                    backend_choice = "python"
                elif RUST_AVAILABLE:
                    backend_choice = "rust"
                elif FD_AVAILABLE:
                    backend_choice = "fd"
                else:
                    backend_choice = "python"

        # Translate the resolved backend into the two mutually exclusive
        # instance flags; any other value leaves both off (Python backend).
        if backend_choice == "rust":
            self.use_rust = True
            self.use_fd = False
        elif backend_choice == "fd":
            self.use_rust = False
            self.use_fd = True
        else:
            self.use_rust = False
            self.use_fd = False

        # Parallel/async/other toggles come directly from config (already validated)
        # and are only kept on when the Rust backend was actually selected.
        self.use_parallel = bool(config.use_parallel and self.use_rust)
        self.use_async = bool(config.use_async and self.use_rust)

        # Other instance-level flags
        self.build_dataframe = bool(config.build_dataframe)
        self.return_absolute_paths = bool(config.return_absolute_paths)
        # Progress handling: progress bars are switched off when
        # _is_interactive_environment() reports an interactive session.
        if _is_interactive_environment() and config.show_progress:
            logger.debug("Interactive environment detected, disabling progress bars to avoid conflicts")
            self.show_progress = False
        else:
            self.show_progress = bool(config.show_progress)

        # Network tuning (only valid if use_async True)
        self.network_concurrency = config.network_concurrency
        self.network_timeout_ms = config.network_timeout_ms
        self.network_retries = config.network_retries

        # Threads forwarded to fd if using fd backend
        self.threads = config.threads if self.use_fd else None

        # Defer fd integration initialization until actually used
        self.fd_integration = None

    def is_rust_available(self) -> bool:
        """Report whether the Rust backend is both selected and available.

        Returns
        -------
            The instance's ``use_rust`` flag combined (``and``) with the
            module-level ``RUST_AVAILABLE`` flag

        """
        selected = self.use_rust
        return selected and RUST_AVAILABLE

    def is_parallel_available(self) -> bool:
        """Report whether parallel Rust processing is both enabled and available.

        Returns
        -------
            The instance's ``use_parallel`` flag combined (``and``) with the
            module-level ``RUST_PARALLEL_AVAILABLE`` flag

        """
        enabled = self.use_parallel
        return enabled and RUST_PARALLEL_AVAILABLE

    def is_fd_available(self) -> bool:
        """Report whether the fd backend is both selected and available.

        Returns
        -------
            The instance's ``use_fd`` flag combined (``and``) with the
            module-level ``FD_AVAILABLE`` flag

        """
        # FD_AVAILABLE reflects whether the fd integration package is importable;
        # tests may monkeypatch it without the fd binary being present.
        selected = self.use_fd
        return selected and FD_AVAILABLE

    def get_implementation_info(self) -> dict:
        """Describe which backends are available and which this instance uses.

        Returns
        -------
            Dictionary holding the module-level ``*_available`` flags followed
            by this instance's ``using_*`` flags, ``return_absolute_paths``,
            ``search_backend`` and ``python_fallback`` (true when neither the
            Rust nor the fd backend is selected)

        """
        # What the module-level availability flags report.
        info = {
            "rust_available": RUST_AVAILABLE,
            "rust_parallel_available": RUST_PARALLEL_AVAILABLE,
            "rust_async_available": RUST_ASYNC_AVAILABLE,
            "fd_available": FD_AVAILABLE,
            "dataframe_available": DATAFRAME_AVAILABLE,
        }

        # What this particular profiler instance was configured to use.
        async_in_use = bool(self.use_async and RUST_ASYNC_AVAILABLE)
        accelerated = self.use_rust or self.use_fd
        info.update(
            {
                "using_rust": self.use_rust,
                "using_parallel": self.use_parallel,
                "using_async": async_in_use,
                "using_fd": self.use_fd,
                "using_dataframe": self.build_dataframe,
                "return_absolute_paths": self.return_absolute_paths,
                "search_backend": self.search_backend,
                "python_fallback": not accelerated,
            }
        )
        return info

    def probe(self, path: str, max_depth: Optional[int] = None, threads: Optional[int] = None) -> "DirectoryAnalysis":
        """Analyze a directory tree and return comprehensive statistics.

        The backend is picked by ``_choose_backend()``; the raw result dict is
        augmented with a ``"timing"`` entry and converted to a
        :class:`DirectoryAnalysis`. Any exception from the backend is logged
        with the elapsed time and re-raised unchanged.

        Args:
        ----
            path: Path to the root directory to probe
            max_depth: Maximum depth to traverse (None for unlimited)
            threads: Optional override for number of threads when using fd backend

        Returns:
        -------
            A :class:`DirectoryAnalysis` instance containing analysis results

        """
        started_at = time.time()

        backend = self._choose_backend()
        impl_type = self._get_impl_display_name(backend)
        logger.info(f"Starting directory analysis of '{path}' using {impl_type} implementation")

        try:
            if backend == "rust":
                result = self._probe_rust(path, max_depth, fast_path_only=self._fast_path_only)
            elif backend == "fd":
                # An explicit ``threads`` argument wins over the instance setting.
                effective_threads = self.threads if threads is None else threads
                result = self._probe_fd(path, max_depth, threads=effective_threads)
            else:
                result = self._probe_python(path, max_depth)

            elapsed = time.time() - started_at
            summary = result["summary"]
            file_count = summary["total_files"]
            folder_count = summary["total_folders"]
            total_items = file_count + folder_count

            logger.success(
                f"Directory analysis completed in {elapsed:.2f}s - "
                f"Found {total_items:,} items ({file_count:,} files, "
                f"{folder_count:,} folders) using {impl_type}"
            )

            # Attach timing details; throughput is reported as 0 when no
            # positive elapsed time was measured.
            result["timing"] = {
                "elapsed_seconds": elapsed,
                "implementation": impl_type,
                "items_per_second": (total_items / elapsed if elapsed > 0 else 0),
            }

            # Hand back the structured dataclass rather than the raw dict.
            return DirectoryAnalysis.from_dict(result)

        except Exception as exc:
            elapsed = time.time() - started_at
            logger.error(f"Directory analysis failed after {elapsed:.2f}s: {str(exc)}")
            raise

    def _choose_backend(self) -> str:
        """Pick the backend to run, given the instance flags and availability.

        With ``search_backend == "auto"`` the order of preference is Rust, then
        fd, then Python, each requiring both its instance flag and its
        module-level availability flag. An explicitly requested ``"fd"`` or
        ``"rust"`` backend that cannot be used logs a warning and resolves to
        ``"python"``. Any other value also resolves to ``"python"``.

        Returns
        -------
            Backend name: "fd", "rust", or "python"

        """
        requested = self.search_backend

        if requested == "auto":
            # Neither accelerated backend was enabled by the resolved
            # preferences, so there is nothing to choose between.
            if not (self.use_rust or self.use_fd):
                return "python"
            # Based on cold cache benchmarks Rust tends to be the fastest
            # general-purpose backend, so it is tried before fd.
            if self.use_rust and RUST_AVAILABLE:
                return "rust"
            if self.use_fd and FD_AVAILABLE:
                return "fd"
            return "python"

        if requested == "fd":
            if self.use_fd and FD_AVAILABLE:
                return "fd"
            logger.warning("fd backend requested but not available, falling back to auto selection")
        elif requested == "rust":
            if self.use_rust:
                return "rust"
            logger.warning("Rust backend requested but not available, falling back to auto selection")

        # Explicit "python", an unusable explicit request, or an unknown value.
        return "python"

    def _get_impl_display_name(self, backend: str) -> str:
        """Map a backend name to the label used in log messages and timing info.

        ``"fd"`` and ``"rust"`` get dedicated labels (the Rust label also says
        whether parallel mode is enabled and available); every other value is
        labelled as Python.
        """
        if backend == "fd":
            return "🔍 fd"
        if backend != "rust":
            return "🐍 Python"

        parallel_active = self.use_parallel and RUST_PARALLEL_AVAILABLE
        return "🦀 Rust (Parallel)" if parallel_active else "🦀 Rust (Sequential)"

    def _probe_fd(self, path: str, max_depth: Optional[int] = None, threads: Optional[int] = None) -> Dict:
        """Use fd for file discovery + Python for analysis.

        This hybrid approach leverages fd's ultra-fast file discovery
        while using Python for statistical analysis to maintain
        consistency with other backends.
        """
        # Lazily initialize fd integration here. This ensures tests that
        # monkeypatch FD_AVAILABLE can control availability without the
        # constructor eagerly probing the environment.
        if self.fd_integration is None:
            # If the fd integration package wasn't importable at module
            # import time, reflect that now.
            if not FD_AVAILABLE:
                raise RuntimeError("fd integration not available")
            try:
                self.fd_integration = FdIntegration()
                if not self.fd_integration.is_available():
                    # fd binary is not usable on this system
                    self.fd_integration = None
                    raise RuntimeError("fd integration not available")
            except Exception:
                self.fd_integration = None
                raise RuntimeError("fd integration not available")

        progress = None
        task_id = None

        if self.show_progress:
            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Discovering files with fd..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,
            )
            progress.start()
            task_id = progress.add_task("Discovering...", total=None)

        # Run the fd discovery and analysis inside a try so we always stop
        # the progress bar in the finally block below.
        try:
            # Use fd to get all files and directories rapidly
            if progress and task_id is not None:
                progress.update(task_id, description="[bold blue]Finding files...")

            # fd's --max-depth applies to the matched path; to match the
            # Python/Rust semantics where files up to depth (max_depth + 1)
            # are included, when a max_depth is provided for the probe we
            # increase the file search depth by 1.
            file_max_depth = None if max_depth is None else max_depth + 1
            # When using fd in auto mode, prefer flags that match a raw
            # traversal (include hidden files, don't honor ignore files, but don't follow symlinks)
            fd_find_kwargs: dict = {
                "path": path,
                "file_types": ["f"],
                "max_depth": file_max_depth,
                "absolute_paths": self.return_absolute_paths,
                "threads": threads,
            }
            if self.search_backend == "auto" or self.config.fd_no_ignore:
                fd_find_kwargs.update({"search_hidden": True, "no_ignore": True, "follow_links": False})

            all_files = self.fd_integration.find(**fd_find_kwargs)

            if progress and task_id is not None:
                progress.update(task_id, description="[bold blue]Finding directories...")

            all_dirs = self.fd_integration.find(
                path=path,
                file_types=["d"],  # Directories only
                max_depth=max_depth,
                absolute_paths=self.return_absolute_paths,
                threads=threads,
                search_hidden=True if self.search_backend == "auto" else False,
                no_ignore=True if self.search_backend == "auto" else False,
                follow_links=False,  # Don't follow symlinks by default
            )

            # Convert to Path objects for analysis
            root_path_obj = Path(path).resolve()
            all_paths = [Path(p) for p in all_files + all_dirs]

            # If DataFrame building is enabled and DataFrame support is available,
            # build a prebuilt DataFrame from the fd results and pass it to the
            # Python probing logic to avoid rebuilding the DataFrame there.
            prebuilt_df = None
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                try:
                    prebuilt_df = DataFrame([str(p) for p in all_paths])
                except Exception:
                    # If DataFrame construction fails for any reason, fall back
                    # to letting _probe_paths_python collect paths itself.
                    prebuilt_df = None

            if progress and task_id is not None:
                progress.update(task_id, description="[bold yellow]Analyzing discovered files...")
                progress.update(task_id, total=100, completed=50)

                # Now probe the discovered paths using Python logic
                # Pass the existing progress to avoid conflicts. If a prebuilt DataFrame
                # exists, provide it to avoid rebuilding the DataFrame inside the probe.
                result = self._probe_paths_python(
                    root_path_obj,
                    all_paths,
                    max_depth,
                    existing_progress=progress,
                    existing_task_id=task_id,
                    prebuilt_dataframe=prebuilt_df,
                )
            else:
                # No progress provided; run probe without progress integration
                result = self._probe_paths_python(
                    root_path_obj,
                    all_paths,
                    max_depth,
                    existing_progress=None,
                    existing_task_id=None,
                    prebuilt_dataframe=prebuilt_df,
                )

            if progress and task_id is not None:
                progress.update(task_id, description="[bold green]Analysis complete!")
                progress.update(task_id, completed=100)

            return result

        finally:
            if progress:
                progress.stop()

    def sample_paths(self, path: str, sample_size: int = 20) -> Dict[str, List[str]]:
        """Return small samples of paths for quick backend-diffing.

        The result always has the keys ``'fd_files'``, ``'fd_dirs'`` and
        ``'python_files'``. Rust currently does not expose a path list in the
        public API so it is omitted (you can re-run the Rust prober separately
        if needed).

        Args:
        ----
            path: Directory to sample from.
            sample_size: Maximum number of entries per sample list. Zero or a
                negative value yields an empty ``'python_files'`` sample.

        Returns:
        -------
            Dict mapping each sample name to a list of path strings. A sample
            is left empty when its backend is unavailable or raised.

        """
        samples: Dict[str, List[str]] = {"fd_files": [], "fd_dirs": [], "python_files": []}

        # fd samples: best effort, any failure leaves both fd lists empty.
        try:
            if FD_AVAILABLE:
                fd = FdIntegration()
                # Same raw-traversal flags for files and directories so the
                # two samples are directly comparable.
                shared_kwargs = {
                    "path": path,
                    "max_results": sample_size,
                    "search_hidden": True,
                    "no_ignore": True,
                    "follow_links": False,
                    "absolute_paths": self.return_absolute_paths,
                }
                samples["fd_files"] = fd.find(file_types=["f"], **shared_kwargs)
                samples["fd_dirs"] = fd.find(file_types=["d"], **shared_kwargs)
        except Exception:
            samples["fd_files"] = []
            samples["fd_dirs"] = []

        # Python sample: walk lazily and stop as soon as enough files are found.
        try:
            python_files: List[str] = []
            # Guard up front so a non-positive sample_size never collects a file.
            if sample_size > 0:
                for candidate in Path(path).rglob("*"):
                    if candidate.is_file():
                        python_files.append(str(candidate.resolve()))
                        if len(python_files) >= sample_size:
                            break
            samples["python_files"] = python_files
        except Exception:
            samples["python_files"] = []

        return samples

    def _probe_paths_python(
        self,
        path_root: Path,
        all_paths: List[Path],
        max_depth: Optional[int] = None,
        existing_progress=None,
        existing_task_id=None,
        prebuilt_dataframe=None,
    ) -> Dict:
        """Analyze pre-discovered paths using Python logic.

        This method takes a list of paths (from fd or other source) and performs
        the statistical analysis to maintain consistency with the Python backend.

        Args:
        ----
            path: Root directory being probed
            all_paths: List of paths to probe
            max_depth: Maximum depth for analysis
            existing_progress: Existing progress bar to reuse (avoids conflicts)
            existing_task_id: Existing task ID to update
            path_root: The resolved root Path for the probe (used for depth calculations)
            prebuilt_dataframe: Optional DataFrame supplied to avoid rebuilding inside probe

        """
        # Initialize counters and collections
        file_count = 0
        folder_count = 1  # Start with 1 to count the root directory itself
        total_size = 0
        empty_folders = []
        file_extensions = Counter()
        folder_names = Counter()
        files_per_folder = defaultdict(int)
        depth_stats = defaultdict(int)

        # Count the root directory at depth 0
        depth_stats[0] = 1

        # Collection for DataFrame if enabled. If a prebuilt_dataframe is provided
        # (e.g. from fd results), skip collecting paths and attach it at the end.
        dataframe_paths = [] if (self.build_dataframe and prebuilt_dataframe is None) else None

        # Sort paths for better progress indication (guard against None or unsortable lists)
        if all_paths:
            try:
                all_paths.sort()
            except Exception:
                # If sorting fails (e.g., mixed types), ignore and proceed
                pass

        progress = existing_progress
        task_id = existing_task_id
        processed_items = 0
        progress_owned = False  # Track if we own the progress bar

        if self.show_progress and existing_progress is None:
            # Only create new progress if none was provided
            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Analyzing file metadata..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TextColumn("({task.completed:,}/{task.total:,} items)"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,
            )
            progress.start()
            task_id = progress.add_task("Analyzing...", total=len(all_paths))
            progress_owned = True
        elif existing_progress and existing_task_id:
            # Update existing progress for the analysis phase
            existing_progress.update(
                existing_task_id,
                description="[bold yellow]Analyzing file metadata...",
                total=len(all_paths),
                completed=0,
            )

        try:
            for current_path in all_paths:
                processed_items += 1

                # Update progress
                if progress and task_id is not None:
                    if processed_items % 100 == 0:
                        progress.update(task_id, completed=processed_items)

                    if self.progress_callback:
                        self.progress_callback(
                            f"Processing: {current_path.name}",
                            processed_items,
                            len(all_paths),
                        )

                # Calculate current depth
                try:
                    depth = len(current_path.relative_to(path_root).parts)
                except ValueError:
                    depth = 0

                # Skip if beyond max depth (should not happen with fd filtering, but safety check)
                if max_depth is not None:
                    if current_path.is_dir() and depth > max_depth:
                        continue
                    elif current_path.is_file() and depth > max_depth + 1:
                        continue

                # Add to paths collection if DataFrame is enabled and we're collecting paths
                if self.build_dataframe and dataframe_paths is not None:
                    dataframe_paths.append(str(current_path))

                if current_path.is_dir():
                    depth_stats[depth] += 1
                    folder_count += 1

                    # Check for empty folders
                    try:
                        if not any(current_path.iterdir()):
                            empty_folders.append(str(current_path))
                    except (OSError, PermissionError):
                        pass

                    # Analyze folder names for patterns
                    folder_names[current_path.name] += 1

                elif current_path.is_file():
                    file_count += 1

                    # Count files in parent directory
                    files_per_folder[str(current_path.parent)] += 1

                    # Get file extension
                    ext = current_path.suffix.lower()
                    if ext:
                        file_extensions[ext] += 1
                    else:
                        file_extensions["<no extension>"] += 1

                    # Add to total size
                    try:
                        total_size += current_path.stat().st_size
                    except (OSError, IOError):
                        pass

            # Final progress update
            if progress and task_id is not None:
                progress.update(task_id, completed=processed_items)

            # Calculate summary statistics
            avg_files_per_folder = file_count / max(1, folder_count)

            # Find folders with most files
            top_folders_by_file_count = sorted(files_per_folder.items(), key=lambda x: x[1], reverse=True)[:10]

            # Build result dictionary
            result = {
                "path": str(path_root),
                "summary": {
                    "total_files": file_count,
                    "total_folders": folder_count,
                    "total_size_bytes": total_size,
                    "total_size_mb": round(total_size / (1024 * 1024), 2),
                    "avg_files_per_folder": round(avg_files_per_folder, 2),
                    "max_depth": max(depth_stats.keys()) if depth_stats else 0,
                    "empty_folder_count": len(empty_folders),
                },
                "file_extensions": dict(file_extensions.most_common(20)),
                "common_folder_names": dict(folder_names.most_common(20)),
                "empty_folders": empty_folders,
                "top_folders_by_file_count": top_folders_by_file_count,
                "depth_distribution": dict(depth_stats),
            }

            # Add DataFrame if enabled
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                if prebuilt_dataframe is not None:
                    # Use prebuilt DataFrame supplied by caller (fd results)
                    result["dataframe"] = prebuilt_dataframe
                else:
                    result["dataframe"] = DataFrame(dataframe_paths)

            return result

        finally:
            if progress and progress_owned:
                progress.stop()

    def _probe_rust(self, path: str, max_depth: Optional[int] = None, fast_path_only: bool = False) -> Dict:
        """Use the Rust implementation for analysis.

        For performance, the main statistical analysis is done in Rust.
        If DataFrame building is enabled, file paths are collected separately
        using Python/pathlib to maintain consistency with the Python implementation.

        Args:
        ----
            path: Root directory to probe.
            max_depth: Optional depth limit forwarded to the Rust prober and
                applied to the paths collected for the DataFrame.
            fast_path_only: Forwarded unchanged to the Rust prober.

        Returns:
        -------
            The dict returned by the Rust prober, extended with ``dataframe``
            (and ``dataframe_note`` when some paths could not be read) when
            DataFrame building is enabled and available.

        """
        progress = None
        task_id = None

        if self.show_progress:
            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Analyzing directory structure..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,  # Remove progress bar when done
            )
            progress.start()
            task_id = progress.add_task("Analyzing...", total=None)

        try:
            # Filesystem detection is best effort; unknown means "not network".
            try:
                fs_type = self._detect_filesystem_type(path)
            except Exception:
                fs_type = None

            is_network_fs = False
            if fs_type:
                # Common network FS types
                lowered = fs_type.lower()
                is_network_fs = any(marker in lowered for marker in ("nfs", "cifs", "smb", "ceph", "gluster", "sshfs"))

            # Traversal flags are decided once so every Rust variant receives
            # the same values. In 'auto' mode: scan hidden/ignored entries but
            # don't follow symlinks; otherwise pass None for all three.
            if self.search_backend == "auto":
                traversal_flags = {"follow_links": False, "search_hidden": True, "no_ignore": True}
            else:
                traversal_flags = {"follow_links": None, "search_hidden": None, "no_ignore": None}

            # Only use the async Rust variant when the path looks like a network
            # filesystem AND the user explicitly enabled async via `use_async`
            # AND the async build is available. Every other case falls back to
            # the parallel variant when enabled/available, else sequential.
            if is_network_fs and self.use_async and RUST_ASYNC_AVAILABLE:
                result = probe_directory_rust_async(
                    path,
                    max_depth,
                    self.network_concurrency,
                    self.network_timeout_ms,
                    self.network_retries,
                    fast_path_only,
                    **traversal_flags,
                )
            elif self.use_parallel and RUST_PARALLEL_AVAILABLE:
                result = probe_directory_rust_parallel(
                    path,
                    max_depth,
                    self.parallel_threshold,
                    fast_path_only,
                    **traversal_flags,
                )
            else:
                result = probe_directory_rust(
                    path,
                    max_depth,
                    fast_path_only,
                    **traversal_flags,
                )

            # Update progress to show completion
            if progress and task_id is not None:
                progress.update(task_id, description="[bold green]Analysis complete!")
                progress.update(task_id, total=100, completed=100)

            # Rust now returns absolute (or canonicalized when follow_links=True) paths,
            # so Python-side normalization is no longer necessary here.

            # If DataFrame building is enabled, we need to collect file paths
            # since the Rust implementation doesn't return them
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                if progress and task_id is not None:
                    progress.update(task_id, description="[bold yellow]Building DataFrame...")

                root_path_obj = Path(path)
                all_paths = []
                permission_errors_encountered = False

                # Collect paths using Python (pathlib) with error handling for system directories.
                # NOTE(review): files and directories share the same `depth > max_depth`
                # cut-off here, while the Python probes keep files up to max_depth + 1.
                # Confirm which rule the DataFrame is meant to follow.
                try:
                    for current_path in root_path_obj.rglob("*"):
                        try:
                            # Calculate current depth
                            depth = len(current_path.relative_to(root_path_obj).parts)

                            # Skip if beyond max depth
                            if max_depth is not None and depth > max_depth:
                                continue

                            all_paths.append(str(current_path))
                        except (ValueError, OSError):
                            # Skip paths that can't be accessed or processed
                            permission_errors_encountered = True
                            continue
                except OSError:
                    # If rglob fails entirely, provide DataFrame with whatever we collected
                    self.console.print("[yellow]Warning: Some paths couldn't be accessed for DataFrame building[/yellow]")
                    logger.warning(f"DataFrame building encountered permission errors on {path}, providing partial results")
                    permission_errors_encountered = True

                # Add DataFrame to the result (may be partial if there were permission errors)
                result["dataframe"] = DataFrame(all_paths)
                if permission_errors_encountered:
                    # Add a note only if we actually encountered permission errors
                    result["dataframe_note"] = "DataFrame may be incomplete due to permission restrictions"

                if progress and task_id is not None:
                    progress.update(task_id, description="[bold green]DataFrame built!")

            return result

        finally:
            if progress:
                progress.stop()

    def _probe_python(self, path: str, max_depth: Optional[int] = None) -> Dict:
        """Pure Python implementation with enhanced DataFrame support and progress indication."""
        path_root = Path(path)
        if not path_root.exists():
            raise ValueError(f"Path does not exist: {path_root}")
        if not path_root.is_dir():
            raise ValueError(f"Path is not a directory: {path_root}")

        # Initialize counters and collections
        file_count = 0
        folder_count = 1  # Start with 1 to count the root directory itself
        total_size = 0
        empty_folders = []
        file_extensions = Counter()
        folder_names = Counter()
        files_per_folder = defaultdict(int)
        depth_stats = defaultdict(int)

        # Count the root directory at depth 0
        depth_stats[0] = 1

        # Collection for DataFrame if enabled
        all_paths = [] if self.build_dataframe else None

        # Estimate total items for progress tracking
        progress = None
        task_id = None
        total_items = None
        processed_items = 0

        if self.show_progress:
            # Quick estimation pass
            total_items = sum(1 for _ in path_root.rglob("*"))

            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Analyzing directory structure..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TextColumn("({task.completed:,}/{task.total:,} items)"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,
            )
            progress.start()
            task_id = progress.add_task("Analyzing...", total=total_items)

        try:
            # Walk through directory tree using pathlib for consistency
            try:
                for current_path in path_root.rglob("*"):
                    try:
                        processed_items += 1

                        # Update progress
                        if progress and task_id is not None:
                            if processed_items % 100 == 0:  # Update every 100 items for performance
                                progress.update(task_id, completed=processed_items)

                            # Call custom progress callback if provided
                            if self.progress_callback:
                                self.progress_callback(
                                    f"Processing: {current_path.name}",
                                    processed_items,
                                    total_items or 0,
                                )

                        # Calculate current depth
                        try:
                            depth = len(current_path.relative_to(path_root).parts)
                        except ValueError:
                            depth = 0

                        # Skip if beyond max depth (match Rust implementation logic)
                        if max_depth is not None:
                            try:
                                if current_path.is_dir() and depth > max_depth:
                                    continue
                                elif current_path.is_file() and depth > max_depth + 1:
                                    continue
                            except (OSError, PermissionError):
                                # Skip paths we can't access for depth checking
                                continue

                        # Add to paths collection if DataFrame is enabled
                        if self.build_dataframe and all_paths is not None:
                            all_paths.append(str(current_path))

                        try:
                            is_dir = current_path.is_dir()
                            is_file = current_path.is_file()
                        except (OSError, PermissionError):
                            # Skip paths we can't determine type for
                            continue

                        if is_dir:
                            depth_stats[depth] += 1
                            folder_count += 1

                            # Check for empty folders
                            try:
                                if not any(current_path.iterdir()):
                                    empty_folders.append(str(current_path))
                            except (OSError, PermissionError):
                                # Skip directories we can't read
                                pass

                            # Analyze folder names for patterns
                            folder_names[current_path.name] += 1

                        elif is_file:
                            file_count += 1

                            # Count files in parent directory
                            files_per_folder[str(current_path.parent)] += 1

                            # Get file extension
                            ext = current_path.suffix.lower()
                            if ext:
                                file_extensions[ext] += 1
                            else:
                                file_extensions["<no extension>"] += 1

                            # Add to total size
                            try:
                                total_size += current_path.stat().st_size
                            except (OSError, IOError):
                                # Skip files we can't stat (permissions, broken symlinks, etc.)
                                pass

                    except (OSError, PermissionError):
                        # Skip individual files/directories we can't access
                        continue

            except (OSError, PermissionError):
                # If rglob fails entirely, we can't probe this directory
                self.console.print(f"[yellow]Warning: Cannot access directory {path_root} - insufficient permissions[/yellow]")
                # Return minimal result
                return {
                    "path": str(path_root),
                    "summary": {
                        "total_files": 0,
                        "total_folders": 0,
                        "total_size_bytes": 0,
                        "total_size_mb": 0.0,
                        "avg_files_per_folder": 0.0,
                        "max_depth": 0,
                        "empty_folder_count": 0,
                    },
                    "file_extensions": {},
                    "common_folder_names": {},
                    "empty_folders": [],
                    "top_folders_by_file_count": [],
                    "depth_distribution": {},
                    "timing": {"error": "Permission denied"},
                }

            # Final progress update
            if progress and task_id is not None:
                progress.update(task_id, completed=processed_items)

            # Calculate summary statistics
            avg_files_per_folder = file_count / max(1, folder_count)

            # Find folders with most files
            top_folders_by_file_count = sorted(files_per_folder.items(), key=lambda x: x[1], reverse=True)[:10]

            # Build result dictionary
            result = {
                "path": str(path_root),
                "summary": {
                    "total_files": file_count,
                    "total_folders": folder_count,
                    "total_size_bytes": total_size,
                    "total_size_mb": round(total_size / (1024 * 1024), 2),
                    "avg_files_per_folder": round(avg_files_per_folder, 2),
                    "max_depth": max(depth_stats.keys()) if depth_stats else 0,
                    "empty_folder_count": len(empty_folders),
                },
                "file_extensions": dict(file_extensions.most_common(20)),
                "common_folder_names": dict(folder_names.most_common(20)),
                "empty_folders": empty_folders,
                "top_folders_by_file_count": top_folders_by_file_count,
                "depth_distribution": dict(depth_stats),
            }

            # Add DataFrame if enabled
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                result["dataframe"] = DataFrame(all_paths)

            return result

        finally:
            if progress:
                progress.stop()

    def print_summary(self, analysis: "DirectoryAnalysis"):
        """Print a summary of the directory analysis (expects DirectoryAnalysis).

        Renders one table with the summary metrics, an optional DataFrame row
        count and, when timing data is present, the elapsed time and
        processing speed.

        Args:
        ----
            analysis: :class:`DirectoryAnalysis` instance to summarize

        Raises:
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_summary expects a DirectoryAnalysis instance")

        summary = analysis.summary
        timing = analysis.timing or {}
        # Timing dicts are not guaranteed to carry 'elapsed_seconds' (an
        # error-only timing dict does not), so read it once with a default
        # and use the same value for the title and the table row.
        elapsed_seconds = timing.get("elapsed_seconds", 0)
        has_dataframe = self.build_dataframe and analysis.dataframe is not None

        # Show which implementation was used with more detail
        impl_type = timing.get("implementation", "Unknown")

        # Add DataFrame indicator
        if has_dataframe:
            impl_type += " + 📊 DataFrame"

        # Main summary table
        title = f"Directory Analysis: {analysis.path} ({impl_type})"
        if timing:
            title += f" - {elapsed_seconds:.2f}s"

        table = Table(title=title)
        table.add_column("Metric", style="bold cyan")
        table.add_column("Value", style="white")

        table.add_row("Total Files", f"{summary['total_files']:,}")
        table.add_row("Total Folders", f"{summary['total_folders']:,}")
        table.add_row("Total Size", f"{summary['total_size_mb']:,} MB")
        table.add_row("Average Files per Folder", str(summary["avg_files_per_folder"]))
        table.add_row("Maximum Depth", str(summary["max_depth"]))
        table.add_row("Empty Folders", str(summary["empty_folder_count"]))

        # Add DataFrame info if available
        if has_dataframe:
            df = analysis.dataframe
            table.add_row("DataFrame Rows", f"{len(df):,}")

        # Add timing information if available
        if timing:
            table.add_row("Analysis Time", f"{elapsed_seconds:.2f}s")
            items_per_second = timing.get("items_per_second", 0)
            if items_per_second > 0:
                table.add_row("Processing Speed", f"{items_per_second:,.0f} items/sec")

        self.console.print(table)
        self.console.print()

    def get_dataframe(self, analysis: "DirectoryAnalysis") -> Optional["DataFrame"]:
        """Return the DataFrame held by an analysis result.

        Args:
        ----
            analysis: :class:`DirectoryAnalysis` instance to read from

        Returns:
        -------
            Whatever ``analysis.to_df()`` returns: the DataFrame object if
            available, None otherwise

        Raises:
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if isinstance(analysis, DirectoryAnalysis):
            return analysis.to_df()
        raise TypeError("get_dataframe expects a DirectoryAnalysis instance")

    def is_dataframe_enabled(self) -> bool:
        """Report whether this profiler will build DataFrames.

        Returns
        -------
            True only when DataFrame building was requested on this profiler
            and DataFrame support is available (``DATAFRAME_AVAILABLE``);
            False otherwise

        """
        # Coerce so the declared ``bool`` return type holds even when one of
        # the operands is a truthy/falsy value that is not a real bool.
        return bool(self.build_dataframe and DATAFRAME_AVAILABLE)

    def _detect_filesystem_type(self, path: str) -> Optional[str]:
        """Best-effort detection of the filesystem type that holds ``path``.

        Reads ``/proc/mounts`` and selects the entry whose mount point is the
        longest ancestor of (or equal to) the absolute form of ``path``.

        Args:
        ----
            path: filesystem path to look up; it is made absolute before matching

        Returns:
        -------
            The fs type string (e.g., 'nfs', 'ext4'), or None when
            ``/proc/mounts`` cannot be read or no mount point matches

        """
        import os

        # Octal escapes used for special characters inside /proc/mounts fields.
        # The backslash escape is decoded last so that it cannot produce a new
        # escape sequence that would then be decoded a second time.
        escapes = (("\\040", " "), ("\\011", "\t"), ("\\012", "\n"), ("\\134", "\\"))

        try:
            target = os.path.abspath(path)
            best_len = -1
            best_type = None

            with open("/proc/mounts", "r") as f:
                for line in f:
                    parts = line.split()
                    if len(parts) < 3:
                        continue

                    mount_point, fs_type = parts[1], parts[2]
                    for escaped, plain in escapes:
                        mount_point = mount_point.replace(escaped, plain)

                    # Match on whole path components only, so that a mount at
                    # "/mnt/data" does not claim "/mnt/data2/file".
                    prefix = mount_point if mount_point.endswith("/") else mount_point + "/"
                    if target != mount_point and not target.startswith(prefix):
                        continue

                    # ">=" lets a later entry for the same mount point replace
                    # an earlier one: the most recent mount shadows older ones.
                    if len(mount_point) >= best_len:
                        best_len = len(mount_point)
                        best_type = fs_type

            return best_type or None

        except (OSError, ValueError, TypeError):
            # Deliberately best-effort: no /proc (non-Linux), unreadable or
            # undecodable file, or an unusable ``path`` all mean "not detected".
            return None

    def print_file_extensions(self, analysis: "DirectoryAnalysis", top_n: int = 10):
        """Render a table of file extensions with their counts and share of files.

        Shows the first ``top_n`` entries of ``analysis.file_extensions`` in
        iteration order; nothing is printed when that mapping is empty.

        Args:
        ----
            analysis: the :class:`DirectoryAnalysis` to report on
            top_n: maximum number of extension rows to show

        Raises:
        ------
            TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_file_extensions expects a DirectoryAnalysis instance")

        ext_counts = analysis.file_extensions
        if not ext_counts:
            return

        ext_table = Table(title="File Extensions")
        column_specs = (
            ("Extension", "bold magenta"),
            ("Count", "white"),
            ("Percentage", "green"),
        )
        for header, column_style in column_specs:
            ext_table.add_column(header, style=column_style)

        file_total = analysis.summary["total_files"]
        shown_entries = list(ext_counts.items())[:top_n]

        for extension, n_files in shown_entries:
            # Guard against division by zero when no files were counted.
            if file_total > 0:
                share = n_files / file_total * 100
            else:
                share = 0
            ext_table.add_row(extension, f"{n_files:,}", f"{share:.1f}%")

        self.console.print(ext_table)
        self.console.print()

    def print_folder_patterns(self, analysis: "DirectoryAnalysis", top_n: int = 10):
        """Render a table of folder names and how often each one occurs.

        Shows the first ``top_n`` entries of ``analysis.common_folder_names``
        in iteration order; nothing is printed when that mapping is empty.

        Args:
        ----
            analysis: the :class:`DirectoryAnalysis` to report on
            top_n: maximum number of folder-name rows to show

        Raises:
        ------
            TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_folder_patterns expects a DirectoryAnalysis instance")

        name_counts = analysis.common_folder_names
        if not name_counts:
            return

        shown_entries = list(name_counts.items())[:top_n]

        names_table = Table(title="Common Folder Names")
        names_table.add_column("Folder Name", style="bold blue")
        names_table.add_column("Occurrences", style="white")

        for folder_name, occurrences in shown_entries:
            names_table.add_row(folder_name, f"{occurrences:,}")

        self.console.print(names_table)
        self.console.print()

    def print_empty_folders(self, analysis: "DirectoryAnalysis", max_show: int = 20):
        """Render a table listing empty folders from the analysis.

        Prints a short success message instead of a table when
        ``analysis.empty_folders`` is empty. When there are more entries than
        ``max_show``, a final row states how many were left out.

        Args:
        ----
            analysis: the :class:`DirectoryAnalysis` to report on
            max_show: maximum number of folder rows to show

        Raises:
        ------
            TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_empty_folders expects a DirectoryAnalysis instance")

        empties = analysis.empty_folders
        if not empties:
            self.console.print("[green]✓ No empty folders found![/green]")
            return

        total_empty = len(empties)
        shown_count = min(total_empty, max_show)

        empties_table = Table(title=f"Empty Folders (showing {shown_count} of {total_empty})")
        empties_table.add_column("Path", style="yellow")

        for folder_path in empties[:max_show]:
            empties_table.add_row(folder_path)

        # Summarize whatever did not fit within max_show rows.
        hidden_count = total_empty - max_show
        if hidden_count > 0:
            empties_table.add_row(f"... and {hidden_count} more")

        self.console.print(empties_table)
        self.console.print()

    def print_report(self, analysis: "DirectoryAnalysis"):
        """Print every report section for a directory analysis.

        Runs, in order, the summary, file-extension, folder-pattern and
        empty-folder printers on the same :class:`DirectoryAnalysis` instance.
        Use :meth:`to_dict` if you need a plain dict shape for downstream
        tooling.

        Raises a ``TypeError`` when ``analysis`` is not a
        :class:`DirectoryAnalysis`.
        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_report expects a DirectoryAnalysis instance")

        section_printers = (
            "print_summary",
            "print_file_extensions",
            "print_folder_patterns",
            "print_empty_folders",
        )
        # Look each printer up right before calling it, one section at a time.
        for printer_name in section_printers:
            getattr(self, printer_name)(analysis)

`__init__(config)` ¶

Initialize the directory profiler.

The profiler is configured with a DirectoryProfilerConfig instance which holds options such as whether to use Rust acceleration, parallel processing, fd integration, thresholding for parallelism, DataFrame building, and progress reporting callbacks. Pass a DirectoryProfilerConfig object as the single config argument. See DirectoryProfilerConfig for descriptions of each configurable field.

Source code in filoma/directories/directory_profiler.py

def __init__(self, config: "DirectoryProfilerConfig"):
    """Initialize the directory profiler.

    The profiler is configured with a `DirectoryProfilerConfig` instance which
    holds options such as whether to use Rust acceleration, parallel processing,
    fd integration, thresholding for parallelism, DataFrame building, and progress
    reporting callbacks. Pass a `DirectoryProfilerConfig` object as the single
    `config` argument. See `DirectoryProfilerConfig` for descriptions of each
    configurable field.

    Args:
    ----
        config: object whose class is named ``DirectoryProfilerConfig``

    Raises:
    ------
        TypeError: if the class of ``config`` is not named
            ``DirectoryProfilerConfig``
        RuntimeError: if a requested capability (Rust, parallel Rust, async
            Rust, fd, or DataFrame building) is flagged as unavailable
        ValueError: if network tuning values differ from their defaults while
            ``use_async`` is False, or ``threads`` is set while neither
            ``use_fd`` nor ``search_backend='fd'`` is selected

    """
    # Expect a DirectoryProfilerConfig object — no legacy kwargs supported.
    # The check compares the class *name* rather than using isinstance, so any
    # class with exactly this name is accepted.
    if not hasattr(config, "__class__") or config.__class__.__name__ != "DirectoryProfilerConfig":
        raise TypeError("DirectoryProfiler requires a DirectoryProfilerConfig instance as the sole argument")

    self.console = Console()
    self.config = config

    # Set simple aliases for common flags to preserve prior attribute names
    # Internal availability checks are still performed below.
    self.search_backend = config.search_backend
    self.parallel_threshold = config.parallel_threshold
    self._fast_path_only = config.fast_path_only
    self.progress_callback = config.progress_callback

    # Validate availability and enforce clear relationships
    # Use explicit booleans from the config
    # The order of these checks decides which error is reported when several
    # unavailable features are requested at once.
    if config.use_rust and not RUST_AVAILABLE:
        raise RuntimeError("Rust implementation requested but not available in this build")
    if config.use_parallel and not RUST_PARALLEL_AVAILABLE:
        raise RuntimeError("Parallel Rust requested but not available")
    if config.use_async and not RUST_ASYNC_AVAILABLE:
        raise RuntimeError("Async Rust prober requested but not available in this build")
    if config.use_fd and not FD_AVAILABLE:
        raise RuntimeError("fd integration requested but not available in this environment")
    if config.build_dataframe and not DATAFRAME_AVAILABLE:
        raise RuntimeError("DataFrame building requested but Polars/DataFrame support is not available")

    # Network args only apply when use_async is True (explicit)
    # Only validate if user has set custom network params (not using defaults)
    # The literals below must match the DirectoryProfilerConfig field defaults.
    has_custom_network_params = config.network_concurrency != 192 or config.network_timeout_ms != 20000 or config.network_retries != 0
    if not config.use_async and has_custom_network_params:
        raise ValueError("Network tuning parameters only apply when use_async=True")

    # Threads only applies when use_fd is True or search_backend='fd'
    is_using_fd = config.use_fd or config.search_backend == "fd"
    if config.threads is not None and not is_using_fd:
        raise ValueError("'threads' setting only applies when use_fd=True or search_backend='fd'")

    # Decide which implementation to use based on search_backend and availability
    # An explicit (non-'auto') search_backend is taken as-is by this block.
    backend_choice = config.search_backend
    if backend_choice == "auto":
        # Honor explicit user preferences when provided.
        # If both backends are explicitly requested and available, prefer fd
        if config.use_fd and config.use_rust and FD_AVAILABLE and RUST_AVAILABLE:
            backend_choice = "fd"
        # If user explicitly requested Rust and it's available, use it
        elif config.use_rust and RUST_AVAILABLE:
            backend_choice = "rust"
        # If user explicitly requested fd and it's available, use it
        elif config.use_fd and FD_AVAILABLE:
            backend_choice = "fd"
        else:
            # No explicit preference from user -> auto-detect best available
            # For pure file discovery (fast_path_only), prefer python/os.walk
            if config.fast_path_only:
                backend_choice = "python"
            elif RUST_AVAILABLE:
                backend_choice = "rust"
            elif FD_AVAILABLE:
                backend_choice = "fd"
            else:
                backend_choice = "python"

    # Translate the resolved backend into two mutually exclusive flags;
    # any other value (i.e. 'python') leaves both False.
    if backend_choice == "rust":
        self.use_rust = True
        self.use_fd = False
    elif backend_choice == "fd":
        self.use_rust = False
        self.use_fd = True
    else:
        self.use_rust = False
        self.use_fd = False

    # Parallel/async/other toggles come directly from config (already validated)
    # Both are only honoured when the Rust backend ended up selected.
    self.use_parallel = bool(config.use_parallel and self.use_rust)
    self.use_async = bool(config.use_async and self.use_rust)

    # Other instance-level flags
    self.build_dataframe = bool(config.build_dataframe)
    self.return_absolute_paths = bool(config.return_absolute_paths)
    # Progress handling
    # Progress bars are forced off in interactive environments even when the
    # config asked for them.
    if _is_interactive_environment() and config.show_progress:
        logger.debug("Interactive environment detected, disabling progress bars to avoid conflicts")
        self.show_progress = False
    else:
        self.show_progress = bool(config.show_progress)

    # Network tuning (only valid if use_async True)
    self.network_concurrency = config.network_concurrency
    self.network_timeout_ms = config.network_timeout_ms
    self.network_retries = config.network_retries

    # Threads forwarded to fd if using fd backend
    self.threads = config.threads if self.use_fd else None

    # Defer fd integration initialization until actually used
    self.fd_integration = None

`get_dataframe(analysis)` ¶

Get the DataFrame from analysis results.

analysis: :class:`DirectoryAnalysis` instance

DataFrame object if available, None otherwise

Source code in filoma/directories/directory_profiler.py

def get_dataframe(self, analysis: "DirectoryAnalysis") -> Optional["DataFrame"]:
    """Return the DataFrame attached to a directory analysis.

    Thin convenience wrapper that delegates to ``analysis.to_df()``.

    Args:
    ----
        analysis: the :class:`DirectoryAnalysis` to read the DataFrame from

    Returns:
    -------
        The result of ``analysis.to_df()``: a DataFrame if one is
        available, otherwise None

    Raises:
    ------
        TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

    """
    if isinstance(analysis, DirectoryAnalysis):
        return analysis.to_df()
    raise TypeError("get_dataframe expects a DirectoryAnalysis instance")

`get_implementation_info()` ¶

Get information about which implementations are available and being used.

Returns¶

Dictionary with implementation availability status

Source code in filoma/directories/directory_profiler.py

def get_implementation_info(self) -> dict:
    """Describe which backends are available and which this profiler uses.

    Returns
    -------
        Dictionary with the module-level ``*_available`` flags followed by
        this profiler's own ``using_*`` / backend settings

    """
    # What the current build/environment offers.
    availability = {
        "rust_available": RUST_AVAILABLE,
        "rust_parallel_available": RUST_PARALLEL_AVAILABLE,
        "rust_async_available": RUST_ASYNC_AVAILABLE,
        "fd_available": FD_AVAILABLE,
        "dataframe_available": DATAFRAME_AVAILABLE,
    }

    # What this particular profiler instance was configured to use.
    in_use = {
        "using_rust": self.use_rust,
        "using_parallel": self.use_parallel,
        "using_async": bool(self.use_async and RUST_ASYNC_AVAILABLE),
        "using_fd": self.use_fd,
        "using_dataframe": self.build_dataframe,
        "return_absolute_paths": self.return_absolute_paths,
        "search_backend": self.search_backend,
        # Pure-Python traversal is what remains when neither backend is on.
        "python_fallback": not self.use_rust and not self.use_fd,
    }

    return {**availability, **in_use}

`is_dataframe_enabled()` ¶

Check if DataFrame building is enabled and available.

Returns¶

True if DataFrame building is enabled, False otherwise

Source code in filoma/directories/directory_profiler.py

def is_dataframe_enabled(self) -> bool:
    """Report whether this profiler will build DataFrames.

    Returns
    -------
        True only when DataFrame building was requested on this profiler
        and DataFrame support is available (``DATAFRAME_AVAILABLE``);
        False otherwise

    """
    # Coerce so the declared ``bool`` return type holds even when one of
    # the operands is a truthy/falsy value that is not a real bool.
    return bool(self.build_dataframe and DATAFRAME_AVAILABLE)

`is_fd_available()` ¶

Check if fd integration is available and being used.

Returns¶

True if fd is available and enabled, False otherwise

Source code in filoma/directories/directory_profiler.py

def is_fd_available(self) -> bool:
    """Check if fd integration is available and being used.

    Returns
    -------
        True only when this profiler selected the fd backend and
        ``FD_AVAILABLE`` is truthy; False otherwise

    """
    # Use FD_AVAILABLE to reflect whether the fd integration package is importable
    # Tests may monkeypatch FD_AVAILABLE without having the fd binary present.
    # bool() keeps the declared return type even for non-bool operands.
    return bool(self.use_fd and FD_AVAILABLE)

`is_parallel_available()` ¶

Check if parallel Rust implementation is available and being used.

Returns¶

True if parallel Rust implementation is available and enabled, False otherwise

Source code in filoma/directories/directory_profiler.py

def is_parallel_available(self) -> bool:
    """Check if parallel Rust implementation is available and being used.

    Returns
    -------
        True only when this profiler has parallel mode enabled and
        ``RUST_PARALLEL_AVAILABLE`` is truthy; False otherwise

    """
    # bool() keeps the declared return type even for non-bool operands.
    return bool(self.use_parallel and RUST_PARALLEL_AVAILABLE)

`is_rust_available()` ¶

Check if Rust implementation is available and being used.

Returns¶

True if Rust implementation is available and enabled, False otherwise

Source code in filoma/directories/directory_profiler.py

def is_rust_available(self) -> bool:
    """Check if Rust implementation is available and being used.

    Returns
    -------
        True only when this profiler selected the Rust backend and
        ``RUST_AVAILABLE`` is truthy; False otherwise

    """
    # bool() keeps the declared return type even for non-bool operands.
    return bool(self.use_rust and RUST_AVAILABLE)

`print_empty_folders(analysis, max_show=20)` ¶

Print empty folders found (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py

def print_empty_folders(self, analysis: "DirectoryAnalysis", max_show: int = 20):
    """Render a table listing empty folders from the analysis.

    Prints a short success message instead of a table when
    ``analysis.empty_folders`` is empty. When there are more entries than
    ``max_show``, a final row states how many were left out.

    Args:
    ----
        analysis: the :class:`DirectoryAnalysis` to report on
        max_show: maximum number of folder rows to show

    Raises:
    ------
        TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_empty_folders expects a DirectoryAnalysis instance")

    empties = analysis.empty_folders
    if not empties:
        self.console.print("[green]✓ No empty folders found![/green]")
        return

    total_empty = len(empties)
    shown_count = min(total_empty, max_show)

    empties_table = Table(title=f"Empty Folders (showing {shown_count} of {total_empty})")
    empties_table.add_column("Path", style="yellow")

    for folder_path in empties[:max_show]:
        empties_table.add_row(folder_path)

    # Summarize whatever did not fit within max_show rows.
    hidden_count = total_empty - max_show
    if hidden_count > 0:
        empties_table.add_row(f"... and {hidden_count} more")

    self.console.print(empties_table)
    self.console.print()

`print_file_extensions(analysis, top_n=10)` ¶

Print the most common file extensions (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py

def print_file_extensions(self, analysis: "DirectoryAnalysis", top_n: int = 10):
    """Render a table of file extensions with their counts and share of files.

    Shows the first ``top_n`` entries of ``analysis.file_extensions`` in
    iteration order; nothing is printed when that mapping is empty.

    Args:
    ----
        analysis: the :class:`DirectoryAnalysis` to report on
        top_n: maximum number of extension rows to show

    Raises:
    ------
        TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_file_extensions expects a DirectoryAnalysis instance")

    ext_counts = analysis.file_extensions
    if not ext_counts:
        return

    ext_table = Table(title="File Extensions")
    column_specs = (
        ("Extension", "bold magenta"),
        ("Count", "white"),
        ("Percentage", "green"),
    )
    for header, column_style in column_specs:
        ext_table.add_column(header, style=column_style)

    file_total = analysis.summary["total_files"]
    shown_entries = list(ext_counts.items())[:top_n]

    for extension, n_files in shown_entries:
        # Guard against division by zero when no files were counted.
        if file_total > 0:
            share = n_files / file_total * 100
        else:
            share = 0
        ext_table.add_row(extension, f"{n_files:,}", f"{share:.1f}%")

    self.console.print(ext_table)
    self.console.print()

`print_folder_patterns(analysis, top_n=10)` ¶

Print the most common folder names (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py

def print_folder_patterns(self, analysis: "DirectoryAnalysis", top_n: int = 10):
    """Render a table of folder names and how often each one occurs.

    Shows the first ``top_n`` entries of ``analysis.common_folder_names``
    in iteration order; nothing is printed when that mapping is empty.

    Args:
    ----
        analysis: the :class:`DirectoryAnalysis` to report on
        top_n: maximum number of folder-name rows to show

    Raises:
    ------
        TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_folder_patterns expects a DirectoryAnalysis instance")

    name_counts = analysis.common_folder_names
    if not name_counts:
        return

    shown_entries = list(name_counts.items())[:top_n]

    names_table = Table(title="Common Folder Names")
    names_table.add_column("Folder Name", style="bold blue")
    names_table.add_column("Occurrences", style="white")

    for folder_name, occurrences in shown_entries:
        names_table.add_row(folder_name, f"{occurrences:,}")

    self.console.print(names_table)
    self.console.print()

`print_report(analysis)` ¶

Print a comprehensive report of the directory analysis.

Expects a :class:`DirectoryAnalysis` instance. Use :meth:`to_dict` if you need a plain dict shape for downstream tooling.

Source code in filoma/directories/directory_profiler.py

def print_report(self, analysis: "DirectoryAnalysis"):
    """Print every report section for a directory analysis.

    Runs, in order, the summary, file-extension, folder-pattern and
    empty-folder printers on the same :class:`DirectoryAnalysis` instance.
    Use :meth:`to_dict` if you need a plain dict shape for downstream
    tooling.

    Raises a ``TypeError`` when ``analysis`` is not a
    :class:`DirectoryAnalysis`.
    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_report expects a DirectoryAnalysis instance")

    section_printers = (
        "print_summary",
        "print_file_extensions",
        "print_folder_patterns",
        "print_empty_folders",
    )
    # Look each printer up right before calling it, one section at a time.
    for printer_name in section_printers:
        getattr(self, printer_name)(analysis)

`print_summary(analysis)` ¶

Print a summary of the directory analysis (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py

def print_summary(self, analysis: "DirectoryAnalysis"):
    """Print a summary table of the directory analysis.

    The table title names the analyzed path and the implementation recorded
    in ``analysis.timing``; rows show the headline numbers from
    ``analysis.summary``, plus DataFrame and timing rows when available.

    Args:
    ----
        analysis: the :class:`DirectoryAnalysis` to summarize

    Raises:
    ------
        TypeError: if ``analysis`` is not a :class:`DirectoryAnalysis`

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_summary expects a DirectoryAnalysis instance")

    summary = analysis.summary
    timing = analysis.timing or {}

    # Read timing values defensively in one place: a partially filled timing
    # dict (missing or None entries) must not break the report.
    elapsed_seconds = timing.get("elapsed_seconds") or 0
    items_per_second = timing.get("items_per_second") or 0

    # A DataFrame is only reported when it was requested and actually built.
    has_dataframe = self.build_dataframe and analysis.dataframe is not None

    # Show which implementation was used with more detail
    impl_type = timing.get("implementation", "Unknown")
    if has_dataframe:
        impl_type += " + 📊 DataFrame"

    # Main summary table
    title = f"Directory Analysis: {analysis.path} ({impl_type})"
    if timing:
        title += f" - {elapsed_seconds:.2f}s"

    table = Table(title=title)
    table.add_column("Metric", style="bold cyan")
    table.add_column("Value", style="white")

    table.add_row("Total Files", f"{summary['total_files']:,}")
    table.add_row("Total Folders", f"{summary['total_folders']:,}")
    table.add_row("Total Size", f"{summary['total_size_mb']:,} MB")
    table.add_row("Average Files per Folder", str(summary["avg_files_per_folder"]))
    table.add_row("Maximum Depth", str(summary["max_depth"]))
    table.add_row("Empty Folders", str(summary["empty_folder_count"]))

    # Add DataFrame info if available
    if has_dataframe:
        table.add_row("DataFrame Rows", f"{len(analysis.dataframe):,}")

    # Add timing information if available
    if timing:
        table.add_row("Analysis Time", f"{elapsed_seconds:.2f}s")
        if items_per_second > 0:
            table.add_row("Processing Speed", f"{items_per_second:,.0f} items/sec")

    self.console.print(table)
    self.console.print()

`probe(path, max_depth=None, threads=None)` ¶

Analyze a directory tree and return comprehensive statistics.

path: Path to the root directory to probe
max_depth: Maximum depth to traverse (None for unlimited)
threads: Optional override for number of threads when using fd backend

A :class:`DirectoryAnalysis` instance containing analysis results

Source code in filoma/directories/directory_profiler.py

def probe(self, path: str, max_depth: Optional[int] = None, threads: Optional[int] = None) -> "DirectoryAnalysis":
    """Analyze a directory tree and return comprehensive statistics.

    Args:
    ----
        path: Path to the root directory to probe
        max_depth: Maximum depth to traverse (None for unlimited)
        threads: Optional override for number of threads when using fd backend;
            ignored by the other backends

    Returns:
    -------
        A :class:`DirectoryAnalysis` instance containing analysis results,
        including a ``timing`` entry with elapsed seconds, implementation
        name and items per second

    Raises:
    ------
        Exception: any error raised by the selected backend is logged and
            re-raised unchanged

    """
    # perf_counter is monotonic, so the measured duration cannot go negative
    # or jump when the system clock is adjusted during a long scan.
    start_time = time.perf_counter()

    # Choose the best backend
    backend = self._choose_backend()

    # Log the start of analysis
    impl_type = self._get_impl_display_name(backend)
    logger.info(f"Starting directory analysis of '{path}' using {impl_type} implementation")

    try:
        if backend == "fd":
            # threads param overrides instance threads when provided
            chosen_threads = threads if threads is not None else self.threads
            result = self._probe_fd(path, max_depth, threads=chosen_threads)
        elif backend == "rust":
            result = self._probe_rust(path, max_depth, fast_path_only=self._fast_path_only)
        else:
            result = self._probe_python(path, max_depth)

        # Calculate and log timing
        elapsed_time = time.perf_counter() - start_time
        total_files = result["summary"]["total_files"]
        total_folders = result["summary"]["total_folders"]
        total_items = total_files + total_folders

        logger.success(
            f"Directory analysis completed in {elapsed_time:.2f}s - "
            f"Found {total_items:,} items ({total_files:,} files, "
            f"{total_folders:,} folders) using {impl_type}"
        )

        # Add timing information to result
        result["timing"] = {
            "elapsed_seconds": elapsed_time,
            "implementation": impl_type,
            # Guard against a zero-length interval on very fast scans.
            "items_per_second": (total_items / elapsed_time if elapsed_time > 0 else 0),
        }

        # Return a structured dataclass by default for easier programmatic use
        return DirectoryAnalysis.from_dict(result)

    except Exception as e:
        elapsed_time = time.perf_counter() - start_time
        logger.error(f"Directory analysis failed after {elapsed_time:.2f}s: {e}")
        raise

`sample_paths(path, sample_size=20)` ¶

Return small samples of paths for quick backend-diffing.

Returns a dict with keys 'fd_files', 'fd_dirs', 'python_files'. Rust currently does not expose a path list in the public API so it is omitted (you can re-run the Rust prober separately if needed).

Source code in filoma/directories/directory_profiler.py

def sample_paths(self, path: str, sample_size: int = 20) -> Dict[str, List[str]]:
    """Return small samples of paths for quick backend-diffing.

    Returns a dict with keys 'fd_files', 'fd_dirs', 'python_files'. Rust currently
    does not expose a path list in the public API so it is omitted (you can
    re-run the Rust prober separately if needed).

    Args:
    ----
        path: root directory to sample from
        sample_size: maximum number of entries per sample list; a value of
            zero or less yields an empty 'python_files' list

    Returns:
    -------
        Dict of three lists. Each list is empty when its backend is
        unavailable or fails. 'python_files' entries are always resolved
        (absolute) paths, whereas the fd lists follow this profiler's
        ``return_absolute_paths`` setting.

    """
    samples: Dict[str, List[str]] = {"fd_files": [], "fd_dirs": [], "python_files": []}

    # fd sample. Deliberately best-effort: any failure simply leaves both fd
    # lists empty so the Python sample can still be returned.
    try:
        if FD_AVAILABLE:
            fd = FdIntegration()
            samples["fd_files"] = fd.find(
                path=path,
                file_types=["f"],
                max_results=sample_size,
                search_hidden=True,
                no_ignore=True,
                follow_links=False,
                absolute_paths=self.return_absolute_paths,
            )
            samples["fd_dirs"] = fd.find(
                path=path,
                file_types=["d"],
                max_results=sample_size,
                search_hidden=True,
                no_ignore=True,
                follow_links=False,
                absolute_paths=self.return_absolute_paths,
            )
    except Exception:
        samples["fd_files"] = []
        samples["fd_dirs"] = []

    # Python sample (also best-effort: an unreadable tree yields an empty list)
    try:
        python_files: List[str] = []
        # Check the limit before walking so a non-positive sample_size never
        # returns a stray first file.
        if sample_size > 0:
            for entry in Path(path).rglob("*"):
                if not entry.is_file():
                    continue
                python_files.append(str(entry.resolve()))
                if len(python_files) >= sample_size:
                    break
        samples["python_files"] = python_files
    except Exception:
        samples["python_files"] = []

    return samples

`DirectoryProfilerConfig` `dataclass` ¶

Configuration for DirectoryProfiler (explicit, typed, no legacy kwargs).

All fields are documented and validated in `__post_init__`.

Source code in filoma/directories/directory_profiler.py

@dataclass(frozen=True)
class DirectoryProfilerConfig:
    """Configuration for DirectoryProfiler (explicit, typed, no legacy kwargs).

    Instances are immutable (``frozen=True``). Each field is described in the
    comment next to it, and values are validated in ``__post_init__``, which
    raises ``ValueError`` for out-of-range values or inconsistent combinations.
    """

    # Backend selection
    # Request the Rust backend explicitly.
    use_rust: bool = False
    # Request parallel Rust processing.
    use_parallel: bool = True
    # Request the async Rust prober; also unlocks the network tuning fields.
    use_async: bool = False
    # Request the fd backend explicitly; also unlocks ``threads``.
    use_fd: bool = False
    # Which backend to use; 'auto' lets the profiler decide.
    search_backend: str = "auto"  # 'rust' | 'fd' | 'python' | 'auto'

    # General tuning
    # Non-negative integer threshold for parallelism.
    # NOTE(review): the exact unit (presumably item count) is not visible here.
    parallel_threshold: int = 1000
    # Whether DataFrame building is requested.
    build_dataframe: bool = False
    # Whether paths should be reported as absolute paths.
    return_absolute_paths: bool = False
    # Whether progress reporting is requested.
    show_progress: bool = True
    # Optional progress hook called with (str, int, int).
    # NOTE(review): argument meanings are not visible here - confirm with callers.
    progress_callback: Optional[Callable[[str, int, int], None]] = None
    # Request the fast path only (pure file discovery).
    fast_path_only: bool = False

    # Network tuning (only valid when use_async is True)
    # Must be a positive integer.
    network_concurrency: int = 192
    # Must be positive.
    network_timeout_ms: int = 20000
    # Must be non-negative.
    network_retries: int = 0

    # fd-specific tuning
    # Thread count for fd; only allowed with use_fd=True or search_backend='fd'.
    threads: Optional[int] = None
    # NOTE(review): presumably maps to fd's no-ignore behaviour - confirm.
    fd_no_ignore: bool = False

    def __post_init__(self):
        """Validate configuration fields after initialization.

        Ensures values are within acceptable ranges and relationships are
        enforced (for example, network tuning only when async is enabled).

        Raises:
        ------
            ValueError: if a field is out of range, or if network tuning or
                ``threads`` is customised without the feature it applies to

        """
        from dataclasses import fields

        # Basic validations
        if self.search_backend not in ("auto", "rust", "fd", "python"):
            raise ValueError("search_backend must be one of 'auto','rust','fd','python'")
        if not isinstance(self.parallel_threshold, int) or self.parallel_threshold < 0:
            raise ValueError("parallel_threshold must be a non-negative integer")
        if not isinstance(self.network_concurrency, int) or self.network_concurrency <= 0:
            raise ValueError("network_concurrency must be a positive integer")
        if self.network_timeout_ms <= 0:
            raise ValueError("network_timeout_ms must be positive")
        if self.network_retries < 0:
            raise ValueError("network_retries must be non-negative")

        # Relationship validations - only validate if non-default network params are set.
        # The defaults are read from the field declarations above rather than
        # repeated as literals, so changing a default cannot desynchronise
        # this check.
        defaults = {f.name: f.default for f in fields(self)}
        network_fields = ("network_concurrency", "network_timeout_ms", "network_retries")
        has_custom_network_params = any(getattr(self, name) != defaults[name] for name in network_fields)
        if not self.use_async and has_custom_network_params:
            raise ValueError("Network tuning parameters only apply when use_async=True")

        # Check if fd backend is being used (either explicitly or via search_backend)
        is_using_fd = self.use_fd or self.search_backend == "fd"
        if self.threads is not None and not is_using_fd:
            raise ValueError("'threads' only applies when use_fd=True or search_backend='fd'")

`__post_init__()` ¶

Validate configuration fields after initialization.

Ensures values are within acceptable ranges and relationships are enforced (for example, network tuning only when async is enabled).

Source code in filoma/directories/directory_profiler.py

def __post_init__(self):
    """Check the configuration values once the instance has been built.

    Range checks run first, followed by the cross-field rules: network
    tuning may only differ from its defaults when async is enabled, and
    ``threads`` may only be set when the fd backend is selected. Every
    violation is reported as a ``ValueError``.
    """
    # Range checks
    allowed_backends = ("auto", "rust", "fd", "python")
    if self.search_backend not in allowed_backends:
        raise ValueError("search_backend must be one of 'auto','rust','fd','python'")

    threshold = self.parallel_threshold
    if not (isinstance(threshold, int) and threshold >= 0):
        raise ValueError("parallel_threshold must be a non-negative integer")

    concurrency = self.network_concurrency
    if not (isinstance(concurrency, int) and concurrency > 0):
        raise ValueError("network_concurrency must be a positive integer")

    timeout_ms = self.network_timeout_ms
    if timeout_ms <= 0:
        raise ValueError("network_timeout_ms must be positive")

    retries = self.network_retries
    if retries < 0:
        raise ValueError("network_retries must be non-negative")

    # Cross-field rule: network tuning requires async.
    # Default values are: network_concurrency=192, network_timeout_ms=20000, network_retries=0
    network_is_default = concurrency == 192 and timeout_ms == 20000 and retries == 0
    if not network_is_default and not self.use_async:
        raise ValueError("Network tuning parameters only apply when use_async=True")

    # Cross-field rule: threads requires the fd backend (flag or explicit choice).
    fd_selected = self.use_fd or self.search_backend == "fd"
    if not fd_selected and self.threads is not None:
        raise ValueError("'threads' only applies when use_fd=True or search_backend='fd'")

handler: python

API Reference¶

__getattr__(name) ¶

probe(path, **kwargs) ¶

probe_file(path, **kwargs) ¶

probe_image(arg, **kwargs) ¶

probe_to_df(path, to_pandas=False, enrich=True, **kwargs) ¶

snapshot(path, mode='fast', export=None, include_hidden=False, pattern=None, metadata=None) ¶

verify_snapshot(snapshot_path, target_path=None, mode=None) ¶

Package overview¶

DataFrame wrapper¶

columns property ¶

df property ¶

dtypes property ¶

lineage property ¶

native property ¶

pandas property ¶

Raises¶

pandas_cached property ¶

polars property ¶

shape property ¶

__dir__() ¶

__getattr__(name) ¶

__getitem__(key) ¶

__init__(data=None, lineage=None) ¶

__len__() ¶

__repr__() ¶

__setitem__(key, value) ¶

__str__() ¶

add_depth_col(path=None, inplace=False) ¶

add_file_stats_cols(path='path', base_path=None, compute_hash=False, inplace=False) ¶

add_filename_features(path_col='path', sep='_', prefix='feat', max_tokens=None, include_parent=False, include_all_parts=False, token_names=None, enrich=False, inplace=False) ¶

add_lineage_entry(operation, **kwargs) ¶

add_path_components(inplace=False) ¶

Returns¶

describe(percentiles=None) ¶

directory_counts() ¶

Returns¶

enrich(inplace=False) ¶

evaluate_duplicates(path_col='path', text_threshold=0.8, image_max_distance=5, text_k=3, show_table=True, cross_dir_paths=None) ¶

extension_counts() ¶

Returns¶

filter_by_extension(extensions) ¶

filter_by_pattern(pattern) ¶

from_pandas(df) classmethod ¶

head(n=5) ¶

info() ¶

invalidate_pandas_cache() ¶

save_csv(path) ¶

save_parquet(path) ¶

sort(by, descending=False) ¶

tail(n=5) ¶

to_dict() ¶

to_pandas(force=False) ¶

to_polars() ¶

unique(subset=None) ¶

Directory profiler¶

DirectoryAnalysis dataclass ¶

path_obj property ¶

__getattr__(name) ¶

__getitem__(key) ¶

__iter__() ¶

__len__() ¶

__post_init__() ¶

as_dict() ¶

from_dict(d) classmethod ¶

Parameters¶

Returns¶

print_report(profiler=None) ¶

print_summary(profiler=None) ¶

to_df() ¶

to_dict() ¶

DirectoryProfiler ¶

__init__(config) ¶

get_dataframe(analysis) ¶

get_implementation_info() ¶

Returns¶

is_dataframe_enabled() ¶

Returns¶

is_fd_available() ¶

Returns¶

`__getattr__(name)` ¶

`probe(path, **kwargs)` ¶

`probe_file(path, **kwargs)` ¶

`probe_image(arg, **kwargs)` ¶

`probe_to_df(path, to_pandas=False, enrich=True, **kwargs)` ¶

`snapshot(path, mode='fast', export=None, include_hidden=False, pattern=None, metadata=None)` ¶

`verify_snapshot(snapshot_path, target_path=None, mode=None)` ¶

`columns` `property` ¶

`df` `property` ¶

`dtypes` `property` ¶

`lineage` `property` ¶

`native` `property` ¶

`pandas` `property` ¶

`pandas_cached` `property` ¶

`polars` `property` ¶

`shape` `property` ¶

`__dir__()` ¶

`__getattr__(name)` ¶

`__getitem__(key)` ¶

`__init__(data=None, lineage=None)` ¶

`__len__()` ¶

`__repr__()` ¶

`__setitem__(key, value)` ¶

`__str__()` ¶

`add_depth_col(path=None, inplace=False)` ¶

`add_file_stats_cols(path='path', base_path=None, compute_hash=False, inplace=False)` ¶

`add_filename_features(path_col='path', sep='_', prefix='feat', max_tokens=None, include_parent=False, include_all_parts=False, token_names=None, enrich=False, inplace=False)` ¶

`add_lineage_entry(operation, **kwargs)` ¶

`add_path_components(inplace=False)` ¶

`describe(percentiles=None)` ¶

`directory_counts()` ¶

`enrich(inplace=False)` ¶

`evaluate_duplicates(path_col='path', text_threshold=0.8, image_max_distance=5, text_k=3, show_table=True, cross_dir_paths=None)` ¶

`extension_counts()` ¶

`filter_by_extension(extensions)` ¶

`filter_by_pattern(pattern)` ¶

`from_pandas(df)` `classmethod` ¶

`head(n=5)` ¶

`info()` ¶

`invalidate_pandas_cache()` ¶

`save_csv(path)` ¶

`save_parquet(path)` ¶

`sort(by, descending=False)` ¶

`tail(n=5)` ¶

`to_dict()` ¶

`to_pandas(force=False)` ¶

`to_polars()` ¶

`unique(subset=None)` ¶

`DirectoryAnalysis` `dataclass` ¶

`path_obj` `property` ¶

`__getattr__(name)` ¶

`__getitem__(key)` ¶

`__iter__()` ¶

`__len__()` ¶

`__post_init__()` ¶

`as_dict()` ¶

`from_dict(d)` `classmethod` ¶

`print_report(profiler=None)` ¶

`print_summary(profiler=None)` ¶

`to_df()` ¶

`to_dict()` ¶

`DirectoryProfiler` ¶

`__init__(config)` ¶

`get_dataframe(analysis)` ¶

`get_implementation_info()` ¶

`is_dataframe_enabled()` ¶

`is_fd_available()` ¶

`is_parallel_available()` ¶

`is_rust_available()` ¶

`print_empty_folders(analysis, max_show=20)` ¶

`print_file_extensions(analysis, top_n=10)` ¶

`print_folder_patterns(analysis, top_n=10)` ¶

`print_report(analysis)` ¶

`print_summary(analysis)` ¶

`probe(path, max_depth=None, threads=None)` ¶

`sample_paths(path, sample_size=20)` ¶

`DirectoryProfilerConfig` `dataclass` ¶

`__post_init__()` ¶