Skip to content

API Reference

This page uses mkdocstrings to generate an API reference from the src/filoma package.

filoma: filesystem profiling and directory analysis.

A modular Python tool for profiling files, analyzing directory structures, and inspecting image data.

This module exposes a small, ergonomic public surface while importing heavy optional dependencies lazily (Polars, Pillow, the Rust extension, etc.). Accessing convenience classes such as `DataFrame`, or subpackages such as `filoma.directories`, imports the underlying modules on demand.

__getattr__(name)

Lazy import and attribute resolution for top-level names.

Implements PEP 562: import submodules or attributes on demand.

Source code in filoma/__init__.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def __getattr__(name: str):
    """Resolve top-level attributes lazily (PEP 562).

    Known subpackages and convenience classes are imported the first time
    they are requested and the result is stored in the module globals.
    Unknown names raise ``AttributeError``.
    """
    # ``Dataset`` is resolved through a relative import instead of the table.
    if name == "Dataset":
        from .dataset import Dataset

        globals()["Dataset"] = Dataset
        return Dataset

    # Each value is either "package.module" (import the module itself) or
    # "package.module:attribute" (import the module, then fetch the attribute).
    lazy_targets = {
        # top-level subpackages
        "core": "filoma.core",
        "directories": "filoma.directories",
        "files": "filoma.files",
        "images": "filoma.images",
        "filaraki": "filoma.filaraki",
        # common classes placed in submodules
        "DataFrame": "filoma.dataframe:DataFrame",
        "DirectoryProfiler": "filoma.directories.directory_profiler:DirectoryProfiler",
        "FileProfiler": "filoma.files.file_profiler:FileProfiler",
        "ImageProfiler": "filoma.images.image_profiler:ImageProfiler",
    }

    spec = lazy_targets.get(name)
    if spec is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_path, separator, attr_name = spec.partition(":")
    resolved = importlib.import_module(module_path)
    if separator:
        resolved = getattr(resolved, attr_name)

    # Cache the result in the module namespace for later lookups.
    globals()[name] = resolved
    return resolved

probe(path, **kwargs)

Quick helper: probe a directory path and return a DirectoryAnalysis.

This wrapper accepts probe-specific keyword arguments such as `max_depth` and `threads` and forwards them to `DirectoryProfiler.probe`. Other kwargs are used to configure the `DirectoryProfiler` constructor.

Source code in filoma/__init__.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def probe(path: str, **kwargs: Any) -> Any:
    """Quick helper: probe a path and return the matching analysis object.

    If ``path`` is an existing regular file the call is dispatched to
    :class:`FileProfiler.probe`; otherwise the path is handled by
    :class:`DirectoryProfiler.probe`.

    Args:
        path: File or directory to probe.
        **kwargs: ``max_depth`` and ``threads`` are probe-only options that
            are forwarded to :class:`DirectoryProfiler.probe` (they are
            dropped when a file is probed). For a file, the remaining kwargs
            are forwarded to :class:`FileProfiler.probe` (e.g.
            ``compute_hash``); for a directory they are used to build the
            :class:`DirectoryProfilerConfig`.

    Returns:
        Whatever the selected profiler's ``probe`` method returns.

    """
    from pathlib import Path

    # Extract probe-only parameters so they are not passed to the
    # DirectoryProfiler configuration (which doesn't accept them).
    max_depth = kwargs.pop("max_depth", None)
    threads = kwargs.pop("threads", None)

    # Guard only the filesystem check. If the check itself fails we fall back
    # to directory probing and let the underlying profiler raise. Errors from
    # the file profiler are NOT swallowed: previously they were hidden and the
    # file was then probed as a directory, masking the real failure.
    try:
        is_regular_file = Path(path).is_file()
    except (OSError, TypeError, ValueError):
        is_regular_file = False

    if is_regular_file:
        from .files.file_profiler import FileProfiler

        return FileProfiler().probe(path, **kwargs)

    # Local import to ensure the class is available without forcing it at
    # module import time.
    from .directories import DirectoryProfiler, DirectoryProfilerConfig

    # Build a typed config from remaining kwargs and instantiate the profiler
    config = DirectoryProfilerConfig(**kwargs)
    profiler = DirectoryProfiler(config)
    return profiler.probe(path, max_depth=max_depth, threads=threads)

probe_file(path, **kwargs)

Quick helper: probe a single file and return a Filo dataclass.

Source code in filoma/__init__.py
124
125
126
127
128
def probe_file(path: str, **kwargs: Any) -> Any:
    """Probe a single file with a fresh ``FileProfiler``.

    All keyword arguments are forwarded unchanged to ``FileProfiler.probe``,
    and its result is returned as-is.
    """
    # Imported lazily so the profiler module is only loaded when needed.
    from .files.file_profiler import FileProfiler

    profiler = FileProfiler()
    return profiler.probe(path, **kwargs)

probe_image(arg, **kwargs)

Analyze an image.

If `arg` is a numpy array, `ImageProfiler.probe` is used; if it is path-like, the function first tries an image-specific profiler for the file suffix and otherwise loads the image into a numpy array and analyzes that.

This wrapper favors simplicity for interactive use; for advanced control instantiate profilers directly.

Source code in filoma/__init__.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def probe_image(arg: Any, **kwargs: Any) -> Any:
    """Analyze an image.

    If ``arg`` is a numpy array, :class:`ImageProfiler.probe` is used; if
    it's path-like, a suffix-specific profiler is tried first and, failing
    that, the file is loaded through Pillow into a numpy array and analyzed.

    This wrapper favors simplicity for interactive use; for advanced
    control instantiate profilers directly.

    Args:
        arg: A numpy array or a path-like object pointing to an image.
        **kwargs: Accepted for call-site compatibility; not forwarded to any
            profiler by this helper.

    Returns:
        The report produced by the selected profiler, or an ``ImageReport``
        with ``status="failed_to_load_or_unsupported_format"`` when every
        strategy fails.

    """
    # Local imports; keep them inside the function to avoid heavy deps at
    # module import time.
    from pathlib import Path

    try:
        import numpy as _np
    except Exception:
        _np = None

    # If it's a numpy array, use ImageProfiler directly
    if _np is not None and hasattr(_np, "ndarray") and isinstance(arg, _np.ndarray):
        from .images.image_profiler import ImageProfiler

        return ImageProfiler().probe(arg)

    # Treat as path-like
    p = Path(arg)
    suffix = p.suffix.lower()

    try:
        # Use images package specializers when available
        from .images import NpyProfiler, PngProfiler, TifProfiler, ZarrProfiler

        specialists = {
            ".png": PngProfiler,
            ".npy": NpyProfiler,
            ".tif": TifProfiler,
            ".tiff": TifProfiler,
            ".zarr": ZarrProfiler,
        }
        profiler_cls = specialists.get(suffix)
        if profiler_cls is not None:
            return profiler_cls().probe(p)
    except Exception:
        # Best-effort: if the specialist is unavailable or fails, fall back
        # to the generic loader below.
        pass

    # Generic fallback: Pillow + numpy. Without numpy there is nothing to
    # hand to ImageProfiler, so the file is not opened at all.
    if _np is not None:
        try:
            # Third-party import
            from PIL import Image as _PILImage

            # Local import
            from .images.image_profiler import ImageProfiler

            # The context manager closes the underlying file handle; the
            # pixel data is copied into ``arr`` before the image is closed.
            with _PILImage.open(p) as img:
                arr = _np.array(img)
            return ImageProfiler().probe(arr)
        except Exception:
            # Best-effort: report the failure through ImageReport below.
            pass

    # Last resort: return an ImageReport with status explaining failure
    from .images.image_profiler import ImageReport

    return ImageReport(path=str(p), status="failed_to_load_or_unsupported_format")

probe_to_df(path, to_pandas=False, enrich=True, **kwargs)

Return a filoma.DataFrame wrapper around a Polars DataFrame (or a pandas DataFrame if to_pandas=True).

Force DataFrame building on the profiler and optionally run a small enrichment chain: .add_depth_col(path).add_path_components().add_file_stats_cols().

Source code in filoma/__init__.py
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
def probe_to_df(path: str, to_pandas: bool = False, enrich: bool = True, **kwargs: Any) -> Any:
    """Probe a directory and return the result as a DataFrame.

    DataFrame building is forced on the profiler. By default the result is
    the ``filoma.DataFrame`` wrapper; with ``to_pandas=True`` the wrapped
    Polars frame is converted to pandas instead.

    Args:
        path: Directory to probe.
        to_pandas: Return a pandas DataFrame instead of the wrapper.
        enrich: Run the enrichment chain
            ``.add_depth_col(path).add_path_components().add_file_stats_cols()``.
            If enrichment fails, the un-enriched wrapper is used.
        **kwargs: ``max_depth`` and ``threads`` are forwarded to
            ``DirectoryProfiler.probe``; everything else configures
            ``DirectoryProfilerConfig``.

    Returns:
        The (optionally enriched) wrapper, or a pandas DataFrame.

    Raises:
        RuntimeError: If no DataFrame was built, or if the pandas conversion
            fails (the original error is attached as the cause).

    """
    # Extract probe-only parameters
    max_depth = kwargs.pop("max_depth", None)
    threads = kwargs.pop("threads", None)

    # Lazy import to avoid heavy deps at module import time
    from .directories import DirectoryProfiler, DirectoryProfilerConfig

    # Force DataFrame building and construct a typed config
    kwargs["build_dataframe"] = True
    config = DirectoryProfilerConfig(**kwargs)
    profiler = DirectoryProfiler(config)
    analysis = profiler.probe(path, max_depth=max_depth, threads=threads)

    df_wrapper = analysis.to_df()
    if df_wrapper is None:
        raise RuntimeError("DataFrame was not built. Ensure 'polars' is installed and that DataFrame building is enabled (build_dataframe=True).")

    # Initialize lineage with the configuration used for this probe
    df_wrapper.add_lineage_entry("probe", path=path, **kwargs)

    # Optionally enrich the DataFrame wrapper with useful columns/stats
    df_enriched = df_wrapper
    if enrich:
        try:
            df_enriched = df_enriched.add_depth_col(path).add_path_components().add_file_stats_cols()
        except Exception:
            # Deliberate best-effort: if enrichment fails for any reason,
            # fall back to the raw DataFrame.
            pass

    if not to_pandas:
        return df_enriched

    # Keep the `to_pandas` convenience for callers that explicitly want pandas
    try:
        return df_enriched.df.to_pandas()
    except Exception as e:
        raise RuntimeError(f"Failed to convert Polars DataFrame to pandas: {e}") from e

snapshot(path, mode='fast', export=None, include_hidden=False, pattern=None, metadata=None)

Create a snapshot of a dataset with configurable integrity checking.

Three integrity levels:

- "fast": Hash of filename + size + mtime (99% effective for accidental changes)
- "deep": Fast + hash of first/last 4KB (detects header/corruption changes)
- "full": Complete SHA-256 hash (audit mode, slow for large files)

Parameters:

Name Type Description Default
path str

Path to the dataset directory to snapshot

required
mode str

Integrity level - "fast", "deep", or "full"

'fast'
export Optional[str]

Optional path to save the snapshot JSON file

None
include_hidden bool

Whether to include hidden files/directories

False
pattern Optional[str]

Optional glob pattern to filter files (e.g., "*.txt")

None
metadata Optional[Dict[str, Any]]

Optional metadata dictionary to include in snapshot

None

Returns:

Type Description
Any

DatasetSnapshot object containing all file entries and hashes

Source code in filoma/__init__.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def snapshot(
    path: str,
    mode: str = "fast",
    export: Optional[str] = None,
    include_hidden: bool = False,
    pattern: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Any:
    """Snapshot a dataset directory at a chosen integrity level.

    Thin wrapper that forwards every argument, by keyword, to
    ``filoma.core.snapshot.snapshot``.

    Integrity levels:
    - "fast": Hash of filename + size + mtime (99% effective for accidental changes)
    - "deep": Fast + hash of first/last 4KB (detects header/corruption changes)
    - "full": Complete SHA-256 hash (audit mode, slow for large files)

    Args:
        path: Path to the dataset directory to snapshot
        mode: Integrity level - "fast", "deep", or "full"
        export: Optional path to save the snapshot JSON file
        include_hidden: Whether to include hidden files/directories
        pattern: Optional glob pattern to filter files (e.g., "*.txt")
        metadata: Optional metadata dictionary to include in snapshot

    Returns:
        DatasetSnapshot object containing all file entries and hashes

    """
    # Imported lazily to keep package import light.
    from .core.snapshot import snapshot as _snapshot

    options = {
        "path": path,
        "mode": mode,
        "export": export,
        "include_hidden": include_hidden,
        "pattern": pattern,
        "metadata": metadata,
    }
    return _snapshot(**options)

verify_snapshot(snapshot_path, target_path=None, mode=None)

Verify a directory against a saved snapshot.

Parameters:

Name Type Description Default
snapshot_path str

Path to the saved snapshot JSON file

required
target_path Optional[str]

Optional path to verify (defaults to snapshot's root_path)

None
mode Optional[str]

Verification mode (defaults to snapshot's mode)

None

Returns:

Type Description
Dict[str, Any]

Dictionary with verification results

Source code in filoma/__init__.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
def verify_snapshot(
    snapshot_path: str,
    target_path: Optional[str] = None,
    mode: Optional[str] = None,
) -> Dict[str, Any]:
    """Check a directory against a previously saved snapshot.

    Thin wrapper that forwards every argument, by keyword, to
    ``filoma.core.snapshot.verify``.

    Args:
        snapshot_path: Path to the saved snapshot JSON file
        target_path: Optional path to verify (defaults to snapshot's root_path)
        mode: Verification mode (defaults to snapshot's mode)

    Returns:
        Dictionary with verification results

    """
    # Imported lazily to keep package import light.
    from .core.snapshot import verify as _verify

    options = {
        "snapshot_path": snapshot_path,
        "target_path": target_path,
        "mode": mode,
    }
    return _verify(**options)

Package overview

The top-level package docstring is rendered above. Below are some focused sections for important modules and classes.

DataFrame wrapper

The filoma.DataFrame wrapper provides convenience enrichers and helpers that operate on a Polars DataFrame internally.

A wrapper around Polars DataFrame for enhanced file and directory analysis.

This class provides a specialized interface for working with file path data, allowing for easy manipulation and analysis of filesystem information.

All standard Polars DataFrame methods and properties are available through attribute delegation, so you can use this like a regular Polars DataFrame with additional file-specific functionality.

Source code in filoma/dataframe.py
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 139
 140
 141
 142
 143
 144
 145
 146
 147
 148
 149
 150
 151
 152
 153
 154
 155
 156
 157
 158
 159
 160
 161
 162
 163
 164
 165
 166
 167
 168
 169
 170
 171
 172
 173
 174
 175
 176
 177
 178
 179
 180
 181
 182
 183
 184
 185
 186
 187
 188
 189
 190
 191
 192
 193
 194
 195
 196
 197
 198
 199
 200
 201
 202
 203
 204
 205
 206
 207
 208
 209
 210
 211
 212
 213
 214
 215
 216
 217
 218
 219
 220
 221
 222
 223
 224
 225
 226
 227
 228
 229
 230
 231
 232
 233
 234
 235
 236
 237
 238
 239
 240
 241
 242
 243
 244
 245
 246
 247
 248
 249
 250
 251
 252
 253
 254
 255
 256
 257
 258
 259
 260
 261
 262
 263
 264
 265
 266
 267
 268
 269
 270
 271
 272
 273
 274
 275
 276
 277
 278
 279
 280
 281
 282
 283
 284
 285
 286
 287
 288
 289
 290
 291
 292
 293
 294
 295
 296
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
class DataFrame:
    """A wrapper around Polars DataFrame for enhanced file and directory analysis.

    This class provides a specialized interface for working with file path data,
    allowing for easy manipulation and analysis of filesystem information.

    All standard Polars DataFrame methods and properties are available through
    attribute delegation, so you can use this like a regular Polars DataFrame
    with additional file-specific functionality.
    """

    def __init__(
        self,
        data: Optional[Union[pl.DataFrame, List[str], List[Path], List[Dict[str, Any]], Dict[str, Any]]] = None,
        lineage: Optional[List[Dict[str, Any]]] = None,
    ):
        """Initialize a DataFrame.

        Args:
        ----
            data: Initial data. Can be:
                - A Polars DataFrame (stored as-is, not copied)
                - A dictionary mapping column names to list/tuple sequences
                  (all same length); ``Path`` values are converted to ``str``
                - A list of dictionaries (one per row), detected by looking
                  at the first element only
                - A list of string paths or Path objects (stored in a single
                  ``path`` column as strings)
                - None for an empty DataFrame with a string ``path`` column
            lineage: Optional list of lineage entries. The list is copied so
                later entries added to this wrapper do not mutate the
                caller's list.

        Raises:
        ------
            ValueError: If ``data`` has an unsupported type, a dictionary
                value is not a list/tuple, or the dictionary sequences have
                different lengths.

        """
        if data is None:
            self._df = pl.DataFrame({"path": []}, schema={"path": pl.String})
        elif isinstance(data, pl.DataFrame):
            self._df = data
        elif isinstance(data, dict):
            if not data:
                self._df = pl.DataFrame()
            else:
                processed: Dict[str, List[Any]] = {}
                expected_len: Optional[int] = None
                for col, values in data.items():
                    if not isinstance(values, (list, tuple)):
                        raise ValueError("Dictionary values must be list or tuple sequences")
                    seq = [str(x) if isinstance(x, Path) else x for x in values]
                    if expected_len is None:
                        # The first column fixes the required length.
                        expected_len = len(seq)
                    elif len(seq) != expected_len:
                        raise ValueError("All dictionary value sequences must have the same length")
                    processed[col] = seq
                self._df = pl.DataFrame(processed)
        elif isinstance(data, list):
            if data and isinstance(data[0], dict):
                # Handle list of dictionaries (from manifest or to_dicts())
                self._df = pl.from_dicts(data)
            else:
                paths = [str(path) for path in data]
                self._df = pl.DataFrame({"path": paths})
        else:
            raise ValueError("data must be a Polars DataFrame, dict of columns, list of paths, or None")
        self._pd_cache = None
        self.with_enrich = False
        self.with_filename_features = False
        # Defensive copy: previously a non-empty list was aliased while an
        # empty one was replaced, so sharing depended on the list's contents.
        self._lineage = list(lineage) if lineage else []

    def _ensure_polars(self) -> pl.DataFrame:
        """Ensure the internal `_df` is a Polars DataFrame and return it.

        If the underlying object is not a Polars DataFrame, attempt to convert
        it: first via `pl.from_pandas` when pandas is available and the object
        is a pandas DataFrame, then via the generic `pl.DataFrame(...)`
        constructor. A successful conversion replaces `_df` and clears the
        cached pandas view.

        Raises
        ------
            RuntimeError: If the object cannot be converted; the underlying
                exception is attached as the cause.

        """
        # Fast path
        if isinstance(self._df, pl.DataFrame):
            return self._df

        # Try pandas conversion first if pandas is present and this looks like
        # a pandas DataFrame
        try:
            if pd is not None and isinstance(self._df, pd.DataFrame):
                self._df = pl.from_pandas(self._df)
                # Invalidate any cached pandas view since we've converted
                self.invalidate_pandas_cache()
                return self._df
        except Exception:
            # fall through to generic conversion
            pass

        # Generic attempt to coerce into a Polars DataFrame
        try:
            self._df = pl.DataFrame(self._df)
            self.invalidate_pandas_cache()
            return self._df
        except Exception as exc:
            raise RuntimeError(f"Unable to coerce internal DataFrame to polars.DataFrame: {exc}") from exc

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute access to the underlying Polars DataFrame.

        This allows direct access to all Polars DataFrame methods and properties
        like columns, dtypes, shape, select, filter, group_by, etc.

        Callable attributes are returned wrapped so that:
        - a call returning ``None`` or the wrapped frame itself clears the
          cached pandas conversion;
        - a call returning a ``pl.DataFrame`` is re-wrapped in
          ``filoma.DataFrame`` (with a copy of the lineage) only when
          ``get_default_wrap_polars()`` is truthy; otherwise the native
          result is returned.

        Raises
        ------
            AttributeError: If the wrapped frame has no such attribute, or if
                the wrapper has no ``_df`` yet.

        """
        # __getattr__ only runs after normal lookup has failed. If ``_df``
        # itself is missing (e.g. an instance created through ``__new__`` by
        # copy/pickle before its state is restored), reading ``self._df``
        # below would re-enter this method endlessly and end in a
        # RecursionError. Fail with a regular AttributeError instead.
        if name == "_df":
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")

        # An AttributeError from the wrapped frame propagates unchanged.
        attr = getattr(self._df, name)

        if callable(attr):

            def wrapper(*args, **kwargs):
                result = attr(*args, **kwargs)
                # ``None`` or the same object reference is treated as a sign
                # that the call may have mutated the frame in place, so the
                # cached pandas conversion is dropped.
                if result is None or result is self._df:
                    try:
                        self.invalidate_pandas_cache()
                    except Exception:
                        # Best-effort: do not let cache invalidation break calls
                        pass
                    return result

                # If wrapping is enabled and result is a Polars DataFrame,
                # wrap it back into filoma.DataFrame and propagate lineage.
                if get_default_wrap_polars() and isinstance(result, pl.DataFrame):
                    return DataFrame(result, lineage=list(self._lineage))

                return result

            return wrapper

        # Non-callable attributes (properties) — if it's a Polars DataFrame and
        # wrapping is requested, wrap it; otherwise return as-is.
        if get_default_wrap_polars() and isinstance(attr, pl.DataFrame):
            return DataFrame(attr, lineage=list(self._lineage))

        return attr

    def __dir__(self) -> List[str]:
        """List the wrapper's attribute names plus those of the wrapped frame.

        The result is de-duplicated and sorted. If listing the wrapped
        frame's attributes fails, only the wrapper's own names are returned.
        """
        names = set(super().__dir__())
        try:
            names |= set(dir(self._df))
        except Exception:
            # Best-effort: fall back to the wrapper's own attributes.
            pass
        return sorted(names)

    def __getitem__(self, key):
        """Subscript the wrapped frame (e.g. ``df['path']``).

        The result of indexing the underlying object is returned unchanged;
        it is not re-wrapped in ``filoma.DataFrame``.
        """
        return self._df[key]

    def __setitem__(self, key, value):
        """Assign a column (``df[key] = value``) on the underlying frame.

        For a string ``key`` the value is turned into a ``pl.Series`` and
        added or replaced through ``with_columns``:
        - a ``pl.Series`` is used directly;
        - an object exposing ``__array__`` (when pandas is importable) is
          passed to ``pl.Series``;
        - a list or tuple becomes the column values;
        - anything else is treated as a scalar repeated for every row.

        If no Series could be built, or ``key`` is not a string, the
        assignment is delegated to the wrapped frame's own ``__setitem__``.
        The cached pandas conversion is cleared after any successful change.

        Raises
        ------
            TypeError: If the assignment form is not supported; the original
                error is attached as the cause.

        """
        try:
            if isinstance(key, str):
                series = None
                if isinstance(value, pl.Series):
                    series = value
                else:
                    try:
                        # NOTE(review): array-likes are only recognised when
                        # pandas is importable — confirm this is intended.
                        if pd is not None and hasattr(value, "__array__") and not isinstance(value, (list, tuple)):
                            series = pl.Series(value)
                        elif isinstance(value, (list, tuple)):
                            series = pl.Series(key, list(value))
                        else:
                            # Scalar value: repeat across rows
                            series = pl.Series(key, [value] * len(self._df))
                    except Exception:
                        # Could not build a Series; use the fallback below.
                        series = None

                if series is not None:
                    # Use with_columns to add/replace the column
                    self._df = self._df.with_columns(series.alias(key))
                    self.invalidate_pandas_cache()
                    return

            # Fallback to delegating to Polars __setitem__ for other patterns
            self._df.__setitem__(key, value)
            # Underlying data has changed; invalidate any cached pandas conversion
            self.invalidate_pandas_cache()
        except TypeError as exc:
            # Re-raise with a clearer message, keeping the original as cause.
            msg = "DataFrame object does not support `Series` assignment by index\n\nUse `DataFrame.with_columns`."
            raise TypeError(msg) from exc

    def invalidate_pandas_cache(self) -> None:
        """Drop the cached pandas conversion by resetting ``_pd_cache`` to None.

        Call this after the underlying frame has been replaced or mutated so
        a later pandas access does not reuse stale data.
        """
        self._pd_cache = None

    def add_lineage_entry(self, operation: str, **kwargs: Any) -> None:
        """Append one record to this DataFrame's lineage history.

        Args:
        ----
            operation: Name of the operation performed.
            **kwargs: Parameters used for the operation. ``Path`` values are
                stored as strings; all other values are stored unchanged.

        """
        parameters = {}
        for key, value in kwargs.items():
            parameters[key] = str(value) if isinstance(value, Path) else value

        # Timestamp is the current local time, formatted to the second.
        stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        entry = {
            "operation": operation,
            "parameters": parameters,
            "timestamp": stamp,
        }
        self._lineage.append(entry)

    @property
    def lineage(self) -> List[Dict[str, Any]]:
        """Lineage entries recorded for this DataFrame.

        This is the internal list itself, not a copy.
        """
        return self._lineage

    @property
    def df(self) -> pl.DataFrame:
        """The wrapped frame object stored in ``_df`` (returned without copying)."""
        return self._df

    def __len__(self) -> int:
        """Return the row count of the wrapped frame.

        Strategies are tried in order and the first one that works wins:
        ``len()``, the ``height`` attribute, ``shape[0]``, and finally (when
        pandas is available) the shape of a pandas conversion. If everything
        fails the result is 0.
        """
        try:
            return len(self._df)
        except Exception:
            pass

        # Some wrapped objects may not implement __len__; try row-count
        # attributes instead.
        try:
            return int(self._df.height)
        except Exception:
            pass

        try:
            return int(self._df.shape[0])
        except Exception:
            pass

        # Last resort: go through pandas if it is available.
        if pd is None:
            return 0
        try:
            return int(self._df.to_pandas().shape[0])
        except Exception:
            return 0

    def __repr__(self) -> str:
        """Return a header line with the row count followed by a frame preview.

        The preview is ``repr()`` of the wrapped frame. If that raises, a
        pandas conversion is tried when pandas is available; otherwise a
        placeholder string is used.
        """
        placeholder = "<unrepresentable DataFrame>"
        row_count = len(self)

        try:
            preview = repr(self._df)
        except Exception:
            # The wrapped object's repr failed; try a pandas-based preview.
            try:
                preview = repr(self._df.to_pandas()) if pd is not None else placeholder
            except Exception:
                preview = placeholder

        return f"filoma.DataFrame with {row_count} rows\n{preview}"

    def __str__(self) -> str:
        """Return the same text as ``repr()`` for this DataFrame."""
        return repr(self)

    def head(self, n: int = 5) -> "DataFrame":
        """Return a new wrapper holding the first ``n`` rows.

        The new wrapper gets a copy of this wrapper's lineage with a
        ``"head"`` entry appended; this wrapper is left unchanged.
        """
        subset = self._df.head(n)
        wrapped = DataFrame(subset, lineage=list(self._lineage))
        wrapped.add_lineage_entry("head", n=n)
        return wrapped

    def tail(self, n: int = 5) -> "DataFrame":
        """Return a new wrapper holding the last ``n`` rows.

        The result starts from a copy of this frame's lineage and records a
        ``tail`` entry on top of it.
        """
        trailing_rows = self._df.tail(n)
        history = list(self._lineage)
        out = DataFrame(trailing_rows, lineage=history)
        out.add_lineage_entry("tail", n=n)
        return out

    def add_path_components(self, inplace: bool = False) -> "DataFrame":
        """Add columns for path components (parent, name, stem, suffix).

        Components are derived from the ``path`` column with ``pathlib.Path``.
        Columns that already exist are left untouched, so calling this method
        repeatedly does not recompute or duplicate anything.

        Args:
        ----
            inplace: If True, modify this DataFrame in-place and return ``self``.

        Returns:
        -------
            New DataFrame with additional path component columns, or ``self``
            when ``inplace=True``.

        """
        # One extractor per component; insertion order fixes the column order.
        extractors = {
            "parent": lambda x: str(Path(x).parent),
            "name": lambda x: Path(x).name,
            "stem": lambda x: Path(x).stem,
            "suffix": lambda x: Path(x).suffix,
        }
        existing = set(self._df.columns)
        cols_to_add = [
            pl.col("path").map_elements(extract, return_dtype=pl.String).alias(col_name)
            for col_name, extract in extractors.items()
            if col_name not in existing
        ]

        if not cols_to_add:
            # Nothing to add. The copy still carries the lineage so the no-op
            # path does not silently drop history.
            return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

        df_with_components = self._df.with_columns(cols_to_add)
        if inplace:
            self._df = df_with_components
            self.invalidate_pandas_cache()
            self.add_lineage_entry("add_path_components")
            return self

        res = DataFrame(df_with_components, lineage=list(self._lineage))
        res.add_lineage_entry("add_path_components")
        return res

    def add_file_stats_cols(
        self,
        path: str = "path",
        base_path: Optional[Union[str, Path]] = None,
        compute_hash: bool = False,
        inplace: bool = False,
    ) -> "DataFrame":
        """Add file statistics columns (size, modified time, etc.) based on a column containing filesystem paths.

        Each path is probed with ``FileProfiler``. Paths that do not exist, or
        whose probe raises, produce a row of nulls (``xattrs`` is ``"{}"``) so
        the output schema stays stable.

        Args:
        ----
            path: Name of the column containing file system paths.
            base_path: Optional base path. If provided, any non-absolute paths in the
                path column are resolved relative to this base.
            compute_hash: Whether to compute SHA256 hashes (slow for large files).
            inplace: If True, modify this DataFrame in-place and return ``self``.

        Returns:
        -------
            New DataFrame with file statistics columns added, or ``self`` when
            ``inplace=True``.

        Raises:
        ------
            ValueError: If the specified path column does not exist.

        """
        existing_cols = self._df.columns
        if path not in existing_cols:
            raise ValueError(f"Column '{path}' not found in DataFrame")

        # Single source of truth for the columns this helper produces.
        stats_schema = {
            "size_bytes": pl.Int64,
            "modified_time": pl.String,
            "created_time": pl.String,
            "is_file": pl.Boolean,
            "is_dir": pl.Boolean,
            "owner": pl.String,
            "group": pl.String,
            "mode_str": pl.String,
            "inode": pl.Int64,
            "nlink": pl.Int64,
            "sha256": pl.String,
            "xattrs": pl.String,
        }

        # Proceed if any target column is missing, OR if hashes were requested
        # and the sha256 column is missing or still has nulls.
        needs_hashes = compute_hash and ("sha256" not in existing_cols or self._df["sha256"].null_count() > 0)
        missing_any = any(c not in existing_cols for c in stats_schema)

        if not missing_any and not needs_hashes:
            return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

        # Resolve base path if provided
        base = Path(base_path) if base_path is not None else None

        # Use filoma's FileProfiler to collect rich file metadata
        profiler = FileProfiler()

        def empty_row() -> Dict[str, Any]:
            """Return a schema-preserving row of nulls for an unreadable path."""
            row: Dict[str, Any] = dict.fromkeys(stats_schema)
            row["xattrs"] = "{}"
            return row

        def get_file_stats(path_str: str) -> Dict[str, Any]:
            try:
                p = Path(path_str)
                if base is not None and not p.is_absolute():
                    p = base / p
                full_path = str(p)
                if not p.exists():
                    logger.warning(f"Path does not exist: {full_path}")
                    return empty_row()

                # Use the profiler; let it handle symlinks and permissions
                filo = profiler.probe(full_path, compute_hash=compute_hash)
                row = filo.as_dict()

                # Normalize keys to a stable schema used by this helper
                return {
                    "size_bytes": row.get("size"),
                    "modified_time": row.get("modified"),
                    "created_time": row.get("created"),
                    "is_file": row.get("is_file"),
                    "is_dir": row.get("is_dir"),
                    "owner": row.get("owner"),
                    "group": row.get("group"),
                    "mode_str": row.get("mode_str"),
                    "inode": row.get("inode"),
                    "nlink": row.get("nlink"),
                    "sha256": row.get("sha256"),
                    "xattrs": json.dumps(row.get("xattrs") or {}),
                }
            except Exception as exc:
                # Best-effort: one bad path must not abort the whole column,
                # but leave a trace instead of swallowing the error silently.
                logger.debug(f"Could not collect file stats for {path_str!r}: {exc}")
                return empty_row()

        stats_data = [get_file_stats(p) for p in self._df[path].to_list()]
        stats_df = pl.DataFrame(stats_data, schema=stats_schema)

        # When hashes were not requested, freshly probed rows carry no hash;
        # keep an existing sha256 column rather than overwriting it with nulls.
        if not compute_hash and "sha256" in existing_cols:
            stats_df = stats_df.drop("sha256")

        # Drop columns that are about to be replaced to avoid duplicates in the concat
        df_base = self._df
        overlapping_cols = [c for c in stats_df.columns if c in df_base.columns]
        if overlapping_cols:
            df_base = df_base.drop(overlapping_cols)

        df_with_stats = pl.concat([df_base, stats_df], how="horizontal")
        if inplace:
            self._df = df_with_stats
            self.invalidate_pandas_cache()
            self.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
            return self

        res = DataFrame(df_with_stats, lineage=list(self._lineage))
        res.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
        return res

    def add_depth_col(self, path: Optional[Union[str, Path]] = None, inplace: bool = False) -> "DataFrame":
        """Add a depth column showing the nesting level of each path.

        Depth is the number of path parts below the reference path. A path that
        is not located under the reference path falls back to its total number
        of parts. If a ``depth`` column already exists the frame is left as is.

        Args:
        ----
            path: The path to calculate depth from. If None, uses the common root.
            inplace: If True, modify this DataFrame in-place and return ``self``.

        Returns:
        -------
            New DataFrame with depth column, or ``self`` when ``inplace=True``.

        """
        if "depth" in self._df.columns:
            # Nothing to add. The copy still carries the lineage so the no-op
            # path does not silently drop history.
            return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

        if path is None:
            # Derive the longest run of leading parts shared by every path.
            paths = [Path(p) for p in self._df["path"].to_list()]
            common_parts = []
            if paths:
                for i, part in enumerate(paths[0].parts):
                    if all(len(p.parts) > i and p.parts[i] == part for p in paths):
                        common_parts.append(part)
                    else:
                        break
            path = Path(*common_parts) if common_parts else Path()
        else:
            path = Path(path)

        # Separate name so the closure below does not shadow the parameter
        path_root = path

        def calculate_depth(path_str: str) -> int:
            """Calculate the depth of a path relative to the provided root path."""
            p = Path(path_str)
            try:
                return len(p.relative_to(path_root).parts)
            except ValueError:
                # Path is not relative to the provided root path
                return len(p.parts)

        df_with_depth = self._df.with_columns([pl.col("path").map_elements(calculate_depth, return_dtype=pl.Int64).alias("depth")])
        if inplace:
            self._df = df_with_depth
            self.invalidate_pandas_cache()
            self.add_lineage_entry("add_depth_col", reference_path=path)
            return self

        res = DataFrame(df_with_depth, lineage=list(self._lineage))
        res.add_lineage_entry("add_depth_col", reference_path=path)
        return res

    def filter_by_extension(self, extensions: Union[str, List[str]]) -> "DataFrame":
        """Keep only the rows whose path suffix matches one of the given extensions.

        Matching is case-insensitive on both sides.

        Args:
        ----
            extensions: File extension(s) to filter by (with or without leading dot)

        Returns:
        -------
            Filtered DataFrame

        """
        wanted = [extensions] if isinstance(extensions, str) else extensions

        # Bring every extension to the lowercase ".ext" form
        accepted = [(item if item.startswith(".") else "." + item).lower() for item in wanted]

        def has_accepted_suffix(path_str):
            return Path(path_str).suffix.lower() in accepted

        keep_mask = pl.col("path").map_elements(has_accepted_suffix, return_dtype=pl.Boolean)
        out = DataFrame(self._df.filter(keep_mask), lineage=list(self._lineage))
        out.add_lineage_entry("filter_by_extension", extensions=wanted)
        return out

    def filter_by_pattern(self, pattern: str) -> "DataFrame":
        """Keep only the rows whose ``path`` contains the given pattern.

        Args:
        ----
            pattern: Pattern to match (uses Polars string contains)

        Returns:
        -------
            Filtered DataFrame

        """
        matches = pl.col("path").str.contains(pattern)
        out = DataFrame(self._df.filter(matches), lineage=list(self._lineage))
        out.add_lineage_entry("filter_by_pattern", pattern=pattern)
        return out

    def extension_counts(self) -> "DataFrame":
        """Group files by extension and count them.

        Extensions are lowercased; paths without a suffix are grouped under
        ``"<no extension>"``.

        Returns
        -------
            filoma DataFrame wrapping a frame with ``extension`` and ``len``
            columns, sorted by count in descending order

        """

        def normalized_suffix(path_str: str) -> str:
            suffix = Path(path_str).suffix
            return suffix.lower() if suffix else "<no extension>"

        # underlying `_df` is expected to be a Polars DataFrame
        df_with_ext = self._df.with_columns([pl.col("path").map_elements(normalized_suffix, return_dtype=pl.String).alias("extension")])
        result = df_with_ext.group_by("extension").len().sort("len", descending=True)
        return DataFrame(result)

    def directory_counts(self) -> "DataFrame":
        """Group files by their parent directory and count them.

        Returns
        -------
            filoma DataFrame wrapping a frame with ``parent_dir`` and ``len``
            columns, sorted by count in descending order

        """

        def parent_of(path_str: str) -> str:
            return str(Path(path_str).parent)

        # underlying `_df` is expected to be a Polars DataFrame
        df_with_parent = self._df.with_columns([pl.col("path").map_elements(parent_of, return_dtype=pl.String).alias("parent_dir")])
        result = df_with_parent.group_by("parent_dir").len().sort("len", descending=True)
        return DataFrame(result)

    def to_polars(self) -> pl.DataFrame:
        """Return the wrapped Polars DataFrame itself (no copy is made)."""
        wrapped = self._df
        return wrapped

    def to_pandas(self, force: bool = False) -> Any:
        """Return a pandas conversion of the frame, reusing the cache when allowed.

        The first call converts the current Polars DataFrame and stores the
        result; later calls return that stored object. Pass ``force=True`` to
        convert again and replace the stored object.
        """
        if pd is None:
            raise ImportError("pandas is not installed. Please install it to use to_pandas().")

        cache_usable = self._pd_cache is not None and not force
        if not cache_usable:
            # Conversion goes through Polars' own to_pandas for consistency
            self._pd_cache = self._df.to_pandas()
        return self._pd_cache

    @property
    def polars(self) -> pl.DataFrame:
        """Attribute-style shortcut for ``to_polars()``."""
        frame = self.to_polars()
        return frame

    @property
    def pandas(self) -> Any:
        """Convert the current Polars frame to pandas on every access.

        The cached conversion is neither read nor updated here, so the result
        always reflects the current frame. For repeated reads prefer
        ``pandas_cached`` or ``to_pandas(force=False)``; use
        ``to_pandas(force=True)`` to reconvert and update the cache.

        Raises
        ------
            ImportError: if pandas is not installed.

        """
        if pd is not None:
            return self._df.to_pandas()
        raise ImportError("pandas is not installed. Please install it to use pandas property.")

    @property
    def pandas_cached(self) -> Any:
        """Return the cached pandas conversion, creating it on first access.

        Suited to repeated reads where reconverting would be expensive. The
        cache can be reset with ``invalidate_pandas_cache()`` or refreshed by
        calling ``to_pandas(force=True)``.
        """
        cached = self.to_pandas(force=False)
        return cached

    @property
    def native(self):
        """Return the frame in the module-wide default backend.

        A Polars DataFrame is returned when `get_default_dataframe_backend()`
        reports 'polars'; any other value yields a pandas DataFrame.
        """
        backend = get_default_dataframe_backend()
        return self.polars if backend == "polars" else self.pandas

    @classmethod
    def from_pandas(cls, df: Any) -> "DataFrame":
        """Build a filoma.DataFrame from a pandas DataFrame.

        The pandas frame is converted to Polars first and the Polars result is
        wrapped. Raises ``RuntimeError`` when pandas is not installed.
        """
        if pd is None:
            raise RuntimeError("pandas is not available in this environment")
        # Internal storage is always Polars
        return cls(pl.from_pandas(df))

    def to_dict(self) -> Dict[str, List]:
        """Return the frame as a dict built by the wrapped frame's ``to_dict(as_series=False)``."""
        as_mapping = self._df.to_dict(as_series=False)
        return as_mapping

    def save_csv(self, path: Union[str, Path]) -> None:
        """Write the frame to ``path`` as CSV (the path is passed on as a string)."""
        target = str(path)
        self._df.write_csv(target)

    def save_parquet(self, path: Union[str, Path]) -> None:
        """Write the frame to ``path`` in Parquet format (the path is passed on as a string)."""
        target = str(path)
        self._df.write_parquet(target)

    # Convenience methods for common Polars operations that users expect
    @property
    def columns(self) -> List[str]:
        """Return the column names of the wrapped frame."""
        names = self._df.columns
        return names

    @property
    def dtypes(self) -> List[pl.DataType]:
        """Return the column data types of the wrapped frame."""
        column_types = self._df.dtypes
        return column_types

    @property
    def shape(self) -> tuple:
        """Return ``(rows, columns)`` for the wrapped frame.

        The wrapped object's own ``.shape`` is preferred. If that fails, rows
        come from ``len(self)`` and columns from ``.columns`` (or a pandas
        conversion when pandas is installed); any piece that cannot be
        determined is reported as 0.
        """
        try:
            n_rows, n_cols = self._df.shape
            return (int(n_rows), int(n_cols))
        except Exception:
            pass

        # Row count fallback
        try:
            n_rows = len(self)
        except Exception:
            n_rows = 0

        # Column count fallback: .columns first, then a pandas conversion
        try:
            n_cols = len(self._df.columns)
        except Exception:
            try:
                n_cols = self._df.to_pandas().shape[1] if pd is not None else 0
            except Exception:
                n_cols = 0

        return (int(n_rows), int(n_cols))

    def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
        """Generate descriptive statistics.

        Args:
        ----
            percentiles: List of percentiles to include (default: [0.25, 0.5, 0.75])

        Returns:
        -------
            filoma DataFrame wrapping the summary produced by Polars' ``describe``

        """
        # Passing ``percentiles=None`` through would override Polars' own
        # default instead of selecting it, so only forward an explicit value.
        if percentiles is None:
            summary = self._df.describe()
        else:
            summary = self._df.describe(percentiles=percentiles)
        return DataFrame(summary)

    def info(self) -> None:
        """Print a short report: shape, column count, per-column dtype and null count, estimated size."""
        print("filoma.DataFrame")
        print(f"Shape: {self.shape}")
        print(f"Columns: {len(self.columns)}")
        print()

        # One line per column: index, name, dtype, null count
        print("Column details:")
        for position, (col_name, col_type) in enumerate(zip(self.columns, self.dtypes)):
            n_nulls = self._df[col_name].null_count()
            print(f"  {position:2d}  {col_name:15s} {str(col_type):15s} {n_nulls:8d} nulls")

        # Sum of the per-column size estimates, in megabytes
        total_mb = 0
        for col_name in self.columns:
            total_mb += self._df[col_name].estimated_size("mb")
        print(f"\nEstimated memory usage: {total_mb:.2f} MB")

    def unique(self, subset: Optional[Union[str, List[str]]] = None) -> "DataFrame":
        """Return a new wrapper with duplicate rows removed.

        Args:
        ----
            subset: Column name(s) to consider for uniqueness

        """
        # Only forward ``subset`` when the caller supplied one
        options = {} if subset is None else {"subset": subset}
        deduplicated = self._df.unique(**options)
        out = DataFrame(deduplicated, lineage=list(self._lineage))
        out.add_lineage_entry("unique", subset=subset)
        return out

    def sort(self, by: Union[str, List[str]], descending: bool = False) -> "DataFrame":
        """Return a new wrapper with the rows sorted.

        Args:
        ----
            by: Column name(s) to sort by
            descending: Sort in descending order

        """
        ordered = self._df.sort(by, descending=descending)
        out = DataFrame(ordered, lineage=list(self._lineage))
        out.add_lineage_entry("sort", by=by, descending=descending)
        return out

    def enrich(self, inplace: bool = False):
        """Add path components, file stats and depth columns in one step.

        Args:
        ----
            inplace: If True, perform the operation in-place and return self.
                     If False (default), return a new DataFrame with the changes.

        """
        # Run the three enrichment steps on intermediate wrappers and keep only
        # the resulting Polars frame, so the lineage gets a single 'enrich'
        # entry instead of one entry per step.
        staged = self.add_path_components()
        staged = staged.add_file_stats_cols()
        staged = staged.add_depth_col()
        enriched_df = staged._df

        if not inplace:
            # Hand back a separate, enriched wrapper
            fresh = DataFrame(enriched_df, lineage=list(self._lineage))
            fresh.with_enrich = True
            fresh.add_lineage_entry("enrich")
            return fresh

        # Replace this object's frame and mark it as enriched
        self._df = enriched_df
        self.with_enrich = True
        self.invalidate_pandas_cache()
        self.add_lineage_entry("enrich")
        return self

    def evaluate_duplicates(
        self,
        path_col: str = "path",
        text_threshold: float = 0.8,
        image_max_distance: int = 5,
        text_k: int = 3,
        show_table: bool = True,
        cross_dir_paths: Optional[List[str]] = None,
    ) -> dict:
        """Evaluate duplicates among files in the DataFrame.

        Scans the `path_col` column, runs exact, text and image duplicate
        detectors. Optionally filters to show only duplicates that cross
        directory boundaries (requires `cross_dir_paths` to define boundaries).

        Args:
        ----
            path_col: Column containing the paths to examine.
            text_threshold: Forwarded to the duplicate finder as ``text_threshold``.
            image_max_distance: Forwarded to the duplicate finder as ``image_max_distance``.
            text_k: Forwarded to the duplicate finder as ``text_k``.
            show_table: If True, print a summary table to the console.
            cross_dir_paths: Directories acting as boundaries. When given, only
                groups whose members fall under more than one of them are kept.

        Returns:
        -------
            The duplicate finder's result dict, with the ``exact``, ``text`` and
            ``image`` entries filtered when ``cross_dir_paths`` is given.

        Raises:
        ------
            ValueError: If the specified path column does not exist.

        """
        if path_col not in self._df.columns:
            raise ValueError(f"Column '{path_col}' not found in DataFrame")

        # Files only; null entries are skipped instead of crashing Path()
        paths = [str(p) for p in self._df[path_col].to_list() if p is not None and Path(p).is_file()]
        res = _dedup.find_duplicates(
            paths,
            text_k=text_k,
            text_threshold=text_threshold,
            image_max_distance=image_max_distance,
        )

        categories = ("exact", "text", "image")

        # Filter for cross-directory duplicates if requested
        if cross_dir_paths:
            boundaries = [(cp, Path(str(cp))) for cp in cross_dir_paths]

            def spans_multiple_dirs(group) -> bool:
                """Return True when the group's files fall under more than one boundary."""
                source_dirs = set()
                for member in group:
                    member_path = Path(str(member))
                    for original, boundary in boundaries:
                        # Compare whole path components: a raw string prefix
                        # test would let "/data/a" match "/data/ab/file".
                        if member_path == boundary or boundary in member_path.parents:
                            source_dirs.add(original)
                return len(source_dirs) > 1

            for category in categories:
                res[category] = [group for group in res.get(category, []) if spans_multiple_dirs(group)]

        # Summarize counts once; both the table and the log line use them
        summary = {}
        for category in categories:
            groups = res.get(category, [])
            summary[category] = (len(groups), sum(len(g) for g in groups) if groups else 0)

        if show_table:
            console = Console()
            table = Table(title="Duplicate Summary (Cross-Dir)" if cross_dir_paths else "Duplicate Summary")
            table.add_column("Type", style="bold cyan")
            table.add_column("Groups", style="white")
            table.add_column("Files In Groups", style="white")
            for category in categories:
                n_groups, n_files = summary[category]
                table.add_row(category, str(n_groups), str(n_files))
            console.print(table)

        logger.info(
            f"Duplicate summary: exact={summary['exact'][0]} groups "
            f"({summary['exact'][1]} files), "
            f"text={summary['text'][0]} groups "
            f"({summary['text'][1]} files), "
            f"image={summary['image'][0]} groups "
            f"({summary['image'][1]} files)"
        )

        return res

    def add_filename_features(
        self,
        path_col: str = "path",
        sep: str = "_",
        prefix: Optional[str] = "feat",
        max_tokens: Optional[int] = None,
        include_parent: bool = False,
        include_all_parts: bool = False,
        token_names: Optional[Union[str, Sequence[str]]] = None,
        enrich: bool = False,
        inplace: bool = False,
    ) -> "DataFrame":
        """Discover filename features and add them as columns on this DataFrame.

        This instance method discovers separator-based tokens from filename
        stems and adds columns (e.g., `feat1`, `feat2` or `token1`, ...).
        Rows with fewer tokens than the number of columns get empty strings.

        Args:
        ----
            path_col: Column containing path strings to analyze (default: 'path').
            sep: Separator used to split filename stems (default: '_').
            prefix: Column name prefix for discovered tokens (default: 'feat').
                When empty or None, 'token' is used.
            max_tokens: Optional cap on extracted tokens; by default uses observed max.
            include_parent: If True, add a `parent` column containing immediate parent folder name.
            include_all_parts: If True, add `path_part0`, `path_part1`, ... for all Path.parts.
            token_names: Optional list/tuple of token column names. Missing or
                empty entries, 'auto', and None all fall back to the
                prefix-based names.
            enrich: If True, automatically enrich the DataFrame with path components and file stats before discovery.
            inplace: If True, perform the operation in-place and return self. Otherwise returns a new `filoma.DataFrame`.

        Returns:
        -------
            A new or modified `filoma.DataFrame` with discovered filename features.

        Raises:
        ------
            ValueError: If `path_col` is not a column of the DataFrame.

        """
        # Determine the base frame for feature discovery
        base_df = self
        if enrich and not self.with_enrich:
            logger.info("Enriching DataFrame before discovering filename features")
            base_df = self.enrich(inplace=False)

        pl_df = base_df._df
        if path_col not in pl_df.columns:
            raise ValueError(f"DataFrame must have a '{path_col}' column")

        path_values = pl_df[path_col].to_list()
        observed_max = max((len(Path(s).stem.split(sep)) for s in path_values), default=0)
        eff_max = observed_max if max_tokens is None else max_tokens

        # Only list/tuple inputs supply explicit names; everything else
        # (None, 'auto', other strings) uses the prefix-based naming.
        explicit_names = list(token_names) if isinstance(token_names, (list, tuple)) else []
        fallback_prefix = prefix if prefix else "token"

        new_cols = []
        for i in range(eff_max):
            if i < len(explicit_names) and explicit_names[i]:
                col_name = explicit_names[i]
            else:
                col_name = f"{fallback_prefix}{i + 1}"

            # ``idx=i`` binds the current index; a plain closure would see the last one
            def pick_token(s: str, idx: int = i) -> str:
                parts = Path(s).stem.split(sep)
                return parts[idx] if idx < len(parts) else ""

            new_cols.append(pl.col(path_col).map_elements(pick_token, return_dtype=pl.Utf8).alias(col_name))

        if include_parent:
            # NOTE(review): this shares the name of the `parent` column added by
            # add_path_components (full parent path) — confirm the overlap is intended.
            new_cols.append(pl.col(path_col).map_elements(lambda s: Path(s).parent.name, return_dtype=pl.Utf8).alias("parent"))

        if include_all_parts:
            max_parts = max((len(Path(s).parts) for s in path_values), default=0)
            for i in range(max_parts):

                def pick_part(s: str, idx: int = i) -> str:
                    try:
                        parts = Path(s).parts
                    except Exception:
                        return ""
                    return parts[idx] if idx < len(parts) else ""

                new_cols.append(pl.col(path_col).map_elements(pick_part, return_dtype=pl.Utf8).alias(f"path_part{i}"))

        pl_result = pl_df.with_columns(new_cols)

        # Build the lineage from the frame the columns were derived from, so an
        # implicit enrich step stays visible in the history.
        enriched_wrapper = DataFrame(pl_result, lineage=list(base_df._lineage))
        enriched_wrapper.with_filename_features = True
        if base_df.with_enrich:
            # The result carries the enrichment columns; keep the flag in sync
            enriched_wrapper.with_enrich = True
        enriched_wrapper.add_lineage_entry(
            "add_filename_features",
            path_col=path_col,
            sep=sep,
            prefix=prefix,
            max_tokens=max_tokens,
            include_parent=include_parent,
            include_all_parts=include_all_parts,
            token_names=token_names,
        )

        if inplace:
            self._df = enriched_wrapper._df
            self.with_filename_features = True
            if enrich and not self.with_enrich:
                self.with_enrich = True
            self.invalidate_pandas_cache()
            self._lineage = enriched_wrapper._lineage
            return self

        return enriched_wrapper

columns property

Get column names.

df property

Get the underlying Polars DataFrame.

dtypes property

Get column data types.

lineage property

Return the lineage history of this DataFrame.

native property

Return the dataframe in the module-wide default backend.

If get_default_dataframe_backend() is 'polars' this returns a Polars DataFrame, otherwise it returns a pandas DataFrame.

pandas property

Return a fresh pandas DataFrame conversion (not the cached object).

This is intentionally a fresh conversion so callers who expect an up-to-date pandas view can access it directly. Use pandas_cached or to_pandas(force=False) to access the cached conversion for repeated reads, or to_pandas(force=True) to reconvert and update the cache.

Raises

ImportError: if pandas is not installed.

pandas_cached property

Return a cached pandas DataFrame, converting once if needed.

This is useful when repeated conversions would be expensive and the caller is comfortable with an explicit cache that can be invalidated with invalidate_pandas_cache() or by calling to_pandas(force=True).

polars property

Property access for the underlying Polars DataFrame (convenience).

shape property

Get DataFrame shape (rows, columns).

__dir__()

Expose both wrapper and underlying Polars attributes in interactive help.

Source code in filoma/dataframe.py
255
256
257
258
259
260
261
262
def __dir__(self) -> List[str]:
    """List wrapper attributes together with those of the wrapped Polars frame, sorted."""
    names = set(super().__dir__())
    try:
        names |= set(dir(self._df))
    except Exception:
        # Best-effort: fall back to the wrapper's own attributes only
        pass
    return sorted(names)

__getattr__(name)

Delegate attribute access to the underlying Polars DataFrame.

This allows direct access to all Polars DataFrame methods and properties like columns, dtypes, shape, select, filter, group_by, etc.

Source code in filoma/dataframe.py
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
def __getattr__(self, name: str) -> Any:
    """Delegate attribute access to the underlying Polars DataFrame.

    Python calls this hook only after normal attribute lookup on the wrapper
    has failed, so it exposes Polars methods and properties such as
    ``select``, ``filter`` and ``group_by`` directly on the wrapper.

    Callable attributes are returned behind a thin proxy that:

    * invalidates the cached pandas conversion when the call returns ``None``
      or the wrapped frame itself (treated as a sign of in-place mutation);
    * re-wraps a returned ``pl.DataFrame`` into ``filoma.DataFrame`` (with a
      copy of the lineage) when ``get_default_wrap_polars()`` is truthy;
    * otherwise returns the native result unchanged.

    Non-callable attributes follow the same re-wrapping rule.

    Raises:
    ------
        AttributeError: If the wrapped frame has no such attribute, or if the
            wrapped frame has not been set on this instance yet.

    """
    # Fetch ``_df`` without re-entering this hook. On an instance whose
    # ``_df`` is not set yet (e.g. while ``copy``/``pickle`` probe for
    # ``__setstate__`` before restoring state), ``self._df`` would call
    # ``__getattr__('_df')`` again and recurse until RecursionError.
    try:
        inner = object.__getattribute__(self, "_df")
    except AttributeError:
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None

    # An AttributeError raised by the wrapped frame propagates unchanged.
    attr = getattr(inner, name)

    if callable(attr):

        def wrapper(*args, **kwargs):
            result = attr(*args, **kwargs)
            # ``None`` or the same frame object is taken to mean the call
            # mutated the frame in place, so the cached pandas conversion
            # may be stale.
            if result is None or result is self._df:
                try:
                    self.invalidate_pandas_cache()
                except Exception:
                    # Best-effort: do not let cache invalidation break calls
                    pass
                return result

            # Optional compatibility mode: hand back a filoma wrapper that
            # carries a copy of this wrapper's lineage.
            if get_default_wrap_polars() and isinstance(result, pl.DataFrame):
                return DataFrame(result, lineage=list(self._lineage))

            return result

        return wrapper

    # Non-callable attributes (properties): same wrapping rule as above.
    if get_default_wrap_polars() and isinstance(attr, pl.DataFrame):
        return DataFrame(attr, lineage=list(self._lineage))

    return attr

__getitem__(key)

Forward subscription (e.g., df['path']) to the underlying Polars DataFrame.

Returns native Polars objects (Series or DataFrame) to match the default Polars-first behavior of this wrapper.

Source code in filoma/dataframe.py
264
265
266
267
268
269
270
def __getitem__(self, key):
    """Subscript the wrapped Polars DataFrame (e.g. ``df['path']``).

    Whatever the underlying subscription produces is handed back as-is,
    without being re-wrapped by this class.
    """
    return self._df[key]

__init__(data=None, lineage=None)

Initialize a DataFrame.


data: Initial data. Can be:
    - A Polars DataFrame
    - A dictionary mapping column names to sequences (all same length)
    - A list of string paths
    - A list of Path objects
    - None for an empty DataFrame
lineage: Optional list of lineage entries.
Source code in filoma/dataframe.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def __init__(
    self,
    data: Optional[Union[pl.DataFrame, List[str], List[Path], Dict[str, Any]]] = None,
    lineage: Optional[List[Dict[str, Any]]] = None,
):
    """Initialize a DataFrame.

    Args:
    ----
        data: Initial data. Can be:
            - A Polars DataFrame (stored as-is, not copied)
            - A dictionary mapping column names to list/tuple sequences
              (all of the same length); ``Path`` values are converted to ``str``
            - A list of dictionaries (one per row)
            - A list of string paths or Path objects (stored in a ``path`` column)
            - None for an empty DataFrame with a string ``path`` column
        lineage: Optional list of lineage entries. The list is copied so that
            later lineage additions do not modify the caller's list.

    Raises:
    ------
        ValueError: If ``data`` has an unsupported type, a dictionary value is
            not a list/tuple, or dictionary sequences differ in length.

    """
    if data is None:
        self._df = pl.DataFrame({"path": []}, schema={"path": pl.String})
    elif isinstance(data, pl.DataFrame):
        self._df = data
    elif isinstance(data, dict):
        if not data:
            self._df = pl.DataFrame()
        else:
            processed: Dict[str, List[Any]] = {}
            expected_len: Optional[int] = None
            for col, values in data.items():
                if not isinstance(values, (list, tuple)):
                    raise ValueError("Dictionary values must be list or tuple sequences")
                seq = [str(x) if isinstance(x, Path) else x for x in values]
                if expected_len is None:
                    expected_len = len(seq)
                elif len(seq) != expected_len:
                    raise ValueError("All dictionary value sequences must have the same length")
                processed[col] = seq
            self._df = pl.DataFrame(processed)
    elif isinstance(data, list):
        if data and isinstance(data[0], dict):
            # Handle list of dictionaries (from manifest or to_dicts())
            self._df = pl.from_dicts(data)
        else:
            paths = [str(path) for path in data]
            # Pin the dtype so an empty list yields a String column, exactly
            # like the ``data is None`` case, instead of a Null-typed one.
            self._df = pl.DataFrame({"path": paths}, schema={"path": pl.String})
    else:
        raise ValueError("data must be a Polars DataFrame, dict of columns, list of paths, or None")
    self._pd_cache = None
    self.with_enrich = False
    self.with_filename_features = False
    # Copy so add_lineage_entry() never appends to a list owned by the caller.
    self._lineage = list(lineage) if lineage else []

__len__()

Get the number of rows in the DataFrame.

Source code in filoma/dataframe.py
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
def __len__(self) -> int:
    """Return the number of rows in the wrapped frame.

    Several strategies are tried in order, each one only if the previous
    raised: ``len()``, the ``height`` attribute, ``shape[0]``, and finally a
    pandas conversion when pandas is available. If every strategy fails the
    result is ``0``.
    """
    # Preferred: the wrapped object supports len() directly.
    try:
        return len(self._df)
    except Exception:
        pass

    # Row count exposed as ``.height``.
    try:
        return int(self._df.height)
    except Exception:
        pass

    # Row count as the first element of ``.shape``.
    try:
        return int(self._df.shape[0])
    except Exception:
        pass

    # Last resort: count rows after converting to pandas, if it is installed.
    if pd is None:
        return 0
    try:
        return int(self._df.to_pandas().shape[0])
    except Exception:
        return 0

__repr__()

Return the string representation of the DataFrame.

Source code in filoma/dataframe.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
def __repr__(self) -> str:
    """Return a header line with the row count followed by a frame preview.

    The preview is the wrapped frame's own ``repr``. If that raises, a
    pandas rendering is attempted (when pandas is available); if that is not
    possible either, a fixed placeholder is used.
    """
    row_count = len(self)
    try:
        df_preview = repr(self._df)
    except Exception:
        # Start from the placeholder and replace it only if the pandas
        # rendering succeeds.
        df_preview = "<unrepresentable DataFrame>"
        try:
            if pd is not None:
                df_preview = repr(self._df.to_pandas())
        except Exception:
            pass

    return f"filoma.DataFrame with {row_count} rows\n{df_preview}"

__setitem__(key, value)

Forward item assignment to the underlying Polars DataFrame.

Source code in filoma/dataframe.py
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
def __setitem__(self, key, value):
    """Assign to the wrapped frame, supporting ``df["col"] = value``.

    For a string ``key`` the value is turned into a Polars Series and the
    column is added or replaced through ``with_columns``:

    * a ``pl.Series`` is used directly;
    * a list or tuple becomes a Series of its elements;
    * any other object exposing ``__array__`` (e.g. a NumPy array or pandas
      Series) is passed to ``pl.Series``;
    * anything else is treated as a scalar and repeated for every row.

    Every other assignment form, and any value that could not be converted,
    is forwarded to the Polars ``__setitem__``. The cached pandas conversion
    is invalidated after every successful assignment.

    Raises:
    ------
        TypeError: If the assignment form is not supported; the original
            error is attached as ``__cause__``.

    """
    try:
        if isinstance(key, str):
            series = None
            try:
                if isinstance(value, pl.Series):
                    series = value
                elif isinstance(value, (list, tuple)):
                    series = pl.Series(key, list(value))
                elif hasattr(value, "__array__"):
                    # Array-likes are recognised whether or not pandas is
                    # installed; the column name is applied by alias() below.
                    series = pl.Series(value)
                else:
                    # Scalar value: repeat across rows
                    series = pl.Series(key, [value] * len(self._df))
            except Exception:
                # Conversion failed: let the Polars fallback below decide.
                series = None

            if series is not None:
                # Use with_columns to add/replace the column
                self._df = self._df.with_columns(series.alias(key))
                self.invalidate_pandas_cache()
                return

        # Fallback to delegating to Polars __setitem__ for other patterns
        self._df.__setitem__(key, value)
        # Underlying data has changed; invalidate any cached pandas conversion
        self.invalidate_pandas_cache()
    except TypeError as exc:
        # Polars raises TypeError for some unsupported assignment forms
        # (e.g., assigning a Series by index). Re-raise a clearer message
        # while keeping the original error as the explicit cause.
        msg = "DataFrame object does not support `Series` assignment by index\n\nUse `DataFrame.with_columns`."
        raise TypeError(msg) from exc

__str__()

Return the string representation of the DataFrame.

Source code in filoma/dataframe.py
393
394
395
def __str__(self) -> str:
    """Return the same text as ``repr()`` for this object."""
    return repr(self)

add_depth_col(path=None, inplace=False)

Add a depth column showing the nesting level of each path.


path: The path to calculate depth from. If None, uses the common root.
inplace: If True, modify this DataFrame in-place and return ``self``.

New DataFrame with depth column
Source code in filoma/dataframe.py
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
def add_depth_col(self, path: Optional[Union[str, Path]] = None, inplace: bool = False) -> "DataFrame":
    """Add a ``depth`` column showing the nesting level of each path.

    Depth is the number of path parts of each ``path`` value relative to a
    root. Paths that are not located under the root get the total number of
    their own parts instead. If a ``depth`` column already exists nothing is
    recomputed.

    Args:
    ----
        path: The path to calculate depth from. If None, uses the longest
            leading sequence of parts shared by all paths (the common root).
        inplace: If True, modify this DataFrame in-place and return ``self``.

    Returns:
    -------
        New DataFrame with depth column, or ``self`` when ``inplace=True``.

    """
    if "depth" in self._df.columns:
        # Nothing to compute; keep the lineage on the returned copy.
        return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

    if path is None:
        all_parts = [Path(p).parts for p in self._df["path"].to_list()]
        common_parts = []
        # zip() stops at the shortest path, so each `level` holds the i-th
        # part of every path; the common prefix ends at the first level
        # where the paths disagree.
        for level in zip(*all_parts):
            if len(set(level)) != 1:
                break
            common_parts.append(level[0])
        path_root = Path(*common_parts) if common_parts else Path()
    else:
        path_root = Path(path)

    def calculate_depth(path_str: str) -> int:
        """Calculate the depth of a path relative to the provided root path."""
        try:
            return len(Path(path_str).relative_to(path_root).parts)
        except ValueError:
            # Path is not relative to the provided root path
            return len(Path(path_str).parts)

    df_with_depth = self._df.with_columns([pl.col("path").map_elements(calculate_depth, return_dtype=pl.Int64).alias("depth")])
    if inplace:
        self._df = df_with_depth
        self.invalidate_pandas_cache()
        self.add_lineage_entry("add_depth_col", reference_path=path_root)
        return self

    res = DataFrame(df_with_depth, lineage=list(self._lineage))
    res.add_lineage_entry("add_depth_col", reference_path=path_root)
    return res

add_file_stats_cols(path='path', base_path=None, compute_hash=False, inplace=False)

Add file statistics columns (size, modified time, etc.) based on a column containing filesystem paths.


path: Name of the column containing file system paths.
base_path: Optional base path. If provided, any non-absolute paths in the
    path column are resolved relative to this base.
compute_hash: Whether to compute SHA256 hashes (slow for large files).
inplace: If True, modify this DataFrame in-place and return ``self``.

New DataFrame with file statistics columns added, or ``self`` when
``inplace=True``.

ValueError: If the specified path column does not exist.
Source code in filoma/dataframe.py
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
def add_file_stats_cols(
    self,
    path: str = "path",
    base_path: Optional[Union[str, Path]] = None,
    compute_hash: bool = False,
    inplace: bool = False,
) -> "DataFrame":
    """Add file statistics columns (size, modified time, etc.) based on a column containing filesystem paths.

    Each path is probed with ``FileProfiler``. Paths that do not exist, or
    whose probing raises, produce a row of nulls (with ``xattrs`` set to
    ``"{}"``). The work is skipped when every statistics column is already
    present and no hashes need to be computed.

    Args:
    ----
        path: Name of the column containing file system paths.
        base_path: Optional base path. If provided, any non-absolute paths in the
            path column are resolved relative to this base.
        compute_hash: Whether to compute SHA256 hashes (slow for large files).
            When False, an already existing ``sha256`` column is kept as-is.
        inplace: If True, modify this DataFrame in-place and return ``self``.

    Returns:
    -------
        New DataFrame with file statistics columns added, or ``self`` when
        ``inplace=True``.

    Raises:
    ------
        ValueError: If the specified path column does not exist.

    """
    if path not in self._df.columns:
        raise ValueError(f"Column '{path}' not found in DataFrame")

    # Names of the columns this helper produces, in output order.
    stat_columns = (
        "size_bytes",
        "modified_time",
        "created_time",
        "is_file",
        "is_dir",
        "owner",
        "group",
        "mode_str",
        "inode",
        "nlink",
        "sha256",
        "xattrs",
    )
    # Decide if we need to proceed. Proceed if any target column is missing,
    # OR if we need to compute hashes and the column is missing or has nulls.
    needs_hashes = compute_hash and ("sha256" not in self._df.columns or self._df["sha256"].null_count() > 0)
    missing_any = not all(c in self._df.columns for c in stat_columns)

    if not missing_any and not needs_hashes:
        return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

    # Resolve base path if provided
    base = Path(base_path) if base_path is not None else None

    # Use filoma's FileProfiler to collect rich file metadata
    profiler = FileProfiler()

    def empty_row() -> Dict[str, Any]:
        """Build the all-null row used for missing or unreadable paths."""
        row: Dict[str, Any] = dict.fromkeys(stat_columns)
        row["xattrs"] = "{}"
        return row

    def get_file_stats(path_str: str) -> Dict[str, Any]:
        try:
            p = Path(path_str)
            if base is not None and not p.is_absolute():
                p = base / p
            full_path = str(p)
            if not p.exists():
                logger.warning(f"Path does not exist: {full_path}")
                return empty_row()

            # Use the profiler; let it handle symlinks and permissions
            row = profiler.probe(full_path, compute_hash=compute_hash).as_dict()

            # Normalize keys to a stable schema used by this helper
            return {
                "size_bytes": row.get("size"),
                "modified_time": row.get("modified"),
                "created_time": row.get("created"),
                "is_file": row.get("is_file"),
                "is_dir": row.get("is_dir"),
                "owner": row.get("owner"),
                "group": row.get("group"),
                "mode_str": row.get("mode_str"),
                "inode": row.get("inode"),
                "nlink": row.get("nlink"),
                "sha256": row.get("sha256"),
                "xattrs": json.dumps(row.get("xattrs") or {}),
            }
        except Exception:
            # On any error, return a row of Nones/empties preserving schema
            return empty_row()

    stats_data = [get_file_stats(p) for p in self._df[path].to_list()]

    stats_df = pl.DataFrame(
        stats_data,
        schema={
            "size_bytes": pl.Int64,
            "modified_time": pl.String,
            "created_time": pl.String,
            "is_file": pl.Boolean,
            "is_dir": pl.Boolean,
            "owner": pl.String,
            "group": pl.String,
            "mode_str": pl.String,
            "inode": pl.Int64,
            "nlink": pl.Int64,
            "sha256": pl.String,
            "xattrs": pl.String,
        },
    )

    df_base = self._df
    if not compute_hash and "sha256" in df_base.columns:
        # Hashes were not recomputed on this run, so the fresh ``sha256``
        # column holds nothing new; keep the existing values instead of
        # replacing them.
        stats_df = stats_df.drop("sha256")

    # If columns already exist, we need to drop them before joining to avoid duplicates
    overlapping_cols = [c for c in stats_df.columns if c in df_base.columns]
    if overlapping_cols:
        df_base = df_base.drop(overlapping_cols)

    df_with_stats = pl.concat([df_base, stats_df], how="horizontal")
    if inplace:
        self._df = df_with_stats
        self.invalidate_pandas_cache()
        self.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
        return self

    res = DataFrame(df_with_stats, lineage=list(self._lineage))
    res.add_lineage_entry("add_file_stats_cols", path_col=path, compute_hash=compute_hash)
    return res

add_filename_features(path_col='path', sep='_', prefix='feat', max_tokens=None, include_parent=False, include_all_parts=False, token_names=None, enrich=False, inplace=False)

Discover filename features and add them as columns on this DataFrame.

This instance method discovers separator-based tokens from filename stems and adds columns (e.g., feat1, feat2 or token1, ...).


path_col: Column containing path strings to analyze (default: 'path').
sep: Separator used to split filename stems (default: '_').
prefix: Column name prefix for discovered tokens (default: 'feat').
max_tokens: Optional cap on extracted tokens; by default uses observed max.
include_parent: If True, add a `parent` column containing immediate parent folder name.
include_all_parts: If True, add `path_part0`, `path_part1`, ... for all Path.parts.
token_names: Optional list of token column names or 'auto' to generate readable names.
enrich: If True, automatically enrich the DataFrame with path components and file stats before discovery.
inplace: If True, perform the operation in-place and return self. Otherwise returns a new `filoma.DataFrame`.

A new or modified `filoma.DataFrame` with discovered filename features.
Source code in filoma/dataframe.py
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
def add_filename_features(
    self,
    path_col: str = "path",
    sep: str = "_",
    prefix: Optional[str] = "feat",
    max_tokens: Optional[int] = None,
    include_parent: bool = False,
    include_all_parts: bool = False,
    token_names: Optional[Union[str, Sequence[str]]] = None,
    enrich: bool = False,
    inplace: bool = False,
) -> "DataFrame":
    """Discover filename features and add them as columns on this DataFrame.

    The stem of every path is split on ``sep`` and the i-th token is stored
    in its own column (e.g., `feat1`, `feat2` or `token1`, ...). Rows with
    fewer tokens than columns get an empty string.

    Args:
    ----
        path_col: Column containing path strings to analyze (default: 'path').
        sep: Separator used to split filename stems (default: '_').
        prefix: Column name prefix for discovered tokens (default: 'feat').
            A falsy prefix falls back to 'token'.
        max_tokens: Optional cap on extracted tokens; by default uses observed max.
        include_parent: If True, add a `parent` column containing immediate parent folder name.
        include_all_parts: If True, add `path_part0`, `path_part1`, ... for all Path.parts.
        token_names: Optional list/tuple of token column names; empty or missing
            entries fall back to the prefix-based name. 'auto' (or None) uses
            prefix-based names for every token.
        enrich: If True, automatically enrich the DataFrame with path components and file stats before discovery.
        inplace: If True, perform the operation in-place and return self. Otherwise returns a new `filoma.DataFrame`.

    Returns:
    -------
        A new or modified `filoma.DataFrame` with discovered filename features.

    Raises:
    ------
        ValueError: If ``path_col`` is not a column of the DataFrame.

    """
    # Determine the base DataFrame for feature discovery
    base_df = self
    if enrich and not self.with_enrich:
        logger.info("Enriching DataFrame before discovering filename features")
        base_df = self.enrich(inplace=False)

    pl_df = base_df._df
    if path_col not in pl_df.columns:
        raise ValueError(f"DataFrame must have a '{path_col}' column")

    split_tokens = [Path(s).stem.split(sep) for s in pl_df[path_col].to_list()]
    observed_max = max((len(t) for t in split_tokens), default=0)
    eff_max = observed_max if max_tokens is None else max_tokens

    # Only list/tuple names are honoured; 'auto' and None use generated names.
    explicit_names = list(token_names) if isinstance(token_names, (list, tuple)) else []
    name_base = prefix if prefix else "token"

    new_cols = []
    for i in range(eff_max):
        if i < len(explicit_names) and explicit_names[i]:
            col_name = explicit_names[i]
        else:
            col_name = f"{name_base}{i + 1}"

        # ``idx=i`` binds the current index at definition time; a plain
        # closure would see only the final value of ``i``.
        def pick_token(s: str, idx=i):
            parts = Path(s).stem.split(sep)
            return parts[idx] if idx < len(parts) else ""

        new_cols.append(pl.col(path_col).map_elements(pick_token, return_dtype=pl.Utf8).alias(col_name))

    if include_parent:
        new_cols.append(pl.col(path_col).map_elements(lambda s: Path(s).parent.name, return_dtype=pl.Utf8).alias("parent"))

    if include_all_parts:
        parts_lists = [list(Path(s).parts) for s in pl_df[path_col].to_list()]
        max_parts = max((len(p) for p in parts_lists), default=0)
        for i in range(max_parts):
            col_name = f"path_part{i}"

            def pick_part(s: str, idx=i):
                try:
                    parts = list(Path(s).parts)
                    return parts[idx]
                except Exception:
                    return ""

            new_cols.append(pl.col(path_col).map_elements(pick_part, return_dtype=pl.Utf8).alias(col_name))

    pl_result = pl_df.with_columns(new_cols)

    # Wrap the result, carrying over the state of the frame the features
    # were derived from: when enrichment ran above, its lineage entry and
    # the ``with_enrich`` flag belong on the result as well.
    enriched_wrapper = DataFrame(pl_result, lineage=list(base_df._lineage))
    enriched_wrapper.with_enrich = base_df.with_enrich
    enriched_wrapper.with_filename_features = True
    enriched_wrapper.add_lineage_entry(
        "add_filename_features",
        sep=sep,
        prefix=prefix,
        max_tokens=max_tokens,
        include_parent=include_parent,
        token_names=token_names,
    )

    if inplace:
        self._df = enriched_wrapper._df
        self.with_filename_features = True
        self.with_enrich = enriched_wrapper.with_enrich
        self.invalidate_pandas_cache()
        self._lineage = enriched_wrapper._lineage
        return self

    return enriched_wrapper

add_lineage_entry(operation, **kwargs)

Add a lineage entry to track the history of this DataFrame.


operation: Name of the operation performed.
**kwargs: Parameters used for the operation.
Source code in filoma/dataframe.py
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
def add_lineage_entry(self, operation: str, **kwargs: Any) -> None:
    """Append one record to this DataFrame's lineage history.

    The record stores the operation name, its parameters (``Path`` values
    are stored as strings) and the current local time formatted as
    ``YYYY-MM-DD HH:MM:SS``.

    Args:
    ----
        operation: Name of the operation performed.
        **kwargs: Parameters used for the operation.

    """
    parameters = {}
    for key, val in kwargs.items():
        parameters[key] = str(val) if isinstance(val, Path) else val

    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = {"operation": operation, "parameters": parameters, "timestamp": stamp}
    self._lineage.append(entry)

add_path_components(inplace=False)

Add columns for path components (parent, name, stem, suffix).

Returns

New DataFrame with additional path component columns
Source code in filoma/dataframe.py
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
def add_path_components(self, inplace: bool = False) -> "DataFrame":
    """Add columns for path components (parent, name, stem, suffix).

    Components are derived from the ``path`` column. Columns that already
    exist are left untouched; when all four exist nothing is computed.

    Args:
    ----
        inplace: If True, modify this DataFrame in-place and return ``self``.

    Returns:
    -------
        New DataFrame with additional path component columns, or ``self``
        when ``inplace=True``.

    """
    # One extractor per output column, applied to each path string.
    extractors = {
        "parent": lambda x: str(Path(x).parent),
        "name": lambda x: Path(x).name,
        "stem": lambda x: Path(x).stem,
        "suffix": lambda x: Path(x).suffix,
    }
    existing = self._df.columns
    cols_to_add = [
        pl.col("path").map_elements(extract, return_dtype=pl.String).alias(col)
        for col, extract in extractors.items()
        if col not in existing
    ]

    if not cols_to_add:
        # Nothing to add; keep the lineage on the returned copy.
        return self if inplace else DataFrame(self._df, lineage=list(self._lineage))

    df_with_components = self._df.with_columns(cols_to_add)
    if inplace:
        self._df = df_with_components
        self.invalidate_pandas_cache()
        self.add_lineage_entry("add_path_components")
        return self

    res = DataFrame(df_with_components, lineage=list(self._lineage))
    res.add_lineage_entry("add_path_components")
    return res

describe(percentiles=None)

Generate descriptive statistics.


percentiles: List of percentiles to include (default: [0.25, 0.5, 0.75])
Source code in filoma/dataframe.py
865
866
867
868
869
870
871
872
873
874
def describe(self, percentiles: Optional[List[float]] = None) -> "DataFrame":
    """Generate descriptive statistics.

    Args:
    ----
        percentiles: List of percentiles to include. When None, the Polars
            defaults are used (documented by Polars as 0.25, 0.5, 0.75).

    Returns:
    -------
        A ``filoma.DataFrame`` wrapping the Polars summary table.

    """
    if percentiles is None:
        # Do not forward None: Polars treats an explicit None as "no
        # percentiles requested", which is not the documented default here.
        summary = self._df.describe()
    else:
        summary = self._df.describe(percentiles=percentiles)
    # Polars' describe returns a new DataFrame summarizing columns; wrap it
    return DataFrame(summary)

directory_counts()

Group files by their parent directory and count them.

Returns

Polars DataFrame with directory counts
Source code in filoma/dataframe.py
725
726
727
728
729
730
731
732
733
734
735
736
def directory_counts(self) -> "DataFrame":
    """Group files by their parent directory and count them.

    The parent directory of every ``path`` value is stored in a
    ``parent_dir`` column; rows are grouped on it and the group sizes are
    reported in a ``len`` column, largest first.

    Returns
    -------
        A ``filoma.DataFrame`` wrapping the Polars result with the columns
        ``parent_dir`` and ``len``.

    """
    # underlying `_df` is expected to be a Polars DataFrame
    parent_dir = pl.col("path").map_elements(lambda x: str(Path(x).parent), return_dtype=pl.String).alias("parent_dir")
    df_with_parent = self._df.with_columns([parent_dir])
    result = df_with_parent.group_by("parent_dir").len().sort("len", descending=True)
    return DataFrame(result)

enrich(inplace=False)

Enrich the DataFrame by adding features like path components, file stats, and depth.


inplace: If True, perform the operation in-place and return self.
         If False (default), return a new DataFrame with the changes.
Source code in filoma/dataframe.py
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
def enrich(self, inplace: bool = False):
    """Add path components, file stats and depth columns in one call.

    Args:
    ----
        inplace: If True, store the enriched frame on this object and return self.
                 If False (default), return a new DataFrame with the changes.

    """
    # Run the three enrichment steps on intermediate wrappers and keep only the
    # resulting frame; the lineage then receives one 'enrich' entry below
    # rather than the entries recorded on the intermediates.
    with_components = self.add_path_components()
    with_stats = with_components.add_file_stats_cols()
    enriched_df = with_stats.add_depth_col()._df

    if not inplace:
        # Fresh wrapper that starts from a copy of the current lineage
        res = DataFrame(enriched_df, lineage=list(self._lineage))
        res.with_enrich = True
        res.add_lineage_entry("enrich")
        return res

    # In-place: swap the frame, then drop the stale pandas cache
    self._df = enriched_df
    self.with_enrich = True
    self.invalidate_pandas_cache()
    self.add_lineage_entry("enrich")
    return self

evaluate_duplicates(path_col='path', text_threshold=0.8, image_max_distance=5, text_k=3, show_table=True, cross_dir_paths=None)

Evaluate duplicates among files in the DataFrame.

Scans the path_col column, runs exact, text and image duplicate detectors. Optionally filters to show only duplicates that cross directory boundaries (requires cross_dir_paths to define boundaries).

Source code in filoma/dataframe.py
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
def evaluate_duplicates(
    self,
    path_col: str = "path",
    text_threshold: float = 0.8,
    image_max_distance: int = 5,
    text_k: int = 3,
    show_table: bool = True,
    cross_dir_paths: Optional[List[str]] = None,
) -> dict:
    """Evaluate duplicates among files in the DataFrame.

    Scans the `path_col` column, runs exact, text and image duplicate
    detectors. Optionally filters to show only duplicates that cross
    directory boundaries (requires `cross_dir_paths` to define boundaries).

    Args:
    ----
        path_col: Name of the column holding the file paths.
        text_threshold: Forwarded to the duplicate finder as ``text_threshold``.
        image_max_distance: Forwarded to the duplicate finder as ``image_max_distance``.
        text_k: Forwarded to the duplicate finder as ``text_k``.
        show_table: If True, print a summary table to the console.
        cross_dir_paths: Directory roots defining the boundaries. When given,
            only groups whose files fall under more than one root are kept.

    Returns:
    -------
        The mapping returned by the duplicate finder. When `cross_dir_paths`
        is given, its ``exact``, ``text`` and ``image`` entries are replaced
        by the filtered group lists.

    Raises:
    ------
        ValueError: If `path_col` is not a column of the DataFrame.

    """
    if path_col not in self._df.columns:
        raise ValueError(f"Column '{path_col}' not found in DataFrame")

    categories = ("exact", "text", "image")

    # filter for files only; null entries cannot be files and would make Path() raise
    paths = [str(p) for p in self._df[path_col].to_list() if p is not None and Path(p).is_file()]
    res = _dedup.find_duplicates(
        paths,
        text_k=text_k,
        text_threshold=text_threshold,
        image_max_distance=image_max_distance,
    )

    # Filter for cross-directory duplicates if requested
    if cross_dir_paths:
        # Compare whole path components, not raw string prefixes: a root of
        # "/data/a" must not claim files that live under "/data/ab".
        roots = [(cp, Path(str(cp)).parts) for cp in cross_dir_paths]

        def _roots_touched(group):
            touched = set()
            for p in group:
                parts = Path(str(p)).parts
                for cp, cp_parts in roots:
                    if parts[: len(cp_parts)] == cp_parts:
                        touched.add(cp)
            return touched

        for category in categories:
            res[category] = [group for group in res.get(category, []) if len(_roots_touched(group)) > 1]

    # Summarize counts once; both the table and the log line reuse them
    groups = {category: res.get(category, []) for category in categories}
    totals = {category: sum(len(g) for g in groups[category]) for category in categories}

    if show_table:
        console = Console()
        table = Table(title="Duplicate Summary (Cross-Dir)" if cross_dir_paths else "Duplicate Summary")
        table.add_column("Type", style="bold cyan")
        table.add_column("Groups", style="white")
        table.add_column("Files In Groups", style="white")
        for category in categories:
            table.add_row(category, str(len(groups[category])), str(totals[category]))
        console.print(table)

    logger.info(
        f"Duplicate summary: exact={len(groups['exact'])} groups "
        f"({totals['exact']} files), "
        f"text={len(groups['text'])} groups "
        f"({totals['text']} files), "
        f"image={len(groups['image'])} groups "
        f"({totals['image']} files)"
    )

    return res

extension_counts()

Group files by extension and count them.

Returns

filoma DataFrame (wrapping a Polars DataFrame) with extension counts
Source code in filoma/dataframe.py
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
def extension_counts(self) -> "DataFrame":
    """Group files by extension and count them.

    Extensions are lower-cased; paths without a suffix are grouped under
    ``"<no extension>"``.

    Returns
    -------
        filoma DataFrame with an ``extension`` column and a ``len`` column
        holding the number of rows per extension, sorted by ``len`` descending

    """

    def _extension_of(path_value):
        # An empty suffix is falsy, so it falls through to the placeholder label.
        return Path(path_value).suffix.lower() or "<no extension>"

    # underlying `_df` is expected to be a Polars DataFrame
    extension_expr = pl.col("path").map_elements(_extension_of, return_dtype=pl.String).alias("extension")
    counts = self._df.with_columns([extension_expr]).group_by("extension").len().sort("len", descending=True)
    # The Polars result is wrapped, so callers get a filoma DataFrame back.
    return DataFrame(counts)

filter_by_extension(extensions)

Filter the DataFrame to only include files with specific extensions.


extensions: File extension(s) to filter by (with or without leading dot)

Filtered DataFrame
Source code in filoma/dataframe.py
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
def filter_by_extension(self, extensions: Union[str, List[str]]) -> "DataFrame":
    """Keep only the rows whose path suffix is one of the given extensions.

    Args:
    ----
        extensions: One extension or a list of them; a leading dot is optional
            and matching is case-insensitive

    Returns:
    -------
        New DataFrame with the matching rows and a ``filter_by_extension``
        lineage entry

    """
    # A single string is treated as a one-element list
    if isinstance(extensions, str):
        extensions = [extensions]

    # Bring every extension to the ".ext" lower-case form used by Path.suffix
    wanted = [(ext if ext.startswith(".") else "." + ext).lower() for ext in extensions]

    def _has_wanted_suffix(x):
        return Path(x).suffix.lower() in wanted

    keep = pl.col("path").map_elements(_has_wanted_suffix, return_dtype=pl.Boolean)
    res = DataFrame(self._df.filter(keep), lineage=list(self._lineage))
    res.add_lineage_entry("filter_by_extension", extensions=extensions)
    return res

filter_by_pattern(pattern)

Filter the DataFrame by path pattern.


pattern: Pattern to match (uses Polars string contains)

Filtered DataFrame
Source code in filoma/dataframe.py
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
def filter_by_pattern(self, pattern: str) -> "DataFrame":
    """Keep only the rows whose ``path`` contains the given pattern.

    Args:
    ----
        pattern: Passed unchanged to Polars ``str.contains`` on the ``path`` column.
            NOTE(review): Polars appears to treat this as a regular expression
            by default - confirm that literal matching is not expected.

    Returns:
    -------
        New DataFrame with the matching rows and a ``filter_by_pattern``
        lineage entry

    """
    matches = pl.col("path").str.contains(pattern)
    res = DataFrame(self._df.filter(matches), lineage=list(self._lineage))
    res.add_lineage_entry("filter_by_pattern", pattern=pattern)
    return res

from_pandas(df) classmethod

Construct a filoma.DataFrame from a pandas DataFrame.

This is a convenience wrapper that converts the pandas DataFrame into a Polars DataFrame and wraps it. Requires pandas to be installed.

Source code in filoma/dataframe.py
801
802
803
804
805
806
807
808
809
810
811
812
@classmethod
def from_pandas(cls, df: Any) -> "DataFrame":
    """Build a filoma.DataFrame from a pandas DataFrame.

    The pandas frame is converted with ``pl.from_pandas`` and the resulting
    Polars frame is wrapped. Raises ``RuntimeError`` when the module-level
    ``pd`` is None, i.e. pandas is not available.
    """
    if pd is not None:
        # Convert via Polars for internal consistency
        return cls(pl.from_pandas(df))
    raise RuntimeError("pandas is not available in this environment")

head(n=5)

Get the first n rows.

Source code in filoma/dataframe.py
397
398
399
400
401
def head(self, n: int = 5) -> "DataFrame":
    """Return a new DataFrame holding the first ``n`` rows.

    The result starts from a copy of this object's lineage and gains a
    ``head`` entry.
    """
    first_rows = self._df.head(n)
    wrapped = DataFrame(first_rows, lineage=list(self._lineage))
    wrapped.add_lineage_entry("head", n=n)
    return wrapped

info()

Print concise summary of the DataFrame.

Source code in filoma/dataframe.py
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
def info(self) -> None:
    """Print a short report: shape, column count, per-column details, memory estimate."""
    print("filoma.DataFrame")
    print(f"Shape: {self.shape}")
    print(f"Columns: {len(self.columns)}")
    print()

    # One line per column: position, name, dtype and number of nulls
    print("Column details:")
    position = 0
    for col, dtype in zip(self.columns, self.dtypes):
        null_count = self._df[col].null_count()
        print(f"  {position:2d}  {col:15s} {str(dtype):15s} {null_count:8d} nulls")
        position += 1

    # Memory estimate: add up the per-column size estimates (in MB)
    memory_mb = 0
    for col in self.columns:
        memory_mb += self._df[col].estimated_size("mb")
    print(f"\nEstimated memory usage: {memory_mb:.2f} MB")

invalidate_pandas_cache()

Clear the cached pandas conversion created by to_pandas().

Call this after mutating the underlying Polars DataFrame to ensure subsequent pandas accesses reflect the latest data.

Source code in filoma/dataframe.py
313
314
315
316
317
318
319
def invalidate_pandas_cache(self) -> None:
    """Drop the pandas frame cached by `to_pandas()`.

    Resets ``_pd_cache`` to None. Use it whenever the underlying Polars
    DataFrame has been replaced or mutated, so that later pandas conversions
    are rebuilt from the current data.
    """
    self._pd_cache = None

save_csv(path)

Save the DataFrame to CSV.

Source code in filoma/dataframe.py
818
819
820
def save_csv(self, path: Union[str, Path]) -> None:
    """Write the underlying frame to ``path`` as CSV.

    ``path`` is converted with ``str()`` before being handed to ``write_csv``.
    """
    destination = str(path)
    self._df.write_csv(destination)

save_parquet(path)

Save the DataFrame to Parquet format.

Source code in filoma/dataframe.py
822
823
824
def save_parquet(self, path: Union[str, Path]) -> None:
    """Write the underlying frame to ``path`` in Parquet format.

    ``path`` is converted with ``str()`` before being handed to ``write_parquet``.
    """
    destination = str(path)
    self._df.write_parquet(destination)

sort(by, descending=False)

Sort the DataFrame.


by: Column name(s) to sort by
descending: Sort in descending order
Source code in filoma/dataframe.py
909
910
911
912
913
914
915
916
917
918
919
920
921
def sort(self, by: Union[str, List[str]], descending: bool = False) -> "DataFrame":
    """Return a new DataFrame sorted by the given column(s).

    Args:
    ----
        by: Column name(s) to sort by
        descending: Sort in descending order

    The result starts from a copy of this object's lineage and gains a
    ``sort`` entry recording both arguments.

    """
    ordered = self._df.sort(by, descending=descending)
    wrapped = DataFrame(ordered, lineage=list(self._lineage))
    wrapped.add_lineage_entry("sort", by=by, descending=descending)
    return wrapped

tail(n=5)

Get the last n rows.

Source code in filoma/dataframe.py
403
404
405
406
407
def tail(self, n: int = 5) -> "DataFrame":
    """Return a new DataFrame holding the last ``n`` rows.

    The result starts from a copy of this object's lineage and gains a
    ``tail`` entry.
    """
    last_rows = self._df.tail(n)
    wrapped = DataFrame(last_rows, lineage=list(self._lineage))
    wrapped.add_lineage_entry("tail", n=n)
    return wrapped

to_dict()

Convert to a dictionary.

Source code in filoma/dataframe.py
814
815
816
def to_dict(self) -> Dict[str, List]:
    """Return the frame's data as a dictionary.

    Delegates to the underlying frame's ``to_dict`` with ``as_series=False``.
    """
    plain = self._df.to_dict(as_series=False)
    return plain

to_pandas(force=False)

Convert to a pandas DataFrame.

By default this method will return a cached pandas conversion if one exists (for performance). Set force=True to reconvert from the current Polars DataFrame and update the cache.

Source code in filoma/dataframe.py
742
743
744
745
746
747
748
749
750
751
752
753
754
755
def to_pandas(self, force: bool = False) -> Any:
    """Return the data as a pandas DataFrame, converting at most once.

    The first call converts the Polars frame and stores the result in
    ``_pd_cache``; later calls return that cached object. Pass ``force=True``
    to convert again from the current Polars frame and replace the cache.
    Raises ``ImportError`` when the module-level ``pd`` is None.
    """
    if pd is None:
        raise ImportError("pandas is not installed. Please install it to use to_pandas().")
    needs_conversion = force or self._pd_cache is None
    if needs_conversion:
        # Use Polars' to_pandas conversion for consistency
        self._pd_cache = self._df.to_pandas()
    return self._pd_cache

to_polars()

Get the underlying Polars DataFrame.

Source code in filoma/dataframe.py
738
739
740
def to_polars(self) -> pl.DataFrame:
    """Return the Polars DataFrame held by this wrapper.

    The stored object itself is returned, not a copy.
    """
    return self._df

unique(subset=None)

Get unique rows.


subset: Column name(s) to consider for uniqueness
Source code in filoma/dataframe.py
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
def unique(self, subset: Optional[Union[str, List[str]]] = None) -> "DataFrame":
    """Return a new DataFrame with duplicate rows removed.

    Args:
    ----
        subset: Column name(s) to consider for uniqueness; when None the
            underlying ``unique()`` is called without arguments

    The result starts from a copy of this object's lineage and gains a
    ``unique`` entry recording ``subset``.

    """
    # Only pass `subset` through when the caller provided one
    unique_kwargs = {} if subset is None else {"subset": subset}
    deduplicated = self._df.unique(**unique_kwargs)
    wrapped = DataFrame(deduplicated, lineage=list(self._lineage))
    wrapped.add_lineage_entry("unique", subset=subset)
    return wrapped

handler: python

Directory profiler

The directory profiling API and configuration helpers.

Directory profiling utilities.

This module provides :class:DirectoryProfiler which analyzes directory trees and returns a :class:DirectoryAnalysis dataclass with summary statistics and optional DataFrame support.

DirectoryAnalysis dataclass

Bases: Mapping

Structured container for directory analysis results.

This is the canonical, dataclass-first return value for directory probes. Use :meth:to_dict to convert to a plain dict and :meth:to_df to access the optional DataFrame. The class exists to provide a typed, ergonomic API for programmatic consumption.

Source code in filoma/directories/directory_profiler.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
@dataclass
class DirectoryAnalysis(Mapping):
    """Structured container for directory analysis results.

    This is the canonical, dataclass-first return value for directory probes.
    Use :meth:`to_dict` to convert to a plain dict and :meth:`to_df`
    to access the optional DataFrame. The class exists to provide a typed,
    ergonomic API for programmatic consumption.

    Attributes that are not found on the instance are looked up on a
    ``pathlib.Path`` built from ``path``, and the Mapping protocol is served
    from :meth:`to_dict`.
    """

    path: str
    summary: Dict
    file_extensions: Dict
    common_folder_names: Dict
    empty_folders: List[str]
    top_folders_by_file_count: List
    depth_distribution: Dict
    dataframe: Optional["DataFrame"] = None
    timing: Optional[Dict] = None
    dataframe_note: Optional[str] = None
    # Path view of `path`; filled in by __post_init__, never passed by callers
    _path_obj: Path = field(init=False, repr=False)

    def __post_init__(self):
        """Initialize the path object."""
        self._path_obj = Path(self.path)

    def __getattr__(self, name: str) -> Any:
        """Delegate attribute access to the path object.

        Only called when normal lookup fails. Raises ``AttributeError`` when
        the path object has not been set yet (for example on an instance made
        by ``copy`` or ``pickle`` before its state is restored).
        """
        # Read the instance dict directly: going through `self._path_obj`
        # would re-enter __getattr__ and recurse until RecursionError when the
        # attribute is missing.
        try:
            target = self.__dict__["_path_obj"]
        except KeyError:
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None
        return getattr(target, name)

    @property
    def path_obj(self) -> Path:
        """Return the path object."""
        return self._path_obj

    @classmethod
    def from_dict(cls, d: Dict) -> "DirectoryAnalysis":
        """Create a :class:`DirectoryAnalysis` from a plain dict.

        Parameters
        ----------
        d : dict
            Dictionary in the shape produced by :meth:`DirectoryProfiler.probe`.
            Missing keys fall back to empty containers (or None for the
            optional fields); a missing or falsy ``path`` becomes ``""``.

        Returns
        -------
        DirectoryAnalysis
            Constructed dataclass instance.

        """
        return cls(
            path=d.get("path") or "",
            summary=d.get("summary", {}),
            file_extensions=d.get("file_extensions", {}),
            common_folder_names=d.get("common_folder_names", {}),
            empty_folders=d.get("empty_folders", []),
            top_folders_by_file_count=d.get("top_folders_by_file_count", []),
            depth_distribution=d.get("depth_distribution", {}),
            dataframe=d.get("dataframe"),
            timing=d.get("timing"),
            dataframe_note=d.get("dataframe_note"),
        )

    def to_dict(self) -> Dict:
        """Return a plain ``dict`` representation of this analysis.

        The optional ``dataframe``, ``timing`` and ``dataframe_note`` keys are
        included only when their value is not None.
        """
        # Convert to a plain dict shape
        d = {
            "path": self.path,
            "summary": self.summary,
            "file_extensions": self.file_extensions,
            "common_folder_names": self.common_folder_names,
            "empty_folders": self.empty_folders,
            "top_folders_by_file_count": self.top_folders_by_file_count,
            "depth_distribution": self.depth_distribution,
        }
        if self.dataframe is not None:
            d["dataframe"] = self.dataframe
        if self.timing is not None:
            d["timing"] = self.timing
        if self.dataframe_note is not None:
            d["dataframe_note"] = self.dataframe_note
        return d

    def to_df(self) -> Optional["DataFrame"]:
        """Return the attached DataFrame wrapper or log a helpful warning when absent.

        This method used to silently return None when no DataFrame was built which
        often confused interactive users calling ``analysis.to_df()``. We now log a
        warning explaining the likely causes (DataFrame building disabled or polars
        not installed) to surface actionable next steps.
        """
        if self.dataframe is None:
            # Emit a helpful, actionable warning rather than silently returning None
            logger.warning(
                "No DataFrame available for analysis at path {path!s}. "
                "DataFrame building is disabled by default or 'polars' is not installed. "
                "Call DirectoryProfiler(build_dataframe=True) or use filoma.probe_to_df(...) to obtain a DataFrame.",
                path=self.path,
            )
        return self.dataframe

    def as_dict(self) -> Dict:
        """Alias for :meth:`to_dict`.

        Provided for backward compatibility with dict-based APIs.
        """
        return self.to_dict()

    # Convenience printing helpers so callers can write `analysis.print_summary()`
    # or `analysis.print_report()` without importing DirectoryProfiler. These
    # delegate to the existing DirectoryProfiler rich printers for consistency.
    def print_summary(self, profiler: "DirectoryProfiler | None" = None):
        """Pretty-print a short summary using the rich-based DirectoryProfiler printer.

        If `profiler` is provided it will be used (useful to customize show_progress,
        console, or other profiler settings); otherwise a default profiler is created.
        """
        # Fall back to a profiler built from the default configuration
        if profiler is None:
            profiler = DirectoryProfiler(DirectoryProfilerConfig())
        profiler.print_summary(self)

    def print_report(self, profiler: "DirectoryProfiler | None" = None):
        """Pretty-print the full report (summary + extras) via DirectoryProfiler.

        This is an alias for `print_summary` + additional report sections; kept
        as a separate method name for discoverability and symmetry with other
        profilers in the project.
        """
        # Fall back to a profiler built from the default configuration
        if profiler is None:
            profiler = DirectoryProfiler(DirectoryProfilerConfig())
        profiler.print_report(self)

    # Mapping protocol implementations so callers can still use dict-like access
    # (e.g., result['summary']) even though the canonical return type is a dataclass.
    def _as_dict(self) -> Dict:
        return self.to_dict()

    def __getitem__(self, key):
        """Mapping-style access to analysis fields by key."""
        return self._as_dict()[key]

    def __iter__(self):
        """Iterate over analysis mapping keys."""
        return iter(self._as_dict())

    def __len__(self):
        """Return number of top-level fields in the analysis mapping."""
        return len(self._as_dict())

path_obj property

Return the path object.

__getattr__(name)

Delegate attribute access to the path object.

Source code in filoma/directories/directory_profiler.py
172
173
174
def __getattr__(self, name: str) -> Any:
    """Delegate attribute access to the path object.

    Only called when normal lookup fails. Raises ``AttributeError`` when the
    path object has not been set yet (for example on an instance made by
    ``copy`` or ``pickle`` before its state is restored).
    """
    # Read the instance dict directly: going through `self._path_obj` would
    # re-enter __getattr__ and recurse until RecursionError when it is missing.
    try:
        target = self.__dict__["_path_obj"]
    except KeyError:
        raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}") from None
    return getattr(target, name)

__getitem__(key)

Mapping-style access to analysis fields by key.

Source code in filoma/directories/directory_profiler.py
284
285
286
def __getitem__(self, key):
    """Look up one analysis field by key, as on a dict.

    The lookup runs against the mapping returned by ``_as_dict()``.
    """
    fields = self._as_dict()
    return fields[key]

__iter__()

Iterate over analysis mapping keys.

Source code in filoma/directories/directory_profiler.py
288
289
290
def __iter__(self):
    """Return an iterator over the keys of the ``_as_dict()`` mapping."""
    fields = self._as_dict()
    return iter(fields)

__len__()

Return number of top-level fields in the analysis mapping.

Source code in filoma/directories/directory_profiler.py
292
293
294
def __len__(self):
    """Return how many keys the ``_as_dict()`` mapping currently has."""
    fields = self._as_dict()
    return len(fields)

__post_init__()

Initialize the path object.

Source code in filoma/directories/directory_profiler.py
168
169
170
def __post_init__(self):
    """Build the ``Path`` view of ``path`` and store it as ``_path_obj``."""
    location = self.path
    self._path_obj = Path(location)

as_dict()

Alias for :meth:to_dict.

Provided for backward compatibility with dict-based APIs.

Source code in filoma/directories/directory_profiler.py
247
248
249
250
251
252
def as_dict(self) -> Dict:
    """Return the same mapping as :meth:`to_dict`.

    Kept as an alternative name for backward compatibility with dict-based APIs.
    """
    plain = self.to_dict()
    return plain

from_dict(d) classmethod

Create a :class:DirectoryAnalysis from a plain dict.

Parameters

d : dict Dictionary in the shape produced by :meth:DirectoryProfiler.probe.

Returns

DirectoryAnalysis Constructed dataclass instance.

Source code in filoma/directories/directory_profiler.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
@classmethod
def from_dict(cls, d: Dict) -> "DirectoryAnalysis":
    """Build a :class:`DirectoryAnalysis` from a plain dict.

    Parameters
    ----------
    d : dict
        Dictionary in the shape produced by :meth:`DirectoryProfiler.probe`.
        Missing keys fall back to an empty dict/list (or None for the optional
        fields); a missing or falsy ``path`` becomes ``""``.

    Returns
    -------
    DirectoryAnalysis
        Constructed dataclass instance.

    """
    kwargs = {"path": d.get("path") or ""}
    # Each fallback container is created fresh so instances never share one
    for key in ("summary", "file_extensions", "common_folder_names", "depth_distribution"):
        kwargs[key] = d.get(key, {})
    for key in ("empty_folders", "top_folders_by_file_count"):
        kwargs[key] = d.get(key, [])
    for key in ("dataframe", "timing", "dataframe_note"):
        kwargs[key] = d.get(key)
    return cls(**kwargs)

print_report(profiler=None)

Pretty-print the full report (summary + extras) via DirectoryProfiler.

This is an alias for print_summary + additional report sections; kept as a separate method name for discoverability and symmetry with other profilers in the project.

Source code in filoma/directories/directory_profiler.py
268
269
270
271
272
273
274
275
276
277
def print_report(self, profiler: "DirectoryProfiler | None" = None):
    """Print the full report for this analysis through a DirectoryProfiler.

    Uses `profiler` when one is passed; otherwise a profiler is created from
    a default ``DirectoryProfilerConfig``. The printing itself is done by the
    profiler's ``print_report``.
    """
    active = profiler if profiler is not None else DirectoryProfiler(DirectoryProfilerConfig())
    active.print_report(self)

print_summary(profiler=None)

Pretty-print a short summary using the rich-based DirectoryProfiler printer.

If profiler is provided it will be used (useful to customize show_progress, console, or other profiler settings); otherwise a default profiler is created.

Source code in filoma/directories/directory_profiler.py
257
258
259
260
261
262
263
264
265
266
def print_summary(self, profiler: "DirectoryProfiler | None" = None):
    """Print a short summary for this analysis through a DirectoryProfiler.

    Uses `profiler` when one is passed (useful to customize show_progress,
    console, or other profiler settings); otherwise a profiler is created from
    a default ``DirectoryProfilerConfig``.
    """
    active = profiler if profiler is not None else DirectoryProfiler(DirectoryProfilerConfig())
    active.print_summary(self)

to_df()

Return the attached DataFrame wrapper or log a helpful warning when absent.

This method used to silently return None when no DataFrame was built which often confused interactive users calling analysis.to_df(). We now log a warning explaining the likely causes (DataFrame building disabled or polars not installed) to surface actionable next steps.

Source code in filoma/directories/directory_profiler.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def to_df(self) -> Optional["DataFrame"]:
    """Return the attached DataFrame wrapper, warning when there is none.

    When ``dataframe`` is None a warning is logged that names the likely
    causes (DataFrame building disabled or polars not installed) and how to
    obtain a DataFrame, and None is returned.
    """
    attached = self.dataframe
    if attached is not None:
        return attached

    # Nothing attached: tell the user why and what to do next
    logger.warning(
        "No DataFrame available for analysis at path {path!s}. "
        "DataFrame building is disabled by default or 'polars' is not installed. "
        "Call DirectoryProfiler(build_dataframe=True) or use filoma.probe_to_df(...) to obtain a DataFrame.",
        path=self.path,
    )
    return attached

to_dict()

Return a plain dict representation of this analysis.

Source code in filoma/directories/directory_profiler.py
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def to_dict(self) -> Dict:
    """Return this analysis as a plain ``dict``.

    The seven core fields are always present. ``dataframe``, ``timing`` and
    ``dataframe_note`` are added, in that order, only when they are not None.
    """
    always = (
        "path",
        "summary",
        "file_extensions",
        "common_folder_names",
        "empty_folders",
        "top_folders_by_file_count",
        "depth_distribution",
    )
    d = {key: getattr(self, key) for key in always}
    for key in ("dataframe", "timing", "dataframe_note"):
        value = getattr(self, key)
        if value is not None:
            d[key] = value
    return d

DirectoryProfiler

Analyzes directory structures for basic statistics and patterns.

Provides file counts, folder patterns, empty directories, and extension analysis.

Can use either a pure Python implementation or a faster Rust implementation when available. Supports both sequential and parallel Rust processing.

Source code in filoma/directories/directory_profiler.py
 297
 298
 299
 300
 301
 302
 303
 304
 305
 306
 307
 308
 309
 310
 311
 312
 313
 314
 315
 316
 317
 318
 319
 320
 321
 322
 323
 324
 325
 326
 327
 328
 329
 330
 331
 332
 333
 334
 335
 336
 337
 338
 339
 340
 341
 342
 343
 344
 345
 346
 347
 348
 349
 350
 351
 352
 353
 354
 355
 356
 357
 358
 359
 360
 361
 362
 363
 364
 365
 366
 367
 368
 369
 370
 371
 372
 373
 374
 375
 376
 377
 378
 379
 380
 381
 382
 383
 384
 385
 386
 387
 388
 389
 390
 391
 392
 393
 394
 395
 396
 397
 398
 399
 400
 401
 402
 403
 404
 405
 406
 407
 408
 409
 410
 411
 412
 413
 414
 415
 416
 417
 418
 419
 420
 421
 422
 423
 424
 425
 426
 427
 428
 429
 430
 431
 432
 433
 434
 435
 436
 437
 438
 439
 440
 441
 442
 443
 444
 445
 446
 447
 448
 449
 450
 451
 452
 453
 454
 455
 456
 457
 458
 459
 460
 461
 462
 463
 464
 465
 466
 467
 468
 469
 470
 471
 472
 473
 474
 475
 476
 477
 478
 479
 480
 481
 482
 483
 484
 485
 486
 487
 488
 489
 490
 491
 492
 493
 494
 495
 496
 497
 498
 499
 500
 501
 502
 503
 504
 505
 506
 507
 508
 509
 510
 511
 512
 513
 514
 515
 516
 517
 518
 519
 520
 521
 522
 523
 524
 525
 526
 527
 528
 529
 530
 531
 532
 533
 534
 535
 536
 537
 538
 539
 540
 541
 542
 543
 544
 545
 546
 547
 548
 549
 550
 551
 552
 553
 554
 555
 556
 557
 558
 559
 560
 561
 562
 563
 564
 565
 566
 567
 568
 569
 570
 571
 572
 573
 574
 575
 576
 577
 578
 579
 580
 581
 582
 583
 584
 585
 586
 587
 588
 589
 590
 591
 592
 593
 594
 595
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
class DirectoryProfiler:
    """Analyzes directory structures for basic statistics and patterns.

    Provides file counts, folder patterns, empty directories, and extension analysis.

    Can use either a pure Python implementation or a faster Rust implementation
    when available. Supports both sequential and parallel Rust processing.

    """

    def __init__(self, config: "DirectoryProfilerConfig"):
        """Build a profiler from a `DirectoryProfilerConfig`.

        The single `config` argument carries every option: backend selection
        (Rust / fd / Python), parallel and async toggles, the parallelism
        threshold, DataFrame building, progress reporting and the network
        tuning values. See `DirectoryProfilerConfig` for the meaning of each
        field.

        Raises a ``TypeError`` when `config` is not a `DirectoryProfilerConfig`,
        a ``RuntimeError`` when a requested feature is unavailable in this
        build/environment, and a ``ValueError`` when an option is set that
        cannot take effect with the other options.
        """
        # Only a DirectoryProfilerConfig is accepted; legacy kwargs are not supported.
        # The check is by class name rather than isinstance.
        looks_like_config = hasattr(config, "__class__") and config.__class__.__name__ == "DirectoryProfilerConfig"
        if not looks_like_config:
            raise TypeError("DirectoryProfiler requires a DirectoryProfilerConfig instance as the sole argument")

        self.console = Console()
        self.config = config

        # Mirror frequently used settings as instance attributes so the
        # historical attribute names keep working.
        self.search_backend = config.search_backend
        self.parallel_threshold = config.parallel_threshold
        self._fast_path_only = config.fast_path_only
        self.progress_callback = config.progress_callback

        # Every explicitly requested feature must be available; the first
        # failing entry (in this order) is the one reported.
        feature_checks = (
            (config.use_rust, RUST_AVAILABLE, "Rust implementation requested but not available in this build"),
            (config.use_parallel, RUST_PARALLEL_AVAILABLE, "Parallel Rust requested but not available"),
            (config.use_async, RUST_ASYNC_AVAILABLE, "Async Rust prober requested but not available in this build"),
            (config.use_fd, FD_AVAILABLE, "fd integration requested but not available in this environment"),
            (
                config.build_dataframe,
                DATAFRAME_AVAILABLE,
                "DataFrame building requested but Polars/DataFrame support is not available",
            ),
        )
        for requested, available, message in feature_checks:
            if requested and not available:
                raise RuntimeError(message)

        # Network tuning values are only meaningful together with use_async.
        # Values equal to the defaults (192 / 20000 / 0) are treated as "not set".
        network_settings = (config.network_concurrency, config.network_timeout_ms, config.network_retries)
        if network_settings != (192, 20000, 0) and not config.use_async:
            raise ValueError("Network tuning parameters only apply when use_async=True")

        # 'threads' is only forwarded to fd, so reject it for any other backend.
        if config.threads is not None and not (config.use_fd or config.search_backend == "fd"):
            raise ValueError("'threads' setting only applies when use_fd=True or search_backend='fd'")

        # Resolve the "auto" backend into a concrete one.
        backend_choice = config.search_backend
        if backend_choice == "auto":
            fd_requested = config.use_fd and FD_AVAILABLE
            rust_requested = config.use_rust and RUST_AVAILABLE
            if fd_requested:
                # Explicit fd request wins, including when Rust was requested too.
                backend_choice = "fd"
            elif rust_requested:
                backend_choice = "rust"
            elif config.fast_path_only:
                # No explicit preference: pure file discovery prefers python/os.walk.
                backend_choice = "python"
            elif RUST_AVAILABLE:
                backend_choice = "rust"
            elif FD_AVAILABLE:
                backend_choice = "fd"
            else:
                backend_choice = "python"

        # Exactly one of the two accelerated backends (or neither) is active.
        self.use_rust = backend_choice == "rust"
        self.use_fd = backend_choice == "fd"

        # Parallel and async modes only take effect on the Rust backend.
        self.use_parallel = bool(config.use_parallel and self.use_rust)
        self.use_async = bool(config.use_async and self.use_rust)

        self.build_dataframe = bool(config.build_dataframe)
        self.return_absolute_paths = bool(config.return_absolute_paths)

        # Progress bars are switched off in interactive environments to avoid conflicts.
        interactive = _is_interactive_environment()
        if interactive and config.show_progress:
            logger.debug("Interactive environment detected, disabling progress bars to avoid conflicts")
            self.show_progress = False
        else:
            self.show_progress = bool(config.show_progress)

        # Network tuning values (validated above to require use_async when customised).
        self.network_concurrency = config.network_concurrency
        self.network_timeout_ms = config.network_timeout_ms
        self.network_retries = config.network_retries

        # The thread count is kept only when the fd backend is the active one.
        self.threads = config.threads if self.use_fd else None

        # The fd integration object is created lazily on first use.
        self.fd_integration = None

    def is_rust_available(self) -> bool:
        """Check if Rust implementation is available and being used.

        Returns
        -------
            True if Rust implementation is available and enabled, False otherwise

        """
        return self.use_rust and RUST_AVAILABLE

    def is_parallel_available(self) -> bool:
        """Check if parallel Rust implementation is available and being used.

        Returns
        -------
            True if parallel Rust implementation is available and enabled, False otherwise

        """
        return self.use_parallel and RUST_PARALLEL_AVAILABLE

    def is_fd_available(self) -> bool:
        """Check if fd integration is available and being used.

        Returns
        -------
            True if fd is available and enabled, False otherwise

        """
        # Use FD_AVAILABLE to reflect whether the fd integration package is importable
        # Tests may monkeypatch FD_AVAILABLE without having the fd binary present.
        return self.use_fd and FD_AVAILABLE

    def get_implementation_info(self) -> dict:
        """Describe which implementations exist and which ones this instance uses.

        Returns
        -------
            Dictionary with implementation availability status

        """
        # What this build/environment offers, independent of the instance.
        availability = {
            "rust_available": RUST_AVAILABLE,
            "rust_parallel_available": RUST_PARALLEL_AVAILABLE,
            "rust_async_available": RUST_ASYNC_AVAILABLE,
            "fd_available": FD_AVAILABLE,
            "dataframe_available": DATAFRAME_AVAILABLE,
        }
        # What this particular profiler resolved to at construction time.
        accelerated = self.use_rust or self.use_fd
        selection = {
            "using_rust": self.use_rust,
            "using_parallel": self.use_parallel,
            "using_async": bool(self.use_async and RUST_ASYNC_AVAILABLE),
            "using_fd": self.use_fd,
            "using_dataframe": self.build_dataframe,
            "return_absolute_paths": self.return_absolute_paths,
            "search_backend": self.search_backend,
            "python_fallback": not accelerated,
        }
        return {**availability, **selection}

    def probe(self, path: str, max_depth: Optional[int] = None, threads: Optional[int] = None) -> "DirectoryAnalysis":
        """Analyze a directory tree and return comprehensive statistics.

        The backend is picked by `_choose_backend`, the matching `_probe_*`
        method is run, and a ``"timing"`` entry (elapsed seconds, implementation
        label, items per second) is added to the result before it is converted
        with `DirectoryAnalysis.from_dict`.

        Args:
        ----
            path: Path to the root directory to probe
            max_depth: Maximum depth to traverse (None for unlimited)
            threads: Optional override for number of threads when using fd backend

        Returns:
        -------
            A :class:`DirectoryAnalysis` instance containing analysis results

        Raises:
        ------
            Exception: any error raised by the selected backend is logged
                together with the elapsed time and then re-raised unchanged.

        """
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # (or turn negative) when the system wall clock is adjusted mid-run.
        start_time = time.perf_counter()

        # Choose the best backend
        backend = self._choose_backend()

        # Log the start of analysis
        impl_type = self._get_impl_display_name(backend)
        logger.info(f"Starting directory analysis of '{path}' using {impl_type} implementation")

        try:
            if backend == "fd":
                # threads param overrides instance threads when provided
                chosen_threads = threads if threads is not None else self.threads
                result = self._probe_fd(path, max_depth, threads=chosen_threads)
            elif backend == "rust":
                result = self._probe_rust(path, max_depth, fast_path_only=self._fast_path_only)
            else:
                result = self._probe_python(path, max_depth)

            # Calculate and log timing
            elapsed_time = time.perf_counter() - start_time
            summary = result["summary"]
            total_items = summary["total_files"] + summary["total_folders"]

            logger.success(
                f"Directory analysis completed in {elapsed_time:.2f}s - "
                f"Found {total_items:,} items ({summary['total_files']:,} files, "
                f"{summary['total_folders']:,} folders) using {impl_type}"
            )

            # Add timing information to result
            result["timing"] = {
                "elapsed_seconds": elapsed_time,
                "implementation": impl_type,
                # Guard against a zero duration on very small trees.
                "items_per_second": (total_items / elapsed_time if elapsed_time > 0 else 0),
            }

            # Return a structured dataclass by default for easier programmatic use
            return DirectoryAnalysis.from_dict(result)

        except Exception as e:
            elapsed_time = time.perf_counter() - start_time
            logger.error(f"Directory analysis failed after {elapsed_time:.2f}s: {e}")
            raise

    def _choose_backend(self) -> str:
        """Choose the best available backend based on settings and availability.

        Returns
        -------
            Backend name: "fd", "rust", or "python"

        """
        # If search_backend is 'auto' and neither rust nor fd are requested
        # by the resolved preferences, prefer the Python backend. This avoids
        # forcing Python when the user specifically preferred fd.
        if self.search_backend == "auto" and not (self.use_rust or self.use_fd):
            return "python"

        if self.search_backend == "fd":
            if self.use_fd and FD_AVAILABLE:
                return "fd"
            else:
                logger.warning("fd backend requested but not available, falling back to auto selection")

        elif self.search_backend == "rust":
            if self.use_rust:
                return "rust"
            else:
                logger.warning("Rust backend requested but not available, falling back to auto selection")

        elif self.search_backend == "python":
            return "python"

        # Auto selection logic
        if self.search_backend == "auto":
            # Based on cold cache benchmarks Rust tends to be the fastest
            # general-purpose backend. Prefer Rust when available; fall back
            # to fd when Rust is not enabled/available but fd is explicitly
            # enabled by the user.
            if self.use_rust and RUST_AVAILABLE:
                return "rust"
            elif self.use_fd and FD_AVAILABLE:
                return "fd"
            else:
                return "python"

        # Fallback to python if nothing else works
        return "python"

    def _get_impl_display_name(self, backend: str) -> str:
        """Get display name for implementation type."""
        if backend == "fd":
            return "🔍 fd"
        elif backend == "rust":
            if self.use_parallel and RUST_PARALLEL_AVAILABLE:
                return "🦀 Rust (Parallel)"
            else:
                return "🦀 Rust (Sequential)"
        else:
            return "🐍 Python"

    def _probe_fd(self, path: str, max_depth: Optional[int] = None, threads: Optional[int] = None) -> Dict:
        """Use fd for file discovery + Python for analysis.

        This hybrid approach leverages fd's ultra-fast file discovery
        while using Python for statistical analysis to maintain
        consistency with other backends.
        """
        # Lazily initialize fd integration here. This ensures tests that
        # monkeypatch FD_AVAILABLE can control availability without the
        # constructor eagerly probing the environment.
        if self.fd_integration is None:
            # If the fd integration package wasn't importable at module
            # import time, reflect that now.
            if not FD_AVAILABLE:
                raise RuntimeError("fd integration not available")
            try:
                self.fd_integration = FdIntegration()
                if not self.fd_integration.is_available():
                    # fd binary is not usable on this system
                    self.fd_integration = None
                    raise RuntimeError("fd integration not available")
            except Exception:
                self.fd_integration = None
                raise RuntimeError("fd integration not available")

        progress = None
        task_id = None

        if self.show_progress:
            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Discovering files with fd..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,
            )
            progress.start()
            task_id = progress.add_task("Discovering...", total=None)

        # Run the fd discovery and analysis inside a try so we always stop
        # the progress bar in the finally block below.
        try:
            # Use fd to get all files and directories rapidly
            if progress and task_id is not None:
                progress.update(task_id, description="[bold blue]Finding files...")

            # fd's --max-depth applies to the matched path; to match the
            # Python/Rust semantics where files up to depth (max_depth + 1)
            # are included, when a max_depth is provided for the probe we
            # increase the file search depth by 1.
            file_max_depth = None if max_depth is None else max_depth + 1
            # When using fd in auto mode, prefer flags that match a raw
            # traversal (include hidden files, don't honor ignore files, but don't follow symlinks)
            fd_find_kwargs: dict = {
                "path": path,
                "file_types": ["f"],
                "max_depth": file_max_depth,
                "absolute_paths": self.return_absolute_paths,
                "threads": threads,
            }
            if self.search_backend == "auto" or self.config.fd_no_ignore:
                fd_find_kwargs.update({"search_hidden": True, "no_ignore": True, "follow_links": False})

            all_files = self.fd_integration.find(**fd_find_kwargs)

            if progress and task_id is not None:
                progress.update(task_id, description="[bold blue]Finding directories...")

            all_dirs = self.fd_integration.find(
                path=path,
                file_types=["d"],  # Directories only
                max_depth=max_depth,
                absolute_paths=self.return_absolute_paths,
                threads=threads,
                search_hidden=True if self.search_backend == "auto" else False,
                no_ignore=True if self.search_backend == "auto" else False,
                follow_links=False,  # Don't follow symlinks by default
            )

            # Convert to Path objects for analysis
            root_path_obj = Path(path).resolve()
            all_paths = [Path(p) for p in all_files + all_dirs]

            # If DataFrame building is enabled and DataFrame support is available,
            # build a prebuilt DataFrame from the fd results and pass it to the
            # Python probing logic to avoid rebuilding the DataFrame there.
            prebuilt_df = None
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                try:
                    prebuilt_df = DataFrame([str(p) for p in all_paths])
                except Exception:
                    # If DataFrame construction fails for any reason, fall back
                    # to letting _probe_paths_python collect paths itself.
                    prebuilt_df = None

            if progress and task_id is not None:
                progress.update(task_id, description="[bold yellow]Analyzing discovered files...")
                progress.update(task_id, total=100, completed=50)

                # Now probe the discovered paths using Python logic
                # Pass the existing progress to avoid conflicts. If a prebuilt DataFrame
                # exists, provide it to avoid rebuilding the DataFrame inside the probe.
                result = self._probe_paths_python(
                    root_path_obj,
                    all_paths,
                    max_depth,
                    existing_progress=progress,
                    existing_task_id=task_id,
                    prebuilt_dataframe=prebuilt_df,
                )
            else:
                # No progress provided; run probe without progress integration
                result = self._probe_paths_python(
                    root_path_obj,
                    all_paths,
                    max_depth,
                    existing_progress=None,
                    existing_task_id=None,
                    prebuilt_dataframe=prebuilt_df,
                )

            if progress and task_id is not None:
                progress.update(task_id, description="[bold green]Analysis complete!")
                progress.update(task_id, completed=100)

            return result

        finally:
            if progress:
                progress.stop()

    def sample_paths(self, path: str, sample_size: int = 20) -> Dict[str, List[str]]:
        """Return small samples of paths for quick backend-diffing.

        Returns a dict with keys 'fd_files', 'fd_dirs', 'python_files'. Rust currently
        does not expose a path list in the public API so it is omitted (you can
        re-run the Rust prober separately if needed).
        """
        samples = {"fd_files": [], "fd_dirs": [], "python_files": []}
        try:
            if FD_AVAILABLE:
                fd = FdIntegration()
                samples["fd_files"] = fd.find(
                    path=path,
                    file_types=["f"],
                    max_results=sample_size,
                    search_hidden=True,
                    no_ignore=True,
                    follow_links=False,
                    absolute_paths=self.return_absolute_paths,
                )
                samples["fd_dirs"] = fd.find(
                    path=path,
                    file_types=["d"],
                    max_results=sample_size,
                    search_hidden=True,
                    no_ignore=True,
                    follow_links=False,
                    absolute_paths=self.return_absolute_paths,
                )
        except Exception:
            samples["fd_files"] = []
            samples["fd_dirs"] = []

        # Python sample
        try:
            root = Path(path)
            python_files = []
            for i, p in enumerate(root.rglob("*")):
                if p.is_file():
                    python_files.append(str(p.resolve()))
                if len(python_files) >= sample_size:
                    break
            samples["python_files"] = python_files
        except Exception:
            samples["python_files"] = []

        return samples

    def _probe_paths_python(
        self,
        path_root: Path,
        all_paths: List[Path],
        max_depth: Optional[int] = None,
        existing_progress=None,
        existing_task_id=None,
        prebuilt_dataframe=None,
    ) -> Dict:
        """Analyze pre-discovered paths using Python logic.

        This method takes a list of paths (from fd or other source) and performs
        the statistical analysis to maintain consistency with the Python backend.

        Args:
        ----
            path: Root directory being probed
            all_paths: List of paths to probe
            max_depth: Maximum depth for analysis
            existing_progress: Existing progress bar to reuse (avoids conflicts)
            existing_task_id: Existing task ID to update
            path_root: The resolved root Path for the probe (used for depth calculations)
            prebuilt_dataframe: Optional DataFrame supplied to avoid rebuilding inside probe

        """
        # Initialize counters and collections
        file_count = 0
        folder_count = 1  # Start with 1 to count the root directory itself
        total_size = 0
        empty_folders = []
        file_extensions = Counter()
        folder_names = Counter()
        files_per_folder = defaultdict(int)
        depth_stats = defaultdict(int)

        # Count the root directory at depth 0
        depth_stats[0] = 1

        # Collection for DataFrame if enabled. If a prebuilt_dataframe is provided
        # (e.g. from fd results), skip collecting paths and attach it at the end.
        dataframe_paths = [] if (self.build_dataframe and prebuilt_dataframe is None) else None

        # Sort paths for better progress indication (guard against None or unsortable lists)
        if all_paths:
            try:
                all_paths.sort()
            except Exception:
                # If sorting fails (e.g., mixed types), ignore and proceed
                pass

        progress = existing_progress
        task_id = existing_task_id
        processed_items = 0
        progress_owned = False  # Track if we own the progress bar

        if self.show_progress and existing_progress is None:
            # Only create new progress if none was provided
            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Analyzing file metadata..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TextColumn("({task.completed:,}/{task.total:,} items)"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,
            )
            progress.start()
            task_id = progress.add_task("Analyzing...", total=len(all_paths))
            progress_owned = True
        elif existing_progress and existing_task_id:
            # Update existing progress for the analysis phase
            existing_progress.update(
                existing_task_id,
                description="[bold yellow]Analyzing file metadata...",
                total=len(all_paths),
                completed=0,
            )

        try:
            for current_path in all_paths:
                processed_items += 1

                # Update progress
                if progress and task_id is not None:
                    if processed_items % 100 == 0:
                        progress.update(task_id, completed=processed_items)

                    if self.progress_callback:
                        self.progress_callback(
                            f"Processing: {current_path.name}",
                            processed_items,
                            len(all_paths),
                        )

                # Calculate current depth
                try:
                    depth = len(current_path.relative_to(path_root).parts)
                except ValueError:
                    depth = 0

                # Skip if beyond max depth (should not happen with fd filtering, but safety check)
                if max_depth is not None:
                    if current_path.is_dir() and depth > max_depth:
                        continue
                    elif current_path.is_file() and depth > max_depth + 1:
                        continue

                # Add to paths collection if DataFrame is enabled and we're collecting paths
                if self.build_dataframe and dataframe_paths is not None:
                    dataframe_paths.append(str(current_path))

                if current_path.is_dir():
                    depth_stats[depth] += 1
                    folder_count += 1

                    # Check for empty folders
                    try:
                        if not any(current_path.iterdir()):
                            empty_folders.append(str(current_path))
                    except (OSError, PermissionError):
                        pass

                    # Analyze folder names for patterns
                    folder_names[current_path.name] += 1

                elif current_path.is_file():
                    file_count += 1

                    # Count files in parent directory
                    files_per_folder[str(current_path.parent)] += 1

                    # Get file extension
                    ext = current_path.suffix.lower()
                    if ext:
                        file_extensions[ext] += 1
                    else:
                        file_extensions["<no extension>"] += 1

                    # Add to total size
                    try:
                        total_size += current_path.stat().st_size
                    except (OSError, IOError):
                        pass

            # Final progress update
            if progress and task_id is not None:
                progress.update(task_id, completed=processed_items)

            # Calculate summary statistics
            avg_files_per_folder = file_count / max(1, folder_count)

            # Find folders with most files
            top_folders_by_file_count = sorted(files_per_folder.items(), key=lambda x: x[1], reverse=True)[:10]

            # Build result dictionary
            result = {
                "path": str(path_root),
                "summary": {
                    "total_files": file_count,
                    "total_folders": folder_count,
                    "total_size_bytes": total_size,
                    "total_size_mb": round(total_size / (1024 * 1024), 2),
                    "avg_files_per_folder": round(avg_files_per_folder, 2),
                    "max_depth": max(depth_stats.keys()) if depth_stats else 0,
                    "empty_folder_count": len(empty_folders),
                },
                "file_extensions": dict(file_extensions.most_common(20)),
                "common_folder_names": dict(folder_names.most_common(20)),
                "empty_folders": empty_folders,
                "top_folders_by_file_count": top_folders_by_file_count,
                "depth_distribution": dict(depth_stats),
            }

            # Add DataFrame if enabled
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                if prebuilt_dataframe is not None:
                    # Use prebuilt DataFrame supplied by caller (fd results)
                    result["dataframe"] = prebuilt_dataframe
                else:
                    result["dataframe"] = DataFrame(dataframe_paths)

            return result

        finally:
            if progress and progress_owned:
                progress.stop()

    def _probe_rust(self, path: str, max_depth: Optional[int] = None, fast_path_only: bool = False) -> Dict:
        """Run the directory analysis through one of the Rust probers.

        The statistics are computed by the Rust extension. When DataFrame
        building is enabled, the path list is collected separately with
        ``pathlib`` because the Rust result does not include the paths.

        Prober selection:

        * the async prober, when the path is on a network filesystem, the
          instance has ``use_async`` set and the async variant is available;
        * otherwise the parallel prober, when ``use_parallel`` is set and the
          parallel variant is available;
        * otherwise the sequential prober.

        Every prober receives the same ``follow_links`` / ``search_hidden`` /
        ``no_ignore`` flags (previously the sequential fallback taken when the
        async variant was unavailable silently dropped them).

        Args:
        ----
            path: Directory to analyze.
            max_depth: Optional depth limit forwarded to the Rust prober and
                applied when collecting DataFrame paths.
            fast_path_only: Forwarded unchanged to the Rust prober.

        Returns:
        -------
            The result dict produced by the Rust prober, extended with a
            ``"dataframe"`` entry (and ``"dataframe_note"`` when some paths
            could not be read) if DataFrame building is enabled.

        """
        progress = None
        task_id = None

        if self.show_progress:
            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Analyzing directory structure..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,  # Remove progress bar when done
            )
            progress.start()
            task_id = progress.add_task("Analyzing...", total=None)

        try:
            # Filesystem detection is best-effort: any failure means "not network".
            try:
                fs_type = self._detect_filesystem_type(path)
            except Exception:
                fs_type = None

            network_markers = ("nfs", "cifs", "smb", "ceph", "gluster", "sshfs")
            is_network_fs = bool(fs_type) and any(marker in fs_type.lower() for marker in network_markers)

            # With the 'auto' backend: scan hidden/ignored entries but do not
            # follow symlinks. Otherwise pass None for all three flags.
            if self.search_backend == "auto":
                walk_flags = {"follow_links": False, "search_hidden": True, "no_ignore": True}
            else:
                walk_flags = {"follow_links": None, "search_hidden": None, "no_ignore": None}

            # The async prober is used only for network filesystems AND when
            # the user explicitly enabled it via `use_async`.
            if is_network_fs and self.use_async and RUST_ASYNC_AVAILABLE:
                result = probe_directory_rust_async(
                    path,
                    max_depth,
                    self.network_concurrency,
                    self.network_timeout_ms,
                    self.network_retries,
                    fast_path_only,
                    **walk_flags,
                )
            elif self.use_parallel and RUST_PARALLEL_AVAILABLE:
                result = probe_directory_rust_parallel(
                    path,
                    max_depth,
                    self.parallel_threshold,
                    fast_path_only,
                    **walk_flags,
                )
            else:
                result = probe_directory_rust(
                    path,
                    max_depth,
                    fast_path_only,
                    **walk_flags,
                )

            # Update progress to show completion
            if progress and task_id is not None:
                progress.update(task_id, description="[bold green]Analysis complete!")
                progress.update(task_id, total=100, completed=100)

            # Rust now returns absolute (or canonicalized when follow_links=True) paths,
            # so Python-side normalization is no longer necessary here.

            # If DataFrame building is enabled, we need to collect file paths
            # since the Rust implementation doesn't return them
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                if progress and task_id is not None:
                    progress.update(task_id, description="[bold yellow]Building DataFrame...")

                root_path_obj = Path(path)
                all_paths = []
                permission_errors_encountered = False

                # Collect paths with pathlib, tolerating unreadable entries.
                try:
                    for current_path in root_path_obj.rglob("*"):
                        try:
                            depth = len(current_path.relative_to(root_path_obj).parts)

                            # Skip if beyond max depth
                            if max_depth is not None and depth > max_depth:
                                continue

                            all_paths.append(str(current_path))
                        except (ValueError, OSError):
                            # Skip paths that can't be accessed or processed
                            permission_errors_encountered = True
                            continue
                except OSError:
                    # rglob itself failed: keep whatever was collected so far.
                    self.console.print("[yellow]Warning: Some paths couldn't be accessed for DataFrame building[/yellow]")
                    logger.warning(f"DataFrame building encountered permission errors on {path}, providing partial results")
                    permission_errors_encountered = True

                # Add DataFrame to the result (may be partial if there were permission errors)
                result["dataframe"] = DataFrame(all_paths)
                if permission_errors_encountered:
                    result["dataframe_note"] = "DataFrame may be incomplete due to permission restrictions"

                if progress and task_id is not None:
                    progress.update(task_id, description="[bold green]DataFrame built!")

            return result

        finally:
            if progress:
                progress.stop()

    def _probe_python(self, path: str, max_depth: Optional[int] = None) -> Dict:
        """Pure Python implementation with enhanced DataFrame support and progress indication."""
        path_root = Path(path)
        if not path_root.exists():
            raise ValueError(f"Path does not exist: {path_root}")
        if not path_root.is_dir():
            raise ValueError(f"Path is not a directory: {path_root}")

        # Initialize counters and collections
        file_count = 0
        folder_count = 1  # Start with 1 to count the root directory itself
        total_size = 0
        empty_folders = []
        file_extensions = Counter()
        folder_names = Counter()
        files_per_folder = defaultdict(int)
        depth_stats = defaultdict(int)

        # Count the root directory at depth 0
        depth_stats[0] = 1

        # Collection for DataFrame if enabled
        all_paths = [] if self.build_dataframe else None

        # Estimate total items for progress tracking
        progress = None
        task_id = None
        total_items = None
        processed_items = 0

        if self.show_progress:
            # Quick estimation pass
            total_items = sum(1 for _ in path_root.rglob("*"))

            progress = Progress(
                SpinnerColumn(),
                TextColumn("[bold blue]Analyzing directory structure..."),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TextColumn("({task.completed:,}/{task.total:,} items)"),
                TimeElapsedColumn(),
                console=self.console,
                transient=True,
            )
            progress.start()
            task_id = progress.add_task("Analyzing...", total=total_items)

        try:
            # Walk through directory tree using pathlib for consistency
            try:
                for current_path in path_root.rglob("*"):
                    try:
                        processed_items += 1

                        # Update progress
                        if progress and task_id is not None:
                            if processed_items % 100 == 0:  # Update every 100 items for performance
                                progress.update(task_id, completed=processed_items)

                            # Call custom progress callback if provided
                            if self.progress_callback:
                                self.progress_callback(
                                    f"Processing: {current_path.name}",
                                    processed_items,
                                    total_items or 0,
                                )

                        # Calculate current depth
                        try:
                            depth = len(current_path.relative_to(path_root).parts)
                        except ValueError:
                            depth = 0

                        # Skip if beyond max depth (match Rust implementation logic)
                        if max_depth is not None:
                            try:
                                if current_path.is_dir() and depth > max_depth:
                                    continue
                                elif current_path.is_file() and depth > max_depth + 1:
                                    continue
                            except (OSError, PermissionError):
                                # Skip paths we can't access for depth checking
                                continue

                        # Add to paths collection if DataFrame is enabled
                        if self.build_dataframe and all_paths is not None:
                            all_paths.append(str(current_path))

                        try:
                            is_dir = current_path.is_dir()
                            is_file = current_path.is_file()
                        except (OSError, PermissionError):
                            # Skip paths we can't determine type for
                            continue

                        if is_dir:
                            depth_stats[depth] += 1
                            folder_count += 1

                            # Check for empty folders
                            try:
                                if not any(current_path.iterdir()):
                                    empty_folders.append(str(current_path))
                            except (OSError, PermissionError):
                                # Skip directories we can't read
                                pass

                            # Analyze folder names for patterns
                            folder_names[current_path.name] += 1

                        elif is_file:
                            file_count += 1

                            # Count files in parent directory
                            files_per_folder[str(current_path.parent)] += 1

                            # Get file extension
                            ext = current_path.suffix.lower()
                            if ext:
                                file_extensions[ext] += 1
                            else:
                                file_extensions["<no extension>"] += 1

                            # Add to total size
                            try:
                                total_size += current_path.stat().st_size
                            except (OSError, IOError):
                                # Skip files we can't stat (permissions, broken symlinks, etc.)
                                pass

                    except (OSError, PermissionError):
                        # Skip individual files/directories we can't access
                        continue

            except (OSError, PermissionError):
                # If rglob fails entirely, we can't probe this directory
                self.console.print(f"[yellow]Warning: Cannot access directory {path_root} - insufficient permissions[/yellow]")
                # Return minimal result
                return {
                    "path": str(path_root),
                    "summary": {
                        "total_files": 0,
                        "total_folders": 0,
                        "total_size_bytes": 0,
                        "total_size_mb": 0.0,
                        "avg_files_per_folder": 0.0,
                        "max_depth": 0,
                        "empty_folder_count": 0,
                    },
                    "file_extensions": {},
                    "common_folder_names": {},
                    "empty_folders": [],
                    "top_folders_by_file_count": [],
                    "depth_distribution": {},
                    "timing": {"error": "Permission denied"},
                }

            # Final progress update
            if progress and task_id is not None:
                progress.update(task_id, completed=processed_items)

            # Calculate summary statistics
            avg_files_per_folder = file_count / max(1, folder_count)

            # Find folders with most files
            top_folders_by_file_count = sorted(files_per_folder.items(), key=lambda x: x[1], reverse=True)[:10]

            # Build result dictionary
            result = {
                "path": str(path_root),
                "summary": {
                    "total_files": file_count,
                    "total_folders": folder_count,
                    "total_size_bytes": total_size,
                    "total_size_mb": round(total_size / (1024 * 1024), 2),
                    "avg_files_per_folder": round(avg_files_per_folder, 2),
                    "max_depth": max(depth_stats.keys()) if depth_stats else 0,
                    "empty_folder_count": len(empty_folders),
                },
                "file_extensions": dict(file_extensions.most_common(20)),
                "common_folder_names": dict(folder_names.most_common(20)),
                "empty_folders": empty_folders,
                "top_folders_by_file_count": top_folders_by_file_count,
                "depth_distribution": dict(depth_stats),
            }

            # Add DataFrame if enabled
            if self.build_dataframe and DATAFRAME_AVAILABLE:
                result["dataframe"] = DataFrame(all_paths)

            return result

        finally:
            if progress:
                progress.stop()

    def print_summary(self, analysis: "DirectoryAnalysis"):
        """Print the headline metrics of a directory analysis as a table.

        The table title names the implementation recorded in the timing data
        (and marks DataFrame usage). Timing rows are added only for the timing
        values that are actually present, so an analysis whose timing dict
        lacks ``elapsed_seconds`` (for example one that only carries an
        ``error`` entry) is printed instead of raising ``KeyError``.

        Args:
        ----
            analysis: :class:`DirectoryAnalysis` instance to summarize.

        Raises:
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_summary expects a DirectoryAnalysis instance")

        summary = analysis.summary
        timing = analysis.timing or {}
        has_dataframe = self.build_dataframe and analysis.dataframe is not None

        # Show which implementation was used with more detail
        impl_type = timing.get("implementation", "Unknown")
        if has_dataframe:
            impl_type += " + 📊 DataFrame"

        # Main summary table
        title = f"Directory Analysis: {analysis.path} ({impl_type})"
        if timing:
            title += f" - {timing.get('elapsed_seconds', 0):.2f}s"

        table = Table(title=title)
        table.add_column("Metric", style="bold cyan")
        table.add_column("Value", style="white")

        table.add_row("Total Files", f"{summary['total_files']:,}")
        table.add_row("Total Folders", f"{summary['total_folders']:,}")
        table.add_row("Total Size", f"{summary['total_size_mb']:,} MB")
        table.add_row("Average Files per Folder", str(summary["avg_files_per_folder"]))
        table.add_row("Maximum Depth", str(summary["max_depth"]))
        table.add_row("Empty Folders", str(summary["empty_folder_count"]))

        # Add DataFrame info if available
        if has_dataframe:
            table.add_row("DataFrame Rows", f"{len(analysis.dataframe):,}")

        # Add timing information if available; each value is optional.
        if timing:
            elapsed = timing.get("elapsed_seconds")
            if elapsed is not None:
                table.add_row("Analysis Time", f"{elapsed:.2f}s")
            items_per_second = timing.get("items_per_second") or 0
            if items_per_second > 0:
                table.add_row("Processing Speed", f"{items_per_second:,.0f} items/sec")

        self.console.print(table)
        self.console.print()

    def get_dataframe(self, analysis: "DirectoryAnalysis") -> Optional["DataFrame"]:
        """Return the DataFrame carried by an analysis result.

        Args:
        ----
            analysis: :class:`DirectoryAnalysis` instance to read from.

        Returns:
        -------
            Whatever ``analysis.to_df()`` returns.

        Raises:
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if isinstance(analysis, DirectoryAnalysis):
            return analysis.to_df()
        raise TypeError("get_dataframe expects a DirectoryAnalysis instance")

    def is_dataframe_enabled(self) -> bool:
        """Check if DataFrame building is enabled and available.

        Returns
        -------
            True if DataFrame building is enabled, False otherwise

        """
        return self.build_dataframe and DATAFRAME_AVAILABLE

    def _detect_filesystem_type(self, path: str) -> Optional[str]:
        """Attempt to detect the filesystem type for a given path.

        Returns the fs type string (e.g., 'nfs', 'ext4') or None if not detected.
        """
        import os

        try:
            # Parse /proc/mounts for the mount containing the path
            mounts = []
            with open("/proc/mounts", "r") as f:
                for line in f:
                    parts = line.split()
                    if len(parts) >= 3:
                        mounts.append((parts[1], parts[2]))  # (mount_point, fs_type)

            # Find best match by longest mount_point prefix
            best = ("", None)
            p = os.path.abspath(path)
            for mnt, fst in mounts:
                if p.startswith(mnt) and len(mnt) > len(best[0]):
                    best = (mnt, fst)

            if best[1]:
                return best[1]

        except Exception:
            pass

        # Fallback: try os.statvfs and map f_fsid is not portable; return None
        return None

    def print_file_extensions(self, analysis: "DirectoryAnalysis", top_n: int = 10):
        """Print a table of file extensions with counts and percentages.

        Shows the first ``top_n`` entries of ``analysis.file_extensions`` in
        their stored order; prints nothing when there are no extensions.

        Raises
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_file_extensions expects a DirectoryAnalysis instance")

        extensions = analysis.file_extensions
        if not extensions:
            return

        table = Table(title="File Extensions")
        for heading, colour in (("Extension", "bold magenta"), ("Count", "white"), ("Percentage", "green")):
            table.add_column(heading, style=colour)

        total_files = analysis.summary["total_files"]
        shown = list(extensions.items())[:top_n]
        for ext, count in shown:
            # Guard against division by zero when no files were counted.
            if total_files > 0:
                percentage = count / total_files * 100
            else:
                percentage = 0
            table.add_row(ext, f"{count:,}", f"{percentage:.1f}%")

        self.console.print(table)
        self.console.print()

    def print_folder_patterns(self, analysis: "DirectoryAnalysis", top_n: int = 10):
        """Print a table of folder names and how often each occurs.

        Shows the first ``top_n`` entries of ``analysis.common_folder_names``
        in their stored order; prints nothing when the mapping is empty.

        Raises
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_folder_patterns expects a DirectoryAnalysis instance")

        folder_names = analysis.common_folder_names
        if not folder_names:
            return

        table = Table(title="Common Folder Names")
        for heading, colour in (("Folder Name", "bold blue"), ("Occurrences", "white")):
            table.add_column(heading, style=colour)

        shown = list(folder_names.items())[:top_n]
        for name, count in shown:
            table.add_row(name, f"{count:,}")

        self.console.print(table)
        self.console.print()

    def print_empty_folders(self, analysis: "DirectoryAnalysis", max_show: int = 20):
        """Print the empty folders recorded in an analysis.

        Lists at most ``max_show`` paths followed by a "... and N more" row
        when the list was truncated. Prints a confirmation message instead of
        a table when there are no empty folders.

        Raises
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_empty_folders expects a DirectoryAnalysis instance")

        empty_folders = analysis.empty_folders
        if not empty_folders:
            self.console.print("[green]✓ No empty folders found![/green]")
            return

        found = len(empty_folders)
        table = Table(title=f"Empty Folders (showing {min(found, max_show)} of {found})")
        table.add_column("Path", style="yellow")

        for folder in empty_folders[:max_show]:
            table.add_row(folder)

        if found > max_show:
            table.add_row(f"... and {found - max_show} more")

        self.console.print(table)
        self.console.print()

    def print_report(self, analysis: "DirectoryAnalysis"):
        """Print the full report: summary, extensions, folder names, empty folders.

        Expects a :class:`DirectoryAnalysis` instance. Use :meth:`to_dict`
        if you need a plain dict shape for downstream tooling.

        Raises
        ------
            TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

        """
        if not isinstance(analysis, DirectoryAnalysis):
            raise TypeError("print_report expects a DirectoryAnalysis instance")

        # Sections are emitted in this fixed order.
        sections = (
            self.print_summary,
            self.print_file_extensions,
            self.print_folder_patterns,
            self.print_empty_folders,
        )
        for print_section in sections:
            print_section(analysis)

__init__(config)

Initialize the directory profiler.

The profiler is configured with a DirectoryProfilerConfig instance which holds options such as whether to use Rust acceleration, parallel processing, fd integration, thresholding for parallelism, DataFrame building, and progress reporting callbacks. Pass a DirectoryProfilerConfig object as the single config argument. See DirectoryProfilerConfig for descriptions of each configurable field.

Source code in filoma/directories/directory_profiler.py
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
def __init__(self, config: "DirectoryProfilerConfig"):
    """Set up the profiler from a `DirectoryProfilerConfig`.

    The config object is the only accepted argument. It is validated against
    the features available in this build, then its options are copied onto
    the instance: the resolved backend (`use_rust` / `use_fd`), the
    parallel/async toggles, DataFrame and progress settings, network tuning
    values and the fd thread count.

    Raises a `TypeError` when `config` is not a `DirectoryProfilerConfig`,
    a `RuntimeError` when a requested feature is unavailable, and a
    `ValueError` when network tuning or `threads` is set for a backend that
    does not use it.
    """
    # Only a DirectoryProfilerConfig is accepted; the check is by class name.
    config_class_name = config.__class__.__name__ if hasattr(config, "__class__") else None
    if config_class_name != "DirectoryProfilerConfig":
        raise TypeError("DirectoryProfiler requires a DirectoryProfilerConfig instance as the sole argument")

    self.console = Console()
    self.config = config

    # Aliases that keep the previously used attribute names working.
    self.search_backend = config.search_backend
    self.parallel_threshold = config.parallel_threshold
    self._fast_path_only = config.fast_path_only
    self.progress_callback = config.progress_callback

    # Each explicitly requested feature must exist in this build; the first
    # missing one (in this order) is reported.
    feature_requirements = (
        (config.use_rust, RUST_AVAILABLE, "Rust implementation requested but not available in this build"),
        (config.use_parallel, RUST_PARALLEL_AVAILABLE, "Parallel Rust requested but not available"),
        (config.use_async, RUST_ASYNC_AVAILABLE, "Async Rust prober requested but not available in this build"),
        (config.use_fd, FD_AVAILABLE, "fd integration requested but not available in this environment"),
        (
            config.build_dataframe,
            DATAFRAME_AVAILABLE,
            "DataFrame building requested but Polars/DataFrame support is not available",
        ),
    )
    for requested, available, message in feature_requirements:
        if requested and not available:
            raise RuntimeError(message)

    # Network tuning is rejected unless async is on; the values below are
    # the ones treated as "not customized".
    network_settings = (config.network_concurrency, config.network_timeout_ms, config.network_retries)
    has_custom_network_params = network_settings != (192, 20000, 0)
    if has_custom_network_params and not config.use_async:
        raise ValueError("Network tuning parameters only apply when use_async=True")

    # `threads` is only meaningful for the fd backend.
    is_using_fd = config.use_fd or config.search_backend == "fd"
    if not is_using_fd and config.threads is not None:
        raise ValueError("'threads' setting only applies when use_fd=True or search_backend='fd'")

    # Resolve the backend. With 'auto', an explicit and available fd request
    # wins (also when Rust was requested too), then an explicit Rust request,
    # and only then plain auto-detection.
    backend_choice = config.search_backend
    if backend_choice == "auto":
        fd_requested = config.use_fd and FD_AVAILABLE
        rust_requested = config.use_rust and RUST_AVAILABLE
        if fd_requested:
            backend_choice = "fd"
        elif rust_requested:
            backend_choice = "rust"
        elif config.fast_path_only:
            # Pure file discovery prefers the Python walker.
            backend_choice = "python"
        elif RUST_AVAILABLE:
            backend_choice = "rust"
        elif FD_AVAILABLE:
            backend_choice = "fd"
        else:
            backend_choice = "python"

    # Any value other than "rust" or "fd" means the Python implementation.
    self.use_rust = backend_choice == "rust"
    self.use_fd = backend_choice == "fd"

    # Parallel and async only take effect together with the Rust backend.
    self.use_parallel = bool(config.use_parallel and self.use_rust)
    self.use_async = bool(config.use_async and self.use_rust)

    self.build_dataframe = bool(config.build_dataframe)
    self.return_absolute_paths = bool(config.return_absolute_paths)

    # Progress bars are switched off in interactive environments.
    suppress_progress = _is_interactive_environment() and config.show_progress
    if suppress_progress:
        logger.debug("Interactive environment detected, disabling progress bars to avoid conflicts")
    self.show_progress = False if suppress_progress else bool(config.show_progress)

    # Network tuning values (validated above).
    self.network_concurrency = config.network_concurrency
    self.network_timeout_ms = config.network_timeout_ms
    self.network_retries = config.network_retries

    # Thread count is kept only when the fd backend was selected.
    self.threads = config.threads if self.use_fd else None

    # fd integration is created lazily on first use.
    self.fd_integration = None

get_dataframe(analysis)

Get the DataFrame from analysis results.


Args: analysis – a DirectoryAnalysis instance.

Returns: DataFrame object if available, None otherwise.
Source code in filoma/directories/directory_profiler.py
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
def get_dataframe(self, analysis: "DirectoryAnalysis") -> Optional["DataFrame"]:
    """Return the DataFrame carried by an analysis result.

    Args:
    ----
        analysis: :class:`DirectoryAnalysis` instance to read from.

    Returns:
    -------
        Whatever ``analysis.to_df()`` returns.

    Raises:
    ------
        TypeError: If ``analysis`` is not a :class:`DirectoryAnalysis`.

    """
    if isinstance(analysis, DirectoryAnalysis):
        return analysis.to_df()
    raise TypeError("get_dataframe expects a DirectoryAnalysis instance")

get_implementation_info()

Get information about which implementations are available and being used.

Returns
Dictionary with implementation availability status
Source code in filoma/directories/directory_profiler.py
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
def get_implementation_info(self) -> dict:
    """Describe which implementations exist in this build and which are in use.

    Returns
    -------
        Dictionary whose ``*_available`` keys mirror the module-level
        availability flags, followed by the instance's active settings.

    """
    # What this build offers.
    availability = {
        "rust_available": RUST_AVAILABLE,
        "rust_parallel_available": RUST_PARALLEL_AVAILABLE,
        "rust_async_available": RUST_ASYNC_AVAILABLE,
        "fd_available": FD_AVAILABLE,
        "dataframe_available": DATAFRAME_AVAILABLE,
    }
    # What this instance is configured to use.
    python_only = not (self.use_rust or self.use_fd)
    active = {
        "using_rust": self.use_rust,
        "using_parallel": self.use_parallel,
        "using_async": bool(self.use_async and RUST_ASYNC_AVAILABLE),
        "using_fd": self.use_fd,
        "using_dataframe": self.build_dataframe,
        "return_absolute_paths": self.return_absolute_paths,
        "search_backend": self.search_backend,
        "python_fallback": python_only,
    }
    return {**availability, **active}

is_dataframe_enabled()

Check if DataFrame building is enabled and available.

Returns
True if DataFrame building is enabled, False otherwise
Source code in filoma/directories/directory_profiler.py
1422
1423
1424
1425
1426
1427
1428
1429
1430
def is_dataframe_enabled(self) -> bool:
    """Report whether DataFrame building is both requested and available.

    Returns
    -------
        The instance's ``build_dataframe`` flag when it is falsy,
        otherwise the module-level ``DATAFRAME_AVAILABLE`` value.

    """
    if not self.build_dataframe:
        # Same result as ``a and b``: a falsy left operand is returned as-is.
        return self.build_dataframe
    return DATAFRAME_AVAILABLE

is_fd_available()

Check if fd integration is available and being used.

Returns
True if fd is available and enabled, False otherwise
Source code in filoma/directories/directory_profiler.py
435
436
437
438
439
440
441
442
443
444
445
def is_fd_available(self) -> bool:
    """Check if fd integration is available and being used.

    Returns
    -------
        True if fd is available and enabled, False otherwise

    """
    # Use FD_AVAILABLE to reflect whether the fd integration package is importable
    # Tests may monkeypatch FD_AVAILABLE without having the fd binary present.
    # ``and`` evaluates to one of its operands, so coerce the result to keep
    # the ``-> bool`` contract even when ``use_fd`` is a falsy non-bool.
    return bool(self.use_fd and FD_AVAILABLE)

is_parallel_available()

Check if parallel Rust implementation is available and being used.

Returns
True if parallel Rust implementation is available and enabled, False otherwise
Source code in filoma/directories/directory_profiler.py
425
426
427
428
429
430
431
432
433
def is_parallel_available(self) -> bool:
    """Check if parallel Rust implementation is available and being used.

    Returns
    -------
        True if parallel Rust implementation is available and enabled, False otherwise

    """
    # ``and`` evaluates to one of its operands, so coerce the result to keep
    # the ``-> bool`` contract even when ``use_parallel`` is a falsy non-bool.
    return bool(self.use_parallel and RUST_PARALLEL_AVAILABLE)

is_rust_available()

Check if Rust implementation is available and being used.

Returns
True if Rust implementation is available and enabled, False otherwise
Source code in filoma/directories/directory_profiler.py
415
416
417
418
419
420
421
422
423
def is_rust_available(self) -> bool:
    """Check if Rust implementation is available and being used.

    Returns
    -------
        True if Rust implementation is available and enabled, False otherwise

    """
    # ``and`` evaluates to one of its operands, so coerce the result to keep
    # the ``-> bool`` contract even when ``use_rust`` is a falsy non-bool.
    return bool(self.use_rust and RUST_AVAILABLE)

print_empty_folders(analysis, max_show=20)

Print empty folders found (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
def print_empty_folders(self, analysis: "DirectoryAnalysis", max_show: int = 20):
    """Render the empty folders recorded on ``analysis`` to the console.

    When the analysis lists no empty folders, a single confirmation line is
    printed. Otherwise a one-column table is printed with the first
    ``max_show`` paths, followed by a row stating how many were left out
    when the list is longer than ``max_show``.

    Raises
    ------
        TypeError: if ``analysis`` is not a DirectoryAnalysis instance

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_empty_folders expects a DirectoryAnalysis instance")

    empty_folders = analysis.empty_folders

    if empty_folders:
        listing = Table(title=f"Empty Folders (showing {min(len(empty_folders), max_show)} of {len(empty_folders)})")
        listing.add_column("Path", style="yellow")

        for folder_path in empty_folders[:max_show]:
            listing.add_row(folder_path)

        # Tell the reader how many entries the cap above has hidden.
        if len(empty_folders) > max_show:
            listing.add_row(f"... and {len(empty_folders) - max_show} more")

        self.console.print(listing)
        self.console.print()
    else:
        self.console.print("[green]✓ No empty folders found![/green]")

print_file_extensions(analysis, top_n=10)

Print the most common file extensions (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
def print_file_extensions(self, analysis: "DirectoryAnalysis", top_n: int = 10):
    """Render a table of file extensions with counts and percentages.

    Shows the first ``top_n`` entries of ``analysis.file_extensions`` in the
    order they are stored (presumably most-common first; the ordering is
    produced upstream and is not enforced here). Nothing is printed when the
    mapping is empty.

    Raises
    ------
        TypeError: if ``analysis`` is not a DirectoryAnalysis instance

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_file_extensions expects a DirectoryAnalysis instance")

    extensions = analysis.file_extensions
    if not extensions:
        return

    table = Table(title="File Extensions")
    column_specs = (
        ("Extension", "bold magenta"),
        ("Count", "white"),
        ("Percentage", "green"),
    )
    for heading, column_style in column_specs:
        table.add_column(heading, style=column_style)

    total_files = analysis.summary["total_files"]
    leading_entries = list(extensions.items())[:top_n]

    for ext, count in leading_entries:
        # Guard against a zero total so the share never divides by zero.
        if total_files > 0:
            percentage = count / total_files * 100
        else:
            percentage = 0
        table.add_row(ext, f"{count:,}", f"{percentage:.1f}%")

    self.console.print(table)
    self.console.print()

print_folder_patterns(analysis, top_n=10)

Print the most common folder names (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
def print_folder_patterns(self, analysis: "DirectoryAnalysis", top_n: int = 10):
    """Render a table of folder names and how often each occurs.

    Shows the first ``top_n`` entries of ``analysis.common_folder_names`` in
    the order they are stored (presumably most-common first; the ordering is
    produced upstream and is not enforced here). Nothing is printed when the
    mapping is empty.

    Raises
    ------
        TypeError: if ``analysis`` is not a DirectoryAnalysis instance

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_folder_patterns expects a DirectoryAnalysis instance")

    folder_names = analysis.common_folder_names

    if folder_names:
        table = Table(title="Common Folder Names")
        for heading, column_style in (("Folder Name", "bold blue"), ("Occurrences", "white")):
            table.add_column(heading, style=column_style)

        leading_entries = list(folder_names.items())[:top_n]
        for name, count in leading_entries:
            table.add_row(name, f"{count:,}")

        self.console.print(table)
        self.console.print()

print_report(analysis)

Print a comprehensive report of the directory analysis.

Expects a `DirectoryAnalysis` instance. Use its `to_dict` method if you need a plain dict shape for downstream tooling.

Source code in filoma/directories/directory_profiler.py
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
def print_report(self, analysis: "DirectoryAnalysis"):
    """Print every report section for a directory analysis, in order.

    The sections are: summary, file extensions, folder patterns and empty
    folders. Expects a :class:`DirectoryAnalysis` instance. Use
    :meth:`to_dict` if you need a plain dict shape for downstream tooling.

    Raises
    ------
        TypeError: if ``analysis`` is not a DirectoryAnalysis instance

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_report expects a DirectoryAnalysis instance")

    # Sections are rendered top to bottom in this order.
    sections = (
        self.print_summary,
        self.print_file_extensions,
        self.print_folder_patterns,
        self.print_empty_folders,
    )
    for render_section in sections:
        render_section(analysis)

print_summary(analysis)

Print a summary of the directory analysis (expects DirectoryAnalysis).

Source code in filoma/directories/directory_profiler.py
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
def print_summary(self, analysis: "DirectoryAnalysis"):
    """Print a summary of the directory analysis (expects DirectoryAnalysis).

    Renders one table with the headline counters from ``analysis.summary``.
    The title names the implementation recorded in ``analysis.timing`` and,
    when timing data exists, the elapsed time. A DataFrame row count is added
    when this profiler builds DataFrames and the analysis carries one.

    Raises
    ------
        TypeError: if ``analysis`` is not a DirectoryAnalysis instance

    """
    if not isinstance(analysis, DirectoryAnalysis):
        raise TypeError("print_summary expects a DirectoryAnalysis instance")

    summary = analysis.summary
    timing = analysis.timing or {}

    # Show which implementation was used with more detail
    impl_type = timing.get("implementation", "Unknown")

    # Add DataFrame indicator
    has_dataframe = self.build_dataframe and analysis.dataframe is not None
    if has_dataframe:
        impl_type += " + 📊 DataFrame"

    # Read the elapsed time once, with a default, so the title and the
    # "Analysis Time" row agree and a timing dict without this key no longer
    # raises KeyError halfway through rendering.
    elapsed_seconds = timing.get("elapsed_seconds", 0)

    # Main summary table
    title = f"Directory Analysis: {analysis.path} ({impl_type})"
    if timing:
        title += f" - {elapsed_seconds:.2f}s"

    table = Table(title=title)
    table.add_column("Metric", style="bold cyan")
    table.add_column("Value", style="white")

    table.add_row("Total Files", f"{summary['total_files']:,}")
    table.add_row("Total Folders", f"{summary['total_folders']:,}")
    table.add_row("Total Size", f"{summary['total_size_mb']:,} MB")
    table.add_row("Average Files per Folder", str(summary["avg_files_per_folder"]))
    table.add_row("Maximum Depth", str(summary["max_depth"]))
    table.add_row("Empty Folders", str(summary["empty_folder_count"]))

    # Add DataFrame info if available
    if has_dataframe:
        table.add_row("DataFrame Rows", f"{len(analysis.dataframe):,}")

    # Add timing information if available
    if timing:
        table.add_row("Analysis Time", f"{elapsed_seconds:.2f}s")
        # A zero or missing rate carries no information, so the row is skipped.
        if timing.get("items_per_second", 0) > 0:
            table.add_row("Processing Speed", f"{timing['items_per_second']:,.0f} items/sec")

    self.console.print(table)
    self.console.print()

probe(path, max_depth=None, threads=None)

Analyze a directory tree and return comprehensive statistics.


Parameters
path: Path to the root directory to probe
max_depth: Maximum depth to traverse (None for unlimited)
threads: Optional override for number of threads when using fd backend

Returns
A `DirectoryAnalysis` instance containing analysis results
Source code in filoma/directories/directory_profiler.py
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
def probe(self, path: str, max_depth: Optional[int] = None, threads: Optional[int] = None) -> "DirectoryAnalysis":
    """Analyze a directory tree and return comprehensive statistics.

    Args:
    ----
        path: Path to the root directory to probe
        max_depth: Maximum depth to traverse (None for unlimited)
        threads: Optional override for number of threads when using fd backend;
            it is not passed to the other backends

    Returns:
    -------
        A :class:`DirectoryAnalysis` instance containing analysis results,
        including a ``timing`` entry with ``elapsed_seconds``,
        ``implementation`` and ``items_per_second``

    Raises:
    ------
        Exception: any error raised while probing is logged together with the
            elapsed time and then re-raised unchanged

    """
    # perf_counter() is monotonic, so the measured duration cannot go negative
    # or jump when the system clock is adjusted mid-run (time.time() can).
    start_time = time.perf_counter()

    # Choose the best backend
    backend = self._choose_backend()

    # Log the start of analysis
    impl_type = self._get_impl_display_name(backend)
    logger.info(f"Starting directory analysis of '{path}' using {impl_type} implementation")

    try:
        if backend == "fd":
            # threads param overrides instance threads when provided
            chosen_threads = threads if threads is not None else self.threads
            result = self._probe_fd(path, max_depth, threads=chosen_threads)
        elif backend == "rust":
            result = self._probe_rust(path, max_depth, fast_path_only=self._fast_path_only)
        else:
            # Any other backend value falls through to the pure-Python walker.
            result = self._probe_python(path, max_depth)

        # Calculate and log timing
        elapsed_time = time.perf_counter() - start_time
        total_items = result["summary"]["total_files"] + result["summary"]["total_folders"]

        logger.success(
            f"Directory analysis completed in {elapsed_time:.2f}s - "
            f"Found {total_items:,} items ({result['summary']['total_files']:,} files, "
            f"{result['summary']['total_folders']:,} folders) using {impl_type}"
        )

        # Add timing information to result
        result["timing"] = {
            "elapsed_seconds": elapsed_time,
            "implementation": impl_type,
            # Guard the division: a zero-length interval reports a rate of 0.
            "items_per_second": (total_items / elapsed_time if elapsed_time > 0 else 0),
        }

        # Return a structured dataclass by default for easier programmatic use
        return DirectoryAnalysis.from_dict(result)

    except Exception as e:
        # Log how long the failed attempt ran, then let the caller handle it.
        elapsed_time = time.perf_counter() - start_time
        logger.error(f"Directory analysis failed after {elapsed_time:.2f}s: {str(e)}")
        raise

sample_paths(path, sample_size=20)

Return small samples of paths for quick backend-diffing.

Returns a dict with keys 'fd_files', 'fd_dirs', 'python_files'. Rust currently does not expose a path list in the public API so it is omitted (you can re-run the Rust prober separately if needed).

Source code in filoma/directories/directory_profiler.py
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
def sample_paths(self, path: str, sample_size: int = 20) -> Dict[str, List[str]]:
    """Return small samples of paths for quick backend-diffing.

    Returns a dict with keys 'fd_files', 'fd_dirs', 'python_files'. Rust currently
    does not expose a path list in the public API so it is omitted (you can
    re-run the Rust prober separately if needed).

    Both samplers are best-effort: if the fd queries raise, both fd lists are
    left empty; if the Python walk raises, 'python_files' is left empty.

    Args:
    ----
        path: Root directory to sample from
        sample_size: Maximum number of entries per list; a value <= 0 yields
            an empty 'python_files' list

    Returns:
    -------
        Dict mapping each sample name to a list of path strings

    """
    samples: Dict[str, List[str]] = {"fd_files": [], "fd_dirs": [], "python_files": []}
    try:
        if FD_AVAILABLE:
            fd = FdIntegration()
            # The two fd queries differ only in the entry type they ask for.
            fd_options = dict(
                path=path,
                max_results=sample_size,
                search_hidden=True,
                no_ignore=True,
                follow_links=False,
                absolute_paths=self.return_absolute_paths,
            )
            samples["fd_files"] = fd.find(file_types=["f"], **fd_options)
            samples["fd_dirs"] = fd.find(file_types=["d"], **fd_options)
    except Exception:
        # Deliberate best-effort: a failing fd run must not abort the sample.
        samples["fd_files"] = []
        samples["fd_dirs"] = []

    # Python sample
    # NOTE(review): these paths are always resolved to absolute form, whereas
    # the fd samples follow self.return_absolute_paths - confirm this is intended.
    try:
        python_files: List[str] = []
        # Check the limit up front so a non-positive sample_size collects
        # nothing (previously the first file found was appended before the check).
        if sample_size > 0:
            for entry in Path(path).rglob("*"):
                if entry.is_file():
                    python_files.append(str(entry.resolve()))
                    if len(python_files) >= sample_size:
                        break
        samples["python_files"] = python_files
    except Exception:
        # Deliberate best-effort: unreadable trees simply produce no sample.
        samples["python_files"] = []

    return samples

DirectoryProfilerConfig dataclass

Configuration for DirectoryProfiler (explicit, typed, no legacy kwargs).

All fields are documented and validated in `__post_init__`.

Source code in filoma/directories/directory_profiler.py
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
@dataclass(frozen=True)
class DirectoryProfilerConfig:
    """Configuration for DirectoryProfiler (explicit, typed, no legacy kwargs).

    Instances are immutable (frozen dataclass). All fields are validated in
    __post_init__, which raises ValueError for any invalid value or invalid
    combination of values.
    """

    # Backend selection
    use_rust: bool = False
    use_parallel: bool = True
    use_async: bool = False
    use_fd: bool = False
    search_backend: str = "auto"  # 'rust' | 'fd' | 'python' | 'auto'

    # General tuning
    parallel_threshold: int = 1000  # must be a non-negative integer
    build_dataframe: bool = False
    return_absolute_paths: bool = False
    show_progress: bool = True
    progress_callback: Optional[Callable[[str, int, int], None]] = None
    fast_path_only: bool = False

    # Network tuning (only valid when use_async is True)
    network_concurrency: int = 192  # must be a positive integer
    network_timeout_ms: int = 20000  # must be > 0
    network_retries: int = 0  # must be >= 0

    # fd-specific tuning
    threads: Optional[int] = None  # only allowed when the fd backend is selected
    fd_no_ignore: bool = False

    def __post_init__(self):
        """Validate configuration fields after initialization.

        Ensures values are within acceptable ranges and relationships are
        enforced (for example, network tuning only when async is enabled).

        Raises
        ------
            ValueError: if a field has an unsupported value or type, if
                network tuning differs from its defaults while use_async is
                False, or if ``threads`` is set without the fd backend

        """
        # Basic validations
        if self.search_backend not in ("auto", "rust", "fd", "python"):
            raise ValueError("search_backend must be one of 'auto','rust','fd','python'")
        if not isinstance(self.parallel_threshold, int) or self.parallel_threshold < 0:
            raise ValueError("parallel_threshold must be a non-negative integer")
        if not isinstance(self.network_concurrency, int) or self.network_concurrency <= 0:
            raise ValueError("network_concurrency must be a positive integer")

        # Guard the type before comparing: a str or None would otherwise
        # escape as TypeError instead of the ValueError every other check
        # raises. Ints and floats keep being accepted exactly as before.
        numeric = (int, float)
        if not isinstance(self.network_timeout_ms, numeric) or self.network_timeout_ms <= 0:
            raise ValueError("network_timeout_ms must be positive")
        if not isinstance(self.network_retries, numeric) or self.network_retries < 0:
            raise ValueError("network_retries must be non-negative")

        # Relationship validations - only validate if non-default network params are set
        # Default values are: network_concurrency=192, network_timeout_ms=20000, network_retries=0
        # (these literals must stay in sync with the field defaults above)
        has_custom_network_params = self.network_concurrency != 192 or self.network_timeout_ms != 20000 or self.network_retries != 0
        if not self.use_async and has_custom_network_params:
            raise ValueError("Network tuning parameters only apply when use_async=True")

        # Check if fd backend is being used (either explicitly or via search_backend)
        is_using_fd = self.use_fd or self.search_backend == "fd"
        if self.threads is not None and not is_using_fd:
            raise ValueError("'threads' only applies when use_fd=True or search_backend='fd'")

__post_init__()

Validate configuration fields after initialization.

Ensures values are within acceptable ranges and relationships are enforced (for example, network tuning only when async is enabled).

Source code in filoma/directories/directory_profiler.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def __post_init__(self):
    """Validate configuration fields after initialization.

    Ensures values are within acceptable ranges and relationships are
    enforced (for example, network tuning only when async is enabled).

    Raises
    ------
        ValueError: if a field has an unsupported value or type, if network
            tuning differs from its defaults while use_async is False, or if
            ``threads`` is set without the fd backend

    """
    # Basic validations
    if self.search_backend not in ("auto", "rust", "fd", "python"):
        raise ValueError("search_backend must be one of 'auto','rust','fd','python'")
    if not isinstance(self.parallel_threshold, int) or self.parallel_threshold < 0:
        raise ValueError("parallel_threshold must be a non-negative integer")
    if not isinstance(self.network_concurrency, int) or self.network_concurrency <= 0:
        raise ValueError("network_concurrency must be a positive integer")

    # Guard the type before comparing: a str or None would otherwise escape
    # as TypeError instead of the ValueError every other check raises. Ints
    # and floats keep being accepted exactly as before.
    numeric = (int, float)
    if not isinstance(self.network_timeout_ms, numeric) or self.network_timeout_ms <= 0:
        raise ValueError("network_timeout_ms must be positive")
    if not isinstance(self.network_retries, numeric) or self.network_retries < 0:
        raise ValueError("network_retries must be non-negative")

    # Relationship validations - only validate if non-default network params are set
    # Default values are: network_concurrency=192, network_timeout_ms=20000, network_retries=0
    has_custom_network_params = self.network_concurrency != 192 or self.network_timeout_ms != 20000 or self.network_retries != 0
    if not self.use_async and has_custom_network_params:
        raise ValueError("Network tuning parameters only apply when use_async=True")

    # Check if fd backend is being used (either explicitly or via search_backend)
    is_using_fd = self.use_fd or self.search_backend == "fd"
    if self.threads is not None and not is_using_fd:
        raise ValueError("'threads' only applies when use_fd=True or search_backend='fd'")

handler: python