Cookbook¶

Practical, copy‑paste recipes organized by what you want to accomplish.

I want to search and discover files...¶

Find specific file types quickly¶

Use FdFinder for powerful pattern-based file discovery:

from filoma.directories import FdFinder

finder = FdFinder()

# Find all Python files
python_files = finder.find_files(pattern=r"\.py$")

# Find files by multiple extensions
code_files = finder.find_by_extension(['py', 'rs', 'js'])

# Find files using a glob pattern
config_files = finder.find_files(pattern="*.{json,yaml}", use_glob=True)

print(f"Found {len(python_files)} Python files.")

Profile directories and files¶

Get comprehensive analysis of directory contents:

from filoma import probe_to_df
from filoma.directories import DirectoryProfiler

# Quick overview with DataFrame output
dfw = probe_to_df('.')

# Detailed profiling with custom configuration
from filoma.directories import DirectoryProfilerConfig
config = DirectoryProfilerConfig(fast_path_only=True, build_dataframe=True)
analysis = DirectoryProfiler(config).probe('.')
paths_df = analysis.to_df().df

Skip metadata collection for speed¶

When you only need file paths without size/time information:

from filoma import probe_to_df
from filoma.dataframe import DataFrame

# Fast path discovery without metadata
dfw = probe_to_df('.', enrich=False)

# Add metadata later if needed
base = DataFrame(dfw.df)
with_stats = base.add_file_stats_cols()  # adds size, times, owner, etc.

I want to explore and analyze my data...¶

Find the largest files¶

from filoma import probe_to_df

dfw = probe_to_df('.')
largest = dfw.df.select(['path','size_bytes']).sort('size_bytes', descending=True).head(10)
print(largest)

Analyze file extension distribution¶

from filoma import probe_to_df

dfw = probe_to_df('.')
by_ext = dfw.df.groupby('suffix').count().sort('count', descending=True).head(15)
print(by_ext)

Count files per directory¶

from filoma import probe_to_df

dfw = probe_to_df('.')
# Add parent column and count files per directory
counts = dfw.df.with_columns(
    dfw.df['path'].str.split('/').list.slice(-2,1).alias('parent')
).groupby('parent').count().sort('count', descending=True)
print(counts)

Filter files by criteria¶

from filoma import probe_to_df

dfw = probe_to_df('.')

# Filter by file extension
python_files = dfw.df.filter(dfw.df['path'].str.ends_with('.py'))

# Filter by size (files larger than 5MB)
large_files = dfw.df.filter(dfw.df['size_bytes'] > 5_000_000)

# Filter by modification time (recently modified)
from datetime import datetime, timedelta
cutoff = datetime.utcnow() - timedelta(hours=24)
recent = dfw.df.filter(dfw.df['modified_time'] > cutoff.isoformat())

Analyze directory depth patterns¶

from filoma import probe_to_df

dfw = probe_to_df('.')
depth_stats = dfw.df.groupby('depth').count().sort('depth')
print(depth_stats)

Profile image files¶

from filoma import probe_to_df, probe_image

dfw = probe_to_df('.')
images = dfw.df.filter(dfw.df['suffix'].is_in(['.png','.tif','.npy']))
large_images = images.filter(images['size_bytes'] > 5_000_000)

# Get detailed image information
reports = [probe_image(p) for p in large_images['path'].to_list()]

I want to find and remove duplicates...¶

Simple duplicate detection by size and hash¶

from filoma import probe_to_df, probe_file
import collections

dfw = probe_to_df('.')

# Find potential duplicates by size
size_groups = dfw.df.groupby('size_bytes').count().filter(pl.col('count') > 1)
candidates = dfw.df.filter(dfw.df['size_bytes'].is_in(size_groups['size_bytes'].to_list()))

# Verify with hash comparison
hash_map = collections.defaultdict(list)
for path in candidates['path'].to_list():
    filo = probe_file(path, compute_hash=True)
    hash_map[filo.sha256].append(path)

duplicates = [v for v in hash_map.values() if len(v) > 1]
print(f"Found {len(duplicates)} groups of duplicates")

Compute hashes for specific files¶

from filoma import probe_file

paths = ['README.md', 'pyproject.toml']
rows = []
for p in paths:
    filo = probe_file(p, compute_hash=True)
    rows.append({'path': filo.path, 'sha256': filo.sha256})

print(rows)

I want to export and integrate with other tools...¶

Export data for downstream processing¶

from filoma import probe_to_df

dfw = probe_to_df('.')

# Save as different formats
dfw.save_parquet('files.parquet')
dfw.save_csv('files.csv')

# Convert to pandas for other libraries
pandas_df = dfw.to_pandas()

# Convert to raw polars for advanced operations
polars_df = dfw.df

Work with the raw polars DataFrame¶

from filoma import probe_to_df
import polars as pl

dfw = probe_to_df('.')

# Access the underlying polars DataFrame for advanced operations
raw_df = dfw.df

# Complex polars operations
result = raw_df.lazy().filter(
    (pl.col('size_bytes') > 1000000) &
    (pl.col('path').str.contains(r'\.py$'))
).group_by('depth').agg([
    pl.col('size_bytes').sum().alias('total_size'),
    pl.col('path').count().alias('file_count')
]).collect()

Missing a recipe? Open an issue to request it!