Skip to content

Batch Processing Examples

Process multiple GeoParquet files efficiently.

Sequential Processing

from pathlib import Path
from geoparquet_io.core.add_bbox_column import add_bbox_column

def process_directory(input_dir, output_dir):
    """Run ``add_bbox_column`` over every ``*.parquet`` file in a directory.

    Only files directly inside ``input_dir`` are matched (the glob is not
    recursive). Each result is written to ``output_dir`` under the same file
    name, and a confirmation line is printed per file.

    Args:
        input_dir: Directory containing the source parquet files.
        output_dir: Directory for the results; created if it does not exist.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    for source in Path(input_dir).glob("*.parquet"):
        add_bbox_column(
            input_parquet=str(source),
            output_parquet=str(Path(output_dir, source.name)),
            bbox_name="bbox",
            verbose=False,
            compression="ZSTD",
            compression_level=15,
        )

        print(f"✓ Processed {source.name}")

# Usage: read every .parquet file in input/ and write the results to output/
process_directory(input_dir="input/", output_dir="output/")

Parallel Processing

For processing many files on multi-core machines:

from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from geoparquet_io.core.hilbert_order import hilbert_order

def process_file(args):
    """Run ``hilbert_order`` on one file and report the outcome as a tuple.

    The work item is a single tuple so the function can be handed directly to
    ``executor.submit`` / ``map`` style APIs.

    Args:
        args: ``(input_file, output_dir)`` where ``input_file`` is a ``Path``
            and ``output_dir`` is the directory the result is written to.

    Returns:
        ``(True, filename, None)`` on success, or
        ``(False, filename, error_message)`` if the call raised.
    """
    source, destination_dir = args
    target = Path(destination_dir) / source.name

    try:
        hilbert_order(
            input_parquet=str(source),
            output_parquet=str(target),
            geometry_column="geometry",
            add_bbox=True,
            verbose=False,
        )
    except Exception as exc:
        # Failures are returned rather than raised so one bad file does not
        # take down the caller's batch loop.
        return False, source.name, str(exc)
    else:
        return True, source.name, None

def parallel_process(input_dir, output_dir):
    """Process every ``*.parquet`` file in a directory using a process pool.

    Each file directly inside ``input_dir`` is submitted to ``process_file``
    in a worker process, and one status line is printed per file as results
    complete (completion order, not submission order).

    Args:
        input_dir: Directory containing the source parquet files.
        output_dir: Directory for the results; created if it does not exist.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    files = list(Path(input_dir).glob("*.parquet"))
    if not files:
        # Nothing to do. Returning here also avoids starting a pool for no
        # work (and ProcessPoolExecutor rejects max_workers=0).
        return

    # Never start more workers than there are files to process.
    max_workers = min(os.cpu_count() or 4, len(files))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its input path so a failure can always be
        # attributed to a file, even when no result tuple comes back.
        futures = {
            executor.submit(process_file, (input_file, output_dir)): input_file
            for input_file in files
        }

        for future in as_completed(futures):
            try:
                success, filename, error = future.result()
            except Exception as exc:
                # process_file reports its own errors in the returned tuple,
                # so an exception here means the worker itself failed (for
                # example the process died). Report it and keep going instead
                # of aborting the remaining files.
                success = False
                filename = futures[future].name
                error = f"{type(exc).__name__}: {exc}"

            if success:
                print(f"✓ {filename}")
            else:
                print(f"❌ {filename}: {error}")

# Usage
# The entry-point guard is required for process pools: under the "spawn" start
# method (the default on Windows and macOS) each worker re-imports this module,
# and an unguarded call would try to start another pool inside every worker.
if __name__ == "__main__":
    parallel_process("input/", "output/")

Progress Tracking

Add progress bars with tqdm:

from pathlib import Path
from tqdm import tqdm
from geoparquet_io.core.add_bbox_column import add_bbox_column

def process_with_progress(input_dir, output_dir):
    """Run ``add_bbox_column`` over a directory while showing a progress bar.

    Only files directly inside ``input_dir`` matching ``*.parquet`` are
    processed. Each result is written to ``output_dir`` under the same name.

    Args:
        input_dir: Directory containing the source parquet files.
        output_dir: Directory for the results; created if it does not exist.
    """
    import os

    os.makedirs(output_dir, exist_ok=True)

    # Collect the matches into a list before handing them to tqdm.
    sources = list(Path(input_dir).glob("*.parquet"))
    progress = tqdm(sources, desc="Processing files")

    for source in progress:
        add_bbox_column(
            input_parquet=str(source),
            output_parquet=str(Path(output_dir, source.name)),
            verbose=False,
        )

# Usage: process input/ into output/ with a progress bar on the terminal
process_with_progress(input_dir="input/", output_dir="output/")

Complete Example Script

See the examples/batch_processing.py file in the repository for a complete working example with error handling and file size reporting.

See Also