| """ |
| src/utils/io_utils.py |
| ===================== |
| CSV append-only helpers, deduplication, and schema validation utilities. |
| All data files grow continuously; this module ensures safe, idempotent appends. |
| """ |
|
|
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

import pandas as pd

from src.utils.logger import get_logger
|
|
| logger = get_logger(__name__) |
|
|
|
|
def append_to_csv(
    df: pd.DataFrame,
    filepath: str | Path,
    dedupe_cols: Optional[List[str]] = None,
) -> int:
    """
    Append rows to a CSV file, creating it if it doesn't exist.

    Deduplicates on ``dedupe_cols`` (keeping the last occurrence) so the
    pipeline is idempotent.  Deduplication is applied even when the file is
    first created, so duplicates *within* ``df`` never reach disk.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to append.  An empty frame is a no-op.
    filepath : str | Path
        Target CSV path; parent directories are created as needed.
    dedupe_cols : list[str], optional
        Column subset used as the dedup key.  Columns missing from the
        combined frame are silently skipped.

    Returns
    -------
    int : number of NEW rows written (never negative)
    """
    filepath = Path(filepath)
    filepath.parent.mkdir(parents=True, exist_ok=True)

    if df.empty:
        logger.warning("append_to_csv: empty DataFrame, nothing written to %s", filepath)
        return 0

    if filepath.exists():
        existing = pd.read_csv(filepath, low_memory=False)
        combined = pd.concat([existing, df], ignore_index=True)
        prior_rows = len(existing)
    else:
        combined = df.copy()
        prior_rows = 0

    # Dedupe on both code paths (the original skipped this for new files,
    # letting intra-batch duplicates slip through on first write).
    if dedupe_cols:
        valid_cols = [c for c in dedupe_cols if c in combined.columns]
        if valid_cols:
            before = len(combined)
            combined = combined.drop_duplicates(subset=valid_cols, keep="last")
            logger.debug("Deduplicated %d → %d rows on %s", before, len(combined), valid_cols)

    # Clamp before logging so the reported count matches the return value
    # (the delta can be negative when the existing file already contained
    # internal duplicates that this pass removed).
    new_rows = max(len(combined) - prior_rows, 0)

    combined.to_csv(filepath, index=False)
    logger.info("Wrote %d new rows to %s (total: %d)", new_rows, filepath.name, len(combined))
    return new_rows
|
|
|
|
| def load_csv_safe(filepath: str | Path, **kwargs) -> pd.DataFrame: |
| """Load a CSV file, returning an empty DataFrame if it doesn't exist.""" |
| filepath = Path(filepath) |
| if not filepath.exists(): |
| logger.warning("File not found: %s — returning empty DataFrame", filepath) |
| return pd.DataFrame() |
| return pd.read_csv(filepath, low_memory=False, **kwargs) |
|
|
|
|
def save_json(data: dict, filepath: str | Path) -> None:
    """Save a dict as JSON, creating parent directories as needed."""
    import json

    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)
    # default=str keeps non-serializable values (dates, Paths, …) from raising.
    with target.open("w", encoding="utf-8") as fh:
        json.dump(data, fh, indent=2, default=str)
    logger.info("Saved JSON → %s", target.name)
|
|
|
|
| def load_json(filepath: str | Path) -> dict: |
| """Load JSON file, returning {} if not found or corrupted.""" |
| import json |
| filepath = Path(filepath) |
| if not filepath.exists(): |
| return {} |
| try: |
| with open(filepath, encoding="utf-8") as f: |
| content = f.read().strip() |
| |
| decoder = json.JSONDecoder() |
| obj, _ = decoder.raw_decode(content) |
| return obj |
| except Exception as e: |
| logger.warning("Could not parse JSON from %s (%s) — returning {}", filepath.name, e) |
| return {} |
|
|
|
|
def timestamped_filename(prefix: str, ext: str = "csv") -> str:
    """Return a filename like 'prefix_20260101_1200.csv' (UTC timestamp).

    Uses a timezone-aware UTC clock: ``datetime.utcnow()`` is deprecated
    since Python 3.12; the formatted output is identical.
    """
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M")
    return f"{prefix}_{ts}.{ext}"
|
|