"""Shared configuration and helpers for EDA pipeline.""" import json from datetime import datetime from pathlib import Path # ------------------------------------------------------------------- # Paths and versioning # ------------------------------------------------------------------- # Project root (repo root) = parent of src/ PROJECT_ROOT = Path(__file__).resolve().parents[1] DATA_DIR = PROJECT_ROOT / "Data" CASE_FILE_PARQUET = DATA_DIR / "cases.parquet" HEARING_FILE_PARQUET = DATA_DIR / "hearings.parquet" # Default paths (used when EDA is run standalone) REPORTS_DIR = PROJECT_ROOT / "reports" FIGURES_DIR = REPORTS_DIR / "figures" VERSION = "v1.0.0" RUN_TS = datetime.now().strftime("%Y%m%d_%H%M%S") # These will be set by set_output_paths() when running from pipeline RUN_DIR = None PARAMS_DIR = None CASES_CLEAN_PARQUET = None HEARINGS_CLEAN_PARQUET = None def set_output_paths(eda_dir: Path, data_dir: Path, params_dir: Path): """Configure output paths from OutputManager. Call this from pipeline before running EDA modules. When not called, falls back to legacy reports/figures/ structure. """ global RUN_DIR, PARAMS_DIR, CASES_CLEAN_PARQUET, HEARINGS_CLEAN_PARQUET RUN_DIR = eda_dir PARAMS_DIR = params_dir CASES_CLEAN_PARQUET = data_dir / "cases_clean.parquet" HEARINGS_CLEAN_PARQUET = data_dir / "hearings_clean.parquet" # Ensure directories exist RUN_DIR.mkdir(parents=True, exist_ok=True) PARAMS_DIR.mkdir(parents=True, exist_ok=True) def _get_run_dir() -> Path: """Get RUN_DIR, creating default if not set.""" global RUN_DIR if RUN_DIR is None: # Standalone mode: use legacy versioned directory FIGURES_DIR.mkdir(parents=True, exist_ok=True) RUN_DIR = FIGURES_DIR / f"{VERSION}_{RUN_TS}" RUN_DIR.mkdir(parents=True, exist_ok=True) return RUN_DIR def _get_params_dir() -> Path: """Get PARAMS_DIR, creating default if not set.""" global PARAMS_DIR if PARAMS_DIR is None: run_dir = _get_run_dir() PARAMS_DIR = run_dir / "params" PARAMS_DIR.mkdir(parents=True, exist_ok=True) return PARAMS_DIR def _get_cases_parquet() -> Path: """Get CASES_CLEAN_PARQUET path.""" global CASES_CLEAN_PARQUET if CASES_CLEAN_PARQUET is None: CASES_CLEAN_PARQUET = _get_run_dir() / "cases_clean.parquet" return CASES_CLEAN_PARQUET def _get_hearings_parquet() -> Path: """Get HEARINGS_CLEAN_PARQUET path.""" global HEARINGS_CLEAN_PARQUET if HEARINGS_CLEAN_PARQUET is None: HEARINGS_CLEAN_PARQUET = _get_run_dir() / "hearings_clean.parquet" return HEARINGS_CLEAN_PARQUET # ------------------------------------------------------------------- # Null tokens and canonicalisation # ------------------------------------------------------------------- NULL_TOKENS = ["", "NULL", "Null", "null", "NA", "N/A", "na", "NaN", "nan", "-", "--"] def write_metadata(meta: dict) -> None: """Write run metadata into RUN_DIR/metadata.json.""" run_dir = _get_run_dir() meta_path = run_dir / "metadata.json" try: with open(meta_path, "w", encoding="utf-8") as f: json.dump(meta, f, indent=2, default=str) except Exception as e: print(f"[WARN] Metadata export error: {e}") def safe_write_figure(fig, filename: str) -> None: """Write plotly figure to EDA figures directory. Args: fig: Plotly figure object filename: HTML filename (e.g., "1_case_type_distribution.html") Uses CDN for Plotly.js instead of embedding to reduce file size from ~3MB to ~50KB per file. """ run_dir = _get_run_dir() output_path = run_dir / filename try: fig.write_html( str(output_path), include_plotlyjs="cdn", # Use CDN instead of embedding full library config={"displayModeBar": True, "displaylogo": False}, # Cleaner UI ) except Exception as e: raise RuntimeError(f"Failed to write {filename} to {output_path}: {e}")