File size: 4,039 Bytes
eadbc29
 
 
 
 
 
 
 
 
 
 
 
 
d3a967e
 
eadbc29
 
 
 
 
1425c98
eadbc29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1425c98
eadbc29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1425c98
eadbc29
 
 
 
 
 
 
1425c98
 
eadbc29
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
"""Shared configuration and helpers for EDA pipeline."""

import json
from datetime import datetime
from pathlib import Path

# -------------------------------------------------------------------
# Filesystem layout and run identity
# -------------------------------------------------------------------
# Repo root: two levels up from this file, i.e. the parent of src/.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Raw input tables.
DATA_DIR = PROJECT_ROOT.joinpath("Data")
CASE_FILE_PARQUET = DATA_DIR.joinpath("cases.parquet")
HEARING_FILE_PARQUET = DATA_DIR.joinpath("hearings.parquet")

# Legacy output locations, used when the EDA is run standalone.
REPORTS_DIR = PROJECT_ROOT.joinpath("reports")
FIGURES_DIR = REPORTS_DIR.joinpath("figures")

# Version tag and import-time timestamp; together they name the
# standalone run directory.
VERSION = "v1.0.0"
RUN_TS = f"{datetime.now():%Y%m%d_%H%M%S}"

# Run-specific outputs. They stay None until set_output_paths() is called
# by the pipeline, or until the lazy _get_*() accessors fill in defaults.
RUN_DIR = None
PARAMS_DIR = None
CASES_CLEAN_PARQUET = None
HEARINGS_CLEAN_PARQUET = None


def set_output_paths(
    eda_dir: "Path | str",
    data_dir: "Path | str",
    params_dir: "Path | str",
) -> None:
    """Configure output paths from OutputManager.

    Call this from pipeline before running EDA modules.
    When not called, falls back to legacy reports/figures/ structure.

    Args:
        eda_dir: Directory for EDA outputs; becomes RUN_DIR and is created.
        data_dir: Directory the cleaned parquet paths point into. It is NOT
            created here; only the two file paths are derived from it.
        params_dir: Directory for parameter files; becomes PARAMS_DIR and
            is created.

    Each argument may be a ``pathlib.Path`` or anything ``Path()`` accepts
    (for example a plain string).
    """
    global RUN_DIR, PARAMS_DIR, CASES_CLEAN_PARQUET, HEARINGS_CLEAN_PARQUET

    # Coerce first: with a plain string, `data_dir / "..."` and `.mkdir()`
    # below would raise, and the globals must always hold Path objects.
    eda_dir = Path(eda_dir)
    data_dir = Path(data_dir)
    params_dir = Path(params_dir)

    RUN_DIR = eda_dir
    PARAMS_DIR = params_dir
    CASES_CLEAN_PARQUET = data_dir / "cases_clean.parquet"
    HEARINGS_CLEAN_PARQUET = data_dir / "hearings_clean.parquet"

    # Ensure the directories this module writes into exist.
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    PARAMS_DIR.mkdir(parents=True, exist_ok=True)


def _get_run_dir() -> Path:
    """Return the run directory, lazily creating the standalone default.

    If RUN_DIR has already been set, it is returned untouched. Otherwise
    RUN_DIR becomes FIGURES_DIR/<VERSION>_<RUN_TS>, that directory is
    created, and the new value is cached in the module global.
    """
    global RUN_DIR
    if RUN_DIR is not None:
        return RUN_DIR

    # Standalone mode: fall back to the legacy versioned directory.
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)
    RUN_DIR = FIGURES_DIR / f"{VERSION}_{RUN_TS}"
    RUN_DIR.mkdir(parents=True, exist_ok=True)
    return RUN_DIR


def _get_params_dir() -> Path:
    """Return the params directory, lazily creating the default.

    If PARAMS_DIR is unset, it becomes a "params" folder inside the run
    directory; that folder is created and cached in the module global.
    """
    global PARAMS_DIR
    if PARAMS_DIR is not None:
        return PARAMS_DIR

    PARAMS_DIR = _get_run_dir() / "params"
    PARAMS_DIR.mkdir(parents=True, exist_ok=True)
    return PARAMS_DIR


def _get_cases_parquet() -> Path:
    """Return the cleaned-cases parquet path.

    If CASES_CLEAN_PARQUET is unset, it defaults to "cases_clean.parquet"
    inside the run directory and is cached in the module global.
    """
    global CASES_CLEAN_PARQUET
    if CASES_CLEAN_PARQUET is not None:
        return CASES_CLEAN_PARQUET

    run_dir = _get_run_dir()
    CASES_CLEAN_PARQUET = run_dir / "cases_clean.parquet"
    return CASES_CLEAN_PARQUET


def _get_hearings_parquet() -> Path:
    """Return the cleaned-hearings parquet path.

    If HEARINGS_CLEAN_PARQUET is unset, it defaults to
    "hearings_clean.parquet" inside the run directory and is cached in the
    module global.
    """
    global HEARINGS_CLEAN_PARQUET
    if HEARINGS_CLEAN_PARQUET is not None:
        return HEARINGS_CLEAN_PARQUET

    run_dir = _get_run_dir()
    HEARINGS_CLEAN_PARQUET = run_dir / "hearings_clean.parquet"
    return HEARINGS_CLEAN_PARQUET


# -------------------------------------------------------------------
# Null tokens and canonicalisation
# -------------------------------------------------------------------
# String spellings listed here as null tokens; order is kept as-is.
NULL_TOKENS = [
    "",
    "NULL",
    "Null",
    "null",
    "NA",
    "N/A",
    "na",
    "NaN",
    "nan",
    "-",
    "--",
]


def write_metadata(meta: dict) -> None:
    """Dump run metadata as JSON to metadata.json in the run directory.

    Values the JSON encoder cannot handle natively are converted with
    ``str``. The write is best-effort: any failure while opening or
    writing the file is printed as a warning instead of being raised.
    """
    target = _get_run_dir() / "metadata.json"
    try:
        with target.open("w", encoding="utf-8") as handle:
            json.dump(meta, handle, indent=2, default=str)
    except Exception as exc:
        # Metadata is auxiliary; never abort the run because of it.
        print(f"[WARN] Metadata export error: {exc}")


def safe_write_figure(fig, filename: str) -> None:
    """Write plotly figure to EDA figures directory.

    The file is written as HTML into the current run directory (see
    ``_get_run_dir``).

    Args:
        fig: Plotly figure object (anything exposing ``write_html``).
        filename: HTML filename (e.g., "1_case_type_distribution.html")

    Raises:
        RuntimeError: If writing the HTML fails. The original exception is
            attached as ``__cause__`` so the root failure stays visible in
            the traceback.

    Uses CDN for Plotly.js instead of embedding to reduce file size from ~3MB to ~50KB per file.
    """
    run_dir = _get_run_dir()
    output_path = run_dir / filename
    try:
        fig.write_html(
            str(output_path),
            include_plotlyjs="cdn",  # Use CDN instead of embedding full library
            config={"displayModeBar": True, "displaylogo": False},  # Cleaner UI
        )
    except Exception as e:
        # Chain explicitly so callers see the underlying error as the cause.
        raise RuntimeError(f"Failed to write {filename} to {output_path}: {e}") from e