Spaces:

RoyAalekh
/

hackathon_code4change

Sleeping

App Files Files Community

RoyAalekh commited on Nov 19, 2025

Commit

4ffade4

1 Parent(s): bb59487

WIP

Browse files

Files changed (6) hide show

.gitignore +3 -0
main.py +16 -993
src/eda_config.py +56 -0
src/eda_exploration.py +509 -0
src/eda_load_clean.py +236 -0
src/eda_parameters.py +400 -0

.gitignore CHANGED Viewed

@@ -13,3 +13,6 @@ uv.lock
 *.idea
 .vscode/
 __pylintrc__

 *.idea
 .vscode/
 __pylintrc__
+.pdf
+.html
+.docx

main.py CHANGED Viewed

@@ -1,1000 +1,23 @@
-"""Exploratory Data Analysis for Karnataka High Court Scheduling Dataset.
-- Uses Polars for fast data handling
-- Uses Plotly for interactive visualizations
 """
-# ======================================================
-# 1. Imports and Setup
-# ======================================================
-import json
-import shutil
-from datetime import datetime, timedelta
-from pathlib import Path
-import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
-import plotly.io as pio
-import polars as pl
-pio.renderers.default = "browser"  # open plots in browser
-# ======================================================
-# 2. Paths
-# ======================================================
-DATA_DIR = Path("Data")
-CASES_FILE = DATA_DIR / "ISDMHack_Cases_WPfinal.csv"
-HEAR_FILE = DATA_DIR / "ISDMHack_Hear.csv"
-OUT_DIR = Path("reports/figures")
-OUT_DIR.mkdir(parents=True, exist_ok=True)
-# Versioning for iterative runs
-VERSION = "v0.3.0"
-RUN_TS = datetime.now().strftime("%Y%m%d_%H%M%S")
-OUT_DIR_VER = OUT_DIR / f"{VERSION}_{RUN_TS}"
-OUT_DIR_VER.mkdir(parents=True, exist_ok=True)
-def _copy_to_versioned(filename: str):
-    src = OUT_DIR / filename
-    dst = OUT_DIR_VER / filename
-    try:
-        if src.exists():
-            shutil.copyfile(src, dst)
-    except Exception as e:
-        print(f"Versioned copy failed for {filename}: {e}")
-# ======================================================
-# 3. Load Data
-# ======================================================
-# Improve null parsing and schema inference so textual placeholders like "NA" become proper nulls
-NULL_TOKENS = ["", "NULL", "Null", "null", "NA", "N/A", "na", "NaN", "nan", "-", "--"]
-cases = pl.read_csv(
-    CASES_FILE,
-    try_parse_dates=True,
-    null_values=NULL_TOKENS,
-    infer_schema_length=100_000,
-)
-hearings = pl.read_csv(
-    HEAR_FILE,
-    try_parse_dates=True,
-    null_values=NULL_TOKENS,
-    infer_schema_length=100_000,
-)
-print(f"Cases shape: {cases.shape}")
-print(f"Hearings shape: {hearings.shape}")
-# ======================================================
-# 4. Basic Cleaning
-# ======================================================
-for col in ["DATE_FILED", "DECISION_DATE", "REGISTRATION_DATE"]:
-    if col in cases.columns and cases[col].dtype == pl.Utf8:
-        cases = cases.with_columns(pl.col(col).str.strptime(pl.Date, "%d-%m-%Y", strict=False))
-cases = cases.unique(subset=["CNR_NUMBER"])
-hearings = hearings.unique(subset=["Hearing_ID"])
-# Canonicalize key categorical/text fields and coerce common placeholder strings to nulls
-def _norm_text_col(df: pl.DataFrame, col: str) -> pl.DataFrame:
-    if col not in df.columns:
-        return df
-    return df.with_columns(
-        pl.when(
-            pl.col(col)
-            .cast(pl.Utf8)
-            .str.strip_chars()
-            .str.to_uppercase()
-            .is_in(["", "NA", "N/A", "NULL", "NONE", "-", "--"])
-        )
-        .then(pl.lit(None))
-        .otherwise(pl.col(col).cast(pl.Utf8).str.strip_chars().str.to_uppercase())
-        .alias(col)
-    )
-# Normalize CASE_TYPE early
-cases = _norm_text_col(cases, "CASE_TYPE")
-# Normalize stage and purpose/judge text on hearings
-for c in [
-    "Remappedstages",
-    "PurposeofHearing",
-    "NJDG_JUDGE_NAME",
-    "BeforeHonourableJudge",
-]:
-    hearings = _norm_text_col(hearings, c)
-# Fix frequent stage aliases/typos into canonical labels
-if "Remappedstages" in hearings.columns:
-    STAGE_MAP = {
-        "ORDERS/JUDGMENTS": "ORDERS / JUDGMENT",
-        "ORDER/JUDGMENT": "ORDERS / JUDGMENT",
-        "ORDERS  /  JUDGMENT": "ORDERS / JUDGMENT",
-        "ORDERS /JUDGMENT": "ORDERS / JUDGMENT",
-        "ORDERS/JUDGMENT": "ORDERS / JUDGMENT",
-        "INTERLOCUTARY APPLICATION": "INTERLOCUTORY APPLICATION",
-        "FRAMING OF CHARGE": "FRAMING OF CHARGES",
-        "PRE ADMISSION": "PRE-ADMISSION",
-    }
-    hearings = hearings.with_columns(
-        pl.col("Remappedstages")
-        .map_elements(lambda x: STAGE_MAP.get(x, x) if x is not None else None)
-        .alias("Remappedstages")
-    )
-# ======================================================
-# 5. Derived Features
-# ======================================================
-# --- Disposal duration (use provided DISPOSALTIME_ADJ) ---
-# The dataset already contains the adjusted disposal time; normalize dtype only.
-if "DISPOSALTIME_ADJ" in cases.columns:
-    cases = cases.with_columns(pl.col("DISPOSALTIME_ADJ").cast(pl.Int32))
-# --- Filing / Decision Years ---
-cases = cases.with_columns(
-    [
-        pl.col("DATE_FILED").dt.year().alias("YEAR_FILED"),
-        pl.col("DECISION_DATE").dt.year().alias("YEAR_DECISION"),
-    ]
-)
-# --- Hearing count per case ---
-hearing_freq = hearings.group_by("CNR_NUMBER").agg(pl.count("BusinessOnDate").alias("N_HEARINGS"))
-cases = cases.join(hearing_freq, on="CNR_NUMBER", how="left")
-# --- For each CNR case, we have multiple hearings, so we need to calculate the average hearing gap.
-# We have BusinessOnDate column, which represents the date of each hearing for that case. So for each case,
-# we can calculate the difference between consecutive hearings and then take the mean of these differences to get the average hearing gap.
-hearings = (
-    hearings.filter(pl.col("BusinessOnDate").is_not_null())  # remove unusable rows
-    .sort(["CNR_NUMBER", "BusinessOnDate"])  # chronological within case
-    .with_columns(
-        ((pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1)) / timedelta(days=1))
-        .over("CNR_NUMBER")
-        .alias("HEARING_GAP_DAYS")
-    )
-)
-gap_summary = hearings.group_by("CNR_NUMBER").agg(pl.mean("HEARING_GAP_DAYS").alias("AVG_GAP"))
-cases = cases.join(gap_summary, on="CNR_NUMBER", how="left")
-# replace null in N_HEARINGS and AVG_GAP columns with 0
-cases = cases.with_columns(
-    pl.col("N_HEARINGS").fill_null(0).cast(pl.Int64),
-    pl.col("AVG_GAP").fill_null(0.0).fill_nan(0.0).cast(pl.Float64),
-)
-print("\n=== Feature Summary ===")
-print(cases.select(["CASE_TYPE", "DISPOSALTIME_ADJ", "N_HEARINGS", "AVG_GAP"]).describe())
-cases_pd = cases.to_pandas()
-hearings_pd = hearings.to_pandas()
-# ======================================================
-# 6. Interactive Visualizations
-# ======================================================
-# 1. Case Type Distribution
-fig1 = px.bar(
-    cases_pd,
-    x="CASE_TYPE",
-    color="CASE_TYPE",
-    title="Case Type Distribution",
-)
-fig1.update_layout(showlegend=False, xaxis_title="Case Type", yaxis_title="Number of Cases")
-fig1.write_html(OUT_DIR / "1_case_type_distribution.html")
-_copy_to_versioned("1_case_type_distribution.html")
-# fig1.show()
-# 2. Filing Trends by Year
-year_counts = cases_pd.groupby("YEAR_FILED")["CNR_NUMBER"].count().reset_index(name="Count")
-fig2 = px.line(year_counts, x="YEAR_FILED", y="Count", markers=True, title="Cases Filed by Year")
-fig2.update_traces(line_color="royalblue")
-fig2.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
-fig2.write_html(OUT_DIR / "2_cases_filed_by_year.html")
-_copy_to_versioned("2_cases_filed_by_year.html")
-# fig2.show()
-# 3. Disposal Duration Distribution
-fig3 = px.histogram(
-    cases_pd,
-    x="DISPOSALTIME_ADJ",
-    nbins=50,
-    title="Distribution of Disposal Time (Adjusted Days)",
-    color_discrete_sequence=["indianred"],
-)
-fig3.update_layout(xaxis_title="Days", yaxis_title="Cases")
-fig3.write_html(OUT_DIR / "3_disposal_time_distribution.html")
-_copy_to_versioned("3_disposal_time_distribution.html")
-# fig3.show()
-# 4. Hearings vs Disposal Time
-fig4 = px.scatter(
-    cases_pd,
-    x="N_HEARINGS",
-    y="DISPOSALTIME_ADJ",
-    color="CASE_TYPE",
-    hover_data=["CNR_NUMBER", "YEAR_FILED"],
-    title="Hearings vs Disposal Duration",
-)
-fig4.update_traces(marker=dict(size=6, opacity=0.7))
-fig4.write_html(OUT_DIR / "4_hearings_vs_disposal.html")
-_copy_to_versioned("4_hearings_vs_disposal.html")
-# fig4.show()
-# 5. Boxplot by Case Type
-fig5 = px.box(
-    cases_pd,
-    x="CASE_TYPE",
-    y="DISPOSALTIME_ADJ",
-    color="CASE_TYPE",
-    title="Disposal Time (Adjusted) by Case Type",
-)
-fig5.update_layout(showlegend=False)
-fig5.write_html(OUT_DIR / "5_box_disposal_by_type.html")
-_copy_to_versioned("5_box_disposal_by_type.html")
-# fig5.show()
-# 7. Judge Workload
-if "H" in cases_pd.columns:
-    judge_load = (
-        cases_pd.groupby("BeforeHonourableJudge")["CNR_NUMBER"]
-        .count()
-        .reset_index(name="Count")
-        .sort_values("Count", ascending=False)
-        .head(20)
-    )
-    fig7 = px.bar(
-        judge_load,
-        x="BeforeHonourableJudge",
-        y="Count",
-        title="Top 20 Judges by Number of Cases Disposed",
-    )
-    fig7.update_layout(xaxis_title="Judge", yaxis_title="Cases")
-    fig7.write_html(OUT_DIR / "7_judge_workload.html")
-    _copy_to_versioned("7_judge_workload.html")
-    fig7.show()
-# 8. Stage Frequency
-if "Remappedstages" in hearings_pd.columns:
-    stage_counts = hearings_pd["Remappedstages"].value_counts().reset_index()
-    stage_counts.columns = ["Stage", "Count"]
-    fig8 = px.bar(
-        stage_counts,
-        x="Stage",
-        y="Count",
-        color="Stage",
-        title="Frequency of Hearing Stages",
-    )
-    fig8.update_layout(showlegend=False, xaxis_title="Stage", yaxis_title="Count")
-    fig8.write_html(OUT_DIR / "8_stage_frequency.html")
-    _copy_to_versioned("8_stage_frequency.html")
-    fig8.show()
-print("\nAll interactive plots saved to:", OUT_DIR.resolve())
-# ======================================================
-# 7. Extended EDA: Data Audit, Linkage, Gaps, Stages, Cohorts, Seasonality, Purpose, Workload,
-# Bottlenecks
-# ======================================================
-# 7.1 Data Audit & Schema Checks
-print("\n=== Column dtypes (cases) ===")
-print(cases.dtypes)
-print("\n=== Column dtypes (hearings) ===")
-print(hearings.dtypes)
-def null_summary(df: pl.DataFrame, name: str):
-    ns = df.select(
-        [
-            pl.lit(name).alias("TABLE"),
-            pl.len().alias("ROWS"),
-        ]
-        + [pl.col(c).is_null().sum().alias(f"{c}__nulls") for c in df.columns]
-    )
-    print(f"\n=== Null summary ({name}) ===")
-    print(ns)
-null_summary(cases, "cases")
-null_summary(hearings, "hearings")
-# Duplicate keys
-print("\n=== Duplicates check ===")
-try:
-    print(
-        "Cases dup CNR_NUMBER: unique vs total ->",
-        cases.select(
-            pl.col("CNR_NUMBER").n_unique().alias("unique"), pl.len().alias("total")
-        ).to_dict(as_series=False),
-    )
-except Exception as e:
-    print("Cases duplicate check error:", e)
-try:
-    print(
-        "Hearings dup Hearing_ID: unique vs total ->",
-        hearings.select(
-            pl.col("Hearing_ID").n_unique().alias("unique"), pl.len().alias("total")
-        ).to_dict(as_series=False),
-    )
-except Exception as e:
-    print("Hearings duplicate check error:", e)
-# Key integrity: every hearing must map to a case
-if "CNR_NUMBER" in hearings.columns:
-    missed = hearings.join(cases.select("CNR_NUMBER"), on="CNR_NUMBER", how="anti")
-    print("Unmapped hearings -> cases:", missed.height)
-# 7.2 Consistency & Timeline Checks
-neg_disp = (
-    cases.filter(pl.col("DISPOSALTIME_ADJ") < 1)
-    if "DISPOSALTIME_ADJ" in cases.columns
-    else pl.DataFrame()
-)
-print(
-    "Negative/zero disposal adjusted days rows:",
-    neg_disp.height if isinstance(neg_disp, pl.DataFrame) else 0,
-)
-if (
-    set(["DATE_FILED", "DECISION_DATE"]).issubset(cases.columns)
-    and "BusinessOnDate" in hearings.columns
-):
-    h2 = hearings.join(
-        cases.select(["CNR_NUMBER", "DATE_FILED", "DECISION_DATE"]),
-        on="CNR_NUMBER",
-        how="left",
-    )
-    # Categorize anomalies for better diagnosis
-    before_filed = h2.filter(
-        pl.col("BusinessOnDate").is_not_null()
-        & pl.col("DATE_FILED").is_not_null()
-        & (pl.col("BusinessOnDate") < pl.col("DATE_FILED"))
-    )
-    after_decision = h2.filter(
-        pl.col("BusinessOnDate").is_not_null()
-        & pl.col("DECISION_DATE").is_not_null()
-        & (pl.col("BusinessOnDate") > pl.col("DECISION_DATE"))
-    )
-    missing_bounds = h2.filter(
-        pl.col("BusinessOnDate").is_not_null()
-        & (pl.col("DATE_FILED").is_null() | pl.col("DECISION_DATE").is_null())
-    )
-    print(
-        "Hearings outside case lifecycle:",
-        before_filed.height + after_decision.height,
-        "(before_filed=",
-        before_filed.height,
-        ", after_decision=",
-        after_decision.height,
-        ", missing_bounds=",
-        missing_bounds.height,
-        ")",
-    )
-# 7.3 Rich Hearing Gap Statistics
-if "BusinessOnDate" in hearings.columns and "CNR_NUMBER" in hearings.columns:
-    hearing_gaps = (
-        hearings.filter(pl.col("BusinessOnDate").is_not_null())
-        .sort(["CNR_NUMBER", "BusinessOnDate"])
-        .with_columns(
-            ((pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1)) / timedelta(days=1))
-            .over("CNR_NUMBER")
-            .alias("HEARING_GAP_DAYS")
-        )
-    )
-    gap_stats = hearing_gaps.group_by("CNR_NUMBER").agg(
-        [
-            pl.col("HEARING_GAP_DAYS").mean().alias("GAP_MEAN"),
-            pl.col("HEARING_GAP_DAYS").median().alias("GAP_MEDIAN"),
-            pl.col("HEARING_GAP_DAYS").quantile(0.25).alias("GAP_P25"),
-            pl.col("HEARING_GAP_DAYS").quantile(0.75).alias("GAP_P75"),
-            pl.col("HEARING_GAP_DAYS").std(ddof=1).alias("GAP_STD"),
-            pl.col("HEARING_GAP_DAYS").count().alias("N_GAPS"),
-        ]
-    )
-    cases = cases.join(gap_stats, on="CNR_NUMBER", how="left")
-    # Plot: Median hearing gap by case type
-    try:
-        fig_gap = px.box(
-            cases.to_pandas(),
-            x="CASE_TYPE",
-            y="GAP_MEDIAN",
-            points=False,
-            title="Median Hearing Gap by Case Type",
-        )
-        fig_gap.write_html(OUT_DIR / "9_gap_median_by_type.html")
-        _copy_to_versioned("9_gap_median_by_type.html")
-    except Exception as e:
-        print("Gap median plot error:", e)
-# 7.4 Stage Transitions & Durations
-stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None
-transitions = None
-stage_duration = None
-if stage_col:
-    # Define a canonical stage order to enforce left-to-right Sankey
-    STAGE_ORDER = [
-        "PRE-ADMISSION",
-        "ADMISSION",
-        "FRAMING OF CHARGES",
-        "EVIDENCE",
-        "ARGUMENTS",
-        "INTERLOCUTORY APPLICATION",
-        "SETTLEMENT",
-        "ORDERS / JUDGMENT",
-        "FINAL DISPOSAL",
-        "OTHER",
-        "NA",
-    ]
-    h_stage = (
-        hearings.filter(pl.col("BusinessOnDate").is_not_null())
-        .sort(["CNR_NUMBER", "BusinessOnDate"])
-        .with_columns(
-            [
-                pl.col(stage_col)
-                .fill_null("NA")
-                .map_elements(
-                    lambda s: s if s in STAGE_ORDER else ("OTHER" if s is not None else "NA")
-                )
-                .alias("STAGE"),
-                pl.col("BusinessOnDate").alias("DT"),
-            ]
-        )
-        .with_columns(
-            [
-                (pl.col("STAGE") != pl.col("STAGE").shift(1))
-                .over("CNR_NUMBER")
-                .alias("STAGE_CHANGE"),
-            ]
-        )
-    )
-    # All transitions from row i to i+1 within case
-    transitions_raw = (
-        h_stage.with_columns(
-            [
-                pl.col("STAGE").alias("STAGE_FROM"),
-                pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"),
-            ]
-        )
-        .filter(pl.col("STAGE_TO").is_not_null())
-        .group_by(["STAGE_FROM", "STAGE_TO"])
-        .agg(pl.len().alias("N"))
-        .sort("N", descending=True)
-    )
-    # Filter to non-regressive or same-stage transitions based on STAGE_ORDER index
-    order_idx = {s: i for i, s in enumerate(STAGE_ORDER)}
-    transitions = transitions_raw.filter(
-        pl.col("STAGE_FROM").map_elements(lambda s: order_idx.get(s, 10))
-        <= pl.col("STAGE_TO").map_elements(lambda s: order_idx.get(s, 10))
-    )
-    print("\nTop stage transitions (filtered, head):\n", transitions.head(20))
-    # Run-lengths by stage to estimate time-in-stage
-    runs = (
-        h_stage.with_columns(
-            [
-                pl.when(pl.col("STAGE_CHANGE"))
-                .then(1)
-                .otherwise(0)
-                .cum_sum()
-                .over("CNR_NUMBER")
-                .alias("RUN_ID")
-            ]
-        )
-        .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"])
-        .agg(
-            [
-                pl.col("DT").min().alias("RUN_START"),
-                pl.col("DT").max().alias("RUN_END"),
-                pl.len().alias("HEARINGS_IN_RUN"),
-            ]
-        )
-        .with_columns(
-            ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias("RUN_DAYS")
-        )
-    )
-    stage_duration = (
-        runs.group_by("STAGE")
-        .agg(
-            [
-                pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"),
-                pl.col("RUN_DAYS").mean().alias("RUN_MEAN_DAYS"),
-                pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"),
-                pl.len().alias("N_RUNS"),
-            ]
-        )
-        .sort("RUN_MEDIAN_DAYS", descending=True)
-    )
-    print("\nStage duration summary:\n", stage_duration)
-    # Sankey with ordered nodes following STAGE_ORDER
-    try:
-        tr_df = transitions.to_pandas()
-        labels = [
-            s for s in STAGE_ORDER if s in set(tr_df["STAGE_FROM"]).union(set(tr_df["STAGE_TO"]))
-        ]
-        idx = {l: i for i, l in enumerate(labels)}
-        tr_df = tr_df[tr_df["STAGE_FROM"].isin(labels) & tr_df["STAGE_TO"].isin(labels)].copy()
-        tr_df = tr_df.sort_values(by=["STAGE_FROM", "STAGE_TO"], key=lambda c: c.map(idx))
-        sankey = go.Figure(
-            data=[
-                go.Sankey(
-                    arrangement="snap",
-                    node=dict(label=labels, pad=15, thickness=18),
-                    link=dict(
-                        source=tr_df["STAGE_FROM"].map(idx).tolist(),
-                        target=tr_df["STAGE_TO"].map(idx).tolist(),
-                        value=tr_df["N"].tolist(),
-                    ),
-                )
-            ]
-        )
-        sankey.update_layout(title_text="Stage Transition Sankey (Ordered, non-regressive)")
-        sankey.write_html(OUT_DIR / "10_stage_transition_sankey.html")
-        _copy_to_versioned("10_stage_transition_sankey.html")
-    except Exception as e:
-        print("Sankey error:", e)
-    # Bottleneck impact bar
-    try:
-        st_pd = stage_duration.with_columns(
-            (pl.col("RUN_MEDIAN_DAYS") * pl.col("N_RUNS")).alias("IMPACT")
-        ).to_pandas()
-        fig_b = px.bar(
-            st_pd.sort_values("IMPACT", ascending=False),
-            x="STAGE",
-            y="IMPACT",
-            title="Stage Bottleneck Impact (Median Days x Runs)",
-        )
-        fig_b.write_html(OUT_DIR / "15_bottleneck_impact.html")
-        _copy_to_versioned("15_bottleneck_impact.html")
-    except Exception as e:
-        print("Bottleneck plot error:", e)
-# 7.5 Cohort Analysis by Filing Year & Case Type
-if "YEAR_FILED" in cases.columns and "CASE_TYPE" in cases.columns:
-    cohort = (
-        cases.filter(pl.col("YEAR_FILED").is_not_null())
-        .group_by(["YEAR_FILED", "CASE_TYPE"])
-        .agg(
-            [
-                pl.col("DISPOSALTIME_ADJ").count().alias("N"),
-                pl.col("DISPOSALTIME_ADJ").median().alias("Q50"),
-                pl.col("DISPOSALTIME_ADJ").quantile(0.9).alias("Q90"),
-                pl.col("DISPOSALTIME_ADJ").mean().alias("MEAN"),
-            ]
-        )
-        .sort(["YEAR_FILED", "CASE_TYPE"])
-    )
-    try:
-        fig_c = px.line(
-            cohort.to_pandas(),
-            x="YEAR_FILED",
-            y="Q50",
-            color="CASE_TYPE",
-            title="Median Disposal Days by Filing Year & Case Type",
-        )
-        fig_c.write_html(OUT_DIR / "13_cohort_median_disposal.html")
-        _copy_to_versioned("13_cohort_median_disposal.html")
-    except Exception as e:
-        print("Cohort plot error:", e)
-# 7.6 Seasonality & Calendar Effects
-if "BusinessOnDate" in hearings.columns:
-    m_hear = (
-        hearings.filter(pl.col("BusinessOnDate").is_not_null())
-        .with_columns(
-            [
-                pl.col("BusinessOnDate").dt.year().alias("Y"),
-                pl.col("BusinessOnDate").dt.month().alias("M"),
-            ]
-        )
-        .with_columns(
-            [
-                # First day of month date for plotting
-                pl.date(pl.col("Y"), pl.col("M"), pl.lit(1)).alias("YM")
-            ]
-        )
-    )
-    monthly_listings = m_hear.group_by("YM").agg(pl.len().alias("N_HEARINGS")).sort("YM")
-    try:
-        fig_m = px.line(
-            monthly_listings.to_pandas(), x="YM", y="N_HEARINGS", title="Monthly Hearings Listed"
-        )
-        fig_m.update_layout(yaxis=dict(tickformat=",d"))
-        fig_m.write_html(OUT_DIR / "11_monthly_hearings.html")
-        _copy_to_versioned("11_monthly_hearings.html")
-    except Exception as e:
-        print("Monthly listings plot error:", e)
-    # Waterfall: month-over-month change with anomaly flags
-    try:
-        ml = monthly_listings.with_columns(
-            [
-                pl.col("N_HEARINGS").shift(1).alias("PREV"),
-                (pl.col("N_HEARINGS") - pl.col("N_HEARINGS").shift(1)).alias("DELTA"),
-            ]
-        )
-        ml_pd = ml.to_pandas()
-        # Rolling z-score over 12-month window for anomaly detection
-        ml_pd["ROLL_MEAN"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).mean()
-        ml_pd["ROLL_STD"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).std()
-        ml_pd["Z"] = (ml_pd["N_HEARINGS"] - ml_pd["ROLL_MEAN"]) / ml_pd["ROLL_STD"]
-        ml_pd["ANOM"] = ml_pd["Z"].abs() >= 3.0
-        # Build waterfall values: first is absolute level, others are deltas
-        measures = ["relative"] * len(ml_pd)
-        measures[0] = "absolute"
-        y_vals = ml_pd["DELTA"].astype(float).fillna(ml_pd["N_HEARINGS"].astype(float)).tolist()
-        fig_w = go.Figure(
-            go.Waterfall(
-                x=ml_pd["YM"],
-                measure=measures,
-                y=y_vals,
-                text=[f"{int(v):,}" if pd.notnull(v) else "" for v in ml_pd["N_HEARINGS"]],
-                increasing=dict(marker=dict(color="seagreen")),
-                decreasing=dict(marker=dict(color="indianred")),
-                connector={"line": {"color": "rgb(110,110,110)"}},
-            )
-        )
-        # Highlight anomalies as red markers on top
-        fig_w.add_trace(
-            go.Scatter(
-                x=ml_pd.loc[ml_pd["ANOM"], "YM"],
-                y=ml_pd.loc[ml_pd["ANOM"], "N_HEARINGS"],
-                mode="markers",
-                marker=dict(color="crimson", size=8),
-                name="Anomaly (|z|>=3)",
-            )
-        )
-        fig_w.update_layout(
-            title="Monthly Hearings Waterfall (MoM change) with Anomalies",
-            yaxis=dict(tickformat=",d"),
-        )
-        fig_w.write_html(OUT_DIR / "11b_monthly_waterfall.html")
-        _copy_to_versioned("11b_monthly_waterfall.html")
-        # Export anomalies CSV
-        ml_pd_out = ml_pd.copy()
-        ml_pd_out["YM"] = ml_pd_out["YM"].astype(str)
-        ml_pd_out.to_csv(OUT_DIR / "monthly_anomalies.csv", index=False)
-        _copy_to_versioned("monthly_anomalies.csv")
-    except Exception as e:
-        print("Monthly waterfall error:", e)
-    # DOW x Month heatmap
-    dow_heat = (
-        hearings.filter(pl.col("BusinessOnDate").is_not_null())
-        .with_columns(
-            [
-                pl.col("BusinessOnDate").dt.weekday().alias("DOW"),
-                pl.col("BusinessOnDate").dt.month().alias("MONTH"),
-            ]
-        )
-        .group_by(["MONTH", "DOW"])
-        .agg(pl.len().alias("N"))
-    )
-    try:
-        fig_heat = px.density_heatmap(
-            dow_heat.to_pandas(), x="DOW", y="MONTH", z="N", title="Hearings by Weekday and Month"
-        )
-        fig_heat.write_html(OUT_DIR / "16_dow_month_heatmap.html")
-        _copy_to_versioned("16_dow_month_heatmap.html")
-    except Exception as e:
-        print("DOW-Month heatmap error:", e)
-# 7.7 Purpose Text Normalization & Tagging
-text_col = None
-for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]:
-    if c in hearings.columns:
-        text_col = c
-        break
-def _has_kw_expr(col: str, kws: list[str]):
-    expr = None
-    for k in kws:
-        e = pl.col(col).str.contains(k)
-        expr = e if expr is None else (expr | e)
-    return (expr if expr is not None else pl.lit(False)).fill_null(False)
-if text_col:
-    hear_txt = hearings.with_columns(
-        [pl.col(text_col).cast(pl.Utf8).str.strip_chars().str.to_uppercase().alias("PURPOSE_TXT")]
-    )
-    async_kw = ["NON-COMPLIANCE", "OFFICE OBJECTION", "COMPLIANCE", "NOTICE", "SERVICE", "LISTING"]
-    subs_kw = ["EVIDENCE", "ARGUMENT", "FINAL HEARING", "JUDGMENT", "ORDER", "DISPOSAL"]
-    hear_txt = hear_txt.with_columns(
-        [
-            pl.when(_has_kw_expr("PURPOSE_TXT", async_kw))
-            .then(pl.lit("ASYNC_OR_ADMIN"))
-            .when(_has_kw_expr("PURPOSE_TXT", subs_kw))
-            .then(pl.lit("SUBSTANTIVE"))
-            .otherwise(pl.lit("UNKNOWN"))
-            .alias("PURPOSE_TAG")
-        ]
-    )
-    tag_share = (
-        hear_txt.group_by(["CASE_TYPE", "PURPOSE_TAG"])
-        .agg(pl.len().alias("N"))
-        .with_columns((pl.col("N") / pl.col("N").sum().over("CASE_TYPE")).alias("SHARE"))
-        .sort(["CASE_TYPE", "SHARE"], descending=[False, True])
-    )
-    try:
-        fig_t = px.bar(
-            tag_share.to_pandas(),
-            x="CASE_TYPE",
-            y="SHARE",
-            color="PURPOSE_TAG",
-            title="Purpose Tag Shares by Case Type",
-            barmode="stack",
-        )
-        fig_t.write_html(OUT_DIR / "14_purpose_tag_shares.html")
-        _copy_to_versioned("14_purpose_tag_shares.html")
-    except Exception as e:
-        print("Purpose shares plot error:", e)
-# 7.8 Judge/Day Workload & Throughput (use hearing-level judge only)
-judge_col = None
-for c in [
-    "BeforeHonourableJudge",
-]:
-    if c in hearings.columns:
-        judge_col = c
-        break
-if judge_col and "BusinessOnDate" in hearings.columns:
-    jday = (
-        hearings.filter(pl.col("BusinessOnDate").is_not_null())
-        .group_by([judge_col, "BusinessOnDate"])
-        .agg(pl.len().alias("N_HEARINGS"))
-    )
-    try:
-        fig_j = px.box(
-            jday.to_pandas(), x=judge_col, y="N_HEARINGS", title="Per-day Hearings per Judge"
-        )
-        fig_j.update_layout(
-            xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
-        )
-        fig_j.write_html(OUT_DIR / "12_judge_day_load.html")
-        _copy_to_versioned("12_judge_day_load.html")
-    except Exception as e:
-        print("Judge day load plot error:", e)
-    # Court/day workload if courtroom columns are present
-    court_col = None
-    for cc in ["COURT_NUMBER", "COURT_NAME"]:
-        if cc in hearings.columns:
-            court_col = cc
-            break
-    if court_col:
-        cday = (
-            hearings.filter(pl.col("BusinessOnDate").is_not_null())
-            .group_by([court_col, "BusinessOnDate"])
-            .agg(pl.len().alias("N_HEARINGS"))
-        )
-        try:
-            fig_court = px.box(
-                cday.to_pandas(),
-                x=court_col,
-                y="N_HEARINGS",
-                title="Per-day Hearings per Courtroom",
-            )
-            fig_court.update_layout(
-                xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
-            )
-            fig_court.write_html(OUT_DIR / "12b_court_day_load.html")
-            _copy_to_versioned("12b_court_day_load.html")
-        except Exception as e:
-            print("Court day load plot error:", e)
-# 7.9 Bottlenecks & Outliers
-try:
-    long_tail = (
-        cases.sort("DISPOSALTIME_ADJ", descending=True)
-        .select(
-            ["CNR_NUMBER", "CASE_TYPE", "DISPOSALTIME_ADJ", "N_HEARINGS", "GAP_MEDIAN", "GAP_P75"]
-        )
-        .head(50)
-    )
-    print("\nLongest disposal cases (top 50):\n", long_tail)
-except Exception as e:
-    print("Long-tail extraction error:", e)
-if transitions is not None:
-    try:
-        self_transitions = (
-            transitions.filter(pl.col("STAGE_FROM") == pl.col("STAGE_TO"))
-            .select(pl.sum("N"))
-            .to_series()[0]
-        )
-        print(
-            "Self-transitions (same stage repeated):",
-            int(self_transitions) if self_transitions is not None else 0,
-        )
-    except Exception as e:
-        print("Self-transition calc error:", e)
-# 7.9.b Feature outliers (overall and within type)
-try:
-    # Compute z-scores within CASE_TYPE for selected features
-    feat_cols = ["DISPOSALTIME_ADJ", "N_HEARINGS", "GAP_MEDIAN", "GAP_STD"]
-    df = cases
-    for fc in feat_cols:
-        if fc not in df.columns:
-            df = df.with_columns(pl.lit(None).alias(fc))
-    z_within = df.with_columns(
-        [
-            (
-                (pl.col(fc) - pl.col(fc).mean().over("CASE_TYPE"))
-                / pl.col(fc).std().over("CASE_TYPE")
-            ).alias(f"Z_{fc}_TYPE")
-            for fc in feat_cols
-        ]
-    )
-    # Flag outliers for |z|>=3
-    z_within = z_within.with_columns(
-        [(pl.col(f"Z_{fc}_TYPE").abs() >= 3.0).alias(f"OUT_{fc}_TYPE") for fc in feat_cols]
-    )
-    # Collect existing outlier flag columns and filter rows with any outlier
-    outlier_cols = [f"OUT_{fc}_TYPE" for fc in feat_cols if f"OUT_{fc}_TYPE" in z_within.columns]
-    out_any = z_within.filter(pl.any_horizontal(*[pl.col(col) for col in outlier_cols]))
-    out_path = OUT_DIR / "feature_outliers.csv"
-    out_any.select(
-        ["CNR_NUMBER", "CASE_TYPE"] + feat_cols + [f"Z_{fc}_TYPE" for fc in feat_cols]
-    ).write_csv(out_path)
-    _copy_to_versioned("feature_outliers.csv")
-    print("Feature outliers exported to", out_path.resolve())
-except Exception as e:
-    print("Feature outliers error:", e)
-# 7.10 Scheduler-ready Features & Alerts
-if "BusinessOnDate" in hearings.columns:
-    h_latest = (
-        hearings.filter(pl.col("BusinessOnDate").is_not_null())
-        .sort(["CNR_NUMBER", "BusinessOnDate"])
-        .group_by("CNR_NUMBER")
-        .agg(
-            [
-                pl.col("BusinessOnDate").max().alias("LAST_HEARING"),
-                (pl.col(stage_col).last() if stage_col else pl.lit(None)).alias("LAST_STAGE"),
-                (pl.col(stage_col).n_unique() if stage_col else pl.lit(None)).alias(
-                    "N_DISTINCT_STAGES"
-                ),
-            ]
-        )
-    )
-    cases = cases.join(h_latest, on="CNR_NUMBER", how="left")
-cases = cases.with_columns(
-    [
-        pl.when(pl.col("N_HEARINGS") > 50).then(50).otherwise(pl.col("N_HEARINGS")).alias("NH_CAP"),
-        pl.when(pl.col("GAP_MEDIAN").is_null() | (pl.col("GAP_MEDIAN") <= 0))
-        .then(999.0)
-        .otherwise(pl.col("GAP_MEDIAN"))
-        .alias("GAPM_SAFE"),
-    ]
-)
-# Clamp GAPM_SAFE to 100 in a separate step (Polars may not allow referencing columns
-# created within the same with_columns call across expressions in older versions)
-cases = cases.with_columns(
-    [
-        pl.when(pl.col("GAPM_SAFE") > 100)
-        .then(100.0)
-        .otherwise(pl.col("GAPM_SAFE"))
-        .alias("GAPM_CLAMP"),
-    ]
-)
-cases = cases.with_columns(
-    [
-        (
-            # progress term (0-40)
-            (pl.when((pl.col("NH_CAP") / 50) > 1.0).then(1.0).otherwise(pl.col("NH_CAP") / 50) * 40)
-            # momentum term (0-30)
-            + (
-                pl.when((100 / pl.col("GAPM_CLAMP")) > 1.0)
-                .then(1.0)
-                .otherwise(100 / pl.col("GAPM_CLAMP"))
-                * 30
-            )
-            # stage bonus (10 or 30)
-            + pl.when(pl.col("LAST_STAGE").is_in(["ARGUMENTS", "EVIDENCE", "ORDERS / JUDGMENT"]))
-            .then(30)
-            .otherwise(10)
-        ).alias("READINESS_SCORE_RAW")
-    ]
-)
-cases = cases.with_columns(
-    pl.when(pl.col("READINESS_SCORE_RAW") > 100.0)
-    .then(100.0)
-    .otherwise(pl.col("READINESS_SCORE_RAW"))
-    .alias("READINESS_SCORE")
-)
-# Diagnostic preview to validate readiness components
-try:
-    print(
-        "\nREADINESS sample:\n",
-        cases.select(["NH_CAP", "GAPM_SAFE", "GAPM_CLAMP", "READINESS_SCORE"]).head(5),
-    )
-except Exception as e:
-    print("Readiness diagnostic error:", e)
-# Alert flags within type
-try:
-    cases = cases.with_columns(
-        [
-            (
-                pl.col("DISPOSALTIME_ADJ")
-                > pl.col("DISPOSALTIME_ADJ").quantile(0.9).over("CASE_TYPE")
-            ).alias("ALERT_P90_TYPE"),
-            (pl.col("N_HEARINGS") > pl.col("N_HEARINGS").quantile(0.9).over("CASE_TYPE")).alias(
-                "ALERT_HEARING_HEAVY"
-            ),
-            (pl.col("GAP_MEDIAN") > pl.col("GAP_MEDIAN").quantile(0.9).over("CASE_TYPE")).alias(
-                "ALERT_LONG_GAP"
-            ),
-        ]
-    )
-except Exception as e:
-    print("Alert flags calc error:", e)
-# Export compact features CSV
-try:
-    (
-        cases.select(
-            [
-                "CNR_NUMBER",
-                "CASE_TYPE",
-                "YEAR_FILED",
-                "YEAR_DECISION",
-                "DISPOSALTIME_ADJ",
-                "N_HEARINGS",
-                "GAP_MEDIAN",
-                "GAP_STD",
-                "LAST_HEARING",
-                "DAYS_SINCE_LAST_HEARING",
-                "LAST_STAGE",
-                "READINESS_SCORE",
-                "ALERT_P90_TYPE",
-                "ALERT_HEARING_HEAVY",
-                "ALERT_LONG_GAP",
-            ]
-        )
-    ).write_csv(OUT_DIR / "cases_features.csv")
-    print("Exported cases_features.csv to", (OUT_DIR / "cases_features.csv").resolve())
-except Exception as e:
-    print("Export features CSV error:", e)
-# 7.11 Run metadata
-try:
-    meta = {
-        "version": VERSION,
-        "timestamp": RUN_TS,
-        "cases_shape": list(cases.shape),
-        "hearings_shape": list(hearings.shape),
-        "cases_columns": cases.columns,
-        "hearings_columns": hearings.columns,
-    }
-    with open(OUT_DIR_VER / "metadata.json", "w", encoding="utf-8") as f:
-        json.dump(meta, f, indent=2, default=str)
-except Exception as e:
-    print("Metadata export error:", e)

+"""Entrypoint to run the full EDA + parameter pipeline.
+Order:
+1. Load & clean (save Parquet + metadata)
+2. Visual EDA (plots + CSV summaries)
+3. Parameter extraction (JSON/CSV priors + features)
 """
+from src.eda_exploration import run_exploration
+from src.eda_load_clean import run_load_and_clean
+from src.eda_parameters import run_parameter_export
+if __name__ == "__main__":
+    print("Step 1/3: Load and clean")
+    run_load_and_clean()
+    print("\nStep 2/3: Exploratory analysis and plots")
+    run_exploration()
+    print("\nStep 3/3: Parameter extraction for simulation/scheduler")
+    run_parameter_export()
+    print("\nAll steps complete.")

src/eda_config.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""Shared configuration and helpers for EDA pipeline."""
+import json
+import shutil
+from datetime import datetime
+from pathlib import Path
+# -------------------------------------------------------------------
+# Paths and versioning
+# -------------------------------------------------------------------
+DATA_DIR = Path("Data")
+CASES_FILE = DATA_DIR / "ISDMHack_Cases_WPfinal.csv"
+HEAR_FILE = DATA_DIR / "ISDMHack_Hear.csv"
+REPORTS_DIR = Path("reports")
+FIGURES_DIR = REPORTS_DIR / "figures"
+FIGURES_DIR.mkdir(parents=True, exist_ok=True)
+VERSION = "v0.4.0"
+RUN_TS = datetime.now().strftime("%Y%m%d_%H%M%S")
+RUN_DIR = FIGURES_DIR / f"{VERSION}_{RUN_TS}"
+RUN_DIR.mkdir(parents=True, exist_ok=True)
+PARAMS_DIR = RUN_DIR / "params"
+PARAMS_DIR.mkdir(parents=True, exist_ok=True)
+# cleaned data outputs
+CASES_CLEAN_PARQUET = RUN_DIR / "cases_clean.parquet"
+HEARINGS_CLEAN_PARQUET = RUN_DIR / "hearings_clean.parquet"
+# -------------------------------------------------------------------
+# Null tokens and canonicalisation
+# -------------------------------------------------------------------
+NULL_TOKENS = ["", "NULL", "Null", "null", "NA", "N/A", "na", "NaN", "nan", "-", "--"]
+def copy_to_versioned(filename: str) -> None:
+    """Copy a file from FIGURES_DIR to RUN_DIR for versioned snapshots."""
+    src = FIGURES_DIR / filename
+    dst = RUN_DIR / filename
+    try:
+        if src.exists():
+            shutil.copyfile(src, dst)
+    except Exception as e:
+        print(f"[WARN] Versioned copy failed for {filename}: {e}")
+def write_metadata(meta: dict) -> None:
+    """Write run metadata into RUN_DIR/metadata.json."""
+    meta_path = RUN_DIR / "metadata.json"
+    try:
+        with open(meta_path, "w", encoding="utf-8") as f:
+            json.dump(meta, f, indent=2, default=str)
+    except Exception as e:
+        print(f"[WARN] Metadata export error: {e}")

src/eda_exploration.py ADDED Viewed

	@@ -0,0 +1,509 @@

+"""Module 2: Visual and descriptive EDA.
+Responsibilities:
+- Case type distribution, filing trends, disposal distribution.
+- Hearing gap distributions by type.
+- Stage transition Sankey & stage bottlenecks.
+- Cohorts by filing year.
+- Seasonality and monthly anomalies.
+- Judge and courtroom workload.
+- Purpose tags and stage frequency.
+Inputs:
+- Cleaned Parquet from eda_load_clean.
+Outputs:
+- Interactive HTML plots in FIGURES_DIR and versioned copies in RUN_DIR.
+- Some CSV summaries (e.g., stage_duration.csv, transitions.csv, monthly_anomalies.csv).
+"""
+from datetime import timedelta
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+import plotly.io as pio
+import polars as pl
+from src.eda_config import (
+    CASES_CLEAN_PARQUET,
+    FIGURES_DIR,
+    HEARINGS_CLEAN_PARQUET,
+    RUN_DIR,
+    copy_to_versioned,
+)
+pio.renderers.default = "browser"
+def load_cleaned():
+    cases = pl.read_parquet(CASES_CLEAN_PARQUET)
+    hearings = pl.read_parquet(HEARINGS_CLEAN_PARQUET)
+    print("Loaded cleaned data for exploration")
+    print("Cases:", cases.shape, "Hearings:", hearings.shape)
+    return cases, hearings
+def run_exploration() -> None:
+    cases, hearings = load_cleaned()
+    cases_pd = cases.to_pandas()
+    hearings_pd = hearings.to_pandas()
+    # --------------------------------------------------
+    # 1. Case Type Distribution
+    # --------------------------------------------------
+    fig1 = px.bar(
+        cases_pd,
+        x="CASE_TYPE",
+        color="CASE_TYPE",
+        title="Case Type Distribution",
+    )
+    fig1.update_layout(showlegend=False, xaxis_title="Case Type", yaxis_title="Number of Cases")
+    f1 = "1_case_type_distribution.html"
+    fig1.write_html(FIGURES_DIR / f1)
+    copy_to_versioned(f1)
+    # --------------------------------------------------
+    # 2. Filing Trends by Year
+    # --------------------------------------------------
+    if "YEAR_FILED" in cases_pd.columns:
+        year_counts = cases_pd.groupby("YEAR_FILED")["CNR_NUMBER"].count().reset_index(name="Count")
+        fig2 = px.line(
+            year_counts, x="YEAR_FILED", y="Count", markers=True, title="Cases Filed by Year"
+        )
+        fig2.update_traces(line_color="royalblue")
+        fig2.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
+        f2 = "2_cases_filed_by_year.html"
+        fig2.write_html(FIGURES_DIR / f2)
+        copy_to_versioned(f2)
+    # --------------------------------------------------
+    # 3. Disposal Duration Distribution
+    # --------------------------------------------------
+    if "DISPOSALTIME_ADJ" in cases_pd.columns:
+        fig3 = px.histogram(
+            cases_pd,
+            x="DISPOSALTIME_ADJ",
+            nbins=50,
+            title="Distribution of Disposal Time (Adjusted Days)",
+            color_discrete_sequence=["indianred"],
+        )
+        fig3.update_layout(xaxis_title="Days", yaxis_title="Cases")
+        f3 = "3_disposal_time_distribution.html"
+        fig3.write_html(FIGURES_DIR / f3)
+        copy_to_versioned(f3)
+    # --------------------------------------------------
+    # 4. Hearings vs Disposal Time
+    # --------------------------------------------------
+    if {"N_HEARINGS", "DISPOSALTIME_ADJ"}.issubset(cases_pd.columns):
+        fig4 = px.scatter(
+            cases_pd,
+            x="N_HEARINGS",
+            y="DISPOSALTIME_ADJ",
+            color="CASE_TYPE",
+            hover_data=["CNR_NUMBER", "YEAR_FILED"],
+            title="Hearings vs Disposal Duration",
+        )
+        fig4.update_traces(marker=dict(size=6, opacity=0.7))
+        f4 = "4_hearings_vs_disposal.html"
+        fig4.write_html(FIGURES_DIR / f4)
+        copy_to_versioned(f4)
+    # --------------------------------------------------
+    # 5. Boxplot by Case Type
+    # --------------------------------------------------
+    fig5 = px.box(
+        cases_pd,
+        x="CASE_TYPE",
+        y="DISPOSALTIME_ADJ",
+        color="CASE_TYPE",
+        title="Disposal Time (Adjusted) by Case Type",
+    )
+    fig5.update_layout(showlegend=False)
+    f5 = "5_box_disposal_by_type.html"
+    fig5.write_html(FIGURES_DIR / f5)
+    copy_to_versioned(f5)
+    # --------------------------------------------------
+    # 6. Stage Frequency
+    # --------------------------------------------------
+    if "Remappedstages" in hearings_pd.columns:
+        stage_counts = hearings_pd["Remappedstages"].value_counts().reset_index()
+        stage_counts.columns = ["Stage", "Count"]
+        fig6 = px.bar(
+            stage_counts,
+            x="Stage",
+            y="Count",
+            color="Stage",
+            title="Frequency of Hearing Stages",
+        )
+        fig6.update_layout(showlegend=False, xaxis_title="Stage", yaxis_title="Count")
+        f6 = "6_stage_frequency.html"
+        fig6.write_html(FIGURES_DIR / f6)
+        copy_to_versioned(f6)
+    # --------------------------------------------------
+    # 7. Gap median by case type
+    # --------------------------------------------------
+    if "GAP_MEDIAN" in cases_pd.columns:
+        fig_gap = px.box(
+            cases_pd,
+            x="CASE_TYPE",
+            y="GAP_MEDIAN",
+            points=False,
+            title="Median Hearing Gap by Case Type",
+        )
+        fg = "9_gap_median_by_type.html"
+        fig_gap.write_html(FIGURES_DIR / fg)
+        copy_to_versioned(fg)
+    # --------------------------------------------------
+    # 8. Stage transitions & bottleneck plot
+    # --------------------------------------------------
+    stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None
+    transitions = None
+    stage_duration = None
+    if stage_col and "BusinessOnDate" in hearings.columns:
+        STAGE_ORDER = [
+            "PRE-ADMISSION",
+            "ADMISSION",
+            "FRAMING OF CHARGES",
+            "EVIDENCE",
+            "ARGUMENTS",
+            "INTERLOCUTORY APPLICATION",
+            "SETTLEMENT",
+            "ORDERS / JUDGMENT",
+            "FINAL DISPOSAL",
+            "OTHER",
+            "NA",
+        ]
+        order_idx = {s: i for i, s in enumerate(STAGE_ORDER)}
+        h_stage = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .sort(["CNR_NUMBER", "BusinessOnDate"])
+            .with_columns(
+                [
+                    pl.col(stage_col)
+                    .fill_null("NA")
+                    .map_elements(
+                        lambda s: s if s in STAGE_ORDER else ("OTHER" if s is not None else "NA")
+                    )
+                    .alias("STAGE"),
+                    pl.col("BusinessOnDate").alias("DT"),
+                ]
+            )
+            .with_columns(
+                [
+                    (pl.col("STAGE") != pl.col("STAGE").shift(1))
+                    .over("CNR_NUMBER")
+                    .alias("STAGE_CHANGE"),
+                ]
+            )
+        )
+        transitions_raw = (
+            h_stage.with_columns(
+                [
+                    pl.col("STAGE").alias("STAGE_FROM"),
+                    pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"),
+                ]
+            )
+            .filter(pl.col("STAGE_TO").is_not_null())
+            .group_by(["STAGE_FROM", "STAGE_TO"])
+            .agg(pl.len().alias("N"))
+        )
+        transitions = transitions_raw.filter(
+            pl.col("STAGE_FROM").map_elements(lambda s: order_idx.get(s, 10))
+            <= pl.col("STAGE_TO").map_elements(lambda s: order_idx.get(s, 10))
+        ).sort("N", descending=True)
+        transitions.write_csv(RUN_DIR / "transitions.csv")
+        runs = (
+            h_stage.with_columns(
+                [
+                    pl.when(pl.col("STAGE_CHANGE"))
+                    .then(1)
+                    .otherwise(0)
+                    .cum_sum()
+                    .over("CNR_NUMBER")
+                    .alias("RUN_ID")
+                ]
+            )
+            .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"])
+            .agg(
+                [
+                    pl.col("DT").min().alias("RUN_START"),
+                    pl.col("DT").max().alias("RUN_END"),
+                    pl.len().alias("HEARINGS_IN_RUN"),
+                ]
+            )
+            .with_columns(
+                ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias("RUN_DAYS")
+            )
+        )
+        stage_duration = (
+            runs.group_by("STAGE")
+            .agg(
+                [
+                    pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"),
+                    pl.col("RUN_DAYS").mean().alias("RUN_MEAN_DAYS"),
+                    pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"),
+                    pl.len().alias("N_RUNS"),
+                ]
+            )
+            .sort("RUN_MEDIAN_DAYS", descending=True)
+        )
+        stage_duration.write_csv(RUN_DIR / "stage_duration.csv")
+        # Sankey
+        try:
+            tr_df = transitions.to_pandas()
+            labels = [
+                s
+                for s in STAGE_ORDER
+                if s in set(tr_df["STAGE_FROM"]).union(set(tr_df["STAGE_TO"]))
+            ]
+            idx = {l: i for i, l in enumerate(labels)}
+            tr_df = tr_df[tr_df["STAGE_FROM"].isin(labels) & tr_df["STAGE_TO"].isin(labels)].copy()
+            tr_df = tr_df.sort_values(by=["STAGE_FROM", "STAGE_TO"], key=lambda c: c.map(idx))
+            sankey = go.Figure(
+                data=[
+                    go.Sankey(
+                        arrangement="snap",
+                        node=dict(label=labels, pad=15, thickness=18),
+                        link=dict(
+                            source=tr_df["STAGE_FROM"].map(idx).tolist(),
+                            target=tr_df["STAGE_TO"].map(idx).tolist(),
+                            value=tr_df["N"].tolist(),
+                        ),
+                    )
+                ]
+            )
+            sankey.update_layout(title_text="Stage Transition Sankey (Ordered)")
+            f10 = "10_stage_transition_sankey.html"
+            sankey.write_html(FIGURES_DIR / f10)
+            copy_to_versioned(f10)
+        except Exception as e:
+            print("Sankey error:", e)
+        # Bottleneck impact
+        try:
+            st_pd = stage_duration.with_columns(
+                (pl.col("RUN_MEDIAN_DAYS") * pl.col("N_RUNS")).alias("IMPACT")
+            ).to_pandas()
+            fig_b = px.bar(
+                st_pd.sort_values("IMPACT", ascending=False),
+                x="STAGE",
+                y="IMPACT",
+                title="Stage Bottleneck Impact (Median Days x Runs)",
+            )
+            fb = "15_bottleneck_impact.html"
+            fig_b.write_html(FIGURES_DIR / fb)
+            copy_to_versioned(fb)
+        except Exception as e:
+            print("Bottleneck plot error:", e)
+    # --------------------------------------------------
+    # 9. Monthly seasonality and anomalies
+    # --------------------------------------------------
+    if "BusinessOnDate" in hearings.columns:
+        m_hear = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .with_columns(
+                [
+                    pl.col("BusinessOnDate").dt.year().alias("Y"),
+                    pl.col("BusinessOnDate").dt.month().alias("M"),
+                ]
+            )
+            .with_columns(pl.date(pl.col("Y"), pl.col("M"), pl.lit(1)).alias("YM"))
+        )
+        monthly_listings = m_hear.group_by("YM").agg(pl.len().alias("N_HEARINGS")).sort("YM")
+        monthly_listings.write_csv(RUN_DIR / "monthly_hearings.csv")
+        try:
+            fig_m = px.line(
+                monthly_listings.to_pandas(),
+                x="YM",
+                y="N_HEARINGS",
+                title="Monthly Hearings Listed",
+            )
+            fig_m.update_layout(yaxis=dict(tickformat=",d"))
+            fm = "11_monthly_hearings.html"
+            fig_m.write_html(FIGURES_DIR / fm)
+            copy_to_versioned(fm)
+        except Exception as e:
+            print("Monthly listings error:", e)
+        # Waterfall + anomalies
+        try:
+            ml = monthly_listings.with_columns(
+                [
+                    pl.col("N_HEARINGS").shift(1).alias("PREV"),
+                    (pl.col("N_HEARINGS") - pl.col("N_HEARINGS").shift(1)).alias("DELTA"),
+                ]
+            )
+            ml_pd = ml.to_pandas()
+            ml_pd["ROLL_MEAN"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).mean()
+            ml_pd["ROLL_STD"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).std()
+            ml_pd["Z"] = (ml_pd["N_HEARINGS"] - ml_pd["ROLL_MEAN"]) / ml_pd["ROLL_STD"]
+            ml_pd["ANOM"] = ml_pd["Z"].abs() >= 3.0
+            measures = ["relative"] * len(ml_pd)
+            measures[0] = "absolute"
+            y_vals = ml_pd["DELTA"].astype(float).fillna(ml_pd["N_HEARINGS"].astype(float)).tolist()
+            fig_w = go.Figure(
+                go.Waterfall(
+                    x=ml_pd["YM"],
+                    measure=measures,
+                    y=y_vals,
+                    text=[f"{int(v):,}" if pd.notnull(v) else "" for v in ml_pd["N_HEARINGS"]],
+                    increasing=dict(marker=dict(color="seagreen")),
+                    decreasing=dict(marker=dict(color="indianred")),
+                    connector={"line": {"color": "rgb(110,110,110)"}},
+                )
+            )
+            fig_w.add_trace(
+                go.Scatter(
+                    x=ml_pd.loc[ml_pd["ANOM"], "YM"],
+                    y=ml_pd.loc[ml_pd["ANOM"], "N_HEARINGS"],
+                    mode="markers",
+                    marker=dict(color="crimson", size=8),
+                    name="Anomaly (|z|>=3)",
+                )
+            )
+            fig_w.update_layout(
+                title="Monthly Hearings Waterfall (MoM change) with Anomalies",
+                yaxis=dict(tickformat=",d"),
+            )
+            fw = "11b_monthly_waterfall.html"
+            fig_w.write_html(FIGURES_DIR / fw)
+            copy_to_versioned(fw)
+            ml_pd_out = ml_pd.copy()
+            ml_pd_out["YM"] = ml_pd_out["YM"].astype(str)
+            ml_pd_out.to_csv(RUN_DIR / "monthly_anomalies.csv", index=False)
+        except Exception as e:
+            print("Monthly waterfall error:", e)
+    # --------------------------------------------------
+    # 10. Judge and court workload
+    # --------------------------------------------------
+    judge_col = None
+    for c in [
+        "BeforeHonourableJudge",
+        "Before Hon'ble Judges",
+        "Before_Honble_Judges",
+        "NJDG_JUDGE_NAME",
+    ]:
+        if c in hearings.columns:
+            judge_col = c
+            break
+    if judge_col and "BusinessOnDate" in hearings.columns:
+        jday = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .group_by([judge_col, "BusinessOnDate"])
+            .agg(pl.len().alias("N_HEARINGS"))
+        )
+        try:
+            fig_j = px.box(
+                jday.to_pandas(),
+                x=judge_col,
+                y="N_HEARINGS",
+                title="Per-day Hearings per Judge",
+            )
+            fig_j.update_layout(
+                xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
+            )
+            fj = "12_judge_day_load.html"
+            fig_j.write_html(FIGURES_DIR / fj)
+            copy_to_versioned(fj)
+        except Exception as e:
+            print("Judge workload error:", e)
+    court_col = None
+    for cc in ["COURT_NUMBER", "CourtName"]:
+        if cc in hearings.columns:
+            court_col = cc
+            break
+    if court_col and "BusinessOnDate" in hearings.columns:
+        cday = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .group_by([court_col, "BusinessOnDate"])
+            .agg(pl.len().alias("N_HEARINGS"))
+        )
+        try:
+            fig_court = px.box(
+                cday.to_pandas(),
+                x=court_col,
+                y="N_HEARINGS",
+                title="Per-day Hearings per Courtroom",
+            )
+            fig_court.update_layout(
+                xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
+            )
+            fc = "12b_court_day_load.html"
+            fig_court.write_html(FIGURES_DIR / fc)
+            copy_to_versioned(fc)
+        except Exception as e:
+            print("Court workload error:", e)
+    # --------------------------------------------------
+    # 11. Purpose tagging distributions
+    # --------------------------------------------------
+    text_col = None
+    for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]:
+        if c in hearings.columns:
+            text_col = c
+            break
+    def _has_kw_expr(col: str, kws: list[str]):
+        expr = None
+        for k in kws:
+            e = pl.col(col).str.contains(k)
+            expr = e if expr is None else (expr | e)
+        return (expr if expr is not None else pl.lit(False)).fill_null(False)
+    if text_col:
+        hear_txt = hearings.with_columns(
+            pl.col(text_col).cast(pl.Utf8).str.strip_chars().str.to_uppercase().alias("PURPOSE_TXT")
+        )
+        async_kw = ["NON-COMPLIANCE", "OFFICE OBJECTION", "COMPLIANCE", "NOTICE", "SERVICE"]
+        subs_kw = ["EVIDENCE", "ARGUMENT", "FINAL HEARING", "JUDGMENT", "ORDER", "DISPOSAL"]
+        hear_txt = hear_txt.with_columns(
+            pl.when(_has_kw_expr("PURPOSE_TXT", async_kw))
+            .then(pl.lit("ASYNC_OR_ADMIN"))
+            .when(_has_kw_expr("PURPOSE_TXT", subs_kw))
+            .then(pl.lit("SUBSTANTIVE"))
+            .otherwise(pl.lit("UNKNOWN"))
+            .alias("PURPOSE_TAG")
+        )
+        tag_share = (
+            hear_txt.group_by(["CASE_TYPE", "PURPOSE_TAG"])
+            .agg(pl.len().alias("N"))
+            .with_columns((pl.col("N") / pl.col("N").sum().over("CASE_TYPE")).alias("SHARE"))
+            .sort(["CASE_TYPE", "SHARE"], descending=[False, True])
+        )
+        tag_share.write_csv(RUN_DIR / "purpose_tag_shares.csv")
+        try:
+            fig_t = px.bar(
+                tag_share.to_pandas(),
+                x="CASE_TYPE",
+                y="SHARE",
+                color="PURPOSE_TAG",
+                title="Purpose Tag Shares by Case Type",
+                barmode="stack",
+            )
+            ft = "14_purpose_tag_shares.html"
+            fig_t.write_html(FIGURES_DIR / ft)
+            copy_to_versioned(ft)
+        except Exception as e:
+            print("Purpose shares error:", e)
+if __name__ == "__main__":
+    run_exploration()

src/eda_load_clean.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""Module 1: Load, clean, and augment the High Court dataset.
+Responsibilities:
+- Read CSVs with robust null handling.
+- Normalise key text columns (case type, stages, judge names).
+- Basic integrity checks (nulls, duplicates, lifecycle).
+- Compute core per-case hearing gap stats (mean/median/std).
+- Save cleaned data as Parquet for downstream modules.
+"""
+from datetime import timedelta
+import polars as pl
+from src.eda_config import (
+    CASES_CLEAN_PARQUET,
+    CASES_FILE,
+    HEAR_FILE,
+    HEARINGS_CLEAN_PARQUET,
+    NULL_TOKENS,
+    RUN_TS,
+    VERSION,
+    write_metadata,
+)
+# -------------------------------------------------------------------
+# Helpers
+# -------------------------------------------------------------------
+def _norm_text_col(df: pl.DataFrame, col: str) -> pl.DataFrame:
+    if col not in df.columns:
+        return df
+    return df.with_columns(
+        pl.when(
+            pl.col(col)
+            .cast(pl.Utf8)
+            .str.strip_chars()
+            .str.to_uppercase()
+            .is_in(["", "NA", "N/A", "NULL", "NONE", "-", "--"])
+        )
+        .then(pl.lit(None))
+        .otherwise(pl.col(col).cast(pl.Utf8).str.strip_chars().str.to_uppercase())
+        .alias(col)
+    )
+def _null_summary(df: pl.DataFrame, name: str) -> None:
+    print(f"\n=== Null summary ({name}) ===")
+    n = df.height
+    row = {"TABLE": name, "ROWS": n}
+    for c in df.columns:
+        row[f"{c}__nulls"] = int(df.select(pl.col(c).is_null().sum()).item())
+    print(row)
+# -------------------------------------------------------------------
+# Main logic
+# -------------------------------------------------------------------
+def load_raw() -> tuple[pl.DataFrame, pl.DataFrame]:
+    print("Loading raw data with Polars...")
+    cases = pl.read_csv(
+        CASES_FILE,
+        try_parse_dates=True,
+        null_values=NULL_TOKENS,
+        infer_schema_length=100_000,
+    )
+    hearings = pl.read_csv(
+        HEAR_FILE,
+        try_parse_dates=True,
+        null_values=NULL_TOKENS,
+        infer_schema_length=100_000,
+    )
+    print(f"Cases shape: {cases.shape}")
+    print(f"Hearings shape: {hearings.shape}")
+    return cases, hearings
+def clean_and_augment(
+    cases: pl.DataFrame, hearings: pl.DataFrame
+) -> tuple[pl.DataFrame, pl.DataFrame]:
+    # Standardise date columns if needed
+    for col in ["DATE_FILED", "DECISION_DATE", "REGISTRATION_DATE", "LAST_SYNC_TIME"]:
+        if col in cases.columns and cases[col].dtype == pl.Utf8:
+            cases = cases.with_columns(pl.col(col).str.strptime(pl.Date, "%d-%m-%Y", strict=False))
+    # Deduplicate on keys
+    if "CNR_NUMBER" in cases.columns:
+        cases = cases.unique(subset=["CNR_NUMBER"])
+    if "Hearing_ID" in hearings.columns:
+        hearings = hearings.unique(subset=["Hearing_ID"])
+    # Normalise key text fields
+    cases = _norm_text_col(cases, "CASE_TYPE")
+    for c in [
+        "Remappedstages",
+        "PurposeofHearing",
+        "BeforeHonourableJudge",
+    ]:
+        hearings = _norm_text_col(hearings, c)
+    # Simple stage canonicalisation
+    if "Remappedstages" in hearings.columns:
+        STAGE_MAP = {
+            "ORDERS/JUDGMENTS": "ORDERS / JUDGMENT",
+            "ORDER/JUDGMENT": "ORDERS / JUDGMENT",
+            "ORDERS  /  JUDGMENT": "ORDERS / JUDGMENT",
+            "ORDERS /JUDGMENT": "ORDERS / JUDGMENT",
+            "INTERLOCUTARY APPLICATION": "INTERLOCUTORY APPLICATION",
+            "FRAMING OF CHARGE": "FRAMING OF CHARGES",
+            "PRE ADMISSION": "PRE-ADMISSION",
+        }
+        hearings = hearings.with_columns(
+            pl.col("Remappedstages")
+            .map_elements(lambda x: STAGE_MAP.get(x, x) if x is not None else None)
+            .alias("Remappedstages")
+        )
+    # Normalise disposal time
+    if "DISPOSALTIME_ADJ" in cases.columns:
+        cases = cases.with_columns(pl.col("DISPOSALTIME_ADJ").cast(pl.Int32))
+    # Year fields
+    if "DATE_FILED" in cases.columns:
+        cases = cases.with_columns(
+            [
+                pl.col("DATE_FILED").dt.year().alias("YEAR_FILED"),
+                pl.col("DECISION_DATE").dt.year().alias("YEAR_DECISION"),
+            ]
+        )
+    # Hearing counts per case
+    if {"CNR_NUMBER", "BusinessOnDate"}.issubset(hearings.columns):
+        hearing_freq = hearings.group_by("CNR_NUMBER").agg(
+            pl.count("BusinessOnDate").alias("N_HEARINGS")
+        )
+        cases = cases.join(hearing_freq, on="CNR_NUMBER", how="left")
+    else:
+        cases = cases.with_columns(pl.lit(0).alias("N_HEARINGS"))
+    # Per-case hearing gap stats (mean/median/std, p25, p75, count)
+    if {"CNR_NUMBER", "BusinessOnDate"}.issubset(hearings.columns):
+        hearing_gaps = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .sort(["CNR_NUMBER", "BusinessOnDate"])
+            .with_columns(
+                ((pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1)) / timedelta(days=1))
+                .over("CNR_NUMBER")
+                .alias("HEARING_GAP_DAYS")
+            )
+        )
+        gap_stats = hearing_gaps.group_by("CNR_NUMBER").agg(
+            [
+                pl.col("HEARING_GAP_DAYS").mean().alias("GAP_MEAN"),
+                pl.col("HEARING_GAP_DAYS").median().alias("GAP_MEDIAN"),
+                pl.col("HEARING_GAP_DAYS").quantile(0.25).alias("GAP_P25"),
+                pl.col("HEARING_GAP_DAYS").quantile(0.75).alias("GAP_P75"),
+                pl.col("HEARING_GAP_DAYS").std(ddof=1).alias("GAP_STD"),
+                pl.col("HEARING_GAP_DAYS").count().alias("N_GAPS"),
+            ]
+        )
+        cases = cases.join(gap_stats, on="CNR_NUMBER", how="left")
+    else:
+        for col in ["GAP_MEAN", "GAP_MEDIAN", "GAP_P25", "GAP_P75", "GAP_STD", "N_GAPS"]:
+            cases = cases.with_columns(pl.lit(None).alias(col))
+    # Fill some basics
+    cases = cases.with_columns(
+        [
+            pl.col("N_HEARINGS").fill_null(0).cast(pl.Int64),
+            pl.col("GAP_MEDIAN").fill_null(0.0).cast(pl.Float64),
+        ]
+    )
+    # Print audits
+    print("\n=== dtypes (cases) ===")
+    print(cases.dtypes)
+    print("\n=== dtypes (hearings) ===")
+    print(hearings.dtypes)
+    _null_summary(cases, "cases")
+    _null_summary(hearings, "hearings")
+    # Simple lifecycle consistency check
+    if {"DATE_FILED", "DECISION_DATE"}.issubset(
+        cases.columns
+    ) and "BusinessOnDate" in hearings.columns:
+        h2 = hearings.join(
+            cases.select(["CNR_NUMBER", "DATE_FILED", "DECISION_DATE"]),
+            on="CNR_NUMBER",
+            how="left",
+        )
+        before_filed = h2.filter(
+            pl.col("BusinessOnDate").is_not_null()
+            & pl.col("DATE_FILED").is_not_null()
+            & (pl.col("BusinessOnDate") < pl.col("DATE_FILED"))
+        )
+        after_decision = h2.filter(
+            pl.col("BusinessOnDate").is_not_null()
+            & pl.col("DECISION_DATE").is_not_null()
+            & (pl.col("BusinessOnDate") > pl.col("DECISION_DATE"))
+        )
+        print(
+            "Hearings before filing:",
+            before_filed.height,
+            "| after decision:",
+            after_decision.height,
+        )
+    return cases, hearings
+def save_clean(cases: pl.DataFrame, hearings: pl.DataFrame) -> None:
+    cases.write_parquet(CASES_CLEAN_PARQUET)
+    hearings.write_parquet(HEARINGS_CLEAN_PARQUET)
+    print(f"Saved cleaned cases -> {CASES_CLEAN_PARQUET}")
+    print(f"Saved cleaned hearings -> {HEARINGS_CLEAN_PARQUET}")
+    meta = {
+        "version": VERSION,
+        "timestamp": RUN_TS,
+        "cases_shape": list(cases.shape),
+        "hearings_shape": list(hearings.shape),
+        "cases_columns": cases.columns,
+        "hearings_columns": hearings.columns,
+    }
+    write_metadata(meta)
+def run_load_and_clean() -> None:
+    cases_raw, hearings_raw = load_raw()
+    cases_clean, hearings_clean = clean_and_augment(cases_raw, hearings_raw)
+    save_clean(cases_clean, hearings_clean)
+if __name__ == "__main__":
+    run_load_and_clean()

src/eda_parameters.py ADDED Viewed

	@@ -0,0 +1,400 @@

+"""Module 3: Parameter extraction for scheduling simulation / optimisation.
+Responsibilities:
+- Extract stage transition probabilities (per stage).
+- Stage residence time distributions (medians, p90).
+- Court capacity priors (median/p90 hearings per day).
+- Adjournment and not-reached proxies by stage × case type.
+- Entropy of stage transitions (predictability).
+- Case-type summary stats (disposal, hearing counts, gaps).
+- Readiness score and alert flags per case.
+- Export JSON/CSV parameter files into PARAMS_DIR.
+"""
+import json
+from datetime import timedelta
+import polars as pl
+from src.eda_config import (
+    CASES_CLEAN_PARQUET,
+    HEARINGS_CLEAN_PARQUET,
+    PARAMS_DIR,
+)
+def load_cleaned():
+    cases = pl.read_parquet(CASES_CLEAN_PARQUET)
+    hearings = pl.read_parquet(HEARINGS_CLEAN_PARQUET)
+    return cases, hearings
+def extract_parameters() -> None:
+    cases, hearings = load_cleaned()
+    # --------------------------------------------------
+    # 1. Stage transitions and probabilities
+    # --------------------------------------------------
+    stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None
+    transitions = None
+    stage_duration = None
+    if stage_col and "BusinessOnDate" in hearings.columns:
+        STAGE_ORDER = [
+            "PRE-ADMISSION",
+            "ADMISSION",
+            "FRAMING OF CHARGES",
+            "EVIDENCE",
+            "ARGUMENTS",
+            "INTERLOCUTORY APPLICATION",
+            "SETTLEMENT",
+            "ORDERS / JUDGMENT",
+            "FINAL DISPOSAL",
+            "OTHER",
+            "NA",
+        ]
+        order_idx = {s: i for i, s in enumerate(STAGE_ORDER)}
+        h_stage = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .sort(["CNR_NUMBER", "BusinessOnDate"])
+            .with_columns(
+                [
+                    pl.col(stage_col)
+                    .fill_null("NA")
+                    .map_elements(
+                        lambda s: s if s in STAGE_ORDER else ("OTHER" if s is not None else "NA")
+                    )
+                    .alias("STAGE"),
+                    pl.col("BusinessOnDate").alias("DT"),
+                ]
+            )
+            .with_columns(
+                [
+                    (pl.col("STAGE") != pl.col("STAGE").shift(1))
+                    .over("CNR_NUMBER")
+                    .alias("STAGE_CHANGE"),
+                ]
+            )
+        )
+        transitions_raw = (
+            h_stage.with_columns(
+                [
+                    pl.col("STAGE").alias("STAGE_FROM"),
+                    pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"),
+                ]
+            )
+            .filter(pl.col("STAGE_TO").is_not_null())
+            .group_by(["STAGE_FROM", "STAGE_TO"])
+            .agg(pl.len().alias("N"))
+        )
+        transitions = transitions_raw.filter(
+            pl.col("STAGE_FROM").map_elements(lambda s: order_idx.get(s, 10))
+            <= pl.col("STAGE_TO").map_elements(lambda s: order_idx.get(s, 10))
+        ).sort("N", descending=True)
+        transitions.write_csv(PARAMS_DIR / "stage_transitions.csv")
+        # Probabilities per STAGE_FROM
+        row_tot = transitions.group_by("STAGE_FROM").agg(pl.col("N").sum().alias("row_n"))
+        trans_probs = transitions.join(row_tot, on="STAGE_FROM").with_columns(
+            (pl.col("N") / pl.col("row_n")).alias("p")
+        )
+        trans_probs.write_csv(PARAMS_DIR / "stage_transition_probs.csv")
+        # Entropy of transitions
+        ent = (
+            trans_probs.group_by("STAGE_FROM")
+            .agg((-(pl.col("p") * pl.col("p").log()).sum()).alias("entropy"))
+            .sort("entropy", descending=True)
+        )
+        ent.write_csv(PARAMS_DIR / "stage_transition_entropy.csv")
+        # Stage residence (runs)
+        runs = (
+            h_stage.with_columns(
+                [
+                    pl.when(pl.col("STAGE_CHANGE"))
+                    .then(1)
+                    .otherwise(0)
+                    .cum_sum()
+                    .over("CNR_NUMBER")
+                    .alias("RUN_ID")
+                ]
+            )
+            .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"])
+            .agg(
+                [
+                    pl.col("DT").min().alias("RUN_START"),
+                    pl.col("DT").max().alias("RUN_END"),
+                    pl.len().alias("HEARINGS_IN_RUN"),
+                ]
+            )
+            .with_columns(
+                ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias("RUN_DAYS")
+            )
+        )
+        stage_duration = (
+            runs.group_by("STAGE")
+            .agg(
+                [
+                    pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"),
+                    pl.col("RUN_DAYS").quantile(0.9).alias("RUN_P90_DAYS"),
+                    pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"),
+                    pl.len().alias("N_RUNS"),
+                ]
+            )
+            .sort("RUN_MEDIAN_DAYS", descending=True)
+        )
+        stage_duration.write_csv(PARAMS_DIR / "stage_duration.csv")
+    # --------------------------------------------------
+    # 2. Court capacity (cases per courtroom per day)
+    # --------------------------------------------------
+    capacity_stats = None
+    if {"BusinessOnDate", "CourtName"}.issubset(hearings.columns):
+        cap = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .group_by(["CourtName", "BusinessOnDate"])
+            .agg(pl.len().alias("heard_count"))
+        )
+        cap_stats = (
+            cap.group_by("CourtName")
+            .agg(
+                [
+                    pl.col("heard_count").median().alias("slots_median"),
+                    pl.col("heard_count").quantile(0.9).alias("slots_p90"),
+                ]
+            )
+            .sort("slots_median", descending=True)
+        )
+        cap_stats.write_csv(PARAMS_DIR / "court_capacity_stats.csv")
+        # simple global aggregate
+        capacity_stats = {
+            "slots_median_global": float(cap["heard_count"].median()),
+            "slots_p90_global": float(cap["heard_count"].quantile(0.9)),
+        }
+        with open(PARAMS_DIR / "court_capacity_global.json", "w") as f:
+            json.dump(capacity_stats, f, indent=2)
+    # --------------------------------------------------
+    # 3. Adjournment and not-reached proxies
+    # --------------------------------------------------
+    if "BusinessOnDate" in hearings.columns and stage_col:
+        # recompute hearing gaps if needed
+        if "HEARING_GAP_DAYS" not in hearings.columns:
+            hearings = (
+                hearings.filter(pl.col("BusinessOnDate").is_not_null())
+                .sort(["CNR_NUMBER", "BusinessOnDate"])
+                .with_columns(
+                    (
+                        (pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1))
+                        / timedelta(days=1)
+                    )
+                    .over("CNR_NUMBER")
+                    .alias("HEARING_GAP_DAYS")
+                )
+            )
+        stage_median_gap = hearings.group_by("Remappedstages").agg(
+            pl.col("HEARING_GAP_DAYS").median().alias("gap_median")
+        )
+        hearings = hearings.join(stage_median_gap, on="Remappedstages", how="left")
+        def _contains_any(col: str, kws: list[str]):
+            expr = None
+            for k in kws:
+                e = pl.col(col).str.contains(k)
+                expr = e if expr is None else (expr | e)
+            return (expr if expr is not None else pl.lit(False)).fill_null(False)
+        # Not reached proxies from purpose text
+        text_col = None
+        for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]:
+            if c in hearings.columns:
+                text_col = c
+                break
+        hearings = hearings.with_columns(
+            [
+                pl.when(pl.col("HEARING_GAP_DAYS") > (pl.col("gap_median") * 1.3))
+                .then(1)
+                .otherwise(0)
+                .alias("is_adjourn_proxy")
+            ]
+        )
+        if text_col:
+            hearings = hearings.with_columns(
+                pl.when(_contains_any(text_col, ["NOT REACHED", "NR", "NOT TAKEN UP", "NOT HEARD"]))
+                .then(1)
+                .otherwise(0)
+                .alias("is_not_reached_proxy")
+            )
+        else:
+            hearings = hearings.with_columns(pl.lit(0).alias("is_not_reached_proxy"))
+        outcome_stage = (
+            hearings.group_by(["Remappedstages", "casetype"])
+            .agg(
+                [
+                    pl.mean("is_adjourn_proxy").alias("p_adjourn_proxy"),
+                    pl.mean("is_not_reached_proxy").alias("p_not_reached_proxy"),
+                    pl.count().alias("n"),
+                ]
+            )
+            .sort(["Remappedstages", "casetype"])
+        )
+        outcome_stage.write_csv(PARAMS_DIR / "adjournment_proxies.csv")
+    # --------------------------------------------------
+    # 4. Case-type summary and correlations
+    # --------------------------------------------------
+    by_type = (
+        cases.group_by("CASE_TYPE")
+        .agg(
+            [
+                pl.count().alias("n_cases"),
+                pl.col("DISPOSALTIME_ADJ").median().alias("disp_median"),
+                pl.col("DISPOSALTIME_ADJ").quantile(0.9).alias("disp_p90"),
+                pl.col("N_HEARINGS").median().alias("hear_median"),
+                pl.col("GAP_MEDIAN").median().alias("gap_median"),
+            ]
+        )
+        .sort("n_cases", descending=True)
+    )
+    by_type.write_csv(PARAMS_DIR / "case_type_summary.csv")
+    # Correlations for a quick diagnostic
+    corr_cols = ["DISPOSALTIME_ADJ", "N_HEARINGS", "GAP_MEDIAN"]
+    corr_df = cases.select(corr_cols).to_pandas()
+    corr = corr_df.corr(method="spearman")
+    corr.to_csv(PARAMS_DIR / "correlations_spearman.csv")
+    # --------------------------------------------------
+    # 5. Readiness score and alerts
+    # --------------------------------------------------
+    cases = cases.with_columns(
+        [
+            pl.when(pl.col("N_HEARINGS") > 50)
+            .then(50)
+            .otherwise(pl.col("N_HEARINGS"))
+            .alias("NH_CAP"),
+            pl.when(pl.col("GAP_MEDIAN").is_null() | (pl.col("GAP_MEDIAN") <= 0))
+            .then(999.0)
+            .otherwise(pl.col("GAP_MEDIAN"))
+            .alias("GAPM_SAFE"),
+        ]
+    )
+    cases = cases.with_columns(
+        pl.when(pl.col("GAPM_SAFE") > 100)
+        .then(100.0)
+        .otherwise(pl.col("GAPM_SAFE"))
+        .alias("GAPM_CLAMP")
+    )
+    # Stage at last hearing
+    if "BusinessOnDate" in hearings.columns and stage_col:
+        h_latest = (
+            hearings.filter(pl.col("BusinessOnDate").is_not_null())
+            .sort(["CNR_NUMBER", "BusinessOnDate"])
+            .group_by("CNR_NUMBER")
+            .agg(
+                [
+                    pl.col("BusinessOnDate").max().alias("LAST_HEARING"),
+                    pl.col(stage_col).last().alias("LAST_STAGE"),
+                    pl.col(stage_col).n_unique().alias("N_DISTINCT_STAGES"),
+                ]
+            )
+        )
+        cases = cases.join(h_latest, on="CNR_NUMBER", how="left")
+    else:
+        cases = cases.with_columns(
+            [
+                pl.lit(None).alias("LAST_HEARING"),
+                pl.lit(None).alias("LAST_STAGE"),
+                pl.lit(None).alias("N_DISTINCT_STAGES"),
+            ]
+        )
+    # Normalised readiness in [0,1]
+    cases = cases.with_columns(
+        (
+            (pl.col("NH_CAP") / 50).clip(upper_bound=1.0) * 0.4
+            + (100 / pl.col("GAPM_CLAMP")).clip(upper_bound=1.0) * 0.3
+            + pl.when(pl.col("LAST_STAGE").is_in(["ARGUMENTS", "EVIDENCE", "ORDERS / JUDGMENT"]))
+            .then(0.3)
+            .otherwise(0.1)
+        ).alias("READINESS_SCORE")
+    )
+    # Alert flags (within case type)
+    try:
+        cases = cases.with_columns(
+            [
+                (
+                    pl.col("DISPOSALTIME_ADJ")
+                    > pl.col("DISPOSALTIME_ADJ").quantile(0.9).over("CASE_TYPE")
+                ).alias("ALERT_P90_TYPE"),
+                (pl.col("N_HEARINGS") > pl.col("N_HEARINGS").quantile(0.9).over("CASE_TYPE")).alias(
+                    "ALERT_HEARING_HEAVY"
+                ),
+                (pl.col("GAP_MEDIAN") > pl.col("GAP_MEDIAN").quantile(0.9).over("CASE_TYPE")).alias(
+                    "ALERT_LONG_GAP"
+                ),
+            ]
+        )
+    except Exception as e:
+        print("Alert flag computation error:", e)
+    feature_cols = [
+        "CNR_NUMBER",
+        "CASE_TYPE",
+        "YEAR_FILED",
+        "YEAR_DECISION",
+        "DISPOSALTIME_ADJ",
+        "N_HEARINGS",
+        "GAP_MEDIAN",
+        "GAP_STD",
+        "LAST_HEARING",
+        "LAST_STAGE",
+        "READINESS_SCORE",
+        "ALERT_P90_TYPE",
+        "ALERT_HEARING_HEAVY",
+        "ALERT_LONG_GAP",
+    ]
+    feature_cols_existing = [c for c in feature_cols if c in cases.columns]
+    cases.select(feature_cols_existing).write_csv(PARAMS_DIR / "cases_features.csv")
+    # Simple age funnel
+    if {"DATE_FILED", "DECISION_DATE"}.issubset(cases.columns):
+        age_funnel = (
+            cases.with_columns(
+                ((pl.col("DECISION_DATE") - pl.col("DATE_FILED")) / timedelta(days=365)).alias(
+                    "AGE_YRS"
+                )
+            )
+            .with_columns(
+                pl.when(pl.col("AGE_YRS") < 1)
+                .then(pl.lit("<1y"))
+                .when(pl.col("AGE_YRS") < 3)
+                .then(pl.lit("1-3y"))
+                .when(pl.col("AGE_YRS") < 5)
+                .then(pl.lit("3-5y"))
+                .otherwise(pl.lit(">5y"))
+                .alias("AGE_BUCKET")
+            )
+            .group_by("AGE_BUCKET")
+            .agg(pl.len().alias("N"))
+            .sort("AGE_BUCKET")
+        )
+        age_funnel.write_csv(PARAMS_DIR / "age_funnel.csv")
+def run_parameter_export() -> None:
+    extract_parameters()
+    print("Parameter extraction complete. Files in:", PARAMS_DIR.resolve())
+if __name__ == "__main__":
+    run_parameter_export()