Spaces:
Sleeping
Sleeping
| """Module 2: Visual and descriptive EDA. | |
| Responsibilities: | |
| - Case type distribution, filing trends, disposal distribution. | |
| - Hearing gap distributions by type. | |
| - Stage transition Sankey & stage bottlenecks. | |
| - Cohorts by filing year. | |
| - Seasonality and monthly anomalies. | |
| - Judge and courtroom workload. | |
| - Purpose tags and stage frequency. | |
| Inputs: | |
| - Cleaned Parquet from eda_load_clean. | |
| Outputs: | |
| - Interactive HTML plots in FIGURES_DIR and versioned copies in _get_run_dir(). | |
| - Some CSV summaries (e.g., stage_duration.csv, transitions.csv, monthly_anomalies.csv). | |
| """ | |
| from datetime import timedelta | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| import plotly.io as pio | |
| import polars as pl | |
| from eda.config import ( | |
| _get_cases_parquet, | |
| _get_hearings_parquet, | |
| _get_run_dir, | |
| safe_write_figure, | |
| ) | |
| px.defaults.template = "plotly_white" | |
| px.defaults.color_discrete_sequence = px.colors.qualitative.Set2 | |
| pio.templates.default = "plotly_white" | |
| def load_cleaned(): | |
| cases = pl.read_parquet(_get_cases_parquet()) | |
| hearings = pl.read_parquet(_get_hearings_parquet()) | |
| print("Loaded cleaned data for exploration") | |
| print("Cases:", cases.shape, "Hearings:", hearings.shape) | |
| return cases, hearings | |
| def run_exploration() -> None: | |
| cases, hearings = load_cleaned() | |
| # 1. Case Type Distribution | |
| # -------------------------------------------------- | |
| try: | |
| ct_counts = ( | |
| cases.group_by("CASE_TYPE") | |
| .agg(pl.len().alias("COUNT")) | |
| .sort("COUNT", descending=True) | |
| ) | |
| fig1 = px.bar( | |
| ct_counts.to_pandas(), | |
| x="CASE_TYPE", | |
| y="COUNT", | |
| color="CASE_TYPE", | |
| title="Case Type Distribution", | |
| ) | |
| fig1.update_layout( | |
| showlegend=False, | |
| xaxis_title="Case Type", | |
| yaxis_title="Number of Cases", | |
| xaxis_tickangle=-45, | |
| ) | |
| safe_write_figure(fig1, "1_case_type_distribution.html") | |
| except Exception as e: | |
| print("Case type distribution error:", e) | |
| # -------------------------------------------------- | |
| # 2. Filing Trends by Year (single line, no slider) | |
| # -------------------------------------------------- | |
| if "YEAR_FILED" in cases.columns: | |
| year_counts = ( | |
| cases.group_by("YEAR_FILED") | |
| .agg(pl.len().alias("Count")) | |
| .sort("YEAR_FILED", descending=False) | |
| ) | |
| df_year = year_counts.to_pandas() | |
| fig2 = px.line( | |
| df_year, | |
| x="YEAR_FILED", | |
| y="Count", | |
| markers=True, | |
| title="Cases Filed by Year", | |
| ) | |
| fig2.update_layout(xaxis_title="Year", yaxis_title="Cases") | |
| # Fix y-axis max to 10k (counts are known to be < 10k) | |
| fig2.update_yaxes(range=[0, 10000]) | |
| f2 = "2_cases_filed_by_year.html" | |
| safe_write_figure(fig2, f2) | |
| # -------------------------------------------------- | |
| # 3. Disposal Duration Distribution | |
| # -------------------------------------------------- | |
| if "DISPOSALTIME_ADJ" in cases.columns: | |
| fig3 = px.histogram( | |
| x=cases["DISPOSALTIME_ADJ"].to_list(), | |
| nbins=50, | |
| title="Distribution of Disposal Time (Adjusted Days)", | |
| color_discrete_sequence=["indianred"], | |
| ) | |
| fig3.update_layout(xaxis_title="Days", yaxis_title="Cases") | |
| f3 = "3_disposal_time_distribution.html" | |
| safe_write_figure(fig3, f3) | |
| # -------------------------------------------------- | |
| # 4. Hearings vs Disposal Time | |
| # -------------------------------------------------- | |
| if {"N_HEARINGS", "DISPOSALTIME_ADJ"}.issubset(set(cases.columns)): | |
| # Convert only necessary columns for plotting with color/hover metadata | |
| cases_scatter = cases.select( | |
| ["N_HEARINGS", "DISPOSALTIME_ADJ", "CASE_TYPE", "CNR_NUMBER", "YEAR_FILED"] | |
| ).to_pandas() | |
| fig4 = px.scatter( | |
| cases_scatter, | |
| x="N_HEARINGS", | |
| y="DISPOSALTIME_ADJ", | |
| color="CASE_TYPE", | |
| hover_data=["CNR_NUMBER", "YEAR_FILED"], | |
| title="Hearings vs Disposal Duration", | |
| ) | |
| fig4.update_traces(marker=dict(size=6, opacity=0.7)) | |
| f4 = "4_hearings_vs_disposal.html" | |
| safe_write_figure(fig4, f4) | |
| # -------------------------------------------------- | |
| # 5. Boxplot by Case Type | |
| # -------------------------------------------------- | |
| fig5 = px.box( | |
| cases.select(["CASE_TYPE", "DISPOSALTIME_ADJ"]).to_pandas(), | |
| x="CASE_TYPE", | |
| y="DISPOSALTIME_ADJ", | |
| color="CASE_TYPE", | |
| title="Disposal Time (Adjusted) by Case Type", | |
| ) | |
| fig5.update_layout(showlegend=False, xaxis_tickangle=-45) | |
| f5 = "5_box_disposal_by_type.html" | |
| safe_write_figure(fig5, f5) | |
| # -------------------------------------------------- | |
| # 6. Stage Frequency | |
| # -------------------------------------------------- | |
| if "Remappedstages" in hearings.columns: | |
| stage_counts = ( | |
| hearings["Remappedstages"] | |
| .value_counts() | |
| .rename({"Remappedstages": "Stage", "count": "Count"}) | |
| ) | |
| fig6 = px.bar( | |
| stage_counts.to_pandas(), | |
| x="Stage", | |
| y="Count", | |
| color="Stage", | |
| title="Frequency of Hearing Stages (Log Scale)", | |
| log_y=True, | |
| ) | |
| fig6.update_layout( | |
| showlegend=False, | |
| xaxis_title="Stage", | |
| yaxis_title="Count (log scale)", | |
| xaxis_tickangle=-45, | |
| height=500, | |
| ) | |
| f6 = "6_stage_frequency.html" | |
| safe_write_figure(fig6, f6) | |
| # -------------------------------------------------- | |
| # 7. Gap median by case type | |
| # -------------------------------------------------- | |
| if "GAP_MEDIAN" in cases.columns: | |
| fig_gap = px.box( | |
| cases.select(["CASE_TYPE", "GAP_MEDIAN"]).to_pandas(), | |
| x="CASE_TYPE", | |
| y="GAP_MEDIAN", | |
| points=False, | |
| title="Median Hearing Gap by Case Type", | |
| ) | |
| fig_gap.update_layout(xaxis_tickangle=-45) | |
| fg = "9_gap_median_by_type.html" | |
| safe_write_figure(fig_gap, fg) | |
| # -------------------------------------------------- | |
| # 8. Stage transitions & bottleneck plot | |
| # -------------------------------------------------- | |
| stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None | |
| transitions = None | |
| stage_duration = None | |
| if stage_col and "BusinessOnDate" in hearings.columns: | |
| STAGE_ORDER = [ | |
| "PRE-ADMISSION", | |
| "ADMISSION", | |
| "FRAMING OF CHARGES", | |
| "EVIDENCE", | |
| "ARGUMENTS", | |
| "INTERLOCUTORY APPLICATION", | |
| "SETTLEMENT", | |
| "ORDERS / JUDGMENT", | |
| "FINAL DISPOSAL", | |
| "OTHER", | |
| "NA", | |
| ] | |
| order_idx = {s: i for i, s in enumerate(STAGE_ORDER)} | |
| h_stage = ( | |
| hearings.filter(pl.col("BusinessOnDate").is_not_null()) | |
| .sort(["CNR_NUMBER", "BusinessOnDate"]) | |
| .with_columns( | |
| [ | |
| pl.col(stage_col) | |
| .fill_null("NA") | |
| .map_elements( | |
| lambda s: s | |
| if s in STAGE_ORDER | |
| else ("OTHER" if s is not None else "NA") | |
| ) | |
| .alias("STAGE"), | |
| pl.col("BusinessOnDate").alias("DT"), | |
| ] | |
| ) | |
| .with_columns( | |
| [ | |
| (pl.col("STAGE") != pl.col("STAGE").shift(1)) | |
| .over("CNR_NUMBER") | |
| .alias("STAGE_CHANGE"), | |
| ] | |
| ) | |
| ) | |
| transitions_raw = ( | |
| h_stage.with_columns( | |
| [ | |
| pl.col("STAGE").alias("STAGE_FROM"), | |
| pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"), | |
| ] | |
| ) | |
| .filter(pl.col("STAGE_TO").is_not_null()) | |
| .group_by(["STAGE_FROM", "STAGE_TO"]) | |
| .agg(pl.len().alias("N")) | |
| ) | |
| transitions = transitions_raw.filter( | |
| pl.col("STAGE_FROM").map_elements(lambda s: order_idx.get(s, 10)) | |
| <= pl.col("STAGE_TO").map_elements(lambda s: order_idx.get(s, 10)) | |
| ).sort("N", descending=True) | |
| transitions.write_csv(str(_get_run_dir() / "transitions.csv")) | |
| runs = ( | |
| h_stage.with_columns( | |
| [ | |
| pl.when(pl.col("STAGE_CHANGE")) | |
| .then(1) | |
| .otherwise(0) | |
| .cum_sum() | |
| .over("CNR_NUMBER") | |
| .alias("RUN_ID") | |
| ] | |
| ) | |
| .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"]) | |
| .agg( | |
| [ | |
| pl.col("DT").min().alias("RUN_START"), | |
| pl.col("DT").max().alias("RUN_END"), | |
| pl.len().alias("HEARINGS_IN_RUN"), | |
| ] | |
| ) | |
| .with_columns( | |
| ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias( | |
| "RUN_DAYS" | |
| ) | |
| ) | |
| ) | |
| stage_duration = ( | |
| runs.group_by("STAGE") | |
| .agg( | |
| [ | |
| pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"), | |
| pl.col("RUN_DAYS").mean().alias("RUN_MEAN_DAYS"), | |
| pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"), | |
| pl.len().alias("N_RUNS"), | |
| ] | |
| ) | |
| .sort("RUN_MEDIAN_DAYS", descending=True) | |
| ) | |
| stage_duration.write_csv(str(_get_run_dir() / "stage_duration.csv")) | |
| # Sankey | |
| try: | |
| tr_df = transitions.to_pandas() | |
| labels = [ | |
| s | |
| for s in STAGE_ORDER | |
| if s in set(tr_df["STAGE_FROM"]).union(set(tr_df["STAGE_TO"])) | |
| ] | |
| idx = {label: i for i, label in enumerate(labels)} | |
| tr_df = tr_df[ | |
| tr_df["STAGE_FROM"].isin(labels) & tr_df["STAGE_TO"].isin(labels) | |
| ].copy() | |
| tr_df = tr_df.sort_values( | |
| by=["STAGE_FROM", "STAGE_TO"], key=lambda c: c.map(idx) | |
| ) | |
| sankey = go.Figure( | |
| data=[ | |
| go.Sankey( | |
| arrangement="snap", | |
| node=dict(label=labels, pad=15, thickness=18), | |
| link=dict( | |
| source=tr_df["STAGE_FROM"].map(idx).tolist(), | |
| target=tr_df["STAGE_TO"].map(idx).tolist(), | |
| value=tr_df["N"].tolist(), | |
| ), | |
| ) | |
| ] | |
| ) | |
| sankey.update_layout( | |
| title_text="Stage Transition Sankey (Ordered)", | |
| height=800, | |
| margin=dict(t=50, b=50, l=50, r=50), | |
| ) | |
| f10 = "10_stage_transition_sankey.html" | |
| safe_write_figure(sankey, f10) | |
| except Exception as e: | |
| print("Sankey error:", e) | |
| # Bottleneck impact | |
| try: | |
| st_pd = stage_duration.with_columns( | |
| (pl.col("RUN_MEDIAN_DAYS") * pl.col("N_RUNS")).alias("IMPACT") | |
| ).to_pandas() | |
| fig_b = px.bar( | |
| st_pd.sort_values("IMPACT", ascending=False), | |
| x="STAGE", | |
| y="IMPACT", | |
| title="Stage Bottleneck Impact (Median Days x Runs)", | |
| ) | |
| fig_b.update_layout(xaxis_tickangle=-45) | |
| fb = "15_bottleneck_impact.html" | |
| safe_write_figure(fig_b, fb) | |
| except Exception as e: | |
| print("Bottleneck plot error:", e) | |
| # -------------------------------------------------- | |
| # 9. Monthly seasonality and anomalies | |
| # -------------------------------------------------- | |
| if "BusinessOnDate" in hearings.columns: | |
| m_hear = ( | |
| hearings.filter(pl.col("BusinessOnDate").is_not_null()) | |
| .with_columns( | |
| [ | |
| pl.col("BusinessOnDate").dt.year().alias("Y"), | |
| pl.col("BusinessOnDate").dt.month().alias("M"), | |
| ] | |
| ) | |
| .with_columns(pl.date(pl.col("Y"), pl.col("M"), pl.lit(1)).alias("YM")) | |
| ) | |
| monthly_listings = ( | |
| m_hear.group_by("YM").agg(pl.len().alias("N_HEARINGS")).sort("YM") | |
| ) | |
| monthly_listings.write_csv(str(_get_run_dir() / "monthly_hearings.csv")) | |
| try: | |
| fig_m = px.line( | |
| monthly_listings.to_pandas(), | |
| x="YM", | |
| y="N_HEARINGS", | |
| title="Monthly Hearings Listed", | |
| ) | |
| fig_m.update_layout(yaxis=dict(tickformat=",d")) | |
| fm = "11_monthly_hearings.html" | |
| safe_write_figure(fig_m, fm) | |
| except Exception as e: | |
| print("Monthly listings error:", e) | |
| # Anomaly detection (no waterfall plot) | |
| try: | |
| ml = monthly_listings.with_columns( | |
| [ | |
| pl.col("N_HEARINGS").shift(1).alias("PREV"), | |
| (pl.col("N_HEARINGS") - pl.col("N_HEARINGS").shift(1)).alias( | |
| "DELTA" | |
| ), | |
| ] | |
| ) | |
| ml_pd = ml.to_pandas() | |
| ml_pd["ROLL_MEAN"] = ( | |
| ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).mean() | |
| ) | |
| ml_pd["ROLL_STD"] = ( | |
| ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).std() | |
| ) | |
| ml_pd["Z"] = (ml_pd["N_HEARINGS"] - ml_pd["ROLL_MEAN"]) / ml_pd["ROLL_STD"] | |
| ml_pd["ANOM"] = ml_pd["Z"].abs() >= 3.0 | |
| # Export anomalies and enriched monthly series | |
| ml_pd_out = ml_pd.copy() | |
| ml_pd_out["YM"] = ml_pd_out["YM"].astype(str) | |
| ml_pd_out.to_csv(str(_get_run_dir() / "monthly_anomalies.csv"), index=False) | |
| except Exception as e: | |
| print("Monthly anomalies computation error:", e) | |
| # -------------------------------------------------- | |
| # 10. Judge and court workload | |
| # -------------------------------------------------- | |
| judge_col = None | |
| for c in [ | |
| "BeforeHonourableJudge", | |
| "Before Hon'ble Judges", | |
| "Before_Honble_Judges", | |
| "NJDG_JUDGE_NAME", | |
| ]: | |
| if c in hearings.columns: | |
| judge_col = c | |
| break | |
| if judge_col and "BusinessOnDate" in hearings.columns: | |
| jday = ( | |
| hearings.filter(pl.col("BusinessOnDate").is_not_null()) | |
| .group_by([judge_col, "BusinessOnDate"]) | |
| .agg(pl.len().alias("N_HEARINGS")) | |
| ) | |
| try: | |
| fig_j = px.box( | |
| jday.to_pandas(), | |
| x=judge_col, | |
| y="N_HEARINGS", | |
| title="Per-day Hearings per Judge", | |
| ) | |
| fig_j.update_layout( | |
| xaxis={"categoryorder": "total descending", "tickangle": -45}, | |
| yaxis=dict(tickformat=",d"), | |
| ) | |
| fj = "12_judge_day_load.html" | |
| safe_write_figure(fig_j, fj) | |
| except Exception as e: | |
| print("Judge workload error:", e) | |
| court_col = None | |
| for cc in ["COURT_NUMBER", "CourtName"]: | |
| if cc in hearings.columns: | |
| court_col = cc | |
| break | |
| if court_col and "BusinessOnDate" in hearings.columns: | |
| cday = ( | |
| hearings.filter(pl.col("BusinessOnDate").is_not_null()) | |
| .group_by([court_col, "BusinessOnDate"]) | |
| .agg(pl.len().alias("N_HEARINGS")) | |
| ) | |
| try: | |
| fig_court = px.box( | |
| cday.to_pandas(), | |
| x=court_col, | |
| y="N_HEARINGS", | |
| title="Per-day Hearings per Courtroom", | |
| ) | |
| fig_court.update_layout( | |
| xaxis={"categoryorder": "total descending", "tickangle": -45}, | |
| yaxis=dict(tickformat=",d"), | |
| ) | |
| fc = "12b_court_day_load.html" | |
| safe_write_figure(fig_court, fc) | |
| except Exception as e: | |
| print("Court workload error:", e) | |
| # -------------------------------------------------- | |
| # 11. Purpose tagging distributions | |
| # -------------------------------------------------- | |
| text_col = None | |
| for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]: | |
| if c in hearings.columns: | |
| text_col = c | |
| break | |
| def _has_kw_expr(col: str, kws: list[str]): | |
| expr = None | |
| for k in kws: | |
| e = pl.col(col).str.contains(k) | |
| expr = e if expr is None else (expr | e) | |
| return (expr if expr is not None else pl.lit(False)).fill_null(False) | |
| if text_col: | |
| hear_txt = hearings.with_columns( | |
| pl.col(text_col) | |
| .cast(pl.Utf8) | |
| .str.strip_chars() | |
| .str.to_uppercase() | |
| .alias("PURPOSE_TXT") | |
| ) | |
| async_kw = [ | |
| "NON-COMPLIANCE", | |
| "OFFICE OBJECTION", | |
| "COMPLIANCE", | |
| "NOTICE", | |
| "SERVICE", | |
| ] | |
| subs_kw = [ | |
| "EVIDENCE", | |
| "ARGUMENT", | |
| "FINAL HEARING", | |
| "JUDGMENT", | |
| "ORDER", | |
| "DISPOSAL", | |
| ] | |
| hear_txt = hear_txt.with_columns( | |
| pl.when(_has_kw_expr("PURPOSE_TXT", async_kw)) | |
| .then(pl.lit("ASYNC_OR_ADMIN")) | |
| .when(_has_kw_expr("PURPOSE_TXT", subs_kw)) | |
| .then(pl.lit("SUBSTANTIVE")) | |
| .otherwise(pl.lit("UNKNOWN")) | |
| .alias("PURPOSE_TAG") | |
| ) | |
| tag_share = ( | |
| hear_txt.group_by(["CASE_TYPE", "PURPOSE_TAG"]) | |
| .agg(pl.len().alias("N")) | |
| .with_columns( | |
| (pl.col("N") / pl.col("N").sum().over("CASE_TYPE")).alias("SHARE") | |
| ) | |
| .sort(["CASE_TYPE", "SHARE"], descending=[False, True]) | |
| ) | |
| tag_share.write_csv(str(_get_run_dir() / "purpose_tag_shares.csv")) | |
| try: | |
| fig_t = px.bar( | |
| tag_share.to_pandas(), | |
| x="CASE_TYPE", | |
| y="SHARE", | |
| color="PURPOSE_TAG", | |
| title="Purpose Tag Shares by Case Type", | |
| barmode="stack", | |
| ) | |
| fig_t.update_layout(xaxis_tickangle=-45) | |
| ft = "14_purpose_tag_shares.html" | |
| safe_write_figure(fig_t, ft) | |
| except Exception as e: | |
| print("Purpose shares error:", e) | |
| if __name__ == "__main__": | |
| run_exploration() | |