RoyAalekh commited on
Commit
4ffade4
·
1 Parent(s): bb59487
Files changed (6) hide show
  1. .gitignore +3 -0
  2. main.py +16 -993
  3. src/eda_config.py +56 -0
  4. src/eda_exploration.py +509 -0
  5. src/eda_load_clean.py +236 -0
  6. src/eda_parameters.py +400 -0
.gitignore CHANGED
@@ -13,3 +13,6 @@ uv.lock
13
  *.idea
14
  .vscode/
15
  __pylintrc__
 
 
 
 
13
  *.idea
14
  .vscode/
15
  __pylintrc__
16
+ .pdf
17
+ .html
18
+ .docx
main.py CHANGED
@@ -1,1000 +1,23 @@
1
- """Exploratory Data Analysis for Karnataka High Court Scheduling Dataset.
2
 
3
- - Uses Polars for fast data handling
4
- - Uses Plotly for interactive visualizations
 
 
5
  """
6
 
7
- # ======================================================
8
- # 1. Imports and Setup
9
- # ======================================================
10
- import json
11
- import shutil
12
- from datetime import datetime, timedelta
13
- from pathlib import Path
14
 
15
- import pandas as pd
16
- import plotly.express as px
17
- import plotly.graph_objects as go
18
- import plotly.io as pio
19
- import polars as pl
20
 
21
- pio.renderers.default = "browser" # open plots in browser
 
22
 
23
- # ======================================================
24
- # 2. Paths
25
- # ======================================================
26
- DATA_DIR = Path("Data")
27
- CASES_FILE = DATA_DIR / "ISDMHack_Cases_WPfinal.csv"
28
- HEAR_FILE = DATA_DIR / "ISDMHack_Hear.csv"
29
- OUT_DIR = Path("reports/figures")
30
- OUT_DIR.mkdir(parents=True, exist_ok=True)
31
 
32
- # Versioning for iterative runs
33
- VERSION = "v0.3.0"
34
- RUN_TS = datetime.now().strftime("%Y%m%d_%H%M%S")
35
- OUT_DIR_VER = OUT_DIR / f"{VERSION}_{RUN_TS}"
36
- OUT_DIR_VER.mkdir(parents=True, exist_ok=True)
37
-
38
-
39
- def _copy_to_versioned(filename: str):
40
- src = OUT_DIR / filename
41
- dst = OUT_DIR_VER / filename
42
- try:
43
- if src.exists():
44
- shutil.copyfile(src, dst)
45
- except Exception as e:
46
- print(f"Versioned copy failed for {filename}: {e}")
47
-
48
-
49
- # ======================================================
50
- # 3. Load Data
51
- # ======================================================
52
-
53
- # Improve null parsing and schema inference so textual placeholders like "NA" become proper nulls
54
- NULL_TOKENS = ["", "NULL", "Null", "null", "NA", "N/A", "na", "NaN", "nan", "-", "--"]
55
-
56
- cases = pl.read_csv(
57
- CASES_FILE,
58
- try_parse_dates=True,
59
- null_values=NULL_TOKENS,
60
- infer_schema_length=100_000,
61
- )
62
- hearings = pl.read_csv(
63
- HEAR_FILE,
64
- try_parse_dates=True,
65
- null_values=NULL_TOKENS,
66
- infer_schema_length=100_000,
67
- )
68
-
69
- print(f"Cases shape: {cases.shape}")
70
- print(f"Hearings shape: {hearings.shape}")
71
-
72
- # ======================================================
73
- # 4. Basic Cleaning
74
- # ======================================================
75
- for col in ["DATE_FILED", "DECISION_DATE", "REGISTRATION_DATE"]:
76
- if col in cases.columns and cases[col].dtype == pl.Utf8:
77
- cases = cases.with_columns(pl.col(col).str.strptime(pl.Date, "%d-%m-%Y", strict=False))
78
-
79
- cases = cases.unique(subset=["CNR_NUMBER"])
80
- hearings = hearings.unique(subset=["Hearing_ID"])
81
-
82
-
83
- # Canonicalize key categorical/text fields and coerce common placeholder strings to nulls
84
- def _norm_text_col(df: pl.DataFrame, col: str) -> pl.DataFrame:
85
- if col not in df.columns:
86
- return df
87
- return df.with_columns(
88
- pl.when(
89
- pl.col(col)
90
- .cast(pl.Utf8)
91
- .str.strip_chars()
92
- .str.to_uppercase()
93
- .is_in(["", "NA", "N/A", "NULL", "NONE", "-", "--"])
94
- )
95
- .then(pl.lit(None))
96
- .otherwise(pl.col(col).cast(pl.Utf8).str.strip_chars().str.to_uppercase())
97
- .alias(col)
98
- )
99
-
100
-
101
- # Normalize CASE_TYPE early
102
- cases = _norm_text_col(cases, "CASE_TYPE")
103
-
104
- # Normalize stage and purpose/judge text on hearings
105
- for c in [
106
- "Remappedstages",
107
- "PurposeofHearing",
108
- "NJDG_JUDGE_NAME",
109
- "BeforeHonourableJudge",
110
- ]:
111
- hearings = _norm_text_col(hearings, c)
112
-
113
- # Fix frequent stage aliases/typos into canonical labels
114
- if "Remappedstages" in hearings.columns:
115
- STAGE_MAP = {
116
- "ORDERS/JUDGMENTS": "ORDERS / JUDGMENT",
117
- "ORDER/JUDGMENT": "ORDERS / JUDGMENT",
118
- "ORDERS / JUDGMENT": "ORDERS / JUDGMENT",
119
- "ORDERS /JUDGMENT": "ORDERS / JUDGMENT",
120
- "ORDERS/JUDGMENT": "ORDERS / JUDGMENT",
121
- "INTERLOCUTARY APPLICATION": "INTERLOCUTORY APPLICATION",
122
- "FRAMING OF CHARGE": "FRAMING OF CHARGES",
123
- "PRE ADMISSION": "PRE-ADMISSION",
124
- }
125
- hearings = hearings.with_columns(
126
- pl.col("Remappedstages")
127
- .map_elements(lambda x: STAGE_MAP.get(x, x) if x is not None else None)
128
- .alias("Remappedstages")
129
- )
130
-
131
- # ======================================================
132
- # 5. Derived Features
133
- # ======================================================
134
-
135
- # --- Disposal duration (use provided DISPOSALTIME_ADJ) ---
136
- # The dataset already contains the adjusted disposal time; normalize dtype only.
137
- if "DISPOSALTIME_ADJ" in cases.columns:
138
- cases = cases.with_columns(pl.col("DISPOSALTIME_ADJ").cast(pl.Int32))
139
-
140
- # --- Filing / Decision Years ---
141
- cases = cases.with_columns(
142
- [
143
- pl.col("DATE_FILED").dt.year().alias("YEAR_FILED"),
144
- pl.col("DECISION_DATE").dt.year().alias("YEAR_DECISION"),
145
- ]
146
- )
147
-
148
- # --- Hearing count per case ---
149
- hearing_freq = hearings.group_by("CNR_NUMBER").agg(pl.count("BusinessOnDate").alias("N_HEARINGS"))
150
- cases = cases.join(hearing_freq, on="CNR_NUMBER", how="left")
151
-
152
- # --- For each CNR case, we have multiple hearings, so we need to calculate the average hearing gap.
153
- # We have BusinessOnDate column, which represents the date of each hearing for that case. So for each case,
154
- # we can calculate the difference between consecutive hearings and then take the mean of these differences to get the average hearing gap.
155
- hearings = (
156
- hearings.filter(pl.col("BusinessOnDate").is_not_null()) # remove unusable rows
157
- .sort(["CNR_NUMBER", "BusinessOnDate"]) # chronological within case
158
- .with_columns(
159
- ((pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1)) / timedelta(days=1))
160
- .over("CNR_NUMBER")
161
- .alias("HEARING_GAP_DAYS")
162
- )
163
- )
164
- gap_summary = hearings.group_by("CNR_NUMBER").agg(pl.mean("HEARING_GAP_DAYS").alias("AVG_GAP"))
165
- cases = cases.join(gap_summary, on="CNR_NUMBER", how="left")
166
-
167
- # replace null in N_HEARINGS and AVG_GAP columns with 0
168
- cases = cases.with_columns(
169
- pl.col("N_HEARINGS").fill_null(0).cast(pl.Int64),
170
- pl.col("AVG_GAP").fill_null(0.0).fill_nan(0.0).cast(pl.Float64),
171
- )
172
-
173
- print("\n=== Feature Summary ===")
174
- print(cases.select(["CASE_TYPE", "DISPOSALTIME_ADJ", "N_HEARINGS", "AVG_GAP"]).describe())
175
-
176
- cases_pd = cases.to_pandas()
177
- hearings_pd = hearings.to_pandas()
178
-
179
- # ======================================================
180
- # 6. Interactive Visualizations
181
- # ======================================================
182
-
183
- # 1. Case Type Distribution
184
- fig1 = px.bar(
185
- cases_pd,
186
- x="CASE_TYPE",
187
- color="CASE_TYPE",
188
- title="Case Type Distribution",
189
- )
190
- fig1.update_layout(showlegend=False, xaxis_title="Case Type", yaxis_title="Number of Cases")
191
- fig1.write_html(OUT_DIR / "1_case_type_distribution.html")
192
- _copy_to_versioned("1_case_type_distribution.html")
193
- # fig1.show()
194
-
195
- # 2. Filing Trends by Year
196
- year_counts = cases_pd.groupby("YEAR_FILED")["CNR_NUMBER"].count().reset_index(name="Count")
197
- fig2 = px.line(year_counts, x="YEAR_FILED", y="Count", markers=True, title="Cases Filed by Year")
198
- fig2.update_traces(line_color="royalblue")
199
- fig2.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
200
- fig2.write_html(OUT_DIR / "2_cases_filed_by_year.html")
201
- _copy_to_versioned("2_cases_filed_by_year.html")
202
- # fig2.show()
203
-
204
- # 3. Disposal Duration Distribution
205
- fig3 = px.histogram(
206
- cases_pd,
207
- x="DISPOSALTIME_ADJ",
208
- nbins=50,
209
- title="Distribution of Disposal Time (Adjusted Days)",
210
- color_discrete_sequence=["indianred"],
211
- )
212
- fig3.update_layout(xaxis_title="Days", yaxis_title="Cases")
213
- fig3.write_html(OUT_DIR / "3_disposal_time_distribution.html")
214
- _copy_to_versioned("3_disposal_time_distribution.html")
215
- # fig3.show()
216
-
217
- # 4. Hearings vs Disposal Time
218
- fig4 = px.scatter(
219
- cases_pd,
220
- x="N_HEARINGS",
221
- y="DISPOSALTIME_ADJ",
222
- color="CASE_TYPE",
223
- hover_data=["CNR_NUMBER", "YEAR_FILED"],
224
- title="Hearings vs Disposal Duration",
225
- )
226
- fig4.update_traces(marker=dict(size=6, opacity=0.7))
227
- fig4.write_html(OUT_DIR / "4_hearings_vs_disposal.html")
228
- _copy_to_versioned("4_hearings_vs_disposal.html")
229
- # fig4.show()
230
-
231
- # 5. Boxplot by Case Type
232
- fig5 = px.box(
233
- cases_pd,
234
- x="CASE_TYPE",
235
- y="DISPOSALTIME_ADJ",
236
- color="CASE_TYPE",
237
- title="Disposal Time (Adjusted) by Case Type",
238
- )
239
- fig5.update_layout(showlegend=False)
240
- fig5.write_html(OUT_DIR / "5_box_disposal_by_type.html")
241
- _copy_to_versioned("5_box_disposal_by_type.html")
242
- # fig5.show()
243
-
244
- # 7. Judge Workload
245
- if "H" in cases_pd.columns:
246
- judge_load = (
247
- cases_pd.groupby("BeforeHonourableJudge")["CNR_NUMBER"]
248
- .count()
249
- .reset_index(name="Count")
250
- .sort_values("Count", ascending=False)
251
- .head(20)
252
- )
253
- fig7 = px.bar(
254
- judge_load,
255
- x="BeforeHonourableJudge",
256
- y="Count",
257
- title="Top 20 Judges by Number of Cases Disposed",
258
- )
259
- fig7.update_layout(xaxis_title="Judge", yaxis_title="Cases")
260
- fig7.write_html(OUT_DIR / "7_judge_workload.html")
261
- _copy_to_versioned("7_judge_workload.html")
262
- fig7.show()
263
-
264
- # 8. Stage Frequency
265
- if "Remappedstages" in hearings_pd.columns:
266
- stage_counts = hearings_pd["Remappedstages"].value_counts().reset_index()
267
- stage_counts.columns = ["Stage", "Count"]
268
- fig8 = px.bar(
269
- stage_counts,
270
- x="Stage",
271
- y="Count",
272
- color="Stage",
273
- title="Frequency of Hearing Stages",
274
- )
275
- fig8.update_layout(showlegend=False, xaxis_title="Stage", yaxis_title="Count")
276
- fig8.write_html(OUT_DIR / "8_stage_frequency.html")
277
- _copy_to_versioned("8_stage_frequency.html")
278
- fig8.show()
279
-
280
- print("\nAll interactive plots saved to:", OUT_DIR.resolve())
281
-
282
- # ======================================================
283
- # 7. Extended EDA: Data Audit, Linkage, Gaps, Stages, Cohorts, Seasonality, Purpose, Workload,
284
- # Bottlenecks
285
- # ======================================================
286
-
287
- # 7.1 Data Audit & Schema Checks
288
- print("\n=== Column dtypes (cases) ===")
289
- print(cases.dtypes)
290
- print("\n=== Column dtypes (hearings) ===")
291
- print(hearings.dtypes)
292
-
293
-
294
- def null_summary(df: pl.DataFrame, name: str):
295
- ns = df.select(
296
- [
297
- pl.lit(name).alias("TABLE"),
298
- pl.len().alias("ROWS"),
299
- ]
300
- + [pl.col(c).is_null().sum().alias(f"{c}__nulls") for c in df.columns]
301
- )
302
- print(f"\n=== Null summary ({name}) ===")
303
- print(ns)
304
-
305
-
306
- null_summary(cases, "cases")
307
- null_summary(hearings, "hearings")
308
-
309
- # Duplicate keys
310
- print("\n=== Duplicates check ===")
311
- try:
312
- print(
313
- "Cases dup CNR_NUMBER: unique vs total ->",
314
- cases.select(
315
- pl.col("CNR_NUMBER").n_unique().alias("unique"), pl.len().alias("total")
316
- ).to_dict(as_series=False),
317
- )
318
- except Exception as e:
319
- print("Cases duplicate check error:", e)
320
- try:
321
- print(
322
- "Hearings dup Hearing_ID: unique vs total ->",
323
- hearings.select(
324
- pl.col("Hearing_ID").n_unique().alias("unique"), pl.len().alias("total")
325
- ).to_dict(as_series=False),
326
- )
327
- except Exception as e:
328
- print("Hearings duplicate check error:", e)
329
-
330
- # Key integrity: every hearing must map to a case
331
- if "CNR_NUMBER" in hearings.columns:
332
- missed = hearings.join(cases.select("CNR_NUMBER"), on="CNR_NUMBER", how="anti")
333
- print("Unmapped hearings -> cases:", missed.height)
334
-
335
- # 7.2 Consistency & Timeline Checks
336
- neg_disp = (
337
- cases.filter(pl.col("DISPOSALTIME_ADJ") < 1)
338
- if "DISPOSALTIME_ADJ" in cases.columns
339
- else pl.DataFrame()
340
- )
341
- print(
342
- "Negative/zero disposal adjusted days rows:",
343
- neg_disp.height if isinstance(neg_disp, pl.DataFrame) else 0,
344
- )
345
-
346
- if (
347
- set(["DATE_FILED", "DECISION_DATE"]).issubset(cases.columns)
348
- and "BusinessOnDate" in hearings.columns
349
- ):
350
- h2 = hearings.join(
351
- cases.select(["CNR_NUMBER", "DATE_FILED", "DECISION_DATE"]),
352
- on="CNR_NUMBER",
353
- how="left",
354
- )
355
- # Categorize anomalies for better diagnosis
356
- before_filed = h2.filter(
357
- pl.col("BusinessOnDate").is_not_null()
358
- & pl.col("DATE_FILED").is_not_null()
359
- & (pl.col("BusinessOnDate") < pl.col("DATE_FILED"))
360
- )
361
- after_decision = h2.filter(
362
- pl.col("BusinessOnDate").is_not_null()
363
- & pl.col("DECISION_DATE").is_not_null()
364
- & (pl.col("BusinessOnDate") > pl.col("DECISION_DATE"))
365
- )
366
- missing_bounds = h2.filter(
367
- pl.col("BusinessOnDate").is_not_null()
368
- & (pl.col("DATE_FILED").is_null() | pl.col("DECISION_DATE").is_null())
369
- )
370
- print(
371
- "Hearings outside case lifecycle:",
372
- before_filed.height + after_decision.height,
373
- "(before_filed=",
374
- before_filed.height,
375
- ", after_decision=",
376
- after_decision.height,
377
- ", missing_bounds=",
378
- missing_bounds.height,
379
- ")",
380
- )
381
-
382
- # 7.3 Rich Hearing Gap Statistics
383
- if "BusinessOnDate" in hearings.columns and "CNR_NUMBER" in hearings.columns:
384
- hearing_gaps = (
385
- hearings.filter(pl.col("BusinessOnDate").is_not_null())
386
- .sort(["CNR_NUMBER", "BusinessOnDate"])
387
- .with_columns(
388
- ((pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1)) / timedelta(days=1))
389
- .over("CNR_NUMBER")
390
- .alias("HEARING_GAP_DAYS")
391
- )
392
- )
393
- gap_stats = hearing_gaps.group_by("CNR_NUMBER").agg(
394
- [
395
- pl.col("HEARING_GAP_DAYS").mean().alias("GAP_MEAN"),
396
- pl.col("HEARING_GAP_DAYS").median().alias("GAP_MEDIAN"),
397
- pl.col("HEARING_GAP_DAYS").quantile(0.25).alias("GAP_P25"),
398
- pl.col("HEARING_GAP_DAYS").quantile(0.75).alias("GAP_P75"),
399
- pl.col("HEARING_GAP_DAYS").std(ddof=1).alias("GAP_STD"),
400
- pl.col("HEARING_GAP_DAYS").count().alias("N_GAPS"),
401
- ]
402
- )
403
- cases = cases.join(gap_stats, on="CNR_NUMBER", how="left")
404
-
405
- # Plot: Median hearing gap by case type
406
- try:
407
- fig_gap = px.box(
408
- cases.to_pandas(),
409
- x="CASE_TYPE",
410
- y="GAP_MEDIAN",
411
- points=False,
412
- title="Median Hearing Gap by Case Type",
413
- )
414
- fig_gap.write_html(OUT_DIR / "9_gap_median_by_type.html")
415
- _copy_to_versioned("9_gap_median_by_type.html")
416
- except Exception as e:
417
- print("Gap median plot error:", e)
418
-
419
- # 7.4 Stage Transitions & Durations
420
- stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None
421
- transitions = None
422
- stage_duration = None
423
- if stage_col:
424
- # Define a canonical stage order to enforce left-to-right Sankey
425
- STAGE_ORDER = [
426
- "PRE-ADMISSION",
427
- "ADMISSION",
428
- "FRAMING OF CHARGES",
429
- "EVIDENCE",
430
- "ARGUMENTS",
431
- "INTERLOCUTORY APPLICATION",
432
- "SETTLEMENT",
433
- "ORDERS / JUDGMENT",
434
- "FINAL DISPOSAL",
435
- "OTHER",
436
- "NA",
437
- ]
438
-
439
- h_stage = (
440
- hearings.filter(pl.col("BusinessOnDate").is_not_null())
441
- .sort(["CNR_NUMBER", "BusinessOnDate"])
442
- .with_columns(
443
- [
444
- pl.col(stage_col)
445
- .fill_null("NA")
446
- .map_elements(
447
- lambda s: s if s in STAGE_ORDER else ("OTHER" if s is not None else "NA")
448
- )
449
- .alias("STAGE"),
450
- pl.col("BusinessOnDate").alias("DT"),
451
- ]
452
- )
453
- .with_columns(
454
- [
455
- (pl.col("STAGE") != pl.col("STAGE").shift(1))
456
- .over("CNR_NUMBER")
457
- .alias("STAGE_CHANGE"),
458
- ]
459
- )
460
- )
461
-
462
- # All transitions from row i to i+1 within case
463
- transitions_raw = (
464
- h_stage.with_columns(
465
- [
466
- pl.col("STAGE").alias("STAGE_FROM"),
467
- pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"),
468
- ]
469
- )
470
- .filter(pl.col("STAGE_TO").is_not_null())
471
- .group_by(["STAGE_FROM", "STAGE_TO"])
472
- .agg(pl.len().alias("N"))
473
- .sort("N", descending=True)
474
- )
475
-
476
- # Filter to non-regressive or same-stage transitions based on STAGE_ORDER index
477
- order_idx = {s: i for i, s in enumerate(STAGE_ORDER)}
478
- transitions = transitions_raw.filter(
479
- pl.col("STAGE_FROM").map_elements(lambda s: order_idx.get(s, 10))
480
- <= pl.col("STAGE_TO").map_elements(lambda s: order_idx.get(s, 10))
481
- )
482
-
483
- print("\nTop stage transitions (filtered, head):\n", transitions.head(20))
484
-
485
- # Run-lengths by stage to estimate time-in-stage
486
- runs = (
487
- h_stage.with_columns(
488
- [
489
- pl.when(pl.col("STAGE_CHANGE"))
490
- .then(1)
491
- .otherwise(0)
492
- .cum_sum()
493
- .over("CNR_NUMBER")
494
- .alias("RUN_ID")
495
- ]
496
- )
497
- .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"])
498
- .agg(
499
- [
500
- pl.col("DT").min().alias("RUN_START"),
501
- pl.col("DT").max().alias("RUN_END"),
502
- pl.len().alias("HEARINGS_IN_RUN"),
503
- ]
504
- )
505
- .with_columns(
506
- ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias("RUN_DAYS")
507
- )
508
- )
509
- stage_duration = (
510
- runs.group_by("STAGE")
511
- .agg(
512
- [
513
- pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"),
514
- pl.col("RUN_DAYS").mean().alias("RUN_MEAN_DAYS"),
515
- pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"),
516
- pl.len().alias("N_RUNS"),
517
- ]
518
- )
519
- .sort("RUN_MEDIAN_DAYS", descending=True)
520
- )
521
- print("\nStage duration summary:\n", stage_duration)
522
-
523
- # Sankey with ordered nodes following STAGE_ORDER
524
- try:
525
- tr_df = transitions.to_pandas()
526
- labels = [
527
- s for s in STAGE_ORDER if s in set(tr_df["STAGE_FROM"]).union(set(tr_df["STAGE_TO"]))
528
- ]
529
- idx = {l: i for i, l in enumerate(labels)}
530
- tr_df = tr_df[tr_df["STAGE_FROM"].isin(labels) & tr_df["STAGE_TO"].isin(labels)].copy()
531
- tr_df = tr_df.sort_values(by=["STAGE_FROM", "STAGE_TO"], key=lambda c: c.map(idx))
532
- sankey = go.Figure(
533
- data=[
534
- go.Sankey(
535
- arrangement="snap",
536
- node=dict(label=labels, pad=15, thickness=18),
537
- link=dict(
538
- source=tr_df["STAGE_FROM"].map(idx).tolist(),
539
- target=tr_df["STAGE_TO"].map(idx).tolist(),
540
- value=tr_df["N"].tolist(),
541
- ),
542
- )
543
- ]
544
- )
545
- sankey.update_layout(title_text="Stage Transition Sankey (Ordered, non-regressive)")
546
- sankey.write_html(OUT_DIR / "10_stage_transition_sankey.html")
547
- _copy_to_versioned("10_stage_transition_sankey.html")
548
- except Exception as e:
549
- print("Sankey error:", e)
550
-
551
- # Bottleneck impact bar
552
- try:
553
- st_pd = stage_duration.with_columns(
554
- (pl.col("RUN_MEDIAN_DAYS") * pl.col("N_RUNS")).alias("IMPACT")
555
- ).to_pandas()
556
- fig_b = px.bar(
557
- st_pd.sort_values("IMPACT", ascending=False),
558
- x="STAGE",
559
- y="IMPACT",
560
- title="Stage Bottleneck Impact (Median Days x Runs)",
561
- )
562
- fig_b.write_html(OUT_DIR / "15_bottleneck_impact.html")
563
- _copy_to_versioned("15_bottleneck_impact.html")
564
- except Exception as e:
565
- print("Bottleneck plot error:", e)
566
-
567
- # 7.5 Cohort Analysis by Filing Year & Case Type
568
- if "YEAR_FILED" in cases.columns and "CASE_TYPE" in cases.columns:
569
- cohort = (
570
- cases.filter(pl.col("YEAR_FILED").is_not_null())
571
- .group_by(["YEAR_FILED", "CASE_TYPE"])
572
- .agg(
573
- [
574
- pl.col("DISPOSALTIME_ADJ").count().alias("N"),
575
- pl.col("DISPOSALTIME_ADJ").median().alias("Q50"),
576
- pl.col("DISPOSALTIME_ADJ").quantile(0.9).alias("Q90"),
577
- pl.col("DISPOSALTIME_ADJ").mean().alias("MEAN"),
578
- ]
579
- )
580
- .sort(["YEAR_FILED", "CASE_TYPE"])
581
- )
582
- try:
583
- fig_c = px.line(
584
- cohort.to_pandas(),
585
- x="YEAR_FILED",
586
- y="Q50",
587
- color="CASE_TYPE",
588
- title="Median Disposal Days by Filing Year & Case Type",
589
- )
590
- fig_c.write_html(OUT_DIR / "13_cohort_median_disposal.html")
591
- _copy_to_versioned("13_cohort_median_disposal.html")
592
- except Exception as e:
593
- print("Cohort plot error:", e)
594
-
595
- # 7.6 Seasonality & Calendar Effects
596
- if "BusinessOnDate" in hearings.columns:
597
- m_hear = (
598
- hearings.filter(pl.col("BusinessOnDate").is_not_null())
599
- .with_columns(
600
- [
601
- pl.col("BusinessOnDate").dt.year().alias("Y"),
602
- pl.col("BusinessOnDate").dt.month().alias("M"),
603
- ]
604
- )
605
- .with_columns(
606
- [
607
- # First day of month date for plotting
608
- pl.date(pl.col("Y"), pl.col("M"), pl.lit(1)).alias("YM")
609
- ]
610
- )
611
- )
612
- monthly_listings = m_hear.group_by("YM").agg(pl.len().alias("N_HEARINGS")).sort("YM")
613
- try:
614
- fig_m = px.line(
615
- monthly_listings.to_pandas(), x="YM", y="N_HEARINGS", title="Monthly Hearings Listed"
616
- )
617
- fig_m.update_layout(yaxis=dict(tickformat=",d"))
618
- fig_m.write_html(OUT_DIR / "11_monthly_hearings.html")
619
- _copy_to_versioned("11_monthly_hearings.html")
620
- except Exception as e:
621
- print("Monthly listings plot error:", e)
622
-
623
- # Waterfall: month-over-month change with anomaly flags
624
- try:
625
- ml = monthly_listings.with_columns(
626
- [
627
- pl.col("N_HEARINGS").shift(1).alias("PREV"),
628
- (pl.col("N_HEARINGS") - pl.col("N_HEARINGS").shift(1)).alias("DELTA"),
629
- ]
630
- )
631
- ml_pd = ml.to_pandas()
632
- # Rolling z-score over 12-month window for anomaly detection
633
- ml_pd["ROLL_MEAN"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).mean()
634
- ml_pd["ROLL_STD"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).std()
635
- ml_pd["Z"] = (ml_pd["N_HEARINGS"] - ml_pd["ROLL_MEAN"]) / ml_pd["ROLL_STD"]
636
- ml_pd["ANOM"] = ml_pd["Z"].abs() >= 3.0
637
-
638
- # Build waterfall values: first is absolute level, others are deltas
639
- measures = ["relative"] * len(ml_pd)
640
- measures[0] = "absolute"
641
- y_vals = ml_pd["DELTA"].astype(float).fillna(ml_pd["N_HEARINGS"].astype(float)).tolist()
642
- fig_w = go.Figure(
643
- go.Waterfall(
644
- x=ml_pd["YM"],
645
- measure=measures,
646
- y=y_vals,
647
- text=[f"{int(v):,}" if pd.notnull(v) else "" for v in ml_pd["N_HEARINGS"]],
648
- increasing=dict(marker=dict(color="seagreen")),
649
- decreasing=dict(marker=dict(color="indianred")),
650
- connector={"line": {"color": "rgb(110,110,110)"}},
651
- )
652
- )
653
- # Highlight anomalies as red markers on top
654
- fig_w.add_trace(
655
- go.Scatter(
656
- x=ml_pd.loc[ml_pd["ANOM"], "YM"],
657
- y=ml_pd.loc[ml_pd["ANOM"], "N_HEARINGS"],
658
- mode="markers",
659
- marker=dict(color="crimson", size=8),
660
- name="Anomaly (|z|>=3)",
661
- )
662
- )
663
- fig_w.update_layout(
664
- title="Monthly Hearings Waterfall (MoM change) with Anomalies",
665
- yaxis=dict(tickformat=",d"),
666
- )
667
- fig_w.write_html(OUT_DIR / "11b_monthly_waterfall.html")
668
- _copy_to_versioned("11b_monthly_waterfall.html")
669
-
670
- # Export anomalies CSV
671
- ml_pd_out = ml_pd.copy()
672
- ml_pd_out["YM"] = ml_pd_out["YM"].astype(str)
673
- ml_pd_out.to_csv(OUT_DIR / "monthly_anomalies.csv", index=False)
674
- _copy_to_versioned("monthly_anomalies.csv")
675
- except Exception as e:
676
- print("Monthly waterfall error:", e)
677
-
678
- # DOW x Month heatmap
679
- dow_heat = (
680
- hearings.filter(pl.col("BusinessOnDate").is_not_null())
681
- .with_columns(
682
- [
683
- pl.col("BusinessOnDate").dt.weekday().alias("DOW"),
684
- pl.col("BusinessOnDate").dt.month().alias("MONTH"),
685
- ]
686
- )
687
- .group_by(["MONTH", "DOW"])
688
- .agg(pl.len().alias("N"))
689
- )
690
- try:
691
- fig_heat = px.density_heatmap(
692
- dow_heat.to_pandas(), x="DOW", y="MONTH", z="N", title="Hearings by Weekday and Month"
693
- )
694
- fig_heat.write_html(OUT_DIR / "16_dow_month_heatmap.html")
695
- _copy_to_versioned("16_dow_month_heatmap.html")
696
- except Exception as e:
697
- print("DOW-Month heatmap error:", e)
698
-
699
- # 7.7 Purpose Text Normalization & Tagging
700
- text_col = None
701
- for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]:
702
- if c in hearings.columns:
703
- text_col = c
704
- break
705
-
706
-
707
- def _has_kw_expr(col: str, kws: list[str]):
708
- expr = None
709
- for k in kws:
710
- e = pl.col(col).str.contains(k)
711
- expr = e if expr is None else (expr | e)
712
- return (expr if expr is not None else pl.lit(False)).fill_null(False)
713
-
714
-
715
- if text_col:
716
- hear_txt = hearings.with_columns(
717
- [pl.col(text_col).cast(pl.Utf8).str.strip_chars().str.to_uppercase().alias("PURPOSE_TXT")]
718
- )
719
- async_kw = ["NON-COMPLIANCE", "OFFICE OBJECTION", "COMPLIANCE", "NOTICE", "SERVICE", "LISTING"]
720
- subs_kw = ["EVIDENCE", "ARGUMENT", "FINAL HEARING", "JUDGMENT", "ORDER", "DISPOSAL"]
721
- hear_txt = hear_txt.with_columns(
722
- [
723
- pl.when(_has_kw_expr("PURPOSE_TXT", async_kw))
724
- .then(pl.lit("ASYNC_OR_ADMIN"))
725
- .when(_has_kw_expr("PURPOSE_TXT", subs_kw))
726
- .then(pl.lit("SUBSTANTIVE"))
727
- .otherwise(pl.lit("UNKNOWN"))
728
- .alias("PURPOSE_TAG")
729
- ]
730
- )
731
- tag_share = (
732
- hear_txt.group_by(["CASE_TYPE", "PURPOSE_TAG"])
733
- .agg(pl.len().alias("N"))
734
- .with_columns((pl.col("N") / pl.col("N").sum().over("CASE_TYPE")).alias("SHARE"))
735
- .sort(["CASE_TYPE", "SHARE"], descending=[False, True])
736
- )
737
- try:
738
- fig_t = px.bar(
739
- tag_share.to_pandas(),
740
- x="CASE_TYPE",
741
- y="SHARE",
742
- color="PURPOSE_TAG",
743
- title="Purpose Tag Shares by Case Type",
744
- barmode="stack",
745
- )
746
- fig_t.write_html(OUT_DIR / "14_purpose_tag_shares.html")
747
- _copy_to_versioned("14_purpose_tag_shares.html")
748
- except Exception as e:
749
- print("Purpose shares plot error:", e)
750
-
751
- # 7.8 Judge/Day Workload & Throughput (use hearing-level judge only)
752
- judge_col = None
753
- for c in [
754
- "BeforeHonourableJudge",
755
- ]:
756
- if c in hearings.columns:
757
- judge_col = c
758
- break
759
-
760
- if judge_col and "BusinessOnDate" in hearings.columns:
761
- jday = (
762
- hearings.filter(pl.col("BusinessOnDate").is_not_null())
763
- .group_by([judge_col, "BusinessOnDate"])
764
- .agg(pl.len().alias("N_HEARINGS"))
765
- )
766
- try:
767
- fig_j = px.box(
768
- jday.to_pandas(), x=judge_col, y="N_HEARINGS", title="Per-day Hearings per Judge"
769
- )
770
- fig_j.update_layout(
771
- xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
772
- )
773
- fig_j.write_html(OUT_DIR / "12_judge_day_load.html")
774
- _copy_to_versioned("12_judge_day_load.html")
775
- except Exception as e:
776
- print("Judge day load plot error:", e)
777
-
778
- # Court/day workload if courtroom columns are present
779
- court_col = None
780
- for cc in ["COURT_NUMBER", "COURT_NAME"]:
781
- if cc in hearings.columns:
782
- court_col = cc
783
- break
784
- if court_col:
785
- cday = (
786
- hearings.filter(pl.col("BusinessOnDate").is_not_null())
787
- .group_by([court_col, "BusinessOnDate"])
788
- .agg(pl.len().alias("N_HEARINGS"))
789
- )
790
- try:
791
- fig_court = px.box(
792
- cday.to_pandas(),
793
- x=court_col,
794
- y="N_HEARINGS",
795
- title="Per-day Hearings per Courtroom",
796
- )
797
- fig_court.update_layout(
798
- xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
799
- )
800
- fig_court.write_html(OUT_DIR / "12b_court_day_load.html")
801
- _copy_to_versioned("12b_court_day_load.html")
802
- except Exception as e:
803
- print("Court day load plot error:", e)
804
-
805
- # 7.9 Bottlenecks & Outliers
806
- try:
807
- long_tail = (
808
- cases.sort("DISPOSALTIME_ADJ", descending=True)
809
- .select(
810
- ["CNR_NUMBER", "CASE_TYPE", "DISPOSALTIME_ADJ", "N_HEARINGS", "GAP_MEDIAN", "GAP_P75"]
811
- )
812
- .head(50)
813
- )
814
- print("\nLongest disposal cases (top 50):\n", long_tail)
815
- except Exception as e:
816
- print("Long-tail extraction error:", e)
817
-
818
- if transitions is not None:
819
- try:
820
- self_transitions = (
821
- transitions.filter(pl.col("STAGE_FROM") == pl.col("STAGE_TO"))
822
- .select(pl.sum("N"))
823
- .to_series()[0]
824
- )
825
- print(
826
- "Self-transitions (same stage repeated):",
827
- int(self_transitions) if self_transitions is not None else 0,
828
- )
829
- except Exception as e:
830
- print("Self-transition calc error:", e)
831
-
832
- # 7.9.b Feature outliers (overall and within type)
833
- try:
834
- # Compute z-scores within CASE_TYPE for selected features
835
- feat_cols = ["DISPOSALTIME_ADJ", "N_HEARINGS", "GAP_MEDIAN", "GAP_STD"]
836
- df = cases
837
- for fc in feat_cols:
838
- if fc not in df.columns:
839
- df = df.with_columns(pl.lit(None).alias(fc))
840
- z_within = df.with_columns(
841
- [
842
- (
843
- (pl.col(fc) - pl.col(fc).mean().over("CASE_TYPE"))
844
- / pl.col(fc).std().over("CASE_TYPE")
845
- ).alias(f"Z_{fc}_TYPE")
846
- for fc in feat_cols
847
- ]
848
- )
849
- # Flag outliers for |z|>=3
850
- z_within = z_within.with_columns(
851
- [(pl.col(f"Z_{fc}_TYPE").abs() >= 3.0).alias(f"OUT_{fc}_TYPE") for fc in feat_cols]
852
- )
853
-
854
- # Collect existing outlier flag columns and filter rows with any outlier
855
- outlier_cols = [f"OUT_{fc}_TYPE" for fc in feat_cols if f"OUT_{fc}_TYPE" in z_within.columns]
856
- out_any = z_within.filter(pl.any_horizontal(*[pl.col(col) for col in outlier_cols]))
857
-
858
- out_path = OUT_DIR / "feature_outliers.csv"
859
- out_any.select(
860
- ["CNR_NUMBER", "CASE_TYPE"] + feat_cols + [f"Z_{fc}_TYPE" for fc in feat_cols]
861
- ).write_csv(out_path)
862
- _copy_to_versioned("feature_outliers.csv")
863
- print("Feature outliers exported to", out_path.resolve())
864
- except Exception as e:
865
- print("Feature outliers error:", e)
866
-
867
- # 7.10 Scheduler-ready Features & Alerts
868
- if "BusinessOnDate" in hearings.columns:
869
- h_latest = (
870
- hearings.filter(pl.col("BusinessOnDate").is_not_null())
871
- .sort(["CNR_NUMBER", "BusinessOnDate"])
872
- .group_by("CNR_NUMBER")
873
- .agg(
874
- [
875
- pl.col("BusinessOnDate").max().alias("LAST_HEARING"),
876
- (pl.col(stage_col).last() if stage_col else pl.lit(None)).alias("LAST_STAGE"),
877
- (pl.col(stage_col).n_unique() if stage_col else pl.lit(None)).alias(
878
- "N_DISTINCT_STAGES"
879
- ),
880
- ]
881
- )
882
- )
883
- cases = cases.join(h_latest, on="CNR_NUMBER", how="left")
884
-
885
- cases = cases.with_columns(
886
- [
887
- pl.when(pl.col("N_HEARINGS") > 50).then(50).otherwise(pl.col("N_HEARINGS")).alias("NH_CAP"),
888
- pl.when(pl.col("GAP_MEDIAN").is_null() | (pl.col("GAP_MEDIAN") <= 0))
889
- .then(999.0)
890
- .otherwise(pl.col("GAP_MEDIAN"))
891
- .alias("GAPM_SAFE"),
892
- ]
893
- )
894
-
895
- # Clamp GAPM_SAFE to 100 in a separate step (Polars may not allow referencing columns
896
- # created within the same with_columns call across expressions in older versions)
897
- cases = cases.with_columns(
898
- [
899
- pl.when(pl.col("GAPM_SAFE") > 100)
900
- .then(100.0)
901
- .otherwise(pl.col("GAPM_SAFE"))
902
- .alias("GAPM_CLAMP"),
903
- ]
904
- )
905
-
906
- cases = cases.with_columns(
907
- [
908
- (
909
- # progress term (0-40)
910
- (pl.when((pl.col("NH_CAP") / 50) > 1.0).then(1.0).otherwise(pl.col("NH_CAP") / 50) * 40)
911
- # momentum term (0-30)
912
- + (
913
- pl.when((100 / pl.col("GAPM_CLAMP")) > 1.0)
914
- .then(1.0)
915
- .otherwise(100 / pl.col("GAPM_CLAMP"))
916
- * 30
917
- )
918
- # stage bonus (10 or 30)
919
- + pl.when(pl.col("LAST_STAGE").is_in(["ARGUMENTS", "EVIDENCE", "ORDERS / JUDGMENT"]))
920
- .then(30)
921
- .otherwise(10)
922
- ).alias("READINESS_SCORE_RAW")
923
- ]
924
- )
925
- cases = cases.with_columns(
926
- pl.when(pl.col("READINESS_SCORE_RAW") > 100.0)
927
- .then(100.0)
928
- .otherwise(pl.col("READINESS_SCORE_RAW"))
929
- .alias("READINESS_SCORE")
930
- )
931
-
932
- # Diagnostic preview to validate readiness components
933
- try:
934
- print(
935
- "\nREADINESS sample:\n",
936
- cases.select(["NH_CAP", "GAPM_SAFE", "GAPM_CLAMP", "READINESS_SCORE"]).head(5),
937
- )
938
- except Exception as e:
939
- print("Readiness diagnostic error:", e)
940
-
941
- # Alert flags within type
942
- try:
943
- cases = cases.with_columns(
944
- [
945
- (
946
- pl.col("DISPOSALTIME_ADJ")
947
- > pl.col("DISPOSALTIME_ADJ").quantile(0.9).over("CASE_TYPE")
948
- ).alias("ALERT_P90_TYPE"),
949
- (pl.col("N_HEARINGS") > pl.col("N_HEARINGS").quantile(0.9).over("CASE_TYPE")).alias(
950
- "ALERT_HEARING_HEAVY"
951
- ),
952
- (pl.col("GAP_MEDIAN") > pl.col("GAP_MEDIAN").quantile(0.9).over("CASE_TYPE")).alias(
953
- "ALERT_LONG_GAP"
954
- ),
955
- ]
956
- )
957
- except Exception as e:
958
- print("Alert flags calc error:", e)
959
-
960
- # Export compact features CSV
961
- try:
962
- (
963
- cases.select(
964
- [
965
- "CNR_NUMBER",
966
- "CASE_TYPE",
967
- "YEAR_FILED",
968
- "YEAR_DECISION",
969
- "DISPOSALTIME_ADJ",
970
- "N_HEARINGS",
971
- "GAP_MEDIAN",
972
- "GAP_STD",
973
- "LAST_HEARING",
974
- "DAYS_SINCE_LAST_HEARING",
975
- "LAST_STAGE",
976
- "READINESS_SCORE",
977
- "ALERT_P90_TYPE",
978
- "ALERT_HEARING_HEAVY",
979
- "ALERT_LONG_GAP",
980
- ]
981
- )
982
- ).write_csv(OUT_DIR / "cases_features.csv")
983
- print("Exported cases_features.csv to", (OUT_DIR / "cases_features.csv").resolve())
984
- except Exception as e:
985
- print("Export features CSV error:", e)
986
-
987
- # 7.11 Run metadata
988
- try:
989
- meta = {
990
- "version": VERSION,
991
- "timestamp": RUN_TS,
992
- "cases_shape": list(cases.shape),
993
- "hearings_shape": list(hearings.shape),
994
- "cases_columns": cases.columns,
995
- "hearings_columns": hearings.columns,
996
- }
997
- with open(OUT_DIR_VER / "metadata.json", "w", encoding="utf-8") as f:
998
- json.dump(meta, f, indent=2, default=str)
999
- except Exception as e:
1000
- print("Metadata export error:", e)
 
1
+ """Entrypoint to run the full EDA + parameter pipeline.
2
 
3
+ Order:
4
+ 1. Load & clean (save Parquet + metadata)
5
+ 2. Visual EDA (plots + CSV summaries)
6
+ 3. Parameter extraction (JSON/CSV priors + features)
7
  """
8
 
9
+ from src.eda_exploration import run_exploration
10
+ from src.eda_load_clean import run_load_and_clean
11
+ from src.eda_parameters import run_parameter_export
 
 
 
 
12
 
13
+ if __name__ == "__main__":
14
+ print("Step 1/3: Load and clean")
15
+ run_load_and_clean()
 
 
16
 
17
+ print("\nStep 2/3: Exploratory analysis and plots")
18
+ run_exploration()
19
 
20
+ print("\nStep 3/3: Parameter extraction for simulation/scheduler")
21
+ run_parameter_export()
 
 
 
 
 
 
22
 
23
+ print("\nAll steps complete.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/eda_config.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Shared configuration and helpers for EDA pipeline."""
2
+
3
+ import json
4
+ import shutil
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+
8
+ # -------------------------------------------------------------------
9
+ # Paths and versioning
10
+ # -------------------------------------------------------------------
11
+ DATA_DIR = Path("Data")
12
+ CASES_FILE = DATA_DIR / "ISDMHack_Cases_WPfinal.csv"
13
+ HEAR_FILE = DATA_DIR / "ISDMHack_Hear.csv"
14
+
15
+ REPORTS_DIR = Path("reports")
16
+ FIGURES_DIR = REPORTS_DIR / "figures"
17
+ FIGURES_DIR.mkdir(parents=True, exist_ok=True)
18
+
19
+ VERSION = "v0.4.0"
20
+ RUN_TS = datetime.now().strftime("%Y%m%d_%H%M%S")
21
+
22
+ RUN_DIR = FIGURES_DIR / f"{VERSION}_{RUN_TS}"
23
+ RUN_DIR.mkdir(parents=True, exist_ok=True)
24
+
25
+ PARAMS_DIR = RUN_DIR / "params"
26
+ PARAMS_DIR.mkdir(parents=True, exist_ok=True)
27
+
28
+ # cleaned data outputs
29
+ CASES_CLEAN_PARQUET = RUN_DIR / "cases_clean.parquet"
30
+ HEARINGS_CLEAN_PARQUET = RUN_DIR / "hearings_clean.parquet"
31
+
32
+ # -------------------------------------------------------------------
33
+ # Null tokens and canonicalisation
34
+ # -------------------------------------------------------------------
35
+ NULL_TOKENS = ["", "NULL", "Null", "null", "NA", "N/A", "na", "NaN", "nan", "-", "--"]
36
+
37
+
38
+ def copy_to_versioned(filename: str) -> None:
39
+ """Copy a file from FIGURES_DIR to RUN_DIR for versioned snapshots."""
40
+ src = FIGURES_DIR / filename
41
+ dst = RUN_DIR / filename
42
+ try:
43
+ if src.exists():
44
+ shutil.copyfile(src, dst)
45
+ except Exception as e:
46
+ print(f"[WARN] Versioned copy failed for {filename}: {e}")
47
+
48
+
49
+ def write_metadata(meta: dict) -> None:
50
+ """Write run metadata into RUN_DIR/metadata.json."""
51
+ meta_path = RUN_DIR / "metadata.json"
52
+ try:
53
+ with open(meta_path, "w", encoding="utf-8") as f:
54
+ json.dump(meta, f, indent=2, default=str)
55
+ except Exception as e:
56
+ print(f"[WARN] Metadata export error: {e}")
src/eda_exploration.py ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module 2: Visual and descriptive EDA.
2
+
3
+ Responsibilities:
4
+ - Case type distribution, filing trends, disposal distribution.
5
+ - Hearing gap distributions by type.
6
+ - Stage transition Sankey & stage bottlenecks.
7
+ - Cohorts by filing year.
8
+ - Seasonality and monthly anomalies.
9
+ - Judge and courtroom workload.
10
+ - Purpose tags and stage frequency.
11
+
12
+ Inputs:
13
+ - Cleaned Parquet from eda_load_clean.
14
+
15
+ Outputs:
16
+ - Interactive HTML plots in FIGURES_DIR and versioned copies in RUN_DIR.
17
+ - Some CSV summaries (e.g., stage_duration.csv, transitions.csv, monthly_anomalies.csv).
18
+ """
19
+
20
+ from datetime import timedelta
21
+
22
+ import pandas as pd
23
+ import plotly.express as px
24
+ import plotly.graph_objects as go
25
+ import plotly.io as pio
26
+ import polars as pl
27
+ from src.eda_config import (
28
+ CASES_CLEAN_PARQUET,
29
+ FIGURES_DIR,
30
+ HEARINGS_CLEAN_PARQUET,
31
+ RUN_DIR,
32
+ copy_to_versioned,
33
+ )
34
+
35
+ pio.renderers.default = "browser"
36
+
37
+
38
+ def load_cleaned():
39
+ cases = pl.read_parquet(CASES_CLEAN_PARQUET)
40
+ hearings = pl.read_parquet(HEARINGS_CLEAN_PARQUET)
41
+ print("Loaded cleaned data for exploration")
42
+ print("Cases:", cases.shape, "Hearings:", hearings.shape)
43
+ return cases, hearings
44
+
45
+
46
+ def run_exploration() -> None:
47
+ cases, hearings = load_cleaned()
48
+ cases_pd = cases.to_pandas()
49
+ hearings_pd = hearings.to_pandas()
50
+
51
+ # --------------------------------------------------
52
+ # 1. Case Type Distribution
53
+ # --------------------------------------------------
54
+ fig1 = px.bar(
55
+ cases_pd,
56
+ x="CASE_TYPE",
57
+ color="CASE_TYPE",
58
+ title="Case Type Distribution",
59
+ )
60
+ fig1.update_layout(showlegend=False, xaxis_title="Case Type", yaxis_title="Number of Cases")
61
+ f1 = "1_case_type_distribution.html"
62
+ fig1.write_html(FIGURES_DIR / f1)
63
+ copy_to_versioned(f1)
64
+
65
+ # --------------------------------------------------
66
+ # 2. Filing Trends by Year
67
+ # --------------------------------------------------
68
+ if "YEAR_FILED" in cases_pd.columns:
69
+ year_counts = cases_pd.groupby("YEAR_FILED")["CNR_NUMBER"].count().reset_index(name="Count")
70
+ fig2 = px.line(
71
+ year_counts, x="YEAR_FILED", y="Count", markers=True, title="Cases Filed by Year"
72
+ )
73
+ fig2.update_traces(line_color="royalblue")
74
+ fig2.update_layout(xaxis=dict(rangeslider=dict(visible=True)))
75
+ f2 = "2_cases_filed_by_year.html"
76
+ fig2.write_html(FIGURES_DIR / f2)
77
+ copy_to_versioned(f2)
78
+
79
+ # --------------------------------------------------
80
+ # 3. Disposal Duration Distribution
81
+ # --------------------------------------------------
82
+ if "DISPOSALTIME_ADJ" in cases_pd.columns:
83
+ fig3 = px.histogram(
84
+ cases_pd,
85
+ x="DISPOSALTIME_ADJ",
86
+ nbins=50,
87
+ title="Distribution of Disposal Time (Adjusted Days)",
88
+ color_discrete_sequence=["indianred"],
89
+ )
90
+ fig3.update_layout(xaxis_title="Days", yaxis_title="Cases")
91
+ f3 = "3_disposal_time_distribution.html"
92
+ fig3.write_html(FIGURES_DIR / f3)
93
+ copy_to_versioned(f3)
94
+
95
+ # --------------------------------------------------
96
+ # 4. Hearings vs Disposal Time
97
+ # --------------------------------------------------
98
+ if {"N_HEARINGS", "DISPOSALTIME_ADJ"}.issubset(cases_pd.columns):
99
+ fig4 = px.scatter(
100
+ cases_pd,
101
+ x="N_HEARINGS",
102
+ y="DISPOSALTIME_ADJ",
103
+ color="CASE_TYPE",
104
+ hover_data=["CNR_NUMBER", "YEAR_FILED"],
105
+ title="Hearings vs Disposal Duration",
106
+ )
107
+ fig4.update_traces(marker=dict(size=6, opacity=0.7))
108
+ f4 = "4_hearings_vs_disposal.html"
109
+ fig4.write_html(FIGURES_DIR / f4)
110
+ copy_to_versioned(f4)
111
+
112
+ # --------------------------------------------------
113
+ # 5. Boxplot by Case Type
114
+ # --------------------------------------------------
115
+ fig5 = px.box(
116
+ cases_pd,
117
+ x="CASE_TYPE",
118
+ y="DISPOSALTIME_ADJ",
119
+ color="CASE_TYPE",
120
+ title="Disposal Time (Adjusted) by Case Type",
121
+ )
122
+ fig5.update_layout(showlegend=False)
123
+ f5 = "5_box_disposal_by_type.html"
124
+ fig5.write_html(FIGURES_DIR / f5)
125
+ copy_to_versioned(f5)
126
+
127
+ # --------------------------------------------------
128
+ # 6. Stage Frequency
129
+ # --------------------------------------------------
130
+ if "Remappedstages" in hearings_pd.columns:
131
+ stage_counts = hearings_pd["Remappedstages"].value_counts().reset_index()
132
+ stage_counts.columns = ["Stage", "Count"]
133
+ fig6 = px.bar(
134
+ stage_counts,
135
+ x="Stage",
136
+ y="Count",
137
+ color="Stage",
138
+ title="Frequency of Hearing Stages",
139
+ )
140
+ fig6.update_layout(showlegend=False, xaxis_title="Stage", yaxis_title="Count")
141
+ f6 = "6_stage_frequency.html"
142
+ fig6.write_html(FIGURES_DIR / f6)
143
+ copy_to_versioned(f6)
144
+
145
+ # --------------------------------------------------
146
+ # 7. Gap median by case type
147
+ # --------------------------------------------------
148
+ if "GAP_MEDIAN" in cases_pd.columns:
149
+ fig_gap = px.box(
150
+ cases_pd,
151
+ x="CASE_TYPE",
152
+ y="GAP_MEDIAN",
153
+ points=False,
154
+ title="Median Hearing Gap by Case Type",
155
+ )
156
+ fg = "9_gap_median_by_type.html"
157
+ fig_gap.write_html(FIGURES_DIR / fg)
158
+ copy_to_versioned(fg)
159
+
160
+ # --------------------------------------------------
161
+ # 8. Stage transitions & bottleneck plot
162
+ # --------------------------------------------------
163
+ stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None
164
+ transitions = None
165
+ stage_duration = None
166
+ if stage_col and "BusinessOnDate" in hearings.columns:
167
+ STAGE_ORDER = [
168
+ "PRE-ADMISSION",
169
+ "ADMISSION",
170
+ "FRAMING OF CHARGES",
171
+ "EVIDENCE",
172
+ "ARGUMENTS",
173
+ "INTERLOCUTORY APPLICATION",
174
+ "SETTLEMENT",
175
+ "ORDERS / JUDGMENT",
176
+ "FINAL DISPOSAL",
177
+ "OTHER",
178
+ "NA",
179
+ ]
180
+ order_idx = {s: i for i, s in enumerate(STAGE_ORDER)}
181
+
182
+ h_stage = (
183
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
184
+ .sort(["CNR_NUMBER", "BusinessOnDate"])
185
+ .with_columns(
186
+ [
187
+ pl.col(stage_col)
188
+ .fill_null("NA")
189
+ .map_elements(
190
+ lambda s: s if s in STAGE_ORDER else ("OTHER" if s is not None else "NA")
191
+ )
192
+ .alias("STAGE"),
193
+ pl.col("BusinessOnDate").alias("DT"),
194
+ ]
195
+ )
196
+ .with_columns(
197
+ [
198
+ (pl.col("STAGE") != pl.col("STAGE").shift(1))
199
+ .over("CNR_NUMBER")
200
+ .alias("STAGE_CHANGE"),
201
+ ]
202
+ )
203
+ )
204
+
205
+ transitions_raw = (
206
+ h_stage.with_columns(
207
+ [
208
+ pl.col("STAGE").alias("STAGE_FROM"),
209
+ pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"),
210
+ ]
211
+ )
212
+ .filter(pl.col("STAGE_TO").is_not_null())
213
+ .group_by(["STAGE_FROM", "STAGE_TO"])
214
+ .agg(pl.len().alias("N"))
215
+ )
216
+
217
+ transitions = transitions_raw.filter(
218
+ pl.col("STAGE_FROM").map_elements(lambda s: order_idx.get(s, 10))
219
+ <= pl.col("STAGE_TO").map_elements(lambda s: order_idx.get(s, 10))
220
+ ).sort("N", descending=True)
221
+
222
+ transitions.write_csv(RUN_DIR / "transitions.csv")
223
+
224
+ runs = (
225
+ h_stage.with_columns(
226
+ [
227
+ pl.when(pl.col("STAGE_CHANGE"))
228
+ .then(1)
229
+ .otherwise(0)
230
+ .cum_sum()
231
+ .over("CNR_NUMBER")
232
+ .alias("RUN_ID")
233
+ ]
234
+ )
235
+ .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"])
236
+ .agg(
237
+ [
238
+ pl.col("DT").min().alias("RUN_START"),
239
+ pl.col("DT").max().alias("RUN_END"),
240
+ pl.len().alias("HEARINGS_IN_RUN"),
241
+ ]
242
+ )
243
+ .with_columns(
244
+ ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias("RUN_DAYS")
245
+ )
246
+ )
247
+ stage_duration = (
248
+ runs.group_by("STAGE")
249
+ .agg(
250
+ [
251
+ pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"),
252
+ pl.col("RUN_DAYS").mean().alias("RUN_MEAN_DAYS"),
253
+ pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"),
254
+ pl.len().alias("N_RUNS"),
255
+ ]
256
+ )
257
+ .sort("RUN_MEDIAN_DAYS", descending=True)
258
+ )
259
+ stage_duration.write_csv(RUN_DIR / "stage_duration.csv")
260
+
261
+ # Sankey
262
+ try:
263
+ tr_df = transitions.to_pandas()
264
+ labels = [
265
+ s
266
+ for s in STAGE_ORDER
267
+ if s in set(tr_df["STAGE_FROM"]).union(set(tr_df["STAGE_TO"]))
268
+ ]
269
+ idx = {l: i for i, l in enumerate(labels)}
270
+ tr_df = tr_df[tr_df["STAGE_FROM"].isin(labels) & tr_df["STAGE_TO"].isin(labels)].copy()
271
+ tr_df = tr_df.sort_values(by=["STAGE_FROM", "STAGE_TO"], key=lambda c: c.map(idx))
272
+ sankey = go.Figure(
273
+ data=[
274
+ go.Sankey(
275
+ arrangement="snap",
276
+ node=dict(label=labels, pad=15, thickness=18),
277
+ link=dict(
278
+ source=tr_df["STAGE_FROM"].map(idx).tolist(),
279
+ target=tr_df["STAGE_TO"].map(idx).tolist(),
280
+ value=tr_df["N"].tolist(),
281
+ ),
282
+ )
283
+ ]
284
+ )
285
+ sankey.update_layout(title_text="Stage Transition Sankey (Ordered)")
286
+ f10 = "10_stage_transition_sankey.html"
287
+ sankey.write_html(FIGURES_DIR / f10)
288
+ copy_to_versioned(f10)
289
+ except Exception as e:
290
+ print("Sankey error:", e)
291
+
292
+ # Bottleneck impact
293
+ try:
294
+ st_pd = stage_duration.with_columns(
295
+ (pl.col("RUN_MEDIAN_DAYS") * pl.col("N_RUNS")).alias("IMPACT")
296
+ ).to_pandas()
297
+ fig_b = px.bar(
298
+ st_pd.sort_values("IMPACT", ascending=False),
299
+ x="STAGE",
300
+ y="IMPACT",
301
+ title="Stage Bottleneck Impact (Median Days x Runs)",
302
+ )
303
+ fb = "15_bottleneck_impact.html"
304
+ fig_b.write_html(FIGURES_DIR / fb)
305
+ copy_to_versioned(fb)
306
+ except Exception as e:
307
+ print("Bottleneck plot error:", e)
308
+
309
+ # --------------------------------------------------
310
+ # 9. Monthly seasonality and anomalies
311
+ # --------------------------------------------------
312
+ if "BusinessOnDate" in hearings.columns:
313
+ m_hear = (
314
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
315
+ .with_columns(
316
+ [
317
+ pl.col("BusinessOnDate").dt.year().alias("Y"),
318
+ pl.col("BusinessOnDate").dt.month().alias("M"),
319
+ ]
320
+ )
321
+ .with_columns(pl.date(pl.col("Y"), pl.col("M"), pl.lit(1)).alias("YM"))
322
+ )
323
+ monthly_listings = m_hear.group_by("YM").agg(pl.len().alias("N_HEARINGS")).sort("YM")
324
+ monthly_listings.write_csv(RUN_DIR / "monthly_hearings.csv")
325
+
326
+ try:
327
+ fig_m = px.line(
328
+ monthly_listings.to_pandas(),
329
+ x="YM",
330
+ y="N_HEARINGS",
331
+ title="Monthly Hearings Listed",
332
+ )
333
+ fig_m.update_layout(yaxis=dict(tickformat=",d"))
334
+ fm = "11_monthly_hearings.html"
335
+ fig_m.write_html(FIGURES_DIR / fm)
336
+ copy_to_versioned(fm)
337
+ except Exception as e:
338
+ print("Monthly listings error:", e)
339
+
340
+ # Waterfall + anomalies
341
+ try:
342
+ ml = monthly_listings.with_columns(
343
+ [
344
+ pl.col("N_HEARINGS").shift(1).alias("PREV"),
345
+ (pl.col("N_HEARINGS") - pl.col("N_HEARINGS").shift(1)).alias("DELTA"),
346
+ ]
347
+ )
348
+ ml_pd = ml.to_pandas()
349
+ ml_pd["ROLL_MEAN"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).mean()
350
+ ml_pd["ROLL_STD"] = ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).std()
351
+ ml_pd["Z"] = (ml_pd["N_HEARINGS"] - ml_pd["ROLL_MEAN"]) / ml_pd["ROLL_STD"]
352
+ ml_pd["ANOM"] = ml_pd["Z"].abs() >= 3.0
353
+
354
+ measures = ["relative"] * len(ml_pd)
355
+ measures[0] = "absolute"
356
+ y_vals = ml_pd["DELTA"].astype(float).fillna(ml_pd["N_HEARINGS"].astype(float)).tolist()
357
+
358
+ fig_w = go.Figure(
359
+ go.Waterfall(
360
+ x=ml_pd["YM"],
361
+ measure=measures,
362
+ y=y_vals,
363
+ text=[f"{int(v):,}" if pd.notnull(v) else "" for v in ml_pd["N_HEARINGS"]],
364
+ increasing=dict(marker=dict(color="seagreen")),
365
+ decreasing=dict(marker=dict(color="indianred")),
366
+ connector={"line": {"color": "rgb(110,110,110)"}},
367
+ )
368
+ )
369
+ fig_w.add_trace(
370
+ go.Scatter(
371
+ x=ml_pd.loc[ml_pd["ANOM"], "YM"],
372
+ y=ml_pd.loc[ml_pd["ANOM"], "N_HEARINGS"],
373
+ mode="markers",
374
+ marker=dict(color="crimson", size=8),
375
+ name="Anomaly (|z|>=3)",
376
+ )
377
+ )
378
+ fig_w.update_layout(
379
+ title="Monthly Hearings Waterfall (MoM change) with Anomalies",
380
+ yaxis=dict(tickformat=",d"),
381
+ )
382
+ fw = "11b_monthly_waterfall.html"
383
+ fig_w.write_html(FIGURES_DIR / fw)
384
+ copy_to_versioned(fw)
385
+
386
+ ml_pd_out = ml_pd.copy()
387
+ ml_pd_out["YM"] = ml_pd_out["YM"].astype(str)
388
+ ml_pd_out.to_csv(RUN_DIR / "monthly_anomalies.csv", index=False)
389
+ except Exception as e:
390
+ print("Monthly waterfall error:", e)
391
+
392
+ # --------------------------------------------------
393
+ # 10. Judge and court workload
394
+ # --------------------------------------------------
395
+ judge_col = None
396
+ for c in [
397
+ "BeforeHonourableJudge",
398
+ "Before Hon'ble Judges",
399
+ "Before_Honble_Judges",
400
+ "NJDG_JUDGE_NAME",
401
+ ]:
402
+ if c in hearings.columns:
403
+ judge_col = c
404
+ break
405
+
406
+ if judge_col and "BusinessOnDate" in hearings.columns:
407
+ jday = (
408
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
409
+ .group_by([judge_col, "BusinessOnDate"])
410
+ .agg(pl.len().alias("N_HEARINGS"))
411
+ )
412
+ try:
413
+ fig_j = px.box(
414
+ jday.to_pandas(),
415
+ x=judge_col,
416
+ y="N_HEARINGS",
417
+ title="Per-day Hearings per Judge",
418
+ )
419
+ fig_j.update_layout(
420
+ xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
421
+ )
422
+ fj = "12_judge_day_load.html"
423
+ fig_j.write_html(FIGURES_DIR / fj)
424
+ copy_to_versioned(fj)
425
+ except Exception as e:
426
+ print("Judge workload error:", e)
427
+
428
+ court_col = None
429
+ for cc in ["COURT_NUMBER", "CourtName"]:
430
+ if cc in hearings.columns:
431
+ court_col = cc
432
+ break
433
+ if court_col and "BusinessOnDate" in hearings.columns:
434
+ cday = (
435
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
436
+ .group_by([court_col, "BusinessOnDate"])
437
+ .agg(pl.len().alias("N_HEARINGS"))
438
+ )
439
+ try:
440
+ fig_court = px.box(
441
+ cday.to_pandas(),
442
+ x=court_col,
443
+ y="N_HEARINGS",
444
+ title="Per-day Hearings per Courtroom",
445
+ )
446
+ fig_court.update_layout(
447
+ xaxis={"categoryorder": "total descending"}, yaxis=dict(tickformat=",d")
448
+ )
449
+ fc = "12b_court_day_load.html"
450
+ fig_court.write_html(FIGURES_DIR / fc)
451
+ copy_to_versioned(fc)
452
+ except Exception as e:
453
+ print("Court workload error:", e)
454
+
455
+ # --------------------------------------------------
456
+ # 11. Purpose tagging distributions
457
+ # --------------------------------------------------
458
+ text_col = None
459
+ for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]:
460
+ if c in hearings.columns:
461
+ text_col = c
462
+ break
463
+
464
+ def _has_kw_expr(col: str, kws: list[str]):
465
+ expr = None
466
+ for k in kws:
467
+ e = pl.col(col).str.contains(k)
468
+ expr = e if expr is None else (expr | e)
469
+ return (expr if expr is not None else pl.lit(False)).fill_null(False)
470
+
471
+ if text_col:
472
+ hear_txt = hearings.with_columns(
473
+ pl.col(text_col).cast(pl.Utf8).str.strip_chars().str.to_uppercase().alias("PURPOSE_TXT")
474
+ )
475
+ async_kw = ["NON-COMPLIANCE", "OFFICE OBJECTION", "COMPLIANCE", "NOTICE", "SERVICE"]
476
+ subs_kw = ["EVIDENCE", "ARGUMENT", "FINAL HEARING", "JUDGMENT", "ORDER", "DISPOSAL"]
477
+ hear_txt = hear_txt.with_columns(
478
+ pl.when(_has_kw_expr("PURPOSE_TXT", async_kw))
479
+ .then(pl.lit("ASYNC_OR_ADMIN"))
480
+ .when(_has_kw_expr("PURPOSE_TXT", subs_kw))
481
+ .then(pl.lit("SUBSTANTIVE"))
482
+ .otherwise(pl.lit("UNKNOWN"))
483
+ .alias("PURPOSE_TAG")
484
+ )
485
+ tag_share = (
486
+ hear_txt.group_by(["CASE_TYPE", "PURPOSE_TAG"])
487
+ .agg(pl.len().alias("N"))
488
+ .with_columns((pl.col("N") / pl.col("N").sum().over("CASE_TYPE")).alias("SHARE"))
489
+ .sort(["CASE_TYPE", "SHARE"], descending=[False, True])
490
+ )
491
+ tag_share.write_csv(RUN_DIR / "purpose_tag_shares.csv")
492
+ try:
493
+ fig_t = px.bar(
494
+ tag_share.to_pandas(),
495
+ x="CASE_TYPE",
496
+ y="SHARE",
497
+ color="PURPOSE_TAG",
498
+ title="Purpose Tag Shares by Case Type",
499
+ barmode="stack",
500
+ )
501
+ ft = "14_purpose_tag_shares.html"
502
+ fig_t.write_html(FIGURES_DIR / ft)
503
+ copy_to_versioned(ft)
504
+ except Exception as e:
505
+ print("Purpose shares error:", e)
506
+
507
+
508
+ if __name__ == "__main__":
509
+ run_exploration()
src/eda_load_clean.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module 1: Load, clean, and augment the High Court dataset.
2
+
3
+ Responsibilities:
4
+ - Read CSVs with robust null handling.
5
+ - Normalise key text columns (case type, stages, judge names).
6
+ - Basic integrity checks (nulls, duplicates, lifecycle).
7
+ - Compute core per-case hearing gap stats (mean/median/std).
8
+ - Save cleaned data as Parquet for downstream modules.
9
+ """
10
+
11
+ from datetime import timedelta
12
+
13
+ import polars as pl
14
+ from src.eda_config import (
15
+ CASES_CLEAN_PARQUET,
16
+ CASES_FILE,
17
+ HEAR_FILE,
18
+ HEARINGS_CLEAN_PARQUET,
19
+ NULL_TOKENS,
20
+ RUN_TS,
21
+ VERSION,
22
+ write_metadata,
23
+ )
24
+
25
+
26
+ # -------------------------------------------------------------------
27
+ # Helpers
28
+ # -------------------------------------------------------------------
29
+ def _norm_text_col(df: pl.DataFrame, col: str) -> pl.DataFrame:
30
+ if col not in df.columns:
31
+ return df
32
+ return df.with_columns(
33
+ pl.when(
34
+ pl.col(col)
35
+ .cast(pl.Utf8)
36
+ .str.strip_chars()
37
+ .str.to_uppercase()
38
+ .is_in(["", "NA", "N/A", "NULL", "NONE", "-", "--"])
39
+ )
40
+ .then(pl.lit(None))
41
+ .otherwise(pl.col(col).cast(pl.Utf8).str.strip_chars().str.to_uppercase())
42
+ .alias(col)
43
+ )
44
+
45
+
46
+ def _null_summary(df: pl.DataFrame, name: str) -> None:
47
+ print(f"\n=== Null summary ({name}) ===")
48
+ n = df.height
49
+ row = {"TABLE": name, "ROWS": n}
50
+ for c in df.columns:
51
+ row[f"{c}__nulls"] = int(df.select(pl.col(c).is_null().sum()).item())
52
+ print(row)
53
+
54
+
55
+ # -------------------------------------------------------------------
56
+ # Main logic
57
+ # -------------------------------------------------------------------
58
+ def load_raw() -> tuple[pl.DataFrame, pl.DataFrame]:
59
+ print("Loading raw data with Polars...")
60
+ cases = pl.read_csv(
61
+ CASES_FILE,
62
+ try_parse_dates=True,
63
+ null_values=NULL_TOKENS,
64
+ infer_schema_length=100_000,
65
+ )
66
+ hearings = pl.read_csv(
67
+ HEAR_FILE,
68
+ try_parse_dates=True,
69
+ null_values=NULL_TOKENS,
70
+ infer_schema_length=100_000,
71
+ )
72
+ print(f"Cases shape: {cases.shape}")
73
+ print(f"Hearings shape: {hearings.shape}")
74
+ return cases, hearings
75
+
76
+
77
+ def clean_and_augment(
78
+ cases: pl.DataFrame, hearings: pl.DataFrame
79
+ ) -> tuple[pl.DataFrame, pl.DataFrame]:
80
+ # Standardise date columns if needed
81
+ for col in ["DATE_FILED", "DECISION_DATE", "REGISTRATION_DATE", "LAST_SYNC_TIME"]:
82
+ if col in cases.columns and cases[col].dtype == pl.Utf8:
83
+ cases = cases.with_columns(pl.col(col).str.strptime(pl.Date, "%d-%m-%Y", strict=False))
84
+
85
+ # Deduplicate on keys
86
+ if "CNR_NUMBER" in cases.columns:
87
+ cases = cases.unique(subset=["CNR_NUMBER"])
88
+ if "Hearing_ID" in hearings.columns:
89
+ hearings = hearings.unique(subset=["Hearing_ID"])
90
+
91
+ # Normalise key text fields
92
+ cases = _norm_text_col(cases, "CASE_TYPE")
93
+
94
+ for c in [
95
+ "Remappedstages",
96
+ "PurposeofHearing",
97
+ "BeforeHonourableJudge",
98
+ ]:
99
+ hearings = _norm_text_col(hearings, c)
100
+
101
+ # Simple stage canonicalisation
102
+ if "Remappedstages" in hearings.columns:
103
+ STAGE_MAP = {
104
+ "ORDERS/JUDGMENTS": "ORDERS / JUDGMENT",
105
+ "ORDER/JUDGMENT": "ORDERS / JUDGMENT",
106
+ "ORDERS / JUDGMENT": "ORDERS / JUDGMENT",
107
+ "ORDERS /JUDGMENT": "ORDERS / JUDGMENT",
108
+ "INTERLOCUTARY APPLICATION": "INTERLOCUTORY APPLICATION",
109
+ "FRAMING OF CHARGE": "FRAMING OF CHARGES",
110
+ "PRE ADMISSION": "PRE-ADMISSION",
111
+ }
112
+ hearings = hearings.with_columns(
113
+ pl.col("Remappedstages")
114
+ .map_elements(lambda x: STAGE_MAP.get(x, x) if x is not None else None)
115
+ .alias("Remappedstages")
116
+ )
117
+
118
+ # Normalise disposal time
119
+ if "DISPOSALTIME_ADJ" in cases.columns:
120
+ cases = cases.with_columns(pl.col("DISPOSALTIME_ADJ").cast(pl.Int32))
121
+
122
+ # Year fields
123
+ if "DATE_FILED" in cases.columns:
124
+ cases = cases.with_columns(
125
+ [
126
+ pl.col("DATE_FILED").dt.year().alias("YEAR_FILED"),
127
+ pl.col("DECISION_DATE").dt.year().alias("YEAR_DECISION"),
128
+ ]
129
+ )
130
+
131
+ # Hearing counts per case
132
+ if {"CNR_NUMBER", "BusinessOnDate"}.issubset(hearings.columns):
133
+ hearing_freq = hearings.group_by("CNR_NUMBER").agg(
134
+ pl.count("BusinessOnDate").alias("N_HEARINGS")
135
+ )
136
+ cases = cases.join(hearing_freq, on="CNR_NUMBER", how="left")
137
+ else:
138
+ cases = cases.with_columns(pl.lit(0).alias("N_HEARINGS"))
139
+
140
+ # Per-case hearing gap stats (mean/median/std, p25, p75, count)
141
+ if {"CNR_NUMBER", "BusinessOnDate"}.issubset(hearings.columns):
142
+ hearing_gaps = (
143
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
144
+ .sort(["CNR_NUMBER", "BusinessOnDate"])
145
+ .with_columns(
146
+ ((pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1)) / timedelta(days=1))
147
+ .over("CNR_NUMBER")
148
+ .alias("HEARING_GAP_DAYS")
149
+ )
150
+ )
151
+ gap_stats = hearing_gaps.group_by("CNR_NUMBER").agg(
152
+ [
153
+ pl.col("HEARING_GAP_DAYS").mean().alias("GAP_MEAN"),
154
+ pl.col("HEARING_GAP_DAYS").median().alias("GAP_MEDIAN"),
155
+ pl.col("HEARING_GAP_DAYS").quantile(0.25).alias("GAP_P25"),
156
+ pl.col("HEARING_GAP_DAYS").quantile(0.75).alias("GAP_P75"),
157
+ pl.col("HEARING_GAP_DAYS").std(ddof=1).alias("GAP_STD"),
158
+ pl.col("HEARING_GAP_DAYS").count().alias("N_GAPS"),
159
+ ]
160
+ )
161
+ cases = cases.join(gap_stats, on="CNR_NUMBER", how="left")
162
+ else:
163
+ for col in ["GAP_MEAN", "GAP_MEDIAN", "GAP_P25", "GAP_P75", "GAP_STD", "N_GAPS"]:
164
+ cases = cases.with_columns(pl.lit(None).alias(col))
165
+
166
+ # Fill some basics
167
+ cases = cases.with_columns(
168
+ [
169
+ pl.col("N_HEARINGS").fill_null(0).cast(pl.Int64),
170
+ pl.col("GAP_MEDIAN").fill_null(0.0).cast(pl.Float64),
171
+ ]
172
+ )
173
+
174
+ # Print audits
175
+ print("\n=== dtypes (cases) ===")
176
+ print(cases.dtypes)
177
+ print("\n=== dtypes (hearings) ===")
178
+ print(hearings.dtypes)
179
+
180
+ _null_summary(cases, "cases")
181
+ _null_summary(hearings, "hearings")
182
+
183
+ # Simple lifecycle consistency check
184
+ if {"DATE_FILED", "DECISION_DATE"}.issubset(
185
+ cases.columns
186
+ ) and "BusinessOnDate" in hearings.columns:
187
+ h2 = hearings.join(
188
+ cases.select(["CNR_NUMBER", "DATE_FILED", "DECISION_DATE"]),
189
+ on="CNR_NUMBER",
190
+ how="left",
191
+ )
192
+ before_filed = h2.filter(
193
+ pl.col("BusinessOnDate").is_not_null()
194
+ & pl.col("DATE_FILED").is_not_null()
195
+ & (pl.col("BusinessOnDate") < pl.col("DATE_FILED"))
196
+ )
197
+ after_decision = h2.filter(
198
+ pl.col("BusinessOnDate").is_not_null()
199
+ & pl.col("DECISION_DATE").is_not_null()
200
+ & (pl.col("BusinessOnDate") > pl.col("DECISION_DATE"))
201
+ )
202
+ print(
203
+ "Hearings before filing:",
204
+ before_filed.height,
205
+ "| after decision:",
206
+ after_decision.height,
207
+ )
208
+
209
+ return cases, hearings
210
+
211
+
212
+ def save_clean(cases: pl.DataFrame, hearings: pl.DataFrame) -> None:
213
+ cases.write_parquet(CASES_CLEAN_PARQUET)
214
+ hearings.write_parquet(HEARINGS_CLEAN_PARQUET)
215
+ print(f"Saved cleaned cases -> {CASES_CLEAN_PARQUET}")
216
+ print(f"Saved cleaned hearings -> {HEARINGS_CLEAN_PARQUET}")
217
+
218
+ meta = {
219
+ "version": VERSION,
220
+ "timestamp": RUN_TS,
221
+ "cases_shape": list(cases.shape),
222
+ "hearings_shape": list(hearings.shape),
223
+ "cases_columns": cases.columns,
224
+ "hearings_columns": hearings.columns,
225
+ }
226
+ write_metadata(meta)
227
+
228
+
229
+ def run_load_and_clean() -> None:
230
+ cases_raw, hearings_raw = load_raw()
231
+ cases_clean, hearings_clean = clean_and_augment(cases_raw, hearings_raw)
232
+ save_clean(cases_clean, hearings_clean)
233
+
234
+
235
+ if __name__ == "__main__":
236
+ run_load_and_clean()
src/eda_parameters.py ADDED
@@ -0,0 +1,400 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Module 3: Parameter extraction for scheduling simulation / optimisation.
2
+
3
+ Responsibilities:
4
+ - Extract stage transition probabilities (per stage).
5
+ - Stage residence time distributions (medians, p90).
6
+ - Court capacity priors (median/p90 hearings per day).
7
+ - Adjournment and not-reached proxies by stage × case type.
8
+ - Entropy of stage transitions (predictability).
9
+ - Case-type summary stats (disposal, hearing counts, gaps).
10
+ - Readiness score and alert flags per case.
11
+ - Export JSON/CSV parameter files into PARAMS_DIR.
12
+ """
13
+
14
+ import json
15
+ from datetime import timedelta
16
+
17
+ import polars as pl
18
+ from src.eda_config import (
19
+ CASES_CLEAN_PARQUET,
20
+ HEARINGS_CLEAN_PARQUET,
21
+ PARAMS_DIR,
22
+ )
23
+
24
+
25
+ def load_cleaned():
26
+ cases = pl.read_parquet(CASES_CLEAN_PARQUET)
27
+ hearings = pl.read_parquet(HEARINGS_CLEAN_PARQUET)
28
+ return cases, hearings
29
+
30
+
31
+ def extract_parameters() -> None:
32
+ cases, hearings = load_cleaned()
33
+
34
+ # --------------------------------------------------
35
+ # 1. Stage transitions and probabilities
36
+ # --------------------------------------------------
37
+ stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None
38
+ transitions = None
39
+ stage_duration = None
40
+
41
+ if stage_col and "BusinessOnDate" in hearings.columns:
42
+ STAGE_ORDER = [
43
+ "PRE-ADMISSION",
44
+ "ADMISSION",
45
+ "FRAMING OF CHARGES",
46
+ "EVIDENCE",
47
+ "ARGUMENTS",
48
+ "INTERLOCUTORY APPLICATION",
49
+ "SETTLEMENT",
50
+ "ORDERS / JUDGMENT",
51
+ "FINAL DISPOSAL",
52
+ "OTHER",
53
+ "NA",
54
+ ]
55
+ order_idx = {s: i for i, s in enumerate(STAGE_ORDER)}
56
+
57
+ h_stage = (
58
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
59
+ .sort(["CNR_NUMBER", "BusinessOnDate"])
60
+ .with_columns(
61
+ [
62
+ pl.col(stage_col)
63
+ .fill_null("NA")
64
+ .map_elements(
65
+ lambda s: s if s in STAGE_ORDER else ("OTHER" if s is not None else "NA")
66
+ )
67
+ .alias("STAGE"),
68
+ pl.col("BusinessOnDate").alias("DT"),
69
+ ]
70
+ )
71
+ .with_columns(
72
+ [
73
+ (pl.col("STAGE") != pl.col("STAGE").shift(1))
74
+ .over("CNR_NUMBER")
75
+ .alias("STAGE_CHANGE"),
76
+ ]
77
+ )
78
+ )
79
+
80
+ transitions_raw = (
81
+ h_stage.with_columns(
82
+ [
83
+ pl.col("STAGE").alias("STAGE_FROM"),
84
+ pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"),
85
+ ]
86
+ )
87
+ .filter(pl.col("STAGE_TO").is_not_null())
88
+ .group_by(["STAGE_FROM", "STAGE_TO"])
89
+ .agg(pl.len().alias("N"))
90
+ )
91
+
92
+ transitions = transitions_raw.filter(
93
+ pl.col("STAGE_FROM").map_elements(lambda s: order_idx.get(s, 10))
94
+ <= pl.col("STAGE_TO").map_elements(lambda s: order_idx.get(s, 10))
95
+ ).sort("N", descending=True)
96
+
97
+ transitions.write_csv(PARAMS_DIR / "stage_transitions.csv")
98
+
99
+ # Probabilities per STAGE_FROM
100
+ row_tot = transitions.group_by("STAGE_FROM").agg(pl.col("N").sum().alias("row_n"))
101
+ trans_probs = transitions.join(row_tot, on="STAGE_FROM").with_columns(
102
+ (pl.col("N") / pl.col("row_n")).alias("p")
103
+ )
104
+ trans_probs.write_csv(PARAMS_DIR / "stage_transition_probs.csv")
105
+
106
+ # Entropy of transitions
107
+ ent = (
108
+ trans_probs.group_by("STAGE_FROM")
109
+ .agg((-(pl.col("p") * pl.col("p").log()).sum()).alias("entropy"))
110
+ .sort("entropy", descending=True)
111
+ )
112
+ ent.write_csv(PARAMS_DIR / "stage_transition_entropy.csv")
113
+
114
+ # Stage residence (runs)
115
+ runs = (
116
+ h_stage.with_columns(
117
+ [
118
+ pl.when(pl.col("STAGE_CHANGE"))
119
+ .then(1)
120
+ .otherwise(0)
121
+ .cum_sum()
122
+ .over("CNR_NUMBER")
123
+ .alias("RUN_ID")
124
+ ]
125
+ )
126
+ .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"])
127
+ .agg(
128
+ [
129
+ pl.col("DT").min().alias("RUN_START"),
130
+ pl.col("DT").max().alias("RUN_END"),
131
+ pl.len().alias("HEARINGS_IN_RUN"),
132
+ ]
133
+ )
134
+ .with_columns(
135
+ ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias("RUN_DAYS")
136
+ )
137
+ )
138
+ stage_duration = (
139
+ runs.group_by("STAGE")
140
+ .agg(
141
+ [
142
+ pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"),
143
+ pl.col("RUN_DAYS").quantile(0.9).alias("RUN_P90_DAYS"),
144
+ pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"),
145
+ pl.len().alias("N_RUNS"),
146
+ ]
147
+ )
148
+ .sort("RUN_MEDIAN_DAYS", descending=True)
149
+ )
150
+ stage_duration.write_csv(PARAMS_DIR / "stage_duration.csv")
151
+
152
+ # --------------------------------------------------
153
+ # 2. Court capacity (cases per courtroom per day)
154
+ # --------------------------------------------------
155
+ capacity_stats = None
156
+ if {"BusinessOnDate", "CourtName"}.issubset(hearings.columns):
157
+ cap = (
158
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
159
+ .group_by(["CourtName", "BusinessOnDate"])
160
+ .agg(pl.len().alias("heard_count"))
161
+ )
162
+ cap_stats = (
163
+ cap.group_by("CourtName")
164
+ .agg(
165
+ [
166
+ pl.col("heard_count").median().alias("slots_median"),
167
+ pl.col("heard_count").quantile(0.9).alias("slots_p90"),
168
+ ]
169
+ )
170
+ .sort("slots_median", descending=True)
171
+ )
172
+ cap_stats.write_csv(PARAMS_DIR / "court_capacity_stats.csv")
173
+ # simple global aggregate
174
+ capacity_stats = {
175
+ "slots_median_global": float(cap["heard_count"].median()),
176
+ "slots_p90_global": float(cap["heard_count"].quantile(0.9)),
177
+ }
178
+ with open(PARAMS_DIR / "court_capacity_global.json", "w") as f:
179
+ json.dump(capacity_stats, f, indent=2)
180
+
181
+ # --------------------------------------------------
182
+ # 3. Adjournment and not-reached proxies
183
+ # --------------------------------------------------
184
+ if "BusinessOnDate" in hearings.columns and stage_col:
185
+ # recompute hearing gaps if needed
186
+ if "HEARING_GAP_DAYS" not in hearings.columns:
187
+ hearings = (
188
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
189
+ .sort(["CNR_NUMBER", "BusinessOnDate"])
190
+ .with_columns(
191
+ (
192
+ (pl.col("BusinessOnDate") - pl.col("BusinessOnDate").shift(1))
193
+ / timedelta(days=1)
194
+ )
195
+ .over("CNR_NUMBER")
196
+ .alias("HEARING_GAP_DAYS")
197
+ )
198
+ )
199
+
200
+ stage_median_gap = hearings.group_by("Remappedstages").agg(
201
+ pl.col("HEARING_GAP_DAYS").median().alias("gap_median")
202
+ )
203
+ hearings = hearings.join(stage_median_gap, on="Remappedstages", how="left")
204
+
205
+ def _contains_any(col: str, kws: list[str]):
206
+ expr = None
207
+ for k in kws:
208
+ e = pl.col(col).str.contains(k)
209
+ expr = e if expr is None else (expr | e)
210
+ return (expr if expr is not None else pl.lit(False)).fill_null(False)
211
+
212
+ # Not reached proxies from purpose text
213
+ text_col = None
214
+ for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]:
215
+ if c in hearings.columns:
216
+ text_col = c
217
+ break
218
+
219
+ hearings = hearings.with_columns(
220
+ [
221
+ pl.when(pl.col("HEARING_GAP_DAYS") > (pl.col("gap_median") * 1.3))
222
+ .then(1)
223
+ .otherwise(0)
224
+ .alias("is_adjourn_proxy")
225
+ ]
226
+ )
227
+ if text_col:
228
+ hearings = hearings.with_columns(
229
+ pl.when(_contains_any(text_col, ["NOT REACHED", "NR", "NOT TAKEN UP", "NOT HEARD"]))
230
+ .then(1)
231
+ .otherwise(0)
232
+ .alias("is_not_reached_proxy")
233
+ )
234
+ else:
235
+ hearings = hearings.with_columns(pl.lit(0).alias("is_not_reached_proxy"))
236
+
237
+ outcome_stage = (
238
+ hearings.group_by(["Remappedstages", "casetype"])
239
+ .agg(
240
+ [
241
+ pl.mean("is_adjourn_proxy").alias("p_adjourn_proxy"),
242
+ pl.mean("is_not_reached_proxy").alias("p_not_reached_proxy"),
243
+ pl.count().alias("n"),
244
+ ]
245
+ )
246
+ .sort(["Remappedstages", "casetype"])
247
+ )
248
+ outcome_stage.write_csv(PARAMS_DIR / "adjournment_proxies.csv")
249
+
250
+ # --------------------------------------------------
251
+ # 4. Case-type summary and correlations
252
+ # --------------------------------------------------
253
+ by_type = (
254
+ cases.group_by("CASE_TYPE")
255
+ .agg(
256
+ [
257
+ pl.count().alias("n_cases"),
258
+ pl.col("DISPOSALTIME_ADJ").median().alias("disp_median"),
259
+ pl.col("DISPOSALTIME_ADJ").quantile(0.9).alias("disp_p90"),
260
+ pl.col("N_HEARINGS").median().alias("hear_median"),
261
+ pl.col("GAP_MEDIAN").median().alias("gap_median"),
262
+ ]
263
+ )
264
+ .sort("n_cases", descending=True)
265
+ )
266
+ by_type.write_csv(PARAMS_DIR / "case_type_summary.csv")
267
+
268
+ # Correlations for a quick diagnostic
269
+ corr_cols = ["DISPOSALTIME_ADJ", "N_HEARINGS", "GAP_MEDIAN"]
270
+ corr_df = cases.select(corr_cols).to_pandas()
271
+ corr = corr_df.corr(method="spearman")
272
+ corr.to_csv(PARAMS_DIR / "correlations_spearman.csv")
273
+
274
+ # --------------------------------------------------
275
+ # 5. Readiness score and alerts
276
+ # --------------------------------------------------
277
+ cases = cases.with_columns(
278
+ [
279
+ pl.when(pl.col("N_HEARINGS") > 50)
280
+ .then(50)
281
+ .otherwise(pl.col("N_HEARINGS"))
282
+ .alias("NH_CAP"),
283
+ pl.when(pl.col("GAP_MEDIAN").is_null() | (pl.col("GAP_MEDIAN") <= 0))
284
+ .then(999.0)
285
+ .otherwise(pl.col("GAP_MEDIAN"))
286
+ .alias("GAPM_SAFE"),
287
+ ]
288
+ )
289
+ cases = cases.with_columns(
290
+ pl.when(pl.col("GAPM_SAFE") > 100)
291
+ .then(100.0)
292
+ .otherwise(pl.col("GAPM_SAFE"))
293
+ .alias("GAPM_CLAMP")
294
+ )
295
+
296
+ # Stage at last hearing
297
+ if "BusinessOnDate" in hearings.columns and stage_col:
298
+ h_latest = (
299
+ hearings.filter(pl.col("BusinessOnDate").is_not_null())
300
+ .sort(["CNR_NUMBER", "BusinessOnDate"])
301
+ .group_by("CNR_NUMBER")
302
+ .agg(
303
+ [
304
+ pl.col("BusinessOnDate").max().alias("LAST_HEARING"),
305
+ pl.col(stage_col).last().alias("LAST_STAGE"),
306
+ pl.col(stage_col).n_unique().alias("N_DISTINCT_STAGES"),
307
+ ]
308
+ )
309
+ )
310
+ cases = cases.join(h_latest, on="CNR_NUMBER", how="left")
311
+ else:
312
+ cases = cases.with_columns(
313
+ [
314
+ pl.lit(None).alias("LAST_HEARING"),
315
+ pl.lit(None).alias("LAST_STAGE"),
316
+ pl.lit(None).alias("N_DISTINCT_STAGES"),
317
+ ]
318
+ )
319
+
320
+ # Normalised readiness in [0,1]
321
+ cases = cases.with_columns(
322
+ (
323
+ (pl.col("NH_CAP") / 50).clip(upper_bound=1.0) * 0.4
324
+ + (100 / pl.col("GAPM_CLAMP")).clip(upper_bound=1.0) * 0.3
325
+ + pl.when(pl.col("LAST_STAGE").is_in(["ARGUMENTS", "EVIDENCE", "ORDERS / JUDGMENT"]))
326
+ .then(0.3)
327
+ .otherwise(0.1)
328
+ ).alias("READINESS_SCORE")
329
+ )
330
+
331
+ # Alert flags (within case type)
332
+ try:
333
+ cases = cases.with_columns(
334
+ [
335
+ (
336
+ pl.col("DISPOSALTIME_ADJ")
337
+ > pl.col("DISPOSALTIME_ADJ").quantile(0.9).over("CASE_TYPE")
338
+ ).alias("ALERT_P90_TYPE"),
339
+ (pl.col("N_HEARINGS") > pl.col("N_HEARINGS").quantile(0.9).over("CASE_TYPE")).alias(
340
+ "ALERT_HEARING_HEAVY"
341
+ ),
342
+ (pl.col("GAP_MEDIAN") > pl.col("GAP_MEDIAN").quantile(0.9).over("CASE_TYPE")).alias(
343
+ "ALERT_LONG_GAP"
344
+ ),
345
+ ]
346
+ )
347
+ except Exception as e:
348
+ print("Alert flag computation error:", e)
349
+
350
+ feature_cols = [
351
+ "CNR_NUMBER",
352
+ "CASE_TYPE",
353
+ "YEAR_FILED",
354
+ "YEAR_DECISION",
355
+ "DISPOSALTIME_ADJ",
356
+ "N_HEARINGS",
357
+ "GAP_MEDIAN",
358
+ "GAP_STD",
359
+ "LAST_HEARING",
360
+ "LAST_STAGE",
361
+ "READINESS_SCORE",
362
+ "ALERT_P90_TYPE",
363
+ "ALERT_HEARING_HEAVY",
364
+ "ALERT_LONG_GAP",
365
+ ]
366
+ feature_cols_existing = [c for c in feature_cols if c in cases.columns]
367
+ cases.select(feature_cols_existing).write_csv(PARAMS_DIR / "cases_features.csv")
368
+
369
+ # Simple age funnel
370
+ if {"DATE_FILED", "DECISION_DATE"}.issubset(cases.columns):
371
+ age_funnel = (
372
+ cases.with_columns(
373
+ ((pl.col("DECISION_DATE") - pl.col("DATE_FILED")) / timedelta(days=365)).alias(
374
+ "AGE_YRS"
375
+ )
376
+ )
377
+ .with_columns(
378
+ pl.when(pl.col("AGE_YRS") < 1)
379
+ .then(pl.lit("<1y"))
380
+ .when(pl.col("AGE_YRS") < 3)
381
+ .then(pl.lit("1-3y"))
382
+ .when(pl.col("AGE_YRS") < 5)
383
+ .then(pl.lit("3-5y"))
384
+ .otherwise(pl.lit(">5y"))
385
+ .alias("AGE_BUCKET")
386
+ )
387
+ .group_by("AGE_BUCKET")
388
+ .agg(pl.len().alias("N"))
389
+ .sort("AGE_BUCKET")
390
+ )
391
+ age_funnel.write_csv(PARAMS_DIR / "age_funnel.csv")
392
+
393
+
394
+ def run_parameter_export() -> None:
395
+ extract_parameters()
396
+ print("Parameter extraction complete. Files in:", PARAMS_DIR.resolve())
397
+
398
+
399
+ if __name__ == "__main__":
400
+ run_parameter_export()