"""Analyze historical case and hearing data to understand realistic patterns."""
import pandas as pd
from pathlib import Path
# Load the historical case and hearing exports. The paths are relative to
# the current working directory.
DATA_DIR = Path("data")
cases = pd.read_csv(DATA_DIR / "ISDMHack_Cases_WPfinal.csv")
hearings = pd.read_csv(DATA_DIR / "ISDMHack_Hear.csv")

print("=" * 80)
print("HISTORICAL DATA ANALYSIS")
print("=" * 80)

# Guard the ratio: a header-only cases file gives len(cases) == 0, which
# would otherwise abort the report with ZeroDivisionError. NaN is printed
# instead so the remaining sections still run.
avg_hearings_per_case = (
    len(hearings) / len(cases) if len(cases) else float("nan")
)

print(f"\nTotal cases: {len(cases):,}")
print(f"Total hearings: {len(hearings):,}")
print(f"Avg hearings per case: {avg_hearings_per_case:.2f}")
# Number of hearing rows recorded for each CNR (one CNR per case).
hearings_by_case = hearings.groupby('CNR')
hearing_counts = hearings_by_case.size()
print("\nHearings per case distribution:")
print(hearing_counts.describe())
# Gap, in whole days, between consecutive NEXT_HEARING_DATE values within
# the same CNR. Values that fail to parse are coerced to NaT.
parsed_dates = pd.to_datetime(hearings['NEXT_HEARING_DATE'], errors='coerce')
hearings['NEXT_HEARING_DATE'] = parsed_dates

# Sort first so that diff() compares each row with its chronological
# predecessor inside the same case.
hearings = hearings.sort_values(by=['CNR', 'NEXT_HEARING_DATE'])
date_deltas = hearings.groupby('CNR')['NEXT_HEARING_DATE'].diff()
hearings['days_since_prev'] = date_deltas.dt.days

gap_days = hearings['days_since_prev']
print("\nDays between consecutive hearings (same case):")
print(gap_days.describe())
print(f"Median gap: {gap_days.median()} days")
# Count of cases filed on each calendar date. Only dates that appear in
# FILING_DATE are counted, so days without any filing are not represented.
cases['FILING_DATE'] = pd.to_datetime(cases['FILING_DATE'], errors='coerce')
filing_day = cases['FILING_DATE'].dt.date
daily_filings = cases.groupby(filing_day).size()
typical_daily_filings = daily_filings.median()
print("\nDaily filing rate:")
print(daily_filings.describe())
print(f"Median: {typical_daily_filings:.0f} cases/day")
# Case lifespan: whole days elapsed from FILING_DATE to DISPOSAL_DATE.
# Unparseable disposal dates are coerced to NaT.
cases['DISPOSAL_DATE'] = pd.to_datetime(cases['DISPOSAL_DATE'], errors='coerce')
lifespan = cases['DISPOSAL_DATE'].sub(cases['FILING_DATE'])
cases['age_days'] = lifespan.dt.days
print("\nCase lifespan (filing to disposal):")
print(cases['age_days'].describe())
# Ten most frequent CURRENT_STAGE values among cases that have a stage recorded.
staged_cases = cases.dropna(subset=['CURRENT_STAGE'])
stage_counts = staged_cases['CURRENT_STAGE'].value_counts()
print("\nCurrent stage distribution:")
print(stage_counts.head(10))
# Summarise the statistics computed above as simulation parameters.
# Guard the ratio: with a header-only cases file len(cases) is 0 and the
# division would raise ZeroDivisionError; NaN is printed instead.
hearings_per_case = (
    len(hearings) / len(cases) if len(cases) else float("nan")
)
median_gap_days = hearings['days_since_prev'].median()
median_daily_filings = daily_filings.median()

print("\n" + "=" * 80)
print("RECOMMENDATIONS FOR REALISTIC SIMULATION")
print("=" * 80)
print(f"1. Case pool size: {len(cases):,} cases (use actual dataset size)")
print(f"2. Avg hearings/case: {hearings_per_case:.1f}")
print(f"3. Median gap between hearings: {median_gap_days:.0f} days")
print(f"4. Daily filing rate: {median_daily_filings:.0f} cases/day")
# Items 5 and 6 are fixed advice with no interpolated values, so plain
# string literals are used rather than f-strings.
print("5. For submission: Use ACTUAL case data, not synthetic")
print("6. Simulation period: Match historical period for validation")
|