# hackathon_code4change/eda/exploration.py
# Last change (commit 9eaac57, RoyAalekh): enhancements, added view for
# scheduled cases as tickets.
"""Module 2: Visual and descriptive EDA.
Responsibilities:
- Case type distribution, filing trends, disposal distribution.
- Hearing gap distributions by type.
- Stage transition Sankey & stage bottlenecks.
- Cohorts by filing year.
- Seasonality and monthly anomalies.
- Judge and courtroom workload.
- Purpose tags and stage frequency.
Inputs:
- Cleaned Parquet from eda_load_clean.
Outputs:
- Interactive HTML plots in FIGURES_DIR and versioned copies in _get_run_dir().
- Some CSV summaries (e.g., stage_duration.csv, transitions.csv, monthly_anomalies.csv).
"""
from datetime import timedelta
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import polars as pl
from eda.config import (
_get_cases_parquet,
_get_hearings_parquet,
_get_run_dir,
safe_write_figure,
)
# Consistent plotly styling for every figure this module writes:
# white template and the Set2 qualitative palette.
px.defaults.template = "plotly_white"
px.defaults.color_discrete_sequence = px.colors.qualitative.Set2
pio.templates.default = "plotly_white"
def load_cleaned() -> tuple[pl.DataFrame, pl.DataFrame]:
    """Load the cleaned cases and hearings Parquet files.

    Paths come from eda.config helpers, so this reads whatever the
    eda_load_clean step last produced.

    Returns:
        (cases, hearings) as polars DataFrames, in that order.
    """
    cases = pl.read_parquet(_get_cases_parquet())
    hearings = pl.read_parquet(_get_hearings_parquet())
    print("Loaded cleaned data for exploration")
    print("Cases:", cases.shape, "Hearings:", hearings.shape)
    return cases, hearings
def run_exploration() -> None:
    """Run the full visual/descriptive EDA pipeline.

    Loads the cleaned cases/hearings data and writes a series of interactive
    HTML figures (via ``safe_write_figure``) plus CSV summaries (stage
    transitions, stage durations, monthly hearings/anomalies, purpose tag
    shares) into the current run directory.  Sections are independent;
    column-existence checks and try/except guards keep one failure from
    aborting the rest.
    """
    cases, hearings = load_cleaned()

    # --------------------------------------------------
    # 1. Case Type Distribution
    # --------------------------------------------------
    try:
        ct_counts = (
            cases.group_by("CASE_TYPE")
            .agg(pl.len().alias("COUNT"))
            .sort("COUNT", descending=True)
        )
        fig1 = px.bar(
            ct_counts.to_pandas(),
            x="CASE_TYPE",
            y="COUNT",
            color="CASE_TYPE",
            title="Case Type Distribution",
        )
        fig1.update_layout(
            showlegend=False,
            xaxis_title="Case Type",
            yaxis_title="Number of Cases",
            xaxis_tickangle=-45,
        )
        safe_write_figure(fig1, "1_case_type_distribution.html")
    except Exception as e:
        print("Case type distribution error:", e)

    # --------------------------------------------------
    # 2. Filing Trends by Year (single line, no slider)
    # --------------------------------------------------
    if "YEAR_FILED" in cases.columns:
        year_counts = (
            cases.group_by("YEAR_FILED")
            .agg(pl.len().alias("Count"))
            .sort("YEAR_FILED", descending=False)
        )
        fig2 = px.line(
            year_counts.to_pandas(),
            x="YEAR_FILED",
            y="Count",
            markers=True,
            title="Cases Filed by Year",
        )
        fig2.update_layout(xaxis_title="Year", yaxis_title="Cases")
        # Anchor the axis at zero but let the upper bound autoscale; a
        # hard-coded cap (previously 10k) silently clips larger cohorts.
        fig2.update_yaxes(rangemode="tozero")
        safe_write_figure(fig2, "2_cases_filed_by_year.html")

    # --------------------------------------------------
    # 3. Disposal Duration Distribution
    # --------------------------------------------------
    if "DISPOSALTIME_ADJ" in cases.columns:
        fig3 = px.histogram(
            x=cases["DISPOSALTIME_ADJ"].to_list(),
            nbins=50,
            title="Distribution of Disposal Time (Adjusted Days)",
            color_discrete_sequence=["indianred"],
        )
        fig3.update_layout(xaxis_title="Days", yaxis_title="Cases")
        safe_write_figure(fig3, "3_disposal_time_distribution.html")

    # --------------------------------------------------
    # 4. Hearings vs Disposal Time
    # --------------------------------------------------
    if {"N_HEARINGS", "DISPOSALTIME_ADJ"}.issubset(set(cases.columns)):
        # Convert only necessary columns for plotting with color/hover metadata
        cases_scatter = cases.select(
            ["N_HEARINGS", "DISPOSALTIME_ADJ", "CASE_TYPE", "CNR_NUMBER", "YEAR_FILED"]
        ).to_pandas()
        fig4 = px.scatter(
            cases_scatter,
            x="N_HEARINGS",
            y="DISPOSALTIME_ADJ",
            color="CASE_TYPE",
            hover_data=["CNR_NUMBER", "YEAR_FILED"],
            title="Hearings vs Disposal Duration",
        )
        fig4.update_traces(marker=dict(size=6, opacity=0.7))
        safe_write_figure(fig4, "4_hearings_vs_disposal.html")

    # --------------------------------------------------
    # 5. Boxplot by Case Type
    # --------------------------------------------------
    fig5 = px.box(
        cases.select(["CASE_TYPE", "DISPOSALTIME_ADJ"]).to_pandas(),
        x="CASE_TYPE",
        y="DISPOSALTIME_ADJ",
        color="CASE_TYPE",
        title="Disposal Time (Adjusted) by Case Type",
    )
    fig5.update_layout(showlegend=False, xaxis_tickangle=-45)
    safe_write_figure(fig5, "5_box_disposal_by_type.html")

    # --------------------------------------------------
    # 6. Stage Frequency
    # --------------------------------------------------
    if "Remappedstages" in hearings.columns:
        stage_counts = (
            hearings["Remappedstages"]
            .value_counts()
            .rename({"Remappedstages": "Stage", "count": "Count"})
        )
        fig6 = px.bar(
            stage_counts.to_pandas(),
            x="Stage",
            y="Count",
            color="Stage",
            title="Frequency of Hearing Stages (Log Scale)",
            log_y=True,
        )
        fig6.update_layout(
            showlegend=False,
            xaxis_title="Stage",
            yaxis_title="Count (log scale)",
            xaxis_tickangle=-45,
            height=500,
        )
        safe_write_figure(fig6, "6_stage_frequency.html")

    # --------------------------------------------------
    # 7. Gap median by case type
    # --------------------------------------------------
    if "GAP_MEDIAN" in cases.columns:
        fig_gap = px.box(
            cases.select(["CASE_TYPE", "GAP_MEDIAN"]).to_pandas(),
            x="CASE_TYPE",
            y="GAP_MEDIAN",
            points=False,
            title="Median Hearing Gap by Case Type",
        )
        fig_gap.update_layout(xaxis_tickangle=-45)
        # Filename number is legacy; kept for downstream compatibility.
        safe_write_figure(fig_gap, "9_gap_median_by_type.html")

    # --------------------------------------------------
    # 8. Stage transitions & bottleneck plot
    # --------------------------------------------------
    stage_col = "Remappedstages" if "Remappedstages" in hearings.columns else None
    transitions = None
    stage_duration = None
    if stage_col and "BusinessOnDate" in hearings.columns:
        # Canonical procedural ordering used to direct the Sankey left-to-right.
        STAGE_ORDER = [
            "PRE-ADMISSION",
            "ADMISSION",
            "FRAMING OF CHARGES",
            "EVIDENCE",
            "ARGUMENTS",
            "INTERLOCUTORY APPLICATION",
            "SETTLEMENT",
            "ORDERS / JUDGMENT",
            "FINAL DISPOSAL",
            "OTHER",
            "NA",
        ]
        order_idx = {s: i for i, s in enumerate(STAGE_ORDER)}
        h_stage = (
            hearings.filter(pl.col("BusinessOnDate").is_not_null())
            .sort(["CNR_NUMBER", "BusinessOnDate"])
            .with_columns(
                [
                    # Nulls become "NA" first, so the mapping only needs to
                    # fold unknown labels into "OTHER".
                    pl.col(stage_col)
                    .fill_null("NA")
                    .map_elements(
                        lambda s: s if s in STAGE_ORDER else "OTHER",
                        return_dtype=pl.Utf8,
                    )
                    .alias("STAGE"),
                    pl.col("BusinessOnDate").alias("DT"),
                ]
            )
            .with_columns(
                [
                    # True whenever a case's stage differs from its previous
                    # hearing — marks the start of a new stage "run".
                    (pl.col("STAGE") != pl.col("STAGE").shift(1))
                    .over("CNR_NUMBER")
                    .alias("STAGE_CHANGE"),
                ]
            )
        )
        transitions_raw = (
            h_stage.with_columns(
                [
                    pl.col("STAGE").alias("STAGE_FROM"),
                    pl.col("STAGE").shift(-1).over("CNR_NUMBER").alias("STAGE_TO"),
                ]
            )
            .filter(pl.col("STAGE_TO").is_not_null())
            .group_by(["STAGE_FROM", "STAGE_TO"])
            .agg(pl.len().alias("N"))
        )
        # Keep only forward (or self) transitions per STAGE_ORDER so the
        # Sankey stays acyclic; unknown stages rank with "NA" (index 10).
        transitions = transitions_raw.filter(
            pl.col("STAGE_FROM").map_elements(
                lambda s: order_idx.get(s, 10), return_dtype=pl.Int64
            )
            <= pl.col("STAGE_TO").map_elements(
                lambda s: order_idx.get(s, 10), return_dtype=pl.Int64
            )
        ).sort("N", descending=True)
        transitions.write_csv(str(_get_run_dir() / "transitions.csv"))
        # Consecutive same-stage hearings form a run; RUN_ID increments on
        # each stage change within a case.
        runs = (
            h_stage.with_columns(
                [
                    pl.when(pl.col("STAGE_CHANGE"))
                    .then(1)
                    .otherwise(0)
                    .cum_sum()
                    .over("CNR_NUMBER")
                    .alias("RUN_ID")
                ]
            )
            .group_by(["CNR_NUMBER", "STAGE", "RUN_ID"])
            .agg(
                [
                    pl.col("DT").min().alias("RUN_START"),
                    pl.col("DT").max().alias("RUN_END"),
                    pl.len().alias("HEARINGS_IN_RUN"),
                ]
            )
            .with_columns(
                # Duration in (possibly fractional) days.
                ((pl.col("RUN_END") - pl.col("RUN_START")) / timedelta(days=1)).alias(
                    "RUN_DAYS"
                )
            )
        )
        stage_duration = (
            runs.group_by("STAGE")
            .agg(
                [
                    pl.col("RUN_DAYS").median().alias("RUN_MEDIAN_DAYS"),
                    pl.col("RUN_DAYS").mean().alias("RUN_MEAN_DAYS"),
                    pl.col("HEARINGS_IN_RUN").median().alias("HEARINGS_PER_RUN_MED"),
                    pl.len().alias("N_RUNS"),
                ]
            )
            .sort("RUN_MEDIAN_DAYS", descending=True)
        )
        stage_duration.write_csv(str(_get_run_dir() / "stage_duration.csv"))
        # Sankey
        try:
            tr_df = transitions.to_pandas()
            # Only label stages that actually occur, in canonical order.
            labels = [
                s
                for s in STAGE_ORDER
                if s in set(tr_df["STAGE_FROM"]).union(set(tr_df["STAGE_TO"]))
            ]
            idx = {label: i for i, label in enumerate(labels)}
            tr_df = tr_df[
                tr_df["STAGE_FROM"].isin(labels) & tr_df["STAGE_TO"].isin(labels)
            ].copy()
            tr_df = tr_df.sort_values(
                by=["STAGE_FROM", "STAGE_TO"], key=lambda c: c.map(idx)
            )
            sankey = go.Figure(
                data=[
                    go.Sankey(
                        arrangement="snap",
                        node=dict(label=labels, pad=15, thickness=18),
                        link=dict(
                            source=tr_df["STAGE_FROM"].map(idx).tolist(),
                            target=tr_df["STAGE_TO"].map(idx).tolist(),
                            value=tr_df["N"].tolist(),
                        ),
                    )
                ]
            )
            sankey.update_layout(
                title_text="Stage Transition Sankey (Ordered)",
                height=800,
                margin=dict(t=50, b=50, l=50, r=50),
            )
            safe_write_figure(sankey, "10_stage_transition_sankey.html")
        except Exception as e:
            print("Sankey error:", e)
        # Bottleneck impact
        try:
            # Impact proxy: how long a stage lingers (median days) scaled by
            # how often it occurs (number of runs).
            st_pd = stage_duration.with_columns(
                (pl.col("RUN_MEDIAN_DAYS") * pl.col("N_RUNS")).alias("IMPACT")
            ).to_pandas()
            fig_b = px.bar(
                st_pd.sort_values("IMPACT", ascending=False),
                x="STAGE",
                y="IMPACT",
                title="Stage Bottleneck Impact (Median Days x Runs)",
            )
            fig_b.update_layout(xaxis_tickangle=-45)
            safe_write_figure(fig_b, "15_bottleneck_impact.html")
        except Exception as e:
            print("Bottleneck plot error:", e)

    # --------------------------------------------------
    # 9. Monthly seasonality and anomalies
    # --------------------------------------------------
    if "BusinessOnDate" in hearings.columns:
        m_hear = (
            hearings.filter(pl.col("BusinessOnDate").is_not_null())
            .with_columns(
                [
                    pl.col("BusinessOnDate").dt.year().alias("Y"),
                    pl.col("BusinessOnDate").dt.month().alias("M"),
                ]
            )
            # First-of-month date used as the monthly bucket key.
            .with_columns(pl.date(pl.col("Y"), pl.col("M"), pl.lit(1)).alias("YM"))
        )
        monthly_listings = (
            m_hear.group_by("YM").agg(pl.len().alias("N_HEARINGS")).sort("YM")
        )
        monthly_listings.write_csv(str(_get_run_dir() / "monthly_hearings.csv"))
        try:
            fig_m = px.line(
                monthly_listings.to_pandas(),
                x="YM",
                y="N_HEARINGS",
                title="Monthly Hearings Listed",
            )
            fig_m.update_layout(yaxis=dict(tickformat=",d"))
            safe_write_figure(fig_m, "11_monthly_hearings.html")
        except Exception as e:
            print("Monthly listings error:", e)
        # Anomaly detection (no waterfall plot)
        try:
            ml = monthly_listings.with_columns(
                [
                    pl.col("N_HEARINGS").shift(1).alias("PREV"),
                    (pl.col("N_HEARINGS") - pl.col("N_HEARINGS").shift(1)).alias(
                        "DELTA"
                    ),
                ]
            )
            ml_pd = ml.to_pandas()
            # Rolling 12-month z-score; |z| >= 3 flags an anomalous month.
            ml_pd["ROLL_MEAN"] = (
                ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).mean()
            )
            ml_pd["ROLL_STD"] = (
                ml_pd["N_HEARINGS"].rolling(window=12, min_periods=6).std()
            )
            ml_pd["Z"] = (ml_pd["N_HEARINGS"] - ml_pd["ROLL_MEAN"]) / ml_pd["ROLL_STD"]
            ml_pd["ANOM"] = ml_pd["Z"].abs() >= 3.0
            # Export anomalies and enriched monthly series
            ml_pd_out = ml_pd.copy()
            ml_pd_out["YM"] = ml_pd_out["YM"].astype(str)
            ml_pd_out.to_csv(str(_get_run_dir() / "monthly_anomalies.csv"), index=False)
        except Exception as e:
            print("Monthly anomalies computation error:", e)

    # --------------------------------------------------
    # 10. Judge and court workload
    # --------------------------------------------------
    # The judge column name varies between data drops; take the first match.
    judge_col = None
    for c in [
        "BeforeHonourableJudge",
        "Before Hon'ble Judges",
        "Before_Honble_Judges",
        "NJDG_JUDGE_NAME",
    ]:
        if c in hearings.columns:
            judge_col = c
            break
    if judge_col and "BusinessOnDate" in hearings.columns:
        jday = (
            hearings.filter(pl.col("BusinessOnDate").is_not_null())
            .group_by([judge_col, "BusinessOnDate"])
            .agg(pl.len().alias("N_HEARINGS"))
        )
        try:
            fig_j = px.box(
                jday.to_pandas(),
                x=judge_col,
                y="N_HEARINGS",
                title="Per-day Hearings per Judge",
            )
            fig_j.update_layout(
                xaxis={"categoryorder": "total descending", "tickangle": -45},
                yaxis=dict(tickformat=",d"),
            )
            safe_write_figure(fig_j, "12_judge_day_load.html")
        except Exception as e:
            print("Judge workload error:", e)
    court_col = None
    for cc in ["COURT_NUMBER", "CourtName"]:
        if cc in hearings.columns:
            court_col = cc
            break
    if court_col and "BusinessOnDate" in hearings.columns:
        cday = (
            hearings.filter(pl.col("BusinessOnDate").is_not_null())
            .group_by([court_col, "BusinessOnDate"])
            .agg(pl.len().alias("N_HEARINGS"))
        )
        try:
            fig_court = px.box(
                cday.to_pandas(),
                x=court_col,
                y="N_HEARINGS",
                title="Per-day Hearings per Courtroom",
            )
            fig_court.update_layout(
                xaxis={"categoryorder": "total descending", "tickangle": -45},
                yaxis=dict(tickformat=",d"),
            )
            safe_write_figure(fig_court, "12b_court_day_load.html")
        except Exception as e:
            print("Court workload error:", e)

    # --------------------------------------------------
    # 11. Purpose tagging distributions
    # --------------------------------------------------
    text_col = None
    for c in ["PurposeofHearing", "Purpose of Hearing", "PURPOSE_OF_HEARING"]:
        if c in hearings.columns:
            text_col = c
            break

    def _has_kw_expr(col: str, kws: list[str]) -> pl.Expr:
        """Boolean expr: does `col` contain any of the keyword substrings?"""
        expr = None
        for k in kws:
            # literal=True: keywords are plain substrings, not regex patterns.
            e = pl.col(col).str.contains(k, literal=True)
            expr = e if expr is None else (expr | e)
        return (expr if expr is not None else pl.lit(False)).fill_null(False)

    if text_col:
        hear_txt = hearings.with_columns(
            pl.col(text_col)
            .cast(pl.Utf8)
            .str.strip_chars()
            .str.to_uppercase()
            .alias("PURPOSE_TXT")
        )
        async_kw = [
            "NON-COMPLIANCE",
            "OFFICE OBJECTION",
            "COMPLIANCE",
            "NOTICE",
            "SERVICE",
        ]
        subs_kw = [
            "EVIDENCE",
            "ARGUMENT",
            "FINAL HEARING",
            "JUDGMENT",
            "ORDER",
            "DISPOSAL",
        ]
        # Admin keywords win over substantive ones (when() order matters).
        hear_txt = hear_txt.with_columns(
            pl.when(_has_kw_expr("PURPOSE_TXT", async_kw))
            .then(pl.lit("ASYNC_OR_ADMIN"))
            .when(_has_kw_expr("PURPOSE_TXT", subs_kw))
            .then(pl.lit("SUBSTANTIVE"))
            .otherwise(pl.lit("UNKNOWN"))
            .alias("PURPOSE_TAG")
        )
        tag_share = (
            hear_txt.group_by(["CASE_TYPE", "PURPOSE_TAG"])
            .agg(pl.len().alias("N"))
            .with_columns(
                # Share of each tag within its case type.
                (pl.col("N") / pl.col("N").sum().over("CASE_TYPE")).alias("SHARE")
            )
            .sort(["CASE_TYPE", "SHARE"], descending=[False, True])
        )
        tag_share.write_csv(str(_get_run_dir() / "purpose_tag_shares.csv"))
        try:
            fig_t = px.bar(
                tag_share.to_pandas(),
                x="CASE_TYPE",
                y="SHARE",
                color="PURPOSE_TAG",
                title="Purpose Tag Shares by Case Type",
                barmode="stack",
            )
            fig_t.update_layout(xaxis_tickangle=-45)
            safe_write_figure(fig_t, "14_purpose_tag_shares.html")
        except Exception as e:
            print("Purpose shares error:", e)
# Script entry point: run the full exploration pipeline when executed directly.
if __name__ == "__main__":
    run_exploration()