"""
eda.py
------
Exploratory Data Analysis for MNREGA unified dataset.
Automatically adapts to Maharashtra-only or All-India data.
Figures produced:
01_statewide_trend.png
02_district_performance_ranking.png
03_efficiency_ranking.png
04_covid_impact.png
05_correlation_heatmap.png
"""
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
# Every figure is written below this directory; it is created at import time
# so the plotting helpers can save without checking for it first.
FIGURES_DIR = os.path.join("reports", "figures")
os.makedirs(FIGURES_DIR, exist_ok=True)

# Global look-and-feel shared by all figures produced in this module.
sns.set_theme(style="whitegrid", palette="muted")
plt.rcParams["figure.dpi"] = 120
plt.rcParams["font.size"] = 10
# Use a font that supports the rupee symbol if available, else fallback
def _get_font():
    """Return the first preferred font family known to matplotlib.

    The candidates are tried in a fixed priority order. ``None`` is returned
    when none of them is registered with matplotlib's font manager.
    """
    installed = [entry.name for entry in fm.fontManager.ttflist]
    preferred = ("DejaVu Sans", "FreeSans", "Liberation Sans", "Arial")
    return next((name for name in preferred if name in installed), None)
# Resolve the preferred font once at import time; when nothing suitable is
# installed, matplotlib's default font family is left untouched.
FONT = _get_font()
if FONT:
    plt.rcParams.update({"font.family": FONT})
def run_eda(df: pd.DataFrame, scope: str = "Maharashtra") -> None:
    """Run the complete EDA pass: print summary statistics, then write figures.

    Args:
        df: Unified MNREGA dataset. The helpers called here read columns such
            as ``state``, ``district``, ``financial_year`` and
            ``person_days_lakhs``; optional feature columns are used when
            present.
        scope: Human-readable label for the data's coverage. It is used in
            log output and in figure titles.
    """
    # The separator was mis-encoded in the source; restored to an em dash.
    print(f"\n[eda] Starting EDA — scope: {scope}")
    _summary_stats(df)
    _plot_trend(df, scope)
    _plot_top_bottom_districts(df, scope)
    _plot_efficiency_ranking(df, scope)
    _plot_covid_impact(df)
    _plot_correlation_heatmap(df)
    print(f"[eda] All figures saved to: {FIGURES_DIR}/")
# ── 1. Summary ────────────────────────────────────────────────────────────────
def _summary_stats(df: pd.DataFrame) -> None:
    """Print headline counts and a text bar chart of mean person days per year.

    Reads the ``state``, ``district``, ``financial_year`` and
    ``person_days_lakhs`` columns. ``expenditure_lakhs`` is reported only
    when the column exists.

    Args:
        df: Dataset to summarise.
    """
    rule = "─" * 50
    print(f"\n[eda] {rule}")
    print(f"[eda] Rows : {len(df)}")
    print(f"[eda] States : {df['state'].nunique()}")
    print(f"[eda] Districts : {df['district'].nunique()}")
    print(f"[eda] Years : {df['financial_year'].min()} – {df['financial_year'].max()}")
    print(f"[eda] Total persondays: {df['person_days_lakhs'].sum():,.1f} lakh")
    if "expenditure_lakhs" in df.columns:
        print(f"[eda] Total expenditure: Rs. {df['expenditure_lakhs'].sum():,.1f} lakh")

    print("\n[eda] Person days by year (state-aggregated mean):")
    by_year = df.groupby("financial_year")["person_days_lakhs"].mean()
    peak = by_year.max()
    # Bars are scaled against the peak year. A zero or NaN peak (or a NaN
    # yearly mean) would make int() raise, so those rows get an empty bar.
    can_scale = bool(pd.notna(peak)) and peak > 0
    for yr, val in by_year.items():
        width = int(val / peak * 28) if can_scale and pd.notna(val) else 0
        bar = "█" * width
        print(f" {yr}: {bar} {val:.2f}")
    print(f"[eda] {rule}")
# ── 2. Trend ──────────────────────────────────────────────────────────────────
def _plot_trend(df: pd.DataFrame, scope: str) -> None:
    """Plot total person days per financial year as a bar chart.

    The figure is written as ``01_statewide_trend.png`` through ``_save``.

    Args:
        df: Dataset with ``financial_year`` and ``person_days_lakhs`` columns.
        scope: Coverage label shown in the figure title.
    """
    yearly = (
        df.groupby("financial_year")
        .agg(total_persondays=("person_days_lakhs", "sum"))
        .reset_index()
    )

    fig, ax1 = plt.subplots(figsize=(11, 5))
    ax1.bar(
        yearly["financial_year"],
        yearly["total_persondays"],
        color="#2196F3",
        alpha=0.75,
        label="Person Days (lakh)",
    )
    ax1.set_ylabel("Total Person Days (lakh)", color="#2196F3")
    ax1.tick_params(axis="y", labelcolor="#2196F3")
    ax1.set_xlabel("Financial Year")
    # With numeric years matplotlib may pick fractional tick positions
    # (e.g. 2019.5); pin one tick per year actually present in the data.
    if pd.api.types.is_numeric_dtype(yearly["financial_year"]):
        ax1.set_xticks(yearly["financial_year"].tolist())
    # Title set on the axes explicitly rather than via implicit pyplot state;
    # the mis-encoded separator is restored to an em dash.
    ax1.set_title(f"MNREGA Trend — {scope} (Person Days)")
    fig.tight_layout()
    _save("01_statewide_trend.png")
# ── 3. District rankings ──────────────────────────────────────────────────────
def _plot_top_bottom_districts(df: pd.DataFrame, scope: str) -> None:
    """Plot the highest- and lowest-ranked districts by mean person days.

    Two horizontal bar panels (top N, bottom N) are written as
    ``02_district_performance_ranking.png`` through ``_save``. N is at most
    10 and at most half the number of districts. The top and bottom five
    districts are also printed.

    Args:
        df: Dataset with ``district`` and ``person_days_lakhs`` columns.
        scope: Coverage label shown in the figure title.
    """
    # NOTE(review): districts are keyed by name only. If an all-India dataset
    # contains the same district name in several states they are averaged
    # together here; confirm whether names are unique upstream.
    avg = (
        df.groupby("district")["person_days_lakhs"]
        .mean()
        .dropna()  # districts with no data would otherwise fill the "bottom" panel
        .sort_values(ascending=False)
    )
    if avg.empty:
        print("[eda] Skipping district ranking — no district has person-day data")
        return

    # At least one bar per panel, even when only a single district exists.
    n = max(1, min(10, len(avg) // 2))
    top = avg.head(n)
    bot = avg.tail(n).sort_values()

    fig, axes = plt.subplots(1, 2, figsize=(14, max(5, n * 0.55)))
    panels = (
        (axes[0], top, "#4CAF50", f"Top {n} Districts"),
        (axes[1], bot, "#FF7043", f"Bottom {n} Districts"),
    )
    for ax, series, colour, title in panels:
        ax.barh(series.index, series.values, color=colour)
        ax.set_title(title)
        ax.set_xlabel("Avg Person Days (lakh)")
        ax.invert_yaxis()  # first entry of the series is drawn at the top
    fig.suptitle(f"MNREGA District Performance — {scope}", fontsize=13)
    fig.tight_layout()
    _save("02_district_performance_ranking.png")

    print("\n[eda] Top 5 districts:")
    for d, v in avg.head(5).items():
        print(f" {d:35s}: {v:.2f} lakh")
    print("[eda] Bottom 5 districts:")
    for d, v in avg.tail(5).items():
        print(f" {d:35s}: {v:.2f} lakh")
# ── 4. Efficiency ranking ─────────────────────────────────────────────────────
def _plot_efficiency_ranking(df: pd.DataFrame, scope: str) -> None:
    """Rank districts by mean ``expenditure_per_personday`` and plot the result.

    The figure is written as ``03_efficiency_ranking.png`` through ``_save``.
    When more than 30 districts are available only the 15 lowest and 15
    highest are drawn. Bars at or below the median are green, the rest red.
    The function does nothing (apart from a log line) when the feature column
    is missing or holds no usable values.

    Args:
        df: Dataset with a ``district`` column and, optionally,
            ``expenditure_per_personday``.
        scope: Coverage label shown in the figure title.
    """
    if "expenditure_per_personday" not in df.columns:
        print("[eda] Skipping efficiency ranking — expenditure_per_personday not in V3 features")
        return

    eff = (
        df.groupby("district")["expenditure_per_personday"]
        .mean()
        .dropna()
        .sort_values()
    )
    if eff.empty:
        # idxmin()/idxmax() below would raise on an empty series.
        print("[eda] Skipping efficiency ranking — no usable expenditure_per_personday values")
        return

    # Take the median over ALL districts before trimming; computing it on the
    # 15+15 extremes would shift the reference line away from the real median.
    median = eff.median()
    if len(eff) > 30:
        eff = pd.concat([eff.head(15), eff.tail(15)])

    fig, ax = plt.subplots(figsize=(10, max(6, len(eff) * 0.3)))
    colors = ["#43A047" if v <= median else "#EF5350" for v in eff.values]
    ax.barh(eff.index, eff.values, color=colors)
    ax.axvline(median, color="navy", linestyle="--",
               linewidth=1.5, label=f"Median: {median:.1f}")
    # NOTE(review): the title and the x-label describe the units differently;
    # confirm how expenditure_per_personday is derived before relying on them.
    ax.set_title(f"Cost Efficiency — {scope}\n(Rs. expenditure per lakh persondays — lower is better)")
    ax.set_xlabel("Rs. lakh per lakh persondays")
    ax.legend()
    fig.tight_layout()
    _save("03_efficiency_ranking.png")

    print(f"\n[eda] Most efficient : {eff.idxmin()} ({eff.min():.1f})")
    print(f"[eda] Least efficient: {eff.idxmax()} ({eff.max():.1f})")
# ── 5. COVID impact ───────────────────────────────────────────────────────────
def _plot_covid_impact(df: pd.DataFrame) -> None:
    """Plot each district's % change in person days between 2019 and 2020.

    Only districts present in both years with a positive 2019 baseline are
    compared. The figure is written as ``04_covid_impact.png`` through
    ``_save``. When more than 20 districts qualify only the 10 largest and
    10 smallest changes are drawn. Nothing is plotted when no district
    qualifies.

    Args:
        df: Dataset with ``financial_year``, ``district`` and
            ``person_days_lakhs`` columns. Rows are selected with
            ``financial_year == 2019`` / ``== 2020``.
    """
    pre = df[df["financial_year"] == 2019].groupby("district")["person_days_lakhs"].mean()
    post = df[df["financial_year"] == 2020].groupby("district")["person_days_lakhs"].mean()

    common = pre.index.intersection(post.index)
    baseline = pre[common]
    # A zero (or missing) baseline makes the percentage undefined and would
    # put inf/NaN at the head of the ranking, so such districts are left out.
    baseline = baseline[baseline > 0]
    change = (
        ((post[baseline.index] - baseline) / baseline * 100)
        .dropna()
        .sort_values(ascending=False)
    )
    if change.empty:
        # idxmax()/idxmin() below would raise on an empty series.
        print("[eda] Skipping COVID impact — no district has data for both 2019 and 2020")
        return

    # Cap at 20 districts for readability
    show = pd.concat([change.head(10), change.tail(10)]) if len(change) > 20 else change

    fig, ax = plt.subplots(figsize=(10, max(6, len(show) * 0.35)))
    colors = ["#388E3C" if v >= 0 else "#D32F2F" for v in show.values]
    ax.barh(show.index, show.values, color=colors)
    ax.axvline(0, color="black", linewidth=0.8)
    ax.set_title("COVID Impact: % Change in Person Days\n(2019-20 to 2020-21)")
    ax.set_xlabel("% Change")
    fig.tight_layout()
    _save("04_covid_impact.png")

    # The "+" format flag prints the correct sign even when the largest
    # change is negative; a literal "+" prefix produced "+-5.0%".
    print(f"\n[eda] COVID — biggest spike : {change.idxmax()} ({change.max():+.1f}%)")
    print(f"[eda] COVID — least impacted : {change.idxmin()} ({change.min():.1f}%)")
# ── 6. Correlation heatmap ────────────────────────────────────────────────────
def _plot_correlation_heatmap(df: pd.DataFrame) -> None:
    """Draw a lower-triangle correlation heatmap of the available features.

    Only candidate columns that exist in ``df`` are used. The figure is
    written as ``05_correlation_heatmap.png`` through ``_save``. The plot is
    skipped when fewer than two candidate columns are present, because there
    would be no pairwise correlation to show.

    Args:
        df: Dataset containing some subset of the candidate feature columns.
    """
    candidates = [
        "person_days_lakhs", "expenditure_lakhs", "avg_wage_rate",
        "expenditure_per_personday", "lag_person_days", "yoy_growth",
        "demand_fulfillment_rate", "district_avg_persondays",
        "rainfall_mm", "poverty_rate_pct", "scheme_overlap_score",
        "budget_utilization_rate",
    ]
    cols = [c for c in candidates if c in df.columns]
    if len(cols) < 2:
        # A 0x0 or 1x1 matrix leaves nothing visible once the upper triangle
        # (diagonal included) is masked.
        print("[eda] Skipping correlation heatmap — fewer than two candidate features present")
        return

    corr = df[cols].corr()
    fig, ax = plt.subplots(figsize=(11, 9))
    # Hide the upper triangle and the diagonal; the matrix is symmetric.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    sns.heatmap(corr, mask=mask, annot=True, fmt=".2f",
                cmap="coolwarm", center=0, ax=ax,
                linewidths=0.5, annot_kws={"size": 8})
    ax.set_title("Feature Correlation Heatmap")
    fig.tight_layout()
    _save("05_correlation_heatmap.png")
# ── Helper ────────────────────────────────────────────────────────────────────
def _save(filename: str) -> None:
    """Write the current matplotlib figure into ``FIGURES_DIR`` and close it.

    Args:
        filename: File name (not a full path) of the image to write.
    """
    target = os.path.join(FIGURES_DIR, filename)
    plt.savefig(target, bbox_inches="tight")
    plt.close()
    print(f"[eda] Saved: {target}")
|