"""Phase 1c: exploratory data analysis. Produces (a) a stats dict you can dump to JSON for the report, and (b) PNG plots saved to the eda dir. Keep these in your capstone appendix. """ from __future__ import annotations import json import re import sys from pathlib import Path import matplotlib matplotlib.use("Agg") # headless backend for servers/CI import matplotlib.pyplot as plt # noqa: E402 import pandas as pd # noqa: E402 import seaborn as sns # noqa: E402 sys.path.append(str(Path(__file__).resolve().parents[2])) from src.config import load_config # noqa: E402 sns.set_theme(style="whitegrid") _WORD_RE = re.compile(r"\b\w+\b") def _doc_words(s: str) -> int: return len(_WORD_RE.findall(s)) def compute_stats(df: pd.DataFrame) -> dict: doc_words = df["docstring"].map(_doc_words) code_lines = df["code"].str.count("\n") + 1 code_chars = df["code"].str.len() return { "n_rows": int(len(df)), "languages": df["language"].value_counts().to_dict(), "docstring_words": { "mean": round(float(doc_words.mean()), 2), "median": int(doc_words.median()), "p95": int(doc_words.quantile(0.95)), "max": int(doc_words.max()), }, "code_lines": { "mean": round(float(code_lines.mean()), 2), "median": int(code_lines.median()), "p95": int(code_lines.quantile(0.95)), "max": int(code_lines.max()), }, "code_chars": { "mean": round(float(code_chars.mean()), 2), "median": int(code_chars.median()), }, } def make_plots(df: pd.DataFrame, out_dir: str, funnel: pd.DataFrame | None = None): out = Path(out_dir) out.mkdir(parents=True, exist_ok=True) saved = [] # Docstring length distribution. fig, ax = plt.subplots(figsize=(7, 4)) sns.histplot(df["docstring"].map(_doc_words), bins=40, ax=ax) ax.set(title="Docstring length (words)", xlabel="words", ylabel="count") p = out / "docstring_length.png" fig.tight_layout(); fig.savefig(p, dpi=120); plt.close(fig); saved.append(str(p)) # Code length distribution (lines). fig, ax = plt.subplots(figsize=(7, 4)) sns.histplot((df["code"].str.count("\n") + 1).clip(upper=80), bins=40, ax=ax) ax.set(title="Code length (lines, clipped at 80)", xlabel="lines", ylabel="count") p = out / "code_length.png" fig.tight_layout(); fig.savefig(p, dpi=120); plt.close(fig); saved.append(str(p)) # Language distribution. fig, ax = plt.subplots(figsize=(7, 4)) df["language"].value_counts().plot(kind="bar", ax=ax) ax.set(title="Rows per language", xlabel="language", ylabel="count") p = out / "language_distribution.png" fig.tight_layout(); fig.savefig(p, dpi=120); plt.close(fig); saved.append(str(p)) # Cleaning funnel (if provided). if funnel is not None: fig, ax = plt.subplots(figsize=(7, 4)) ax.barh(funnel["step"], funnel["rows_remaining"]) ax.invert_yaxis() ax.set(title="Cleaning funnel (rows remaining)", xlabel="rows") p = out / "cleaning_funnel.png" fig.tight_layout(); fig.savefig(p, dpi=120); plt.close(fig); saved.append(str(p)) return saved def run_eda(df: pd.DataFrame, cfg=None, funnel: pd.DataFrame | None = None) -> dict: cfg = cfg or load_config() stats = compute_stats(df) plots = make_plots(df, cfg.paths.eda_dir, funnel) stats["plots"] = plots with open(Path(cfg.paths.eda_dir) / "eda_stats.json", "w") as f: json.dump(stats, f, indent=2) return stats if __name__ == "__main__": from src.data.clean import clean from src.data.load import load_raw cfg = load_config() cleaned, funnel = clean(load_raw(cfg), cfg) print(json.dumps(run_eda(cleaned, cfg, funnel), indent=2))