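# Spaces: Sleeping
# NOTE(review): the line above appears to be page-status residue from the web
# capture of this file, not part of the module — confirm and drop if so.
"""Phase 1c: exploratory data analysis.

Produces (a) a stats dict you can dump to JSON for the report, and
(b) PNG plots saved to the EDA directory. Keep these in your capstone appendix.
"""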
from __future__ import annotations

import json
import re
import sys
from pathlib import Path

import matplotlib

# Select the non-interactive backend before pyplot is imported
# (headless backend for servers/CI).
matplotlib.use("Agg")

import matplotlib.pyplot as plt  # noqa: E402
import pandas as pd  # noqa: E402
import seaborn as sns  # noqa: E402

# Put the directory two levels above this file's folder on sys.path so the
# absolute ``src.*`` imports resolve when this file is run as a script.
# NOTE(review): presumably that directory is the repository root — confirm.
sys.path.append(str(Path(__file__).resolve().parents[2]))

from src.config import load_config  # noqa: E402

# Apply seaborn's "whitegrid" theme globally for the plots made below.
sns.set_theme(style="whitegrid")
# A "word" is a maximal run of word characters (letters, digits, underscore).
_WORD_RE = re.compile(r"\b\w+\b")


def _doc_words(s: str) -> int:
    """Return the number of words in *s*.

    A word is any match of ``_WORD_RE``. Non-string values (for example the
    ``None``/``NaN`` placeholders that appear in a pandas column with missing
    cells) count as zero words instead of raising ``TypeError``.
    """
    if not isinstance(s, str):
        return 0
    return len(_WORD_RE.findall(s))
def _summarize_lengths(values: pd.Series, *, tail: bool = True) -> dict:
    """Summarize a numeric series as plain, JSON-friendly Python numbers.

    Args:
        values: Numeric series; missing entries are ignored.
        tail: When true, also report the 95th percentile and the maximum.

    Returns:
        A dict with ``mean`` (rounded to 2 decimals) and ``median``, plus
        ``p95`` and ``max`` when *tail* is true. Median, p95 and max are
        truncated with ``int``. Every value is ``None`` when there is no
        data to summarize.
    """
    keys = ("mean", "median", "p95", "max") if tail else ("mean", "median")
    values = values.dropna()
    if values.empty:
        # int(nan) would raise ValueError; report "no data" explicitly.
        return dict.fromkeys(keys)
    summary = {
        "mean": round(float(values.mean()), 2),
        "median": int(values.median()),
    }
    if tail:
        summary["p95"] = int(values.quantile(0.95))
        summary["max"] = int(values.max())
    return summary


def compute_stats(df: pd.DataFrame) -> dict:
    """Compute summary statistics for the dataset.

    Args:
        df: Frame with ``docstring``, ``code`` and ``language`` columns.

    Returns:
        A dict of built-in Python values (suitable for ``json.dump``) with:
        ``n_rows``; ``languages`` (rows per language); ``docstring_words``
        and ``code_lines`` (mean/median/p95/max); and ``code_chars``
        (mean/median). Summary values are ``None`` when the corresponding
        column has no data, e.g. for an empty frame.
    """
    doc_words = df["docstring"].map(_doc_words)
    # Line count = number of newline characters + 1.
    code_lines = df["code"].str.count("\n") + 1
    code_chars = df["code"].str.len()
    language_counts = df["language"].value_counts()
    return {
        "n_rows": int(len(df)),
        # Coerce counts to built-in int so the dict is JSON-serializable
        # regardless of how pandas boxes the values.
        "languages": {lang: int(n) for lang, n in language_counts.items()},
        "docstring_words": _summarize_lengths(doc_words),
        "code_lines": _summarize_lengths(code_lines),
        "code_chars": _summarize_lengths(code_chars, tail=False),
    }
def _save_figure(fig, path: Path) -> str:
    """Apply a tight layout, save *fig* to *path* at 120 dpi, and close it.

    The figure is closed even when laying out or saving raises, so a failed
    write does not leave the figure open. Returns *path* as a string.
    """
    try:
        fig.tight_layout()
        fig.savefig(path, dpi=120)
    finally:
        plt.close(fig)
    return str(path)


def make_plots(
    df: pd.DataFrame, out_dir: str, funnel: pd.DataFrame | None = None
) -> list[str]:
    """Render the EDA plots as PNG files and return their paths.

    Args:
        df: Frame with ``docstring``, ``code`` and ``language`` columns.
        out_dir: Directory for the images; created (with parents) if missing.
        funnel: Optional frame with ``step`` and ``rows_remaining`` columns.
            When given, a horizontal-bar "cleaning funnel" plot is added.

    Returns:
        The saved file paths as strings, in the order they were written.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    saved: list[str] = []

    # Docstring length distribution.
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.histplot(df["docstring"].map(_doc_words), bins=40, ax=ax)
    ax.set(title="Docstring length (words)", xlabel="words", ylabel="count")
    saved.append(_save_figure(fig, out / "docstring_length.png"))

    # Code length distribution (lines); values above 80 are clipped to 80
    # so they all land in the last bin.
    fig, ax = plt.subplots(figsize=(7, 4))
    line_counts = df["code"].str.count("\n") + 1
    sns.histplot(line_counts.clip(upper=80), bins=40, ax=ax)
    ax.set(title="Code length (lines, clipped at 80)", xlabel="lines", ylabel="count")
    saved.append(_save_figure(fig, out / "code_length.png"))

    # Language distribution.
    fig, ax = plt.subplots(figsize=(7, 4))
    df["language"].value_counts().plot(kind="bar", ax=ax)
    ax.set(title="Rows per language", xlabel="language", ylabel="count")
    saved.append(_save_figure(fig, out / "language_distribution.png"))

    # Cleaning funnel (if provided).
    if funnel is not None:
        fig, ax = plt.subplots(figsize=(7, 4))
        ax.barh(funnel["step"], funnel["rows_remaining"])
        # Flip the y-axis so the first step is drawn at the top.
        ax.invert_yaxis()
        ax.set(title="Cleaning funnel (rows remaining)", xlabel="rows")
        saved.append(_save_figure(fig, out / "cleaning_funnel.png"))

    return saved
def run_eda(df: pd.DataFrame, cfg=None, funnel: pd.DataFrame | None = None) -> dict:
    """Run the EDA: compute stats, render plots, and write ``eda_stats.json``.

    Args:
        df: Frame passed to ``compute_stats`` and ``make_plots``.
        cfg: Config object exposing ``paths.eda_dir``. Loaded with
            ``load_config()`` only when ``None`` is passed.
        funnel: Optional cleaning-funnel frame forwarded to ``make_plots``.

    Returns:
        The stats dict, extended with a ``plots`` entry listing the saved
        plot paths. The same dict is written to ``<eda_dir>/eda_stats.json``.
    """
    # Test identity, not truthiness: a config object that happens to be
    # falsy must not be silently replaced by the default one.
    if cfg is None:
        cfg = load_config()

    eda_dir = Path(cfg.paths.eda_dir)
    stats = compute_stats(df)
    stats["plots"] = make_plots(df, cfg.paths.eda_dir, funnel)

    # Ensure the directory exists here too, so writing the JSON does not
    # depend on make_plots having created it as a side effect.
    eda_dir.mkdir(parents=True, exist_ok=True)
    # Explicit encoding keeps the output independent of the platform default.
    with open(eda_dir / "eda_stats.json", "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)
    return stats
if __name__ == "__main__":
    # These imports sit inside the guard, so they only run when the file is
    # executed as a script, not when the module is imported.
    from src.data.clean import clean
    from src.data.load import load_raw

    cfg = load_config()
    # clean() yields the cleaned frame and the funnel table used for the
    # "cleaning funnel" plot.
    cleaned, funnel = clean(load_raw(cfg), cfg)
    print(json.dumps(run_eda(cleaned, cfg, funnel), indent=2))