code-gen-assistant / src /eda /analyze.py
Rushabh147's picture
Initial deploy to HF Spaces (clean history, LFS for all binaries)
b89e6d6
Raw
History Blame Contribute Delete
3.84 kB
"""Phase 1c: exploratory data analysis.
Produces (a) a stats dict you can dump to JSON for the report, and
(b) PNG plots saved to the eda dir. Keep these in your capstone appendix.
"""
from __future__ import annotations
import json
import re
import sys
from pathlib import Path
import matplotlib
# The Agg backend must be selected before pyplot is imported below; it renders
# to files only, so no display is needed.
matplotlib.use("Agg")  # headless backend for servers/CI
import matplotlib.pyplot as plt  # noqa: E402
import pandas as pd  # noqa: E402
import seaborn as sns  # noqa: E402
# Put the directory two levels above this file's folder on the import path so
# the absolute `src.*` imports resolve when this file is run as a script
# (presumably the repository root -- confirm against the project layout).
sys.path.append(str(Path(__file__).resolve().parents[2]))
from src.config import load_config  # noqa: E402
# Use seaborn's "whitegrid" theme for the plots produced by this module.
sns.set_theme(style="whitegrid")
# A "word" is a maximal run of regex word characters delimited by word
# boundaries; compiled once because it is applied to every docstring.
_WORD_RE = re.compile(r"\b\w+\b")


def _doc_words(s: str) -> int:
    """Return how many word tokens ``s`` contains.

    Tokens are the non-overlapping matches of ``_WORD_RE``, so punctuation
    splits words (``"don't"`` counts as two) while underscores and digits
    stay part of a word.
    """
    return sum(1 for _match in _WORD_RE.finditer(s))
def _summarize(values: pd.Series, *, with_tail: bool = True) -> dict:
    """Summarise one numeric per-row measurement for the stats report.

    Args:
        values: Per-row counts (words, lines or characters).
        with_tail: When true, also report the 95th percentile and maximum.

    Returns:
        A dict with ``mean`` (float rounded to 2 decimals) and ``median``,
        plus ``p95`` and ``max`` when ``with_tail`` is set. ``median``,
        ``p95`` and ``max`` go through ``int()``, which truncates any
        fractional part. If ``values`` has no non-missing entries, every
        statistic is reported as zero.
    """
    # pandas returns NaN for the statistics of an empty / all-missing series,
    # and int(NaN) raises ValueError, so handle that case explicitly.
    if values.count() == 0:
        summary = {"mean": 0.0, "median": 0}
        if with_tail:
            summary.update(p95=0, max=0)
        return summary

    summary = {
        "mean": round(float(values.mean()), 2),
        "median": int(values.median()),
    }
    if with_tail:
        summary["p95"] = int(values.quantile(0.95))
        summary["max"] = int(values.max())
    return summary


def compute_stats(df: pd.DataFrame) -> dict:
    """Compute the summary statistics for the EDA report.

    Args:
        df: Frame with ``docstring``, ``code`` and ``language`` columns.

    Returns:
        A dict with:
          * ``n_rows``: number of rows in ``df``;
          * ``languages``: row count per value of the ``language`` column;
          * ``docstring_words``: mean/median/p95/max of words per docstring;
          * ``code_lines``: mean/median/p95/max of lines per code snippet
            (newline count + 1);
          * ``code_chars``: mean/median of characters per code snippet.
        A frame with no rows yields zeros for the numeric summaries instead
        of raising.
    """
    doc_words = df["docstring"].map(_doc_words)
    code_lines = df["code"].str.count("\n") + 1
    code_chars = df["code"].str.len()
    return {
        "n_rows": int(len(df)),
        "languages": df["language"].value_counts().to_dict(),
        "docstring_words": _summarize(doc_words),
        "code_lines": _summarize(code_lines),
        "code_chars": _summarize(code_chars, with_tail=False),
    }
def _save_and_close(fig, path: Path) -> str:
    """Lay out ``fig``, write it to ``path`` at 120 dpi, and close it.

    The figure is closed even if layout or saving raises, so a failed write
    does not leave the figure open in pyplot.

    Returns:
        ``path`` converted to ``str``.
    """
    try:
        fig.tight_layout()
        fig.savefig(path, dpi=120)
    finally:
        plt.close(fig)
    return str(path)


def make_plots(
    df: pd.DataFrame, out_dir: str, funnel: pd.DataFrame | None = None
) -> list[str]:
    """Render the EDA plots as PNG files and return their paths.

    Args:
        df: Frame with ``docstring``, ``code`` and ``language`` columns.
        out_dir: Directory to write into; created (with parents) if missing.
        funnel: Optional frame with ``step`` and ``rows_remaining`` columns.
            When given, a horizontal bar chart of the cleaning funnel is
            written as well.

    Returns:
        Paths of the saved images as strings, in the order they were written:
        ``docstring_length.png``, ``code_length.png``,
        ``language_distribution.png`` and, if ``funnel`` was given,
        ``cleaning_funnel.png``.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    saved = []

    # Docstring length distribution.
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.histplot(df["docstring"].map(_doc_words), bins=40, ax=ax)
    ax.set(title="Docstring length (words)", xlabel="words", ylabel="count")
    saved.append(_save_and_close(fig, out / "docstring_length.png"))

    # Code length distribution (lines); long snippets are clipped to 80 so
    # the tail does not flatten the histogram.
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.histplot((df["code"].str.count("\n") + 1).clip(upper=80), bins=40, ax=ax)
    ax.set(title="Code length (lines, clipped at 80)", xlabel="lines", ylabel="count")
    saved.append(_save_and_close(fig, out / "code_length.png"))

    # Language distribution.
    fig, ax = plt.subplots(figsize=(7, 4))
    df["language"].value_counts().plot(kind="bar", ax=ax)
    ax.set(title="Rows per language", xlabel="language", ylabel="count")
    saved.append(_save_and_close(fig, out / "language_distribution.png"))

    # Cleaning funnel (if provided).
    if funnel is not None:
        fig, ax = plt.subplots(figsize=(7, 4))
        ax.barh(funnel["step"], funnel["rows_remaining"])
        # barh draws the first row at the bottom; flip so steps read top-down.
        ax.invert_yaxis()
        ax.set(title="Cleaning funnel (rows remaining)", xlabel="rows")
        saved.append(_save_and_close(fig, out / "cleaning_funnel.png"))

    return saved
def run_eda(df: pd.DataFrame, cfg=None, funnel: pd.DataFrame | None = None) -> dict:
    """Run the full EDA pass: stats, plots, and the JSON dump.

    Args:
        df: Frame to analyse.
        cfg: Config exposing ``paths.eda_dir``; a falsy value makes the
            function fall back to ``load_config()``.
        funnel: Optional cleaning-funnel frame passed through to the plots.

    Returns:
        The stats dict from ``compute_stats`` with an added ``"plots"`` entry
        listing the saved image paths. The same dict is written to
        ``eda_stats.json`` inside the EDA directory.
    """
    if not cfg:
        cfg = load_config()

    report = compute_stats(df)
    eda_dir = cfg.paths.eda_dir
    # make_plots creates the EDA directory, so it must run before the dump.
    report["plots"] = make_plots(df, eda_dir, funnel)

    stats_file = Path(eda_dir) / "eda_stats.json"
    with stats_file.open("w") as handle:
        json.dump(report, handle, indent=2)
    return report
if __name__ == "__main__":
    # Script entry point: load the raw data, clean it, run the EDA on the
    # cleaned frame, and print the resulting stats as JSON.
    from src.data.clean import clean
    from src.data.load import load_raw

    config = load_config()
    raw_df = load_raw(config)
    cleaned_df, funnel_df = clean(raw_df, config)
    eda_report = run_eda(cleaned_df, config, funnel_df)
    print(json.dumps(eda_report, indent=2))