Spaces:

Rushabh147
/

code-gen-assistant

Sleeping

App Files Files Community

code-gen-assistant / src /eda /analyze.py

Rushabh147

Initial deploy to HF Spaces (clean history, LFS for all binaries)

b89e6d6 11 days ago

Raw

History Blame Contribute Delete

3.84 kB

	"""Phase 1c: exploratory data analysis.

	Produces (a) a stats dict you can dump to JSON for the report, and
	(b) PNG plots saved to the eda dir. Keep these in your capstone appendix.
	"""
	from __future__ import annotations

	import json
	import re
	import sys
	from pathlib import Path

	import matplotlib

	matplotlib.use("Agg") # headless backend for servers/CI
	import matplotlib.pyplot as plt # noqa: E402
	import pandas as pd # noqa: E402
	import seaborn as sns # noqa: E402

	sys.path.append(str(Path(__file__).resolve().parents[2]))
	from src.config import load_config # noqa: E402

	sns.set_theme(style="whitegrid")
	_WORD_RE = re.compile(r"\b\w+\b")


	def _doc_words(s: str) -> int:
	return len(_WORD_RE.findall(s))


	def _length_summary(values: pd.Series, *, with_tail: bool = True) -> dict:
	    """Summarise a numeric Series as mean/median (and optionally p95/max).

	    Missing values are dropped first. If nothing remains, every statistic is
	    reported as zero, because ``int()`` of the NaN that pandas returns for an
	    empty Series would raise ``ValueError``.
	    """
	    values = values.dropna()
	    if values.empty:
	        summary = {"mean": 0.0, "median": 0}
	        if with_tail:
	            summary["p95"] = 0
	            summary["max"] = 0
	        return summary

	    summary = {
	        "mean": round(float(values.mean()), 2),
	        "median": int(values.median()),
	    }
	    if with_tail:
	        summary["p95"] = int(values.quantile(0.95))
	        summary["max"] = int(values.max())
	    return summary


	def compute_stats(df: pd.DataFrame) -> dict:
	    """Compute summary statistics for the dataset.

	    Args:
	        df: Frame with ``docstring``, ``code`` and ``language`` columns.

	    Returns:
	        A dict with these keys:

	        * ``n_rows``: number of rows in *df*.
	        * ``languages``: value counts of the ``language`` column.
	        * ``docstring_words``: mean/median/p95/max of words per docstring.
	        * ``code_lines``: mean/median/p95/max of lines per code sample,
	          counted as newline characters plus one.
	        * ``code_chars``: mean/median of characters per code sample.

	        Medians, percentiles and maxima are truncated to ``int``; means are
	        rounded to two decimals. An empty frame yields zeros rather than
	        raising.
	    """
	    doc_words = df["docstring"].map(_doc_words)
	    code_lines = df["code"].str.count("\n") + 1
	    code_chars = df["code"].str.len()
	    return {
	        "n_rows": int(len(df)),
	        "languages": df["language"].value_counts().to_dict(),
	        "docstring_words": _length_summary(doc_words),
	        "code_lines": _length_summary(code_lines),
	        "code_chars": _length_summary(code_chars, with_tail=False),
	    }


	def _save_figure(fig, path: Path) -> str:
	    """Lay out *fig*, write it to *path* as a PNG, and always close it.

	    The figure is closed even when saving fails, so a failed write does not
	    leave an open figure behind. Returns *path* as a string.
	    """
	    try:
	        fig.tight_layout()
	        fig.savefig(path, dpi=120)
	    finally:
	        plt.close(fig)
	    return str(path)


	def make_plots(
	    df: pd.DataFrame,
	    out_dir: str,
	    funnel: pd.DataFrame | None = None,
	) -> list[str]:
	    """Render the EDA plots as PNG files and return their paths.

	    Args:
	        df: Frame with ``docstring``, ``code`` and ``language`` columns.
	        out_dir: Directory to write the PNGs to; created if missing.
	        funnel: Optional frame with ``step`` and ``rows_remaining`` columns.
	            When given, a horizontal bar chart of the cleaning funnel is
	            also written.

	    Returns:
	        The saved file paths as strings, in the order they were written:
	        docstring length, code length, language distribution and, if
	        *funnel* was supplied, the cleaning funnel.
	    """
	    out = Path(out_dir)
	    out.mkdir(parents=True, exist_ok=True)
	    saved = []

	    # Docstring length distribution.
	    fig, ax = plt.subplots(figsize=(7, 4))
	    sns.histplot(df["docstring"].map(_doc_words), bins=40, ax=ax)
	    ax.set(title="Docstring length (words)", xlabel="words", ylabel="count")
	    saved.append(_save_figure(fig, out / "docstring_length.png"))

	    # Code length distribution (lines). Values above 80 are clipped to 80 so
	    # a few very long samples do not stretch the x-axis.
	    fig, ax = plt.subplots(figsize=(7, 4))
	    code_lines = df["code"].str.count("\n") + 1
	    sns.histplot(code_lines.clip(upper=80), bins=40, ax=ax)
	    ax.set(title="Code length (lines, clipped at 80)", xlabel="lines", ylabel="count")
	    saved.append(_save_figure(fig, out / "code_length.png"))

	    # Language distribution.
	    fig, ax = plt.subplots(figsize=(7, 4))
	    df["language"].value_counts().plot(kind="bar", ax=ax)
	    ax.set(title="Rows per language", xlabel="language", ylabel="count")
	    saved.append(_save_figure(fig, out / "language_distribution.png"))

	    # Cleaning funnel (if provided).
	    if funnel is not None:
	        fig, ax = plt.subplots(figsize=(7, 4))
	        ax.barh(funnel["step"], funnel["rows_remaining"])
	        ax.invert_yaxis()  # first step at the top
	        ax.set(title="Cleaning funnel (rows remaining)", xlabel="rows")
	        saved.append(_save_figure(fig, out / "cleaning_funnel.png"))

	    return saved


	def run_eda(df: pd.DataFrame, cfg=None, funnel: pd.DataFrame | None = None) -> dict:
	    """Run the full EDA step: compute stats, write plots, dump the stats JSON.

	    Args:
	        df: Dataset to analyse; passed to ``compute_stats`` and ``make_plots``.
	        cfg: Config object exposing ``paths.eda_dir``. When falsy (the default
	            ``None``), it is loaded with ``load_config()``.
	        funnel: Optional cleaning-funnel frame, forwarded to ``make_plots``.

	    Returns:
	        The stats dict from ``compute_stats`` with an extra ``"plots"`` key
	        holding the list of saved plot paths. The same dict is written to
	        ``eda_stats.json`` inside ``cfg.paths.eda_dir``.
	    """
	    cfg = cfg or load_config()
	    stats = compute_stats(df)
	    # make_plots creates the EDA directory, so it must run before the JSON
	    # file is opened below.
	    stats["plots"] = make_plots(df, cfg.paths.eda_dir, funnel)
	    stats_path = Path(cfg.paths.eda_dir) / "eda_stats.json"
	    # Explicit encoding keeps the output independent of the platform default.
	    with open(stats_path, "w", encoding="utf-8") as f:
	        json.dump(stats, f, indent=2)
	    return stats


	if __name__ == "__main__":
	    # Script entry point: load the raw data, clean it, run the EDA, and print
	    # the resulting stats as JSON. The data-pipeline imports are only needed
	    # on this path, so they are done here.
	    from src.data.clean import clean
	    from src.data.load import load_raw

	    cfg = load_config()
	    raw = load_raw(cfg)
	    cleaned, funnel = clean(raw, cfg)
	    stats = run_eda(cleaned, cfg, funnel)
	    print(json.dumps(stats, indent=2))