Spaces:

Rushabh147
/

code-gen-assistant

Sleeping

App Files Files Community

code-gen-assistant / src /data /clean.py

Rushabh147

Initial deploy to HF Spaces (clean history, LFS for all binaries)

b89e6d6 6 days ago

Raw

History Blame Contribute Delete

4.1 kB

	"""Phase 1b: clean and filter the raw (docstring, code) pairs.

	CodeSearchNet is noisy. A clean, filtered subset trains and retrieves better
	than the raw dump. Every filter records how many rows it removed so you can
	report the funnel in your EDA / write-up.
	"""
	from __future__ import annotations

	import re
	import sys
	from pathlib import Path

	import pandas as pd

	sys.path.append(str(Path(__file__).resolve().parents[2]))
	from src.config import load_config # noqa: E402

	# Word matcher used by _word_count: each maximal run of \w characters
	# (letters, digits, underscore) delimited by word boundaries is one word.
	_WORD_RE = re.compile(r"\b\w+\b")


	def _first_line(text: str) -> str:
	"""CodeSearchNet docstrings often have a summary first line + details.
	For NL->code we keep the summary line (the actual 'intent')."""
	return text.strip().split("\n")[0].strip() if isinstance(text, str) else ""


	def _word_count(text: str) -> int:
	    """Count the _WORD_RE matches in *text*; non-string input counts as 0."""
	    if not isinstance(text, str):
	        return 0
	    return sum(1 for _ in _WORD_RE.finditer(text))


	def _ascii_ratio(text: str) -> float:
	if not text:
	return 1.0
	ascii_chars = sum(1 for ch in text if ord(ch) < 128)
	return ascii_chars / len(text)


	def _approx_tokens(code: str) -> int:
	"""Cheap proxy for token count (whitespace + punctuation split)."""
	return len(re.findall(r"\w+\|[^\s\w]", code)) if isinstance(code, str) else 0


	def clean(df: pd.DataFrame, cfg=None) -> tuple[pd.DataFrame, pd.DataFrame]:
	    """Filter raw (docstring, code) pairs and log how many rows each step drops.

	    Args:
	        df: Frame with at least a "docstring" and a "code" column. It is
	            copied, never mutated.
	        cfg: Config object exposing ``cfg.cleaning`` with ``min_doc_words``,
	            ``max_doc_words``, ``min_code_chars``, ``max_code_tokens``,
	            ``doc_blocklist``, ``drop_non_ascii_docs`` and
	            ``drop_exact_duplicates``. Falls back to ``load_config()``.

	    Returns:
	        ``(cleaned_df, funnel_df)``. ``cleaned_df`` has a fresh 0..n-1 index.
	        ``funnel_df`` has one row per step with columns ``step``,
	        ``rows_remaining`` and ``removed`` (rows dropped by that step; 0 for
	        the initial "raw" row).
	    """
	    cfg = cfg or load_config()
	    cc = cfg.cleaning
	    funnel = [("raw", len(df))]
	    df = df.copy()

	    # Use only the summary line of each docstring as the NL intent.
	    df["docstring"] = df["docstring"].map(_first_line)
	    df["code"] = df["code"].fillna("").astype(str)

	    # 1. Drop empty docstring or code.
	    has_doc = df["docstring"].str.len() > 0
	    has_code = df["code"].str.len() > 0
	    df = df[has_doc & has_code]
	    funnel.append(("non_empty", len(df)))

	    # 2. Docstring word-count window (inclusive on both ends).
	    wc = df["docstring"].map(_word_count)
	    df = df[(wc >= cc.min_doc_words) & (wc <= cc.max_doc_words)]
	    funnel.append(("doc_word_window", len(df)))

	    # 3. Minimum code length.
	    df = df[df["code"].str.len() >= cc.min_code_chars]
	    funnel.append(("min_code_chars", len(df)))

	    # 4. Maximum code tokens (budget for the generator's context).
	    df = df[df["code"].map(_approx_tokens) <= cc.max_code_tokens]
	    funnel.append(("max_code_tokens", len(df)))

	    # 5. Blocklisted / autogenerated docstrings.
	    # Terms are escaped individually and joined with a bare "|" so the result
	    # is a real alternation (an escaped "\|" would only match the literal
	    # text "term1|term2"). Terms are lowercased because they are matched
	    # against the lowercased docstring.
	    pattern = "|".join(re.escape(t.lower()) for t in cc.doc_blocklist)
	    if pattern:
	        blocked = df["docstring"].str.lower().str.contains(pattern, regex=True)
	        df = df[~blocked]
	    funnel.append(("doc_blocklist", len(df)))

	    # 6. Drop mostly-non-ASCII docstrings (non-English noise).
	    if cc.drop_non_ascii_docs:
	        df = df[df["docstring"].map(_ascii_ratio) >= 0.9]
	    funnel.append(("ascii_docs", len(df)))

	    # 7. Exact duplicate removal (same code or same docstring).
	    if cc.drop_exact_duplicates:
	        df = df.drop_duplicates(subset=["code"]).drop_duplicates(subset=["docstring"])
	    funnel.append(("dedup", len(df)))

	    df = df.reset_index(drop=True)
	    funnel_df = pd.DataFrame(funnel, columns=["step", "rows_remaining"])
	    # Rows removed by a step = previous step's count minus this step's count.
	    # The first row has no predecessor, so it is compared with itself (0).
	    remaining = funnel_df["rows_remaining"]
	    previous = remaining.shift(1).fillna(remaining.iloc[0]).astype(int)
	    funnel_df["removed"] = previous - remaining
	    return df, funnel_df


	def split(df: pd.DataFrame, cfg=None) -> dict[str, pd.DataFrame]:
	    """Shuffle *df* and cut it into train/val/test parts per the config ratios.

	    Reads ``cfg.split.seed``, ``cfg.split.train`` and ``cfg.split.val``
	    (``load_config()`` is used when *cfg* is not given). Train and val sizes
	    are truncated to whole rows; the test part receives every remaining row.
	    Each part is returned with a fresh 0..n-1 index.
	    """
	    cfg = cfg or load_config()
	    ratios = cfg.split
	    shuffled = df.sample(frac=1.0, random_state=ratios.seed).reset_index(drop=True)

	    total = len(shuffled)
	    train_end = int(total * ratios.train)
	    val_end = train_end + int(total * ratios.val)

	    bounds = {
	        "train": (0, train_end),
	        "val": (train_end, val_end),
	        "test": (val_end, total),
	    }
	    return {
	        name: shuffled.iloc[start:stop].reset_index(drop=True)
	        for name, (start, stop) in bounds.items()
	    }


	if __name__ == "__main__":
	    # Script entry point: load the raw pairs, clean them, then print the
	    # per-step funnel followed by the final row count.
	    from src.data.load import load_raw

	    cfg = load_config()
	    cleaned, funnel = clean(load_raw(cfg), cfg)
	    print(funnel.to_string(index=False))
	    print("cleaned rows:", len(cleaned))