"""Phase 1b: clean and filter the raw (docstring, code) pairs. CodeSearchNet is noisy. A clean, filtered subset trains and retrieves better than the raw dump. Every filter records how many rows it removed so you can report the funnel in your EDA / write-up. """ from __future__ import annotations import re import sys from pathlib import Path import pandas as pd sys.path.append(str(Path(__file__).resolve().parents[2])) from src.config import load_config # noqa: E402 _WORD_RE = re.compile(r"\b\w+\b") def _first_line(text: str) -> str: """CodeSearchNet docstrings often have a summary first line + details. For NL->code we keep the summary line (the actual 'intent').""" return text.strip().split("\n")[0].strip() if isinstance(text, str) else "" def _word_count(text: str) -> int: return len(_WORD_RE.findall(text)) if isinstance(text, str) else 0 def _ascii_ratio(text: str) -> float: if not text: return 1.0 ascii_chars = sum(1 for ch in text if ord(ch) < 128) return ascii_chars / len(text) def _approx_tokens(code: str) -> int: """Cheap proxy for token count (whitespace + punctuation split).""" return len(re.findall(r"\w+|[^\s\w]", code)) if isinstance(code, str) else 0 def clean(df: pd.DataFrame, cfg=None) -> tuple[pd.DataFrame, pd.DataFrame]: """Return (cleaned_df, funnel_df). funnel_df logs rows removed per step.""" cfg = cfg or load_config() cc = cfg.cleaning funnel = [("raw", len(df))] df = df.copy() # Use only the summary line of each docstring as the NL intent. df["docstring"] = df["docstring"].map(_first_line) df["code"] = df["code"].fillna("").astype(str) # 1. Drop empty docstring or code. df = df[(df["docstring"].str.len() > 0) & (df["code"].str.len() > 0)] funnel.append(("non_empty", len(df))) # 2. Docstring word-count window. wc = df["docstring"].map(_word_count) df = df[(wc >= cc.min_doc_words) & (wc <= cc.max_doc_words)] funnel.append(("doc_word_window", len(df))) # 3. Minimum code length. df = df[df["code"].str.len() >= cc.min_code_chars] funnel.append(("min_code_chars", len(df))) # 4. Maximum code tokens (budget for the generator's context). df = df[df["code"].map(_approx_tokens) <= cc.max_code_tokens] funnel.append(("max_code_tokens", len(df))) # 5. Blocklisted / autogenerated docstrings. pattern = "|".join(re.escape(t) for t in cc.doc_blocklist) if pattern: df = df[~df["docstring"].str.lower().str.contains(pattern, regex=True)] funnel.append(("doc_blocklist", len(df))) # 6. Drop mostly-non-ASCII docstrings (non-English noise). if cc.drop_non_ascii_docs: df = df[df["docstring"].map(_ascii_ratio) >= 0.9] funnel.append(("ascii_docs", len(df))) # 7. Exact duplicate removal (same code or same docstring). if cc.drop_exact_duplicates: df = df.drop_duplicates(subset=["code"]).drop_duplicates(subset=["docstring"]) funnel.append(("dedup", len(df))) df = df.reset_index(drop=True) funnel_df = pd.DataFrame(funnel, columns=["step", "rows_remaining"]) funnel_df["removed"] = funnel_df["rows_remaining"].shift(1).fillna( funnel_df["rows_remaining"].iloc[0] ).astype(int) - funnel_df["rows_remaining"] return df, funnel_df def split(df: pd.DataFrame, cfg=None) -> dict[str, pd.DataFrame]: """Random train/val/test split per the config ratios.""" cfg = cfg or load_config() df = df.sample(frac=1.0, random_state=cfg.split.seed).reset_index(drop=True) n = len(df) n_train = int(n * cfg.split.train) n_val = int(n * cfg.split.val) return { "train": df.iloc[:n_train].reset_index(drop=True), "val": df.iloc[n_train:n_train + n_val].reset_index(drop=True), "test": df.iloc[n_train + n_val:].reset_index(drop=True), } if __name__ == "__main__": from src.data.load import load_raw cfg = load_config() raw = load_raw(cfg) cleaned, funnel = clean(raw, cfg) print(funnel.to_string(index=False)) print("cleaned rows:", len(cleaned))