Spaces:
Sleeping
Sleeping
| """Phase 1b: clean and filter the raw (docstring, code) pairs. | |
| CodeSearchNet is noisy. A clean, filtered subset trains and retrieves better | |
| than the raw dump. Every filter records how many rows it removed so you can | |
| report the funnel in your EDA / write-up. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import sys | |
| from pathlib import Path | |
| import pandas as pd | |
| sys.path.append(str(Path(__file__).resolve().parents[2])) | |
| from src.config import load_config # noqa: E402 | |
| _WORD_RE = re.compile(r"\b\w+\b") | |
def _first_line(text: str) -> str:
    """Return the stripped first line of *text*, or "" for non-string input.

    CodeSearchNet docstrings usually open with a one-line summary followed by
    details; for NL->code only that summary (the actual intent) is kept.
    Surrounding whitespace is removed before the first line is taken, so
    leading blank lines do not produce an empty summary.
    """
    if not isinstance(text, str):
        return ""
    summary, _, _ = text.strip().partition("\n")
    return summary.strip()
def _word_count(text: str) -> int:
    """Count the word tokens in *text* using the module-level ``_WORD_RE``.

    Non-string input (e.g. a missing value) counts as zero words.
    """
    if not isinstance(text, str):
        return 0
    return sum(1 for _ in _WORD_RE.finditer(text))
def _ascii_ratio(text: str) -> float:
    """Return the fraction of characters in *text* with a code point below 128.

    Falsy input (empty string, ``None``) is reported as fully ASCII (1.0), so
    it is never rejected by an ASCII-ratio threshold.
    """
    if text:
        # Each comparison is a bool; summing them counts the ASCII characters.
        n_ascii = sum(ord(ch) < 128 for ch in text)
        return n_ascii / len(text)
    return 1.0
def _approx_tokens(code: str) -> int:
    """Estimate the token count of *code* without a real tokenizer.

    A token is either a run of word characters or a single non-space,
    non-word character (punctuation / operator). Non-string input yields 0.
    """
    if not isinstance(code, str):
        return 0
    return sum(1 for _ in re.finditer(r"\w+|[^\s\w]", code))
def clean(df: pd.DataFrame, cfg=None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Filter noisy (docstring, code) pairs and log how many rows each step keeps.

    Args:
        df: Raw pairs; must contain ``docstring`` and ``code`` columns. The
            input frame is not modified (a copy is filtered).
        cfg: Config object exposing ``cfg.cleaning`` with the fields
            ``min_doc_words``, ``max_doc_words``, ``min_code_chars``,
            ``max_code_tokens``, ``doc_blocklist``, ``drop_non_ascii_docs``
            and ``drop_exact_duplicates``. Falls back to ``load_config()``
            when omitted or falsy.

    Returns:
        ``(cleaned_df, funnel_df)``. ``cleaned_df`` has a fresh 0..n-1 index
        and its ``docstring`` column reduced to the summary line.
        ``funnel_df`` has one row per step with columns ``step``,
        ``rows_remaining`` and ``removed`` (rows dropped by that step; 0 for
        the initial ``raw`` row).
    """
    cfg = cfg or load_config()
    cc = cfg.cleaning
    funnel = [("raw", len(df))]
    df = df.copy()

    # Use only the summary line of each docstring as the NL intent.
    df["docstring"] = df["docstring"].map(_first_line)
    df["code"] = df["code"].fillna("").astype(str)

    # 1. Drop empty docstring or code.
    df = df[(df["docstring"].str.len() > 0) & (df["code"].str.len() > 0)]
    funnel.append(("non_empty", len(df)))

    # 2. Docstring word-count window.
    wc = df["docstring"].map(_word_count)
    df = df[(wc >= cc.min_doc_words) & (wc <= cc.max_doc_words)]
    funnel.append(("doc_word_window", len(df)))

    # 3. Minimum code length.
    df = df[df["code"].str.len() >= cc.min_code_chars]
    funnel.append(("min_code_chars", len(df)))

    # 4. Maximum code tokens (budget for the generator's context).
    df = df[df["code"].map(_approx_tokens) <= cc.max_code_tokens]
    funnel.append(("max_code_tokens", len(df)))

    # 5. Blocklisted / autogenerated docstrings.
    # Docstrings are lower-cased before matching, so the terms must be
    # lower-cased too or any term with a capital letter can never match.
    # Empty terms are skipped: an empty alternative matches every string and
    # would silently drop the whole dataset.
    terms = [t.lower() for t in (cc.doc_blocklist or ()) if t]
    pattern = "|".join(re.escape(t) for t in terms)
    if pattern:
        df = df[~df["docstring"].str.lower().str.contains(pattern, regex=True)]
    funnel.append(("doc_blocklist", len(df)))

    # 6. Drop mostly-non-ASCII docstrings (non-English noise).
    if cc.drop_non_ascii_docs:
        df = df[df["docstring"].map(_ascii_ratio) >= 0.9]
    funnel.append(("ascii_docs", len(df)))

    # 7. Exact duplicate removal (same code or same docstring).
    if cc.drop_exact_duplicates:
        df = df.drop_duplicates(subset=["code"]).drop_duplicates(subset=["docstring"])
    funnel.append(("dedup", len(df)))

    df = df.reset_index(drop=True)

    funnel_df = pd.DataFrame(funnel, columns=["step", "rows_remaining"])
    # removed = previous step's count minus this step's count; the first row
    # is compared against itself so "raw" reports 0 removed.
    funnel_df["removed"] = funnel_df["rows_remaining"].shift(1).fillna(
        funnel_df["rows_remaining"].iloc[0]
    ).astype(int) - funnel_df["rows_remaining"]
    return df, funnel_df
def split(df: pd.DataFrame, cfg=None) -> dict[str, pd.DataFrame]:
    """Shuffle *df* and cut it into train / val / test partitions.

    The shuffle is seeded with ``cfg.split.seed``. The train and val sizes are
    ``int(len(df) * ratio)`` for ``cfg.split.train`` and ``cfg.split.val``;
    the test partition receives every remaining row. Each returned frame has
    its index reset. ``cfg`` falls back to ``load_config()`` when omitted or
    falsy.
    """
    cfg = cfg or load_config()
    ratios = cfg.split

    shuffled = df.sample(frac=1.0, random_state=ratios.seed).reset_index(drop=True)
    total = len(shuffled)
    train_end = int(total * ratios.train)
    val_end = train_end + int(total * ratios.val)

    # Contiguous, non-overlapping spans over the shuffled rows.
    spans = {
        "train": slice(None, train_end),
        "val": slice(train_end, val_end),
        "test": slice(val_end, None),
    }
    return {
        name: shuffled.iloc[span].reset_index(drop=True)
        for name, span in spans.items()
    }
if __name__ == "__main__":
    # Imported here so the raw-data loader is only required in script mode.
    from src.data.load import load_raw

    config = load_config()
    cleaned_df, funnel_df = clean(load_raw(config), config)

    # Print the per-step funnel first, then the final row count.
    print(funnel_df.to_string(index=False))
    print("cleaned rows:", len(cleaned_df))