Rushabh147's picture
Initial deploy to HF Spaces (clean history, LFS for all binaries)
b89e6d6
Raw
History Blame Contribute Delete
4.1 kB
"""Phase 1b: clean and filter the raw (docstring, code) pairs.
CodeSearchNet is noisy. A clean, filtered subset trains and retrieves better
than the raw dump. Every filter records how many rows it removed so you can
report the funnel in your EDA / write-up.
"""
from __future__ import annotations
import re
import sys
from pathlib import Path
import pandas as pd
sys.path.append(str(Path(__file__).resolve().parents[2]))
from src.config import load_config # noqa: E402
# Matches one run of word characters delimited by word boundaries; used by
# _word_count to count the words in a docstring.
_WORD_RE = re.compile(r"\b\w+\b")
def _first_line(text: str) -> str:
    """Return the whitespace-trimmed first line of ``text``.

    CodeSearchNet docstrings usually open with a one-line summary followed by
    details; for NL->code only that summary (the actual intent) is kept.
    Anything that is not a ``str`` yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    # Strip first so leading blank lines do not count as the "first line".
    summary, _, _ = text.strip().partition("\n")
    return summary.strip()
def _word_count(text: str) -> int:
    """Count the ``_WORD_RE`` matches in ``text``; non-string input counts as 0."""
    if not isinstance(text, str):
        return 0
    words = _WORD_RE.findall(text)
    return len(words)
def _ascii_ratio(text: str) -> float:
    """Return the fraction of characters in ``text`` whose code point is < 128.

    Falsy input (empty string, ``None``) is treated as fully ASCII and
    yields ``1.0`` so that it is never rejected by a ratio threshold.
    """
    if text:
        # Each comparison is a bool, so the sum is the number of ASCII chars.
        ascii_total = sum(ord(ch) < 128 for ch in text)
        return ascii_total / len(text)
    return 1.0
def _approx_tokens(code: str) -> int:
    """Cheap proxy for the token count of ``code``.

    Each run of word characters counts as one token and every other
    non-whitespace character counts as its own token. Non-string input
    counts as 0.
    """
    if not isinstance(code, str):
        return 0
    pieces = re.findall(r"\w+|[^\s\w]", code)
    return len(pieces)
def clean(df: pd.DataFrame, cfg=None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Filter raw (docstring, code) pairs and log how many rows each step removes.

    Args:
        df: Frame with at least ``docstring`` and ``code`` columns. It is
            copied first, so the caller's frame is not modified.
        cfg: Config object exposing a ``cleaning`` section; when falsy the
            result of ``load_config()`` is used.

    Returns:
        ``(cleaned_df, funnel_df)``. ``cleaned_df`` has a fresh 0..n-1 index.
        ``funnel_df`` has one row per step with the columns ``step``,
        ``rows_remaining`` and ``removed`` (rows dropped by that step; 0 for
        the initial ``raw`` row).
    """
    cfg = cfg or load_config()
    cc = cfg.cleaning
    funnel = [("raw", len(df))]
    df = df.copy()

    # Use only the summary line of each docstring as the NL intent.
    df["docstring"] = df["docstring"].map(_first_line)
    df["code"] = df["code"].fillna("").astype(str)

    # 1. Drop empty docstring or code.
    df = df[(df["docstring"].str.len() > 0) & (df["code"].str.len() > 0)]
    funnel.append(("non_empty", len(df)))

    # 2. Docstring word-count window.
    wc = df["docstring"].map(_word_count)
    df = df[(wc >= cc.min_doc_words) & (wc <= cc.max_doc_words)]
    funnel.append(("doc_word_window", len(df)))

    # 3. Minimum code length.
    df = df[df["code"].str.len() >= cc.min_code_chars]
    funnel.append(("min_code_chars", len(df)))

    # 4. Maximum code tokens (budget for the generator's context).
    df = df[df["code"].map(_approx_tokens) <= cc.max_code_tokens]
    funnel.append(("max_code_tokens", len(df)))

    # 5. Blocklisted / autogenerated docstrings.
    # The docstring is lower-cased before matching, so the terms must be
    # lower-cased too or any term containing an uppercase letter could never
    # match. Empty terms are skipped: an empty regex alternative matches
    # every string and would drop the whole dataset.
    terms = [re.escape(t.lower()) for t in (cc.doc_blocklist or ()) if t]
    if terms:
        pattern = "|".join(terms)
        df = df[~df["docstring"].str.lower().str.contains(pattern, regex=True)]
    funnel.append(("doc_blocklist", len(df)))

    # 6. Drop mostly-non-ASCII docstrings (non-English noise).
    if cc.drop_non_ascii_docs:
        df = df[df["docstring"].map(_ascii_ratio) >= 0.9]
    funnel.append(("ascii_docs", len(df)))

    # 7. Exact duplicate removal (same code or same docstring).
    if cc.drop_exact_duplicates:
        df = df.drop_duplicates(subset=["code"]).drop_duplicates(subset=["docstring"])
    funnel.append(("dedup", len(df)))

    df = df.reset_index(drop=True)

    funnel_df = pd.DataFrame(funnel, columns=["step", "rows_remaining"])
    # removed = previous step's count minus this step's count; the first row
    # has no predecessor, so it is compared against itself and reports 0.
    funnel_df["removed"] = funnel_df["rows_remaining"].shift(1).fillna(
        funnel_df["rows_remaining"].iloc[0]
    ).astype(int) - funnel_df["rows_remaining"]
    return df, funnel_df
def split(df: pd.DataFrame, cfg=None) -> dict[str, pd.DataFrame]:
    """Shuffle ``df`` and cut it into train/val/test partitions.

    The shuffle is seeded with ``cfg.split.seed``. The train and val sizes
    are ``int(len(df) * ratio)`` for ``cfg.split.train`` and
    ``cfg.split.val``; the test partition receives whatever rows remain.
    Each partition is returned with a fresh 0..n-1 index. When ``cfg`` is
    falsy, ``load_config()`` supplies it.
    """
    cfg = cfg or load_config()
    shuffled = df.sample(frac=1.0, random_state=cfg.split.seed).reset_index(drop=True)

    total = len(shuffled)
    train_end = int(total * cfg.split.train)
    val_end = train_end + int(total * cfg.split.val)

    # Half-open [start, stop) row ranges, in the order the keys are returned.
    bounds = {
        "train": (0, train_end),
        "val": (train_end, val_end),
        "test": (val_end, total),
    }
    return {
        name: shuffled.iloc[start:stop].reset_index(drop=True)
        for name, (start, stop) in bounds.items()
    }
if __name__ == "__main__":
    # Script entry point: load the raw pairs, clean them, and print the
    # per-step funnel followed by the number of surviving rows.
    from src.data.load import load_raw

    config = load_config()
    cleaned_df, funnel_df = clean(load_raw(config), config)
    print(funnel_df.to_string(index=False))
    print("cleaned rows:", len(cleaned_df))