File size: 4,096 Bytes
b89e6d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""Phase 1b: clean and filter the raw (docstring, code) pairs.

CodeSearchNet is noisy. A clean, filtered subset trains and retrieves better
than the raw dump. Every filter records how many rows it removed so you can
report the funnel in your EDA / write-up.
"""
from __future__ import annotations

import re
import sys
from pathlib import Path

import pandas as pd

sys.path.append(str(Path(__file__).resolve().parents[2]))
from src.config import load_config  # noqa: E402

_WORD_RE = re.compile(r"\b\w+\b")


def _first_line(text: str) -> str:
    """Return the first line of a docstring with surrounding whitespace removed.

    CodeSearchNet docstrings are often a summary line followed by details;
    for NL->code only that summary line (the actual 'intent') is kept.
    Anything that is not a ``str`` yields the empty string.
    """
    if not isinstance(text, str):
        return ""
    # Strip first so leading blank lines do not produce an empty summary.
    summary, _, _ = text.strip().partition("\n")
    return summary.strip()


def _word_count(text: str) -> int:
    """Count the ``_WORD_RE`` matches in ``text``.

    Anything that is not a ``str`` counts as zero words.
    """
    if not isinstance(text, str):
        return 0
    words = _WORD_RE.findall(text)
    return len(words)


def _ascii_ratio(text: str) -> float:
    """Return the fraction of characters in ``text`` with a code point below 128.

    Empty (falsy) input returns ``1.0``.
    """
    if text:
        # Each comparison is a bool, so summing them counts the ASCII chars.
        n_ascii = sum(ord(ch) < 128 for ch in text)
        return n_ascii / len(text)
    return 1.0


def _approx_tokens(code: str) -> int:
    """Cheap proxy for the token count of ``code``.

    Every run of word characters counts as one token, and every other
    non-whitespace character counts as one token on its own. Anything that
    is not a ``str`` yields 0.
    """
    if not isinstance(code, str):
        return 0
    pieces = re.findall(r"\w+|[^\s\w]", code)
    return len(pieces)


def clean(df: pd.DataFrame, cfg=None) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Filter noisy (docstring, code) pairs and log how many rows each step removes.

    Each docstring is first reduced to its first line, then the filters below
    run in a fixed order. After every filter the number of surviving rows is
    recorded, so the funnel can be reported.

    Args:
        df: Frame with ``docstring`` and ``code`` columns. It is copied before
            any column is rewritten, so the caller's frame is left untouched.
        cfg: Config object exposing a ``cleaning`` section with
            ``min_doc_words``, ``max_doc_words``, ``min_code_chars``,
            ``max_code_tokens``, ``doc_blocklist``, ``drop_non_ascii_docs`` and
            ``drop_exact_duplicates``. When falsy, ``load_config()`` is used.

    Returns:
        ``(cleaned_df, funnel_df)``. ``cleaned_df`` carries a fresh 0..n-1
        index. ``funnel_df`` has the columns ``step``, ``rows_remaining`` and
        ``removed`` (rows dropped by that step; 0 for the initial ``raw`` row).
    """
    cfg = cfg or load_config()
    cc = cfg.cleaning
    funnel = [("raw", len(df))]
    df = df.copy()

    # Use only the summary line of each docstring as the NL intent.
    df["docstring"] = df["docstring"].map(_first_line)
    df["code"] = df["code"].fillna("").astype(str)

    # 1. Drop empty docstring or code.
    df = df[(df["docstring"].str.len() > 0) & (df["code"].str.len() > 0)]
    funnel.append(("non_empty", len(df)))

    # 2. Docstring word-count window.
    wc = df["docstring"].map(_word_count)
    df = df[(wc >= cc.min_doc_words) & (wc <= cc.max_doc_words)]
    funnel.append(("doc_word_window", len(df)))

    # 3. Minimum code length.
    df = df[df["code"].str.len() >= cc.min_code_chars]
    funnel.append(("min_code_chars", len(df)))

    # 4. Maximum code tokens (budget for the generator's context).
    df = df[df["code"].map(_approx_tokens) <= cc.max_code_tokens]
    funnel.append(("max_code_tokens", len(df)))

    # 5. Blocklisted / autogenerated docstrings.
    # Docstrings are lower-cased before matching, so the terms have to be
    # lower-cased as well; otherwise a term containing an uppercase letter
    # could never match. Empty terms are skipped because an empty regex
    # alternative matches every docstring and would drop all rows.
    terms = [t.lower() for t in (cc.doc_blocklist or ()) if t]
    pattern = "|".join(re.escape(t) for t in terms)
    if pattern:
        blocked = df["docstring"].str.lower().str.contains(pattern, regex=True)
        df = df[~blocked]
    funnel.append(("doc_blocklist", len(df)))

    # 6. Drop mostly-non-ASCII docstrings (non-English noise).
    if cc.drop_non_ascii_docs:
        df = df[df["docstring"].map(_ascii_ratio) >= 0.9]
    funnel.append(("ascii_docs", len(df)))

    # 7. Exact duplicate removal (same code or same docstring).
    if cc.drop_exact_duplicates:
        df = df.drop_duplicates(subset=["code"]).drop_duplicates(subset=["docstring"])
    funnel.append(("dedup", len(df)))

    df = df.reset_index(drop=True)
    funnel_df = pd.DataFrame(funnel, columns=["step", "rows_remaining"])
    remaining = funnel_df["rows_remaining"]
    # removed = previous step's count minus this step's count; the first row
    # has no predecessor, so it is compared against itself and reports 0.
    previous = remaining.shift(1).fillna(remaining.iloc[0]).astype(int)
    funnel_df["removed"] = previous - remaining
    return df, funnel_df


def split(df: pd.DataFrame, cfg=None) -> dict[str, pd.DataFrame]:
    """Shuffle ``df`` and cut it into train/val/test frames.

    The sizes of ``train`` and ``val`` are the truncated products of the row
    count with ``cfg.split.train`` and ``cfg.split.val``; ``test`` receives
    every remaining row. The shuffle is seeded with ``cfg.split.seed``. When
    ``cfg`` is falsy, ``load_config()`` supplies it. Each returned frame has
    its index reset.
    """
    if not cfg:
        cfg = load_config()
    ratios = cfg.split

    shuffled = df.sample(frac=1.0, random_state=ratios.seed).reset_index(drop=True)
    total = len(shuffled)
    train_end = int(total * ratios.train)
    val_end = train_end + int(total * ratios.val)

    spans = {
        "train": slice(None, train_end),
        "val": slice(train_end, val_end),
        "test": slice(val_end, None),
    }
    return {
        name: shuffled.iloc[span].reset_index(drop=True)
        for name, span in spans.items()
    }


if __name__ == "__main__":
    # Script entry point: load the raw pairs, clean them, and print the
    # per-step funnel followed by the final row count.
    from src.data.load import load_raw

    config = load_config()
    cleaned_df, funnel_df = clean(load_raw(config), config)
    print(funnel_df.to_string(index=False))
    print("cleaned rows:", len(cleaned_df))