Rushabh147's picture
Initial deploy to HF Spaces (clean history, LFS for all binaries)
b89e6d6
Raw
History Blame Contribute Delete
2.94 kB
"""Phase 1a: load raw CodeSearchNet (or the synthetic sample).
We normalise everything to a pandas DataFrame with two key columns:
- docstring : natural-language description (model INPUT)
- code : function body (model TARGET)
plus useful metadata (language, repo, url) for traceability.
"""
from __future__ import annotations
import sys
from pathlib import Path
import pandas as pd
sys.path.append(str(Path(__file__).resolve().parents[2]))
from src.config import load_config # noqa: E402
from src.data.make_sample import make_sample # noqa: E402
# Map CodeSearchNet's verbose column names to our canonical names.
_COLUMN_MAP = {
"func_documentation_string": "docstring",
"func_code_string": "code",
"language": "language",
"repository_name": "repo",
"func_code_url": "url",
}
def _from_huggingface(cfg) -> pd.DataFrame:
"""Stream CodeSearchNet per-language from HuggingFace and concatenate."""
from datasets import load_dataset
max_rows = getattr(cfg.data, "max_rows", 0)
frames = []
for lang in cfg.data.languages:
print(f"[load] downloading CodeSearchNet '{lang}' ...")
# CodeSearchNet ships train/validation/test; we pull all and re-split later.
ds = load_dataset(cfg.data.hf_dataset_id, lang)
for split in ds.keys():
df = ds[split].to_pandas()
keep = [c for c in _COLUMN_MAP if c in df.columns]
df = df[keep].rename(columns=_COLUMN_MAP)
frames.append(df)
out = pd.concat(frames, ignore_index=True)
print(f"[load] total raw rows: {len(out):,}")
if max_rows and max_rows > 0 and len(out) > max_rows:
out = out.sample(n=max_rows, random_state=42).reset_index(drop=True)
print(f"[load] capped to {max_rows:,} rows (max_rows setting)")
return out
def _from_sample(cfg) -> pd.DataFrame:
print(f"[load] using synthetic sample (n={cfg.data.sample_size})")
df = make_sample(cfg.data.sample_size, cfg.split.seed)
keep = [c for c in _COLUMN_MAP if c in df.columns]
return df[keep].rename(columns=_COLUMN_MAP)
def load_raw(cfg=None) -> pd.DataFrame:
cfg = cfg or load_config()
if cfg.data.use_sample:
df = _from_sample(cfg)
else:
try:
df = _from_huggingface(cfg)
except Exception as e: # noqa: BLE001
print(
f"[load] HuggingFace load failed ({e}).\n"
f" Tip: try hf_dataset_id: 'code-search-net/code_search_net' "
f"in config.yaml, or set use_sample: true.",
file=sys.stderr,
)
raise
# Guarantee the columns downstream code expects, even if metadata missing.
for col in ("docstring", "code", "language", "repo", "url"):
if col not in df.columns:
df[col] = ""
return df
if __name__ == "__main__":
df = load_raw()
print(df.shape)
print(df.head(3).to_string())