| | |
| | """Phase 1: Data preparation for Qwen3-4B""" |
| |
|
| | import sys |
| | from pathlib import Path |
| | from datasets import Dataset, load_dataset |
| | from tqdm import tqdm |
| |
|
| | |
# --- Pipeline configuration -------------------------------------------------
MAX_SAMPLES = 10000                        # size of the final SFT set
NGRAM_SIZE = 13                            # n-gram length for decontamination
DATA_DIR = Path("./qwen3_pipeline/data")   # all phase-1 artifacts land here
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Banner for the console log.
banner = "=" * 70
print(banner)
print("PHASE 1: DATA PREPARATION")
print(banner)
| |
|
| | |
| | print("\n[1/4] Loading rStar-Coder...") |
| | try: |
| | ds = load_dataset( |
| | "microsoft/rStar-Coder", |
| | "synthetic_sft", |
| | split="train" |
| | ) |
| | print(f" Loaded: {len(ds)} examples") |
| | except Exception as e: |
| | print(f" rStar-Coder failed: {e}") |
| | print(" Trying OpenCodeReasoning...") |
| | try: |
| | ds = load_dataset("nvidia/OpenCodeReasoning", split="train") |
| | print(f" Loaded: {len(ds)} examples") |
| | except: |
| | print(" ERROR: Could not load any dataset") |
| | sys.exit(1) |
| |
|
| | |
| | print("\n[2/4] Building decontamination index...") |
| | benchmark_ngrams = set() |
| |
|
| | for name, loader in [ |
| | ("HumanEval", lambda: load_dataset("openai_humaneval", split="test")), |
| | ("MBPP", lambda: load_dataset("mbpp", "sanitized", split="test")), |
| | ]: |
| | try: |
| | bm = loader() |
| | for item in bm: |
| | text = str(item.get("prompt","")) + str(item.get("canonical_solution", item.get("code",""))) |
| | words = text.lower().split() |
| | for i in range(max(0, len(words) - NGRAM_SIZE + 1)): |
| | benchmark_ngrams.add(" ".join(words[i:i+NGRAM_SIZE])) |
| | print(f" {name}: indexed") |
| | except: |
| | print(f" {name}: skip") |
| |
|
| | print(f" Total n-grams: {len(benchmark_ngrams)}") |
| |
|
def is_contaminated(text, ngrams=None, ngram_size=None):
    """Return True if any ``ngram_size``-word n-gram of *text* is in *ngrams*.

    Generalized from hard-coded module globals: existing callers that pass
    only ``text`` behave exactly as before, while the optional parameters
    make the check reusable and unit-testable.

    Args:
        text: Candidate text (problem + solution) to screen.
        ngrams: Set of benchmark n-grams to test against. Defaults to the
            module-level ``benchmark_ngrams`` index (resolved at call time).
        ngram_size: Words per n-gram. Defaults to ``NGRAM_SIZE``.

    Returns:
        bool: True if at least one n-gram overlaps the benchmark index.
    """
    if ngrams is None:
        ngrams = benchmark_ngrams
    if ngram_size is None:
        ngram_size = NGRAM_SIZE
    if not ngrams:
        # Empty index means decontamination is effectively disabled.
        return False
    words = text.lower().split()
    return any(
        " ".join(words[i:i + ngram_size]) in ngrams
        for i in range(max(0, len(words) - ngram_size + 1))
    )
| |
|
| | |
| | print("\n[3/4] Filtering...") |
| | filtered = [] |
| |
|
| | for ex in tqdm(ds, desc=" Processing"): |
| | problem = (ex.get("problem") or ex.get("question") or |
| | ex.get("prompt") or ex.get("instruction") or "") |
| | solution = (ex.get("solution") or ex.get("answer") or |
| | ex.get("response") or ex.get("output") or "") |
| | |
| | if not problem or not solution: |
| | continue |
| | |
| | sol = str(solution) |
| | if len(sol) < 50 or len(sol) > 12000: |
| | continue |
| | |
| | lang = str(ex.get("language", ex.get("lang", "python"))).lower() |
| | if lang not in ("python", "python3", "py", ""): |
| | continue |
| | |
| | if not any(kw in sol for kw in ("def ", "class ")): |
| | continue |
| | |
| | if benchmark_ngrams and is_contaminated(f"{problem} {sol}"): |
| | continue |
| |
|
| | |
| | length_score = min(len(sol) / 8000, 1.0) |
| | hard_kw = ["dynamic programming","dp[","recursion","bfs","dfs","graph","heap"] |
| | kw_score = sum(1 for k in hard_kw if k in sol.lower()) / len(hard_kw) |
| | diff = str(ex.get("difficulty", "")).lower() |
| | label_score = 0.2 if "easy" in diff else 0.8 if "hard" in diff else 0.5 |
| |
|
| | filtered.append({ |
| | "problem": problem, |
| | "solution": sol, |
| | "score": 0.4 * length_score + 0.3 * kw_score + 0.3 * label_score, |
| | }) |
| | |
| | if len(filtered) >= MAX_SAMPLES * 3: |
| | break |
| |
|
| | print(f" Filtered: {len(filtered)} samples") |
| |
|
| | |
| | print("\n[4/4] Curriculum sorting...") |
| | filtered.sort(key=lambda x: x["score"]) |
| | filtered = filtered[:MAX_SAMPLES] |
| | print(f" Final: {len(filtered)} samples (easy→hard)") |
| |
|
| | |
def _as_chat(example):
    """Wrap one problem/solution pair in the chat-message SFT schema."""
    return {
        "messages": [
            {"role": "system", "content": "You are a programming expert."},
            {"role": "user", "content": example["problem"]},
            {"role": "assistant", "content": example["solution"]},
        ]
    }


sft_list = [_as_chat(ex) for ex in filtered]

# Persist as a HuggingFace dataset for the training phase.
sft_ds = Dataset.from_list(sft_list)
sft_ds.save_to_disk(str(DATA_DIR / "sft"))

print(f"\n✓ Saved to: {DATA_DIR / 'sft'}")
print(f"✓ Samples: {len(sft_ds)}")
print("\n➡️ Next: python phase2_train.py")
| |
|