#!/usr/bin/env python3
"""Phase 1: Data preparation for Qwen3-4B"""
import sys
from pathlib import Path

from datasets import Dataset, load_dataset
from tqdm import tqdm

# Config
MAX_SAMPLES = 10000
NGRAM_SIZE = 13
DATA_DIR = Path("./qwen3_pipeline/data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

print("=" * 70)
print("PHASE 1: DATA PREPARATION")
print("=" * 70)

# [1/4] Load training data, falling back to a second dataset if the first fails
print("\n[1/4] Loading rStar-Coder...")
try:
    ds = load_dataset(
        "microsoft/rStar-Coder",
        "synthetic_sft",  # IMPORTANT: must specify the config name
        split="train",
    )
    print(f"  Loaded: {len(ds)} examples")
except Exception as e:
    print(f"  rStar-Coder failed: {e}")
    print("  Trying OpenCodeReasoning...")
    try:
        ds = load_dataset("nvidia/OpenCodeReasoning", split="train")
        print(f"  Loaded: {len(ds)} examples")
    except Exception:
        print("  ERROR: Could not load any dataset")
        sys.exit(1)

# [2/4] Decontamination index: collect every 13-gram of benchmark text so
# training samples that overlap HumanEval/MBPP can be dropped later.
print("\n[2/4] Building decontamination index...")
benchmark_ngrams = set()
for name, loader in [
    ("HumanEval", lambda: load_dataset("openai_humaneval", split="test")),
    ("MBPP", lambda: load_dataset("mbpp", "sanitized", split="test")),
]:
    try:
        bm = loader()
        for item in bm:
            text = str(item.get("prompt", "")) + str(
                item.get("canonical_solution", item.get("code", ""))
            )
            words = text.lower().split()
            for i in range(max(0, len(words) - NGRAM_SIZE + 1)):
                benchmark_ngrams.add(" ".join(words[i : i + NGRAM_SIZE]))
        print(f"  {name}: indexed")
    except Exception:
        print(f"  {name}: skipped")
print(f"  Total n-grams: {len(benchmark_ngrams)}")


def is_contaminated(text):
    """Return True if any 13-gram of `text` appears in a benchmark."""
    if not benchmark_ngrams:
        return False
    words = text.lower().split()
    return any(
        " ".join(words[i : i + NGRAM_SIZE]) in benchmark_ngrams
        for i in range(max(0, len(words) - NGRAM_SIZE + 1))
    )


# [3/4] Filter: keep Python samples of reasonable length that define a
# function or class and do not overlap the benchmarks.
print("\n[3/4] Filtering...")
filtered = []
for ex in tqdm(ds, desc="  Processing"):
    # Field names vary across datasets, so try the common aliases in order.
    problem = (
        ex.get("problem")
        or ex.get("question")
        or ex.get("prompt")
        or ex.get("instruction")
        or ""
    )
    solution = (
        ex.get("solution")
        or ex.get("answer")
        or ex.get("response")
        or ex.get("output")
        or ""
    )
    if not problem or not solution:
        continue
    sol = str(solution)
    if len(sol) < 50 or len(sol) > 12000:
        continue
    lang = str(ex.get("language", ex.get("lang", "python"))).lower()
    if lang not in ("python", "python3", "py", ""):
        continue
    if not any(kw in sol for kw in ("def ", "class ")):
        continue
    if benchmark_ngrams and is_contaminated(f"{problem} {sol}"):
        continue

    # Difficulty score: longer solutions and algorithmic keywords suggest
    # harder problems; a dataset-provided difficulty label is blended in.
    length_score = min(len(sol) / 8000, 1.0)
    hard_kw = ["dynamic programming", "dp[", "recursion", "bfs", "dfs", "graph", "heap"]
    kw_score = sum(1 for k in hard_kw if k in sol.lower()) / len(hard_kw)
    diff = str(ex.get("difficulty", "")).lower()
    label_score = 0.2 if "easy" in diff else 0.8 if "hard" in diff else 0.5
    filtered.append(
        {
            "problem": problem,
            "solution": sol,
            "score": 0.4 * length_score + 0.3 * kw_score + 0.3 * label_score,
        }
    )
    # Stop early once we have a 3x oversample to choose from.
    if len(filtered) >= MAX_SAMPLES * 3:
        break
print(f"  Filtered: {len(filtered)} samples")

# [4/4] Curriculum sort: ascending difficulty score, then cap at MAX_SAMPLES
# (this keeps the lowest-scoring, i.e. easiest, samples).
print("\n[4/4] Curriculum sorting...")
filtered.sort(key=lambda x: x["score"])
filtered = filtered[:MAX_SAMPLES]
print(f"  Final: {len(filtered)} samples (easy→hard)")

# Format as chat-style SFT messages and save
sft_list = [
    {
        "messages": [
            {"role": "system", "content": "You are a programming expert."},
            {"role": "user", "content": ex["problem"]},
            {"role": "assistant", "content": ex["solution"]},
        ]
    }
    for ex in filtered
]
sft_ds = Dataset.from_list(sft_list)
sft_ds.save_to_disk(str(DATA_DIR / "sft"))
print(f"\n✓ Saved to: {DATA_DIR / 'sft'}")
print(f"✓ Samples: {len(sft_ds)}")
print("\n➡️ Next: python phase2_train.py")
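
# --- Optional sanity check (illustrative addition, not part of the original
# pipeline): reload the saved dataset with datasets.load_from_disk and print
# one formatted sample to confirm the chat structure round-trips to disk.
from datasets import load_from_disk

check = load_from_disk(str(DATA_DIR / "sft"))
print(f"\nSanity check: reloaded {len(check)} samples")
print(check[0]["messages"][1]["content"][:200])  # first user prompt, truncated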