#!/usr/bin/env python3
"""Phase 1: Data preparation for Qwen3-4B"""
import sys
from pathlib import Path
from datasets import Dataset, load_dataset
from tqdm import tqdm
# --- Configuration --------------------------------------------------------
MAX_SAMPLES = 10000  # final number of SFT examples to keep after curriculum cut
NGRAM_SIZE = 13      # word n-gram window used for benchmark decontamination
DATA_DIR = Path("./qwen3_pipeline/data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Banner
print("=" * 70)
print("PHASE 1: DATA PREPARATION")
print("=" * 70)
# [1/4] Load the training corpus: rStar-Coder first, OpenCodeReasoning as a
# fallback. If neither loads, the pipeline cannot proceed, so exit non-zero.
print("\n[1/4] Loading rStar-Coder...")
try:
    ds = load_dataset(
        "microsoft/rStar-Coder",
        "synthetic_sft",  # IMPORTANT: Must specify config
        split="train",
    )
    print(f" Loaded: {len(ds)} examples")
except Exception as e:
    print(f" rStar-Coder failed: {e}")
    print(" Trying OpenCodeReasoning...")
    try:
        ds = load_dataset("nvidia/OpenCodeReasoning", split="train")
        print(f" Loaded: {len(ds)} examples")
    except Exception as e:
        # Bare `except:` would also swallow KeyboardInterrupt/SystemExit and
        # hide the failure reason; report the cause before exiting.
        print(f" ERROR: Could not load any dataset: {e}")
        sys.exit(1)
# [2/4] Build an n-gram index over benchmark test sets (HumanEval, MBPP) so
# training examples that overlap with evaluation data can be dropped later.
print("\n[2/4] Building decontamination index...")
benchmark_ngrams = set()
for name, loader in [
    ("HumanEval", lambda: load_dataset("openai_humaneval", split="test")),
    ("MBPP", lambda: load_dataset("mbpp", "sanitized", split="test")),
]:
    try:
        bm = loader()
        for item in bm:
            # Index prompt + reference solution; field names differ per benchmark.
            text = str(item.get("prompt", "")) + str(item.get("canonical_solution", item.get("code", "")))
            words = text.lower().split()
            for i in range(max(0, len(words) - NGRAM_SIZE + 1)):
                benchmark_ngrams.add(" ".join(words[i:i + NGRAM_SIZE]))
        print(f" {name}: indexed")
    except Exception:
        # Benchmarks are best-effort: skip on download/schema errors, but do
        # not use a bare `except:`, which would also catch SystemExit.
        print(f" {name}: skip")
print(f" Total n-grams: {len(benchmark_ngrams)}")
def is_contaminated(text, ngrams=None, ngram_size=None):
    """Return True if *text* shares any word n-gram with the benchmark index.

    Args:
        text: Text to check (typically problem statement + solution).
        ngrams: Set of lowercase, space-joined n-grams to test against.
            Defaults to the module-level ``benchmark_ngrams`` index.
        ngram_size: Sliding-window size in words. Defaults to ``NGRAM_SIZE``.

    Returns:
        bool: True when any window of ``ngram_size`` lowercased words from
        *text* appears in ``ngrams``; False when the index is empty.
    """
    if ngrams is None:
        ngrams = benchmark_ngrams
    if ngram_size is None:
        ngram_size = NGRAM_SIZE
    if not ngrams:
        # No index built (e.g. benchmark downloads failed) => nothing to match.
        return False
    words = text.lower().split()
    return any(
        " ".join(words[i:i + ngram_size]) in ngrams
        for i in range(max(0, len(words) - ngram_size + 1))
    )
# [3/4] Quality / language / decontamination filtering, plus a difficulty
# score per surviving example (used by the curriculum sort below).
print("\n[3/4] Filtering...")

_PROBLEM_KEYS = ("problem", "question", "prompt", "instruction")
_SOLUTION_KEYS = ("solution", "answer", "response", "output")
_HARD_KEYWORDS = ["dynamic programming", "dp[", "recursion", "bfs", "dfs", "graph", "heap"]

def _first_truthy(record, keys):
    """Return the first non-empty value of record[key] over keys, else ""."""
    for key in keys:
        value = record.get(key)
        if value:
            return value
    return ""

def _difficulty_score(record, sol):
    """Blend length, hard-keyword density, and difficulty label into one score."""
    length_score = min(len(sol) / 8000, 1.0)
    lowered = sol.lower()
    kw_score = sum(1 for kw in _HARD_KEYWORDS if kw in lowered) / len(_HARD_KEYWORDS)
    diff = str(record.get("difficulty", "")).lower()
    if "easy" in diff:
        label_score = 0.2
    elif "hard" in diff:
        label_score = 0.8
    else:
        label_score = 0.5
    return 0.4 * length_score + 0.3 * kw_score + 0.3 * label_score

filtered = []
for record in tqdm(ds, desc=" Processing"):
    problem = _first_truthy(record, _PROBLEM_KEYS)
    solution = _first_truthy(record, _SOLUTION_KEYS)
    if not problem or not solution:
        continue
    sol = str(solution)
    # Length window: long enough to be real code, short enough to train on.
    if not (50 <= len(sol) <= 12000):
        continue
    lang = str(record.get("language", record.get("lang", "python"))).lower()
    if lang not in ("python", "python3", "py", ""):
        continue
    # Must look like code rather than prose.
    if "def " not in sol and "class " not in sol:
        continue
    if benchmark_ngrams and is_contaminated(f"{problem} {sol}"):
        continue
    filtered.append({
        "problem": problem,
        "solution": sol,
        "score": _difficulty_score(record, sol),
    })
    # Collect 3x the target so the curriculum cut still has headroom.
    if len(filtered) >= MAX_SAMPLES * 3:
        break
print(f" Filtered: {len(filtered)} samples")
# [4/4] Curriculum ordering: ascending difficulty score, then keep the
# first MAX_SAMPLES so training proceeds from easy to hard examples.
print("\n[4/4] Curriculum sorting...")
filtered = sorted(filtered, key=lambda item: item["score"])[:MAX_SAMPLES]
print(f" Final: {len(filtered)} samples (easy→hard)")
# Convert each problem/solution pair into the chat ``messages`` layout
# expected by SFT trainers, then persist the dataset to disk.
SYSTEM_PROMPT = "You are a programming expert."

def _to_chat(example):
    """Wrap one filtered example in a three-turn chat record."""
    return {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": example["problem"]},
            {"role": "assistant", "content": example["solution"]},
        ]
    }

sft_list = [_to_chat(example) for example in filtered]
sft_ds = Dataset.from_list(sft_list)
out_path = DATA_DIR / "sft"
sft_ds.save_to_disk(str(out_path))
print(f"\n✓ Saved to: {out_path}")
print(f"✓ Samples: {len(sft_ds)}")
print("\n➡️ Next: python phase2_train.py")