File size: 4,015 Bytes
d199eb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
"""Phase 1: Data preparation for Qwen3-4B"""

import sys
from pathlib import Path
from datasets import Dataset, load_dataset
from tqdm import tqdm

# Config
MAX_SAMPLES = 10000  # final cap on SFT samples kept after the curriculum sort
NGRAM_SIZE = 13      # word-shingle length used for benchmark decontamination
DATA_DIR = Path("./qwen3_pipeline/data")  # output root for the prepared dataset
DATA_DIR.mkdir(parents=True, exist_ok=True)  # NOTE: side effect at import time — creates the output tree

print("="*70)
print("PHASE 1: DATA PREPARATION")
print("="*70)

# [1/4] Load training data: prefer rStar-Coder, fall back to OpenCodeReasoning.
print("\n[1/4] Loading rStar-Coder...")
try:
    ds = load_dataset(
        "microsoft/rStar-Coder",
        "synthetic_sft",  # IMPORTANT: Must specify config
        split="train"
    )
    print(f"  Loaded: {len(ds)} examples")
except Exception as e:
    print(f"  rStar-Coder failed: {e}")
    print("  Trying OpenCodeReasoning...")
    try:
        ds = load_dataset("nvidia/OpenCodeReasoning", split="train")
        print(f"  Loaded: {len(ds)} examples")
    # Was a bare `except:` — that also swallowed SystemExit/KeyboardInterrupt
    # and hid the actual failure reason. Narrow to Exception and report it.
    except Exception as fallback_err:
        print(f"  ERROR: Could not load any dataset: {fallback_err}")
        sys.exit(1)

# [2/4] Build an index of word n-grams taken from evaluation benchmarks so
# training samples overlapping them can be dropped (decontamination).
print("\n[2/4] Building decontamination index...")
benchmark_ngrams = set()

for name, loader in [
    ("HumanEval", lambda: load_dataset("openai_humaneval", split="test")),
    ("MBPP", lambda: load_dataset("mbpp", "sanitized", split="test")),
]:
    try:
        bm = loader()
        for item in bm:
            # Prompt + reference solution; field names differ per benchmark.
            text = str(item.get("prompt", "")) + str(item.get("canonical_solution", item.get("code", "")))
            words = text.lower().split()
            for i in range(max(0, len(words) - NGRAM_SIZE + 1)):
                benchmark_ngrams.add(" ".join(words[i:i + NGRAM_SIZE]))
        print(f"  {name}: indexed")
    # Benchmark indexing is best-effort, but the former bare `except:` also
    # caught KeyboardInterrupt/SystemExit; catch only real errors.
    except Exception:
        print(f"  {name}: skip")

print(f"  Total n-grams: {len(benchmark_ngrams)}")

def is_contaminated(text):
    """Return True if any NGRAM_SIZE-word shingle of *text* (lowercased,
    whitespace-tokenized) appears in the global ``benchmark_ngrams`` index.

    With an empty index, nothing can match, so the answer is always False.
    """
    if not benchmark_ngrams:
        return False
    tokens = text.lower().split()
    for start in range(max(0, len(tokens) - NGRAM_SIZE + 1)):
        shingle = " ".join(tokens[start:start + NGRAM_SIZE])
        if shingle in benchmark_ngrams:
            return True
    return False

# [3/4] Filter: keep Python solutions of reasonable length, drop
# benchmark-contaminated samples, and attach a difficulty score in [0, 1].
print("\n[3/4] Filtering...")
filtered = []

# Keywords whose presence suggests a harder problem; hoisted out of the loop
# so the tuple isn't rebuilt once per example.
HARD_KEYWORDS = ("dynamic programming", "dp[", "recursion", "bfs", "dfs", "graph", "heap")

for ex in tqdm(ds, desc="  Processing"):
    # Source datasets use different field names; try the common ones in order.
    problem = (ex.get("problem") or ex.get("question") or
               ex.get("prompt") or ex.get("instruction") or "")
    solution = (ex.get("solution") or ex.get("answer") or
                ex.get("response") or ex.get("output") or "")

    if not problem or not solution:
        continue

    sol = str(solution)
    # Drop trivially short or excessively long solutions.
    if len(sol) < 50 or len(sol) > 12000:
        continue

    # Python-only; an empty language field is assumed to mean Python.
    lang = str(ex.get("language", ex.get("lang", "python"))).lower()
    if lang not in ("python", "python3", "py", ""):
        continue

    # Require at least one function or class definition in the solution.
    if not any(kw in sol for kw in ("def ", "class ")):
        continue

    if benchmark_ngrams and is_contaminated(f"{problem} {sol}"):
        continue

    # Difficulty score: weighted blend of length, hard-keyword density, and
    # the dataset's own difficulty label (when present).
    sol_lower = sol.lower()  # hoisted: was recomputed once per keyword
    length_score = min(len(sol) / 8000, 1.0)
    kw_score = sum(1 for k in HARD_KEYWORDS if k in sol_lower) / len(HARD_KEYWORDS)
    diff = str(ex.get("difficulty", "")).lower()
    label_score = 0.2 if "easy" in diff else 0.8 if "hard" in diff else 0.5

    filtered.append({
        "problem": problem,
        "solution": sol,
        "score": 0.4 * length_score + 0.3 * kw_score + 0.3 * label_score,
    })

    # Over-collect 3x so the curriculum step can keep the best subset.
    if len(filtered) >= MAX_SAMPLES * 3:
        break

print(f"  Filtered: {len(filtered)} samples")

# [4/4] Curriculum ordering: easiest samples first, capped at MAX_SAMPLES.
print("\n[4/4] Curriculum sorting...")
filtered = sorted(filtered, key=lambda item: item["score"])[:MAX_SAMPLES]
print(f"  Final: {len(filtered)} samples (easy→hard)")

# Convert each sample into the chat-message format used for SFT.
sft_list = []
for ex in filtered:
    sft_list.append({
        "messages": [
            {"role": "system", "content": "You are a programming expert."},
            {"role": "user", "content": ex["problem"]},
            {"role": "assistant", "content": ex["solution"]},
        ]
    })

sft_ds = Dataset.from_list(sft_list)
sft_ds.save_to_disk(str(DATA_DIR / "sft"))

print(f"\n✓ Saved to: {DATA_DIR / 'sft'}")
print(f"✓ Samples: {len(sft_ds)}")
print("\n➡️  Next: python phase2_train.py")