File size: 6,205 Bytes
49ac326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Faz 6 ön-hazırlık — bilingual SFT verisi üretici.

PIVOT-DİL kararı: reasoning TEK DİLDE = İngilizce. EN=Magpie (CoT, reasoning taşır),
TR=Quardo Turkish-Alpaca-GPT-4O (CoT'suz direkt-cevap, akıcılık/talimat-takibi).
Magpie(EN) + Quardo(TR) → filtrele/dengele → {instruction, (input), output} JSONL.
Filtre: token uzunluğu ≤ max_len (KESME YOK, sığmayan atılır) + Magpie kalite (input_quality/difficulty).
Çıktı faz6_sft.py'nin --data'sına verilir.

Çalıştırma (Colab/yerel; datasets + sentencepiece kurulu, HF login — SP tokenizer private repo'da):
  pip install "datasets>=2.18" sentencepiece
  HF_TOKEN=hf_xxx python faz6_prep_data.py --out sft.jsonl --max_len 2048 --n_en 15000 --n_tr 15000
Not: 250K Magpie tarandığı için birkaç dk sürer.
"""
import os, sys, json, re, random, argparse

EN_REPO = "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B"  # English reasoning data (responses carry CoT)
TR_REPO = "Quardo/Turkish-Alpaca-GPT-4O-V2"  # Turkish direct-answer data (no CoT, GPT-4o, instruction/input/output)
# Matches one <think>...</think> span (non-greedy; DOTALL lets it cross newlines)
# together with any whitespace that follows the closing tag.
THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)


# ───────────── saf-mantık (yerelde test edilebilir) ─────────────
def build_prompt(instr, inp=""):
    """Render the prompt text that precedes the model's answer.

    The prompt is a sequence of "### <header>:" sections separated by blank
    lines: the instruction, an optional input section (emitted only when
    ``inp`` is non-empty after stripping), and a trailing answer header.

    Args:
        instr: Instruction text; surrounding whitespace is stripped.
        inp: Optional extra input; ``None`` or whitespace-only means "none".

    Returns:
        The prompt string, ending with the answer header and a newline.
    """
    sections = ["### Talimat:\n" + instr.strip()]
    extra = (inp or "").strip()
    if extra:
        sections.append("### Girdi:\n" + extra)
    sections.append("### Yanıt:\n")
    return "\n\n".join(sections)


def parse_messages(msgs):
    """Pull the first (user, assistant) exchange out of a chat message list.

    Messages with any other role (e.g. system) are ignored, as are assistant
    messages that appear before the first user message. Content is stripped;
    a missing or ``None`` content counts as an empty string.

    Args:
        msgs: Iterable of dicts with "role" and "content" keys, or ``None``.

    Returns:
        ``(instruction, output)``; either element is ``None`` when the
        corresponding message was not found.
    """
    question = None
    answer = None
    for message in msgs or ():
        speaker = message.get("role")
        text = (message.get("content") or "").strip()
        if question is None:
            # Still looking for the opening user turn.
            if speaker == "user":
                question = text
        elif answer is None and speaker == "assistant":
            answer = text
    return question, answer


def maybe_strip_think(text, strip):
    """Optionally remove ``<think>...</think>`` spans from ``text``.

    Args:
        text: The response text.
        strip: When truthy, every span matched by ``THINK_RE`` is removed and
            the result is stripped of surrounding whitespace; when falsy the
            text is returned untouched.

    Returns:
        The cleaned (or original) text.
    """
    if not strip:
        return text
    without_reasoning = THINK_RE.sub("", text)
    return without_reasoning.strip()


def tok_len(sp, instr, out, inp=""):
    """Return the token count of one full training sequence.

    The sequence is the rendered prompt (with optional input) followed by the
    stripped answer; one extra token is added for EOS. Per the original note,
    this is meant to mirror the tokenization used by faz6_sft (not visible
    in this file).

    Args:
        sp: Tokenizer exposing ``encode(text, out_type=int)``.
        instr: Instruction text.
        out: Answer text.
        inp: Optional input text.

    Returns:
        Number of encoded ids plus one.
    """
    full_text = build_prompt(instr, inp) + out.strip()
    piece_ids = sp.encode(full_text, out_type=int)
    eos_tokens = 1
    return len(piece_ids) + eos_tokens


# ───────────── yükleyiciler (datasets gerekir) ─────────────
def load_tok(token):
    """Fetch the SentencePiece model from the Hugging Face Hub and load it.

    Args:
        token: Hugging Face access token passed to the download call
            (may be ``None``).

    Returns:
        A ``SentencePieceProcessor`` built from the downloaded model file.
    """
    import sentencepiece as spm
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(
        "kdirgul/smartcore-v1",
        "tokenizer/tokenizer.model",
        repo_type="model",
        token=token,
    )
    processor = spm.SentencePieceProcessor(model_file=model_path)
    return processor


def gather_en(sp, args):
    """Collect English examples from ``EN_REPO`` that pass every filter.

    A row is kept when its language is EN (a missing language counts as EN),
    its ``input_quality`` and ``difficulty`` are in the comma-separated
    allow-lists from ``args``, instruction and response are both non-empty,
    and the full sequence fits in ``args.max_len`` tokens. Nothing is
    truncated: rows that do not fit are dropped.

    Args:
        sp: Tokenizer forwarded to ``tok_len``.
        args: Parsed CLI namespace; reads ``quality``, ``difficulty``,
            ``strip_think`` and ``max_len``.

    Returns:
        List of ``{"instruction", "output"}`` dicts.
    """
    from datasets import load_dataset

    dataset = load_dataset(EN_REPO, split="train")
    allowed_quality = set(args.quality.split(","))
    allowed_difficulty = set(args.difficulty.split(","))

    kept = []
    scanned = 0
    for scanned, example in enumerate(dataset, 1):
        lang = example.get("language") or "EN"
        if lang.upper() != "EN":
            continue

        passes_meta = (
            example.get("input_quality") in allowed_quality
            and example.get("difficulty") in allowed_difficulty
        )
        if not passes_meta:
            continue

        prompt = (example.get("instruction") or "").strip()
        raw_answer = (example.get("response") or "").strip()
        answer = maybe_strip_think(raw_answer, args.strip_think)
        if not (prompt and answer):
            continue

        # Cheap character-count pre-filter: skips tokenizing rows that are
        # certainly too long before paying for the exact token count.
        surely_too_long = len(prompt) + len(answer) > args.max_len * 6
        if surely_too_long or tok_len(sp, prompt, answer) > args.max_len:
            continue

        kept.append({"instruction": prompt, "output": answer})

    print(f"[en] tarandı {scanned} → tutuldu {len(kept)}", flush=True)
    return kept


def gather_tr(sp, args, token):
    """Collect Turkish direct-answer examples from ``TR_REPO``.

    Rows use the instruction/input/output layout. A row is kept when the
    instruction and output are non-empty and the full sequence (including the
    optional input) fits in ``args.max_len`` tokens; nothing is truncated.

    Args:
        sp: Tokenizer forwarded to ``tok_len``.
        args: Parsed CLI namespace; reads ``strip_think`` and ``max_len``.
        token: Hugging Face access token passed to ``load_dataset``.

    Returns:
        List of dicts with "instruction" and "output", plus "input" only
        when the row has a non-empty input.
    """
    from datasets import load_dataset

    dataset = load_dataset(TR_REPO, split="train", token=token)

    kept = []
    for example in dataset:
        prompt = (example.get("instruction") or "").strip()
        context = (example.get("input") or "").strip()
        raw_answer = (example.get("output") or "").strip()
        answer = maybe_strip_think(raw_answer, args.strip_think)
        if not (prompt and answer):
            continue

        # Cheap character-count pre-filter before the exact token count.
        char_total = len(prompt) + len(context) + len(answer)
        if char_total > args.max_len * 6:
            continue
        if tok_len(sp, prompt, answer, context) > args.max_len:
            continue

        record = {"instruction": prompt, "output": answer}
        if context:
            record["input"] = context
        kept.append(record)

    print(f"[tr] {TR_REPO} → tutuldu {len(kept)}", flush=True)
    return kept


def stats(sp, rows, name):
    """Print token-length statistics (median, p90, max) for ``rows``.

    For more than 3000 rows the figures are computed on a random sample of
    3000 drawn with the module-level ``random`` generator; ``n`` in the
    printed line is always the full row count.

    Args:
        sp: Tokenizer forwarded to ``tok_len``.
        rows: List of example dicts ("instruction", "output", optional "input").
        name: Label shown in square brackets at the start of the line.
    """
    if not rows:
        print(f"[{name}] 0 örnek", flush=True)
        return

    subset = random.sample(rows, 3000) if len(rows) > 3000 else rows
    lengths = [
        tok_len(sp, row["instruction"], row["output"], row.get("input", ""))
        for row in subset
    ]
    lengths.sort()

    count = len(lengths)
    median = lengths[count // 2]
    p90 = lengths[int(count * 0.9)]
    longest = lengths[-1]
    print(f"[{name}] n={len(rows)} | token: med={median} p90={p90} max={longest}", flush=True)


def main():
    """Command-line entry point: build the bilingual SFT JSONL file.

    Steps: parse arguments, resolve a Hugging Face token (``HF_TOKEN`` env
    var first, then the locally stored login if available), load the
    tokenizer, gather and filter the EN and TR examples, shuffle each with a
    ``random.Random`` seeded from ``--seed``, cap them at ``--n_en`` /
    ``--n_tr``, shuffle the combined list and write one JSON object per line
    to ``--out``.

    Exits with an argparse usage error when ``--n_en``/``--n_tr`` is negative
    or ``--max_len`` is not positive. Warnings are written to stderr when the
    stored token cannot be read or when fewer examples than requested survive
    the filters.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="sft.jsonl")
    ap.add_argument("--max_len", type=int, default=2048)
    ap.add_argument("--n_en", type=int, default=15000)
    ap.add_argument("--n_tr", type=int, default=15000)
    ap.add_argument("--quality", default="good,excellent")
    ap.add_argument("--difficulty", default="easy,medium")
    ap.add_argument("--strip_think", action="store_true", help="yanıttan <think>...</think> at (177M için daha güvenli)")
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    # A negative count would make the slices below drop items from the END of
    # the list instead of capping it; a non-positive max_len rejects every
    # example (each sequence is at least one token long).
    if args.n_en < 0 or args.n_tr < 0:
        ap.error("--n_en and --n_tr must be >= 0")
    if args.max_len <= 0:
        ap.error("--max_len must be > 0")

    token = os.environ.get("HF_TOKEN")
    try:
        from huggingface_hub import get_token
    except ImportError:
        # Presumably an older huggingface_hub without get_token (or the
        # package is missing, in which case load_tok reports it clearly).
        get_token = None
    if get_token is not None and not token:
        try:
            token = get_token()
        except Exception as exc:
            # Still best-effort, but no longer silent: continue without a
            # stored token and let the download decide whether one is needed.
            print(f"[uyarı] HF token alınamadı ({exc!r}); token olmadan devam ediliyor",
                  file=sys.stderr, flush=True)

    sp = load_tok(token)
    rng = random.Random(args.seed)

    print("=== EN (Magpie) ===", flush=True)
    en = gather_en(sp, args)
    rng.shuffle(en)
    stats(sp, en, "EN-filtreli")
    en = en[:args.n_en]

    # NOTE(review): the header says "Bilge" while gather_tr loads TR_REPO
    # (Quardo) — possibly a stale label; left unchanged pending confirmation.
    print("=== TR (Bilge) ===", flush=True)
    tr = gather_tr(sp, args, token)
    rng.shuffle(tr)
    stats(sp, tr, "TR-filtreli")
    tr = tr[:args.n_tr]

    # Surface an unbalanced mix instead of silently writing fewer rows.
    for label, wanted, got in (("EN", args.n_en, len(en)), ("TR", args.n_tr, len(tr))):
        if got < wanted:
            print(f"[uyarı] {label}: istenen {wanted}, bulunan {got}",
                  file=sys.stderr, flush=True)

    data = en + tr
    rng.shuffle(data)
    with open(args.out, "w", encoding="utf-8") as f:
        for r in data:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    stats(sp, data, "TOPLAM")
    print(f"\n[bitti] {len(data)} örnek (EN {len(en)} + TR {len(tr)}) -> {args.out}", flush=True)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()