"""
Faz 6 ön-hazırlık — bilingual SFT verisi üretici.
PIVOT-DİL kararı: reasoning TEK DİLDE = İngilizce. EN=Magpie (CoT, reasoning taşır),
TR=Quardo Turkish-Alpaca-GPT-4O (CoT'suz direkt-cevap, akıcılık/talimat-takibi).
Magpie(EN) + Quardo(TR) → filtrele/dengele → {instruction, (input), output} JSONL.
Filtre: token uzunluğu ≤ max_len (KESME YOK, sığmayan atılır) + Magpie kalite (input_quality/difficulty).
Çıktı faz6_sft.py'nin --data'sına verilir.
Çalıştırma (Colab/yerel; datasets + sentencepiece kurulu, HF login — SP tokenizer private repo'da):
pip install "datasets>=2.18" sentencepiece
HF_TOKEN=hf_xxx python faz6_prep_data.py --out sft.jsonl --max_len 2048 --n_en 15000 --n_tr 15000
Not: 250K Magpie tarandığı için birkaç dk sürer.
"""
import os, sys, json, re, random, argparse
# English reasoning data (responses carry chain-of-thought).
EN_REPO = "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B"
# Turkish direct-answer data (no CoT, GPT-4o generated; instruction/input/output columns).
TR_REPO = "Quardo/Turkish-Alpaca-GPT-4O-V2"
# Matches one <think>...</think> reasoning block (non-greedy, spanning newlines)
# together with the whitespace that follows it.
# NOTE(review): the previous pattern r".*?\s*" had no tag delimiters, so it
# matched every whitespace run and sub("") glued all words together. The tags
# are restored on the assumption that responses wrap reasoning in
# <think>...</think> -- confirm against a few rows of EN_REPO.
THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
# ───────────── saf-mantık (yerelde test edilebilir) ─────────────
def build_prompt(instr, inp=""):
    """Render the prompt text that precedes the model's answer.

    Args:
        instr: Instruction text; surrounding whitespace is stripped.
        inp: Optional extra input. ``None`` or whitespace-only counts as absent.

    Returns:
        A string with a ``### Talimat:`` section, an optional ``### Girdi:``
        section (only when ``inp`` is non-empty after stripping), and a
        trailing ``### Yanıt:`` header ending in a newline.
    """
    instruction = instr.strip()
    extra = (inp or "").strip()
    if not extra:
        return f"### Talimat:\n{instruction}\n\n### Yanıt:\n"
    return f"### Talimat:\n{instruction}\n\n### Girdi:\n{extra}\n\n### Yanıt:\n"
def parse_messages(msgs):
    """Pull the first (user, assistant) exchange out of a chat ``messages`` list.

    Messages with any other role (e.g. system) are skipped. The first user
    message becomes the instruction; the first assistant message that comes
    after it becomes the output. Contents are whitespace-stripped and a
    missing/``None`` content is treated as an empty string.

    Args:
        msgs: Iterable of dict-like messages with ``role`` and ``content``
            keys, or ``None``.

    Returns:
        ``(instruction, output)``; either element is ``None`` when no matching
        message was found.
    """
    user_text = None
    reply = None
    for msg in msgs or []:
        who = msg.get("role")
        text = (msg.get("content") or "").strip()
        if who == "user":
            # Only the first user turn is kept; later ones are ignored.
            if user_text is None:
                user_text = text
            continue
        if who == "assistant" and user_text is not None and reply is None:
            reply = text
    return user_text, reply
def maybe_strip_think(text, strip):
    """Optionally remove every ``THINK_RE`` match from ``text``.

    Args:
        text: Response text.
        strip: When falsy, ``text`` is returned untouched.

    Returns:
        ``text`` unchanged, or ``text`` with all ``THINK_RE`` matches deleted
        and the result whitespace-stripped.
    """
    if not strip:
        return text
    cleaned = THINK_RE.sub("", text)
    return cleaned.strip()
def tok_len(sp, instr, out, inp=""):
    """Return the token count of one example: prompt (+ input) + answer + EOS.

    The original note says this mirrors the tokenization used by faz6_sft;
    that script is not visible from here.

    Args:
        sp: Tokenizer exposing ``encode(text, out_type=int)``.
        instr: Instruction text.
        out: Answer text; stripped before encoding.
        inp: Optional input text forwarded to ``build_prompt``.

    Returns:
        Number of encoded ids plus one.
    """
    full_text = build_prompt(instr, inp) + out.strip()
    piece_ids = sp.encode(full_text, out_type=int)
    # +1 reserves room for the end-of-sequence token.
    return len(piece_ids) + 1
# ───────────── yükleyiciler (datasets gerekir) ─────────────
def load_tok(token, repo_id="kdirgul/smartcore-v1", filename="tokenizer/tokenizer.model"):
    """Download a SentencePiece model from the Hugging Face Hub and load it.

    Args:
        token: Hugging Face access token passed to ``hf_hub_download``
            (may be ``None``).
        repo_id: Hub model repository that hosts the tokenizer. Defaults to
            the repository this script was written for.
        filename: Path of the ``.model`` file inside the repository.

    Returns:
        A ``sentencepiece.SentencePieceProcessor`` loaded from the downloaded
        file.
    """
    # Imported lazily so the pure-logic helpers stay usable without these packages.
    import sentencepiece as spm
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(repo_id, filename, repo_type="model", token=token)
    return spm.SentencePieceProcessor(model_file=model_path)
def gather_en(sp, args):
    """Collect English examples from ``EN_REPO`` that pass every filter.

    A row is kept when its ``language`` is EN (a missing value counts as EN),
    its ``input_quality`` and ``difficulty`` are in the allowed sets, both
    instruction and response are non-empty, and the tokenized example fits in
    ``args.max_len``. Over-long rows are dropped, never truncated.

    Args:
        sp: Tokenizer handed to ``tok_len``.
        args: Namespace providing ``quality`` and ``difficulty``
            (comma-separated strings), ``strip_think`` and ``max_len``.

    Returns:
        List of ``{"instruction", "output"}`` dicts.
    """
    from datasets import load_dataset

    ds = load_dataset(EN_REPO, split="train")
    # Strip each entry so "good, excellent" behaves like "good,excellent";
    # without this " excellent" never matches and those rows vanish silently.
    quals = {q.strip() for q in args.quality.split(",")}
    diffs = {d.strip() for d in args.difficulty.split(",")}
    out = []
    seen = 0  # stays 0 if the dataset yields nothing
    for seen, ex in enumerate(ds, 1):
        if (ex.get("language") or "EN").upper() != "EN":
            continue
        if ex.get("input_quality") not in quals or ex.get("difficulty") not in diffs:
            continue
        instr = (ex.get("instruction") or "").strip()
        resp = maybe_strip_think((ex.get("response") or "").strip(), args.strip_think)
        if not instr or not resp:
            continue
        # Cheap character-count pre-filter: skips tokenizing rows that are
        # far too long to fit.
        if len(instr) + len(resp) > args.max_len * 6:
            continue
        if tok_len(sp, instr, resp) > args.max_len:
            continue
        out.append({"instruction": instr, "output": resp})
    print(f"[en] tarandı {seen} → tutuldu {len(out)}", flush=True)
    return out
def gather_tr(sp, args, token):
    """Collect Turkish direct-answer examples from ``TR_REPO`` that fit the length cap.

    Reads the ``instruction`` / ``input`` / ``output`` columns. Rows with an
    empty instruction or answer are skipped, as are rows whose tokenized
    length exceeds ``args.max_len`` (dropped, never truncated).

    Args:
        sp: Tokenizer handed to ``tok_len``.
        args: Namespace providing ``strip_think`` and ``max_len``.
        token: Hugging Face token forwarded to ``load_dataset``.

    Returns:
        List of dicts with ``instruction`` and ``output`` keys, plus ``input``
        when the row has a non-empty one.
    """
    from datasets import load_dataset

    dataset = load_dataset(TR_REPO, split="train", token=token)
    kept = []
    for row in dataset:
        question = (row.get("instruction") or "").strip()
        context = (row.get("input") or "").strip()
        answer = maybe_strip_think((row.get("output") or "").strip(), args.strip_think)
        if not (question and answer):
            continue
        # Character-count pre-filter first; the tokenizer only runs when the
        # cheap check passes.
        too_long = (
            len(question) + len(context) + len(answer) > args.max_len * 6
            or tok_len(sp, question, answer, context) > args.max_len
        )
        if too_long:
            continue
        record = {"instruction": question, "output": answer}
        if context:
            record["input"] = context
        kept.append(record)
    print(f"[tr] {TR_REPO} → tutuldu {len(kept)}", flush=True)
    return kept
def stats(sp, rows, name):
    """Print token-length statistics (median, p90, max) for ``rows``.

    At most 3000 rows are measured; larger lists are randomly subsampled with
    the module-level ``random`` generator. The reported ``n`` is always the
    full row count.

    Args:
        sp: Tokenizer handed to ``tok_len``.
        rows: List of example dicts with ``instruction``, ``output`` and an
            optional ``input`` key.
        name: Label printed in square brackets.
    """
    if not rows:
        print(f"[{name}] 0 örnek", flush=True)
        return
    if len(rows) > 3000:
        measured = random.sample(rows, 3000)
    else:
        measured = rows
    lengths = [
        tok_len(sp, row["instruction"], row["output"], row.get("input", ""))
        for row in measured
    ]
    lengths.sort()
    count = len(lengths)
    median = lengths[count // 2]
    p90 = lengths[int(count * 0.9)]
    longest = lengths[-1]
    print(f"[{name}] n={len(rows)} | token: med={median} p90={p90} max={longest}", flush=True)
def main():
    """CLI entry point: build the mixed EN/TR SFT file.

    Parses the arguments, resolves a Hugging Face token (``HF_TOKEN``
    environment variable first, then the locally stored login if available),
    loads the tokenizer, gathers and shuffles the filtered EN and TR examples,
    truncates each to its requested count, shuffles the union and writes it as
    UTF-8 JSONL (one object per line) to ``--out``. Statistics are printed for
    each filtered pool and for the final mix.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="sft.jsonl")
    ap.add_argument("--max_len", type=int, default=2048)
    ap.add_argument("--n_en", type=int, default=15000)
    ap.add_argument("--n_tr", type=int, default=15000)
    ap.add_argument("--quality", default="good,excellent")
    ap.add_argument("--difficulty", default="easy,medium")
    # Help text restored: the tag name had been lost, leaving only "...".
    ap.add_argument("--strip_think", action="store_true",
                    help="yanıttan <think>...</think> at (177M için daha güvenli)")
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    # A negative count would turn the slices below into "all but the last N";
    # reject it before any download starts.
    if args.n_en < 0 or args.n_tr < 0:
        ap.error("--n_en ve --n_tr negatif olamaz")

    token = os.environ.get("HF_TOKEN")
    try:
        from huggingface_hub import get_token
        token = token or get_token()
    except Exception:
        # Deliberate best-effort: if the stored login cannot be read, continue
        # with whatever HF_TOKEN provided (possibly None).
        pass

    sp = load_tok(token)
    rng = random.Random(args.seed)

    print("=== EN (Magpie) ===", flush=True)
    en = gather_en(sp, args)
    rng.shuffle(en)
    stats(sp, en, "EN-filtreli")  # stats cover the full filtered pool
    en = en[:args.n_en]

    print("=== TR (Bilge) ===", flush=True)
    tr = gather_tr(sp, args, token)
    rng.shuffle(tr)
    stats(sp, tr, "TR-filtreli")
    tr = tr[:args.n_tr]

    data = en + tr
    rng.shuffle(data)
    with open(args.out, "w", encoding="utf-8") as f:
        for r in data:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    stats(sp, data, "TOPLAM")
    print(f"\n[bitti] {len(data)} örnek (EN {len(en)} + TR {len(tr)}) -> {args.out}", flush=True)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()