"""
Faz 6 ön-hazırlık — bilingual SFT verisi üretici.

PIVOT-DİL kararı: reasoning TEK DİLDE = İngilizce. EN=Magpie (CoT, reasoning taşır),
TR=Quardo Turkish-Alpaca-GPT-4O (CoT'suz direkt-cevap, akıcılık/talimat-takibi).
Magpie(EN) + Quardo(TR) → filtrele/dengele → {instruction, (input), output} JSONL.
Filtre: token uzunluğu ≤ max_len (KESME YOK, sığmayan atılır) + Magpie kalite (input_quality/difficulty).
Çıktı faz6_sft.py'nin --data'sına verilir.

Çalıştırma (Colab/yerel; datasets + sentencepiece kurulu, HF login — SP tokenizer private repo'da):
    pip install "datasets>=2.18" sentencepiece
    HF_TOKEN=hf_xxx python faz6_prep_data.py --out sft.jsonl --max_len 2048 --n_en 15000 --n_tr 15000
Not: 250K Magpie tarandığı için birkaç dk sürer.
"""
| import os, sys, json, re, random, argparse |
|
|
# Hugging Face dataset repo id for the English chain-of-thought (Magpie) source.
EN_REPO = "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B"
# Hugging Face dataset repo id for the Turkish direct-answer (Alpaca-style) source.
TR_REPO = "Quardo/Turkish-Alpaca-GPT-4O-V2"
# Matches one <think>...</think> span (non-greedy, spanning newlines via DOTALL)
# together with any whitespace that directly follows it.
THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
|
|
|
|
| |
def build_prompt(instr, inp=""):
    """Render the prompt text for one instruction example.

    Args:
        instr: Instruction text; surrounding whitespace is stripped.
        inp: Optional input/context text. ``None``, empty, or
            whitespace-only values are treated as absent.

    Returns:
        The prompt string: an instruction section, an input section only
        when ``inp`` is non-empty after stripping, and a trailing response
        header, separated by blank lines.
    """
    sections = [f"### Talimat:\n{instr.strip()}"]
    context = (inp or "").strip()
    if context:
        sections.append(f"### Girdi:\n{context}")
    # The response header ends with a newline so the answer starts on its own line.
    sections.append("### Yanıt:\n")
    return "\n\n".join(sections)
|
|
|
|
def parse_messages(msgs):
    """Extract the first (user, assistant) pair from a chat ``messages`` list.

    Messages with any other role (e.g. system) are ignored, and an assistant
    message only counts once a user message has been seen.

    Args:
        msgs: Iterable of dict-like messages exposing ``role`` and
            ``content`` via ``.get``; ``None`` is treated as empty.

    Returns:
        ``(instruction, output)`` with whitespace stripped. Either element
        is ``None`` when the corresponding message was not found.
    """
    question = None
    answer = None
    for message in msgs or []:
        speaker = message.get("role")
        text = (message.get("content") or "").strip()
        if question is None:
            # Still looking for the opening user turn; earlier assistant
            # turns are deliberately ignored.
            if speaker == "user":
                question = text
        elif answer is None and speaker == "assistant":
            answer = text
    return question, answer
|
|
|
|
def maybe_strip_think(text, strip):
    """Optionally remove ``<think>...</think>`` spans from ``text``.

    Args:
        text: Response text.
        strip: When truthy, every ``THINK_RE`` match is deleted and the
            result is whitespace-stripped. When falsy, ``text`` is returned
            untouched (not even stripped).

    Returns:
        The cleaned text, or the original ``text`` object when ``strip`` is
        falsy.
    """
    if not strip:
        return text
    without_think = THINK_RE.sub("", text)
    return without_think.strip()
|
|
|
|
def tok_len(sp, instr, out, inp=""):
    """Count the tokens of one training example: prompt (+input) + response + EOS.

    Per the original note this is meant to mirror the tokenization done by
    faz6_sft (that script is not visible from here).

    Args:
        sp: Tokenizer exposing ``encode(text, out_type=int)``.
        instr: Instruction text.
        out: Response text; surrounding whitespace is stripped before encoding.
        inp: Optional input text forwarded to ``build_prompt``.

    Returns:
        Number of encoded ids plus one for the end-of-sequence token.
    """
    full_text = build_prompt(instr, inp) + out.strip()
    token_ids = sp.encode(full_text, out_type=int)
    eos_tokens = 1  # the EOS id is appended at training time, so count it here
    return len(token_ids) + eos_tokens
|
|
|
|
| |
def load_tok(token):
    """Download the SentencePiece model from the Hugging Face Hub and load it.

    Args:
        token: Hugging Face access token passed to the download call (may be
            ``None``).

    Returns:
        A ``sentencepiece.SentencePieceProcessor`` built from the downloaded
        ``tokenizer/tokenizer.model`` file of the ``kdirgul/smartcore-v1``
        model repo.
    """
    # Imported lazily so the module can be imported without these packages.
    import sentencepiece as spm
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(
        repo_id="kdirgul/smartcore-v1",
        filename="tokenizer/tokenizer.model",
        repo_type="model",
        token=token,
    )
    tokenizer = spm.SentencePieceProcessor(model_file=model_path)
    return tokenizer
|
|
|
|
def _split_labels(spec):
    """Parse a comma-separated label list into a set, ignoring padding and blanks."""
    return {label.strip() for label in spec.split(",") if label.strip()}


def gather_en(sp, args):
    """Collect English (Magpie) examples that pass the quality and length filters.

    An example is kept when all of the following hold:
      * its ``language`` field is missing/empty or equals ``EN`` (case-insensitive);
      * ``input_quality`` is in ``args.quality`` and ``difficulty`` is in
        ``args.difficulty`` (comma-separated lists; whitespace around each
        label is ignored, so ``"good, excellent"`` works as expected);
      * instruction and response are non-empty after stripping (and after
        optional ``<think>`` removal when ``args.strip_think`` is set);
      * the full example fits in ``args.max_len`` tokens. Nothing is
        truncated; examples that do not fit are dropped.

    Args:
        sp: Tokenizer passed through to ``tok_len``.
        args: Parsed CLI namespace; reads ``quality``, ``difficulty``,
            ``strip_think`` and ``max_len``.

    Returns:
        List of ``{"instruction": ..., "output": ...}`` dicts.
    """
    from datasets import load_dataset

    ds = load_dataset(EN_REPO, split="train")
    quals = _split_labels(args.quality)
    diffs = _split_labels(args.difficulty)
    out = []
    seen = 0
    for seen, ex in enumerate(ds, 1):
        if (ex.get("language") or "EN").upper() != "EN":
            continue
        if ex.get("input_quality") not in quals or ex.get("difficulty") not in diffs:
            continue
        instr = (ex.get("instruction") or "").strip()
        resp = maybe_strip_think((ex.get("response") or "").strip(), args.strip_think)
        if not instr or not resp:
            continue
        # Cheap character-count pre-filter: skips tokenizing texts that exceed
        # six characters per allowed token.
        if len(instr) + len(resp) > args.max_len * 6:
            continue
        if tok_len(sp, instr, resp) > args.max_len:
            continue
        out.append({"instruction": instr, "output": resp})
    print(f"[en] tarandı {seen} → tutuldu {len(out)}", flush=True)
    return out
|
|
|
|
def gather_tr(sp, args, token):
    """Collect Turkish direct-answer examples that fit the token budget.

    Reads ``instruction`` / ``input`` / ``output`` fields from ``TR_REPO``.
    Examples with an empty instruction or response are skipped, as are
    examples that exceed ``args.max_len`` tokens (nothing is truncated).

    Args:
        sp: Tokenizer passed through to ``tok_len``.
        args: Parsed CLI namespace; reads ``strip_think`` and ``max_len``.
        token: Hugging Face access token forwarded to ``load_dataset``.

    Returns:
        List of dicts with ``instruction`` and ``output`` keys, plus
        ``input`` when the example has a non-empty input.
    """
    from datasets import load_dataset

    dataset = load_dataset(TR_REPO, split="train", token=token)
    kept = []
    for example in dataset:
        instruction = (example.get("instruction") or "").strip()
        context = (example.get("input") or "").strip()
        answer = maybe_strip_think((example.get("output") or "").strip(), args.strip_think)
        if not (instruction and answer):
            continue
        total_chars = len(instruction) + len(context) + len(answer)
        # The character-count check runs first so over-long texts are
        # rejected without being tokenized.
        if total_chars > args.max_len * 6 or tok_len(sp, instruction, answer, context) > args.max_len:
            continue
        # Key order (instruction, output, input) is what ends up in the JSONL.
        kept.append({
            "instruction": instruction,
            "output": answer,
            **({"input": context} if context else {}),
        })
    print(f"[tr] {TR_REPO} → tutuldu {len(kept)}", flush=True)
    return kept
|
|
|
|
def stats(sp, rows, name):
    """Print token-length statistics (median, p90, max) for a set of examples.

    For more than 3000 rows the statistics are computed on a 3000-row random
    sample. The sample is drawn from a fixed-seed local generator so the
    printed numbers are reproducible between runs and the caller's RNG state
    is never touched.

    Args:
        sp: Tokenizer passed through to ``tok_len``.
        rows: List of example dicts with ``instruction``, ``output`` and an
            optional ``input`` key.
        name: Label printed in front of the statistics.

    Returns:
        None. Output goes to stdout.
    """
    if not rows:
        print(f"[{name}] 0 örnek", flush=True)
        return
    if len(rows) <= 3000:
        sample = rows
    else:
        sample = random.Random(0).sample(rows, 3000)
    ls = sorted(
        tok_len(sp, r["instruction"], r["output"], r.get("input", "")) for r in sample
    )
    median = ls[len(ls) // 2]
    # int(n * 0.9) is always < n for n >= 1, so the index is in range.
    p90 = ls[int(len(ls) * 0.9)]
    print(f"[{name}] n={len(rows)} | token: med={median} p90={p90} max={ls[-1]}", flush=True)
|
|
|
|
def _take_first(pool, wanted, tag):
    """Return the first ``wanted`` rows of ``pool``, warning when the pool is too small."""
    if len(pool) < wanted:
        print(
            f"[uyarı] {tag}: istenen {wanted}, mevcut {len(pool)} — veri dengesi bozulabilir",
            file=sys.stderr,
            flush=True,
        )
    return pool[:wanted]


def main():
    """CLI entry point: build the bilingual SFT JSONL file.

    Steps: parse arguments, resolve a Hugging Face token (``HF_TOKEN`` env
    var first, then the locally stored login), load the tokenizer, gather and
    filter the EN and TR pools, shuffle each with the seeded RNG, cap them at
    ``--n_en`` / ``--n_tr``, then shuffle the union and write one JSON object
    per line to ``--out``.

    Exits through ``argparse`` error handling when ``--max_len`` is not
    positive or a requested count is negative.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default="sft.jsonl")
    ap.add_argument("--max_len", type=int, default=2048)
    ap.add_argument("--n_en", type=int, default=15000)
    ap.add_argument("--n_tr", type=int, default=15000)
    ap.add_argument("--quality", default="good,excellent")
    ap.add_argument("--difficulty", default="easy,medium")
    ap.add_argument("--strip_think", action="store_true", help="yanıttan <think>...</think> at (177M için daha güvenli)")
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    # A negative count would silently slice from the end of the pool, and a
    # non-positive max_len would reject every example: fail early instead.
    if args.max_len <= 0:
        ap.error("--max_len pozitif olmalı")
    if args.n_en < 0 or args.n_tr < 0:
        ap.error("--n_en ve --n_tr negatif olamaz")

    token = os.environ.get("HF_TOKEN")
    if not token:
        # Best-effort fallback to the cached login; a failure here must not
        # abort the run, but it is reported rather than swallowed.
        try:
            from huggingface_hub import get_token
            token = get_token()
        except Exception as exc:
            print(f"[uyarı] HF token okunamadı: {exc!r}", file=sys.stderr, flush=True)
    sp = load_tok(token)
    rng = random.Random(args.seed)

    print("=== EN (Magpie) ===", flush=True)
    en = gather_en(sp, args)
    rng.shuffle(en)
    # Stats describe the whole filtered pool, before the cap is applied.
    stats(sp, en, "EN-filtreli")
    en = _take_first(en, args.n_en, "EN")

    print("=== TR (Quardo) ===", flush=True)
    tr = gather_tr(sp, args, token)
    rng.shuffle(tr)
    stats(sp, tr, "TR-filtreli")
    tr = _take_first(tr, args.n_tr, "TR")

    data = en + tr
    rng.shuffle(data)
    with open(args.out, "w", encoding="utf-8") as f:
        for r in data:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
    stats(sp, data, "TOPLAM")
    print(f"\n[bitti] {len(data)} örnek (EN {len(en)} + TR {len(tr)}) -> {args.out}", flush=True)
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|