"""
Faz 6 ön-hazırlık — bilingual SFT verisi üretici.
PIVOT-DİL kararı: reasoning TEK DİLDE = İngilizce. EN=Magpie (CoT, reasoning taşır),
TR=Quardo Turkish-Alpaca-GPT-4O (CoT'suz direkt-cevap, akıcılık/talimat-takibi).
Magpie(EN) + Quardo(TR) → filtrele/dengele → {instruction, (input), output} JSONL.
Filtre: token uzunluğu ≤ max_len (KESME YOK, sığmayan atılır) + Magpie kalite (input_quality/difficulty).
Çıktı faz6_sft.py'nin --data'sına verilir.
Çalıştırma (Colab/yerel; datasets + sentencepiece kurulu, HF login — SP tokenizer private repo'da):
pip install "datasets>=2.18" sentencepiece
HF_TOKEN=hf_xxx python faz6_prep_data.py --out sft.jsonl --max_len 2048 --n_en 15000 --n_tr 15000
Not: 250K Magpie tarandığı için birkaç dk sürer.
"""
import os, sys, json, re, random, argparse
# Hugging Face dataset id of the English source: reasoning data with chain-of-thought (CoT).
EN_REPO = "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B"
# Hugging Face dataset id of the Turkish source: direct answers without CoT
# (GPT-4o generated, instruction/input/output columns).
TR_REPO = "Quardo/Turkish-Alpaca-GPT-4O-V2"
# Matches one <think>...</think> span plus any whitespace that follows it.
# Non-greedy so separate spans are matched individually; DOTALL lets a span cross newlines.
THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
# ───────────── saf-mantık (yerelde test edilebilir) ─────────────
def build_prompt(instr, inp=""):
    """Render the prompt text that precedes the model's answer.

    Args:
        instr: Instruction text; surrounding whitespace is removed.
        inp: Optional extra input. ``None``, empty, or whitespace-only values
            are treated as "no input" and the input section is omitted.

    Returns:
        The prompt string, ending with the answer header and a newline so the
        response can be appended directly.
    """
    task = instr.strip()
    extra = (inp or "").strip()
    # Guard clause: without extra input the prompt has only two sections.
    if not extra:
        return f"### Talimat:\n{task}\n\n### Yanıt:\n"
    return f"### Talimat:\n{task}\n\n### Girdi:\n{extra}\n\n### Yanıt:\n"
def parse_messages(msgs):
    """Extract the first (user, assistant) pair from a chat ``messages`` list.

    Messages with any other role (e.g. ``system``) are skipped. An assistant
    message only counts if a user message has already been seen.

    Args:
        msgs: Iterable of dicts with ``role`` and ``content`` keys, or ``None``.

    Returns:
        A ``(instruction, output)`` tuple of stripped strings; either element
        is ``None`` when no matching message was found.
    """
    user_text = None
    reply_text = None
    for message in msgs or []:
        speaker = message.get("role")
        content = (message.get("content") or "").strip()
        if speaker == "user":
            # Only the very first user turn is kept.
            if user_text is None:
                user_text = content
        elif speaker == "assistant" and user_text is not None and reply_text is None:
            # First assistant turn that follows the captured user turn.
            reply_text = content
    return user_text, reply_text
def maybe_strip_think(text, strip):
    """Optionally remove ``<think>...</think>`` spans from ``text``.

    Args:
        text: The response text.
        strip: When falsy, ``text`` is returned untouched. When truthy, every
            match of ``THINK_RE`` is deleted and the result is stripped of
            surrounding whitespace.

    Returns:
        The (possibly cleaned) text.
    """
    if not strip:
        return text
    return THINK_RE.sub("", text).strip()
def tok_len(sp, instr, out, inp=""):
    """Count the tokens of one training example: prompt (+input) + response + EOS.

    Per the original author's note this is meant to match the tokenization
    used by faz6_sft — not verifiable from this file.

    Args:
        sp: Tokenizer exposing ``encode(text, out_type=int)``.
        instr: Instruction text.
        out: Response text; surrounding whitespace is stripped before encoding.
        inp: Optional extra input forwarded to ``build_prompt``.

    Returns:
        Number of encoded ids plus one.
    """
    full_text = build_prompt(instr, inp) + out.strip()
    piece_ids = sp.encode(full_text, out_type=int)
    # The extra 1 reserves room for the end-of-sequence token.
    return len(piece_ids) + 1
# ───────────── yükleyiciler (datasets gerekir) ─────────────
def load_tok(token):
    """Download the SentencePiece model from the Hugging Face Hub and load it.

    Args:
        token: Hugging Face access token passed through to the download call
            (may be ``None``).

    Returns:
        A ``sentencepiece.SentencePieceProcessor`` built from the downloaded
        model file.
    """
    # Imported lazily so the pure helpers in this module work without these packages.
    import sentencepiece as spm
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(
        repo_id="kdirgul/smartcore-v1",
        filename="tokenizer/tokenizer.model",
        repo_type="model",
        token=token,
    )
    return spm.SentencePieceProcessor(model_file=model_path)
def gather_en(sp, args):
    """Collect English examples from ``EN_REPO`` that pass all filters.

    A row is kept when its language is EN (missing language counts as EN),
    its ``input_quality`` and ``difficulty`` are in the allowed sets, both
    instruction and response are non-empty, and the tokenized example fits
    within ``args.max_len``. Nothing is truncated; oversized rows are dropped.

    Args:
        sp: Tokenizer passed on to ``tok_len``.
        args: Namespace providing ``quality``, ``difficulty`` (comma-separated
            strings), ``strip_think`` and ``max_len``.

    Returns:
        List of ``{"instruction", "output"}`` dicts.
    """
    from datasets import load_dataset

    dataset = load_dataset(EN_REPO, split="train")
    allowed_quality = set(args.quality.split(","))
    allowed_difficulty = set(args.difficulty.split(","))

    kept = []
    scanned = 0  # stays 0 when the dataset yields no rows
    for scanned, row in enumerate(dataset, start=1):
        lang = (row.get("language") or "EN").upper()
        if lang != "EN":
            continue

        quality_ok = row.get("input_quality") in allowed_quality
        if not (quality_ok and row.get("difficulty") in allowed_difficulty):
            continue

        prompt = (row.get("instruction") or "").strip()
        answer = maybe_strip_think((row.get("response") or "").strip(), args.strip_think)
        if not (prompt and answer):
            continue

        # Cheap character-count pre-filter: skips the tokenizer for rows that
        # are far beyond the budget (author's heuristic of 6 chars per token).
        if len(prompt) + len(answer) > args.max_len * 6:
            continue
        if tok_len(sp, prompt, answer) > args.max_len:
            continue

        kept.append({"instruction": prompt, "output": answer})

    print(f"[en] tarandı {scanned} → tutuldu {len(kept)}", flush=True)
    return kept
def gather_tr(sp, args, token):
    """Collect Turkish direct-answer examples from ``TR_REPO`` that pass the filters.

    A row is kept when instruction and output are non-empty and the tokenized
    example (including the optional input) fits within ``args.max_len``.
    Nothing is truncated; oversized rows are dropped.

    Args:
        sp: Tokenizer passed on to ``tok_len``.
        args: Namespace providing ``strip_think`` and ``max_len``.
        token: Hugging Face access token used when loading the dataset.

    Returns:
        List of dicts with ``instruction`` and ``output`` keys, plus ``input``
        when the row has a non-empty input.
    """
    from datasets import load_dataset

    dataset = load_dataset(TR_REPO, split="train", token=token)

    kept = []
    for row in dataset:
        prompt = (row.get("instruction") or "").strip()
        context = (row.get("input") or "").strip()
        answer = maybe_strip_think((row.get("output") or "").strip(), args.strip_think)
        if not (prompt and answer):
            continue

        # Cheap character-count pre-filter before paying for tokenization.
        if len(prompt) + len(context) + len(answer) > args.max_len * 6:
            continue
        if tok_len(sp, prompt, answer, context) > args.max_len:
            continue

        record = {"instruction": prompt, "output": answer}
        if context:
            # Added last so the serialized key order stays instruction, output, input.
            record["input"] = context
        kept.append(record)

    print(f"[tr] {TR_REPO} → tutuldu {len(kept)}", flush=True)
    return kept
def stats(sp, rows, name):
    """Print token-length statistics (median, p90, max) for a set of examples.

    When there are more than 3000 rows, the statistics are computed on a
    random sample of 3000 drawn with the module-level ``random`` generator;
    the reported ``n`` is always the full row count.

    Args:
        sp: Tokenizer passed on to ``tok_len``.
        rows: List of example dicts with ``instruction``, ``output`` and
            optionally ``input``.
        name: Label shown in the printed line.

    Returns:
        None.
    """
    if not rows:
        print(f"[{name}] 0 örnek", flush=True)
        return

    if len(rows) > 3000:
        subset = random.sample(rows, 3000)
    else:
        subset = rows

    lengths = [
        tok_len(sp, row["instruction"], row["output"], row.get("input", ""))
        for row in subset
    ]
    lengths.sort()

    count = len(lengths)
    median = lengths[count // 2]
    p90 = lengths[int(count * 0.9)]
    longest = lengths[-1]
    print(f"[{name}] n={len(rows)} | token: med={median} p90={p90} max={longest}", flush=True)
def main():
    """Command-line entry point: build the mixed EN/TR SFT file.

    Parses the arguments, resolves a Hugging Face token (``HF_TOKEN`` first,
    then the locally stored login if available), loads the tokenizer, gathers
    and shuffles both language sets with a seeded generator, caps each at its
    requested size, shuffles the union and writes it as JSON Lines to
    ``--out``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", default="sft.jsonl")
    for flag, default in (("--max_len", 2048), ("--n_en", 15000), ("--n_tr", 15000)):
        parser.add_argument(flag, type=int, default=default)
    for flag, default in (("--quality", "good,excellent"), ("--difficulty", "easy,medium")):
        parser.add_argument(flag, default=default)
    parser.add_argument(
        "--strip_think",
        action="store_true",
        help="yanıttan <think>...</think> at (177M için daha güvenli)",
    )
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    token = os.environ.get("HF_TOKEN")
    try:
        from huggingface_hub import get_token

        if not token:
            token = get_token()
    except Exception:
        # Best effort: fall back to whatever the environment variable gave us.
        pass

    sp = load_tok(token)
    rng = random.Random(args.seed)

    print("=== EN (Magpie) ===", flush=True)
    en = gather_en(sp, args)
    rng.shuffle(en)
    stats(sp, en, "EN-filtreli")  # reported on the full filtered set, before capping
    en = en[:args.n_en]

    print("=== TR (Bilge) ===", flush=True)
    tr = gather_tr(sp, args, token)
    rng.shuffle(tr)
    stats(sp, tr, "TR-filtreli")
    tr = tr[:args.n_tr]

    data = en + tr
    rng.shuffle(data)

    with open(args.out, "w", encoding="utf-8") as sink:
        sink.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in data)

    stats(sp, data, "TOPLAM")
    print(f"\n[bitti] {len(data)} örnek (EN {len(en)} + TR {len(tr)}) -> {args.out}", flush=True)
# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()