smartcore-v1 / code /kod /faz6_prep_data.py

Upload code/kod/faz6_prep_data.py with huggingface_hub

49ac326 verified 8 days ago

6.21 kB

	"""
	Faz 6 ön-hazırlık — bilingual SFT verisi üretici.

	PIVOT-DİL kararı: reasoning TEK DİLDE = İngilizce. EN=Magpie (CoT, reasoning taşır),
	TR=Quardo Turkish-Alpaca-GPT-4O (CoT'suz direkt-cevap, akıcılık/talimat-takibi).
	Magpie(EN) + Quardo(TR) → filtrele/dengele → {instruction, (input), output} JSONL.
	Filtre: token uzunluğu ≤ max_len (KESME YOK, sığmayan atılır) + Magpie kalite (input_quality/difficulty).
	Çıktı faz6_sft.py'nin --data'sına verilir.

	Çalıştırma (Colab/yerel; datasets + sentencepiece kurulu, HF login — SP tokenizer private repo'da):
	pip install "datasets>=2.18" sentencepiece
	HF_TOKEN=hf_xxx python faz6_prep_data.py --out sft.jsonl --max_len 2048 --n_en 15000 --n_tr 15000
	Not: 250K Magpie tarandığı için birkaç dk sürer.
	"""
	import os, sys, json, re, random, argparse

	# English reasoning data: responses carry chain-of-thought (CoT).
	EN_REPO = "Magpie-Align/Magpie-Reasoning-V2-250K-CoT-Deepseek-R1-Llama-70B"
	# Turkish direct-answer data (no CoT; GPT-4o; instruction/input/output columns).
	TR_REPO = "Quardo/Turkish-Alpaca-GPT-4O-V2"
	# Matches one <think>...</think> block plus any whitespace that follows it.
	# `.*?` is lazy so two separate blocks are removed independently, and with
	# DOTALL it spans multi-line reasoning. The earlier `.?` / `\s` form matched
	# at most one character between the tags and demanded exactly one trailing
	# whitespace character, so real reasoning blocks were never stripped.
	THINK_RE = re.compile(r"<think>.*?</think>\s*", re.DOTALL)


	# ───────────── saf-mantık (yerelde test edilebilir) ─────────────
	def build_prompt(instr, inp=""):
	    """Render the prompt text that precedes the answer.

	    Both fields are whitespace-trimmed, and a falsy ``inp`` (``None`` or
	    ``""``) counts as empty. The "### Girdi:" (input) section is emitted
	    only when the trimmed input is non-empty.
	    """
	    instruction = instr.strip()
	    extra = (inp or "").strip()
	    if not extra:
	        return f"### Talimat:\n{instruction}\n\n### Yanıt:\n"
	    return f"### Talimat:\n{instruction}\n\n### Girdi:\n{extra}\n\n### Yanıt:\n"


	def parse_messages(msgs):
	    """Extract the first (user, assistant) pair from a chat ``messages`` list.

	    Messages with any other role (e.g. system) are skipped, and an assistant
	    message is accepted only after a user message has been seen. Contents
	    are whitespace-trimmed; a missing or ``None`` content becomes ``""``.

	    Returns:
	        ``(instruction, output)``; either element is ``None`` when no
	        matching message was found. A falsy ``msgs`` yields ``(None, None)``.
	    """
	    question = None
	    answer = None
	    for message in (msgs if msgs else []):
	        speaker = message.get("role")
	        text = (message.get("content") or "").strip()
	        if question is None:
	            # Still waiting for the first user turn; everything else is ignored.
	            if speaker == "user":
	                question = text
	        elif answer is None and speaker == "assistant":
	            answer = text
	    return question, answer


	def maybe_strip_think(text, strip):
	    """Optionally remove THINK_RE matches from ``text``.

	    When ``strip`` is truthy, every THINK_RE match is deleted and the result
	    is whitespace-trimmed. Otherwise ``text`` is returned untouched (not
	    even trimmed).
	    """
	    if not strip:
	        return text
	    cleaned = THINK_RE.sub("", text)
	    return cleaned.strip()


	def tok_len(sp, instr, out, inp=""):
	    """Return the token count of prompt + trimmed answer, plus one for EOS.

	    The prompt comes from ``build_prompt(instr, inp)`` and the text is
	    encoded with ``sp.encode(..., out_type=int)``. Per the original note
	    this is meant to mirror the tokenization in faz6_sft (not verifiable
	    from this file).
	    """
	    text = build_prompt(instr, inp) + out.strip()
	    token_ids = sp.encode(text, out_type=int)
	    return 1 + len(token_ids)  # +1 accounts for the EOS token


	# ───────────── yükleyiciler (datasets gerekir) ─────────────
	def load_tok(token):
	    """Download the tokenizer model file and load it with SentencePiece.

	    Args:
	        token: value forwarded to ``hf_hub_download`` as ``token``.

	    Returns:
	        A ``SentencePieceProcessor`` built from the downloaded model file.
	    """
	    # Third-party imports stay local, as in the rest of this file's loaders.
	    import sentencepiece as spm
	    from huggingface_hub import hf_hub_download

	    model_path = hf_hub_download(
	        repo_id="kdirgul/smartcore-v1",
	        filename="tokenizer/tokenizer.model",
	        repo_type="model",
	        token=token,
	    )
	    processor = spm.SentencePieceProcessor(model_file=model_path)
	    return processor


	def gather_en(sp, args):
	    """Collect English rows from EN_REPO that pass the quality and length filters.

	    A row is kept when all of the following hold:
	      * its ``language`` is EN (a missing/empty language counts as EN);
	      * ``input_quality`` and ``difficulty`` are in the comma-separated
	        allow-lists ``args.quality`` / ``args.difficulty``;
	      * instruction and response are both non-empty after trimming (and
	        after optional think-stripping, controlled by ``args.strip_think``);
	      * its ``tok_len`` does not exceed ``args.max_len``.
	    Rows are never truncated; anything too long is dropped.

	    Returns:
	        A list of ``{"instruction": ..., "output": ...}`` dicts.
	    """
	    from datasets import load_dataset

	    dataset = load_dataset(EN_REPO, split="train")
	    allowed_quality = set(args.quality.split(","))
	    allowed_difficulty = set(args.difficulty.split(","))
	    # Cheap character-count pre-filter: rows above this are treated as
	    # certainly too long, which avoids tokenizing them.
	    char_budget = args.max_len * 6

	    kept = []
	    seen = 0
	    for seen, row in enumerate(dataset, start=1):
	        language = row.get("language") or "EN"
	        if language.upper() != "EN":
	            continue
	        if not (row.get("input_quality") in allowed_quality
	                and row.get("difficulty") in allowed_difficulty):
	            continue
	        prompt = (row.get("instruction") or "").strip()
	        answer = maybe_strip_think((row.get("response") or "").strip(), args.strip_think)
	        if not (prompt and answer):
	            continue
	        if len(prompt) + len(answer) > char_budget:
	            continue
	        if tok_len(sp, prompt, answer) <= args.max_len:
	            kept.append({"instruction": prompt, "output": answer})
	    print(f"[en] tarandı {seen} → tutuldu {len(kept)}", flush=True)
	    return kept


	def gather_tr(sp, args, token):
	    """Collect Turkish direct-answer rows (no chain-of-thought) from TR_REPO.

	    Reads the ``instruction`` / ``input`` / ``output`` columns. A row is kept
	    when instruction and output are non-empty after trimming (and optional
	    think-stripping) and its ``tok_len`` fits in ``args.max_len``. Rows are
	    never truncated; anything too long is dropped.

	    Args:
	        sp: tokenizer passed through to ``tok_len``.
	        args: parsed CLI namespace (uses ``max_len`` and ``strip_think``).
	        token: value forwarded to ``load_dataset`` as ``token``.

	    Returns:
	        A list of dicts with ``instruction`` and ``output`` keys, plus
	        ``input`` only when the trimmed input is non-empty.
	    """
	    from datasets import load_dataset

	    dataset = load_dataset(TR_REPO, split="train", token=token)
	    # Cheap character-count pre-filter applied before the exact token count.
	    char_budget = args.max_len * 6

	    kept = []
	    for row in dataset:
	        prompt = (row.get("instruction") or "").strip()
	        extra = (row.get("input") or "").strip()
	        answer = maybe_strip_think((row.get("output") or "").strip(), args.strip_think)
	        if not (prompt and answer):
	            continue
	        if len(prompt) + len(extra) + len(answer) > char_budget:
	            continue
	        if tok_len(sp, prompt, answer, extra) > args.max_len:
	            continue
	        record = {"instruction": prompt, "output": answer}
	        if extra:
	            record["input"] = extra
	        kept.append(record)
	    print(f"[tr] {TR_REPO} → tutuldu {len(kept)}", flush=True)
	    return kept


	def stats(sp, rows, name):
	    """Print the row count and a token-length summary for ``rows``.

	    Prints ``[name] 0 örnek`` and returns when ``rows`` is empty. Otherwise
	    token lengths are measured with ``tok_len`` on at most 3000 rows (larger
	    inputs are sub-sampled with the module-level ``random`` generator) and
	    the median, 90th-percentile and maximum are printed. ``n`` in the output
	    is always the full row count, not the sample size.

	    Args:
	        sp: tokenizer passed through to ``tok_len``.
	        rows: list of dicts with ``instruction``, ``output`` and optional
	            ``input`` keys.
	        name: label shown in square brackets at the start of the line.
	    """
	    if not rows:
	        print(f"[{name}] 0 örnek", flush=True)
	        return
	    sample = rows if len(rows) <= 3000 else random.sample(rows, 3000)
	    lengths = sorted(
	        tok_len(sp, r["instruction"], r["output"], r.get("input", ""))
	        for r in sample
	    )
	    count = len(lengths)
	    # Plain "|" separator: the previous "\|" was an invalid escape sequence,
	    # which triggers a warning on recent Python versions and printed a stray
	    # backslash in the report line.
	    print(
	        f"[{name}] n={len(rows)} | token: med={lengths[count // 2]} "
	        f"p90={lengths[int(count * 0.9)]} max={lengths[-1]}",
	        flush=True,
	    )


	def main():
	    """CLI entry point: filter both corpora, sample, mix, and write JSONL.

	    Steps:
	      1. Parse the command-line options.
	      2. Resolve the Hub token from ``HF_TOKEN``, falling back to
	         ``huggingface_hub.get_token()`` when that is available.
	      3. Load the tokenizer; gather, shuffle and report each language's
	         filtered rows, then keep the first ``n_en`` / ``n_tr`` of them.
	      4. Shuffle the combined rows and write one JSON object per line to
	         ``--out`` (UTF-8, non-ASCII characters kept as-is).
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument("--out", default="sft.jsonl")
	    parser.add_argument("--max_len", type=int, default=2048)
	    parser.add_argument("--n_en", type=int, default=15000)
	    parser.add_argument("--n_tr", type=int, default=15000)
	    parser.add_argument("--quality", default="good,excellent")
	    parser.add_argument("--difficulty", default="easy,medium")
	    parser.add_argument(
	        "--strip_think",
	        action="store_true",
	        help="yanıttan <think>...</think> at (177M için daha güvenli)",
	    )
	    parser.add_argument("--seed", type=int, default=42)
	    args = parser.parse_args()

	    token = os.environ.get("HF_TOKEN")
	    try:
	        # Best-effort fallback to a cached login; any failure here is ignored
	        # and the environment value (possibly None) is used as-is.
	        from huggingface_hub import get_token
	        if not token:
	            token = get_token()
	    except Exception:
	        pass

	    sp = load_tok(token)
	    rng = random.Random(args.seed)

	    print("=== EN (Magpie) ===", flush=True)
	    en_rows = gather_en(sp, args)
	    rng.shuffle(en_rows)
	    stats(sp, en_rows, "EN-filtreli")  # reported before truncation
	    en_rows = en_rows[:args.n_en]

	    # NOTE(review): the label says "Bilge" while TR_REPO points at the Quardo
	    # dataset; the label looks stale. Confirm before changing it.
	    print("=== TR (Bilge) ===", flush=True)
	    tr_rows = gather_tr(sp, args, token)
	    rng.shuffle(tr_rows)
	    stats(sp, tr_rows, "TR-filtreli")  # reported before truncation
	    tr_rows = tr_rows[:args.n_tr]

	    mixed = en_rows + tr_rows
	    rng.shuffle(mixed)
	    with open(args.out, "w", encoding="utf-8") as sink:
	        sink.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in mixed)
	    stats(sp, mixed, "TOPLAM")
	    print(
	        f"\n[bitti] {len(mixed)} örnek (EN {len(en_rows)} + TR {len(tr_rows)}) -> {args.out}",
	        flush=True,
	    )


	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == "__main__":
	main()