"""
Phase 5 — SmartCore V1 eval baseline (STANDALONE / self-contained).

Multiple-choice log-likelihood: compute the model's conditional log-probability for each
option → pick the highest → compare with the gold answer.
acc = raw summed log-probability, acc_norm = divided by the token length.

The model definition is EMBEDDED (identical to faz3_train.py) → it does NOT depend on any local import.
Environment: Colab GPU + mamba-og fork. Data: HF datasets (decontaminated in Phase 1 → fair).

Usage (fork installed + logged in to HF):
    HF_TOKEN=hf_xxx python faz5_eval.py --tasks xcopa --limit 100
    HF_TOKEN=hf_xxx python faz5_eval.py --tasks xcopa,belebele,hellaswag --limit 200
"""
| import os, sys, argparse |
| import torch, torch.nn as nn, torch.nn.functional as F |
| from functools import partial |
|
|
| try: |
| from mamba_ssm.modules.block import Block |
| from mamba_ssm.modules.mamba3 import Mamba3 |
| from mamba_ssm.modules.mlp import GatedMLP |
| from mamba_ssm.ops.triton.layer_norm import RMSNorm |
| except Exception as e: |
| sys.exit(f"[hata] mamba-og fork yok ({e!r}). Önce wheel kurulum hücresini çalıştır (CUDA gerekir).") |
|
|
|
|
| |
def _rms(x, w, eps=1e-5):
    """Apply RMS normalization over the last dimension and scale by ``w``.

    Args:
        x: Input tensor; normalized along its last axis.
        w: Scale multiplied onto the normalized values (must broadcast with ``x``).
        eps: Constant added to the mean square before the reciprocal square root.

    Returns:
        ``x * rsqrt(mean(x**2, -1) + eps) * w``.
    """
    mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
    inv_rms = torch.rsqrt(mean_sq + eps)
    normed = x * inv_rms
    return normed * w
|
|
|
|
def _rot_half(x):
    """Swap the two halves of the last dimension, negating the second: ``(a, b) -> (-b, a)``.

    This is the half-rotation term combined with sine in the rotary embedding
    computed by ``GQAMixer._rope``.
    """
    first, second = torch.chunk(x, 2, dim=-1)
    return torch.cat([second.neg(), first], dim=-1)
|
|
|
|
class GQAMixer(nn.Module):
    """Causal grouped-query self-attention with RMS-normalized q/k and rotary embedding.

    Queries use ``n_heads`` heads while keys/values use ``n_kv`` heads that are
    repeated ``n_heads // n_kv`` times before attention.

    Args:
        dim: Input/output feature width; the head size is ``dim // n_heads``.
        n_heads: Number of query heads.
        n_kv: Number of key/value heads.
        base: Frequency base of the rotary embedding.
        layer_idx: Accepted for constructor compatibility; not used by this class.
        device, dtype: Forwarded to the projections and the norm scales.
    """

    def __init__(self, dim, n_heads=12, n_kv=3, base=10000.0, layer_idx=None, device=None, dtype=None):
        super().__init__()
        head_dim = dim // n_heads
        self.nh, self.nkv, self.hd = n_heads, n_kv, head_dim
        self.rep = n_heads // n_kv
        factory = dict(device=device, dtype=dtype)
        q_width = n_heads * head_dim
        kv_width = n_kv * head_dim
        self.q_proj = nn.Linear(dim, q_width, bias=False, **factory)
        self.k_proj = nn.Linear(dim, kv_width, bias=False, **factory)
        self.v_proj = nn.Linear(dim, kv_width, bias=False, **factory)
        self.out_proj = nn.Linear(q_width, dim, bias=False, **factory)
        # Learned RMS-norm scales over the head dimension, for queries and keys.
        self.qn = nn.Parameter(torch.ones(head_dim, **factory))
        self.kn = nn.Parameter(torch.ones(head_dim, **factory))
        # Inverse rotary frequencies; non-persistent, so absent from the state_dict.
        exponents = torch.arange(0, head_dim, 2, device=device).float() / head_dim
        self.register_buffer("inv", 1.0 / (base ** exponents), persistent=False)

    def _rope(self, x, T):
        """Rotate ``x`` (as passed by ``forward``: batch, heads, T, head_dim) for positions 0..T-1."""
        positions = torch.arange(T, device=x.device, dtype=torch.float32)
        half_angles = torch.outer(positions, self.inv)
        angles = torch.cat((half_angles, half_angles), dim=-1)
        cos = angles.cos()[None, None]
        sin = angles.sin()[None, None]
        rotated = x * cos + _rot_half(x) * sin
        return rotated.to(x.dtype)

    def forward(self, x, **kw):
        """Return the attention output for ``x`` of shape (B, T, dim); extra kwargs are ignored."""
        B, T, _ = x.shape

        def split_heads(proj, n):
            # (B, T, n * hd) -> (B, n, T, hd)
            return proj(x).view(B, T, n, self.hd).transpose(1, 2)

        q = split_heads(self.q_proj, self.nh)
        k = split_heads(self.k_proj, self.nkv)
        v = split_heads(self.v_proj, self.nkv)
        # The norm is computed in float32 and cast back to the input dtype.
        q = _rms(q.float(), self.qn.float()).to(x.dtype)
        k = _rms(k.float(), self.kn.float()).to(x.dtype)
        q = self._rope(q, T)
        k = self._rope(k, T)
        # Expand the shared key/value heads to match the query head count.
        k = k.repeat_interleave(self.rep, dim=1)
        v = v.repeat_interleave(self.rep, dim=1)
        attn = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        merged = attn.transpose(1, 2).contiguous().view(B, T, -1)
        return self.out_proj(merged)
|
|
|
|
class HybridLM(nn.Module):
    """Language model built from ``Block`` layers whose mixer is Mamba3 or GQAMixer.

    A layer uses ``GQAMixer`` when its 1-based index is a multiple of
    ``cfg["attn_every"]`` and it is neither the first nor the last layer; every
    other layer uses ``Mamba3``. The output head shares its weight with the
    embedding.

    Args:
        cfg: Dict with keys ``vocab_size``, ``d_model``, ``n_layers``,
            ``attn_every``, ``n_heads``, ``n_kv_heads``, ``d_state``, ``expand``,
            ``head_dim``, ``ngroups``, ``rope_fraction``, ``is_mimo``,
            ``mimo_rank``, ``chunk_size``, ``d_intermediate`` and optionally
            ``scaled_embed`` (default False).
        device, dtype: Forwarded to every submodule.
    """

    def __init__(self, cfg, device=None, dtype=None):
        super().__init__()
        self.cfg = cfg
        self.vocab = cfg["vocab_size"]
        self.scaled_embed = cfg.get("scaled_embed", False)
        width = cfg["d_model"]
        n_layers = cfg["n_layers"]
        factory = {"device": device, "dtype": dtype}
        self.embedding = nn.Embedding(self.vocab, width, **factory)
        self.layers = nn.ModuleList()
        self.attn_idx = []  # indices of the layers that use attention
        for idx in range(n_layers):
            on_stride = (idx + 1) % cfg["attn_every"] == 0
            is_edge = idx in (0, n_layers - 1)
            if on_stride and not is_edge:
                mixer_cls = partial(
                    GQAMixer,
                    n_heads=cfg["n_heads"],
                    n_kv=cfg["n_kv_heads"],
                    layer_idx=idx,
                    **factory,
                )
                self.attn_idx.append(idx)
            else:
                ssm_kwargs = {
                    "d_state": cfg["d_state"],
                    "expand": cfg["expand"],
                    "headdim": cfg["head_dim"],
                    "ngroups": cfg["ngroups"],
                    "rope_fraction": cfg["rope_fraction"],
                    "is_outproj_norm": False,
                    "is_mimo": cfg["is_mimo"],
                    "mimo_rank": cfg["mimo_rank"],
                    "chunk_size": cfg["chunk_size"],
                }
                mixer_cls = partial(Mamba3, layer_idx=idx, **ssm_kwargs, **factory)
            mlp_cls = partial(GatedMLP, hidden_features=cfg["d_intermediate"],
                              out_features=width, **factory)
            norm_cls = partial(RMSNorm, eps=1e-5, **factory)
            block = Block(width, mixer_cls, mlp_cls, norm_cls=norm_cls,
                          fused_add_norm=True, residual_in_fp32=True)
            block.layer_idx = idx
            self.layers.append(block)
        self.norm_f = RMSNorm(width, eps=1e-5, **factory)
        self.lm_head = nn.Linear(width, self.vocab, bias=False, **factory)
        # Weight tying: the output projection reuses the embedding matrix.
        self.lm_head.weight = self.embedding.weight

    def forward(self, ids):
        """Return vocabulary logits for the token-id tensor ``ids``."""
        hidden = self.embedding(ids)
        if self.scaled_embed:
            hidden = hidden * (self.cfg["d_model"] ** 0.5)
        residual = None
        for layer in self.layers:
            # Each block returns its output together with the running second stream.
            hidden, residual = layer(hidden, residual)
        if residual is not None:
            hidden = hidden + residual
        hidden = self.norm_f(hidden)
        return self.lm_head(hidden.to(self.lm_head.weight.dtype))
|
|
|
|
| |
def load_tok(token):
    """Download the tokenizer model from the HF repo and load it with SentencePiece.

    Args:
        token: Hugging Face access token passed to ``hf_hub_download``.

    Returns:
        The loaded ``SentencePieceProcessor``; its vocabulary size is printed.
    """
    import sentencepiece as spm
    from huggingface_hub import hf_hub_download

    model_path = hf_hub_download(
        "kdirgul/smartcore-v1",
        "tokenizer/tokenizer.model",
        repo_type="model",
        token=token,
    )
    tokenizer = spm.SentencePieceProcessor(model_file=model_path)
    vocab_size = tokenizer.get_piece_size()
    print(f"[tok] vocab={vocab_size}", flush=True)
    return tokenizer
|
|
|
|
def _newest_ckpt(files):
    """Pick the base checkpoint with the highest numeric step from repo file paths.

    Only paths starting with ``checkpoints/step_`` and ending with ``ckpt.pt``
    are considered. Steps are compared as integers so that an unpadded
    ``step_10000`` ranks above ``step_9000`` (plain string comparison would not);
    the path itself is the tie-breaker. Paths without digits after ``step_``
    rank below every numbered one.

    Raises:
        ValueError: If no path matches the checkpoint pattern.
    """
    import re

    def sort_key(path):
        found = re.search(r"step_(\d+)", path)
        return (int(found.group(1)) if found else -1, path)

    candidates = [f for f in files
                  if f.startswith("checkpoints/step_") and f.endswith("ckpt.pt")]
    if not candidates:
        raise ValueError("no 'checkpoints/step_*ckpt.pt' file found in kdirgul/smartcore-v1")
    return max(candidates, key=sort_key)


def latest_ckpt(token):
    """Download the most recent base checkpoint from the HF repo.

    Args:
        token: Hugging Face access token used for listing and downloading.

    Returns:
        Local path of the downloaded checkpoint file.

    Raises:
        ValueError: If the repo contains no matching checkpoint.
    """
    from huggingface_hub import HfApi, hf_hub_download

    api = HfApi(token=token)
    latest = _newest_ckpt(api.list_repo_files("kdirgul/smartcore-v1", repo_type="model"))
    print(f"[ckpt] {latest}", flush=True)
    return hf_hub_download("kdirgul/smartcore-v1", latest, repo_type="model", token=token)
|
|
|
|
def resolve_ckpt(spec, token):
    """Resolve a checkpoint spec to a local file path.

    An empty spec selects the latest base checkpoint; a spec naming an existing
    local path is returned unchanged; anything else is treated as a path inside
    the HF repo (e.g. sft/epoch_2/ckpt.pt) and downloaded.
    """
    if spec and os.path.exists(spec):
        return spec
    if not spec:
        return latest_ckpt(token)

    from huggingface_hub import hf_hub_download

    print(f"[ckpt] HF: {spec}", flush=True)
    return hf_hub_download(
        "kdirgul/smartcore-v1",
        spec,
        repo_type="model",
        token=token,
    )
|
|
|
|
| |
@torch.no_grad()
def loglik(model, sp, ctx, cont, max_len=2048):
    """Score a continuation: summed log-probability of ``cont`` given ``ctx``.

    The continuation length is the token-count difference between the encoded
    ``ctx + cont`` and the encoded ``ctx`` (at least 1). The token sequence is
    left-truncated to the last ``max_len`` tokens before the forward pass.

    Args:
        model: Callable returning logits indexed as (batch, position, vocab).
        sp: Tokenizer exposing ``encode(text, out_type=int)``.
        ctx: Context string.
        cont: Continuation string appended to ``ctx``.
        max_len: Maximum number of tokens fed to the model (positive).

    Returns:
        ``(total_logprob, n_tokens)`` where ``n_tokens`` is the number of tokens
        that were actually scored (at least 1), so that ``total / n_tokens`` is a
        true per-token average even when the context is empty or truncated away.
    """
    ctx_ids = sp.encode(ctx, out_type=int)
    full_ids = sp.encode(ctx + cont, out_type=int)
    n_cont = max(1, len(full_ids) - len(ctx_ids))
    full_ids = full_ids[-max_len:]

    # Only len - 1 tokens have a preceding position to be predicted from.
    n_pred = len(full_ids) - 1
    if n_pred < 1:
        return 0.0, 1
    n_scored = min(n_cont, n_pred)

    # Follow the model's device rather than assuming CUDA; fall back to CUDA
    # for a parameter-less model to keep the previous behavior.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cuda")

    x = torch.tensor([full_ids], device=device)
    with torch.autocast(device_type=device.type, dtype=torch.bfloat16):
        logits = model(x)[0].float()

    # Logits at position t predict token t + 1; normalize only the rows that
    # predict the scored continuation tokens instead of the whole sequence.
    rows = logits[n_pred - n_scored:n_pred]
    tgt = x[0, len(full_ids) - n_scored:]
    lp = F.log_softmax(rows, dim=-1)
    tok_lp = lp[torch.arange(n_scored, device=device), tgt]
    return tok_lp.sum().item(), n_scored
|
|
|
|
def predict(model, sp, ctx, conts):
    """Pick the best continuation by raw and by length-normalized log-likelihood.

    Args:
        model, sp: Model and tokenizer forwarded to ``loglik``.
        ctx: Shared context string.
        conts: Candidate continuation strings.

    Returns:
        ``(raw, norm)``: index of the candidate with the highest summed
        log-probability, and index of the one with the highest log-probability
        divided by its token count. Ties resolve to the lowest index.
    """
    scored = [loglik(model, sp, ctx, option) for option in conts]
    totals = [total for total, _ in scored]
    per_token = [total / n_tok for total, n_tok in scored]
    indices = range(len(conts))
    raw = max(indices, key=totals.__getitem__)
    norm = max(indices, key=per_token.__getitem__)
    return raw, norm
|
|
|
|
| |
def task_xcopa(limit):
    """Build items from the Turkish XCOPA validation split.

    The context is the premise (trailing period removed) followed by a
    connective chosen by the question type; the two choices are the
    continuations.

    Args:
        limit: Maximum number of items; a falsy value means no limit.

    Returns:
        List of ``(context, continuations, gold_index)`` tuples.
    """
    from datasets import load_dataset

    rows = load_dataset("cambridgeltl/xcopa", "tr", split="validation")
    connector = {"cause": "çünkü", "effect": "bu yüzden"}
    items = []
    for row in rows:
        premise = row["premise"].strip().rstrip(".")
        link = connector[row["question"]]
        options = [" " + row[key].strip() for key in ("choice1", "choice2")]
        items.append((f"{premise} {link}", options, int(row["label"])))
        if limit and len(items) >= limit:
            break
    return items
|
|
|
|
def task_belebele(limit):
    """Build items from the Turkish (tur_Latn) Belebele test split.

    The context is the passage plus the question in a "Soru/Cevap" prompt; the
    four answers are the continuations and the 1-based gold number is converted
    to a 0-based index.

    Args:
        limit: Maximum number of items; a falsy value means no limit.

    Returns:
        List of ``(context, continuations, gold_index)`` tuples.
    """
    from datasets import load_dataset

    rows = load_dataset("facebook/belebele", "tur_Latn", split="test")
    items = []
    for row in rows:
        passage = row["flores_passage"].strip()
        question = row["question"].strip()
        prompt = f"{passage}\nSoru: {question}\nCevap:"
        answers = [" " + row[f"mc_answer{k}"].strip() for k in range(1, 5)]
        gold = int(row["correct_answer_num"]) - 1
        items.append((prompt, answers, gold))
        if limit and len(items) >= limit:
            break
    return items
|
|
|
|
def task_hellaswag(limit):
    """Build items from the HellaSwag validation split.

    The context is the stripped ``ctx`` field and each ending, prefixed with a
    space, is a continuation.

    Args:
        limit: Maximum number of items; a falsy value means no limit.

    Returns:
        List of ``(context, continuations, gold_index)`` tuples.
    """
    from datasets import load_dataset

    rows = load_dataset("Rowan/hellaswag", split="validation")
    items = []
    for row in rows:
        prompt = row["ctx"].strip()
        endings = [" " + ending.strip() for ending in row["endings"]]
        gold = int(row["label"])
        items.append((prompt, endings, gold))
        if limit and len(items) >= limit:
            break
    return items
|
|
|
|
def task_xnli(limit):
    """Build 3-way multiple-choice items from the Turkish XNLI validation split.

    Premise and hypothesis form the prompt; the options are
    Doğru / Belirsiz / Yanlış (true / uncertain / false), matching the labels
    0 = entailment, 1 = neutral, 2 = contradiction.

    Args:
        limit: Maximum number of items; a falsy value means no limit.

    Returns:
        List of ``(context, continuations, gold_index)`` tuples; every item
        shares the same options list.
    """
    from datasets import load_dataset

    rows = load_dataset("facebook/xnli", "tr", split="validation")
    opts = [" Doğru", " Belirsiz", " Yanlış"]
    items = []
    for row in rows:
        premise = row["premise"].strip()
        hypothesis = row["hypothesis"].strip()
        prompt = f"{premise}\nSoru: \"{hypothesis}\" — doğru mu, belirsiz mi, yanlış mı?\nCevap:"
        items.append((prompt, opts, int(row["label"])))
        if limit and len(items) >= limit:
            break
    return items
|
|
|
|
def task_turkishmmlu(limit):
    """Build items from the TurkishMMLU ("All" config) test split.

    Academic multiple choice with 4-5 options; ``answer`` is read as a 0-based
    integer index. Rows whose answer index falls outside the choices are skipped
    and do not count toward ``limit``.

    Args:
        limit: Maximum number of items; a falsy value means no limit.

    Returns:
        List of ``(context, continuations, gold_index)`` tuples.
    """
    from datasets import load_dataset

    rows = load_dataset("AYueksel/TurkishMMLU", "All", split="test")
    items = []
    for row in rows:
        options = row["choices"]
        gold = int(row["answer"])
        if gold < 0 or gold >= len(options):
            continue
        question = row["question"].strip()
        prompt = f"Soru: {question}\nCevap:"
        answers = [" " + str(option).strip() for option in options]
        items.append((prompt, answers, gold))
        if limit and len(items) >= limit:
            break
    return items
|
|
|
|
# Registry mapping each name accepted by --tasks to the loader that builds its items.
TASKS = {
    "xcopa": task_xcopa,
    "belebele": task_belebele,
    "hellaswag": task_hellaswag,
    "xnli": task_xnli,
    "turkishmmlu": task_turkishmmlu,
}
|
|
|
|
def run_task(model, sp, name, items):
    """Evaluate one task and print its accuracy line.

    Args:
        model, sp: Model and tokenizer forwarded to ``predict``.
        name: Task name used in the progress and result lines.
        items: Sequence of ``(context, continuations, gold_index)`` tuples.

    Returns:
        ``(acc, acc_norm)``: fraction of items where the raw / length-normalized
        prediction equals the gold index. For an empty ``items`` both values are
        NaN, since no accuracy is defined.
    """
    total = len(items)
    if total == 0:
        # Nothing to score: report it instead of failing on items[0] / division by zero.
        print(f"[{name}] örnek yok (N=0), atlandı", flush=True)
        return float("nan"), float("nan")

    raw_ok = norm_ok = 0
    for done, (ctx, conts, gold) in enumerate(items, start=1):
        raw_pick, norm_pick = predict(model, sp, ctx, conts)
        raw_ok += raw_pick == gold
        norm_ok += norm_pick == gold
        if done % 50 == 0:
            print(f" {name} {done}/{total}...", flush=True)

    # Chance level averaged over the items, so tasks with a varying number of
    # options (e.g. 4-5) report the right baseline, not just the first item's.
    rnd = sum(1.0 / len(conts) for _, conts, _ in items) / total
    print(f"[{name}] acc={raw_ok/total:.3f} | acc_norm={norm_ok/total:.3f} | N={total} | random={rnd:.3f}",
          flush=True)
    return raw_ok / total, norm_ok / total
|
|
|
|
def main():
    """Run the evaluation from the command line.

    Parses ``--tasks`` (comma-separated names), ``--limit`` (items per task,
    0 means all) and ``--ckpt`` (HF path, local file, or empty for the latest
    base checkpoint); loads the tokenizer and the model; evaluates each known
    task and prints a summary. Unknown task names are skipped with a message.

    Exits with an error message when CUDA is not available.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--tasks", default="xcopa,belebele,hellaswag")
    ap.add_argument("--limit", type=int, default=200)
    ap.add_argument("--ckpt", default=None, help="HF yolu (sft/epoch_2/ckpt.pt) | yerel .pt | boş=base latest")
    args = ap.parse_args()

    # Explicit check rather than `assert`, which is removed under `python -O`.
    if not torch.cuda.is_available():
        sys.exit("[hata] CUDA yok (Colab GPU gerekir).")
    torch.set_float32_matmul_precision("high")
    from huggingface_hub import get_token
    token = os.environ.get("HF_TOKEN") or get_token()

    sp = load_tok(token)
    # NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
    st = torch.load(resolve_ckpt(args.ckpt, token), map_location="cpu")
    model = HybridLM(st["cfg"], device="cuda", dtype=torch.bfloat16)
    # strict=False stays best-effort, but a key mismatch must be visible: otherwise
    # a mis-keyed checkpoint would be evaluated with randomly initialized weights.
    load_info = model.load_state_dict(st["model"], strict=False)
    # lm_head.weight is tied to embedding.weight, so its absence alone is harmless.
    missing = [k for k in load_info.missing_keys if k != "lm_head.weight"]
    unexpected = list(load_info.unexpected_keys)
    if missing or unexpected:
        print(f"[uyarı] state_dict uyuşmazlığı | eksik={len(missing)} {missing[:5]} "
              f"| beklenmeyen={len(unexpected)} {unexpected[:5]}", flush=True)
    model.eval()
    tag = f"sft/epoch={st.get('epoch')}" if st.get("sft") else f"base step={st.get('step','?')}"
    print(f"[model] {tag} | {'MIMO' if st['cfg'].get('is_mimo') else 'SISO'}\n", flush=True)

    results = {}
    for name in [t.strip() for t in args.tasks.split(",") if t.strip()]:
        if name not in TASKS:
            print(f"[atla] bilinmeyen görev: {name}")
            continue
        print(f"=== {name} yükleniyor ===", flush=True)
        results[name] = run_task(model, sp, name, TASKS[name](args.limit or None))

    print("\n===== ÖZET (baseline) =====")
    for name, (acc, accn) in results.items():
        print(f"{name:12s} acc={acc:.3f} acc_norm={accn:.3f}")


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|