| """ |
| Faz 7 — RAG (retrieval-augmented generation) — SmartCore V1. (Basamak 2) |
| |
| Pipeline: doküman → chunk → embed → index | soru → embed → top-k ara → |
| bağlamı SFT şablonunun '### Girdi:' alanına enjekte → v1-instruct cevaplar. |
| Amaç: 177M'in olgusal halüsinasyonunu retrieval ile çöz (ezberden değil, getirerek). |
| |
| Generator: HybridLM gömülü (model-bağımsız). Embedding: sentence-transformers (çok dilli TR+EN). |
| Index: numpy cosine (prototip; ölçek için FAISS/turbovec sonra). 2048 bağlam → bütçe-kırpma |
| (REFRAG/headroom sıkıştırma = sonraki sürüm). |
| |
| Ortam: Colab GPU + mamba-og fork (wheel) + `pip install sentence-transformers`. |
| Kullanım: |
| HF_TOKEN=hf_xxx python faz7_rag.py --demo --query "Türkiye'nin başkenti neresi?" |
| HF_TOKEN=hf_xxx python faz7_rag.py --docs /content/dokumanlar # interaktif |
| """ |
| import os, sys, glob, re, argparse |
| import torch, torch.nn as nn, torch.nn.functional as F |
| from functools import partial |
|
|
| try: |
| from mamba_ssm.modules.block import Block |
| from mamba_ssm.modules.mamba3 import Mamba3 |
| from mamba_ssm.modules.mlp import GatedMLP |
| from mamba_ssm.ops.triton.layer_norm import RMSNorm |
| _FORK_OK, _FORK_ERR = True, None |
| except Exception as e: |
| Block = Mamba3 = GatedMLP = RMSNorm = None |
| _FORK_OK, _FORK_ERR = False, e |
|
|
|
|
| |
| def _rms(x, w, eps=1e-5): |
| return (x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)) * w |
|
|
|
|
| def _rot_half(x): |
| a, b = x.chunk(2, -1) |
| return torch.cat((-b, a), -1) |
|
|
|
|
class GQAMixer(nn.Module):
    """Causal grouped-query self-attention with RMS-scaled q/k and rotary position encoding.

    ``n_heads`` query heads share ``n_kv`` key/value heads; each key/value head is
    repeated ``n_heads // n_kv`` times before attention. The head width is
    ``dim // n_heads``.
    """

    def __init__(self, dim, n_heads=12, n_kv=3, base=10000.0, layer_idx=None, device=None, dtype=None):
        """Create the projections, per-head norm scales and rotary frequency table.

        Args:
            dim: model width (input and output feature size).
            n_heads: number of query heads.
            n_kv: number of key/value heads.
            base: base of the rotary inverse-frequency schedule.
            layer_idx: accepted for constructor compatibility; not used here.
            device, dtype: forwarded to the parameter constructors.
        """
        super().__init__()
        head_dim = dim // n_heads
        self.nh = n_heads
        self.nkv = n_kv
        self.hd = head_dim
        self.rep = n_heads // n_kv
        factory = dict(device=device, dtype=dtype)
        self.q_proj = nn.Linear(dim, n_heads * head_dim, bias=False, **factory)
        self.k_proj = nn.Linear(dim, n_kv * head_dim, bias=False, **factory)
        self.v_proj = nn.Linear(dim, n_kv * head_dim, bias=False, **factory)
        self.out_proj = nn.Linear(n_heads * head_dim, dim, bias=False, **factory)
        # Learnable RMS scales for queries and keys, one value per head channel.
        self.qn = nn.Parameter(torch.ones(head_dim, **factory))
        self.kn = nn.Parameter(torch.ones(head_dim, **factory))
        # Non-persistent: the frequency table is rebuilt here and never stored in a state_dict.
        exponents = torch.arange(0, head_dim, 2, device=device).float() / head_dim
        self.register_buffer("inv", 1.0 / (base ** exponents), persistent=False)

    def _rope(self, x, T):
        """Apply rotary position encoding for positions ``0..T-1`` and return ``x``'s dtype."""
        positions = torch.arange(T, device=x.device, dtype=torch.float32)
        angles = torch.outer(positions, self.inv)
        angles = torch.cat((angles, angles), -1)
        cos = angles.cos()[None, None]
        sin = angles.sin()[None, None]
        rotated = x * cos + _rot_half(x) * sin
        return rotated.to(x.dtype)

    def forward(self, x, **kw):
        """Run causal attention over ``x`` of shape ``(B, T, dim)``; extra keyword args are ignored."""
        B, T, _ = x.shape

        def split_heads(proj, n):
            # (B, T, n * hd) -> (B, n, T, hd)
            return proj(x).view(B, T, n, self.hd).transpose(1, 2)

        q = split_heads(self.q_proj, self.nh)
        k = split_heads(self.k_proj, self.nkv)
        v = split_heads(self.v_proj, self.nkv)
        # The q/k normalization is computed in float32 and cast back to the input dtype.
        q = _rms(q.float(), self.qn.float()).to(x.dtype)
        k = _rms(k.float(), self.kn.float()).to(x.dtype)
        q = self._rope(q, T)
        k = self._rope(k, T)
        # Expand the shared key/value heads so each query head has a matching one.
        k = k.repeat_interleave(self.rep, 1)
        v = v.repeat_interleave(self.rep, 1)
        attended = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        merged = attended.transpose(1, 2).contiguous().view(B, T, -1)
        return self.out_proj(merged)
|
|
|
|
class HybridLM(nn.Module):
    """Language model built from a stack of ``Block`` layers with Mamba3 or GQA mixers.

    Layer ``i`` (0-based) uses ``GQAMixer`` when ``(i + 1) % cfg["attn_every"] == 0``
    and it is neither the first nor the last layer; every other layer uses ``Mamba3``.
    The output head shares its weight tensor with the token embedding.
    """

    def __init__(self, cfg, device=None, dtype=None):
        """Build the model from ``cfg``.

        Args:
            cfg: mapping with ``vocab_size``, ``d_model``, ``n_layers``, ``attn_every``,
                ``d_intermediate``, the attention keys (``n_heads``, ``n_kv_heads``) and
                the Mamba3 keys (``d_state``, ``expand``, ``head_dim``, ``ngroups``,
                ``rope_fraction``, ``is_mimo``, ``mimo_rank``, ``chunk_size``);
                ``scaled_embed`` is optional and defaults to ``False``.
            device, dtype: forwarded to every submodule constructor.
        """
        super().__init__()
        self.cfg = cfg
        self.vocab = cfg["vocab_size"]
        self.scaled_embed = cfg.get("scaled_embed", False)
        width = cfg["d_model"]
        factory = {"device": device, "dtype": dtype}
        self.embedding = nn.Embedding(self.vocab, width, **factory)
        self.layers = nn.ModuleList()
        last = cfg["n_layers"] - 1
        for i in range(cfg["n_layers"]):
            use_attn = (i + 1) % cfg["attn_every"] == 0 and i not in (0, last)
            if use_attn:
                mixer_cls = partial(GQAMixer, n_heads=cfg["n_heads"], n_kv=cfg["n_kv_heads"],
                                    layer_idx=i, **factory)
            else:
                ssm_kwargs = dict(
                    d_state=cfg["d_state"],
                    expand=cfg["expand"],
                    headdim=cfg["head_dim"],
                    ngroups=cfg["ngroups"],
                    rope_fraction=cfg["rope_fraction"],
                    is_outproj_norm=False,
                    is_mimo=cfg["is_mimo"],
                    mimo_rank=cfg["mimo_rank"],
                    chunk_size=cfg["chunk_size"],
                )
                mixer_cls = partial(Mamba3, layer_idx=i, **ssm_kwargs, **factory)
            mlp_cls = partial(GatedMLP, hidden_features=cfg["d_intermediate"], out_features=width, **factory)
            norm_cls = partial(RMSNorm, eps=1e-5, **factory)
            block = Block(width, mixer_cls, mlp_cls, norm_cls=norm_cls,
                          fused_add_norm=True, residual_in_fp32=True)
            block.layer_idx = i
            self.layers.append(block)
        self.norm_f = RMSNorm(width, eps=1e-5, device=device, dtype=dtype)
        self.lm_head = nn.Linear(width, self.vocab, bias=False, device=device, dtype=dtype)
        # Weight tying: the head reuses the embedding matrix.
        self.lm_head.weight = self.embedding.weight

    def forward(self, ids):
        """Map token ids to vocabulary logits.

        Each layer is called as ``layer(hidden, residual)`` and returns the updated
        pair; the residual (when present) is added back before the final norm.
        """
        hidden = self.embedding(ids)
        if self.scaled_embed:
            hidden = hidden * (self.cfg["d_model"] ** 0.5)
        residual = None
        for layer in self.layers:
            hidden, residual = layer(hidden, residual)
        if residual is not None:
            hidden = hidden + residual
        hidden = self.norm_f(hidden)
        # Cast to the head's dtype before the projection.
        return self.lm_head(hidden.to(self.lm_head.weight.dtype))
|
|
|
|
| |
def load_tok(token, local=None):
    """Load the SentencePiece tokenizer.

    Args:
        token: Hugging Face access token passed to the hub download.
        local: optional path to a local tokenizer model; used when it exists on disk.

    Returns:
        A ``SentencePieceProcessor`` loaded from ``local`` if that file exists, otherwise
        from ``tokenizer/tokenizer.model`` downloaded from the ``kdirgul/smartcore-v1`` repo.
    """
    import sentencepiece as spm

    model_path = local if (local and os.path.exists(local)) else None
    if model_path is None:
        from huggingface_hub import hf_hub_download
        model_path = hf_hub_download("kdirgul/smartcore-v1", "tokenizer/tokenizer.model",
                                     repo_type="model", token=token)
    return spm.SentencePieceProcessor(model_file=model_path)
|
|
|
|
def resolve_ckpt(spec, token):
    """Resolve a checkpoint spec to a local file path.

    Args:
        spec: an existing local path, or a file path inside the
            ``kdirgul/smartcore-v1`` Hugging Face model repo.
        token: Hugging Face access token used for the download.

    Returns:
        ``spec`` unchanged when it exists locally, otherwise the path returned by
        ``hf_hub_download``.
    """
    if not os.path.exists(spec):
        from huggingface_hub import hf_hub_download
        print(f"[ckpt] HF: {spec}", flush=True)
        return hf_hub_download("kdirgul/smartcore-v1", spec, repo_type="model", token=token)
    return spec
|
|
|
|
@torch.no_grad()
def generate(model, sp, prompt, max_new=160, temperature=0.3, top_k=40, top_p=0.9, rep_penalty=1.2,
             device="cuda", max_ctx=2048):
    """Autoregressively sample a continuation of ``prompt``.

    The model is re-run on the whole window each step (no cache) and only the logits
    of the last position are used.

    Args:
        model: callable mapping a ``(1, T)`` id tensor to ``(1, T, vocab)`` logits.
        sp: tokenizer providing ``eos_id()``, ``encode(text, out_type=int)`` and ``decode(ids)``.
        prompt: text to condition on.
        max_new: maximum number of tokens to generate.
        temperature: ``<= 0`` selects greedy decoding; otherwise logits are divided by it.
        top_k: keep only the ``top_k`` highest logits when sampling (falsy disables).
        top_p: nucleus threshold when sampling (``>= 1.0`` disables).
        rep_penalty: penalty applied to every token already in the prompt or output
            (``1.0`` disables).
        device: device for the id tensor; bfloat16 autocast is used for any CUDA device.
        max_ctx: maximum number of tokens fed to the model; older tokens are dropped.

    Returns:
        The decoded generated text (prompt excluded, EOS excluded).

    Raises:
        ValueError: if ``max_ctx < 1`` or the prompt encodes to zero tokens.
    """
    from contextlib import nullcontext

    if max_ctx < 1:
        raise ValueError(f"max_ctx must be >= 1, got {max_ctx}")
    eos = sp.eos_id()
    ids = sp.encode(prompt, out_type=int)
    if not ids:
        raise ValueError("prompt encodes to zero tokens; nothing to condition on")
    out = list(ids)
    # Trim an over-long prompt up front so the very first forward pass also respects the window.
    x = torch.tensor([ids[-max_ctx:]], device=device)
    # Covers "cuda", "cuda:0" and torch.device objects alike.
    use_amp = torch.device(device).type == "cuda"

    def amp_context():
        if use_amp:
            return torch.autocast(device_type="cuda", dtype=torch.bfloat16)
        return nullcontext()

    for _ in range(max_new):
        with amp_context():
            logits = model(x)[0, -1].float()
        if rep_penalty != 1.0:
            # One vectorized update instead of a Python loop of per-token tensor writes.
            seen = torch.tensor(sorted(set(out)), device=logits.device)
            vals = logits[seen]
            logits[seen] = torch.where(vals > 0, vals / rep_penalty, vals * rep_penalty)
        if temperature <= 0:
            nxt = int(logits.argmax())
        else:
            logits = logits / temperature
            if top_k:
                kth = torch.topk(logits, min(top_k, logits.numel())).values[-1]
                logits[logits < kth] = -float("inf")
            probs = F.softmax(logits, -1)
            if top_p < 1.0:
                sorted_p, sorted_idx = torch.sort(probs, descending=True)
                drop = torch.cumsum(sorted_p, -1) > top_p
                # Shift right by one so the token that crosses the threshold is kept.
                drop[1:] = drop[:-1].clone()
                drop[0] = False
                sorted_p[drop] = 0
                probs = torch.zeros_like(probs).scatter_(0, sorted_idx, sorted_p)
                probs /= probs.sum()
            nxt = int(torch.multinomial(probs, 1))
        if nxt == eos:
            break
        out.append(nxt)
        x = torch.cat([x, torch.tensor([[nxt]], device=device)], 1)
        if x.shape[1] > max_ctx:
            x = x[:, -max_ctx:]
    return sp.decode([t for t in out[len(ids):] if t != eos])
|
|
|
|
| |
# Built-in Turkish + English demo corpus; main() uses it as the document set when
# --demo is passed. Each entry is treated as one document and chunked like a file.
DEMO_DOCS = [
    "Türkiye'nin başkenti Ankara'dır. Ankara ülkenin ikinci en kalabalık şehridir ve yaklaşık 5,7 milyon nüfusa sahiptir.",
    "İstanbul, Türkiye'nin en kalabalık şehridir; nüfusu yaklaşık 15,5 milyondur. İstanbul iki kıtaya yayılır.",
    "Fotosentez, bitkilerin güneş ışığı, su ve karbondioksiti kullanarak glikoz ve oksijen ürettiği biyokimyasal süreçtir.",
    "SmartCore V1, sıfırdan eğitilmiş 177 milyon parametreli, Mamba-3 ve GQA hibrit mimarili, Türkçe ve İngilizce bir dil modelidir.",
    "The capital of France is Paris, located on the river Seine. Paris is the most populous city in France.",
    "Water boils at 100 degrees Celsius at sea level under one atmosphere of pressure.",
    "The speed of light in vacuum is approximately 299,792 kilometers per second.",
    "Mount Everest is the highest mountain on Earth, with its peak at 8,849 meters above sea level.",
    "Mamba is a state space model (SSM) architecture for sequence modeling, proposed as a linear-time alternative to the Transformer.",
    "Mimar Sinan, Osmanlı İmparatorluğu'nun baş mimarıdır; Süleymaniye ve Selimiye camilerini tasarlamıştır.",
]
|
|
| |
# Default questions for the --ab comparison run; main() uses them when no --query is given.
DEMO_QUERIES = [
    "Türkiye'nin başkenti neresi?",
    "Fotosentez nedir?",
    "SmartCore V1 nedir?",
    "What is the capital of France?",
    "Su deniz seviyesinde kaç derecede kaynar?",
    "How tall is Mount Everest?",
]
|
|
|
|
def chunk_text(text, size=600, overlap=100):
    """Split ``text`` into overlapping character windows.

    Whitespace runs are collapsed to single spaces first. Windows are ``size``
    characters long and start every ``size - overlap`` characters. Chunking stops
    as soon as a window reaches the end of the text, so no trailing fragment that
    is already fully contained in the previous chunk is emitted.

    Args:
        text: input text.
        size: window length in characters; must be at least 1.
        overlap: characters shared by consecutive windows; clamped to
            ``[0, size - 1]`` so the window always advances and leaves no gaps.

    Returns:
        List of non-empty, stripped chunks (empty list for blank input).

    Raises:
        ValueError: if ``size < 1``.
    """
    if size < 1:
        raise ValueError(f"size must be >= 1, got {size}")
    text = " ".join(text.split())
    step = size - min(max(overlap, 0), size - 1)
    chunks = []
    for start in range(0, len(text), step):
        seg = text[start:start + size].strip()
        if seg:
            chunks.append(seg)
        if start + size >= len(text):
            # This window already covers the tail; later windows would only repeat it.
            break
    return chunks
|
|
|
|
| |
| _TR_WORDS = {"nedir", "ne", "hangi", "neresi", "nerede", "nereye", "kim", "kimdir", "neden", |
| "niçin", "niye", "mıdır", "midir", "mudur", "müdür", "kaç", "kaçtır", "nasıl", |
| "mı", "mi", "mu", "mü", "için", "ile", "kaçıncı"} |
|
|
|
|
| def _is_tr(question): |
| ql = question.lower() |
| if any(ch in ql for ch in "çğıöşü"): |
| return True |
| words = set(ql.replace("?", " ").replace("'", " ").replace(".", " ").split()) |
| return bool(words & _TR_WORDS) |
|
|
|
|
def build_rag_prompt(question, hits, max_ctx_chars=2400):
    """Build the instruction prompt with retrieved context placed in the '### Girdi:' field.

    Hits are added in the given order until the next one would exceed the character
    budget (each hit costs its length plus a two-character separator). If even the
    first hit does not fit, its head is kept truncated to the budget instead of
    sending the model an empty context.

    Args:
        question: user question; also selects the Turkish or English instruction text.
        hits: iterable of ``(chunk_text, score)`` pairs, best first; scores are ignored.
        max_ctx_chars: character budget for the context block.

    Returns:
        The complete prompt string, ending with the '### Yanıt:' header.
    """
    parts, used = [], 0
    for chunk, _score in hits:
        cost = len(chunk) + 2  # chunk plus the "\n\n" separator
        if used + cost > max_ctx_chars:
            if not parts:
                # The top hit alone is over budget: keep as much of it as fits.
                parts.append(chunk[:max(0, max_ctx_chars - 2)])
            break
        parts.append(chunk)
        used += cost
    ctx = "\n\n".join(parts).strip()

    if _is_tr(question):
        instr = ("Aşağıdaki bağlamı kullanarak soruyu kısa ve doğrudan yanıtla; adım adım düşünme. "
                 f"Cevap bağlamda yoksa \"bilmiyorum\" de.\nSoru: {question}")
    else:
        instr = ("Answer the question using only the context below, briefly and directly; do not think step by step. "
                 f"If the answer is not in the context, say \"I don't know\".\nQuestion: {question}")
    return f"### Talimat:\n{instr}\n\n### Girdi:\n{ctx}\n\n### Yanıt:\n"
|
|
|
|
# Matches a closed <think>...</think> block, or an unclosed <think> through end of text.
_THINK = re.compile(r"<think>.*?</think>|<think>.*", re.DOTALL)


def strip_think(text):
    """Remove leaked Magpie R1 reasoning (closed or unclosed ``<think>`` blocks) and trim the result."""
    without_reasoning = _THINK.sub("", text)
    return without_reasoning.strip()
|
|
|
|
def load_docs(path):
    """Read documents from a single file or, recursively, from a directory.

    Args:
        path: a file path (read as-is) or a directory searched recursively for
            ``*.txt`` and ``*.md`` files.

    Returns:
        List of file contents as strings, in sorted path order so the resulting
        chunk order is reproducible across runs and filesystems. Undecodable bytes
        are dropped. Returns an empty list when nothing matches.
    """
    if os.path.isfile(path):
        files = [path]
    else:
        found = set()
        for ext in ("txt", "md"):
            found.update(glob.glob(os.path.join(path, "**", f"*.{ext}"), recursive=True))
        # Skip directories that merely carry a matching suffix; open() would fail on them.
        files = sorted(f for f in found if os.path.isfile(f))
    docs = []
    for name in files:
        with open(name, encoding="utf-8", errors="ignore") as fh:
            docs.append(fh.read())
    return docs
|
|
|
|
| |
def build_index(embedder, chunks):
    """Embed all chunks and return the result of ``embedder.encode``.

    The encoder is asked for normalized embeddings as a NumPy array, in batches
    of 64, with the progress bar disabled.
    """
    encode_opts = {
        "normalize_embeddings": True,
        "convert_to_numpy": True,
        "batch_size": 64,
        "show_progress_bar": False,
    }
    return embedder.encode(chunks, **encode_opts)
|
|
|
|
def retrieve(embedder, index, chunks, query, k):
    """Return the ``k`` chunks whose embeddings score highest against ``query``.

    The score is the dot product of each index row with the query embedding, which
    equals cosine similarity when both are unit-norm (normalized embeddings are
    requested from the encoder here).

    Args:
        embedder: object with an ``encode`` method.
        index: matrix of chunk embeddings, one row per entry of ``chunks``.
        chunks: chunk texts aligned with the rows of ``index``.
        query: question text.
        k: number of hits to return.

    Returns:
        List of ``(chunk, score)`` tuples, highest score first.
    """
    encoded = embedder.encode([query], normalize_embeddings=True, convert_to_numpy=True)
    query_vec = encoded[0]
    scores = index @ query_vec
    ranked = scores.argsort()[::-1]
    hits = []
    for pos in ranked[:k]:
        hits.append((chunks[pos], float(scores[pos])))
    return hits
|
|
|
|
def main():
    """CLI entry point: build the retrieval index, load the generator, answer questions.

    Steps: parse arguments, pick the device, chunk and embed the documents
    (``--demo`` corpus or ``--docs`` path), load tokenizer and checkpoint (CPU path
    via ``lamba_cpu``, CUDA path via ``HybridLM``), then run one of three modes:
    ``--ab`` comparison, a single ``--query``, or an interactive loop.

    Invalid usage is reported through ``argparse`` (exit status 2) rather than
    ``assert``, so the checks also run under ``python -O``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--docs", default=None, help="doküman dosyası/dizini (.txt/.md)")
    ap.add_argument("--demo", action="store_true", help="gömülü TR+EN demo korpusu")
    ap.add_argument("--query", default=None, help="boşsa interaktif")
    ap.add_argument("--ckpt", default="sft/epoch_2/ckpt.pt", help="HF yolu | yerel .pt")
    ap.add_argument("--tokenizer", default=None,
                    help="yerel tokenizer.model yolu (public/offline; boşsa HF'den çeker)")
    ap.add_argument("--embed_model", default="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
    ap.add_argument("--top_k", type=int, default=3)
    ap.add_argument("--chunk_size", type=int, default=600)
    ap.add_argument("--overlap", type=int, default=100)
    ap.add_argument("--max_ctx_chars", type=int, default=2400)
    ap.add_argument("--temperature", type=float, default=0.3)
    ap.add_argument("--max_new", type=int, default=96, help="üretilecek max token (RAG extractive için kısa tut)")
    ap.add_argument("--no_rag", action="store_true", help="RAG'sız (kıyas için: bağlam enjekte etme)")
    ap.add_argument("--ab", action="store_true",
                    help="A/B: her soru için RAG vs ham cevabı yan yana üret (tek koşu, tek model yüklemesi)")
    ap.add_argument("--device", default="auto", choices=["auto", "cuda", "cpu"],
                    help="cpu = saf-PyTorch lamba_cpu (GPU/Triton gerekmez); auto = cuda varsa cuda")
    ap.add_argument("--int8", action="store_true",
                    help="CPU int8 dynamic quant (~2× küçük bellek, ~1.2× hız; sadece --device cpu)")
    args = ap.parse_args()
    if args.top_k < 1:
        # Zero hits would crash the --ab report and give the model an empty context.
        ap.error("--top_k en az 1 olmalı.")

    device = ("cuda" if torch.cuda.is_available() else "cpu") if args.device == "auto" else args.device
    if device == "cuda":
        if not torch.cuda.is_available():
            ap.error("CUDA yok — --device cpu kullan (saf-PyTorch lamba_cpu).")
        torch.set_float32_matmul_precision("high")
    print(f"[device] {device}", flush=True)

    token = os.environ.get("HF_TOKEN")
    try:
        from huggingface_hub import get_token
        token = token or get_token()
    except Exception:
        # Best effort only: without a stored login the HF_TOKEN value (possibly None) is used.
        pass

    # --- retrieval side: documents -> chunks -> embeddings ---
    docs = DEMO_DOCS if args.demo else (load_docs(args.docs) if args.docs else None)
    if not docs:
        ap.error("doküman yok: --demo ya da --docs ver.")
    chunks = [c for d in docs for c in chunk_text(d, args.chunk_size, args.overlap)]
    if not chunks:
        # Documents exist but contain only whitespace; an empty index cannot be searched.
        ap.error("dokümanlar boş: hiç chunk üretilemedi.")
    print(f"[rag] {len(docs)} doküman → {len(chunks)} chunk | embed: {args.embed_model}", flush=True)
    from sentence_transformers import SentenceTransformer
    embedder = SentenceTransformer(args.embed_model, device=device)
    index = build_index(embedder, chunks)

    # --- generator side: tokenizer + checkpoint ---
    sp = load_tok(token, args.tokenizer)
    ckpt_path = resolve_ckpt(args.ckpt, token)
    # NOTE(security): torch.load may unpickle arbitrary objects (depending on the torch
    # version's weights_only default); only load checkpoints from a trusted source.
    st = torch.load(ckpt_path, map_location="cpu")
    if device == "cpu":
        import lamba_cpu as LC
        model, _ = LC.load_lamba(ckpt_path)
        gen = LC.generate
        if args.int8:
            model = LC.quantize_int8(model)
            print("[int8] dynamic quant uygulandı", flush=True)
    else:
        if not _FORK_OK:
            raise SystemExit(
                f"GPU yolu mamba-og fork ister ({_FORK_ERR!r}). Wheel kur ya da --device cpu kullan.")
        if args.int8:
            print("[int8] yok sayıldı: sadece --device cpu", flush=True)
        model = HybridLM(st["cfg"], device="cuda", dtype=torch.bfloat16)
        # strict=False tolerates key differences; surface them instead of hiding them.
        load_result = model.load_state_dict(st["model"], strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            print(f"[model] uyarı: state_dict uyuşmazlığı | eksik={len(load_result.missing_keys)} "
                  f"beklenmeyen={len(load_result.unexpected_keys)}", flush=True)
        model.eval()
        gen = generate
    tag = f"sft epoch={st.get('epoch')}" if st.get("sft") else f"base step={st.get('step','?')}"
    print(f"[model] {tag} | {'MIMO' if st['cfg'].get('is_mimo') else 'SISO'} | {device}\n", flush=True)

    def run(prompt):
        """Generate from ``prompt`` and strip leaked <think> blocks."""
        return strip_think(gen(model, sp, prompt, max_new=args.max_new,
                               temperature=args.temperature, device=device))

    def raw_prompt(q):
        """Instruction-only prompt with no retrieved context."""
        return f"### Talimat:\n{q}\n\n### Yanıt:\n"

    def answer(q, use_rag):
        """Return ``(hits, answer)``; retrieval is skipped entirely when RAG is off."""
        if not use_rag:
            return [], run(raw_prompt(q))
        hits = retrieve(embedder, index, chunks, q, args.top_k)
        return hits, run(build_rag_prompt(q, hits, args.max_ctx_chars))

    def show(q, use_rag):
        """Print the retrieved chunks (when RAG is on) and the answer."""
        hits, ans = answer(q, use_rag)
        if use_rag:
            print("[getirilen]")
            for c, s in hits:
                print(f" ({s:.2f}) {c[:90]}")
        print(f"CEVAP: {ans}\n")

    if args.ab:
        # A/B mode: the same model answers each question with and without context.
        for q in ([args.query] if args.query else DEMO_QUERIES):
            hits = retrieve(embedder, index, chunks, q, args.top_k)
            print("=" * 72)
            print(f"SORU: {q}")
            print(f"[getirilen] ({hits[0][1]:.2f}) {hits[0][0][:90]}")
            a_rag = run(build_rag_prompt(q, hits, args.max_ctx_chars))
            a_raw = run(raw_prompt(q))
            print(f" [A · RAG]: {a_rag}")
            print(f" [B · ham]: {a_raw}")
        return

    if args.query:
        show(args.query, not args.no_rag)
    else:
        print("İnteraktif RAG — soru yaz (boş/çık = quit).")
        while True:
            try:
                q = input("\n> ").strip()
            except (EOFError, KeyboardInterrupt):
                break
            if not q or q.lower() in ("quit", "exit", "çık"):
                break
            show(q, not args.no_rag)
|
|
|
|
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
|