Spaces:

OpenTransformer
/

AGILLM-M2_Pro

Sleeping

App Files Files Community

OpenTransformer committed on Nov 3, 2025

Commit

a3960f8

verified ·

1 Parent(s): a3d7afc

Upload app.py

Browse files

Files changed (1) hide show

app.py +458 -0

app.py ADDED Viewed

	@@ -0,0 +1,458 @@

+#!/usr/bin/env python3
+# app.py — AGILLM2 chat app for Hugging Face Spaces (single-file).
+# Loads a torch .pt checkpoint from the Hub, rebuilds the tiny AR-only model,
+# applies Qwen chat templating, and serves a Gradio chat UI with streaming.
+from __future__ import annotations
+import os, re, math, time, pathlib
+from typing import List, Dict, Optional, Tuple, Any
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import gradio as gr
+from huggingface_hub import HfApi, hf_hub_download
+from transformers import AutoTokenizer, logging as hf_log
+# =============== Config ===============
+# Each setting is read from the environment, falling back to the literal default.
+# REPO_ID is the Hub model repo that is searched for a checkpoint at start-up.
+REPO_ID = os.environ.get("REPO_ID", "OpenTransformer/AGILLM2-fast-training")
+# TOKENIZER_ID is the Hub id passed to AutoTokenizer.from_pretrained below.
+TOKENIZER_ID = os.environ.get("TOKENIZER_ID", "Qwen/Qwen3-235B-A22B-Thinking-2507")
+# SYSTEM_DEFAULT pre-fills the system-prompt box and is used when that box is empty.
+SYSTEM_DEFAULT = os.environ.get("SYSTEM_PROMPT", "You are a concise, helpful assistant.")
+# =====================================
+hf_log.set_verbosity_error()
+# Device on which the model and the generation tensors are placed.
+DEV = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch.backends.cuda.matmul.allow_tf32 = True
+try:
+    torch.set_float32_matmul_precision("high")
+except Exception:
+    # Best-effort: if this call fails for any reason the current setting is kept.
+    pass
+# -------- Tokenizer (same as training) --------
+# NOTE(review): trust_remote_code=True lets the tokenizer repo run its own Python
+# code at load time — confirm TOKENIZER_ID always points at a trusted repo.
+tok = AutoTokenizer.from_pretrained(TOKENIZER_ID, use_fast=True, trust_remote_code=True)
+if tok.pad_token is None:
+    # Register a "[PAD]" token only when the tokenizer ships without a pad token.
+    tok.add_special_tokens({"pad_token": "[PAD]"})
+# Highest token id + 1; sizes the embedding table (Encoder) and the output head (ARHead).
+VOCAB = max(tok.get_vocab().values()) + 1
+# Id of the pad token.
+BLANK = tok.pad_token_id
+# Id that stops generation: eos_token_id, or sep_token_id when the tokenizer has no EOS.
+EOS = tok.eos_token_id if tok.eos_token_id is not None else tok.sep_token_id
+# -------- Tiny-arch presets (fallback only) --------
+PRESETS = {
+    "small":   dict(d=512, layers=8,  heads=16, rank=64),
+    "smallx2": dict(d=512, layers=16, heads=16, rank=64),
+    "base":    dict(d=768, layers=12, heads=24, rank=96),
+}
+# -------- AMP helper --------
+try:
+    from torch.amp import autocast as _ac  # noqa: F401
+except ImportError:
+    from torch.cuda.amp import autocast as _ac  # noqa: F401
+def _auto_amp_dtype():
+    if DEV.type != "cuda":
+        return torch.float32
+    try:
+        return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+    except Exception:
+        return torch.float16
+def amp(enabled: bool):
+    if not (enabled and DEV.type == "cuda"):
+        from contextlib import nullcontext
+        return nullcontext()
+    return _ac(device_type="cuda", dtype=_auto_amp_dtype())
+# -------- ALiBi --------
+def _alibi_slopes(n_heads: int):
+    import math as _m
+    def pow2slopes(n):
+        start = 2 ** (-2 ** -(_m.log2(n) - 3))
+        ratio = start
+        return [start * (ratio ** i) for i in range(n)]
+    if _m.log2(n_heads).is_integer():
+        vals = pow2slopes(n_heads)
+    else:
+        closest = 2 ** _m.floor(_m.log2(n_heads))
+        vals = pow2slopes(closest)
+        extra = pow2slopes(2 * closest)
+        vals += extra[0::2][: n_heads - closest]
+    return torch.tensor(vals, device=DEV).view(1, n_heads, 1, 1)
+def alibi_bias(n_heads: int, n_tokens: int):
+    i = torch.arange(n_tokens, device=DEV).view(1, 1, n_tokens, 1)
+    j = torch.arange(n_tokens, device=DEV).view(1, 1, 1, n_tokens)
+    dist = (j - i).clamp_min(0)
+    slopes = _alibi_slopes(n_heads)
+    return -slopes * dist
+# -------- Model --------
+class LowRankMHA(nn.Module):
+    def __init__(self, d: int, h: int, r: int, use_relpos: bool = True):
+        super().__init__()
+        assert d % h == 0, "d must be divisible by number of heads"
+        self.h, self.dk = h, d // h
+        self.use_relpos = use_relpos
+        self.q = nn.Linear(d, d, bias=False)
+        self.k = nn.Linear(d, d, bias=False)
+        self.v = nn.Linear(d, d, bias=False)
+        self.U = nn.Parameter(torch.randn(self.dk, r))
+        nn.init.orthogonal_(self.U)
+        self.proj = nn.Linear(h * r, d, bias=False)
+        self.drop = nn.Dropout(0.0)
+    def _proj(self, x):
+        B, N, _ = x.shape
+        return (x.view(B, N, self.h, self.dk).transpose(1, 2) @ self.U)
+    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None,
+                rel_bias_tokens: Optional[int] = None,
+                kv_cache: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+                use_cache: bool = False):
+        q = self._proj(self.q(x))
+        k_new = self._proj(self.k(x))
+        v_new = self._proj(self.v(x))
+        if kv_cache is None:
+            k, v = k_new, v_new
+        else:
+            k, v = kv_cache
+            if use_cache:
+                k = torch.cat([k, k_new], dim=2)
+                v = torch.cat([v, v_new], dim=2)
+        att = (q @ k.transpose(-1, -2)) / math.sqrt(self.dk)
+        if q.size(2) == k.size(2):
+            if self.use_relpos and rel_bias_tokens is not None:
+                att = att + alibi_bias(self.h, rel_bias_tokens)
+            if mask is not None:
+                att = att + mask
+        z = (att.softmax(-1) @ v).transpose(1, 2)
+        z = z.reshape(x.size(0), x.size(1), -1)
+        out = self.drop(self.proj(z))
+        return (out, (k, v)) if use_cache else out
+class Block(nn.Module):
+    def __init__(self, d: int, h: int, r: int):
+        super().__init__()
+        self.ln1, self.ln2 = nn.LayerNorm(d), nn.LayerNorm(d)
+        self.mha = LowRankMHA(d, h, r, use_relpos=True)
+        self.ff = nn.Sequential(nn.Linear(d, 4 * d), nn.ReLU(), nn.Linear(4 * d, d))
+    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor],
+                kv: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+                use_cache: bool = False):
+        n = x.size(1)
+        if use_cache:
+            y, new_kv = self.mha(self.ln1(x), mask,
+                                 rel_bias_tokens=n if mask is not None else None,
+                                 kv_cache=kv, use_cache=True)
+            x = x + y
+            x = x + self.ff(self.ln2(x))
+            return x, new_kv
+        else:
+            x = x + self.mha(self.ln1(x), mask, rel_bias_tokens=n)
+            return x + self.ff(self.ln2(x))
+class Encoder(nn.Module):
+    def __init__(self, cfg: Dict[str, int]):
+        super().__init__()
+        d, l, h, r = cfg["d"], cfg["layers"], cfg["heads"], cfg["rank"]
+        self.emb = nn.Embedding(VOCAB, d)
+        self.blocks = nn.ModuleList([Block(d, h, r) for _ in range(l)])
+        self.ln = nn.LayerNorm(d)
+    def forward(self, ids: torch.Tensor, mask: Optional[torch.Tensor],
+                kv_caches: Optional[List[Optional[Tuple[torch.Tensor, torch.Tensor]]]] = None,
+                use_cache: bool = False):
+        x = self.emb(ids)
+        if not use_cache:
+            for blk in self.blocks:
+                x = blk(x, mask)
+            return self.ln(x)
+        new_kvs: List[Tuple[torch.Tensor, torch.Tensor]] = []
+        for i, blk in enumerate(self.blocks):
+            kv = kv_caches[i] if (kv_caches is not None) else None
+            x, kv_out = blk(x, mask, kv, use_cache=True)
+            new_kvs.append(kv_out)
+        return self.ln(x), new_kvs
+class ARHead(nn.Module):
+    def __init__(self, d):
+        super().__init__()
+        self.proj = nn.Linear(d, VOCAB)
+    def forward(self, h): return self.proj(h)
+def causal_mask(n):
+    m = torch.full((1, 1, n, n), float("-inf"), device=DEV)
+    return torch.triu(m, 1)
+# -------- Checkpoint loading from Hub --------
+def _try_load(path: pathlib.Path):
+    try:
+        return torch.load(path, map_location="cpu")
+    except Exception as e:
+        print(f"[ckpt] failed to load {path}: {e}")
+        return None
+def _prefer_ckpt(repo_id: str) -> pathlib.Path:
+    api = HfApi()
+    files = api.list_repo_files(repo_id=repo_id, repo_type="model")
+    if "final.pt" in files:
+        f = hf_hub_download(repo_id=repo_id, repo_type="model", filename="final.pt")
+        return pathlib.Path(f)
+    step_files = [f for f in files if re.match(r"step\d+\.pt$", f)]
+    if not step_files:
+        raise FileNotFoundError("No final.pt or step*.pt found in the repo.")
+    step_files.sort(key=lambda s: int(re.findall(r"\d+", s)[0]), reverse=True)
+    f = hf_hub_download(repo_id=repo_id, repo_type="model", filename=step_files[0])
+    return pathlib.Path(f)
+def infer_cfg_from_ckpt_blob(sd: Dict[str, Any]) -> Optional[Dict[str, int]]:
+    if isinstance(sd, dict) and "cfg" in sd and isinstance(sd["cfg"], dict):
+        return dict(sd["cfg"])
+    core = sd.get("core") if isinstance(sd, dict) else None
+    if core is None: return None
+    emb_w = core.get("emb.weight")
+    if emb_w is None: return None
+    d = emb_w.shape[1]
+    layer_ids = []
+    for k in core.keys():
+        if k.startswith("blocks."):
+            parts = k.split(".")
+            if len(parts) > 2 and parts[1].isdigit():
+                layer_ids.append(int(parts[1]))
+    layers = (max(layer_ids) + 1) if layer_ids else None
+    U = core.get("blocks.0.mha.U")
+    heads = rank = None
+    if U is not None:
+        dk, r = U.shape
+        rank = r
+        heads = d // dk if dk > 0 else None
+    out = {"d": d}
+    if layers is not None: out["layers"] = layers
+    if heads is not None:  out["heads"] = heads
+    if rank is not None:   out["rank"] = rank
+    return out
+def load_joint_from_hub(repo_id: str, fallback_preset: str = "smallx2"):
+    path = _prefer_ckpt(repo_id)
+    ck = _try_load(path)
+    if ck is None:
+        raise FileNotFoundError("Could not load checkpoint from Hub.")
+    cfg = infer_cfg_from_ckpt_blob(ck) or PRESETS[fallback_preset]
+    core = Encoder(cfg).to(DEV)
+    ar_h = ARHead(cfg["d"]).to(DEV)
+    core.load_state_dict(ck["core"])
+    if "ar" in ck:
+        ar_h.load_state_dict(ck["ar"])
+    core.eval(); ar_h.eval()
+    return core, ar_h, cfg
+# -------- Sampling utils --------
+def _apply_no_repeat_ngram(logits: torch.Tensor, ids: torch.Tensor, n: int):
+    if n <= 0 or ids.size(1) < n - 1:
+        return logits
+    prefix = ids[0, -(n - 1):].tolist()
+    banned = []
+    tokens = ids[0].tolist()
+    for i in range(len(tokens) - n + 1):
+        if tokens[i:i + n - 1] == prefix:
+            banned.append(tokens[i + n - 1])
+    if banned:
+        banned_idx = torch.tensor(banned, device=logits.device, dtype=torch.long)
+        logits[..., banned_idx] = float("-inf")
+    return logits
+def _apply_rep_presence_frequency(
+    logits: torch.Tensor, ids: torch.Tensor, last_n: int,
+    repetition_penalty: float, presence_penalty: float, frequency_penalty: float
+):
+    if ids.numel() == 0:
+        return logits
+    hist = ids[0, -last_n:].to(torch.long) if last_n > 0 else ids[0].to(torch.long)
+    if hist.numel() == 0:
+        return logits
+    uniq, counts = torch.unique(hist, return_counts=True)
+    if presence_penalty != 0.0 or frequency_penalty != 0.0:
+        adjust = presence_penalty + frequency_penalty * counts.to(logits.dtype)
+        logits[..., uniq] = logits[..., uniq] - adjust
+    if repetition_penalty and abs(repetition_penalty - 1.0) > 1e-6:
+        sel = logits[..., uniq]
+        sel = torch.where(sel > 0, sel / repetition_penalty, sel * repetition_penalty)
+        logits[..., uniq] = sel
+    return logits
+def _filter_top_k_top_p_min_p(
+    logits: torch.Tensor, top_k: int, top_p: float, min_p: float, temperature: float
+) -> torch.Tensor:
+    logits = logits / max(temperature, 1e-8)
+    if logits.dim() == 1:
+        logits = logits.unsqueeze(0)
+    probs = logits.softmax(-1)
+    V = probs.size(-1)
+    if top_k and top_k < V:
+        vals, idx = torch.topk(probs, top_k, dim=-1)
+        mask = torch.full_like(probs, 0.0)
+        mask.scatter_(1, idx, 1.0)
+        probs = probs * mask
+    if top_p < 1.0:
+        sorted_probs, sorted_idx = torch.sort(probs, descending=True, dim=-1)
+        cumsum = torch.cumsum(sorted_probs, dim=-1)
+        keep = cumsum <= top_p
+        keep[..., 0] = True
+        mask = torch.zeros_like(probs)
+        mask.scatter_(1, sorted_idx, keep.to(mask.dtype))
+        probs = probs * mask
+    if min_p > 0.0:
+        probs = torch.where(probs >= min_p, probs, torch.zeros_like(probs))
+    sums = probs.sum(-1, keepdim=True)
+    empty = (sums == 0)
+    if empty.any():
+        fallback_idx = logits.argmax(-1, keepdim=True)
+        probs = torch.where(empty, torch.zeros_like(probs), probs)
+        probs.scatter_(-1, fallback_idx, torch.where(empty, torch.ones_like(sums), torch.zeros_like(sums)))
+    probs = probs / probs.sum(-1, keepdim=True)
+    return probs
+# -------- Chat generation (streaming) --------
+@torch.no_grad()
+def generate_stream(core, ar_h,
+                    messages: List[Dict[str, str]],
+                    max_new: int = 192,
+                    temperature: float = 0.8,
+                    top_k: int = 0,
+                    top_p: float = 0.95,
+                    min_p: float = 0.0,
+                    repetition_penalty: float = 1.05,
+                    presence_penalty: float = 0.0,
+                    frequency_penalty: float = 0.0,
+                    penalty_last_n: int = 64,
+                    no_repeat_ngram_size: int = 0,
+                    use_amp: bool = True):
+    # Use tokenizer chat template
+    try:
+        prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    except Exception:
+        # Fallback if someone breaks the template
+        parts = []
+        for m in messages:
+            r = m.get("role","user")
+            parts.append(("User: " if r=="user" else "Assistant: ") + str(m.get("content","")))
+        prompt = "\n".join(parts) + "\nAssistant:"
+    ids = torch.tensor([tok.encode(prompt)], device=DEV)
+    out_tokens: List[int] = []
+    text_last = ""
+    with amp(use_amp):
+        # Prime context
+        h_full, kvs = core(ids, causal_mask(ids.size(1)), use_cache=True)
+        for _ in range(max_new):
+            logits = ar_h(h_full)[:, -1]
+            logits = _apply_no_repeat_ngram(logits, ids, no_repeat_ngram_size)
+            logits = _apply_rep_presence_frequency(
+                logits, ids, penalty_last_n, repetition_penalty, presence_penalty, frequency_penalty
+            )
+            if temperature <= 1e-6 and top_k == 0 and top_p >= 1.0 and min_p <= 0.0:
+                nxt = logits.argmax(-1, keepdim=True)
+            else:
+                probs = _filter_top_k_top_p_min_p(logits.squeeze(0), top_k, top_p, min_p, temperature)
+                nxt = probs.multinomial(1)
+            token_id = int(nxt.item())
+            out_tokens.append(token_id)
+            ids = torch.cat([ids, nxt.unsqueeze(0) if nxt.dim()==1 else nxt], 1)
+            x = ids[:, -1:]
+            h_full, kvs = core(x, None, kv_caches=kvs, use_cache=True)
+            # Stream partial text
+            text_now = tok.decode(out_tokens, skip_special_tokens=True)
+            if text_now != text_last:
+                yield text_now
+                text_last = text_now
+            if token_id == EOS:
+                break
+# ================== Load model once at app start ==================
+# Runs at import time: picks a checkpoint in REPO_ID, downloads it and builds
+# the encoder and output head on DEV. A failure here aborts start-up.
+core, ar_h, cfg = load_joint_from_hub(REPO_ID)
+print(f"[ready] repo={REPO_ID} cfg={cfg} device={DEV}")
+# ================== Gradio UI ==================
+def chat_stream_ui(history, user, system,
+                   temp, top_p, top_k, max_new,
+                   rep_pen, pres_pen, freq_pen, ngram, last_n):
+    if not user or not user.strip():
+        return history, ""
+    # Convert history [(user, bot), ...] to messages
+    messages: List[Dict[str,str]] = [{"role":"system","content":system or SYSTEM_DEFAULT}]
+    for u, a in history:
+        if u: messages.append({"role":"user","content":u})
+        if a: messages.append({"role":"assistant","content":a})
+    messages.append({"role":"user","content":user})
+    # Start streaming
+    reply = ""
+    history = history + [(user, reply)]
+    gen_iter = generate_stream(core, ar_h, messages,
+                               max_new=int(max_new),
+                               temperature=float(temp),
+                               top_k=int(top_k), top_p=float(top_p),
+                               repetition_penalty=float(rep_pen),
+                               presence_penalty=float(pres_pen),
+                               frequency_penalty=float(freq_pen),
+                               penalty_last_n=int(last_n),
+                               no_repeat_ngram_size=int(ngram),
+                               use_amp=True)
+    t0 = time.time()
+    for chunk in gen_iter:
+        reply = chunk
+        history[-1] = (user, reply)
+        yield history, ""
+    dt = time.time() - t0
+    print(f"[gen] {len(tok.encode(reply))} tok in {dt:.2f}s")
+with gr.Blocks(fill_height=True) as demo:
+    gr.Markdown("## AGILLM2 Chat · OpenTransformer")
+    with gr.Row():
+        system = gr.Textbox(value=SYSTEM_DEFAULT, label="System prompt", lines=2)
+    chatbot = gr.Chatbot(height=480, type="tuple")
+    with gr.Row():
+        temp   = gr.Slider(0.0, 1.5, value=0.8, step=0.05, label="Temperature")
+        top_p  = gr.Slider(0.1, 1.0, value=0.95, step=0.01, label="top_p")
+        top_k  = gr.Slider(0, 200, value=0, step=1, label="top_k")
+        max_new = gr.Slider(16, 1024, value=256, step=8, label="Max new tokens")
+    with gr.Row():
+        rep_pen = gr.Slider(1.0, 1.5, value=1.05, step=0.01, label="Repetition penalty")
+        pres_pen = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Presence penalty")
+        freq_pen = gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Frequency penalty")
+        ngram = gr.Slider(0, 6, value=0, step=1, label="No-repeat n-gram")
+        last_n = gr.Slider(16, 1024, value=64, step=16, label="Penalty last-N")
+    with gr.Row():
+        msg = gr.Textbox(placeholder="Type a message and press Enter", lines=2, scale=4)
+    with gr.Row():
+        send = gr.Button("Send", variant="primary")
+        clear = gr.Button("Clear history")
+    send.click(chat_stream_ui,
+               inputs=[chatbot, msg, system, temp, top_p, top_k, max_new, rep_pen, pres_pen, freq_pen, ngram, last_n],
+               outputs=[chatbot, msg],
+               queue=True)
+    msg.submit(chat_stream_ui,
+               inputs=[chatbot, msg, system, temp, top_p, top_k, max_new, rep_pen, pres_pen, freq_pen, ngram, last_n],
+               outputs=[chatbot, msg],
+               queue=True)
+    clear.click(lambda: [], None, chatbot, queue=False)
+demo.queue(max_size=32).launch()