Spaces:

Mochiva-team
/

backend

Runtime error

App Files Files Community

backend / app.py

Bc-AI

Update app.py

1a09475 verified about 2 months ago

raw

history blame contribute delete

19.2 kB

	"""
	hf_space/app.py
	──────────────────────────────────────────────────────────────────────────────
	Mochiva inference server — runs on HuggingFace Spaces (free CPU tier).

	Fixes vs original:
	• No longer reads special_tokens.json or generation_config.json (never created)
	• BOS/EOS/PAD resolved directly from tokenizer vocab
	• Prompt format updated to match the new <s>...</s><user>...</user><mochi>...</mochi> layout
	• Generation stops on </mochi> tag in addition to EOS token
	• Graceful fallback if config keys are missing
	"""

	from __future__ import annotations
	import os
	import json
	import math
	import time
	import threading
	import queue
	import re
	from typing import Iterator, Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	from fastapi import FastAPI
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import StreamingResponse
	from pydantic import BaseModel, Field

	from huggingface_hub import snapshot_download
	from tokenizers import Tokenizer


	# ─── Config ───────────────────────────────────────────────────────────────────

	# Hub repository holding config, tokenizer and weights; override with MODEL_REPO.
	MODEL_REPO = os.getenv("MODEL_REPO", "Mochiva-team/Mochiva-model")
	# Optional Hub access token; None when HF_TOKEN is not set.
	HF_TOKEN = os.getenv("HF_TOKEN")
	# Device string handed to the weight loader and reported by /info.
	DEVICE = "cpu"
	# Upper bound on prompt + generated tokens used when truncating prompts.
	MAX_CTX = int(os.getenv("MAX_CTX", "4096"))


	# ─── Model ────────────────────────────────────────────────────────────────────

	class RMSNorm(nn.Module):
	    """Root-mean-square normalisation over the last dimension with a learned scale.

	    The statistics are computed in float32; the normalised values are cast
	    back to the input dtype before the per-channel ``scale`` is applied.
	    """

	    def __init__(self, dim: int, eps: float = 1e-6):
	        super().__init__()
	        self.eps = eps
	        self.scale = nn.Parameter(torch.ones(dim))

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        x32 = x.float()
	        mean_square = x32.pow(2).mean(-1, keepdim=True)
	        # eps is added under the square root, i.e. sqrt(mean(x^2) + eps).
	        denom = torch.sqrt(mean_square + self.eps)
	        normalised = (x32 / denom).to(x.dtype)
	        return normalised * self.scale


	def precompute_freqs_cis(
	    head_dim: int,
	    max_seq: int,
	    theta: float = 10_000.0,
	    scaling_factor: float = 1.0,
	) -> torch.Tensor:
	    """Build the table of unit-magnitude complex rotary factors.

	    Returns a complex tensor of shape ``(max_seq, head_dim // 2)`` whose entry
	    ``[t, j]`` has magnitude 1 and angle ``t * theta**(-j / (head_dim // 2)) /
	    scaling_factor``.
	    """
	    n_pairs = head_dim // 2
	    exponents = torch.arange(0, n_pairs, dtype=torch.float32) / n_pairs
	    inv_freq = 1.0 / (theta ** exponents)
	    inv_freq = inv_freq / scaling_factor
	    positions = torch.arange(max_seq, dtype=torch.float32)
	    angles = torch.outer(positions, inv_freq)
	    unit = torch.ones_like(angles)
	    return torch.polar(unit, angles)


	def apply_rope(
	    xq: torch.Tensor,
	    xk: torch.Tensor,
	    freqs_cis: torch.Tensor,
	) -> tuple[torch.Tensor, torch.Tensor]:
	    """Apply rotary position factors to query and key head vectors.

	    Each tensor's last dimension is viewed as consecutive (real, imag) pairs,
	    multiplied by ``freqs_cis`` and flattened back to its original shape and
	    dtype. ``freqs_cis`` gets a new leading axis and a new axis at position 2
	    so that it broadcasts over those dimensions of the inputs.
	    """
	    def rotate(x):
	        pairs = x.float().reshape(*x.shape[:-1], x.shape[-1] // 2, 2)
	        as_complex = torch.view_as_complex(pairs)
	        fc = freqs_cis.unsqueeze(0).unsqueeze(2)
	        rotated = torch.view_as_real(as_complex * fc).reshape(*x.shape)
	        return rotated.to(x.dtype)

	    return rotate(xq), rotate(xk)


	class CausalSelfAttention(nn.Module):
	    """Multi-head self-attention with rotary embeddings and an optional KV cache.

	    ``cfg`` must provide ``num_attention_heads``, ``head_dim`` and
	    ``hidden_size``. All projections are bias-free.
	    """

	    def __init__(self, cfg: dict):
	        super().__init__()
	        self.nh = cfg["num_attention_heads"]
	        self.hd = cfg["head_dim"]
	        H = cfg["hidden_size"]
	        self.q_proj = nn.Linear(H, self.nh * self.hd, bias=False)
	        self.k_proj = nn.Linear(H, self.nh * self.hd, bias=False)
	        self.v_proj = nn.Linear(H, self.nh * self.hd, bias=False)
	        self.o_proj = nn.Linear(self.nh * self.hd, H, bias=False)

	    def forward(self, x, freqs_cis, mask, kv_cache=None):
	        """Attend over ``x`` plus any keys/values already stored in ``kv_cache``.

	        Args:
	            x: input of shape ``(B, T, hidden_size)``.
	            freqs_cis: rotary factors for the T positions being processed.
	            mask: boolean tensor, True where attention is allowed, or None.
	                Either its second-to-last dimension already equals T (one row
	                per query), or it is a larger causal mask indexed by absolute
	                position, in which case the rows belonging to the last T key
	                positions are selected.
	            kv_cache: optional dict; when given, its ``"k"``/``"v"`` entries
	                are prepended to the new keys/values and then overwritten
	                with the concatenated tensors.

	        Returns:
	            Tensor of shape ``(B, T, hidden_size)``.
	        """
	        B, T, _ = x.shape
	        nh, hd = self.nh, self.hd

	        q = self.q_proj(x).view(B, T, nh, hd)
	        k = self.k_proj(x).view(B, T, nh, hd)
	        v = self.v_proj(x).view(B, T, nh, hd)
	        q, k = apply_rope(q, k, freqs_cis)

	        if kv_cache is not None:
	            if "k" in kv_cache:
	                k = torch.cat([kv_cache["k"], k], dim=1)
	                v = torch.cat([kv_cache["v"], v], dim=1)
	            kv_cache["k"] = k
	            kv_cache["v"] = v

	        q = q.transpose(1, 2)
	        k = k.transpose(1, 2)
	        v = v.transpose(1, 2)

	        scale = 1.0 / math.sqrt(hd)
	        attn = torch.einsum("bhqd,bhkd->bhqk", q, k) * scale

	        Tq, Tk = attn.shape[-2], attn.shape[-1]
	        if mask is not None:
	            if mask.shape[-2] != Tq:
	                # The queries are the LAST Tq of the Tk key positions. Taking
	                # the first Tq rows (as before) made every cached decoding
	                # step reuse row 0 and see only the very first key.
	                mask = mask[..., Tk - Tq:Tk, :]
	            attn = attn.masked_fill(~mask[..., :Tk], float("-inf"))

	        attn = F.softmax(attn.float(), dim=-1).to(q.dtype)
	        out = torch.einsum("bhqk,bhkd->bhqd", attn, v)
	        out = out.transpose(1, 2).contiguous().view(B, Tq, nh * hd)
	        return self.o_proj(out)


	class SwiGLUMLP(nn.Module):
	    """Gated feed-forward block: ``down_proj(silu(gate_proj(x)) * up_proj(x))``.

	    ``cfg`` supplies ``hidden_size`` and ``intermediate_size``; none of the
	    three linear layers has a bias.
	    """

	    def __init__(self, cfg: dict):
	        super().__init__()
	        hidden = cfg["hidden_size"]
	        inner = cfg["intermediate_size"]
	        self.gate_proj = nn.Linear(hidden, inner, bias=False)
	        self.up_proj = nn.Linear(hidden, inner, bias=False)
	        self.down_proj = nn.Linear(inner, hidden, bias=False)

	    def forward(self, x):
	        gate = F.silu(self.gate_proj(x))
	        gated = gate * self.up_proj(x)
	        return self.down_proj(gated)


	class MochivaBlock(nn.Module):
	    """One pre-norm transformer layer: attention then MLP, each with a residual add."""

	    def __init__(self, cfg: dict):
	        super().__init__()
	        norm_eps = cfg.get("rms_norm_eps", 1e-6)
	        width = cfg["hidden_size"]
	        self.attn_norm = RMSNorm(width, norm_eps)
	        self.mlp_norm = RMSNorm(width, norm_eps)
	        self.attn = CausalSelfAttention(cfg)
	        self.mlp = SwiGLUMLP(cfg)

	    def forward(self, x, freqs_cis, mask, kv_cache=None):
	        # Attention sub-layer: normalise, attend, add back to the residual stream.
	        attn_out = self.attn(self.attn_norm(x), freqs_cis, mask, kv_cache)
	        after_attn = x + attn_out
	        # Feed-forward sub-layer on the updated residual stream.
	        mlp_out = self.mlp(self.mlp_norm(after_attn))
	        return after_attn + mlp_out


	class MochivaForInference(nn.Module):
	    """Decoder-only language model with tied input/output embeddings.

	    ``cfg`` must provide ``vocab_size``, ``hidden_size``, ``num_hidden_layers``,
	    ``head_dim`` and ``max_position_embeddings`` (plus whatever the layer
	    classes read). ``rms_norm_eps``, ``rope_theta`` and ``rope_scaling_factor``
	    are optional.
	    """

	    def __init__(self, cfg: dict):
	        super().__init__()
	        self.cfg = cfg
	        V, H, L = cfg["vocab_size"], cfg["hidden_size"], cfg["num_hidden_layers"]
	        self.embed_tokens = nn.Embedding(V, H)
	        self.layers = nn.ModuleList([MochivaBlock(cfg) for _ in range(L)])
	        self.norm = RMSNorm(H, cfg.get("rms_norm_eps", 1e-6))

	        hd = cfg["head_dim"]
	        ctx = cfg["max_position_embeddings"]
	        theta = cfg.get("rope_theta", 10_000.0)
	        scale = cfg.get("rope_scaling_factor", 1.0)
	        self.register_buffer("freqs_cis", precompute_freqs_cis(hd, ctx, theta, scale))

	    def forward(self, input_ids, kv_caches=None):
	        """Return logits of shape ``(B, T, vocab_size)`` for ``input_ids`` of shape ``(B, T)``.

	        ``kv_caches`` is an optional list with one dict per layer. When the
	        first dict already holds a ``"k"`` entry, its length along dim 1 is
	        taken as the number of positions processed so far.

	        Raises:
	            ValueError: if the cached length plus T exceeds the rotary table.
	        """
	        _, T = input_ids.shape
	        offset = 0
	        if kv_caches and "k" in kv_caches[0]:
	            offset = kv_caches[0]["k"].shape[1]

	        full_len = offset + T
	        max_positions = self.freqs_cis.shape[0]
	        if full_len > max_positions:
	            # Slicing freqs_cis past its end would silently return fewer than
	            # T rows and fail later with an opaque broadcasting error.
	            raise ValueError(
	                f"sequence length {full_len} exceeds max_position_embeddings "
	                f"({max_positions})"
	            )

	        x = self.embed_tokens(input_ids)

	        # Build only the mask rows for the T new query positions: row i stands
	        # for absolute position offset + i and may see keys 0..offset + i.
	        # (A square full_len x full_len mask sliced from row 0 would hand a
	        # cached decoding step the row of position 0 instead of its own.)
	        query_pos = torch.arange(offset, full_len, device=x.device).unsqueeze(1)
	        key_pos = torch.arange(full_len, device=x.device).unsqueeze(0)
	        mask = (key_pos <= query_pos).unsqueeze(0).unsqueeze(0)
	        freqs = self.freqs_cis[offset: offset + T]

	        for i, layer in enumerate(self.layers):
	            kvc = kv_caches[i] if kv_caches else None
	            x = layer(x, freqs, mask, kvc)

	        x = self.norm(x)
	        # Output projection shares its weights with the token embedding.
	        return x @ self.embed_tokens.weight.T

	    @torch.inference_mode()
	    def generate_stream(self, input_ids, max_new_tokens=256, temperature=0.8,
	                        top_p=0.9, top_k=50, repetition_penalty=1.1,
	                        eos_token_id=2, stop_token_ids=None):
	        """Yield sampled token ids one at a time.

	        The prompt is run once to fill a fresh KV cache; afterwards only the
	        newest token is fed per step. Generation ends when ``max_new_tokens``
	        ids have been yielded, when a token in ``stop_token_ids`` or
	        ``eos_token_id`` is sampled (that token is not yielded), or when the
	        rotary table has no further positions. ``int(next_token)`` requires a
	        single sampled value, so only batch size 1 is supported.
	        """
	        stop_ids = set(stop_token_ids or [])
	        stop_ids.add(eos_token_id)
	        kv_caches = [{} for _ in self.layers]
	        max_positions = self.freqs_cis.shape[0]

	        # Prompt plus everything generated so far; used for the repetition
	        # penalty. Kept as a plain list so it is not re-tensorised each step.
	        history = input_ids.reshape(-1).tolist()
	        step_input = input_ids

	        for _ in range(max_new_tokens):
	            logits = self(step_input, kv_caches)
	            next_token = _sample(logits[:, -1, :], temperature, top_p, top_k,
	                                 history, repetition_penalty)
	            tok_id = int(next_token)
	            if tok_id in stop_ids:
	                # Applies to the very first sampled token as well: previously
	                # a stop token in first place was skipped and generation
	                # carried on regardless.
	                return
	            history.append(tok_id)
	            yield tok_id
	            if len(history) > max_positions:
	                # The next step would need a position past the rotary table.
	                return
	            step_input = next_token.unsqueeze(0)

	# ─── Sampling ─────────────────────────────────────────────────────────────────

	def _sample(logits, temperature, top_p, top_k, context_ids, repetition_penalty):
	logits = logits.float().squeeze(0)

	if repetition_penalty != 1.0:
	# Flatten safely regardless of whether context_ids is a tensor or nested list
	if isinstance(context_ids, torch.Tensor):
	flat_ids = context_ids.reshape(-1).tolist()
	else:
	flat_ids = context_ids if isinstance(context_ids[0], int) else [t for row in context_ids for t in row]
	for tok in set(flat_ids):
	logits[tok] = logits[tok] / repetition_penalty if logits[tok] > 0 \
	else logits[tok] * repetition_penalty

	if temperature < 1e-4:
	return logits.argmax(keepdim=True)

	logits /= temperature

	if top_k > 0:
	v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
	logits[logits < v[-1]] = float("-inf")

	if top_p < 1.0:
	sorted_logits, sorted_idx = torch.sort(logits, descending=True)
	cum_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
	sorted_remove = cum_probs - F.softmax(sorted_logits, dim=-1) > top_p
	sorted_logits[sorted_remove] = float("-inf")
	logits = torch.zeros_like(logits).scatter_(0, sorted_idx, sorted_logits)

	return torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)

	# ─── Weight loading ───────────────────────────────────────────────────────────

	def _remap_key(key: str) -> str:
	key = key.replace("/", ".")
	key = key.replace("embed_tokens.embedding", "embed_tokens.weight")
	key = re.sub(r"layer_(\d+)\.", r"layers.\1.", key)
	key = key.replace(".kernel", ".weight")
	return key


	def _read_npz_weights(weights_path: str) -> dict:
	    """Read every array of a NumPy ``.npz`` archive into a name -> tensor dict."""
	    import numpy as np
	    # NpzFile keeps the archive open and loads lazily; the context manager
	    # closes it once all arrays have been materialised.
	    with np.load(weights_path) as npz:
	        return {name: torch.from_numpy(npz[name]) for name in npz.files}


	def load_weights(model: MochivaForInference, weights_path: str):
	    """Load a flat weight file into ``model`` and print a short report.

	    Paths ending in ``.npz`` are read with NumPy. Anything else is tried as
	    safetensors first; if that fails for any reason the failure is printed and
	    the file is retried as ``.npz`` (the original best-effort fallback, no
	    longer silent).

	    Each stored name goes through ``_remap_key``. Names that do not exist in
	    the model's state dict are reported and skipped. Every 2-D tensor whose
	    mapped name contains ``weight`` is transposed, except
	    ``embed_tokens.weight``. Loading is non-strict, so missing or unexpected
	    keys are printed rather than raised.
	    """
	    if weights_path.endswith(".npz"):
	        flat = _read_npz_weights(weights_path)
	    else:
	        try:
	            from safetensors.torch import load_file
	            flat = load_file(weights_path, device=DEVICE)
	        except Exception as err:
	            print(f"[model] safetensors load failed ({err!r}); retrying as .npz")
	            flat = _read_npz_weights(weights_path)

	    state_dict = model.state_dict()
	    mapped = {}

	    for raw_key, tensor in flat.items():
	        pt_key = _remap_key(raw_key)
	        if pt_key not in state_dict:
	            print(f"[model] No match for: {raw_key} → {pt_key}")
	            continue
	        if ("weight" in pt_key
	                and pt_key != "embed_tokens.weight"
	                and len(tensor.shape) == 2):
	            # NOTE(review): presumably the stored kernels are (in, out) while
	            # nn.Linear keeps (out, in) — confirm against the exporter.
	            tensor = tensor.T
	        mapped[pt_key] = tensor.to(state_dict[pt_key].dtype)

	    missing, unexpected = model.load_state_dict(mapped, strict=False)
	    if missing:
	        print(f"[model] Missing: {missing[:8]}")
	    if unexpected:
	        print(f"[model] Unexpected: {unexpected[:8]}")
	    print(f"[model] Loaded {len(mapped)} / {len(state_dict)} tensors")


	# ─── Token ID helpers ─────────────────────────────────────────────────────────

	def _tok_id(tokenizer: Tokenizer, token: str, fallback: int) -> int:
	"""Look up a special token id; return fallback if absent."""
	tid = tokenizer.token_to_id(token)
	return tid if tid is not None else fallback


	# ─── Startup ──────────────────────────────────────────────────────────────────

	print(f"[startup] Downloading {MODEL_REPO} …")
	t0 = time.time()

	# ignore_patterns entries are matched as shell-style globs against repo file
	# paths, so they need wildcards: the bare strings ".msgpack" and "flax_model"
	# only match files with exactly those names and skipped nothing.
	model_dir = snapshot_download(
	    MODEL_REPO,
	    token=HF_TOKEN,
	    ignore_patterns=["*.msgpack", "flax_model*"],
	)

	# JSON is read as UTF-8 explicitly rather than with the platform default.
	with open(f"{model_dir}/config.json", encoding="utf-8") as f:
	    hf_cfg = json.load(f)

	tokenizer = Tokenizer.from_file(f"{model_dir}/tokenizer.json")

	# ── Resolve special token IDs directly from tokenizer vocab ──────────────────
	# No special_tokens.json needed — everything is in the tokenizer itself.
	# NOTE(review): the prompt template uses <s>/</s> while these lookups use
	# <bos>/<eos>/<pad>; if the vocab lacks them the numeric fallbacks are used —
	# confirm the fallbacks match the trained tokenizer.
	BOS_ID = _tok_id(tokenizer, "<bos>", 1)
	EOS_ID = _tok_id(tokenizer, "<eos>", 2)
	PAD_ID = _tok_id(tokenizer, "<pad>", 0)

	# We also want to stop generation when the model closes the <mochi> tag.
	# -1 marks "tag is not a single vocab entry", in which case only EOS stops.
	MOCHI_CLOSE_ID = _tok_id(tokenizer, "</mochi>", -1)
	STOP_IDS = [EOS_ID]
	if MOCHI_CLOSE_ID != -1:
	    STOP_IDS.append(MOCHI_CLOSE_ID)

	print(f"[startup] Special tokens — bos={BOS_ID} eos={EOS_ID} pad={PAD_ID} "
	      f"</mochi>={MOCHI_CLOSE_ID}")

	# ── Default generation params (hardcoded since generation_config.json doesn't exist) ──
	DEFAULT_GEN = {
	    "max_new_tokens": 256,
	    "temperature": 0.8,
	    "top_p": 0.9,
	    "top_k": 50,
	    "repetition_penalty": 1.1,
	}

	model = MochivaForInference(hf_cfg)
	model.eval()

	# Prefer safetensors; fall back to the NumPy archive when it is absent.
	weights_file = f"{model_dir}/model.safetensors"
	if not os.path.exists(weights_file):
	    weights_file = f"{model_dir}/model_weights.npz"
	if not os.path.exists(weights_file):
	    # Fail here with a clear message instead of deep inside the loader.
	    raise FileNotFoundError(
	        f"No model.safetensors or model_weights.npz found in {model_dir}"
	    )

	load_weights(model, weights_file)
	print(f"[startup] Ready in {time.time()-t0:.1f}s "
	      f"({sum(p.numel() for p in model.parameters())/1e6:.1f}M params)")


	# ─── FastAPI ──────────────────────────────────────────────────────────────────

	# ASGI application object; the route decorators below attach to it.
	app = FastAPI(version="2.0.0", title="Mochiva Inference")

	# CORS is left fully open: any origin, method and header is accepted.
	app.add_middleware(
	    CORSMiddleware,
	    allow_headers=["*"],
	    allow_methods=["*"],
	    allow_origins=["*"],
	)


	class GenerateRequest(BaseModel):
	    # Request body shared by /generate and /generate_full.
	    # (Described with comments rather than a docstring so the generated
	    # schema stays exactly as it was.)

	    # Text the user typed.
	    prompt: str
	    # Which mochi persona answers; names not in PERSONA_STYLES get a generic style.
	    persona: str = "berry"

	    # Character state, each limited to the range 0..1.
	    hunger: float = Field(0.5, ge=0.0, le=1.0)
	    happiness: float = Field(0.7, ge=0.0, le=1.0)
	    bond: float = Field(0.5, ge=0.0, le=1.0)

	    # Free-form context strings copied into the system prompt.
	    time_of_day: str = "afternoon"
	    user_tone: str = "friendly"

	    # Sampling controls passed through to the model's generate_stream.
	    max_new_tokens: int = Field(256, ge=1, le=1024)
	    temperature: float = Field(0.8, ge=0.01, le=2.0)
	    top_p: float = Field(0.9, ge=0.0, le=1.0)
	    top_k: int = Field(50, ge=0, le=500)
	    repetition_penalty: float = Field(1.1, ge=1.0, le=3.0)


	# Lower-case persona name -> style description inserted into the system prompt.
	PERSONA_STYLES = dict(
	    vanilla="calm, soft, warm, gently poetic",
	    apple="chaotic, fast, high energy, scattered but lovable",
	    cocoa="sleepy, slow, cozy, dreamy",
	    berry="emotional, expressive, dramatic, deeply caring",
	    lemon="sarcastic, sharp, secretly soft, plays it cool",
	)


	def build_prompt(req: GenerateRequest) -> str:
	    """
	    Assemble the model input for one request:
	    <s>system context</s><user>user message</user><mochi>
	    The system context names the persona and its style, then lists the
	    hunger/happiness/bond values (two decimals), the time of day and the
	    user tone. The string ends with an open <mochi> tag so the model
	    continues with the character's reply.
	    """
	    persona_key = req.persona.lower()
	    style = PERSONA_STYLES.get(persona_key, "friendly and playful")

	    identity = f"You are {req.persona.title()}, a {style} Mochi character. "
	    mood = f"hunger: {req.hunger:.2f}, happiness: {req.happiness:.2f}, bond: {req.bond:.2f}. "
	    setting = f"Time: {req.time_of_day}. User tone: {req.user_tone}."
	    system = identity + mood + setting

	    return f"<s>{system}</s><user>{req.prompt}</user><mochi>"


	# ─── SSE helpers ──────────────────────────────────────────────────────────────

	def _sse(token: str = "", done: bool = False) -> str:
	return f"data: {json.dumps({'token': token, 'done': done})}\n\n"


	def _generate_sse(req: GenerateRequest) -> Iterator[str]:
	    """Stream the model's reply to ``req`` as server-sent event strings.

	    The prompt is tokenised with BOS_ID in front and, if necessary, cut from
	    the left so that prompt plus requested new tokens fit into the smaller of
	    MAX_CTX and the model's ``max_position_embeddings``. Generation runs in a
	    daemon thread that feeds token ids through a queue; ``None`` marks the
	    end. Token ids are buffered until they decode to text that does not end in
	    U+FFFD, ``<mochi>``/``</mochi>`` tags are removed, and each non-empty
	    piece is yielded as one event. A final event with ``done=True`` closes the
	    stream.

	    When the consumer stops iterating early (generator closed or collected),
	    the worker is told to stop at its next token instead of generating the
	    rest of the reply for nobody.
	    """
	    prompt = build_prompt(req)
	    ids = [BOS_ID] + tokenizer.encode(prompt, add_special_tokens=False).ids

	    # The rotary table has max_position_embeddings rows, so MAX_CTX alone may
	    # promise more positions than the model can address.
	    ctx_limit = min(MAX_CTX, int(hf_cfg["max_position_embeddings"]))
	    max_new = max(1, min(req.max_new_tokens, ctx_limit - 1))
	    # Keep at least one prompt token: a budget of 0 would make ids[-0:] return
	    # the whole list, i.e. no truncation at all.
	    budget = max(1, ctx_limit - max_new)
	    ids = ids[-budget:]

	    input_ids = torch.tensor([ids], dtype=torch.long)
	    tok_queue: queue.Queue[Optional[int]] = queue.Queue()
	    cancelled = threading.Event()

	    def _worker():
	        try:
	            for tok_id in model.generate_stream(
	                input_ids,
	                max_new_tokens=max_new,
	                temperature=req.temperature,
	                top_p=req.top_p,
	                top_k=req.top_k,
	                repetition_penalty=req.repetition_penalty,
	                eos_token_id=EOS_ID,
	                stop_token_ids=STOP_IDS,
	            ):
	                if cancelled.is_set():
	                    break
	                tok_queue.put(tok_id)
	        finally:
	            # Always unblock the consumer, even if generation raised.
	            tok_queue.put(None)

	    def _strip_tags(text: str) -> str:
	        # Strip any mochi tag that leaked through into the decoded text.
	        return text.replace("</mochi>", "").replace("<mochi>", "")

	    threading.Thread(target=_worker, daemon=True).start()

	    buf = []
	    try:
	        while True:
	            tok_id = tok_queue.get()
	            if tok_id is None:
	                break
	            buf.append(tok_id)
	            text = tokenizer.decode(buf)
	            # Hold back until we have a complete UTF-8 character
	            if text.endswith("\ufffd"):
	                continue
	            text = _strip_tags(text)
	            if text:
	                yield _sse(token=text)
	            buf = []

	        # Flush ids that never completed a character before the stream ended.
	        if buf:
	            text = _strip_tags(tokenizer.decode(buf))
	            if text:
	                yield _sse(token=text)
	        yield _sse(done=True)
	    finally:
	        cancelled.set()


	# ─── Endpoints ────────────────────────────────────────────────────────────────

	@app.post("/generate")
	def generate_stream(req: GenerateRequest):
	    # Streams the reply as server-sent events. The two headers ask caches and
	    # buffering proxies to pass events through as they are produced.
	    sse_headers = {"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}
	    events = _generate_sse(req)
	    return StreamingResponse(
	        events,
	        headers=sse_headers,
	        media_type="text/event-stream",
	    )


	@app.post("/generate_full")
	def generate_full(req: GenerateRequest):
	    # Non-streaming variant: drains the same event stream as /generate and
	    # joins the token texts of every event that is not the final "done" one.
	    prefix = "data: "
	    events = (
	        json.loads(chunk[len(prefix):])
	        for chunk in _generate_sse(req)
	        if chunk.startswith(prefix)
	    )
	    pieces = [event["token"] for event in events if not event["done"]]
	    return {"text": "".join(pieces), "persona": req.persona, "model": MODEL_REPO}


	@app.get("/health")
	def health():
	    # Liveness probe: a fixed status plus the configured model repository.
	    return dict(status="ok", model=MODEL_REPO)


	@app.get("/info")
	def info():
	    # Static facts about the loaded model, taken from its config and the
	    # values resolved at startup.
	    special_toks = {"bos": BOS_ID, "eos": EOS_ID, "pad": PAD_ID}
	    details = {"model": MODEL_REPO}
	    details["vocab_size"] = hf_cfg["vocab_size"]
	    details["layers"] = hf_cfg["num_hidden_layers"]
	    details["hidden"] = hf_cfg["hidden_size"]
	    details["context"] = hf_cfg["max_position_embeddings"]
	    details["personas"] = list(PERSONA_STYLES)
	    details["special_toks"] = special_toks
	    details["device"] = DEVICE
	    return details


	if __name__ == "__main__":
	    # Direct execution: serve the app on all interfaces, port 7860.
	    from uvicorn import run as _serve

	    _serve(app, port=7860, host="0.0.0.0")