Spaces:
Sleeping
Sleeping
| """ | |
| server.py β randyGPT OpenAI-compatible inference server | |
| Serves POST /v1/chat/completions and GET /v1/models on port 7860. | |
| Loads model weights from HuggingFace Hub at startup. | |
| Compatible with OpenAI SDK, OpenRouter, LangChain, etc. | |
| """ | |
| import json | |
| import math | |
| import time | |
| import uuid | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from pathlib import Path | |
| from fastapi import FastAPI, HTTPException, Request | |
| from fastapi.responses import JSONResponse, StreamingResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.exceptions import RequestValidationError | |
| from pydantic import BaseModel | |
| from typing import List, Optional | |
| from huggingface_hub import hf_hub_download | |
| from safetensors.torch import load_file | |
# ── Inline model ──────────────────────────────────────────────────────────────
class Cfg:
    """Model hyper-parameter container with randyGPT-s defaults."""

    def __init__(self, **kw):
        defaults = {
            "vocab_size": 1500,
            "n_embd": 128,
            "n_head": 4,
            "n_layer": 8,
            "block_size": 256,
        }
        for key, fallback in defaults.items():
            setattr(self, key, kw.get(key, fallback))
        # Derived sizes: per-head width and the 4x MLP expansion.
        self.head_dim = self.n_embd // self.n_head
        self.mlp_dim = 4 * self.n_embd
def rmsnorm(x, eps=1e-5):
    """Root-mean-square normalization over the last dim (no learned scale)."""
    inv_rms = torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps)
    return x * inv_rms
| class Attn(nn.Module): | |
| def __init__(self, c): | |
| super().__init__() | |
| self.nh, self.hd = c.n_head, c.head_dim | |
| self.sc = 1.0 / math.sqrt(c.head_dim) | |
| self.wq = nn.Linear(c.n_embd, c.n_embd, bias=False) | |
| self.wk = nn.Linear(c.n_embd, c.n_embd, bias=False) | |
| self.wv = nn.Linear(c.n_embd, c.n_embd, bias=False) | |
| self.wo = nn.Linear(c.n_embd, c.n_embd, bias=False) | |
| def forward(self, x): | |
| B, T, C = x.shape | |
| q = self.wq(x).view(B, T, self.nh, self.hd).transpose(1, 2) | |
| k = self.wk(x).view(B, T, self.nh, self.hd).transpose(1, 2) | |
| v = self.wv(x).view(B, T, self.nh, self.hd).transpose(1, 2) | |
| s = q @ k.transpose(-2, -1) * self.sc | |
| s = s + torch.full((T, T), float('-inf'), device=x.device).triu(1) | |
| return self.wo((F.softmax(s, dim=-1) @ v).transpose(1, 2).contiguous().view(B, T, C)) | |
| class MLP(nn.Module): | |
| def __init__(self, c): | |
| super().__init__() | |
| self.fc1 = nn.Linear(c.n_embd, c.mlp_dim, bias=False) | |
| self.fc2 = nn.Linear(c.mlp_dim, c.n_embd, bias=False) | |
| def forward(self, x): | |
| return self.fc2(F.relu(self.fc1(x)).pow(2)) | |
class Block(nn.Module):
    """One transformer layer: pre-norm attention then pre-norm MLP, both residual."""

    def __init__(self, c):
        super().__init__()
        self.attn = Attn(c)
        self.mlp = MLP(c)

    def forward(self, x):
        h = x + self.attn(rmsnorm(x))
        h = h + self.mlp(rmsnorm(h))
        return h
class RandyGPT(nn.Module):
    """Minimal GPT: token + learned positional embeddings, N pre-norm blocks,
    and an untied LM head. Attribute names (wte/wpe/layers/lm_head) match the
    checkpoint's state_dict layout.
    """

    def __init__(self, c):
        super().__init__()
        self.c = c
        self.wte = nn.Embedding(c.vocab_size, c.n_embd)
        self.wpe = nn.Embedding(c.block_size, c.n_embd)
        self.layers = nn.ModuleList([Block(c) for _ in range(c.n_layer)])
        self.lm_head = nn.Linear(c.n_embd, c.vocab_size, bias=False)

    def forward(self, ids):
        """ids: (B, T) int64 token ids; returns (B, T, vocab_size) logits."""
        B, T = ids.shape
        pos = torch.arange(T, device=ids.device).unsqueeze(0)
        x = self.wte(ids) + self.wpe(pos)
        for block in self.layers:
            x = block(x)
        return self.lm_head(x)

    def _sample_next(self, ids, temperature, top_p):
        """Nucleus-sample one next-token id tensor from the last position.

        Shared by generate() and generate_stream(); previously this logic was
        duplicated verbatim in both.
        """
        ctx = ids[:, -self.c.block_size:]  # crop to the context window
        logits = self(ctx)[:, -1, :] / max(temperature, 1e-6)
        probs = F.softmax(logits, dim=-1)
        sp, si = torch.sort(probs, descending=True)
        cum = sp.cumsum(-1)
        # Zero tokens whose *preceding* cumulative mass already exceeds top_p,
        # so the first token crossing the threshold is still kept.
        sp[cum - sp > top_p] = 0.0
        sp /= sp.sum()
        return si[0, torch.multinomial(sp[0], 1)]

    @torch.no_grad()  # fix: generation previously tracked gradients, wasting memory
    def generate(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Autoregressively extend ids; stops early on EOS (token id 1).

        Returns the full (1, T + generated) id tensor including the prompt.
        """
        self.eval()
        for _ in range(max_new_tokens):
            nxt = self._sample_next(ids, temperature, top_p)
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            if nxt.item() == 1:  # EOS
                break
        return ids

    @torch.no_grad()  # fix: streaming generation also ran with autograd enabled
    def generate_stream(self, ids, max_new_tokens=200, temperature=0.8, top_p=0.9):
        """Yields (token_id, is_last) one token at a time."""
        self.eval()
        for i in range(max_new_tokens):
            nxt = self._sample_next(ids, temperature, top_p)
            ids = torch.cat([ids, nxt.view(1, 1)], dim=1)
            token_id = nxt.item()
            is_last = (token_id == 1) or (i == max_new_tokens - 1)
            yield token_id, is_last
            if token_id == 1:
                break
# ── Tokenizer ─────────────────────────────────────────────────────────────────
class Tokenizer:
    """Greedy BPE-style tokenizer built from a vocab list and merge pairs."""

    def __init__(self, vocab, merges):
        self.vocab = vocab
        self.t2i = {s: i for i, s in enumerate(vocab)}
        self.bos = self.t2i.get("<|bos|>", 0)
        self.eos = self.t2i.get("<|eos|>", 1)
        # (left_id, right_id) -> merged_id; first occurrence of a pair wins.
        self.mmap = {}
        for l, r in merges:
            li, ri, mi = self.t2i.get(l), self.t2i.get(r), self.t2i.get(l + r)
            if li is not None and ri is not None and mi is not None:
                self.mmap.setdefault((li, ri), mi)

    @classmethod  # BUG FIX: decorator was missing, so Tokenizer.from_json(path)
    # bound the path argument as `cls` and crashed with a TypeError.
    def from_json(cls, path):
        """Build a Tokenizer from a JSON file with "vocab" and "merges" keys."""
        d = json.loads(Path(path).read_text(encoding="utf-8"))
        return cls(d["vocab"], [tuple(m) for m in d["merges"]])

    def _chunk(self, text):
        """BPE-merge a newline-free span: repeatedly apply the lowest-id merge."""
        tokens = [self.t2i[c] for c in text if c in self.t2i]  # drops unknown chars
        if len(tokens) < 2:
            return tokens
        while True:
            # Find the best (lowest merged-id) applicable pair in this pass.
            best = None
            for i in range(len(tokens) - 1):
                m = self.mmap.get((tokens[i], tokens[i+1]))
                if m is not None and (best is None or m < best):
                    best = m
            if best is None:
                break
            # Merge every occurrence of that pair left-to-right.
            out, i = [], 0
            while i < len(tokens):
                if i+1 < len(tokens) and self.mmap.get((tokens[i], tokens[i+1])) == best:
                    out.append(best); i += 2
                else:
                    out.append(tokens[i]); i += 1
            tokens = out
        return tokens

    def encode(self, text):
        """Encode text to ids; newlines are emitted as their own token."""
        nl = self.t2i.get("\n")
        lines, result = text.split("\n"), []
        for i, line in enumerate(lines):
            result.extend(self._chunk(line))
            if i < len(lines) - 1 and nl is not None:
                result.append(nl)
        return result

    def decode(self, ids):
        """Decode ids to text, skipping BOS/EOS and out-of-range ids."""
        return "".join(self.vocab[i] for i in ids
                       if i not in (self.bos, self.eos) and 0 <= i < len(self.vocab))
# ── Load model at startup ─────────────────────────────────────────────────────
# Runtime configuration and reload-synchronization primitives.
import os
import threading
# Hub repo to serve; override with the MODEL_REPO environment variable.
REPO = os.environ.get("MODEL_REPO", "MonumentalSystems/randygpt-s")
MODEL_ID = REPO.split("/")[-1]
_model_lock = threading.Lock()
_reload_lock = threading.Lock()  # only one reload at a time
_is_reloading = False  # debounce flag
def _get_remote_sha() -> str:
    """Return the LFS content SHA-256 of model.safetensors on the Hub.

    NOTE(review): despite the original wording ("commit SHA"), this returns
    infos[0].lfs.sha256 — the file's LFS blob hash, not a git commit SHA.
    Returns "" when the file is missing or not LFS-tracked.
    """
    from huggingface_hub import get_paths_info
    infos = list(get_paths_info(REPO, ["model.safetensors"], repo_type="model"))
    return infos[0].lfs.sha256 if infos and infos[0].lfs else ""
def load_model(force_weights=False):
    """Fetch config, weights, and tokenizer from the Hub and assemble the model.

    Args:
        force_weights: when True, re-download model.safetensors even if cached.
    Returns:
        (Cfg, Tokenizer, RandyGPT) — model on CPU, in eval mode.
    """
    print(f"Loading {REPO} …")
    config_file = hf_hub_download(repo_id=REPO, filename="config.json", force_download=False)
    weights_file = hf_hub_download(repo_id=REPO, filename="model.safetensors", force_download=force_weights)
    tokenizer_file = hf_hub_download(repo_id=REPO, filename="tokenizer.json", force_download=False)
    config = Cfg(**json.loads(Path(config_file).read_text()))
    tokenizer = Tokenizer.from_json(tokenizer_file)
    net = RandyGPT(config)
    net.load_state_dict(load_file(weights_file, device="cpu"))
    net.eval()
    print("Model ready.")
    return config, tokenizer, net
# Load everything once at import time; reload_weights() swaps these globals later.
cfg, tok, model = load_model()
_current_sha = _get_remote_sha()
# ── FastAPI app ───────────────────────────────────────────────────────────────
# FastAPI application with wide-open CORS (this is a public demo endpoint,
# so any browser origin may call it).
app = FastAPI(title="randyGPT", version="0.9.6")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
def _openai_error(status: int, message: str, err_type: str = "invalid_request_error", code: Optional[str] = None):
    """Build an OpenAI-style error response: {"error": {message, type[, code]}}."""
    error = {"message": message, "type": err_type}
    if code:
        error["code"] = code
    return JSONResponse(status_code=status, content={"error": error})
@app.exception_handler(HTTPException)  # BUG FIX: handler was defined but never
# registered with the app, so HTTPExceptions bypassed the OpenAI error format.
async def http_exception_handler(request: Request, exc: HTTPException):
    """Convert HTTPExceptions into OpenAI-style error bodies."""
    return _openai_error(exc.status_code, str(exc.detail))
@app.exception_handler(RequestValidationError)  # BUG FIX: handler was defined
# but never registered with the app, so validation errors bypassed it.
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Flatten pydantic validation errors into one OpenAI-style message."""
    msg = "; ".join(f"{e['loc'][-1]}: {e['msg']}" for e in exc.errors())
    return _openai_error(422, msg, code="invalid_request_error")
@app.get("/v1/models")  # BUG FIX: route decorator was missing — the module
# docstring promises GET /v1/models but the function was never registered.
def list_models():
    """OpenAI-compatible model listing (a single static entry)."""
    return {
        "object": "list",
        "data": [{
            "id": MODEL_ID,
            "object": "model",
            "created": 1700000000,
            "owned_by": "MonumentalSystems",
        }]
    }
class Message(BaseModel):
    """One chat turn in the OpenAI request schema."""
    role: str
    content: str
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible subset)."""
    model: Optional[str] = MODEL_ID  # accepted for SDK compatibility; not read by the handler
    messages: List[Message]
    max_tokens: Optional[int] = 200
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.9
    n: Optional[int] = 1  # number of choices to generate
    stream: Optional[bool] = False  # SSE streaming when true
| def _sse(data: dict) -> str: | |
| return f"data: {json.dumps(data)}\n\n" | |
def _stream_completion(ids, max_tokens, temperature, top_p, completion_id, _model, _tok):
    """Generator yielding SSE chunks, one sampled token per chunk.

    The model/tokenizer are passed in (snapshotted by the caller) so a hot
    reload mid-stream cannot swap them out from under this request.
    """
    prompt_tensor = torch.tensor([ids], dtype=torch.long)
    emitted = 0
    stream = _model.generate_stream(
        prompt_tensor,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    for token_id, is_last in stream:
        emitted += 1
        # finish_reason is only set on the final chunk: "length" when the
        # token budget ran out, "stop" when the model emitted EOS.
        if not is_last:
            finish = None
        elif emitted >= max_tokens:
            finish = "length"
        else:
            finish = "stop"
        yield _sse({
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": int(time.time()),
            "model": MODEL_ID,
            "choices": [{
                "index": 0,
                "delta": {"content": _tok.decode([token_id])},
                "finish_reason": finish,
            }],
        })
    yield "data: [DONE]\n\n"
@app.post("/v1/chat/completions")  # BUG FIX: route decorator was missing — the
# module docstring promises this endpoint but the function was never registered.
def chat_completions(req: ChatRequest):
    """OpenAI-compatible chat completion (streaming and non-streaming).

    Only the last message's content is used as the prompt. Also fixed:
    `x or default` treated explicit 0 / 0.0 as "unset"; now only None falls
    back to the default before clamping.
    """
    # Snapshot globals at request start — concurrent requests and reloads
    # are both safe because each request holds its own references.
    _m, _t, _c = model, tok, cfg
    prompt = req.messages[-1].content.strip() if req.messages else ""
    if not prompt:
        raise HTTPException(status_code=400, detail="No content in messages")
    ids = _t.encode(prompt)
    if not ids:
        raise HTTPException(status_code=400, detail="Prompt tokenized to empty sequence")
    # Clamp sampling parameters to safe ranges.
    max_tokens = max(1, min(req.max_tokens if req.max_tokens is not None else 200, _c.block_size))
    temperature = max(0.01, min(req.temperature if req.temperature is not None else 0.8, 2.0))
    top_p = req.top_p if req.top_p is not None else 0.9
    n = max(1, min(req.n if req.n is not None else 1, 4))
    completion_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
    # ── Streaming ─────────────────────────────────────────────────────────
    if req.stream:
        return StreamingResponse(
            _stream_completion(ids, max_tokens, temperature, top_p, completion_id, _m, _t),
            media_type="text/event-stream",
            headers={"X-Accel-Buffering": "no"},  # disable proxy buffering
        )
    # ── Non-streaming ─────────────────────────────────────────────────────
    choices = []
    total_completion_tokens = 0
    for i in range(n):
        tensor = torch.tensor([ids], dtype=torch.long)
        out = _m.generate(tensor, max_new_tokens=max_tokens,
                          temperature=temperature, top_p=top_p)
        full = _t.decode(out[0].tolist())
        # Strip the echoed prompt when decode round-trips it exactly.
        completion = full[len(prompt):].lstrip() if full.startswith(prompt) else full
        comp_tokens = len(_t.encode(completion))
        total_completion_tokens += comp_tokens
        choices.append({
            "index": i,
            "message": {"role": "assistant", "content": completion},
            "finish_reason": "length" if comp_tokens >= max_tokens else "stop",
        })
    return {
        "id": completion_id,
        "object": "chat.completion",
        "created": int(time.time()),
        "model": MODEL_ID,
        "system_fingerprint": f"{MODEL_ID}-v0.9.6",
        "choices": choices,
        "usage": {
            "prompt_tokens": len(ids),
            "completion_tokens": total_completion_tokens,
            "total_tokens": len(ids) + total_completion_tokens,
        },
    }
@app.post("/reload")  # BUG FIX: route decorator was missing — the endpoint is
# advertised by root() but was never registered with the app.
def reload_weights():
    """Hot-reload model weights from the Hub (debounced).

    Returns immediately if a reload is already in flight; only swaps the
    global cfg/tok/model when the Hub's model.safetensors content hash has
    changed since the last load.
    """
    global cfg, tok, model, _current_sha, _is_reloading
    # Fast-path debounce before taking the lock.
    if _is_reloading:
        return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "already reloading"}
    with _reload_lock:
        # Re-check under the lock; another thread may have won the race.
        if _is_reloading:
            return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "already reloading"}
        _is_reloading = True
    try:
        new_sha = _get_remote_sha()
        if new_sha == _current_sha:
            return {"status": "ok", "model": MODEL_ID, "reloaded": False, "reason": "weights unchanged"}
        print(f"New weights detected ({_current_sha[:8]} -> {new_sha[:8]}), reloading...")
        new_cfg, new_tok, new_model = load_model(force_weights=True)
        # Atomic swap of the serving globals.
        with _model_lock:
            cfg, tok, model = new_cfg, new_tok, new_model
            _current_sha = new_sha
        return {"status": "ok", "model": MODEL_ID, "reloaded": True, "sha": new_sha[:16]}
    finally:
        _is_reloading = False
@app.get("/")  # BUG FIX: route decorator was missing, so "/" returned 404.
def root():
    """Landing route listing the available endpoints."""
    return {"model": MODEL_ID, "endpoints": ["/v1/models", "/v1/chat/completions", "/reload"]}