"""
server.py — OpenAI-compatible inference server for JuliaSLM-compressed-svd

Serves the SVD-90 compressed JuliaSLM model (4.81M params, ~4.5% smaller).
Downloads checkpoint and tokenizer from HuggingFace on first run.

SVD compression: each linear layer W ≈ A @ B (low-rank factorization),
reducing parameter count while preserving model quality.
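For an out x in weight factored at rank r, the two factors store r*(out + in)
values instead of out*in; e.g. a 512x512 matrix at rank 90 keeps 92,160 values
rather than 262,144 (illustrative shapes, not necessarily this model's).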

Endpoints:
    GET  /                       -> health check / API info
    GET  /v1/models              -> list available models
    POST /v1/chat/completions    -> generate text (OpenAI format, streaming supported)
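
Example request (illustrative sketch using only the standard library; assumes the
server is running locally on the default PORT of 7860):

    import json, urllib.request

    payload = {
        "model": "juliaslm-compressed-svd-90",
        "messages": [{"role": "user", "content": "Tell me a short story."}],
        "max_tokens": 100,
        "temperature": 0.8,
    }
    req = urllib.request.Request(
        "http://localhost:7860/v1/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        print(json.loads(resp.read())["choices"][0]["message"]["content"])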
"""

import json
import os
import regex
import time
import uuid
from http.server import HTTPServer, BaseHTTPRequestHandler
from threading import Lock

import torch
import torch.nn.functional as F
from huggingface_hub import hf_hub_download

from juliaslm_svd_model import SVDConfig, JuliaSLM_SVD

# ═══════════════════════════════════════════════════════════════════
# Configuration
# ═══════════════════════════════════════════════════════════════════

HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "LisaMegaWatts/JuliaSLM-compressed-svd")
HF_TOKENIZER_REPO = os.environ.get("HF_TOKENIZER_REPO", "LisaMegaWatts/JuliaSLM")
CHECKPOINT_NAME = os.environ.get("CHECKPOINT_NAME", "svd_SVD-90_best.pt")
PORT = int(os.environ.get("PORT", "7860"))
CKPT_DIR = "checkpoints"
MODEL_ID = "juliaslm-compressed-svd-90"

# ═══════════════════════════════════════════════════════════════════
# BPE Tokenizer (vocab.json + merges.txt)
# ═══════════════════════════════════════════════════════════════════

GPT2_PATTERN = regex.compile(
    r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
    regex.UNICODE,
)


def _build_byte_to_unicode():
    bs = list(range(0x21, 0x7F)) + list(range(0xA1, 0xAD)) + list(range(0xAE, 0x100))
    cs = list(bs)
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return {b: chr(c) for b, c in zip(bs, cs)}


BYTE_TO_UNICODE = _build_byte_to_unicode()
UNICODE_TO_BYTE = {v: k for k, v in BYTE_TO_UNICODE.items()}
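# Standard GPT-2 byte<->unicode mapping: every byte gets a printable unicode
# character so BPE merges operate on visible strings with no raw whitespace or
# control bytes. E.g. "h" (0x68) maps to itself, while a space (0x20) maps to
# "Ġ" (U+0120), so " hello" is seen by the merge table as "Ġhello".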


class BPETokenizer:
    def __init__(self, vocab_path: str, merges_path: str):
        with open(vocab_path, "r", encoding="utf-8") as f:
            self.vocab = json.load(f)
        self.id_to_token = {v: k for k, v in self.vocab.items()}

        self.merges = []
        self.merge_rank = {}
        with open(merges_path, "r", encoding="utf-8") as f:
            for line_no, line in enumerate(f):
                line = line.strip()
                # Skip blanks and the optional "#version: ..." header on the first
                # line; later lines starting with "#" are real merges (e.g. "# #").
                if not line or (line_no == 0 and line.startswith("#")):
                    continue
                parts = line.split()
                if len(parts) == 2:
                    pair = (parts[0], parts[1])
                    self.merges.append(pair)
                    self.merge_rank[pair] = len(self.merge_rank)

        self.cache = {}

    def _bpe_word(self, chars: list[str]) -> list[str]:
        tokens = list(chars)
        while len(tokens) >= 2:
            best_rank = float("inf")
            best_pair = None
            for i in range(len(tokens) - 1):
                pair = (tokens[i], tokens[i + 1])
                rank = self.merge_rank.get(pair, float("inf"))
                if rank < best_rank:
                    best_rank = rank
                    best_pair = pair
            if best_pair is None or best_rank == float("inf"):
                break
            a, b = best_pair
            new_tokens = []
            i = 0
            while i < len(tokens):
                if i < len(tokens) - 1 and tokens[i] == a and tokens[i + 1] == b:
                    new_tokens.append(a + b)
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens
        return tokens

    def encode(self, text: str) -> list[int]:
        ids = []
        for m in GPT2_PATTERN.finditer(text):
            word = m.group()
            if word in self.cache:
                ids.extend(self.cache[word])
                continue
            chars = [BYTE_TO_UNICODE[b] for b in word.encode("utf-8")]
            tokens = self._bpe_word(chars)
            word_ids = [self.vocab[t] for t in tokens if t in self.vocab]
            self.cache[word] = word_ids
            ids.extend(word_ids)
        return ids

    def decode(self, ids: list[int]) -> str:
        text = "".join(self.id_to_token.get(i, "") for i in ids)
        byte_vals = [UNICODE_TO_BYTE[c] for c in text if c in UNICODE_TO_BYTE]
        return bytes(byte_vals).decode("utf-8", errors="replace")


# ═══════════════════════════════════════════════════════════════════
# Sampling helpers
# ═══════════════════════════════════════════════════════════════════


def _sample_logits(logits: torch.Tensor, temperature: float, top_k: int,
                   top_p: float, vocab_size: int) -> int:
    if temperature <= 0:
        return logits.argmax().item()

    logits = logits / temperature

    if 0 < top_k < vocab_size:
        topk_vals, _ = torch.topk(logits, top_k)
        logits[logits < topk_vals[-1]] = float("-inf")

    if top_p < 1.0:
        # Nucleus (top-p) filtering: keep the smallest set of highest-probability
        # tokens whose cumulative probability reaches top_p; the token that crosses
        # the threshold is kept, everything ranked after it is masked out.
        sorted_logits, sorted_idx = torch.sort(logits, descending=True)
        sorted_probs = F.softmax(sorted_logits, dim=-1)
        cum_probs = torch.cumsum(sorted_probs, dim=-1)
        remove = cum_probs - sorted_probs >= top_p  # mass before each token
        sorted_logits[remove] = float("-inf")
        # Undo the sort: scatter the masked logits back to their original positions.
        logits = sorted_logits.scatter(0, sorted_idx, sorted_logits)

    probs = F.softmax(logits, dim=-1)
    return torch.multinomial(probs, 1).item()


# ═══════════════════════════════════════════════════════════════════
# Text generation with KV cache
# ═══════════════════════════════════════════════════════════════════


@torch.inference_mode()
def generate(
    model: JuliaSLM_SVD,
    tokenizer: BPETokenizer,
    prompt: str,
    max_tokens: int = 200,
    temperature: float = 0.8,
    top_k: int = 40,
    top_p: float = 1.0,
) -> tuple[str, int]:
    config = model.config
    input_ids = tokenizer.encode(prompt)
    prompt_len = len(input_ids)
    ids = input_ids[-config.context_length:]

    x = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    logits, kv_caches = model(x)
    next_logits = logits[0, -1, :].float()

    generated_ids = []
    seq_len = len(ids)

    for _ in range(max_tokens):
        if seq_len >= config.context_length:
            break

        idx = _sample_logits(next_logits, temperature, top_k, top_p, config.vocab_size)
        generated_ids.append(idx)
        seq_len += 1

        x = torch.tensor([[idx]], dtype=torch.long, device=DEVICE)
        logits, kv_caches = model(x, kv_caches)
        next_logits = logits[0, -1, :].float()

    return tokenizer.decode(generated_ids), prompt_len


@torch.inference_mode()
def generate_streaming(
    model: JuliaSLM_SVD,
    tokenizer: BPETokenizer,
    prompt: str,
    max_tokens: int = 200,
    temperature: float = 0.8,
    top_k: int = 40,
    top_p: float = 1.0,
):
    config = model.config
    input_ids = tokenizer.encode(prompt)
    prompt_len = len(input_ids)
    ids = input_ids[-config.context_length:]

    x = torch.tensor([ids], dtype=torch.long, device=DEVICE)
    logits, kv_caches = model(x)
    next_logits = logits[0, -1, :].float()

    seq_len = len(ids)

    for _ in range(max_tokens):
        if seq_len >= config.context_length:
            break

        idx = _sample_logits(next_logits, temperature, top_k, top_p, config.vocab_size)
        seq_len += 1

        yield tokenizer.decode([idx]), prompt_len

        x = torch.tensor([[idx]], dtype=torch.long, device=DEVICE)
        logits, kv_caches = model(x, kv_caches)
        next_logits = logits[0, -1, :].float()

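# Minimal usage sketch (illustrative; MODEL, TOKENIZER and DEVICE are the
# module-level globals initialised further below):
#
#     text, n_prompt_tokens = generate(MODEL, TOKENIZER, "Once upon a time",
#                                      max_tokens=50, temperature=0.8)
#
# Both functions run the full prompt through the model once, then decode one
# token at a time, passing the returned kv_caches back in so each step only
# processes the new token.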

# ═══════════════════════════════════════════════════════════════════
# Download artifacts from HuggingFace
# ═══════════════════════════════════════════════════════════════════


def ensure_artifacts():
    os.makedirs(CKPT_DIR, exist_ok=True)
    files = {}

    # Model checkpoint from SVD-compressed repo
    ckpt_local = os.path.join(CKPT_DIR, CHECKPOINT_NAME)
    if not os.path.isfile(ckpt_local):
        print(f"Downloading {CHECKPOINT_NAME} from {HF_MODEL_REPO} ...")
        hf_hub_download(repo_id=HF_MODEL_REPO, filename=CHECKPOINT_NAME, local_dir=CKPT_DIR)
        sz_mb = os.path.getsize(ckpt_local) / (1024 * 1024)
        print(f"  -> {ckpt_local} ({sz_mb:.1f} MB)")
    files["checkpoint"] = ckpt_local

    # Tokenizer from original JuliaSLM repo
    for fname in ("vocab.json", "merges.txt"):
        local = os.path.join(CKPT_DIR, fname)
        if not os.path.isfile(local):
            print(f"Downloading {fname} from {HF_TOKENIZER_REPO} ...")
            hf_hub_download(repo_id=HF_TOKENIZER_REPO, filename=fname, local_dir=CKPT_DIR)
            sz_mb = os.path.getsize(local) / (1024 * 1024)
            print(f"  -> {local} ({sz_mb:.1f} MB)")
        files[fname] = local

    return files


# ═══════════════════════════════════════════════════════════════════
# Load model
# ═══════════════════════════════════════════════════════════════════

print("Downloading artifacts...")
ARTIFACT_PATHS = ensure_artifacts()

print("\nLoading SVD-compressed model...")
state_dict = torch.load(ARTIFACT_PATHS["checkpoint"], map_location="cpu", weights_only=True)

# Build config from checkpoint (auto-detects ranks per layer)
CONFIG = SVDConfig.from_checkpoint(state_dict)
MODEL = JuliaSLM_SVD(CONFIG)
MODEL.load_state_dict(state_dict, strict=False)
MODEL.eval()
DEVICE = torch.device("cpu")

print("Loading tokenizer...")
TOKENIZER = BPETokenizer(
    ARTIFACT_PATHS["vocab.json"],
    ARTIFACT_PATHS["merges.txt"],
)

MODEL_CREATED_AT = int(time.time())
NUM_PARAMS = MODEL.num_parameters
print(
    f"\nSVD-compressed model ready: vocab={CONFIG.vocab_size}, d_model={CONFIG.d_model}, "
    f"layers={CONFIG.n_layers}, heads={CONFIG.n_heads}, "
    f"ctx={CONFIG.context_length}, params={NUM_PARAMS:,}"
)
print("SVD-90 compression: ~4.5% parameter reduction")
print("KV cache enabled: O(1) per-token decoding")

MODEL_LOCK = Lock()

# ═══════════════════════════════════════════════════════════════════
# HTTP helpers
# ═══════════════════════════════════════════════════════════════════

CORS_HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET, POST, OPTIONS",
    "Access-Control-Allow-Headers": "Content-Type, Authorization",
}


def extract_prompt(messages):
    """Return the most recent user message, falling back to the last message."""
    if not messages:
        return ""
    for msg in reversed(messages):
        if msg.get("role") == "user":
            return msg.get("content", "")
    return messages[-1].get("content", "")


# ═══════════════════════════════════════════════════════════════════
# Request handler
# ═══════════════════════════════════════════════════════════════════


class Handler(BaseHTTPRequestHandler):
    def log_message(self, format, *args):
        print(f"[{self.log_date_time_string()}] {format % args}")

    def _send_json(self, status, body):
        data = json.dumps(body).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        for k, v in CORS_HEADERS.items():
            self.send_header(k, v)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def do_OPTIONS(self):
        self.send_response(204)
        for k, v in CORS_HEADERS.items():
            self.send_header(k, v)
        self.end_headers()

    def do_GET(self):
        if self.path == "/":
            self._send_json(200, {
                "name": "JuliaSLM-compressed-svd",
                "version": "1.0.0",
                "description": "SVD-compressed JuliaSLM — low-rank factorized weight matrices for efficient inference",
                "architecture": "MHA + RoPE + SwiGLU + RMSNorm + weight tying + SVD compression",
                "compression": {
                    "method": "SVD-90",
                    "original_params": 5_040_000,
                    "compressed_params": NUM_PARAMS,
                    "reduction_pct": round((1 - NUM_PARAMS / 5_040_000) * 100, 1),
                    "val_loss": 3.756,
                    "original_val_loss": 3.552,
                },
                "model": {
                    "vocab_size": CONFIG.vocab_size,
                    "d_model": CONFIG.d_model,
                    "n_layers": CONFIG.n_layers,
                    "n_heads": CONFIG.n_heads,
                    "context_length": CONFIG.context_length,
                    "parameters": NUM_PARAMS,
                },
                "endpoints": ["/v1/models", "/v1/chat/completions"],
                "features": ["streaming", "OpenAI-compatible", "top-k", "top-p", "kv-cache"],
                "compatible_with": ["OpenAI API", "OpenRouter"],
            })
        elif self.path == "/v1/models":
            self._send_json(200, {
                "object": "list",
                "data": [{
                    "id": MODEL_ID,
                    "object": "model",
                    "created": MODEL_CREATED_AT,
                    "owned_by": "juliaslm",
                }],
            })
        else:
            self._send_json(404, {"error": {
                "message": f"Not found: GET {self.path}",
                "type": "invalid_request_error",
                "code": "not_found",
            }})

    def do_POST(self):
        if self.path != "/v1/chat/completions":
            self._send_json(404, {"error": {
                "message": f"Not found: POST {self.path}",
                "type": "invalid_request_error",
                "code": "not_found",
            }})
            return

        content_length = int(self.headers.get("Content-Length", 0))
        try:
            body = json.loads(self.rfile.read(content_length))
        except (json.JSONDecodeError, ValueError):
            self._send_json(400, {"error": {
                "message": "Invalid JSON in request body",
                "type": "invalid_request_error",
                "code": "invalid_json",
            }})
            return

        temperature = max(0.0, min(2.0, float(body.get("temperature", 0.8))))
        max_tokens = max(1, min(CONFIG.context_length, int(body.get("max_tokens", 200))))
        top_k_val = max(0, min(CONFIG.vocab_size, int(body.get("top_k", 40))))
        top_p_val = max(0.0, min(1.0, float(body.get("top_p", 1.0))))
        stream = bool(body.get("stream", False))

        messages = body.get("messages", [])
        prompt_text = extract_prompt(messages)
        completion_id = f"chatcmpl-{uuid.uuid4()}"
        created = int(time.time())

        with MODEL_LOCK:
            if stream:
                self._handle_stream(
                    prompt_text, max_tokens, temperature, top_k_val, top_p_val,
                    completion_id, created,
                )
            else:
                self._handle_non_stream(
                    prompt_text, max_tokens, temperature, top_k_val, top_p_val,
                    completion_id, created,
                )

    def _handle_stream(self, prompt_text, max_tokens, temperature, top_k, top_p,
                       completion_id, created):
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("X-Accel-Buffering", "no")
        for k, v in CORS_HEADERS.items():
            self.send_header(k, v)
        self.end_headers()

        def sse(data):
            self.wfile.write(f"data: {json.dumps(data)}\n\n".encode())
            self.wfile.flush()

        sse({
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": MODEL_ID,
            "choices": [{"index": 0, "delta": {"role": "assistant", "content": ""}, "finish_reason": None}],
        })

        token_count = 0
        prompt_tokens = 0
        for token_str, p_len in generate_streaming(
            MODEL, TOKENIZER, prompt_text,
            max_tokens=max_tokens, temperature=temperature,
            top_k=top_k, top_p=top_p,
        ):
            token_count += 1
            prompt_tokens = p_len
            sse({
                "id": completion_id,
                "object": "chat.completion.chunk",
                "created": created,
                "model": MODEL_ID,
                "choices": [{"index": 0, "delta": {"content": token_str}, "finish_reason": None}],
            })

        sse({
            "id": completion_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": MODEL_ID,
            "choices": [{"index": 0, "delta": {}, "finish_reason": "length" if token_count >= max_tokens else "stop"}],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": token_count,
                "total_tokens": prompt_tokens + token_count,
            },
        })
        self.wfile.write(b"data: [DONE]\n\n")
        self.wfile.flush()

    def _handle_non_stream(self, prompt_text, max_tokens, temperature, top_k, top_p,
                           completion_id, created):
        text, prompt_tokens = generate(
            MODEL, TOKENIZER, prompt_text,
            max_tokens=max_tokens, temperature=temperature,
            top_k=top_k, top_p=top_p,
        )
        completion_tokens = len(TOKENIZER.encode(text))
        finish_reason = "length" if completion_tokens >= max_tokens else "stop"

        self._send_json(200, {
            "id": completion_id,
            "object": "chat.completion",
            "created": created,
            "model": MODEL_ID,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": finish_reason,
            }],
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "system_fingerprint": "juliaslm-svd90-v1",
        })


# ═══════════════════════════════════════════════════════════════════
# Start server
# ═══════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    print(f"\nJuliaSLM-compressed-svd server starting on 0.0.0.0:{PORT} ...")
    print(f"  GET  http://localhost:{PORT}/")
    print(f"  GET  http://localhost:{PORT}/v1/models")
    print(f"  POST http://localhost:{PORT}/v1/chat/completions")
    print(f"  POST http://localhost:{PORT}/v1/chat/completions  (stream=true)")
    print()

    server = HTTPServer(("0.0.0.0", PORT), Handler)
    server.serve_forever()