# Spaces: triflix / diffusionLLM (Paused)
# File size: 10,993 Bytes
# 55fb776

import os
import time
import uuid
import json
import logging
import asyncio
from contextlib import asynccontextmanager
from typing import Optional

import torch
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from transformers import AutoModel, AutoTokenizer

# ─── Config ──────────────────────────────────────────────────

# Identifier handed to from_pretrained() for both tokenizer and model, and the
# model id this API reports back to clients.
MODEL_NAME = "Dream-org/Dream-v0-Instruct-1B"
API_MODEL_ID = "dream-diffusion-1b"

# Environment-driven settings: listening port and whether load_model() should
# attempt INT8 dynamic quantization (only the literal "true", any case, enables it).
PORT = int(os.getenv("PORT", 7860))
QUANTIZE = os.getenv("QUANTIZE", "true").lower() == "true"

logging.basicConfig(
    format="[%(asctime)s] %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
log = logging.getLogger("dream-api")

# ─── Global Model References ─────────────────────────────────

# Filled in by load_model(); model_loaded flips to True once loading finishes.
model = None
tokenizer = None
model_loaded = False


# ─── Model Loading ────────────────────────────────────────────

def load_model():
    """Populate the module-level ``tokenizer`` and ``model`` and mark them ready.

    Loads both from ``MODEL_NAME`` with ``trust_remote_code=True``, puts the
    model in eval mode and, when ``QUANTIZE`` is set, tries to apply INT8
    dynamic quantization to its ``torch.nn.Linear`` layers. A quantization
    failure is logged as a warning and the unquantized model is kept.
    ``model_loaded`` is set to True as the last step.
    """
    global model, tokenizer, model_loaded

    log.info(f"Loading tokenizer: {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    log.info(f"Loading model: {MODEL_NAME}")
    load_started = time.time()

    model = AutoModel.from_pretrained(
        MODEL_NAME, torch_dtype=torch.float32, trust_remote_code=True
    )
    model.eval()

    if QUANTIZE:
        # Best effort: the import lives inside the try so that a missing
        # quantization module is handled the same way as a failed conversion.
        try:
            from torch.ao.quantization import quantize_dynamic

            model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
            log.info("✅ INT8 quantization applied")
        except Exception as e:
            log.warning(f"⚠️ Quantization failed: {e}")

    elapsed = time.time() - load_started
    log.info(f"✅ Model loaded in {elapsed:.1f}s")
    model_loaded = True


# ─── Lifespan ─────────────────────────────────────────────────

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: load the model before serving, log on shutdown.

    ``load_model`` is blocking, so it is run on the loop's default executor.
    The call is awaited before ``yield``, which means startup does not
    complete until loading has finished; the executor only keeps the event
    loop itself from being blocked while that happens.
    """
    # Inside a coroutine the running loop is the one we want;
    # get_running_loop() is the preferred call over get_event_loop() here.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, load_model)
    yield
    # Shutdown
    log.info("Shutting down")


# ─── FastAPI App ──────────────────────────────────────────────

# The ASGI application; model loading is hooked in through the lifespan handler.
app = FastAPI(title="Dream Diffusion LLM API", version="1.0.0", lifespan=lifespan)

# Wildcard CORS policy: every origin, method and header is allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)


# ─── Pydantic Models ─────────────────────────────────────────

class Message(BaseModel):
    # One chat turn from the request body. run_inference copies both fields
    # into the {"role", "content"} dicts passed to the tokenizer's chat template.
    # (Plain comments are used instead of a docstring so the generated schema
    # description is left untouched.)
    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    # Request body for POST /v1/chat/completions.
    # `model` is accepted from the client; responses always report API_MODEL_ID.
    model: str = API_MODEL_ID
    messages: list[Message]
    # Sampling / length controls, each with a default and an inclusive range.
    max_tokens: Optional[int] = Field(256, ge=1, le=1024)
    temperature: Optional[float] = Field(0.35, ge=0.0, le=2.0)
    top_p: Optional[float] = Field(0.95, ge=0.0, le=1.0)
    # When truthy the endpoint answers with a server-sent-event stream.
    stream: Optional[bool] = False
    # Diffusion-specific: forwarded as `steps` to diffusion_generate.
    steps: Optional[int] = Field(64, ge=1, le=256)


class ChatCompletionMessage(BaseModel):
    # The generated reply inside a non-streaming response; the role defaults
    # to "assistant" and the content is the decoded generation text.
    role: str = "assistant"
    content: str


class Choice(BaseModel):
    # A single completion alternative. chat_completions only ever builds one,
    # relying on the defaults below for index and finish_reason.
    index: int = 0
    message: ChatCompletionMessage
    finish_reason: str = "stop"


class Usage(BaseModel):
    # Token accounting for a response. chat_completions fills these from
    # estimate_tokens(), so they are word-count based estimates rather than
    # tokenizer counts.
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int


class ChatCompletionResponse(BaseModel):
    # Body of a non-streaming reply from POST /v1/chat/completions.
    id: str  # request id produced by gen_id()
    object: str = "chat.completion"
    created: int  # integer Unix timestamp taken when the response is built
    model: str
    choices: list[Choice]
    usage: Usage


# ─── Inference Function ──────────────────────────────────────

def run_inference(
    messages: list[Message],
    max_tokens: int,
    steps: int,
    temperature: float,
    top_p: float,
) -> tuple[str, float]:
    """Generate a reply for a chat history with the global diffusion model.

    The messages are rendered through the tokenizer's chat template (with a
    generation prompt appended), tokenized, and handed to
    ``model.diffusion_generate``. Everything in the first output row after the
    prompt length is decoded with special tokens skipped and stripped.

    Returns:
        ``(text, elapsed_ms)`` where ``elapsed_ms`` is the wall-clock time
        spent inside the generation call, in milliseconds.
    """
    # Render the conversation into a single prompt string.
    chat = [{"role": m.role, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )

    encoded = tokenizer(prompt, return_tensors="pt")
    input_ids = encoded["input_ids"]
    attention_mask = torch.ones_like(input_ids)
    prompt_len = input_ids.shape[1]

    generation_kwargs = {
        "attention_mask": attention_mask,
        "max_new_tokens": max_tokens,
        "output_history": False,
        "steps": steps,
        "temperature": temperature,
        "top_p": top_p,
        "alg": "entropy",
        "alg_temp": 0.1,
    }

    # Only the generation call itself is timed.
    began = time.time()
    with torch.no_grad():
        output = model.diffusion_generate(input_ids, **generation_kwargs)
    elapsed_ms = (time.time() - began) * 1000

    # Drop the prompt positions and decode what remains of the first row.
    completion_ids = output[0, prompt_len:]
    decoded = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return decoded.strip(), elapsed_ms


# ─── Token Estimator ─────────────────────────────────────────

def estimate_tokens(text: str) -> int:
    """Estimate a token count from whitespace-separated words.

    The word count is divided by 0.75 and truncated to an int; the result is
    never lower than 1, so empty or blank text still counts as one token.
    """
    word_count = len(text.split())
    approx = int(word_count / 0.75)
    return approx if approx > 1 else 1


# ─── Generate Request ID ─────────────────────────────────────

def gen_id() -> str:
    """Return a new request id: ``chatcmpl-`` plus 12 hex chars of a random UUID."""
    suffix = uuid.uuid4().hex[:12]
    return "chatcmpl-" + suffix


# ─── SSE Streaming Generator ─────────────────────────────────

async def stream_generator(text: str, req_id: str):
    """Yield the already-generated ``text`` as OpenAI-style SSE chunks.

    Emits, in order: a role chunk, one content chunk per word, a stop chunk
    with ``finish_reason="stop"``, and the ``[DONE]`` sentinel. Each content
    chunk carries a word together with the exact whitespace that follows it,
    so concatenating all content deltas reproduces ``text`` verbatim
    (newlines, indentation and repeated spaces included).

    Args:
        text: Full completion text to stream.
        req_id: Id repeated in every chunk.

    Yields:
        ``data: <json>\\n\\n`` strings suitable for a text/event-stream body.
    """
    import re  # function-scope import keeps this block self-contained

    created = int(time.time())

    def sse(delta, finish_reason=None):
        # All chunks share the same envelope; only delta/finish_reason vary.
        payload = {
            "id": req_id,
            "object": "chat.completion.chunk",
            "created": created,
            "model": API_MODEL_ID,
            "choices": [{
                "index": 0,
                "delta": delta,
                "finish_reason": finish_reason,
            }],
        }
        return f"data: {json.dumps(payload)}\n\n"

    # 1) Role chunk
    yield sse({"role": "assistant"})

    # 2) Content chunks — word by word. Splitting with str.split() and
    #    re-joining with single spaces would flatten newlines and code
    #    formatting, so keep each word's trailing whitespace instead; the
    #    second alternative only matches whitespace at the very start.
    for piece in re.findall(r"\S+\s*|\s+", text):
        yield sse({"content": piece})
        await asyncio.sleep(0.015)  # typing effect

    # 3) Stop chunk
    yield sse({}, "stop")
    yield "data: [DONE]\n\n"


# ─── Routes ──────────────────────────────────────────────────

@app.get("/")
async def root():
    """Return static service metadata and the list of exposed endpoints."""
    endpoints = {
        "chat": "POST /v1/chat/completions",
        "models": "GET  /v1/models",
        "health": "GET  /health",
    }
    info = {
        "name": "Dream Diffusion LLM API",
        "model": API_MODEL_ID,
        "version": "1.0.0",
        "openai_compatible": True,
        "endpoints": endpoints,
    }
    return info


@app.get("/health")
async def health():
    """Report readiness: 200 "healthy" once the model is loaded, else 503 "loading"."""
    if model_loaded:
        return {"status": "healthy", "model": MODEL_NAME}

    not_ready = {"status": "loading", "model": MODEL_NAME}
    return JSONResponse(status_code=503, content=not_ready)


@app.get("/v1/models")
async def list_models():
    """Return the model listing, which contains the single served model."""
    served_model = {
        "id": API_MODEL_ID,
        "object": "model",
        "created": 1700000000,
        "owned_by": "dream-org",
    }
    return {"object": "list", "data": [served_model]}


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatCompletionRequest):
    """Handle an OpenAI-style chat completion request.

    Runs the blocking generation on the default executor, then either
    streams the finished text as SSE chunks (``req.stream``) or returns a
    ``ChatCompletionResponse`` with word-count based usage estimates.

    Raises:
        HTTPException: 503 while the model is not loaded, 400 when
            ``messages`` is empty, 500 when generation raises.
    """
    if not model_loaded:
        raise HTTPException(status_code=503, detail="Model is still loading")

    if not req.messages:
        raise HTTPException(status_code=400, detail="messages array is required")

    # The sampling fields are declared Optional, so an explicit JSON null
    # passes validation and arrives here as None. Fall back to the same
    # defaults ChatCompletionRequest declares instead of forwarding None
    # to the generation call.
    max_tokens = 256 if req.max_tokens is None else req.max_tokens
    steps = 64 if req.steps is None else req.steps
    temperature = 0.35 if req.temperature is None else req.temperature
    top_p = 0.95 if req.top_p is None else req.top_p

    log.info(
        f"Request: steps={steps}, max_tokens={max_tokens}, "
        f"temp={temperature}, stream={req.stream}"
    )

    # Run inference in thread pool (blocking call). We are inside a
    # coroutine, so the running loop is the right one to use.
    loop = asyncio.get_running_loop()
    try:
        text, elapsed_ms = await loop.run_in_executor(
            None,
            run_inference,
            req.messages,
            max_tokens,
            steps,
            temperature,
            top_p,
        )
    except Exception as e:
        # Top-level boundary: log with traceback and convert to a 500,
        # keeping the original exception chained for debugging.
        log.error(f"Inference error: {e}", exc_info=True)
        raise HTTPException(
            status_code=500, detail=f"Inference failed: {str(e)}"
        ) from e

    log.info(f"Generated {len(text)} chars in {elapsed_ms:.0f}ms")

    req_id = gen_id()

    # ── Streaming Response ──
    if req.stream:
        return StreamingResponse(
            stream_generator(text, req_id),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no",
            },
        )

    # ── Non-Streaming Response ──
    prompt_tokens = sum(estimate_tokens(m.content) for m in req.messages)
    completion_tokens = estimate_tokens(text)

    return ChatCompletionResponse(
        id=req_id,
        created=int(time.time()),
        model=API_MODEL_ID,
        choices=[
            Choice(
                message=ChatCompletionMessage(content=text),
            )
        ],
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )


# ─── Run ──────────────────────────────────────────────────────

if __name__ == "__main__":
    import uvicorn

    # Single worker, listening on all interfaces at the configured PORT.
    # NOTE(review): the "app:app" import string presumably requires this
    # module to be importable as `app` — confirm against the deployed filename.
    server_options = {
        "host": "0.0.0.0",
        "port": PORT,
        "workers": 1,
        "log_level": "info",
    }
    uvicorn.run("app:app", **server_options)