"""OpenAI-compatible streaming chat server for a ZIP-RC model.

Wraps a ZIPRCSampler in a FastAPI app that exposes POST /v1/chat/completions
and streams each response back as Server-Sent Events (SSE).
"""

import asyncio
import json

import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse

from ziprc import ZIPRCModel, ZIPRCConfig, ZIPRCSampler

# Server binding and model selection.
HOST = "0.0.0.0"  # bind on all interfaces
PORT = 8000
MODEL_ID = "dataopsnick/Qwen3-4B-Instruct-2507-zip-rc"

# Load the model and sampler once, at import time.
print(f"Loading {MODEL_ID}...") |
|
|
cfg = ZIPRCConfig(model_name=MODEL_ID) |
|
|
model = ZIPRCModel(cfg) |
|
|
sampler = ZIPRCSampler(model) |
|
|
print("Model loaded. Starting server...") |

app = FastAPI(title="ZIP-RC OpenAI Compatible API")


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """
    OpenAI-compatible chat completions endpoint.

    Reads `messages` and `max_tokens` from the JSON body and streams the
    completion back as Server-Sent Events (SSE).
    """
    data = await request.json()
    messages = data.get("messages", [])
    max_tokens = data.get("max_tokens", 512)

    # The sampler yields OpenAI-style completion chunks asynchronously.
    stream = sampler.openai(messages, max_tokens=max_tokens)

    async def sse_generator():
        # One SSE event per chunk: "data: <json>\n\n".
        async for chunk in stream:
            payload = json.dumps(dict(chunk))
            yield f"data: {payload}\n\n"
        # The [DONE] sentinel tells OpenAI clients the stream has ended.
        yield "data: [DONE]\n\n"

    return StreamingResponse(sse_generator(), media_type="text/event-stream")
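
# Example client (a sketch, not part of this server): since the endpoint
# mirrors the OpenAI wire format, the official `openai` package should be
# able to consume the stream. The `model` value below is illustrative; this
# server ignores it and always serves MODEL_ID.
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")
#   resp = client.chat.completions.create(
#       model="ziprc",
#       messages=[{"role": "user", "content": "Hello!"}],
#       stream=True,
#   )
#   for chunk in resp:
#       print(chunk.choices[0].delta.content or "", end="", flush=True)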


if __name__ == "__main__":
    config = uvicorn.Config(app, host=HOST, port=PORT)
    server = uvicorn.Server(config)

    try:
        # If an event loop is already running (e.g. inside a notebook),
        # schedule the server as a background task instead of blocking.
        loop = asyncio.get_running_loop()
        print(f"Server started in background task on http://{HOST}:{PORT}")
        loop.create_task(server.serve())
    except RuntimeError:
        # No running loop: this is a plain script, so block on the server.
        print(f"Server starting on http://{HOST}:{PORT}")
        asyncio.run(server.serve())
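
# Example request (a sketch; adjust host/port if you change HOST/PORT):
#
#   curl -N http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 64}'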