dataopsnick committed on
Commit
1f5fda3
·
verified ·
1 Parent(s): 8ff292c

feat: Add OpenAI-compatible API server (`server.py`)

Browse files

- Added a standalone FastAPI script that exposes the ZIP-RC model via a standard `/v1/chat/completions` endpoint.
- Wraps the `ZIPRCSampler` to stream introspection data alongside generated text using Server-Sent Events (SSE).
- Enables direct integration with external tools like ChatKit, OpenWebUI, and the official OpenAI Python client.
- Includes robust event loop handling to ensure the server runs correctly in both standard Python environments and Jupyter/Colab notebooks.

Files changed (1) hide show
  1. server.py +58 -0
server.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import uvicorn
2
+ import json
3
+ import asyncio
4
+ from fastapi import FastAPI, Request
5
+ from fastapi.responses import StreamingResponse
6
+ from ziprc import ZIPRCModel, ZIPRCConfig, ZIPRCSampler
7
+
8
+ # --- Configuration ---
9
+ HOST = "0.0.0.0"
10
+ PORT = 8000
11
+ MODEL_ID = "dataopsnick/Qwen3-4B-Instruct-2507-zip-rc"
12
+
13
+ # --- Load Model Once ---
14
+ print(f"Loading {MODEL_ID}...")
15
+ cfg = ZIPRCConfig(model_name=MODEL_ID)
16
+ model = ZIPRCModel(cfg)
17
+ sampler = ZIPRCSampler(model)
18
+ print("Model loaded. Starting server...")
19
+
20
+ app = FastAPI(title="ZIP-RC OpenAI Compatible API")
21
+
22
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """
    OpenAI-style chat-completion endpoint.

    Reads ``messages`` (and optionally ``max_tokens``, default 512) from the
    JSON request body, forwards them to the shared ZIP-RC sampler, and streams
    the resulting chunks back to the client as Server-Sent Events.
    """
    body = await request.json()
    chat_messages = body.get("messages", [])
    token_budget = body.get("max_tokens", 512)

    # The sampler yields OpenAI-shaped, dict-like chunk objects.
    chunk_stream = sampler.openai(chat_messages, max_tokens=token_budget)

    async def event_stream():
        # Frame each chunk as an SSE "data:" event, then emit the
        # OpenAI-standard stream terminator.
        async for piece in chunk_stream:
            yield f"data: {json.dumps(dict(piece))}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(event_stream(), media_type="text/event-stream")
44
+
45
if __name__ == "__main__":
    # Instantiate uvicorn's Server directly (instead of uvicorn.run) so we can
    # choose how to drive it and avoid nested-asyncio conflicts in notebooks.
    config = uvicorn.Config(app, host=HOST, port=PORT)
    server = uvicorn.Server(config)

    try:
        # Keep the try body minimal: only the loop probe can legitimately
        # raise RuntimeError here. (Previously print/create_task sat inside
        # the try, so any RuntimeError from them would be misread as
        # "no running loop" and trigger a second asyncio.run.)
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # Standard script execution: no loop yet, so own the whole lifecycle.
        print(f"Server starting on http://{HOST}:{PORT}")
        asyncio.run(server.serve())
    else:
        # Already inside a running loop (e.g. Colab/Jupyter via ipykernel):
        # schedule the server as a background task. Hold a reference to it —
        # the event loop keeps only a weak reference to tasks, so an
        # unreferenced task may be garbage-collected mid-execution.
        print(f"Server started in background task on http://{HOST}:{PORT}")
        _server_task = loop.create_task(server.serve())