owlninjam committed on
Commit
5f26357
·
verified ·
1 Parent(s): f061302

Create api.py

Browse files
Files changed (1) hide show
  1. api.py +207 -0
api.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Depends, status
2
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel
5
+ from llama_cpp import Llama
6
+ import os
7
+ import uvicorn
8
+ from typing import Optional, List, Dict, Union, Literal
9
+ import time
10
+ import json
11
+ import uuid
12
+ from datetime import datetime
13
+
14
# Configuration
#
# API keys accepted by verify_api_key(). Hard-coding credentials in source is
# a security risk, so deployments can override the set via the API_KEYS
# environment variable (comma-separated); the historical defaults below are
# kept as the fallback so existing clients keep working.
_DEFAULT_API_KEYS = {
    "sk-adminkey02",
    "sk-testkey123",
    "sk-userkey456",
    "sk-demokey789",
}
VALID_API_KEYS = {
    key.strip()
    for key in os.environ.get("API_KEYS", "").split(",")
    if key.strip()
} or _DEFAULT_API_KEYS
21
+
22
# Global model variable
# llm holds the llama_cpp.Llama instance once load_model() runs at startup;
# it stays None until then, and the endpoints check for None before using it.
llm = None
# Parses the "Authorization: Bearer <token>" header for verify_api_key().
security = HTTPBearer()
25
+
26
class Message(BaseModel):
    """One chat turn in OpenAI chat-completions format."""

    # Speaker of the turn; restricted to the three OpenAI-style roles.
    role: Literal["system", "user", "assistant"]
    # Plain-text content of the turn.
    content: str
29
+
30
class ChatCompletionRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible)."""

    # Model name; informational only — the response echoes it back, but the
    # server always runs the single local model.
    model: str = "zephyr-quiklang-3b-4k"
    # Conversation history, oldest first.
    messages: List[Message]
    # Maximum number of tokens to generate.
    max_tokens: Optional[int] = 512
    # Sampling temperature.
    temperature: Optional[float] = 0.7
    # Nucleus-sampling probability mass.
    top_p: Optional[float] = 0.9
    # Number of completions requested; accepted but never read — the handler
    # always produces exactly one choice.
    n: Optional[int] = 1
    # Streaming flag; accepted but never read by the handler.
    stream: Optional[bool] = False
    # Extra stop sequence(s), a single string or a list, appended to the
    # built-in chat-template stop markers.
    stop: Optional[Union[str, List[str]]] = None
39
+
40
class ChatCompletionChoice(BaseModel):
    """One generated completion within a ChatCompletionResponse."""

    # Position of this choice in the choices list (always 0 here — the
    # handler produces a single completion).
    index: int
    # The generated assistant message.
    message: Message
    # Why generation ended, OpenAI-style.
    finish_reason: Literal["stop", "length", "content_filter"]
44
+
45
class Usage(BaseModel):
    """Token accounting for a completion.

    Counts are whitespace-split estimates (see count_tokens_rough), not true
    tokenizer counts.
    """

    # Estimated tokens in the formatted prompt.
    prompt_tokens: int
    # Estimated tokens in the generated text.
    completion_tokens: int
    # Sum of the two fields above.
    total_tokens: int
49
+
50
class ChatCompletionResponse(BaseModel):
    """Response body for POST /v1/chat/completions (OpenAI-compatible)."""

    # Unique id of the form "chatcmpl-<8 hex chars>".
    id: str
    object: str = "chat.completion"
    # Unix timestamp of creation.
    created: int
    # Echo of the requested model name.
    model: str
    # Generated completions (always exactly one here).
    choices: List[ChatCompletionChoice]
    # Token accounting for this completion.
    usage: Usage
57
+
58
class Model(BaseModel):
    """A model entry as returned by GET /v1/models."""

    # Model identifier.
    id: str
    object: str = "model"
    # Unix timestamp; generated at request time in list_models.
    created: int
    # Owner label; "local" for this server.
    owned_by: str
63
+
64
class ModelsResponse(BaseModel):
    """Envelope for GET /v1/models, OpenAI list format."""

    object: str = "list"
    # Available models (exactly one here).
    data: List[Model]
67
+
68
# FastAPI application; interactive docs are served under the /v1 prefix to
# match the OpenAI-style route layout.
app = FastAPI(
    title="Zephyr Quiklang OpenAI API",
    description="OpenAI-compatible API for Zephyr-Quiklang-3B-4K",
    version="1.0.0",
    docs_url="/v1/docs",
    redoc_url="/v1/redoc"
)
75
+
76
# Allow cross-origin requests from any origin.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# disallowed by the CORS spec; frameworks typically work around it by echoing
# the request's Origin, which effectively grants credentialed access to every
# site — confirm this is intended before a public deployment.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
83
+
84
def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Validate the Bearer token against the configured key set.

    Returns the token when it is recognised; raises HTTP 401 otherwise.
    """
    token = credentials.credentials
    if token in VALID_API_KEYS:
        return token
    raise HTTPException(status_code=status.HTTP_401_UNAUTHORIZED, detail="Invalid API key")
88
+
89
def load_model(model_path: Optional[str] = None):
    """Load the GGUF model into the module-level ``llm`` handle.

    Args:
        model_path: Path to the .gguf file. Defaults to the MODEL_PATH
            environment variable, falling back to the historical hard-coded
            filename, so existing no-argument callers keep working.

    Raises:
        FileNotFoundError: if the model file does not exist.  (Subclass of
            Exception, so callers catching the previous generic Exception
            are unaffected.)
    """
    global llm
    if model_path is None:
        model_path = os.environ.get("MODEL_PATH", "zephyr-quiklang-3b-4k.Q4_K_M.gguf")

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file {model_path} not found!")

    llm = Llama(
        model_path=model_path,
        n_ctx=4096,        # 4K context window (per the model name)
        n_threads=2,
        n_batch=512,
        verbose=False,
        use_mlock=True,    # lock weights in RAM to avoid swapping
        n_gpu_layers=0,    # CPU-only inference
    )
105
+
106
def format_messages(messages: List[Message]) -> str:
    """Render a conversation into the ChatML prompt format.

    Each message becomes an ``<|im_start|>role ... <|im_end|>`` section, and
    the prompt is left open with an assistant header for the model to fill.
    """
    sections = [
        f"<|im_start|>{m.role}\n{m.content}\n<|im_end|>\n" for m in messages
    ]
    sections.append("<|im_start|>assistant\n")
    return "".join(sections)
112
+
113
def count_tokens_rough(text: str) -> int:
    """Very rough token estimate: the number of whitespace-separated words."""
    return sum(1 for _ in text.split())
115
+
116
+ @app.on_event("startup")
117
+ async def startup_event():
118
+ print("🚀 Starting Zephyr Quiklang API...")
119
+ load_model()
120
+ print("✅ Model loaded.")
121
+
122
+ @app.get("/v1/models", response_model=ModelsResponse)
123
+ async def list_models(api_key: str = Depends(verify_api_key)):
124
+ return ModelsResponse(data=[
125
+ Model(
126
+ id="zephyr-quiklang-3b-4k",
127
+ created=int(datetime.now().timestamp()),
128
+ owned_by="local"
129
+ )
130
+ ])
131
+
132
+ @app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
133
+ async def create_chat_completion(request: ChatCompletionRequest, api_key: str = Depends(verify_api_key)):
134
+ if llm is None:
135
+ raise HTTPException(status_code=503, detail="Model not loaded")
136
+
137
+ prompt = format_messages(request.messages)
138
+ prompt_tokens = count_tokens_rough(prompt)
139
+ start_time = time.time()
140
+
141
+ try:
142
+ response = llm(
143
+ prompt,
144
+ max_tokens=request.max_tokens,
145
+ temperature=request.temperature,
146
+ top_p=request.top_p,
147
+ stop=["<|im_end|>", "<|im_start|>"] + (request.stop or []),
148
+ echo=False
149
+ )
150
+ except Exception as e:
151
+ raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
152
+
153
+ end_time = time.time()
154
+ generation_time = end_time - start_time
155
+ response_text = response['choices'][0]['text'].strip()
156
+ completion_tokens = count_tokens_rough(response_text)
157
+
158
+ return ChatCompletionResponse(
159
+ id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
160
+ created=int(time.time()),
161
+ model=request.model,
162
+ choices=[
163
+ ChatCompletionChoice(
164
+ index=0,
165
+ message=Message(role="assistant", content=response_text),
166
+ finish_reason="stop"
167
+ )
168
+ ],
169
+ usage=Usage(
170
+ prompt_tokens=prompt_tokens,
171
+ completion_tokens=completion_tokens,
172
+ total_tokens=prompt_tokens + completion_tokens
173
+ )
174
+ )
175
+
176
+ @app.get("/v1/health")
177
+ async def health_check():
178
+ if llm is None:
179
+ raise HTTPException(status_code=503, detail="Model not loaded")
180
+ return {
181
+ "status": "healthy",
182
+ "model_loaded": True,
183
+ "model": "zephyr-quiklang-3b-4k",
184
+ "timestamp": datetime.now().isoformat()
185
+ }
186
+
187
+ @app.get("/v1")
188
+ async def api_info():
189
+ return {
190
+ "message": "Zephyr Quiklang OpenAI-Compatible API",
191
+ "model": "zephyr-quiklang-3b-4k (Q4_K_M)",
192
+ "endpoints": {
193
+ "chat_completions": "/v1/chat/completions",
194
+ "models": "/v1/models",
195
+ "health": "/v1/health",
196
+ "docs": "/v1/docs"
197
+ },
198
+ "authentication": {
199
+ "required": True,
200
+ "type": "Bearer token",
201
+ "valid_keys": list(VALID_API_KEYS)
202
+ },
203
+ "performance": {
204
+ "context_length": 4096,
205
+ "expected_speed": "2–8 tok/s (CPU)"
206
+ }
207
+ }