owlninjam committed on
Commit
cc22945
·
verified ·
1 Parent(s): 252854a

Create api.py

Browse files
Files changed (1) hide show
  1. api.py +269 -0
api.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Depends, status
2
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel
5
+ from llama_cpp import Llama
6
+ import os
7
+ import uvicorn
8
+ from typing import Optional, List, Dict, Union, Literal
9
+ import time
10
+ import json
11
+ import uuid
12
+ from datetime import datetime
13
+
14
# Configuration
#
# API keys may be supplied via the API_KEYS environment variable as a
# comma-separated list; the hard-coded set below is kept only as a fallback
# so existing clients keep working.
# SECURITY NOTE(review): static keys committed to source (and echoed by the
# /v1 info endpoint) are unsafe for any real deployment — rotate them and
# prefer the environment variable.
_DEFAULT_API_KEYS = {
    "sk-adminkey02",
    "sk-testkey123",
    "sk-userkey456",
    "sk-demokey789",
}
VALID_API_KEYS = (
    {key.strip() for key in os.environ["API_KEYS"].split(",") if key.strip()}
    if os.environ.get("API_KEYS")
    else _DEFAULT_API_KEYS
)
21
+
22
# Module-level state: the llama.cpp handle is populated by load_model() at
# startup; the bearer scheme feeds verify_api_key() from the Authorization header.
llm = None
security = HTTPBearer()
25
+
26
# OpenAI-compatible request/response models
class Message(BaseModel):
    """A single chat message in the OpenAI chat format."""

    # Only the three standard OpenAI roles are accepted.
    role: Literal["system", "user", "assistant"]
    content: str
30
+
31
class ChatCompletionRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible)."""

    model: str = "capybarahermes-2.5-mistral-7b"
    messages: List[Message]
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9
    # n and stream are accepted for wire compatibility; the handler currently
    # produces a single, non-streamed completion regardless.
    n: Optional[int] = 1
    stream: Optional[bool] = False
    # Either one stop string or a list of stop strings.
    stop: Optional[Union[str, List[str]]] = None
40
+
41
class ChatCompletionChoice(BaseModel):
    """One generated completion inside a chat-completion response."""

    index: int
    message: Message
    finish_reason: Literal["stop", "length", "content_filter"]
45
+
46
class Usage(BaseModel):
    """Token accounting for a completion (whitespace-based approximation)."""

    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
50
+
51
class ChatCompletionResponse(BaseModel):
    """Response body for POST /v1/chat/completions (OpenAI-compatible)."""

    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: Usage
58
+
59
class Model(BaseModel):
    """A single entry in the GET /v1/models listing."""

    id: str
    object: str = "model"
    created: int
    owned_by: str
64
+
65
class ModelsResponse(BaseModel):
    """Response body for GET /v1/models."""

    object: str = "list"
    data: List[Model]
68
+
69
# Initialize FastAPI; interactive docs are served under /v1/ with the API.
app = FastAPI(
    title="CapybaraHermes OpenAI API",
    description="OpenAI-compatible API for CapybaraHermes-2.5-Mistral-7B",
    version="1.0.0",
    docs_url="/v1/docs",
    redoc_url="/v1/redoc",
)
77
+
78
# Add CORS middleware so browser clients on other origins can call the API.
# NOTE(review): wildcard origins combined with allow_credentials=True lets any
# site send credentialed requests — tighten allow_origins before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
86
+
87
def verify_api_key(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """FastAPI dependency: validate the bearer token against the allow-list.

    Returns the key on success; raises HTTP 401 otherwise.
    """
    token = credentials.credentials
    if token in VALID_API_KEYS:
        return token
    raise HTTPException(
        status_code=status.HTTP_401_UNAUTHORIZED,
        detail="Invalid API key"
    )
95
+
96
def load_model():
    """Load the quantized GGUF model into the module-level ``llm`` handle.

    Raises:
        FileNotFoundError: if the model file is not present in the working dir.
        RuntimeError: if llama.cpp fails to initialize the model (original
            cause preserved via exception chaining).
    """
    global llm
    model_path = "capybarahermes-2.5-mistral-7b.Q5_K_M.gguf"

    if not os.path.exists(model_path):
        # Specific exception type instead of the bare Exception the original
        # raised, so callers can distinguish "file missing" from load failures.
        raise FileNotFoundError(f"Model file {model_path} not found!")

    try:
        llm = Llama(
            model_path=model_path,
            n_ctx=4096,          # matches the context_length advertised by /v1
            n_threads=2,
            n_batch=512,
            verbose=False,
            use_mlock=True,      # keep model pages resident to avoid swapping
            n_gpu_layers=0,      # CPU-only inference
        )
        print("✅ Model loaded successfully!")
    except Exception as e:
        # Chain the cause (`from e`) so the underlying traceback is not lost,
        # unlike the original re-wrap into a bare Exception.
        raise RuntimeError(f"❌ Error loading model: {str(e)}") from e
117
+
118
def format_messages(messages: List[Message]) -> str:
    """Render the conversation as a ChatML prompt, ending with an open
    assistant turn for the model to complete."""
    turns = [
        f"<|im_start|>{msg.role}\n{msg.content}\n<|im_end|>\n"
        for msg in messages
    ]
    turns.append("<|im_start|>assistant\n")
    return "".join(turns)
127
+
128
def count_tokens_rough(text: str) -> int:
    """Approximate the token count as the number of whitespace-separated
    words — cheap, and close enough for usage reporting."""
    words = text.split()
    return len(words)
131
+
132
# NOTE(review): FastAPI has deprecated on_event in favor of lifespan handlers;
# kept as-is to avoid restructuring app creation.
@app.on_event("startup")
async def startup_event():
    """Load the GGUF model once when the server starts.

    Failures are logged and re-raised so the server refuses to come up
    without a working model.
    """
    try:
        print("🚀 Starting CapybaraHermes API server...")
        load_model()
    except Exception as e:
        print(f"💥 Failed to load model: {e}")
        raise
141
+
142
# API endpoints with authentication
@app.get("/v1/models", response_model=ModelsResponse)
async def list_models(api_key: str = Depends(verify_api_key)):
    """List the (single) model served by this API."""
    model_entry = Model(
        id="capybarahermes-2.5-mistral-7b",
        created=int(datetime.now().timestamp()),
        owned_by="local",
    )
    return ModelsResponse(data=[model_entry])
155
+
156
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(
    request: ChatCompletionRequest,
    api_key: str = Depends(verify_api_key)
):
    """Create a (non-streaming) chat completion in the OpenAI response format.

    Raises HTTP 503 while the model is not loaded and HTTP 500 on
    generation errors.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")

    try:
        # Format messages into a ChatML prompt.
        prompt = format_messages(request.messages)
        prompt_tokens = count_tokens_rough(prompt)

        # BUG FIX: request.stop may be a single string (its declared type is
        # Union[str, List[str]]); the original `list + (request.stop or [])`
        # raised TypeError for the str case. Normalize to a list first.
        stop_sequences = ["<|im_end|>", "<|im_start|>"]
        if request.stop:
            if isinstance(request.stop, str):
                stop_sequences.append(request.stop)
            else:
                stop_sequences.extend(request.stop)

        start_time = time.time()

        # Generate response
        response = llm(
            prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=stop_sequences,
            echo=False
        )

        generation_time = time.time() - start_time

        # Extract response text and rough usage numbers.
        choice = response['choices'][0]
        response_text = choice['text'].strip()
        completion_tokens = count_tokens_rough(response_text)
        tokens_per_second = completion_tokens / generation_time if generation_time > 0 else 0

        print(f"⚡ Generated {completion_tokens} tokens in {generation_time:.2f}s ({tokens_per_second:.2f} tok/s)")

        # BUG FIX: report the backend's finish reason instead of always
        # claiming "stop" — llama.cpp reports "length" when max_tokens is hit.
        # Fall back to "stop" for values outside the response model's Literal.
        raw_reason = choice.get('finish_reason')
        finish_reason = raw_reason if raw_reason in ("stop", "length", "content_filter") else "stop"

        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=Message(role="assistant", content=response_text),
                    finish_reason=finish_reason
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error generating response: {str(e)}")
212
+
213
+ @app.get("/v1/health")
214
+ async def health_check():
215
+ """Health check (no auth required)"""
216
+ if llm is None:
217
+ raise HTTPException(status_code=503, detail="Model not loaded")
218
+ return {
219
+ "status": "healthy",
220
+ "model_loaded": True,
221
+ "timestamp": datetime.now().isoformat(),
222
+ "model": "capybarahermes-2.5-mistral-7b"
223
+ }
224
+
225
+ @app.get("/v1")
226
+ async def api_info():
227
+ """API information"""
228
+ return {
229
+ "message": "🦙 CapybaraHermes OpenAI Compatible API",
230
+ "model": "CapybaraHermes-2.5-Mistral-7B (Q5_K_M quantized)",
231
+ "endpoints": {
232
+ "chat_completions": "/v1/chat/completions",
233
+ "models": "/v1/models",
234
+ "health": "/v1/health",
235
+ "docs": "/v1/docs"
236
+ },
237
+ "authentication": {
238
+ "required": True,
239
+ "type": "Bearer token",
240
+ "header": "Authorization: Bearer sk-your-api-key",
241
+ "valid_keys": ["sk-adminkey02", "sk-testkey123", "sk-userkey456", "sk-demokey789"]
242
+ },
243
+ "usage": {
244
+ "sdk": "pip install openai",
245
+ "base_url": "https://your-username-your-space.hf.space/v1",
246
+ "example": "client = OpenAI(base_url='https://your-space.hf.space/v1', api_key='sk-adminkey02')"
247
+ },
248
+ "performance": {
249
+ "expected_speed": "2-8 tokens/second (CPU)",
250
+ "context_length": 4096,
251
+ "quantization": "Q5_K_M"
252
+ }
253
+ }
254
+
255
# Public endpoint for basic info (no auth)
@app.get("/api")
async def public_api_info():
    """Unauthenticated service overview pointing at the real endpoints."""
    info = {
        "service": "CapybaraHermes API",
        "status": "running",
        "endpoints": "/v1/",
        "docs": "/v1/docs",
        "chat_ui": "/",
        "authentication": "API key required for /v1/* endpoints",
    }
    return info
267
+
268
if __name__ == "__main__":
    # Bind on all interfaces; intended for containerized deployment.
    uvicorn.run(app, host="0.0.0.0", port=8000)