truegleai commited on
Commit
f0c9bc7
·
verified ·
1 Parent(s): 6048d7a

Deploy FastAPI server with CodeLlama 7B

Browse files
Files changed (1) hide show
  1. app.py +310 -29
app.py CHANGED
@@ -1,37 +1,318 @@
1
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
2
  from huggingface_hub import hf_hub_download
3
  from llama_cpp import Llama
4
- import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- # DOWNLOAD the correct 6.7B model
7
- MODEL_NAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
8
- model_path = hf_hub_download(
9
- repo_id="TheBloke/DeepSeek-Coder-6.7B-Instruct-GGUF",
10
- filename=MODEL_NAME,
11
- local_dir="./models"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  )
13
 
14
- # LOAD model (once on startup)
15
- print("Loading model...")
16
- llm = Llama(
17
- model_path=model_path,
18
- n_ctx=2048,
19
- n_threads=2,
20
- n_gpu_layers=0,
21
- verbose=False
22
  )
23
- print("Model loaded. Ready.")
24
-
25
- # GENERATION function
26
- def generate(prompt, max_tokens=512):
27
- response = llm(
28
- f"### Instruction:\n{prompt}\n\n### Response:\n",
29
- max_tokens=max_tokens,
30
- stop=["###", "</s>"],
31
- echo=False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  )
33
- return response['choices'][0]['text']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # SIMPLE Gradio UI (also provides API endpoint)
36
- iface = gr.Interface(fn=generate, inputs="textbox", outputs="text")
37
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI server providing OpenAI-compatible endpoints for code generation.
3
+ Designed to work with MCP servers and provide unlimited tokens with minimal rate limiting.
4
+ """
5
+ import os
6
+ import time
7
+ import uuid
8
+ from typing import Optional, List, Dict, Any
9
+ from fastapi import FastAPI, HTTPException
10
+ from fastapi.middleware.cors import CORSMiddleware
11
+ from pydantic import BaseModel, Field
12
  from huggingface_hub import hf_hub_download
13
  from llama_cpp import Llama
14
+ import uvicorn
15
+
16
# ============================================================================
# CONFIGURATION
# ============================================================================
# GGUF-quantized CodeLlama 7B Instruct weights, fetched from the HF Hub at
# startup by load_model().
MODEL_REPO = "TheBloke/CodeLlama-7B-Instruct-GGUF"
MODEL_FILE = "codellama-7b-instruct.Q4_K_M.gguf"
MODEL_NAME = "codellama-7b-instruct"  # model id advertised on /v1/models

# Context and generation settings for "unlimited" tokens
MAX_CONTEXT = 4096 # Larger context window
MAX_TOKENS = 4096 # Allow very long responses
DEFAULT_TEMP = 0.7
DEFAULT_TOP_P = 0.95
29
# ============================================================================
# PYDANTIC MODELS (OpenAI-compatible)
# ============================================================================
class Message(BaseModel):
    """One chat turn in OpenAI message format."""
    # Roles handled downstream by messages_to_prompt: "system", "user",
    # "assistant"; any other role is silently dropped there.
    role: str
    content: str
35
+
36
class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions (OpenAI-compatible subset)."""
    model: str = MODEL_NAME
    messages: List[Message]
    temperature: Optional[float] = DEFAULT_TEMP
    top_p: Optional[float] = DEFAULT_TOP_P
    max_tokens: Optional[int] = MAX_TOKENS
    # Streaming is accepted in the schema but rejected with 501 by the handler.
    stream: Optional[bool] = False
    stop: Optional[List[str]] = None
44
+
45
class CompletionRequest(BaseModel):
    """Request body for /v1/completions (OpenAI-compatible subset)."""
    model: str = MODEL_NAME
    prompt: str
    temperature: Optional[float] = DEFAULT_TEMP
    top_p: Optional[float] = DEFAULT_TOP_P
    max_tokens: Optional[int] = MAX_TOKENS
    stop: Optional[List[str]] = None
52
+
53
class Usage(BaseModel):
    """Token accounting block mirrored from the OpenAI response schema."""
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
57
+
58
class ChatCompletionChoice(BaseModel):
    """A single generated chat alternative (this server always returns one)."""
    index: int
    message: Message
    finish_reason: str
62
+
63
class ChatCompletionResponse(BaseModel):
    """Top-level response for /v1/chat/completions."""
    id: str
    object: str = "chat.completion"
    created: int  # Unix timestamp (seconds)
    model: str
    choices: List[ChatCompletionChoice]
    usage: Usage
70
 
71
class CompletionChoice(BaseModel):
    """A single generated text alternative for the completions endpoint."""
    index: int
    text: str
    finish_reason: str
75
+
76
class CompletionResponse(BaseModel):
    """Top-level response for /v1/completions."""
    id: str
    object: str = "text_completion"
    created: int  # Unix timestamp (seconds)
    model: str
    choices: List[CompletionChoice]
    usage: Usage
83
+
84
# ============================================================================
# FASTAPI APP
# ============================================================================
app = FastAPI(
    title="Code LLM API",
    description="OpenAI-compatible API for code generation with minimal rate limiting",
    version="1.0.0"
)

# Enable CORS for MCP server access
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# the fully-open CORS configuration — acceptable for a public demo Space, but
# confirm it is intended before serving anything sensitive.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global model instance
# None until the startup hook finishes loading; endpoints answer 503 meanwhile.
llm: Optional[Llama] = None
104
+
105
# ============================================================================
# MODEL LOADING
# ============================================================================
# NOTE(review): @app.on_event is deprecated in recent FastAPI in favor of
# lifespan handlers; it still works, but consider migrating.
@app.on_event("startup")
async def load_model():
    """Load the LLM model on startup."""
    global llm
    print(f"Downloading model {MODEL_REPO}/{MODEL_FILE}...")
    # hf_hub_download returns the local path of the file (re-using the local
    # HF cache on subsequent startups, per huggingface_hub docs).
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
    print(f"Model downloaded to: {model_path}")

    print("Loading model into memory...")
    llm = Llama(
        model_path=model_path,
        n_ctx=MAX_CONTEXT,
        n_threads=4, # Use more threads for better performance
        n_batch=512,
        verbose=False,
        n_gpu_layers=0 # CPU only (change if GPU available)
    )
    print("Model loaded successfully!")
126
+
127
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def messages_to_prompt(messages: List[Message]) -> str:
    """Render OpenAI-style chat messages as a CodeLlama instruct prompt.

    Known roles map to "### System:" / "### Instruction:" / "### Response:"
    headers; messages with any other role are dropped.  A trailing
    "### Response:" header cues the model to produce its answer.
    """
    headers = {
        "system": "### System:",
        "user": "### Instruction:",
        "assistant": "### Response:",
    }
    rendered = [
        f"{headers[message.role]} {message.content}"
        for message in messages
        if message.role in headers
    ]
    rendered.append("### Response:")
    return "\n".join(rendered)
144
+
145
def estimate_tokens(text: str) -> int:
    """Rough token estimation (1 token ≈ 4 chars)."""
    # Floor-divide the character count by four; good enough for usage stats.
    whole_tokens, _fraction = divmod(len(text), 4)
    return whole_tokens
148
 
149
# ============================================================================
# API ENDPOINTS
# ============================================================================
@app.get("/")
async def root():
    """Health check endpoint."""
    # Advertise the model identity, generation limits, and available routes so
    # clients can discover the API from the base URL.
    return {
        "status": "online",
        "model": MODEL_NAME,
        "max_context": MAX_CONTEXT,
        "max_tokens": MAX_TOKENS,
        "endpoints": {
            "chat": "/v1/chat/completions",
            "completion": "/v1/completions",
            "models": "/v1/models"
        }
    }
166
+
167
@app.get("/health")
async def health():
    """Health check for monitoring."""
    # "loading" until the startup hook assigns the global model instance.
    loaded = llm is not None
    return {
        "status": "healthy" if loaded else "loading",
        "model_loaded": loaded,
    }
174
+
175
@app.get("/v1/models")
async def list_models():
    """List available models (OpenAI-compatible)."""
    # Single-model server: only the bundled CodeLlama build is reported.
    return {
        "object": "list",
        "data": [
            {
                "id": MODEL_NAME,
                "object": "model",
                "created": int(time.time()),
                "owned_by": "huggingface",
                "permission": [],
                "root": MODEL_NAME,
                "parent": None
            }
        ]
    }
192
+
193
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def chat_completions(request: ChatCompletionRequest):
    """
    OpenAI-compatible chat completions endpoint.
    No rate limiting - designed for unlimited use.

    Returns 503 while the model is loading and 501 for streaming requests.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")

    if request.stream:
        raise HTTPException(status_code=501, detail="Streaming not yet implemented")

    # Convert messages to prompt
    prompt = messages_to_prompt(request.messages)

    # Explicit None checks: `x or default` would silently replace legitimate
    # falsy values — e.g. temperature=0.0 (greedy decoding) or max_tokens=0
    # would otherwise be coerced back to the defaults.
    max_tokens = request.max_tokens if request.max_tokens is not None else MAX_TOKENS
    temperature = request.temperature if request.temperature is not None else DEFAULT_TEMP
    top_p = request.top_p if request.top_p is not None else DEFAULT_TOP_P

    # Generate response
    try:
        output = llm(
            prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=request.stop or ["###", "\n\n\n"],
            echo=False
        )

        choice = output['choices'][0]
        generated_text = choice['text'].strip()
        # Report llama.cpp's actual finish reason ("stop" vs. "length")
        # instead of hard-coding "stop"; fall back if the field is absent.
        finish_reason = choice.get('finish_reason') or "stop"

        # Prefer the exact token counts llama.cpp reports in 'usage';
        # fall back to the 4-chars-per-token estimate only if missing.
        usage = output.get('usage') or {}
        prompt_tokens = usage.get('prompt_tokens', estimate_tokens(prompt))
        completion_tokens = usage.get('completion_tokens', estimate_tokens(generated_text))

        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=Message(role="assistant", content=generated_text),
                    finish_reason=finish_reason
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
244
+
245
@app.post("/v1/completions", response_model=CompletionResponse)
async def completions(request: CompletionRequest):
    """
    OpenAI-compatible completions endpoint.
    No rate limiting - designed for unlimited use.

    Returns 503 while the model is loading.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")

    # Explicit None checks: `x or default` would silently replace legitimate
    # falsy values such as temperature=0.0 (greedy decoding) or max_tokens=0.
    max_tokens = request.max_tokens if request.max_tokens is not None else MAX_TOKENS
    temperature = request.temperature if request.temperature is not None else DEFAULT_TEMP
    top_p = request.top_p if request.top_p is not None else DEFAULT_TOP_P

    try:
        output = llm(
            request.prompt,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stop=request.stop or [],
            echo=False
        )

        choice = output['choices'][0]
        generated_text = choice['text'].strip()
        # Real finish reason ("stop" / "length") rather than a hard-coded "stop".
        finish_reason = choice.get('finish_reason') or "stop"

        # Prefer llama.cpp's exact token accounting; fall back to estimation.
        usage = output.get('usage') or {}
        prompt_tokens = usage.get('prompt_tokens', estimate_tokens(request.prompt))
        completion_tokens = usage.get('completion_tokens', estimate_tokens(generated_text))

        return CompletionResponse(
            id=f"cmpl-{uuid.uuid4().hex[:8]}",
            created=int(time.time()),
            model=request.model,
            choices=[
                CompletionChoice(
                    index=0,
                    text=generated_text,
                    finish_reason=finish_reason
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation failed: {str(e)}")
289
+
290
# ============================================================================
# SIMPLE ENDPOINTS (for easier testing)
# ============================================================================
@app.post("/generate")
async def generate(prompt: str, max_tokens: int = 512):
    """Simple generation endpoint for quick testing.

    NOTE(review): scalar parameters on a POST route are read from the query
    string by FastAPI, not the request body — confirm that is the intended
    calling convention for this test endpoint.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")

    try:
        # Fixed temperature; defaults for all other sampling parameters.
        output = llm(prompt, max_tokens=max_tokens, temperature=0.7)
        return {
            "prompt": prompt,
            "response": output['choices'][0]['text'].strip(),
            "model": MODEL_NAME
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
308
+
309
# ============================================================================
# MAIN
# ============================================================================
if __name__ == "__main__":
    # Direct invocation (`python app.py`); deployments may instead launch
    # uvicorn externally against `app`.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=int(os.getenv("PORT", "7860")), # HF Spaces uses port 7860
        log_level="info"
    )