Dmitry Beresnev committed
Commit 2295174 · 1 Parent(s): 9345f95

Add automatic API documentation and in-memory model caching


feat: implement LRU model cache for instant switching
- Add ModelCache class with LRU eviction policy
- Support up to 2 concurrent models in memory
- Each model runs on separate port (8080+)
- Instant switching between cached models
- New GET /cache/info endpoint for cache status (client sketch below)
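A minimal client sketch of the cache-aware switching described above; the base URL (`http://localhost:7860`) is an assumption about where app.py is served, while the endpoints and response fields are the ones added in this commit:

```python
import requests

BASE_URL = "http://localhost:7860"  # assumed address of the running app.py service

# Switching models: a cache hit switches almost instantly, a cache miss
# loads the model once on a fresh port and then keeps it resident.
resp = requests.post(f"{BASE_URL}/switch-model", json={"model_name": "deepseek-coder"})
print(resp.json())  # e.g. {"message": "Switched to model: deepseek-coder (from cache)", "model": "deepseek-coder"}

# Inspect the LRU cache: up to MAX_CACHED_MODELS entries, each on its own port.
info = requests.get(f"{BASE_URL}/cache/info").json()
for entry in info["cached_models"]:
    print(entry["name"], entry["port"], entry["last_used"])
```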

feat: enhance API documentation with OpenAPI
- Add comprehensive endpoint descriptions and examples
- Enhanced Pydantic models with Field descriptions
- Add response models for better documentation
- Organize endpoints with tags (status, models, chat, documentation)
- Add GET /openapi.json endpoint to export the specification (export sketch below)
- Auto-generated docs available at /docs and /redoc
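A minimal sketch of exporting the generated specification for client generation or external tooling; again the base URL is an assumption, and the /openapi.json, /docs and /redoc routes are the ones added in this commit:

```python
import json
import requests

BASE_URL = "http://localhost:7860"  # assumed address of the running app.py service

# Download the generated OpenAPI 3 specification and save it to disk;
# the interactive documentation is served at /docs and /redoc.
spec = requests.get(f"{BASE_URL}/openapi.json").json()
with open("openapi.json", "w") as f:
    json.dump(spec, f, indent=2)

print(spec["info"]["title"], spec["info"]["version"])
```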

perf: eliminate model reload delays
- Cache hit: instant model switch (< 1s)
- Cache miss: load once, reuse multiple times
- Automatic cleanup on shutdown

docs: add detailed docstrings with usage examples
- Request/response format documentation
- Parameter descriptions and constraints
- Usage examples for all endpoints

Files changed (1)
  app.py  +635  -74
app.py CHANGED
@@ -1,15 +1,69 @@
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
 import subprocess
 import signal
 import os
-import requests
 import time
-from typing import Optional
+from typing import Optional, Dict
+from dataclasses import dataclass
+from collections import OrderedDict
+
+import requests
+from fastapi import FastAPI, HTTPException
+from fastapi.openapi.utils import get_openapi
+from pydantic import BaseModel, Field
 from duckduckgo_search import DDGS
 from bs4 import BeautifulSoup

-app = FastAPI()
+app = FastAPI(
+    title="AGI Multi-Model API",
+    description="""
+**Dynamic Multi-Model LLM API with Web Search Capabilities**
+
+This API provides:
+* 🔄 Dynamic model switching between multiple LLM models
+* 💬 OpenAI-compatible chat completions
+* 🌐 Web-augmented chat with real-time search
+* 📊 Model management and status monitoring
+
+## Available Models
+- **deepseek-chat** (default): General purpose conversational model
+- **mistral-7b**: Financial analysis and summarization
+- **openhermes-7b**: Advanced instruction following
+- **deepseek-coder**: Specialized coding assistance
+- **llama-7b**: Lightweight and fast responses
+
+## Quick Start
+1. Check available models: `GET /models`
+2. Switch model (optional): `POST /switch-model`
+3. Chat: `POST /v1/chat/completions`
+4. Chat with web search: `POST /v1/web-chat/completions`
+    """,
+    version="0.0.1.2025.12.04",
+    contact={
+        "name": "API Support",
+        "email": "support@example.com",
+    },
+    license_info={
+        "name": "MIT",
+    },
+    openapi_tags=[
+        {
+            "name": "status",
+            "description": "System status and health checks",
+        },
+        {
+            "name": "models",
+            "description": "Model management and switching operations",
+        },
+        {
+            "name": "chat",
+            "description": "Chat completion endpoints (OpenAI-compatible)",
+        },
+        {
+            "name": "documentation",
+            "description": "API documentation and OpenAPI specification",
+        },
+    ]
+)

 # Predefined list of available models (TheBloke only - verified, fits 18GB Space)
 AVAILABLE_MODELS = {
@@ -27,37 +81,279 @@ AVAILABLE_MODELS = {
     "llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
 }

+# Configuration
+MAX_CACHED_MODELS = 2 # Maximum number of models to keep in memory
+BASE_PORT = 8080 # Starting port for llama-server instances
+
+
+@dataclass
+class CachedModel:
+    """Represents a cached model with its process and connection info."""
+    name: str
+    model_id: str
+    process: subprocess.Popen
+    port: int
+    url: str
+    last_used: float
+
+
+class ModelCache:
+    """
+    In-memory LRU cache for loaded models.
+
+    Manages multiple llama-server processes, each on a different port.
+    Automatically evicts least recently used models when cache is full.
+    """
+
+    def __init__(self, max_size: int = MAX_CACHED_MODELS):
+        self.max_size = max_size
+        self.cache: OrderedDict[str, CachedModel] = OrderedDict()
+        self.port_counter = BASE_PORT
+        self.used_ports = set()
+
+    def _get_next_port(self) -> int:
+        """Get next available port for a model."""
+        while self.port_counter in self.used_ports:
+            self.port_counter += 1
+        port = self.port_counter
+        self.used_ports.add(port)
+        self.port_counter += 1
+        return port
+
+    def _release_port(self, port: int):
+        """Release a port back to the pool."""
+        self.used_ports.discard(port)
+
+    def _evict_lru(self):
+        """Evict the least recently used model."""
+        if not self.cache:
+            return
+
+        # Get the first (oldest) item
+        model_name, cached_model = self.cache.popitem(last=False)
+        print(f"Evicting model from cache: {model_name}")
+
+        # Stop the process
+        try:
+            if os.name != 'nt':
+                os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
+            else:
+                cached_model.process.terminate()
+            cached_model.process.wait(timeout=10)
+        except Exception as e:
+            print(f"Error stopping model {model_name}: {e}")
+            try:
+                if os.name != 'nt':
+                    os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
+                else:
+                    cached_model.process.kill()
+            except:
+                pass
+
+        # Release the port
+        self._release_port(cached_model.port)
+        time.sleep(1)
+
+    def get(self, model_name: str) -> Optional[CachedModel]:
+        """Get a model from cache, updating its last used time."""
+        if model_name in self.cache:
+            cached_model = self.cache[model_name]
+            cached_model.last_used = time.time()
+            # Move to end (most recently used)
+            self.cache.move_to_end(model_name)
+            print(f"Cache hit for model: {model_name}")
+            return cached_model
+        print(f"Cache miss for model: {model_name}")
+        return None
+
+    def put(self, model_name: str, model_id: str, process: subprocess.Popen, port: int):
+        """Add a model to the cache."""
+        # Evict if cache is full
+        while len(self.cache) >= self.max_size:
+            self._evict_lru()
+
+        url = f"http://localhost:{port}"
+        cached_model = CachedModel(
+            name=model_name,
+            model_id=model_id,
+            process=process,
+            port=port,
+            url=url,
+            last_used=time.time()
+        )
+        self.cache[model_name] = cached_model
+        print(f"Cached model: {model_name} on port {port}")
+
+    def clear(self):
+        """Clear all cached models."""
+        print("Clearing model cache...")
+        for model_name, cached_model in list(self.cache.items()):
+            try:
+                if os.name != 'nt':
+                    os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
+                else:
+                    cached_model.process.terminate()
+                cached_model.process.wait(timeout=10)
+            except:
+                try:
+                    if os.name != 'nt':
+                        os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
+                    else:
+                        cached_model.process.kill()
+                except:
+                    pass
+            self._release_port(cached_model.port)
+
+        self.cache.clear()
+
+    def get_cache_info(self) -> Dict:
+        """Get information about cached models."""
+        return {
+            "max_size": self.max_size,
+            "current_size": len(self.cache),
+            "cached_models": [
+                {
+                    "name": name,
+                    "port": model.port,
+                    "url": model.url,
+                    "last_used": model.last_used
+                }
+                for name, model in self.cache.items()
+            ]
+        }
+
+
 # Global state
 current_model = "deepseek-chat" # Default model
-llama_process: Optional[subprocess.Popen] = None
-LLAMA_SERVER_PORT = 8080
-LLAMA_SERVER_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
+model_cache = ModelCache(max_size=MAX_CACHED_MODELS)


 class ModelSwitchRequest(BaseModel):
-    model_name: str
+    """Request to switch the active LLM model."""
+    model_name: str = Field(
+        ...,
+        description="Name of the model to switch to",
+        examples=["deepseek-chat", "mistral-7b", "deepseek-coder"]
+    )
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {"model_name": "deepseek-coder"},
+                {"model_name": "mistral-7b"}
+            ]
+        }
+    }


 class ChatCompletionRequest(BaseModel):
-    messages: list[dict]
-    max_tokens: int = 256
-    temperature: float = 0.7
+    """OpenAI-compatible chat completion request."""
+    messages: list[dict] = Field(
+        ...,
+        description="Array of message objects with 'role' and 'content' fields",
+        examples=[[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ]]
+    )
+    max_tokens: int = Field(
+        default=256,
+        description="Maximum number of tokens to generate",
+        ge=1,
+        le=4096
+    )
+    temperature: float = Field(
+        default=0.7,
+        description="Sampling temperature (0.0 to 2.0). Higher values make output more random.",
+        ge=0.0,
+        le=2.0
+    )
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "messages": [
+                        {"role": "user", "content": "What is the capital of France?"}
+                    ],
+                    "max_tokens": 100,
+                    "temperature": 0.7
+                }
+            ]
+        }
+    }


 class WebChatRequest(BaseModel):
-    messages: list[dict]
-    max_tokens: int = 512
-    temperature: float = 0.7
-    max_search_results: int = 5
+    """Chat completion request with web search augmentation."""
+    messages: list[dict] = Field(
+        ...,
+        description="Array of message objects. The last user message is used for web search.",
+        examples=[[
+            {"role": "user", "content": "What are the latest developments in AI?"}
+        ]]
+    )
+    max_tokens: int = Field(
+        default=512,
+        description="Maximum number of tokens to generate",
+        ge=1,
+        le=4096
+    )
+    temperature: float = Field(
+        default=0.7,
+        description="Sampling temperature (0.0 to 2.0)",
+        ge=0.0,
+        le=2.0
+    )
+    max_search_results: int = Field(
+        default=5,
+        description="Maximum number of web search results to include in context",
+        ge=1,
+        le=10
+    )
+
+    model_config = {
+        "json_schema_extra": {
+            "examples": [
+                {
+                    "messages": [
+                        {"role": "user", "content": "What's the weather like today in San Francisco?"}
+                    ],
+                    "max_tokens": 512,
+                    "temperature": 0.7,
+                    "max_search_results": 5
+                }
+            ]
+        }
+    }
+
+
+class StatusResponse(BaseModel):
+    """API status response."""
+    status: str = Field(..., description="Current API status")
+    current_model: str = Field(..., description="Currently active model")
+    available_models: list[str] = Field(..., description="List of available models")


-def start_llama_server(model_id: str) -> subprocess.Popen:
-    """Start llama-server with specified model (optimized for speed)."""
+class ModelsResponse(BaseModel):
+    """Available models response."""
+    current_model: str = Field(..., description="Currently active model")
+    available_models: list[str] = Field(..., description="List of all available models")
+
+
+class ModelSwitchResponse(BaseModel):
+    """Model switch response."""
+    message: str = Field(..., description="Status message")
+    model: str = Field(..., description="New active model name")
+
+
+def start_llama_server(model_id: str, port: int) -> subprocess.Popen:
+    """Start llama-server with specified model on a specific port."""
     cmd = [
         "llama-server",
         "-hf", model_id,
         "--host", "0.0.0.0",
-        "--port", str(LLAMA_SERVER_PORT),
+        "--port", str(port),
         "-c", "2048", # Context size
         "-t", "4", # CPU threads (adjust based on cores)
         "-ngl", "0", # GPU layers (0 for CPU-only)
@@ -65,7 +361,7 @@ def start_llama_server(model_id: str) -> subprocess.Popen:
         "-b", "512", # Batch size
     ]

-    print(f"Starting llama-server with model: {model_id}")
+    print(f"Starting llama-server with model: {model_id} on port {port}")
     print("This may take 2-3 minutes to download and load the model...")

     process = subprocess.Popen(
@@ -79,6 +375,8 @@ def start_llama_server(model_id: str) -> subprocess.Popen:

     # Wait for server to be ready (increased timeout for model download)
     max_retries = 300 # 5 minutes
+    server_url = f"http://localhost:{port}"
+
     for i in range(max_retries):
         # Check if process died
         if process.poll() is not None:
@@ -89,14 +387,14 @@ def start_llama_server(model_id: str) -> subprocess.Popen:

         try:
             # Try root endpoint instead of /health
-            response = requests.get(f"{LLAMA_SERVER_URL}/", timeout=2)
+            response = requests.get(f"{server_url}/", timeout=2)
             if response.status_code in [200, 404]: # 404 is ok, means server is up
                 print(f"llama-server ready after {i+1} seconds")
                 return process
         except requests.exceptions.ConnectionError:
             # Server not ready yet
             pass
-        except Exception as e:
+        except Exception:
             # Other errors, keep waiting
             pass

@@ -105,62 +403,116 @@ def start_llama_server(model_id: str) -> subprocess.Popen:
     raise RuntimeError("llama-server failed to start within 5 minutes")


-def stop_llama_server():
-    """Stop the running llama-server."""
-    global llama_process
-    if llama_process:
-        print("Stopping llama-server...")
-        try:
-            if os.name != 'nt':
-                os.killpg(os.getpgid(llama_process.pid), signal.SIGTERM)
-            else:
-                llama_process.terminate()
-            llama_process.wait(timeout=10)
-        except:
-            if os.name != 'nt':
-                os.killpg(os.getpgid(llama_process.pid), signal.SIGKILL)
-            else:
-                llama_process.kill()
-        llama_process = None
-        time.sleep(2) # Give it time to fully shut down
-
-
 @app.on_event("startup")
 async def startup_event():
-    """Start with default model."""
-    global llama_process
+    """Start with default model and cache it."""
+    global current_model
     model_id = AVAILABLE_MODELS[current_model]
-    llama_process = start_llama_server(model_id)
+    port = model_cache._get_next_port()
+
+    process = start_llama_server(model_id, port)
+    model_cache.put(current_model, model_id, process, port)
+    print(f"Started with default model: {current_model}")


 @app.on_event("shutdown")
 async def shutdown_event():
-    """Clean shutdown."""
-    stop_llama_server()
+    """Clean shutdown - clear all cached models."""
+    model_cache.clear()


-@app.get("/")
+@app.get(
+    "/",
+    response_model=StatusResponse,
+    tags=["status"],
+    summary="API Status",
+    description="Get the current status of the API, including active model and available models."
+)
 async def root():
+    """
+    Returns the current status of the AGI Multi-Model API.
+
+    This endpoint provides information about:
+    - Current API status
+    - Currently active LLM model
+    - List of all available models
+    """
     return {
-        "status": "DeepSeek API with dynamic model switching",
+        "status": "AGI Multi-Model API with dynamic model switching and web search",
         "current_model": current_model,
         "available_models": list(AVAILABLE_MODELS.keys())
     }


-@app.get("/models")
+@app.get(
+    "/models",
+    response_model=ModelsResponse,
+    tags=["models"],
+    summary="List Available Models",
+    description="Get a list of all available LLM models and the currently active model."
+)
 async def list_models():
-    """List all available models."""
+    """
+    List all available LLM models.
+
+    Returns:
+    - current_model: The model currently in use
+    - available_models: Array of all available model names
+
+    Use this endpoint to see which models you can switch to.
+    """
     return {
         "current_model": current_model,
         "available_models": list(AVAILABLE_MODELS.keys())
     }


-@app.post("/switch-model")
+@app.post(
+    "/switch-model",
+    response_model=ModelSwitchResponse,
+    tags=["models"],
+    summary="Switch Active Model",
+    description="Switch to a different LLM model. Uses caching for instant switching to recently used models.",
+    responses={
+        200: {
+            "description": "Model switched successfully",
+            "content": {
+                "application/json": {
+                    "example": {
+                        "message": "Switched to model: deepseek-coder (from cache)",
+                        "model": "deepseek-coder"
+                    }
+                }
+            }
+        },
+        400: {
+            "description": "Invalid model name",
+            "content": {
+                "application/json": {
+                    "example": {
+                        "detail": "Model 'invalid-model' not found. Available: ['deepseek-chat', 'mistral-7b', ...]"
+                    }
+                }
+            }
+        }
+    }
+)
 async def switch_model(request: ModelSwitchRequest):
-    """Switch to a different model."""
-    global current_model, llama_process
+    """
+    Switch to a different LLM model with intelligent caching.
+
+    **How it works:**
+    1. Checks if requested model is already active (no switch needed)
+    2. Checks cache for the model (instant switch if cached)
+    3. If not cached, loads the model (may take 2-3 minutes)
+
+    **Caching:**
+    - Up to 2 models kept in memory
+    - LRU (Least Recently Used) eviction policy
+    - Each model runs on a separate port
+    - Instant switching between cached models
+    """
+    global current_model

     if request.model_name not in AVAILABLE_MODELS:
         raise HTTPException(
@@ -169,29 +521,103 @@ async def switch_model(request: ModelSwitchRequest):
         )

     if request.model_name == current_model:
-        return {"message": f"Already using model: {current_model}"}
+        return {"message": f"Already using model: {current_model}", "model": current_model}

-    # Stop current server
-    stop_llama_server()
+    # Try to get from cache
+    cached_model = model_cache.get(request.model_name)

-    # Start with new model
-    model_id = AVAILABLE_MODELS[request.model_name]
-    llama_process = start_llama_server(model_id)
-    current_model = request.model_name
+    if cached_model:
+        # Model is cached, instant switch
+        current_model = request.model_name
+        return {
+            "message": f"Switched to model: {current_model} (from cache)",
+            "model": current_model
+        }

-    return {
-        "message": f"Switched to model: {current_model}",
-        "model": current_model
-    }
+    # Model not cached, need to load it
+    model_id = AVAILABLE_MODELS[request.model_name]
+    port = model_cache._get_next_port()

+    try:
+        process = start_llama_server(model_id, port)
+        model_cache.put(request.model_name, model_id, process, port)
+        current_model = request.model_name

-@app.post("/v1/chat/completions")
+        return {
+            "message": f"Switched to model: {current_model} (newly loaded)",
+            "model": current_model
+        }
+    except Exception as e:
+        # Release port if failed
+        model_cache._release_port(port)
+        raise HTTPException(status_code=500, detail=f"Failed to load model: {str(e)}")
+
+
+@app.post(
+    "/v1/chat/completions",
+    tags=["chat"],
+    summary="Chat Completions",
+    description="OpenAI-compatible chat completions endpoint. Send messages and get AI-generated responses.",
+    responses={
+        200: {
+            "description": "Successful response",
+            "content": {
+                "application/json": {
+                    "example": {
+                        "id": "chatcmpl-123",
+                        "object": "chat.completion",
+                        "created": 1677652288,
+                        "model": "deepseek-chat",
+                        "choices": [{
+                            "index": 0,
+                            "message": {
+                                "role": "assistant",
+                                "content": "Hello! How can I help you today?"
+                            },
+                            "finish_reason": "stop"
+                        }]
+                    }
+                }
+            }
+        },
+        500: {
+            "description": "LLM server error"
+        }
+    }
+)
 async def chat_completions(request: ChatCompletionRequest):
-    """OpenAI-compatible chat completions endpoint."""
+    """
+    OpenAI-compatible chat completions endpoint.
+
+    This endpoint forwards your request to the currently active LLM model
+    and returns the response in OpenAI-compatible format.
+
+    **Message Format:**
+    ```json
+    {
+        "messages": [
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Hello!"}
+        ],
+        "max_tokens": 256,
+        "temperature": 0.7
+    }
+    ```
+
+    **Supported Roles:**
+    - `system`: Sets the behavior of the assistant
+    - `user`: User messages
+    - `assistant`: Assistant responses (for multi-turn conversations)
+    """
     try:
+        # Get current model from cache
+        cached_model = model_cache.get(current_model)
+        if not cached_model:
+            raise HTTPException(status_code=500, detail="Current model not loaded")
+
         # Forward to llama-server
         response = requests.post(
-            f"{LLAMA_SERVER_URL}/v1/chat/completions",
+            f"{cached_model.url}/v1/chat/completions",
             json={
                 "messages": request.messages,
                 "max_tokens": request.max_tokens,
@@ -237,13 +663,74 @@ def format_search_context(query: str, search_results: list[dict]) -> str:
     return context


-@app.post("/v1/web-chat/completions")
+@app.post(
+    "/v1/web-chat/completions",
+    tags=["chat"],
+    summary="Web-Augmented Chat Completions",
+    description="Chat completions enhanced with real-time web search. The last user message is used as a search query.",
+    responses={
+        200: {
+            "description": "Successful response with web search metadata",
+            "content": {
+                "application/json": {
+                    "example": {
+                        "id": "chatcmpl-123",
+                        "object": "chat.completion",
+                        "created": 1677652288,
+                        "model": "deepseek-chat",
+                        "choices": [{
+                            "index": 0,
+                            "message": {
+                                "role": "assistant",
+                                "content": "Based on recent search results, here's what I found..."
+                            },
+                            "finish_reason": "stop"
+                        }],
+                        "web_search": {
+                            "query": "latest AI developments",
+                            "results_count": 5,
+                            "sources": ["https://example.com/1", "https://example.com/2"]
+                        }
+                    }
+                }
+            }
+        },
+        400: {
+            "description": "No user message found"
+        },
+        500: {
+            "description": "LLM server or search error"
+        }
+    }
+)
 async def web_chat_completions(request: WebChatRequest):
     """
-    Chat completions with web search augmentation.
+    Chat completions with real-time web search augmentation.
+
+    **How it works:**
+    1. Extracts the last user message as the search query
+    2. Performs a web search using DuckDuckGo
+    3. Injects search results into the LLM context
+    4. Returns the AI response with source citations
+
+    **Use cases:**
+    - Current events and news
+    - Recent information beyond the model's training data
+    - Fact-checking with web sources
+    - Research with live data
+
+    **Example:**
+    ```json
+    {
+        "messages": [
+            {"role": "user", "content": "What's the latest news about SpaceX?"}
+        ],
+        "max_tokens": 512,
+        "max_search_results": 5
+    }
+    ```

-    The last user message is used as the search query.
-    Search results are injected into the context before sending to the LLM.
+    The response includes a `web_search` field with metadata about sources used.
     """
     try:
         # Get the last user message as search query
@@ -277,9 +764,14 @@ Always cite sources when using information from the search results."""
         # Insert system message before the last user message
         augmented_messages.insert(-1, system_prompt)

+        # Get current model from cache
+        cached_model = model_cache.get(current_model)
+        if not cached_model:
+            raise HTTPException(status_code=500, detail="Current model not loaded")
+
         # Forward to llama-server with augmented context
         response = requests.post(
-            f"{LLAMA_SERVER_URL}/v1/chat/completions",
+            f"{cached_model.url}/v1/chat/completions",
             json={
                 "messages": augmented_messages,
                 "max_tokens": request.max_tokens,
@@ -303,4 +795,73 @@ Always cite sources when using information from the search results."""
     except requests.exceptions.RequestException as e:
         raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
+
+
+@app.get(
+    "/cache/info",
+    tags=["models"],
+    summary="Get Cache Information",
+    description="Returns information about the model cache, including cached models and cache statistics."
+)
+async def get_cache_info():
+    """
+    Get information about the in-memory model cache.
+
+    Returns:
+    - max_size: Maximum number of models that can be cached
+    - current_size: Current number of cached models
+    - cached_models: List of currently cached models with their metadata
+
+    **Example Response:**
+    ```json
+    {
+        "max_size": 2,
+        "current_size": 2,
+        "cached_models": [
+            {
+                "name": "deepseek-chat",
+                "port": 8080,
+                "url": "http://localhost:8080",
+                "last_used": 1234567890.123
+            },
+            {
+                "name": "mistral-7b",
+                "port": 8081,
+                "url": "http://localhost:8081",
+                "last_used": 1234567895.456
+            }
+        ]
+    }
+    ```
+    """
+    return model_cache.get_cache_info()
+
+
+@app.get(
+    "/openapi.json",
+    tags=["documentation"],
+    summary="Get OpenAPI Specification",
+    description="Returns the complete OpenAPI 3.0 specification for this API in JSON format.",
+    include_in_schema=False
+)
+async def get_openapi_spec():
+    """
+    Export the OpenAPI specification for this API.
+
+    This endpoint returns the complete OpenAPI 3.0 specification that can be used with:
+    - API documentation tools (Swagger UI, ReDoc)
+    - Code generators (openapi-generator, swagger-codegen)
+    - API testing tools (Postman, Insomnia)
+    - SDK generation
+
+    Save this to a file and use it with tools like:
+    ```bash
+    # Generate Python client
+    openapi-generator generate -i openapi.json -g python -o ./client
+
+    # Generate TypeScript client
+    openapi-generator generate -i openapi.json -g typescript-fetch -o ./client
+    ```
+    """
+    return app.openapi()