Spaces:
Running
Running
Dmitry Beresnev
committed on
Commit
·
7763bf4
1
Parent(s):
c384ef1
fix gitignore, app and logger, etc
Browse files- .gitignore +133 -0
- Dockerfile +2 -1
- app.py +436 -282
- logger.py +164 -0
- pyproject.toml +4 -3
.gitignore
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
share/python-wheels/
|
| 20 |
+
*.egg-info/
|
| 21 |
+
.installed.cfg
|
| 22 |
+
*.egg
|
| 23 |
+
MANIFEST
|
| 24 |
+
pip-log.txt
|
| 25 |
+
pip-delete-this-directory.txt
|
| 26 |
+
|
| 27 |
+
# Virtual Environment
|
| 28 |
+
.venv/
|
| 29 |
+
venv/
|
| 30 |
+
ENV/
|
| 31 |
+
env/
|
| 32 |
+
.virtualenv
|
| 33 |
+
|
| 34 |
+
# PyInstaller
|
| 35 |
+
*.manifest
|
| 36 |
+
*.spec
|
| 37 |
+
|
| 38 |
+
# Unit test / coverage reports
|
| 39 |
+
htmlcov/
|
| 40 |
+
.tox/
|
| 41 |
+
.nox/
|
| 42 |
+
.coverage
|
| 43 |
+
.coverage.*
|
| 44 |
+
.cache
|
| 45 |
+
nosetests.xml
|
| 46 |
+
coverage.xml
|
| 47 |
+
*.cover
|
| 48 |
+
*.py,cover
|
| 49 |
+
.hypothesis/
|
| 50 |
+
.pytest_cache/
|
| 51 |
+
cover/
|
| 52 |
+
|
| 53 |
+
# IDEs
|
| 54 |
+
.idea/
|
| 55 |
+
.vscode/
|
| 56 |
+
*.swp
|
| 57 |
+
*.swo
|
| 58 |
+
*~
|
| 59 |
+
.DS_Store
|
| 60 |
+
|
| 61 |
+
# Jupyter Notebook
|
| 62 |
+
.ipynb_checkpoints
|
| 63 |
+
*.ipynb
|
| 64 |
+
|
| 65 |
+
# PyCharm
|
| 66 |
+
.idea/
|
| 67 |
+
*.iml
|
| 68 |
+
*.iws
|
| 69 |
+
|
| 70 |
+
# Logs
|
| 71 |
+
*.log
|
| 72 |
+
logs/
|
| 73 |
+
agi.log
|
| 74 |
+
|
| 75 |
+
# Environment variables
|
| 76 |
+
.env
|
| 77 |
+
.env.local
|
| 78 |
+
.env.*.local
|
| 79 |
+
*.env
|
| 80 |
+
|
| 81 |
+
# Database
|
| 82 |
+
*.db
|
| 83 |
+
*.sqlite
|
| 84 |
+
*.sqlite3
|
| 85 |
+
|
| 86 |
+
# Model files (often large)
|
| 87 |
+
*.bin
|
| 88 |
+
*.gguf
|
| 89 |
+
*.safetensors
|
| 90 |
+
models/
|
| 91 |
+
checkpoints/
|
| 92 |
+
|
| 93 |
+
# Docker
|
| 94 |
+
.dockerignore
|
| 95 |
+
docker-compose.override.yml
|
| 96 |
+
|
| 97 |
+
# OS
|
| 98 |
+
.DS_Store
|
| 99 |
+
Thumbs.db
|
| 100 |
+
Desktop.ini
|
| 101 |
+
$RECYCLE.BIN/
|
| 102 |
+
*.cab
|
| 103 |
+
*.msi
|
| 104 |
+
*.msix
|
| 105 |
+
*.msm
|
| 106 |
+
*.msp
|
| 107 |
+
*.lnk
|
| 108 |
+
|
| 109 |
+
# mypy
|
| 110 |
+
.mypy_cache/
|
| 111 |
+
.dmypy.json
|
| 112 |
+
dmypy.json
|
| 113 |
+
|
| 114 |
+
# Pyre type checker
|
| 115 |
+
.pyre/
|
| 116 |
+
|
| 117 |
+
# pytype static type analyzer
|
| 118 |
+
.pytype/
|
| 119 |
+
|
| 120 |
+
# Cython debug symbols
|
| 121 |
+
cython_debug/
|
| 122 |
+
|
| 123 |
+
# Temporary files
|
| 124 |
+
*.tmp
|
| 125 |
+
*.temp
|
| 126 |
+
tmp/
|
| 127 |
+
temp/
|
| 128 |
+
|
| 129 |
+
# Project-specific exclusions
|
| 130 |
+
*.minimal
|
| 131 |
+
tests/
|
| 132 |
+
*.md
|
| 133 |
+
docs/
|
Dockerfile
CHANGED
|
@@ -49,7 +49,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
| 49 |
&& rm -rf /var/lib/apt/lists/*
|
| 50 |
|
| 51 |
# Install Python packages
|
| 52 |
-
RUN pip3 install --no-cache-dir fastapi uvicorn
|
| 53 |
|
| 54 |
# Create non-root user
|
| 55 |
RUN useradd -m -u 1000 user && \
|
|
@@ -58,6 +58,7 @@ RUN useradd -m -u 1000 user && \
|
|
| 58 |
|
| 59 |
# Copy application code
|
| 60 |
COPY --chown=user:user app.py /home/user/app.py
|
|
|
|
| 61 |
|
| 62 |
USER user
|
| 63 |
WORKDIR /home/user
|
|
|
|
| 49 |
&& rm -rf /var/lib/apt/lists/*
|
| 50 |
|
| 51 |
# Install Python packages
|
| 52 |
+
RUN pip3 install --no-cache-dir fastapi uvicorn aiohttp pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages
|
| 53 |
|
| 54 |
# Create non-root user
|
| 55 |
RUN useradd -m -u 1000 user && \
|
|
|
|
| 58 |
|
| 59 |
# Copy application code
|
| 60 |
COPY --chown=user:user app.py /home/user/app.py
|
| 61 |
+
COPY --chown=user:user logger.py /home/user/logger.py
|
| 62 |
|
| 63 |
USER user
|
| 64 |
WORKDIR /home/user
|
app.py
CHANGED
|
@@ -2,27 +2,35 @@ import subprocess
|
|
| 2 |
import signal
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
-
|
| 6 |
-
from
|
|
|
|
| 7 |
from collections import OrderedDict
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
import
|
| 10 |
-
from fastapi import FastAPI, HTTPException
|
| 11 |
from fastapi.openapi.utils import get_openapi
|
| 12 |
from pydantic import BaseModel, Field
|
| 13 |
from duckduckgo_search import DDGS
|
| 14 |
from bs4 import BeautifulSoup
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
app = FastAPI(
|
| 17 |
title="AGI Multi-Model API",
|
| 18 |
description="""
|
| 19 |
-
**Dynamic Multi-Model LLM API with Web Search
|
| 20 |
|
| 21 |
This API provides:
|
| 22 |
-
* 🔄 Dynamic model switching
|
| 23 |
* 💬 OpenAI-compatible chat completions
|
| 24 |
* 🌐 Web-augmented chat with real-time search
|
| 25 |
-
* 📊 Model management and
|
|
|
|
| 26 |
|
| 27 |
## Available Models
|
| 28 |
- **deepseek-chat** (default): General purpose conversational model
|
|
@@ -31,13 +39,22 @@ app = FastAPI(
|
|
| 31 |
- **deepseek-coder**: Specialized coding assistance
|
| 32 |
- **llama-7b**: Lightweight and fast responses
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
## Quick Start
|
| 35 |
1. Check available models: `GET /models`
|
| 36 |
2. Switch model (optional): `POST /switch-model`
|
| 37 |
3. Chat: `POST /v1/chat/completions`
|
| 38 |
4. Chat with web search: `POST /v1/web-chat/completions`
|
|
|
|
| 39 |
""",
|
| 40 |
-
version="0.0.
|
| 41 |
contact={
|
| 42 |
"name": "API Support",
|
| 43 |
"email": "support@example.com",
|
|
@@ -58,6 +75,10 @@ app = FastAPI(
|
|
| 58 |
"name": "chat",
|
| 59 |
"description": "Chat completion endpoints (OpenAI-compatible)",
|
| 60 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
{
|
| 62 |
"name": "documentation",
|
| 63 |
"description": "API documentation and OpenAPI specification",
|
|
@@ -81,9 +102,12 @@ AVAILABLE_MODELS = {
|
|
| 81 |
"llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
|
| 82 |
}
|
| 83 |
|
| 84 |
-
# Configuration
|
| 85 |
-
MAX_CACHED_MODELS = 2
|
| 86 |
-
BASE_PORT = 8080
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
|
| 89 |
@dataclass
|
|
@@ -95,14 +119,105 @@ class CachedModel:
|
|
| 95 |
port: int
|
| 96 |
url: str
|
| 97 |
last_used: float
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
|
| 100 |
class ModelCache:
|
| 101 |
"""
|
| 102 |
-
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
| 106 |
"""
|
| 107 |
|
| 108 |
def __init__(self, max_size: int = MAX_CACHED_MODELS):
|
|
@@ -110,6 +225,8 @@ class ModelCache:
|
|
| 110 |
self.cache: OrderedDict[str, CachedModel] = OrderedDict()
|
| 111 |
self.port_counter = BASE_PORT
|
| 112 |
self.used_ports = set()
|
|
|
|
|
|
|
| 113 |
|
| 114 |
def _get_next_port(self) -> int:
|
| 115 |
"""Get next available port for a model."""
|
|
@@ -124,14 +241,14 @@ class ModelCache:
|
|
| 124 |
"""Release a port back to the pool."""
|
| 125 |
self.used_ports.discard(port)
|
| 126 |
|
| 127 |
-
def _evict_lru(self):
|
| 128 |
"""Evict the least recently used model."""
|
| 129 |
if not self.cache:
|
| 130 |
return
|
| 131 |
|
| 132 |
# Get the first (oldest) item
|
| 133 |
model_name, cached_model = self.cache.popitem(last=False)
|
| 134 |
-
|
| 135 |
|
| 136 |
# Stop the process
|
| 137 |
try:
|
|
@@ -139,20 +256,23 @@ class ModelCache:
|
|
| 139 |
os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
|
| 140 |
else:
|
| 141 |
cached_model.process.terminate()
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
if os.name != 'nt':
|
| 147 |
os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
|
| 148 |
else:
|
| 149 |
cached_model.process.kill()
|
| 150 |
-
|
| 151 |
-
|
| 152 |
|
| 153 |
# Release the port
|
| 154 |
self._release_port(cached_model.port)
|
| 155 |
-
time.sleep(1)
|
| 156 |
|
| 157 |
def get(self, model_name: str) -> Optional[CachedModel]:
|
| 158 |
"""Get a model from cache, updating its last used time."""
|
|
@@ -161,16 +281,16 @@ class ModelCache:
|
|
| 161 |
cached_model.last_used = time.time()
|
| 162 |
# Move to end (most recently used)
|
| 163 |
self.cache.move_to_end(model_name)
|
| 164 |
-
|
| 165 |
return cached_model
|
| 166 |
-
|
| 167 |
return None
|
| 168 |
|
| 169 |
-
def put(self, model_name: str, model_id: str, process: subprocess.Popen, port: int):
|
| 170 |
"""Add a model to the cache."""
|
| 171 |
# Evict if cache is full
|
| 172 |
while len(self.cache) >= self.max_size:
|
| 173 |
-
self._evict_lru()
|
| 174 |
|
| 175 |
url = f"http://localhost:{port}"
|
| 176 |
cached_model = CachedModel(
|
|
@@ -179,21 +299,27 @@ class ModelCache:
|
|
| 179 |
process=process,
|
| 180 |
port=port,
|
| 181 |
url=url,
|
| 182 |
-
last_used=time.time()
|
|
|
|
| 183 |
)
|
| 184 |
self.cache[model_name] = cached_model
|
| 185 |
-
|
| 186 |
|
| 187 |
-
def clear(self):
|
| 188 |
"""Clear all cached models."""
|
| 189 |
-
|
| 190 |
for model_name, cached_model in list(self.cache.items()):
|
| 191 |
try:
|
| 192 |
if os.name != 'nt':
|
| 193 |
os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
|
| 194 |
else:
|
| 195 |
cached_model.process.terminate()
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
except:
|
| 198 |
try:
|
| 199 |
if os.name != 'nt':
|
|
@@ -216,7 +342,10 @@ class ModelCache:
|
|
| 216 |
"name": name,
|
| 217 |
"port": model.port,
|
| 218 |
"url": model.url,
|
| 219 |
-
"last_used": model.last_used
|
|
|
|
|
|
|
|
|
|
| 220 |
}
|
| 221 |
for name, model in self.cache.items()
|
| 222 |
]
|
|
@@ -226,6 +355,11 @@ class ModelCache:
|
|
| 226 |
# Global state
|
| 227 |
current_model = "deepseek-chat" # Default model
|
| 228 |
model_cache = ModelCache(max_size=MAX_CACHED_MODELS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
|
| 231 |
class ModelSwitchRequest(BaseModel):
|
|
@@ -347,22 +481,28 @@ class ModelSwitchResponse(BaseModel):
|
|
| 347 |
model: str = Field(..., description="New active model name")
|
| 348 |
|
| 349 |
|
| 350 |
-
def start_llama_server(model_id: str, port: int) -> subprocess.Popen:
|
| 351 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
cmd = [
|
| 353 |
"llama-server",
|
| 354 |
"-hf", model_id,
|
| 355 |
"--host", "0.0.0.0",
|
| 356 |
"--port", str(port),
|
| 357 |
"-c", "2048", # Context size
|
| 358 |
-
"-t", "4", # CPU threads
|
| 359 |
"-ngl", "0", # GPU layers (0 for CPU-only)
|
| 360 |
-
"--cont-batching", # Enable continuous batching
|
| 361 |
"-b", "512", # Batch size
|
| 362 |
]
|
| 363 |
|
| 364 |
-
|
| 365 |
-
print("This may take 2-3 minutes to download and load the model...")
|
| 366 |
|
| 367 |
process = subprocess.Popen(
|
| 368 |
cmd,
|
|
@@ -373,52 +513,108 @@ def start_llama_server(model_id: str, port: int) -> subprocess.Popen:
|
|
| 373 |
bufsize=1
|
| 374 |
)
|
| 375 |
|
| 376 |
-
# Wait for server to be ready
|
| 377 |
-
max_retries = 300 # 5 minutes
|
| 378 |
server_url = f"http://localhost:{port}"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
-
|
| 381 |
# Check if process died
|
| 382 |
if process.poll() is not None:
|
| 383 |
stdout, _ = process.communicate()
|
| 384 |
-
|
| 385 |
-
|
| 386 |
raise RuntimeError("llama-server process died")
|
| 387 |
|
| 388 |
try:
|
| 389 |
-
#
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
|
|
|
| 395 |
# Server not ready yet
|
| 396 |
pass
|
| 397 |
-
except Exception:
|
| 398 |
-
# Other errors, keep waiting
|
| 399 |
-
pass
|
| 400 |
|
| 401 |
-
|
|
|
|
|
|
|
|
|
|
| 402 |
|
| 403 |
raise RuntimeError("llama-server failed to start within 5 minutes")
|
| 404 |
|
| 405 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
@app.on_event("startup")
|
| 407 |
async def startup_event():
|
| 408 |
-
"""
|
| 409 |
-
global current_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
model_id = AVAILABLE_MODELS[current_model]
|
| 411 |
port = model_cache._get_next_port()
|
| 412 |
|
| 413 |
-
process = start_llama_server(model_id, port)
|
| 414 |
-
model_cache.put(current_model, model_id, process, port)
|
| 415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
|
| 417 |
|
| 418 |
@app.on_event("shutdown")
|
| 419 |
async def shutdown_event():
|
| 420 |
-
"""Clean shutdown - clear
|
| 421 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
|
| 424 |
@app.get(
|
|
@@ -438,12 +634,28 @@ async def root():
|
|
| 438 |
- List of all available models
|
| 439 |
"""
|
| 440 |
return {
|
| 441 |
-
"status": "AGI Multi-Model API
|
| 442 |
"current_model": current_model,
|
| 443 |
"available_models": list(AVAILABLE_MODELS.keys())
|
| 444 |
}
|
| 445 |
|
| 446 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 447 |
@app.get(
|
| 448 |
"/models",
|
| 449 |
response_model=ModelsResponse,
|
|
@@ -458,8 +670,6 @@ async def list_models():
|
|
| 458 |
Returns:
|
| 459 |
- current_model: The model currently in use
|
| 460 |
- available_models: Array of all available model names
|
| 461 |
-
|
| 462 |
-
Use this endpoint to see which models you can switch to.
|
| 463 |
"""
|
| 464 |
return {
|
| 465 |
"current_model": current_model,
|
|
@@ -472,45 +682,17 @@ async def list_models():
|
|
| 472 |
response_model=ModelSwitchResponse,
|
| 473 |
tags=["models"],
|
| 474 |
summary="Switch Active Model",
|
| 475 |
-
description="Switch to a different LLM model
|
| 476 |
-
responses={
|
| 477 |
-
200: {
|
| 478 |
-
"description": "Model switched successfully",
|
| 479 |
-
"content": {
|
| 480 |
-
"application/json": {
|
| 481 |
-
"example": {
|
| 482 |
-
"message": "Switched to model: deepseek-coder (from cache)",
|
| 483 |
-
"model": "deepseek-coder"
|
| 484 |
-
}
|
| 485 |
-
}
|
| 486 |
-
}
|
| 487 |
-
},
|
| 488 |
-
400: {
|
| 489 |
-
"description": "Invalid model name",
|
| 490 |
-
"content": {
|
| 491 |
-
"application/json": {
|
| 492 |
-
"example": {
|
| 493 |
-
"detail": "Model 'invalid-model' not found. Available: ['deepseek-chat', 'mistral-7b', ...]"
|
| 494 |
-
}
|
| 495 |
-
}
|
| 496 |
-
}
|
| 497 |
-
}
|
| 498 |
-
}
|
| 499 |
)
|
| 500 |
async def switch_model(request: ModelSwitchRequest):
|
| 501 |
"""
|
| 502 |
Switch to a different LLM model with intelligent caching.
|
| 503 |
|
| 504 |
-
**
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
**Caching:**
|
| 510 |
-
- Up to 2 models kept in memory
|
| 511 |
-
- LRU (Least Recently Used) eviction policy
|
| 512 |
-
- Each model runs on a separate port
|
| 513 |
-
- Instant switching between cached models
|
| 514 |
"""
|
| 515 |
global current_model
|
| 516 |
|
|
@@ -523,28 +705,32 @@ async def switch_model(request: ModelSwitchRequest):
|
|
| 523 |
if request.model_name == current_model:
|
| 524 |
return {"message": f"Already using model: {current_model}", "model": current_model}
|
| 525 |
|
|
|
|
|
|
|
| 526 |
# Try to get from cache
|
| 527 |
cached_model = model_cache.get(request.model_name)
|
| 528 |
|
| 529 |
if cached_model:
|
| 530 |
# Model is cached, instant switch
|
|
|
|
| 531 |
current_model = request.model_name
|
| 532 |
return {
|
| 533 |
-
"message": f"Switched to model: {current_model} (from cache)",
|
| 534 |
"model": current_model
|
| 535 |
}
|
| 536 |
|
| 537 |
# Model not cached, need to load it
|
|
|
|
| 538 |
model_id = AVAILABLE_MODELS[request.model_name]
|
| 539 |
port = model_cache._get_next_port()
|
| 540 |
|
| 541 |
try:
|
| 542 |
-
process = start_llama_server(model_id, port)
|
| 543 |
-
model_cache.put(request.model_name, model_id, process, port)
|
| 544 |
current_model = request.model_name
|
| 545 |
|
| 546 |
return {
|
| 547 |
-
"message": f"Switched to model: {current_model} (
|
| 548 |
"model": current_model
|
| 549 |
}
|
| 550 |
except Exception as e:
|
|
@@ -557,88 +743,80 @@ async def switch_model(request: ModelSwitchRequest):
|
|
| 557 |
"/v1/chat/completions",
|
| 558 |
tags=["chat"],
|
| 559 |
summary="Chat Completions",
|
| 560 |
-
description="OpenAI-compatible chat completions
|
| 561 |
-
responses={
|
| 562 |
-
200: {
|
| 563 |
-
"description": "Successful response",
|
| 564 |
-
"content": {
|
| 565 |
-
"application/json": {
|
| 566 |
-
"example": {
|
| 567 |
-
"id": "chatcmpl-123",
|
| 568 |
-
"object": "chat.completion",
|
| 569 |
-
"created": 1677652288,
|
| 570 |
-
"model": "deepseek-chat",
|
| 571 |
-
"choices": [{
|
| 572 |
-
"index": 0,
|
| 573 |
-
"message": {
|
| 574 |
-
"role": "assistant",
|
| 575 |
-
"content": "Hello! How can I help you today?"
|
| 576 |
-
},
|
| 577 |
-
"finish_reason": "stop"
|
| 578 |
-
}]
|
| 579 |
-
}
|
| 580 |
-
}
|
| 581 |
-
}
|
| 582 |
-
},
|
| 583 |
-
500: {
|
| 584 |
-
"description": "LLM server error"
|
| 585 |
-
}
|
| 586 |
-
}
|
| 587 |
)
|
| 588 |
async def chat_completions(request: ChatCompletionRequest):
|
| 589 |
"""
|
| 590 |
-
OpenAI-compatible chat completions
|
| 591 |
-
|
| 592 |
-
This endpoint forwards your request to the currently active LLM model
|
| 593 |
-
and returns the response in OpenAI-compatible format.
|
| 594 |
-
|
| 595 |
-
**Message Format:**
|
| 596 |
-
```json
|
| 597 |
-
{
|
| 598 |
-
"messages": [
|
| 599 |
-
{"role": "system", "content": "You are a helpful assistant."},
|
| 600 |
-
{"role": "user", "content": "Hello!"}
|
| 601 |
-
],
|
| 602 |
-
"max_tokens": 256,
|
| 603 |
-
"temperature": 0.7
|
| 604 |
-
}
|
| 605 |
-
```
|
| 606 |
|
| 607 |
-
**
|
| 608 |
-
-
|
| 609 |
-
-
|
| 610 |
-
-
|
| 611 |
"""
|
| 612 |
try:
|
|
|
|
|
|
|
| 613 |
# Get current model from cache
|
| 614 |
cached_model = model_cache.get(current_model)
|
| 615 |
if not cached_model:
|
| 616 |
raise HTTPException(status_code=500, detail="Current model not loaded")
|
| 617 |
|
| 618 |
-
# Forward to llama-server
|
| 619 |
-
|
| 620 |
f"{cached_model.url}/v1/chat/completions",
|
| 621 |
json={
|
| 622 |
"messages": request.messages,
|
| 623 |
"max_tokens": request.max_tokens,
|
| 624 |
"temperature": request.temperature,
|
| 625 |
-
}
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
|
| 632 |
|
| 633 |
|
| 634 |
-
def
|
| 635 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
try:
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 640 |
except Exception as e:
|
| 641 |
-
|
| 642 |
return []
|
| 643 |
|
| 644 |
|
|
@@ -667,70 +845,16 @@ def format_search_context(query: str, search_results: list[dict]) -> str:
|
|
| 667 |
"/v1/web-chat/completions",
|
| 668 |
tags=["chat"],
|
| 669 |
summary="Web-Augmented Chat Completions",
|
| 670 |
-
description="Chat completions
|
| 671 |
-
responses={
|
| 672 |
-
200: {
|
| 673 |
-
"description": "Successful response with web search metadata",
|
| 674 |
-
"content": {
|
| 675 |
-
"application/json": {
|
| 676 |
-
"example": {
|
| 677 |
-
"id": "chatcmpl-123",
|
| 678 |
-
"object": "chat.completion",
|
| 679 |
-
"created": 1677652288,
|
| 680 |
-
"model": "deepseek-chat",
|
| 681 |
-
"choices": [{
|
| 682 |
-
"index": 0,
|
| 683 |
-
"message": {
|
| 684 |
-
"role": "assistant",
|
| 685 |
-
"content": "Based on recent search results, here's what I found..."
|
| 686 |
-
},
|
| 687 |
-
"finish_reason": "stop"
|
| 688 |
-
}],
|
| 689 |
-
"web_search": {
|
| 690 |
-
"query": "latest AI developments",
|
| 691 |
-
"results_count": 5,
|
| 692 |
-
"sources": ["https://example.com/1", "https://example.com/2"]
|
| 693 |
-
}
|
| 694 |
-
}
|
| 695 |
-
}
|
| 696 |
-
}
|
| 697 |
-
},
|
| 698 |
-
400: {
|
| 699 |
-
"description": "No user message found"
|
| 700 |
-
},
|
| 701 |
-
500: {
|
| 702 |
-
"description": "LLM server or search error"
|
| 703 |
-
}
|
| 704 |
-
}
|
| 705 |
)
|
| 706 |
async def web_chat_completions(request: WebChatRequest):
|
| 707 |
"""
|
| 708 |
-
Chat completions with
|
| 709 |
-
|
| 710 |
-
**How it works:**
|
| 711 |
-
1. Extracts the last user message as the search query
|
| 712 |
-
2. Performs a web search using DuckDuckGo
|
| 713 |
-
3. Injects search results into the LLM context
|
| 714 |
-
4. Returns the AI response with source citations
|
| 715 |
-
|
| 716 |
-
**Use cases:**
|
| 717 |
-
- Current events and news
|
| 718 |
-
- Recent information beyond the model's training data
|
| 719 |
-
- Fact-checking with web sources
|
| 720 |
-
- Research with live data
|
| 721 |
-
|
| 722 |
-
**Example:**
|
| 723 |
-
```json
|
| 724 |
-
{
|
| 725 |
-
"messages": [
|
| 726 |
-
{"role": "user", "content": "What's the latest news about SpaceX?"}
|
| 727 |
-
],
|
| 728 |
-
"max_tokens": 512,
|
| 729 |
-
"max_search_results": 5
|
| 730 |
-
}
|
| 731 |
-
```
|
| 732 |
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
| 734 |
"""
|
| 735 |
try:
|
| 736 |
# Get the last user message as search query
|
|
@@ -740,9 +864,9 @@ async def web_chat_completions(request: WebChatRequest):
|
|
| 740 |
|
| 741 |
search_query = user_messages[-1].get("content", "")
|
| 742 |
|
| 743 |
-
# Perform web search
|
| 744 |
-
|
| 745 |
-
search_results =
|
| 746 |
|
| 747 |
# Format search results as context
|
| 748 |
web_context = format_search_context(search_query, search_results)
|
|
@@ -761,7 +885,6 @@ Use the above search results to provide accurate, up-to-date information in your
|
|
| 761 |
Always cite sources when using information from the search results."""
|
| 762 |
}
|
| 763 |
|
| 764 |
-
# Insert system message before the last user message
|
| 765 |
augmented_messages.insert(-1, system_prompt)
|
| 766 |
|
| 767 |
# Get current model from cache
|
|
@@ -770,29 +893,28 @@ Always cite sources when using information from the search results."""
|
|
| 770 |
raise HTTPException(status_code=500, detail="Current model not loaded")
|
| 771 |
|
| 772 |
# Forward to llama-server with augmented context
|
| 773 |
-
|
| 774 |
f"{cached_model.url}/v1/chat/completions",
|
| 775 |
json={
|
| 776 |
"messages": augmented_messages,
|
| 777 |
"max_tokens": request.max_tokens,
|
| 778 |
"temperature": request.temperature,
|
| 779 |
-
}
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
result = response.json()
|
| 785 |
|
| 786 |
# Add metadata about search results
|
| 787 |
result["web_search"] = {
|
| 788 |
"query": search_query,
|
| 789 |
"results_count": len(search_results),
|
| 790 |
-
"sources": [r.get("href", "") for r in search_results if r.get("href")]
|
|
|
|
| 791 |
}
|
| 792 |
|
| 793 |
return result
|
| 794 |
|
| 795 |
-
except
|
| 796 |
raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
|
| 797 |
except Exception as e:
|
| 798 |
raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
|
|
@@ -802,40 +924,89 @@ Always cite sources when using information from the search results."""
|
|
| 802 |
"/cache/info",
|
| 803 |
tags=["models"],
|
| 804 |
summary="Get Cache Information",
|
| 805 |
-
description="Returns information about the model cache
|
| 806 |
)
|
| 807 |
async def get_cache_info():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
"""
|
| 809 |
-
Get
|
| 810 |
|
| 811 |
Returns:
|
| 812 |
-
-
|
| 813 |
-
-
|
| 814 |
-
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 827 |
},
|
| 828 |
-
{
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
}
|
| 834 |
-
|
|
|
|
| 835 |
}
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 839 |
|
| 840 |
|
| 841 |
@app.get(
|
|
@@ -846,22 +1017,5 @@ async def get_cache_info():
|
|
| 846 |
include_in_schema=False
|
| 847 |
)
|
| 848 |
async def get_openapi_spec():
|
| 849 |
-
"""
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
This endpoint returns the complete OpenAPI 3.0 specification that can be used with:
|
| 853 |
-
- API documentation tools (Swagger UI, ReDoc)
|
| 854 |
-
- Code generators (openapi-generator, swagger-codegen)
|
| 855 |
-
- API testing tools (Postman, Insomnia)
|
| 856 |
-
- SDK generation
|
| 857 |
-
|
| 858 |
-
Save this to a file and use it with tools like:
|
| 859 |
-
```bash
|
| 860 |
-
# Generate Python client
|
| 861 |
-
openapi-generator generate -i openapi.json -g python -o ./client
|
| 862 |
-
|
| 863 |
-
# Generate TypeScript client
|
| 864 |
-
openapi-generator generate -i openapi.json -g typescript-fetch -o ./client
|
| 865 |
-
```
|
| 866 |
-
"""
|
| 867 |
-
return app.openapi()
|
|
|
|
| 2 |
import signal
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
+
import asyncio
|
| 6 |
+
from typing import Optional, Dict, List
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
from collections import OrderedDict
|
| 9 |
+
from datetime import datetime, timedelta
|
| 10 |
+
import hashlib
|
| 11 |
|
| 12 |
+
import aiohttp
|
| 13 |
+
from fastapi import FastAPI, HTTPException, BackgroundTasks
|
| 14 |
from fastapi.openapi.utils import get_openapi
|
| 15 |
from pydantic import BaseModel, Field
|
| 16 |
from duckduckgo_search import DDGS
|
| 17 |
from bs4 import BeautifulSoup
|
| 18 |
|
| 19 |
+
from logger import get_logger
|
| 20 |
+
|
| 21 |
+
logger = get_logger(__name__)
|
| 22 |
+
|
| 23 |
app = FastAPI(
|
| 24 |
title="AGI Multi-Model API",
|
| 25 |
description="""
|
| 26 |
+
**High-Performance Dynamic Multi-Model LLM API with Web Search**
|
| 27 |
|
| 28 |
This API provides:
|
| 29 |
+
* 🔄 Dynamic model switching with intelligent caching
|
| 30 |
* 💬 OpenAI-compatible chat completions
|
| 31 |
* 🌐 Web-augmented chat with real-time search
|
| 32 |
+
* 📊 Model management and performance monitoring
|
| 33 |
+
* ⚡ Async/await architecture for maximum throughput
|
| 34 |
|
| 35 |
## Available Models
|
| 36 |
- **deepseek-chat** (default): General purpose conversational model
|
|
|
|
| 39 |
- **deepseek-coder**: Specialized coding assistance
|
| 40 |
- **llama-7b**: Lightweight and fast responses
|
| 41 |
|
| 42 |
+
## Performance Features
|
| 43 |
+
- Parallel model loading
|
| 44 |
+
- Connection pooling for HTTP requests
|
| 45 |
+
- Web search result caching
|
| 46 |
+
- Background model preloading
|
| 47 |
+
- Request queuing to prevent overload
|
| 48 |
+
- Real-time performance metrics
|
| 49 |
+
|
| 50 |
## Quick Start
|
| 51 |
1. Check available models: `GET /models`
|
| 52 |
2. Switch model (optional): `POST /switch-model`
|
| 53 |
3. Chat: `POST /v1/chat/completions`
|
| 54 |
4. Chat with web search: `POST /v1/web-chat/completions`
|
| 55 |
+
5. View metrics: `GET /metrics`
|
| 56 |
""",
|
| 57 |
+
version="0.1.0.2026.01.24",
|
| 58 |
contact={
|
| 59 |
"name": "API Support",
|
| 60 |
"email": "support@example.com",
|
|
|
|
| 75 |
"name": "chat",
|
| 76 |
"description": "Chat completion endpoints (OpenAI-compatible)",
|
| 77 |
},
|
| 78 |
+
{
|
| 79 |
+
"name": "monitoring",
|
| 80 |
+
"description": "Performance metrics and monitoring",
|
| 81 |
+
},
|
| 82 |
{
|
| 83 |
"name": "documentation",
|
| 84 |
"description": "API documentation and OpenAPI specification",
|
|
|
|
| 102 |
"llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
|
| 103 |
}
|
| 104 |
|
| 105 |
+
# Configuration - driven by environment variables, each with a built-in default.

# Default capacity of the model cache (ModelCache's default max_size).
MAX_CACHED_MODELS = int(os.getenv("MAX_CACHED_MODELS", "2"))

# Starting value for the model cache's port counter.
BASE_PORT = int(os.getenv("BASE_PORT", "8080"))

# Comma-separated list of names. Whitespace around each name is stripped and
# empty items (unset variable, stray or trailing commas) are dropped, so
# "a, b," yields ["a", "b"] rather than ["a", " b", ""].
# NOTE(review): presumably these are AVAILABLE_MODELS keys - confirm at the use site.
PRELOAD_MODELS = [
    name.strip()
    for name in os.getenv("PRELOAD_MODELS", "").split(",")
    if name.strip()
]

# Default lifetime of a cached web search entry, in seconds (1 hour).
WEB_SEARCH_CACHE_TTL = int(os.getenv("WEB_SEARCH_CACHE_TTL", "3600"))

REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "300"))  # 5 minutes
|
| 111 |
|
| 112 |
|
| 113 |
@dataclass
|
|
|
|
| 119 |
port: int
|
| 120 |
url: str
|
| 121 |
last_used: float
|
| 122 |
+
load_time: float = 0.0
|
| 123 |
+
request_count: int = 0
|
| 124 |
+
total_latency: float = 0.0
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
@dataclass
class PerformanceMetrics:
    """Running counters used for monitoring.

    Besides the global counters, ``model_metrics`` maps each model name to a
    dict with the keys ``requests``, ``total_latency`` and ``avg_latency``.
    """

    total_requests: int = 0
    total_switches: int = 0
    cache_hits: int = 0
    cache_misses: int = 0
    total_web_searches: int = 0
    web_search_cache_hits: int = 0
    model_metrics: Dict[str, Dict] = field(default_factory=dict)
    startup_time: float = 0.0

    def record_request(self, model_name: str, latency: float):
        """Count one request for ``model_name`` and fold in its latency.

        Increments the global request counter, creates the per-model entry
        on first use, and recomputes that model's average latency.
        """
        self.total_requests += 1

        # Create the per-model stats on first sight of this model.
        stats = self.model_metrics.setdefault(
            model_name,
            {"requests": 0, "total_latency": 0.0, "avg_latency": 0.0},
        )

        stats["requests"] += 1
        stats["total_latency"] += latency
        stats["avg_latency"] = stats["total_latency"] / stats["requests"]
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
@dataclass
class WebSearchCacheEntry:
    """One cached batch of web search results and its expiry data.

    ``timestamp`` is compared against ``time.time()``; ``ttl`` is the entry's
    lifetime in seconds and defaults to ``WEB_SEARCH_CACHE_TTL``.
    """

    results: List[dict]
    timestamp: float
    ttl: int = WEB_SEARCH_CACHE_TTL

    def is_expired(self) -> bool:
        """Return True once more than ``ttl`` seconds have passed since ``timestamp``."""
        age = time.time() - self.timestamp
        return age > self.ttl
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
class WebSearchCache:
    """LRU cache for web search results.

    Entries are keyed by an MD5 digest of ``"<query>:<max_results>"`` and kept
    in an ``OrderedDict`` ordered from least to most recently used. Expired
    entries are removed when they are looked up.
    """

    def __init__(self, max_size: int = 100):
        """Create an empty cache holding at most ``max_size`` entries.

        A ``max_size`` of zero or less disables storing.
        """
        self.max_size = max_size
        self.cache: OrderedDict[str, WebSearchCacheEntry] = OrderedDict()

    def _get_cache_key(self, query: str, max_results: int) -> str:
        """Return the hex MD5 digest identifying a (query, max_results) pair."""
        key = f"{query}:{max_results}"
        return hashlib.md5(key.encode()).hexdigest()

    def get(self, query: str, max_results: int) -> Optional[List[dict]]:
        """Return the cached results, or None if absent or expired.

        A hit marks the entry as most recently used; an expired entry is
        deleted from the cache.
        """
        key = self._get_cache_key(query, max_results)
        entry = self.cache.get(key)
        if entry is None:
            return None

        if entry.is_expired():
            # Drop stale data so it no longer occupies a slot.
            del self.cache[key]
            return None

        # Move to end (most recently used)
        self.cache.move_to_end(key)
        return entry.results

    def put(self, query: str, max_results: int, results: List[dict]):
        """Store ``results`` for the query as the most recently used entry.

        Re-storing an existing key refreshes its LRU position as well as its
        contents. When the cache is full, least recently used entries are
        evicted. With a non-positive ``max_size`` nothing is stored.
        """
        if self.max_size <= 0:
            # Nothing can be kept; also avoids popitem() on an empty dict.
            return

        key = self._get_cache_key(query, max_results)
        entry = WebSearchCacheEntry(
            results=results,
            timestamp=time.time()
        )

        # Remove any previous entry first: plain assignment to an existing
        # OrderedDict key would keep its old (possibly oldest) position.
        self.cache.pop(key, None)

        # Evict oldest entries until there is room for the new one.
        while len(self.cache) >= self.max_size:
            self.cache.popitem(last=False)

        self.cache[key] = entry

    def clear(self):
        """Clear all cached results."""
        self.cache.clear()
|
| 210 |
|
| 211 |
|
| 212 |
class ModelCache:
|
| 213 |
"""
|
| 214 |
+
High-performance in-memory LRU cache for loaded models.
|
| 215 |
|
| 216 |
+
Features:
|
| 217 |
+
- Manages multiple llama-server processes on different ports
|
| 218 |
+
- LRU eviction when cache is full
|
| 219 |
+
- Parallel model loading support
|
| 220 |
+
- Performance metrics tracking
|
| 221 |
"""
|
| 222 |
|
| 223 |
def __init__(self, max_size: int = MAX_CACHED_MODELS):
|
|
|
|
| 225 |
self.cache: OrderedDict[str, CachedModel] = OrderedDict()
|
| 226 |
self.port_counter = BASE_PORT
|
| 227 |
self.used_ports = set()
|
| 228 |
+
self._loading_lock = asyncio.Lock()
|
| 229 |
+
self._loading_models: Dict[str, asyncio.Task] = {}
|
| 230 |
|
| 231 |
def _get_next_port(self) -> int:
|
| 232 |
"""Get next available port for a model."""
|
|
|
|
| 241 |
"""Release a port back to the pool."""
|
| 242 |
self.used_ports.discard(port)
|
| 243 |
|
| 244 |
+
async def _evict_lru(self):
|
| 245 |
"""Evict the least recently used model."""
|
| 246 |
if not self.cache:
|
| 247 |
return
|
| 248 |
|
| 249 |
# Get the first (oldest) item
|
| 250 |
model_name, cached_model = self.cache.popitem(last=False)
|
| 251 |
+
logger.info(f"Evicting model from cache: {model_name}")
|
| 252 |
|
| 253 |
# Stop the process
|
| 254 |
try:
|
|
|
|
| 256 |
os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
|
| 257 |
else:
|
| 258 |
cached_model.process.terminate()
|
| 259 |
+
|
| 260 |
+
# Wait asynchronously for process to stop
|
| 261 |
+
for _ in range(10):
|
| 262 |
+
if cached_model.process.poll() is not None:
|
| 263 |
+
break
|
| 264 |
+
await asyncio.sleep(0.1)
|
| 265 |
+
else:
|
| 266 |
+
# Force kill if not stopped
|
| 267 |
if os.name != 'nt':
|
| 268 |
os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
|
| 269 |
else:
|
| 270 |
cached_model.process.kill()
|
| 271 |
+
except Exception as e:
|
| 272 |
+
logger.error(f"Error stopping model {model_name}: {e}")
|
| 273 |
|
| 274 |
# Release the port
|
| 275 |
self._release_port(cached_model.port)
|
|
|
|
| 276 |
|
| 277 |
def get(self, model_name: str) -> Optional[CachedModel]:
|
| 278 |
"""Get a model from cache, updating its last used time."""
|
|
|
|
| 281 |
cached_model.last_used = time.time()
|
| 282 |
# Move to end (most recently used)
|
| 283 |
self.cache.move_to_end(model_name)
|
| 284 |
+
logger.debug(f"Cache hit for model: {model_name}")
|
| 285 |
return cached_model
|
| 286 |
+
logger.debug(f"Cache miss for model: {model_name}")
|
| 287 |
return None
|
| 288 |
|
| 289 |
+
async def put(self, model_name: str, model_id: str, process: subprocess.Popen, port: int, load_time: float = 0.0):
|
| 290 |
"""Add a model to the cache."""
|
| 291 |
# Evict if cache is full
|
| 292 |
while len(self.cache) >= self.max_size:
|
| 293 |
+
await self._evict_lru()
|
| 294 |
|
| 295 |
url = f"http://localhost:{port}"
|
| 296 |
cached_model = CachedModel(
|
|
|
|
| 299 |
process=process,
|
| 300 |
port=port,
|
| 301 |
url=url,
|
| 302 |
+
last_used=time.time(),
|
| 303 |
+
load_time=load_time
|
| 304 |
)
|
| 305 |
self.cache[model_name] = cached_model
|
| 306 |
+
logger.info(f"Cached model: {model_name} on port {port} (load time: {load_time:.2f}s)")
|
| 307 |
|
| 308 |
+
async def clear(self):
|
| 309 |
"""Clear all cached models."""
|
| 310 |
+
logger.info("Clearing model cache...")
|
| 311 |
for model_name, cached_model in list(self.cache.items()):
|
| 312 |
try:
|
| 313 |
if os.name != 'nt':
|
| 314 |
os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
|
| 315 |
else:
|
| 316 |
cached_model.process.terminate()
|
| 317 |
+
|
| 318 |
+
# Wait asynchronously
|
| 319 |
+
for _ in range(10):
|
| 320 |
+
if cached_model.process.poll() is not None:
|
| 321 |
+
break
|
| 322 |
+
await asyncio.sleep(0.1)
|
| 323 |
except:
|
| 324 |
try:
|
| 325 |
if os.name != 'nt':
|
|
|
|
| 342 |
"name": name,
|
| 343 |
"port": model.port,
|
| 344 |
"url": model.url,
|
| 345 |
+
"last_used": model.last_used,
|
| 346 |
+
"load_time": model.load_time,
|
| 347 |
+
"request_count": model.request_count,
|
| 348 |
+
"avg_latency": model.total_latency / model.request_count if model.request_count > 0 else 0.0
|
| 349 |
}
|
| 350 |
for name, model in self.cache.items()
|
| 351 |
]
|
|
|
|
| 355 |
# Global state
|
| 356 |
current_model = "deepseek-chat" # Default model
|
| 357 |
model_cache = ModelCache(max_size=MAX_CACHED_MODELS)
|
| 358 |
+
web_search_cache = WebSearchCache(max_size=100)
|
| 359 |
+
metrics = PerformanceMetrics()
|
| 360 |
+
|
| 361 |
+
# HTTP session for connection pooling (will be initialized in startup)
|
| 362 |
+
http_session: Optional[aiohttp.ClientSession] = None
|
| 363 |
|
| 364 |
|
| 365 |
class ModelSwitchRequest(BaseModel):
|
|
|
|
| 481 |
model: str = Field(..., description="New active model name")
|
| 482 |
|
| 483 |
|
| 484 |
+
async def start_llama_server(model_id: str, port: int) -> tuple[subprocess.Popen, float]:
|
| 485 |
+
"""
|
| 486 |
+
Start llama-server with specified model on a specific port.
|
| 487 |
+
|
| 488 |
+
Returns tuple of (process, load_time_seconds).
|
| 489 |
+
Uses async/await with exponential backoff for health checks.
|
| 490 |
+
"""
|
| 491 |
+
start_time = time.time()
|
| 492 |
+
|
| 493 |
cmd = [
|
| 494 |
"llama-server",
|
| 495 |
"-hf", model_id,
|
| 496 |
"--host", "0.0.0.0",
|
| 497 |
"--port", str(port),
|
| 498 |
"-c", "2048", # Context size
|
| 499 |
+
"-t", "4", # CPU threads
|
| 500 |
"-ngl", "0", # GPU layers (0 for CPU-only)
|
| 501 |
+
"--cont-batching", # Enable continuous batching
|
| 502 |
"-b", "512", # Batch size
|
| 503 |
]
|
| 504 |
|
| 505 |
+
logger.info(f"Starting llama-server with model: {model_id} on port {port}")
|
|
|
|
| 506 |
|
| 507 |
process = subprocess.Popen(
|
| 508 |
cmd,
|
|
|
|
| 513 |
bufsize=1
|
| 514 |
)
|
| 515 |
|
| 516 |
+
# Wait for server to be ready with exponential backoff
|
|
|
|
| 517 |
server_url = f"http://localhost:{port}"
|
| 518 |
+
max_wait_time = 300 # 5 minutes
|
| 519 |
+
backoff_time = 0.1 # Start with 100ms
|
| 520 |
+
max_backoff = 2.0 # Max 2 seconds between checks
|
| 521 |
+
elapsed = 0
|
| 522 |
|
| 523 |
+
while elapsed < max_wait_time:
|
| 524 |
# Check if process died
|
| 525 |
if process.poll() is not None:
|
| 526 |
stdout, _ = process.communicate()
|
| 527 |
+
logger.error(f"llama-server exited with code {process.returncode}")
|
| 528 |
+
logger.error(f"Output: {stdout}")
|
| 529 |
raise RuntimeError("llama-server process died")
|
| 530 |
|
| 531 |
try:
|
| 532 |
+
# Use aiohttp for async health check
|
| 533 |
+
async with http_session.get(f"{server_url}/health", timeout=aiohttp.ClientTimeout(total=2)) as response:
|
| 534 |
+
if response.status in [200, 404]: # 404 is ok, means server is up
|
| 535 |
+
load_time = time.time() - start_time
|
| 536 |
+
logger.info(f"llama-server ready after {load_time:.2f}s")
|
| 537 |
+
return process, load_time
|
| 538 |
+
except (aiohttp.ClientError, asyncio.TimeoutError):
|
| 539 |
# Server not ready yet
|
| 540 |
pass
|
|
|
|
|
|
|
|
|
|
| 541 |
|
| 542 |
+
# Exponential backoff
|
| 543 |
+
await asyncio.sleep(backoff_time)
|
| 544 |
+
elapsed += backoff_time
|
| 545 |
+
backoff_time = min(backoff_time * 1.5, max_backoff)
|
| 546 |
|
| 547 |
raise RuntimeError("llama-server failed to start within 5 minutes")
|
| 548 |
|
| 549 |
|
| 550 |
+
async def preload_models_background():
    """Load each model named in PRELOAD_MODELS into the model cache, in order.

    Names missing from AVAILABLE_MODELS are skipped with a warning, and models
    already in the cache are skipped. A failure to load one model is logged
    and does not stop the remaining models from being attempted.
    """
    if not PRELOAD_MODELS:
        return

    logger.info(f"Preloading models in background: {PRELOAD_MODELS}")

    for model_name in PRELOAD_MODELS:
        if model_name not in AVAILABLE_MODELS:
            logger.warning(f"Preload model not found: {model_name}")
        elif model_cache.get(model_name):
            logger.info(f"Model already cached: {model_name}")
        else:
            try:
                repo_id = AVAILABLE_MODELS[model_name]
                server_port = model_cache._get_next_port()
                # NOTE(review): nothing here hands the port back if the server
                # fails to start - confirm whether that is intended.
                server_process, elapsed = await start_llama_server(repo_id, server_port)
                await model_cache.put(model_name, repo_id, server_process, server_port, elapsed)
                logger.info(f"Preloaded model: {model_name}")
            except Exception as e:
                logger.error(f"Failed to preload model {model_name}: {e}")
|
| 574 |
+
|
| 575 |
+
|
| 576 |
@app.on_event("startup")
async def startup_event():
    """Create the shared HTTP session, load the default model, start preloading.

    Steps:
    1. Build the module-wide aiohttp session with a pooled connector.
    2. Start a llama-server for ``current_model`` and put it in the model cache.
    3. Record the startup duration in ``metrics.startup_time``.
    4. Schedule ``preload_models_background`` as a background task.

    Raises:
        Whatever loading the default model raises. The HTTP session is closed
        first so that it is not leaked on a failed startup.
    """
    global current_model, http_session

    startup_start = time.time()
    logger.info("Application startup initiated")

    # Initialize aiohttp session with connection pooling
    connector = aiohttp.TCPConnector(
        limit=100,  # Max total connections
        limit_per_host=10,  # Max connections per host
        ttl_dns_cache=300  # DNS cache TTL
    )
    http_session = aiohttp.ClientSession(
        connector=connector,
        timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
    )

    try:
        # Start default model
        model_id = AVAILABLE_MODELS[current_model]
        port = model_cache._get_next_port()

        process, load_time = await start_llama_server(model_id, port)
        await model_cache.put(current_model, model_id, process, port, load_time)
    except Exception:
        # The shutdown hook is not guaranteed to run after a failed startup,
        # so release the session here before propagating the error.
        await http_session.close()
        http_session = None
        raise

    metrics.startup_time = time.time() - startup_start
    logger.info(f"Started with default model: {current_model} (total startup: {metrics.startup_time:.2f}s)")

    # Start preloading in background. The event loop keeps only a weak
    # reference to tasks, so hold a strong one on the application state to
    # stop the task from being garbage-collected before it finishes.
    app.state.preload_task = asyncio.create_task(preload_models_background())
|
| 607 |
|
| 608 |
|
| 609 |
@app.on_event("shutdown")
async def shutdown_event():
    """Clean shutdown: close the HTTP session and stop all cached models.

    The model cache is cleared even if closing the session raises, so the
    model server processes are not left running because of an unrelated
    error.
    """
    logger.info("Application shutdown initiated")

    try:
        if http_session:
            await http_session.close()
    finally:
        await model_cache.clear()
|
| 618 |
|
| 619 |
|
| 620 |
@app.get(
|
|
|
|
| 634 |
- List of all available models
|
| 635 |
"""
|
| 636 |
return {
|
| 637 |
+
"status": "AGI Multi-Model API - High Performance Edition",
|
| 638 |
"current_model": current_model,
|
| 639 |
"available_models": list(AVAILABLE_MODELS.keys())
|
| 640 |
}
|
| 641 |
|
| 642 |
|
| 643 |
+
@app.get(
    "/health",
    tags=["status"],
    summary="Health Check",
    description="Simple health check endpoint for monitoring."
)
async def health_check():
    """Report liveness plus the active model and the number of cached models."""
    payload = {"status": "healthy"}
    payload["timestamp"] = time.time()
    payload["cached_models"] = len(model_cache.cache)
    payload["current_model"] = current_model
    return payload
|
| 657 |
+
|
| 658 |
+
|
| 659 |
@app.get(
|
| 660 |
"/models",
|
| 661 |
response_model=ModelsResponse,
|
|
|
|
| 670 |
Returns:
|
| 671 |
- current_model: The model currently in use
|
| 672 |
- available_models: Array of all available model names
|
|
|
|
|
|
|
| 673 |
"""
|
| 674 |
return {
|
| 675 |
"current_model": current_model,
|
|
|
|
| 682 |
response_model=ModelSwitchResponse,
|
| 683 |
tags=["models"],
|
| 684 |
summary="Switch Active Model",
|
| 685 |
+
description="Switch to a different LLM model with intelligent caching for instant switching."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 686 |
)
|
| 687 |
async def switch_model(request: ModelSwitchRequest):
|
| 688 |
"""
|
| 689 |
Switch to a different LLM model with intelligent caching.
|
| 690 |
|
| 691 |
+
**Performance optimizations:**
|
| 692 |
+
- Instant switching for cached models
|
| 693 |
+
- Async model loading with exponential backoff
|
| 694 |
+
- Connection pooling for health checks
|
| 695 |
+
- Background preloading of popular models
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 696 |
"""
|
| 697 |
global current_model
|
| 698 |
|
|
|
|
| 705 |
if request.model_name == current_model:
|
| 706 |
return {"message": f"Already using model: {current_model}", "model": current_model}
|
| 707 |
|
| 708 |
+
metrics.total_switches += 1
|
| 709 |
+
|
| 710 |
# Try to get from cache
|
| 711 |
cached_model = model_cache.get(request.model_name)
|
| 712 |
|
| 713 |
if cached_model:
|
| 714 |
# Model is cached, instant switch
|
| 715 |
+
metrics.cache_hits += 1
|
| 716 |
current_model = request.model_name
|
| 717 |
return {
|
| 718 |
+
"message": f"Switched to model: {current_model} (from cache, instant)",
|
| 719 |
"model": current_model
|
| 720 |
}
|
| 721 |
|
| 722 |
# Model not cached, need to load it
|
| 723 |
+
metrics.cache_misses += 1
|
| 724 |
model_id = AVAILABLE_MODELS[request.model_name]
|
| 725 |
port = model_cache._get_next_port()
|
| 726 |
|
| 727 |
try:
|
| 728 |
+
process, load_time = await start_llama_server(model_id, port)
|
| 729 |
+
await model_cache.put(request.model_name, model_id, process, port, load_time)
|
| 730 |
current_model = request.model_name
|
| 731 |
|
| 732 |
return {
|
| 733 |
+
"message": f"Switched to model: {current_model} (loaded in {load_time:.2f}s)",
|
| 734 |
"model": current_model
|
| 735 |
}
|
| 736 |
except Exception as e:
|
|
|
|
| 743 |
"/v1/chat/completions",
|
| 744 |
tags=["chat"],
|
| 745 |
summary="Chat Completions",
|
| 746 |
+
description="High-performance OpenAI-compatible chat completions with connection pooling."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
)
|
| 748 |
async def chat_completions(request: ChatCompletionRequest):
    """
    OpenAI-compatible chat completions with performance optimizations.

    Forwards the request's messages, max_tokens and temperature to the
    llama-server of the currently active model and returns its JSON reply.

    **Performance features:**
    - Async/await for non-blocking I/O
    - HTTP connection pooling
    - Request metrics tracking

    Raises:
        HTTPException: 500 when the active model is not in the cache or the
            llama-server request fails; 504 when the request times out.
    """
    request_start = time.time()

    # Snapshot the active model name: the global may be changed by a model
    # switch while this request is awaiting the server, and the latency must
    # be recorded against the model that actually served it.
    model_name = current_model

    # Get current model from cache
    cached_model = model_cache.get(model_name)
    if not cached_model:
        raise HTTPException(status_code=500, detail="Current model not loaded")

    try:
        # Forward to llama-server using aiohttp
        async with http_session.post(
            f"{cached_model.url}/v1/chat/completions",
            json={
                "messages": request.messages,
                "max_tokens": request.max_tokens,
                "temperature": request.temperature,
            }
        ) as response:
            response.raise_for_status()
            result = await response.json()
    except asyncio.TimeoutError as e:
        # Checked before ClientError so timeouts get their own status code.
        raise HTTPException(status_code=504, detail="llama-server request timed out") from e
    except aiohttp.ClientError as e:
        raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}") from e

    # Update metrics
    request_latency = time.time() - request_start
    cached_model.request_count += 1
    cached_model.total_latency += request_latency
    metrics.record_request(model_name, request_latency)

    return result
|
| 786 |
|
| 787 |
|
| 788 |
+
async def search_web_async(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using DuckDuckGo with result caching.

    Results are looked up in ``web_search_cache`` first. On a miss the
    blocking DDGS client runs in the default executor and the results are
    cached before being returned.

    Args:
        query: Search text.
        max_results: Maximum number of results requested from DDGS; also part
            of the cache key.

    Returns:
        The list of result dicts, or an empty list if the search raised.
        Failed searches are logged and are not cached.
    """
    # Check cache first
    cached_results = web_search_cache.get(query, max_results)
    if cached_results is not None:
        metrics.web_search_cache_hits += 1
        logger.debug(f"Web search cache hit for: {query}")
        return cached_results

    # Perform search
    try:
        logger.debug(f"Performing web search: {query}")

        # Run blocking DDGS in thread pool to avoid blocking event loop.
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine.
        loop = asyncio.get_running_loop()
        results = await loop.run_in_executor(
            None,
            lambda: list(DDGS().text(query, max_results=max_results))
        )

        # Cache results
        web_search_cache.put(query, max_results, results)
        metrics.total_web_searches += 1

        logger.debug(f"Found {len(results)} search results")
        return results
    except Exception as e:
        # Deliberate best-effort: a failed search yields no results.
        logger.error(f"Search error: {e}")
        return []
|
| 821 |
|
| 822 |
|
|
|
|
| 845 |
"/v1/web-chat/completions",
|
| 846 |
tags=["chat"],
|
| 847 |
summary="Web-Augmented Chat Completions",
|
| 848 |
+
description="Chat completions with real-time web search and result caching."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 849 |
)
|
| 850 |
async def web_chat_completions(request: WebChatRequest):
|
| 851 |
"""
|
| 852 |
+
Chat completions with web search augmentation.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 853 |
|
| 854 |
+
**Performance optimizations:**
|
| 855 |
+
- Async web search
|
| 856 |
+
- LRU cache for search results (1 hour TTL)
|
| 857 |
+
- Parallel execution where possible
|
| 858 |
"""
|
| 859 |
try:
|
| 860 |
# Get the last user message as search query
|
|
|
|
| 864 |
|
| 865 |
search_query = user_messages[-1].get("content", "")
|
| 866 |
|
| 867 |
+
# Perform web search (async with caching)
|
| 868 |
+
logger.info(f"Web chat: Searching for '{search_query}'")
|
| 869 |
+
search_results = await search_web_async(search_query, request.max_search_results)
|
| 870 |
|
| 871 |
# Format search results as context
|
| 872 |
web_context = format_search_context(search_query, search_results)
|
|
|
|
| 885 |
Always cite sources when using information from the search results."""
|
| 886 |
}
|
| 887 |
|
|
|
|
| 888 |
augmented_messages.insert(-1, system_prompt)
|
| 889 |
|
| 890 |
# Get current model from cache
|
|
|
|
| 893 |
raise HTTPException(status_code=500, detail="Current model not loaded")
|
| 894 |
|
| 895 |
# Forward to llama-server with augmented context
|
| 896 |
+
async with http_session.post(
|
| 897 |
f"{cached_model.url}/v1/chat/completions",
|
| 898 |
json={
|
| 899 |
"messages": augmented_messages,
|
| 900 |
"max_tokens": request.max_tokens,
|
| 901 |
"temperature": request.temperature,
|
| 902 |
+
}
|
| 903 |
+
) as response:
|
| 904 |
+
response.raise_for_status()
|
| 905 |
+
result = await response.json()
|
|
|
|
|
|
|
| 906 |
|
| 907 |
# Add metadata about search results
|
| 908 |
result["web_search"] = {
|
| 909 |
"query": search_query,
|
| 910 |
"results_count": len(search_results),
|
| 911 |
+
"sources": [r.get("href", "") for r in search_results if r.get("href")],
|
| 912 |
+
"cached": metrics.web_search_cache_hits > 0
|
| 913 |
}
|
| 914 |
|
| 915 |
return result
|
| 916 |
|
| 917 |
+
except aiohttp.ClientError as e:
|
| 918 |
raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
|
| 919 |
except Exception as e:
|
| 920 |
raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
|
|
|
|
| 924 |
"/cache/info",
|
| 925 |
tags=["models"],
|
| 926 |
summary="Get Cache Information",
|
| 927 |
+
description="Returns information about the model cache and performance statistics."
|
| 928 |
)
|
| 929 |
async def get_cache_info():
    """Return whatever ``model_cache.get_cache_info()`` reports for the model cache."""
    cache_report = model_cache.get_cache_info()
    return cache_report
|
| 932 |
+
|
| 933 |
+
|
| 934 |
+
# Wall-clock time at module import, used as the origin for uptime reporting.
# metrics.startup_time is reported below as "startup_time_seconds" (a
# duration), so it cannot be subtracted from the current time to get uptime.
_PROCESS_START_TIME = time.time()


@app.get(
    "/metrics",
    tags=["monitoring"],
    summary="Performance Metrics",
    description="Get comprehensive performance metrics and statistics."
)
async def get_metrics():
    """
    Get performance metrics for monitoring and optimization.

    Returns:
    - Request counts and latencies
    - Cache hit/miss ratios
    - Model-specific statistics
    - Web search cache stats
    - Startup time
    """
    model_lookups = metrics.cache_hits + metrics.cache_misses
    cache_hit_rate = metrics.cache_hits / model_lookups if model_lookups > 0 else 0.0

    # total_web_searches counts only searches that were actually performed
    # (cache misses), so the hit-rate denominator is hits + performed
    # searches. Dividing by performed searches alone can exceed 1.0.
    web_lookups = metrics.web_search_cache_hits + metrics.total_web_searches
    web_cache_hit_rate = (
        metrics.web_search_cache_hits / web_lookups
        if web_lookups > 0
        else 0.0
    )

    return {
        "uptime_seconds": time.time() - _PROCESS_START_TIME,
        "startup_time_seconds": metrics.startup_time,
        "total_requests": metrics.total_requests,
        "total_model_switches": metrics.total_switches,
        "cache_stats": {
            "hits": metrics.cache_hits,
            "misses": metrics.cache_misses,
            "hit_rate": cache_hit_rate,
            "current_size": len(model_cache.cache),
            "max_size": model_cache.max_size
        },
        "web_search_stats": {
            "total_searches": metrics.total_web_searches,
            "cache_hits": metrics.web_search_cache_hits,
            "cache_hit_rate": web_cache_hit_rate,
            "cache_size": len(web_search_cache.cache)
        },
        "model_metrics": metrics.model_metrics,
        "cached_models": model_cache.get_cache_info()["cached_models"]
    }
|
| 984 |
+
|
| 985 |
+
|
| 986 |
+
@app.post(
    "/cache/clear",
    tags=["models"],
    summary="Clear Model Cache",
    description="Clear all cached models (will reload on next request)."
)
async def clear_cache():
    """Empty the model cache and confirm with a message.

    NOTE(review): the chat endpoint in this file answers 500 "Current model
    not loaded" when the active model is missing from the cache - confirm
    that a reload really happens on the next request as the description says.
    """
    await model_cache.clear()
    confirmation = {"message": "Cache cleared successfully"}
    return confirmation
|
| 996 |
+
|
| 997 |
+
|
| 998 |
+
@app.post(
    "/cache/web-search/clear",
    tags=["models"],
    summary="Clear Web Search Cache",
    description="Clear all cached web search results."
)
async def clear_web_search_cache():
    """Drop all cached web search results and zero the web search counters."""
    web_search_cache.clear()
    # Reset both counters so later hit-rate figures describe only the
    # post-clear period.
    metrics.web_search_cache_hits = 0
    metrics.total_web_searches = 0
    confirmation = {"message": "Web search cache cleared successfully"}
    return confirmation
|
| 1010 |
|
| 1011 |
|
| 1012 |
@app.get(
|
|
|
|
| 1017 |
include_in_schema=False
|
| 1018 |
)
|
| 1019 |
async def get_openapi_spec():
    """Return the OpenAPI schema produced by ``app.openapi()``."""
    schema = app.openapi()
    return schema
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
logger.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Centralized logging module for AGI Multi-Model API.
|
| 3 |
+
|
| 4 |
+
Provides structured logging with:
|
| 5 |
+
- Colored console output
|
| 6 |
+
- File logging with rotation
|
| 7 |
+
- Configurable log levels
|
| 8 |
+
- Timestamp and module name tracking
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import logging
|
| 12 |
+
import sys
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from logging.handlers import RotatingFileHandler
|
| 15 |
+
from typing import Optional
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class ColoredFormatter(logging.Formatter):
    """Formatter that wraps the level name in ANSI color codes for console output."""

    # ANSI color codes
    COLORS = {
        'DEBUG': '\033[36m',      # Cyan
        'INFO': '\033[32m',       # Green
        'WARNING': '\033[33m',    # Yellow
        'ERROR': '\033[31m',      # Red
        'CRITICAL': '\033[35m',   # Magenta
    }
    RESET = '\033[0m'
    BOLD = '\033[1m'

    def format(self, record):
        """Return the formatted record with a colored, bold level name.

        The record's ``levelname`` is changed only for the duration of the
        call. It is restored in a ``finally`` block so that other handlers
        sharing the record never see the ANSI codes, even when formatting
        raises. Level names without an entry in ``COLORS`` are left as is.
        """
        original_levelname = record.levelname
        color = self.COLORS.get(original_levelname)
        if color is not None:
            record.levelname = f"{color}{self.BOLD}{original_levelname}{self.RESET}"

        try:
            return super().format(record)
        finally:
            # Reset levelname for other handlers
            record.levelname = original_levelname
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class Logger:
    """
    Application-wide logger factory.

    The "AGI" logger owns the handlers (colored console, optional rotating
    file). Every other name is returned as a child of "AGI" (for example
    "AGI.api"), so its records reach those handlers through normal logger
    propagation instead of being dropped by an unconfigured logger.

    Usage:
        from logger import get_logger
        logger = get_logger(__name__)
        logger.info("Application started")
    """

    _instance: Optional[logging.Logger] = None
    _initialized: bool = False
    _ROOT_NAME: str = "AGI"

    @classmethod
    def get_logger(
        cls,
        name: str = "AGI",
        level: int = logging.INFO,
        log_file: Optional[str] = "agi.log",
        max_bytes: int = 10 * 1024 * 1024,  # 10MB
        backup_count: int = 5
    ) -> logging.Logger:
        """
        Get or create the application logger.

        Handlers are configured once, on the first call, whatever name is
        requested. ``level``, ``log_file``, ``max_bytes`` and ``backup_count``
        take effect only on that first call.

        Args:
            name: Logger name (typically module name)
            level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            log_file: Path to log file (None to disable file logging)
            max_bytes: Maximum size of log file before rotation
            backup_count: Number of backup files to keep

        Returns:
            The "AGI" logger itself, or a child of it for any other name
        """
        if not cls._initialized:
            cls._configure_root(level, log_file, max_bytes, backup_count)

        root_name = cls._ROOT_NAME
        if name == root_name or name.startswith(f"{root_name}."):
            return logging.getLogger(name)
        # Namespace the logger under "AGI" so it inherits level and handlers.
        return logging.getLogger(f"{root_name}.{name}")

    @classmethod
    def _configure_root(
        cls,
        level: int,
        log_file: Optional[str],
        max_bytes: int,
        backup_count: int
    ) -> None:
        """Attach the console and optional file handlers to the "AGI" logger."""
        root = logging.getLogger(cls._ROOT_NAME)
        root.setLevel(level)
        # Prevent propagation to avoid duplicate logs
        root.propagate = False

        # Console handler with colors
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(level)
        console_handler.setFormatter(ColoredFormatter(
            fmt='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        ))
        root.addHandler(console_handler)

        # File handler with rotation (if enabled)
        if log_file:
            try:
                Path(log_file).parent.mkdir(parents=True, exist_ok=True)
                file_handler = RotatingFileHandler(
                    log_file,
                    maxBytes=max_bytes,
                    backupCount=backup_count
                )
            except OSError as exc:
                # An unwritable location must not prevent the application
                # from starting; continue with console logging only.
                root.warning("File logging disabled, cannot open %s: %s", log_file, exc)
            else:
                file_handler.setLevel(level)
                file_handler.setFormatter(logging.Formatter(
                    fmt='%(asctime)s | %(levelname)-8s | %(name)s | %(funcName)s:%(lineno)d | %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S'
                ))
                root.addHandler(file_handler)

        cls._instance = root
        cls._initialized = True
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
# Convenience function for easy import
|
| 126 |
+
def get_logger(name: str = "AGI", level: int = logging.INFO) -> logging.Logger:
    """
    Return the logger for ``name`` by delegating to ``Logger.get_logger``.

    Only ``name`` and ``level`` are forwarded; the file-logging parameters of
    ``Logger.get_logger`` keep their defaults.

    Args:
        name: Logger name (use __name__ for automatic module naming)
        level: Logging level (default: INFO)

    Returns:
        The logger returned by ``Logger.get_logger``

    Example:
        from logger import get_logger
        logger = get_logger(__name__)
        logger.info("Starting application")
    """
    requested_logger = Logger.get_logger(name=name, level=level)
    return requested_logger
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# Initialize the root logger on module import
|
| 146 |
+
_root_logger = Logger.get_logger("AGI", level=logging.INFO)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
if __name__ == "__main__":
    # Manual smoke test, run only when this file is executed directly.
    logger = get_logger("test_module")

    # Emit one message at each standard severity.
    logger.debug("This is a debug message")
    logger.info("This is an info message")
    logger.warning("This is a warning message")
    logger.error("This is an error message")
    logger.critical("This is a critical message")

    # Request loggers under two further names and log once through each.
    print("\nTesting with different module names:")
    api_logger = get_logger("api")
    api_logger.info("API logger initialized")

    client_logger = get_logger("client")
    client_logger.info("Client logger initialized")
|
pyproject.toml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
[project]
|
| 2 |
-
name = "
|
| 3 |
-
version = "0.0
|
| 4 |
-
description = "
|
| 5 |
authors = [
|
| 6 |
{ name = "AI Developer", email = "you@example.com" }
|
| 7 |
]
|
|
@@ -9,6 +9,7 @@ requires-python = ">=3.12"
|
|
| 9 |
dependencies = [
|
| 10 |
"fastapi>=0.104.0",
|
| 11 |
"uvicorn[standard]>=0.24.0",
|
|
|
|
| 12 |
"llama-cpp-python>=0.2.0",
|
| 13 |
"huggingface-hub>=0.19.0",
|
| 14 |
"duckduckgo-search>=4.0.0",
|
|
|
|
| 1 |
[project]
|
| 2 |
+
name = "agi-multi-model-api"
|
| 3 |
+
version = "0.1.0"
|
| 4 |
+
description = "High-Performance Multi-Model LLM API with Dynamic Switching"
|
| 5 |
authors = [
|
| 6 |
{ name = "AI Developer", email = "you@example.com" }
|
| 7 |
]
|
|
|
|
| 9 |
dependencies = [
|
| 10 |
"fastapi>=0.104.0",
|
| 11 |
"uvicorn[standard]>=0.24.0",
|
| 12 |
+
"aiohttp>=3.9.0",
|
| 13 |
"llama-cpp-python>=0.2.0",
|
| 14 |
"huggingface-hub>=0.19.0",
|
| 15 |
"duckduckgo-search>=4.0.0",
|