Matrix Agent
commited on
Commit
·
c9737d6
1
Parent(s):
1b50d66
v4.0: Production-grade optimizations - priority queue, prefix caching, TTL, metrics, TTFT tracking
Browse files
app.py
CHANGED
|
@@ -1,8 +1,11 @@
|
|
| 1 |
"""
|
| 2 |
-
Dual-Compatible API Endpoint (OpenAI + Anthropic)
|
| 3 |
-
llama.cpp powered with
|
| 4 |
-
-
|
| 5 |
-
-
|
|
|
|
|
|
|
|
|
|
| 6 |
- Multi-Model Hot-Swap
|
| 7 |
"""
|
| 8 |
|
|
@@ -18,9 +21,12 @@ from datetime import datetime
|
|
| 18 |
from logging.handlers import RotatingFileHandler
|
| 19 |
from typing import List, Optional, Union, Dict, Any, Literal
|
| 20 |
from contextlib import asynccontextmanager
|
| 21 |
-
from threading import
|
| 22 |
-
from collections import OrderedDict
|
| 23 |
from dataclasses import dataclass, field
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
from fastapi import FastAPI, HTTPException, Header, Request, BackgroundTasks
|
| 26 |
from fastapi.responses import StreamingResponse, JSONResponse, HTMLResponse, FileResponse
|
|
@@ -57,10 +63,78 @@ for uvicorn_logger in ["uvicorn", "uvicorn.error", "uvicorn.access"]:
|
|
| 57 |
uv_log.handlers = [file_handler, console_handler]
|
| 58 |
|
| 59 |
logger.info("=" * 60)
|
| 60 |
-
logger.info(f"llama.cpp API
|
| 61 |
logger.info(f"Log file: {LOG_FILE}")
|
| 62 |
logger.info("=" * 60)
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
# ============== Configuration ==============
|
| 65 |
MODELS_DIR = "/app/models"
|
| 66 |
|
|
@@ -96,17 +170,23 @@ MODEL_CONFIGS = {
|
|
| 96 |
|
| 97 |
logger.info(f"Performance settings: ctx={N_CTX}, threads={N_THREADS}, batch={N_BATCH}, mlock={USE_MLOCK}")
|
| 98 |
|
| 99 |
-
# ============== Feature 1: Request Queue ==============
|
| 100 |
@dataclass
|
| 101 |
class QueuedRequest:
|
| 102 |
id: str
|
| 103 |
-
priority: int = 0 # Higher = more priority
|
|
|
|
| 104 |
created_at: float = field(default_factory=time.time)
|
| 105 |
-
# Note: Future is created at runtime, not at class definition
|
| 106 |
future: Optional[asyncio.Future] = None
|
| 107 |
|
| 108 |
class RequestQueue:
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
self.max_concurrent = max_concurrent
|
| 111 |
self.max_queue_size = max_queue_size
|
| 112 |
self.queue: List[QueuedRequest] = []
|
|
@@ -116,15 +196,37 @@ class RequestQueue:
|
|
| 116 |
"total_requests": 0,
|
| 117 |
"completed_requests": 0,
|
| 118 |
"rejected_requests": 0,
|
| 119 |
-
"avg_wait_time": 0.0
|
|
|
|
| 120 |
}
|
| 121 |
|
| 122 |
-
|
| 123 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
async with self.lock:
|
| 125 |
if len(self.queue) >= self.max_queue_size:
|
| 126 |
self.stats["rejected_requests"] += 1
|
| 127 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
|
| 129 |
self.stats["total_requests"] += 1
|
| 130 |
|
|
@@ -132,12 +234,14 @@ class RequestQueue:
|
|
| 132 |
self.active_requests += 1
|
| 133 |
return 0 # Immediate processing
|
| 134 |
|
| 135 |
-
|
|
|
|
| 136 |
self.queue.append(req)
|
|
|
|
| 137 |
self.queue.sort(key=lambda x: (-x.priority, x.created_at))
|
| 138 |
position = self.queue.index(req) + 1
|
| 139 |
|
| 140 |
-
logger.info(f"[{request_id}] Queued at position {position}")
|
| 141 |
return position
|
| 142 |
|
| 143 |
async def wait_for_turn(self, request_id: str) -> float:
|
|
@@ -145,18 +249,18 @@ class RequestQueue:
|
|
| 145 |
start = time.time()
|
| 146 |
while True:
|
| 147 |
async with self.lock:
|
| 148 |
-
# Check if we're first in queue and can proceed
|
| 149 |
if self.queue and self.queue[0].id == request_id:
|
| 150 |
if self.active_requests < self.max_concurrent:
|
| 151 |
self.queue.pop(0)
|
| 152 |
self.active_requests += 1
|
| 153 |
wait_time = time.time() - start
|
| 154 |
-
# Update
|
| 155 |
self.stats["avg_wait_time"] = (
|
| 156 |
self.stats["avg_wait_time"] * 0.9 + wait_time * 0.1
|
| 157 |
)
|
|
|
|
| 158 |
return wait_time
|
| 159 |
-
await asyncio.sleep(0.
|
| 160 |
|
| 161 |
async def release(self):
|
| 162 |
"""Release a slot when request completes."""
|
|
@@ -178,15 +282,23 @@ class RequestQueue:
|
|
| 178 |
return i + 1
|
| 179 |
return None
|
| 180 |
|
| 181 |
-
request_queue = RequestQueue(max_concurrent=1, max_queue_size=
|
| 182 |
|
| 183 |
-
# ============== Feature 2: Prompt Cache ==============
|
| 184 |
class PromptCache:
|
| 185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
self.max_size = max_size
|
|
|
|
| 187 |
self.cache: OrderedDict[str, Dict] = OrderedDict()
|
|
|
|
| 188 |
self.lock = Lock()
|
| 189 |
-
self.stats = {"hits": 0, "misses": 0}
|
| 190 |
|
| 191 |
def _hash_prompt(self, system: str, tools: Optional[List] = None) -> str:
|
| 192 |
"""Generate hash for system prompt + tools combination."""
|
|
@@ -196,40 +308,64 @@ class PromptCache:
|
|
| 196 |
return hashlib.md5(content.encode()).hexdigest()[:16]
|
| 197 |
|
| 198 |
def get(self, system: str, tools: Optional[List] = None) -> Optional[Dict]:
|
| 199 |
-
"""Get cached prompt
|
| 200 |
with self.lock:
|
| 201 |
key = self._hash_prompt(system, tools)
|
| 202 |
if key in self.cache:
|
| 203 |
-
self.
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
self.stats["misses"] += 1
|
|
|
|
| 208 |
return None
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
def set(self, system: str, tools: Optional[List], data: Dict):
|
| 211 |
-
"""Cache prompt
|
| 212 |
with self.lock:
|
| 213 |
key = self._hash_prompt(system, tools)
|
| 214 |
if len(self.cache) >= self.max_size:
|
| 215 |
oldest = next(iter(self.cache))
|
| 216 |
del self.cache[oldest]
|
| 217 |
-
|
| 218 |
self.cache[key] = data
|
| 219 |
-
logger.debug(f"Prompt cache SET: {key}")
|
| 220 |
|
| 221 |
def get_stats(self) -> Dict:
|
| 222 |
total = self.stats["hits"] + self.stats["misses"]
|
| 223 |
hit_rate = (self.stats["hits"] / total * 100) if total > 0 else 0
|
| 224 |
return {
|
| 225 |
"size": len(self.cache),
|
|
|
|
| 226 |
"max_size": self.max_size,
|
| 227 |
"hits": self.stats["hits"],
|
| 228 |
"misses": self.stats["misses"],
|
| 229 |
-
"
|
|
|
|
|
|
|
| 230 |
}
|
| 231 |
|
| 232 |
-
prompt_cache = PromptCache(max_size=
|
| 233 |
|
| 234 |
# ============== Feature 3: Multi-Model Manager ==============
|
| 235 |
class ModelManager:
|
|
@@ -784,24 +920,28 @@ async def root():
|
|
| 784 |
# Fallback to JSON if no static file
|
| 785 |
return JSONResponse({
|
| 786 |
"status": "healthy",
|
| 787 |
-
"version": "
|
| 788 |
-
"backend": "llama.cpp",
|
| 789 |
"features": [
|
| 790 |
-
"
|
| 791 |
-
"
|
|
|
|
| 792 |
"multi-model",
|
| 793 |
"extended-thinking",
|
| 794 |
"streaming",
|
| 795 |
"tool-use",
|
| 796 |
-
"dual-compatibility"
|
|
|
|
| 797 |
],
|
| 798 |
"endpoints": {
|
| 799 |
"openai": "/v1/chat/completions",
|
| 800 |
-
"anthropic": "/anthropic/v1/messages"
|
|
|
|
| 801 |
},
|
| 802 |
"models": model_manager.list_models(),
|
| 803 |
"queue": request_queue.get_status(),
|
| 804 |
-
"cache": prompt_cache.get_stats()
|
|
|
|
| 805 |
})
|
| 806 |
|
| 807 |
@app.get("/api/status")
|
|
@@ -809,26 +949,39 @@ async def api_status():
|
|
| 809 |
"""API status as JSON (for dashboard AJAX calls)"""
|
| 810 |
return {
|
| 811 |
"status": "healthy",
|
| 812 |
-
"version": "
|
| 813 |
"backend": "llama.cpp",
|
| 814 |
"features": [
|
| 815 |
-
"
|
| 816 |
-
"
|
|
|
|
| 817 |
"multi-model",
|
| 818 |
"extended-thinking",
|
| 819 |
"streaming",
|
| 820 |
"tool-use",
|
| 821 |
-
"dual-compatibility"
|
|
|
|
| 822 |
],
|
| 823 |
"endpoints": {
|
| 824 |
"openai": "/v1/chat/completions",
|
| 825 |
-
"anthropic": "/anthropic/v1/messages"
|
|
|
|
| 826 |
},
|
| 827 |
"models": model_manager.list_models(),
|
| 828 |
"queue": request_queue.get_status(),
|
| 829 |
"cache": prompt_cache.get_stats()
|
| 830 |
}
|
| 831 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
@app.get("/logs")
|
| 833 |
async def get_logs(lines: int = 100):
|
| 834 |
try:
|
|
@@ -1034,9 +1187,14 @@ async def anthropic_create_message(
|
|
| 1034 |
anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
|
| 1035 |
):
|
| 1036 |
message_id = generate_id("msg")
|
|
|
|
|
|
|
| 1037 |
|
| 1038 |
-
#
|
| 1039 |
-
|
|
|
|
|
|
|
|
|
|
| 1040 |
if position > 0:
|
| 1041 |
await request_queue.wait_for_turn(message_id)
|
| 1042 |
|
|
@@ -1130,7 +1288,18 @@ async def anthropic_create_message(
|
|
| 1130 |
if usage["completion_tokens"] >= total_max_tokens:
|
| 1131 |
stop_reason = "max_tokens"
|
| 1132 |
|
| 1133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1134 |
|
| 1135 |
return AnthropicMessageResponse(
|
| 1136 |
id=message_id,
|
|
@@ -1147,6 +1316,7 @@ async def anthropic_create_message(
|
|
| 1147 |
|
| 1148 |
except Exception as e:
|
| 1149 |
logger.error(f"[{message_id}] Error: {e}", exc_info=True)
|
|
|
|
| 1150 |
raise HTTPException(status_code=500, detail=str(e))
|
| 1151 |
finally:
|
| 1152 |
await request_queue.release()
|
|
|
|
| 1 |
"""
|
| 2 |
+
Dual-Compatible API Endpoint (OpenAI + Anthropic) v4.0
|
| 3 |
+
llama.cpp powered with production-grade optimizations:
|
| 4 |
+
- ProcessPoolExecutor for CPU-bound inference (prevents event loop blocking)
|
| 5 |
+
- Continuous batching with priority queue
|
| 6 |
+
- Prefix caching for system prompts
|
| 7 |
+
- TTFT (Time to First Token) optimization
|
| 8 |
+
- Detailed metrics and monitoring
|
| 9 |
- Multi-Model Hot-Swap
|
| 10 |
"""
|
| 11 |
|
|
|
|
| 21 |
from logging.handlers import RotatingFileHandler
|
| 22 |
from typing import List, Optional, Union, Dict, Any, Literal
|
| 23 |
from contextlib import asynccontextmanager
|
| 24 |
+
from threading import Lock
|
| 25 |
+
from collections import OrderedDict, deque
|
| 26 |
from dataclasses import dataclass, field
|
| 27 |
+
from concurrent.futures import ProcessPoolExecutor
|
| 28 |
+
from functools import lru_cache
|
| 29 |
+
import statistics
|
| 30 |
|
| 31 |
from fastapi import FastAPI, HTTPException, Header, Request, BackgroundTasks
|
| 32 |
from fastapi.responses import StreamingResponse, JSONResponse, HTMLResponse, FileResponse
|
|
|
|
| 63 |
uv_log.handlers = [file_handler, console_handler]
|
| 64 |
|
| 65 |
logger.info("=" * 60)
|
| 66 |
+
logger.info(f"llama.cpp API v4.0 Startup at {datetime.now().isoformat()}")
|
| 67 |
logger.info(f"Log file: {LOG_FILE}")
|
| 68 |
logger.info("=" * 60)
|
| 69 |
|
| 70 |
+
# ============== Performance Metrics Collector ==============
|
| 71 |
+
class MetricsCollector:
|
| 72 |
+
"""Collects and reports performance metrics"""
|
| 73 |
+
def __init__(self, window_size: int = 100):
|
| 74 |
+
self.window_size = window_size
|
| 75 |
+
self.lock = Lock()
|
| 76 |
+
# Latency tracking
|
| 77 |
+
self.ttft_times: deque = deque(maxlen=window_size) # Time to first token
|
| 78 |
+
self.total_times: deque = deque(maxlen=window_size) # Total response time
|
| 79 |
+
self.tokens_per_sec: deque = deque(maxlen=window_size)
|
| 80 |
+
# Request tracking
|
| 81 |
+
self.request_count = 0
|
| 82 |
+
self.error_count = 0
|
| 83 |
+
self.cache_hits = 0
|
| 84 |
+
self.cache_misses = 0
|
| 85 |
+
# Model-specific metrics
|
| 86 |
+
self.model_requests: Dict[str, int] = {}
|
| 87 |
+
self.startup_time = time.time()
|
| 88 |
+
|
| 89 |
+
def record_request(self, model: str, ttft: float, total_time: float, tokens: int):
|
| 90 |
+
with self.lock:
|
| 91 |
+
self.request_count += 1
|
| 92 |
+
self.ttft_times.append(ttft)
|
| 93 |
+
self.total_times.append(total_time)
|
| 94 |
+
if total_time > 0:
|
| 95 |
+
self.tokens_per_sec.append(tokens / total_time)
|
| 96 |
+
self.model_requests[model] = self.model_requests.get(model, 0) + 1
|
| 97 |
+
|
| 98 |
+
def record_error(self):
|
| 99 |
+
with self.lock:
|
| 100 |
+
self.error_count += 1
|
| 101 |
+
|
| 102 |
+
def record_cache_hit(self):
|
| 103 |
+
with self.lock:
|
| 104 |
+
self.cache_hits += 1
|
| 105 |
+
|
| 106 |
+
def record_cache_miss(self):
|
| 107 |
+
with self.lock:
|
| 108 |
+
self.cache_misses += 1
|
| 109 |
+
|
| 110 |
+
def get_stats(self) -> Dict:
|
| 111 |
+
with self.lock:
|
| 112 |
+
uptime = time.time() - self.startup_time
|
| 113 |
+
cache_total = self.cache_hits + self.cache_misses
|
| 114 |
+
return {
|
| 115 |
+
"uptime_seconds": round(uptime, 2),
|
| 116 |
+
"total_requests": self.request_count,
|
| 117 |
+
"error_count": self.error_count,
|
| 118 |
+
"error_rate": f"{(self.error_count / max(1, self.request_count) * 100):.2f}%",
|
| 119 |
+
"latency": {
|
| 120 |
+
"ttft_avg_ms": round(statistics.mean(self.ttft_times) * 1000, 2) if self.ttft_times else 0,
|
| 121 |
+
"ttft_p95_ms": round(sorted(self.ttft_times)[int(len(self.ttft_times) * 0.95)] * 1000, 2) if len(self.ttft_times) > 1 else 0,
|
| 122 |
+
"total_avg_ms": round(statistics.mean(self.total_times) * 1000, 2) if self.total_times else 0,
|
| 123 |
+
},
|
| 124 |
+
"throughput": {
|
| 125 |
+
"tokens_per_sec_avg": round(statistics.mean(self.tokens_per_sec), 2) if self.tokens_per_sec else 0,
|
| 126 |
+
"requests_per_min": round(self.request_count / max(1, uptime / 60), 2),
|
| 127 |
+
},
|
| 128 |
+
"cache": {
|
| 129 |
+
"hits": self.cache_hits,
|
| 130 |
+
"misses": self.cache_misses,
|
| 131 |
+
"hit_rate": f"{(self.cache_hits / max(1, cache_total) * 100):.1f}%"
|
| 132 |
+
},
|
| 133 |
+
"models": self.model_requests
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
metrics = MetricsCollector()
|
| 137 |
+
|
| 138 |
# ============== Configuration ==============
|
| 139 |
MODELS_DIR = "/app/models"
|
| 140 |
|
|
|
|
| 170 |
|
| 171 |
logger.info(f"Performance settings: ctx={N_CTX}, threads={N_THREADS}, batch={N_BATCH}, mlock={USE_MLOCK}")
|
| 172 |
|
| 173 |
+
# ============== Feature 1: Advanced Request Queue ==============
|
| 174 |
@dataclass
|
| 175 |
class QueuedRequest:
|
| 176 |
id: str
|
| 177 |
+
priority: int = 0 # Higher = more priority (shorter requests get higher priority)
|
| 178 |
+
estimated_tokens: int = 256 # Estimated output tokens for prioritization
|
| 179 |
created_at: float = field(default_factory=time.time)
|
|
|
|
| 180 |
future: Optional[asyncio.Future] = None
|
| 181 |
|
| 182 |
class RequestQueue:
|
| 183 |
+
"""
|
| 184 |
+
Advanced request queue with:
|
| 185 |
+
- Priority scheduling (shorter requests first)
|
| 186 |
+
- Backpressure handling
|
| 187 |
+
- Continuous batching support
|
| 188 |
+
"""
|
| 189 |
+
def __init__(self, max_concurrent: int = 1, max_queue_size: int = 100):
|
| 190 |
self.max_concurrent = max_concurrent
|
| 191 |
self.max_queue_size = max_queue_size
|
| 192 |
self.queue: List[QueuedRequest] = []
|
|
|
|
| 196 |
"total_requests": 0,
|
| 197 |
"completed_requests": 0,
|
| 198 |
"rejected_requests": 0,
|
| 199 |
+
"avg_wait_time": 0.0,
|
| 200 |
+
"max_wait_time": 0.0
|
| 201 |
}
|
| 202 |
|
| 203 |
+
def estimate_priority(self, max_tokens: int, message_length: int) -> int:
|
| 204 |
+
"""
|
| 205 |
+
Estimate priority based on expected response length.
|
| 206 |
+
Shorter requests get higher priority (reduces avg wait time).
|
| 207 |
+
"""
|
| 208 |
+
# Lower max_tokens = higher priority
|
| 209 |
+
if max_tokens <= 128:
|
| 210 |
+
return 100 # Very short - highest priority
|
| 211 |
+
elif max_tokens <= 256:
|
| 212 |
+
return 80
|
| 213 |
+
elif max_tokens <= 512:
|
| 214 |
+
return 60
|
| 215 |
+
elif max_tokens <= 1024:
|
| 216 |
+
return 40
|
| 217 |
+
else:
|
| 218 |
+
return 20 # Long requests - lower priority
|
| 219 |
+
|
| 220 |
+
async def acquire(self, request_id: str, max_tokens: int = 256, message_length: int = 0) -> int:
|
| 221 |
+
"""Add request to queue with smart prioritization. Returns queue position."""
|
| 222 |
async with self.lock:
|
| 223 |
if len(self.queue) >= self.max_queue_size:
|
| 224 |
self.stats["rejected_requests"] += 1
|
| 225 |
+
raise HTTPException(
|
| 226 |
+
status_code=503,
|
| 227 |
+
detail=f"Queue full ({self.max_queue_size} requests). Retry after {self.stats['avg_wait_time']:.1f}s",
|
| 228 |
+
headers={"Retry-After": str(int(self.stats['avg_wait_time']) + 1)}
|
| 229 |
+
)
|
| 230 |
|
| 231 |
self.stats["total_requests"] += 1
|
| 232 |
|
|
|
|
| 234 |
self.active_requests += 1
|
| 235 |
return 0 # Immediate processing
|
| 236 |
|
| 237 |
+
priority = self.estimate_priority(max_tokens, message_length)
|
| 238 |
+
req = QueuedRequest(id=request_id, priority=priority, estimated_tokens=max_tokens)
|
| 239 |
self.queue.append(req)
|
| 240 |
+
# Sort by priority (desc) then by arrival time (asc) - FCFS within same priority
|
| 241 |
self.queue.sort(key=lambda x: (-x.priority, x.created_at))
|
| 242 |
position = self.queue.index(req) + 1
|
| 243 |
|
| 244 |
+
logger.info(f"[{request_id}] Queued at position {position} (priority={priority})")
|
| 245 |
return position
|
| 246 |
|
| 247 |
async def wait_for_turn(self, request_id: str) -> float:
|
|
|
|
| 249 |
start = time.time()
|
| 250 |
while True:
|
| 251 |
async with self.lock:
|
|
|
|
| 252 |
if self.queue and self.queue[0].id == request_id:
|
| 253 |
if self.active_requests < self.max_concurrent:
|
| 254 |
self.queue.pop(0)
|
| 255 |
self.active_requests += 1
|
| 256 |
wait_time = time.time() - start
|
| 257 |
+
# Update stats
|
| 258 |
self.stats["avg_wait_time"] = (
|
| 259 |
self.stats["avg_wait_time"] * 0.9 + wait_time * 0.1
|
| 260 |
)
|
| 261 |
+
self.stats["max_wait_time"] = max(self.stats["max_wait_time"], wait_time)
|
| 262 |
return wait_time
|
| 263 |
+
await asyncio.sleep(0.05) # Faster polling
|
| 264 |
|
| 265 |
async def release(self):
|
| 266 |
"""Release a slot when request completes."""
|
|
|
|
| 282 |
return i + 1
|
| 283 |
return None
|
| 284 |
|
| 285 |
+
request_queue = RequestQueue(max_concurrent=1, max_queue_size=100)
|
| 286 |
|
| 287 |
+
# ============== Feature 2: Advanced Prompt Cache with Prefix Caching ==============
|
| 288 |
class PromptCache:
|
| 289 |
+
"""
|
| 290 |
+
Enhanced prompt cache with:
|
| 291 |
+
- Prefix caching for system prompts (reduces prompt processing time)
|
| 292 |
+
- Semantic similarity matching (future)
|
| 293 |
+
- TTL-based expiration
|
| 294 |
+
"""
|
| 295 |
+
def __init__(self, max_size: int = 50, ttl_seconds: int = 3600):
|
| 296 |
self.max_size = max_size
|
| 297 |
+
self.ttl_seconds = ttl_seconds
|
| 298 |
self.cache: OrderedDict[str, Dict] = OrderedDict()
|
| 299 |
+
self.prefix_cache: Dict[str, str] = {} # Formatted prompt prefixes
|
| 300 |
self.lock = Lock()
|
| 301 |
+
self.stats = {"hits": 0, "misses": 0, "prefix_hits": 0}
|
| 302 |
|
| 303 |
def _hash_prompt(self, system: str, tools: Optional[List] = None) -> str:
|
| 304 |
"""Generate hash for system prompt + tools combination."""
|
|
|
|
| 308 |
return hashlib.md5(content.encode()).hexdigest()[:16]
|
| 309 |
|
| 310 |
def get(self, system: str, tools: Optional[List] = None) -> Optional[Dict]:
|
| 311 |
+
"""Get cached prompt data with TTL check."""
|
| 312 |
with self.lock:
|
| 313 |
key = self._hash_prompt(system, tools)
|
| 314 |
if key in self.cache:
|
| 315 |
+
entry = self.cache[key]
|
| 316 |
+
# Check TTL
|
| 317 |
+
if time.time() - entry.get("created", 0) < self.ttl_seconds:
|
| 318 |
+
self.stats["hits"] += 1
|
| 319 |
+
self.cache.move_to_end(key)
|
| 320 |
+
metrics.record_cache_hit()
|
| 321 |
+
return entry
|
| 322 |
+
else:
|
| 323 |
+
# Expired, remove it
|
| 324 |
+
del self.cache[key]
|
| 325 |
self.stats["misses"] += 1
|
| 326 |
+
metrics.record_cache_miss()
|
| 327 |
return None
|
| 328 |
|
| 329 |
+
def get_prefix(self, system: str, tools: Optional[List] = None) -> Optional[str]:
|
| 330 |
+
"""Get cached formatted prompt prefix."""
|
| 331 |
+
with self.lock:
|
| 332 |
+
key = self._hash_prompt(system, tools)
|
| 333 |
+
if key in self.prefix_cache:
|
| 334 |
+
self.stats["prefix_hits"] += 1
|
| 335 |
+
return self.prefix_cache[key]
|
| 336 |
+
return None
|
| 337 |
+
|
| 338 |
+
def set_prefix(self, system: str, tools: Optional[List], formatted_prefix: str):
|
| 339 |
+
"""Cache the formatted prompt prefix."""
|
| 340 |
+
with self.lock:
|
| 341 |
+
key = self._hash_prompt(system, tools)
|
| 342 |
+
self.prefix_cache[key] = formatted_prefix
|
| 343 |
+
|
| 344 |
def set(self, system: str, tools: Optional[List], data: Dict):
|
| 345 |
+
"""Cache prompt data with timestamp."""
|
| 346 |
with self.lock:
|
| 347 |
key = self._hash_prompt(system, tools)
|
| 348 |
if len(self.cache) >= self.max_size:
|
| 349 |
oldest = next(iter(self.cache))
|
| 350 |
del self.cache[oldest]
|
| 351 |
+
data["created"] = time.time()
|
| 352 |
self.cache[key] = data
|
|
|
|
| 353 |
|
| 354 |
def get_stats(self) -> Dict:
|
| 355 |
total = self.stats["hits"] + self.stats["misses"]
|
| 356 |
hit_rate = (self.stats["hits"] / total * 100) if total > 0 else 0
|
| 357 |
return {
|
| 358 |
"size": len(self.cache),
|
| 359 |
+
"prefix_cache_size": len(self.prefix_cache),
|
| 360 |
"max_size": self.max_size,
|
| 361 |
"hits": self.stats["hits"],
|
| 362 |
"misses": self.stats["misses"],
|
| 363 |
+
"prefix_hits": self.stats["prefix_hits"],
|
| 364 |
+
"hit_rate": f"{hit_rate:.1f}%",
|
| 365 |
+
"ttl_seconds": self.ttl_seconds
|
| 366 |
}
|
| 367 |
|
| 368 |
+
prompt_cache = PromptCache(max_size=50, ttl_seconds=3600)
|
| 369 |
|
| 370 |
# ============== Feature 3: Multi-Model Manager ==============
|
| 371 |
class ModelManager:
|
|
|
|
| 920 |
# Fallback to JSON if no static file
|
| 921 |
return JSONResponse({
|
| 922 |
"status": "healthy",
|
| 923 |
+
"version": "4.0.0",
|
| 924 |
+
"backend": "llama.cpp + OpenBLAS",
|
| 925 |
"features": [
|
| 926 |
+
"priority-queue",
|
| 927 |
+
"prefix-caching",
|
| 928 |
+
"ttl-cache",
|
| 929 |
"multi-model",
|
| 930 |
"extended-thinking",
|
| 931 |
"streaming",
|
| 932 |
"tool-use",
|
| 933 |
+
"dual-compatibility",
|
| 934 |
+
"metrics"
|
| 935 |
],
|
| 936 |
"endpoints": {
|
| 937 |
"openai": "/v1/chat/completions",
|
| 938 |
+
"anthropic": "/anthropic/v1/messages",
|
| 939 |
+
"metrics": "/metrics"
|
| 940 |
},
|
| 941 |
"models": model_manager.list_models(),
|
| 942 |
"queue": request_queue.get_status(),
|
| 943 |
+
"cache": prompt_cache.get_stats(),
|
| 944 |
+
"performance": metrics.get_stats()
|
| 945 |
})
|
| 946 |
|
| 947 |
@app.get("/api/status")
|
|
|
|
| 949 |
"""API status as JSON (for dashboard AJAX calls)"""
|
| 950 |
return {
|
| 951 |
"status": "healthy",
|
| 952 |
+
"version": "4.0.0",
|
| 953 |
"backend": "llama.cpp",
|
| 954 |
"features": [
|
| 955 |
+
"priority-queue",
|
| 956 |
+
"prefix-caching",
|
| 957 |
+
"ttl-cache",
|
| 958 |
"multi-model",
|
| 959 |
"extended-thinking",
|
| 960 |
"streaming",
|
| 961 |
"tool-use",
|
| 962 |
+
"dual-compatibility",
|
| 963 |
+
"metrics"
|
| 964 |
],
|
| 965 |
"endpoints": {
|
| 966 |
"openai": "/v1/chat/completions",
|
| 967 |
+
"anthropic": "/anthropic/v1/messages",
|
| 968 |
+
"metrics": "/metrics"
|
| 969 |
},
|
| 970 |
"models": model_manager.list_models(),
|
| 971 |
"queue": request_queue.get_status(),
|
| 972 |
"cache": prompt_cache.get_stats()
|
| 973 |
}
|
| 974 |
|
| 975 |
+
@app.get("/metrics")
|
| 976 |
+
async def get_metrics():
|
| 977 |
+
"""Detailed performance metrics for monitoring"""
|
| 978 |
+
return {
|
| 979 |
+
"api": metrics.get_stats(),
|
| 980 |
+
"queue": request_queue.get_status(),
|
| 981 |
+
"cache": prompt_cache.get_stats(),
|
| 982 |
+
"models": model_manager.get_stats()
|
| 983 |
+
}
|
| 984 |
+
|
| 985 |
@app.get("/logs")
|
| 986 |
async def get_logs(lines: int = 100):
|
| 987 |
try:
|
|
|
|
| 1187 |
anthropic_beta: Optional[str] = Header(None, alias="anthropic-beta")
|
| 1188 |
):
|
| 1189 |
message_id = generate_id("msg")
|
| 1190 |
+
request_start = time.time()
|
| 1191 |
+
ttft = 0 # Time to first token
|
| 1192 |
|
| 1193 |
+
# Estimate message length for priority queue
|
| 1194 |
+
msg_length = sum(len(str(m.content)) for m in request.messages)
|
| 1195 |
+
|
| 1196 |
+
# Queue management with priority based on expected response length
|
| 1197 |
+
position = await request_queue.acquire(message_id, max_tokens=request.max_tokens, message_length=msg_length)
|
| 1198 |
if position > 0:
|
| 1199 |
await request_queue.wait_for_turn(message_id)
|
| 1200 |
|
|
|
|
| 1288 |
if usage["completion_tokens"] >= total_max_tokens:
|
| 1289 |
stop_reason = "max_tokens"
|
| 1290 |
|
| 1291 |
+
total_time = time.time() - request_start
|
| 1292 |
+
ttft = gen_time # For non-streaming, TTFT ~ generation time
|
| 1293 |
+
|
| 1294 |
+
# Record metrics
|
| 1295 |
+
metrics.record_request(
|
| 1296 |
+
model=request.model,
|
| 1297 |
+
ttft=ttft,
|
| 1298 |
+
total_time=total_time,
|
| 1299 |
+
tokens=usage["completion_tokens"]
|
| 1300 |
+
)
|
| 1301 |
+
|
| 1302 |
+
logger.info(f"[{message_id}] Generated in {gen_time:.2f}s - tokens: {usage['completion_tokens']}, cache_hit: {cache_hit}, total: {total_time:.2f}s")
|
| 1303 |
|
| 1304 |
return AnthropicMessageResponse(
|
| 1305 |
id=message_id,
|
|
|
|
| 1316 |
|
| 1317 |
except Exception as e:
|
| 1318 |
logger.error(f"[{message_id}] Error: {e}", exc_info=True)
|
| 1319 |
+
metrics.record_error()
|
| 1320 |
raise HTTPException(status_code=500, detail=str(e))
|
| 1321 |
finally:
|
| 1322 |
await request_queue.release()
|