Dmitry Beresnev commited on
Commit
7763bf4
·
1 Parent(s): c384ef1

fix gitignore, app and logger, etc

Browse files
Files changed (5) hide show
  1. .gitignore +133 -0
  2. Dockerfile +2 -1
  3. app.py +436 -282
  4. logger.py +164 -0
  5. pyproject.toml +4 -3
.gitignore ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+ pip-log.txt
25
+ pip-delete-this-directory.txt
26
+
27
+ # Virtual Environment
28
+ .venv/
29
+ venv/
30
+ ENV/
31
+ env/
32
+ .virtualenv
33
+
34
+ # PyInstaller
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+ cover/
52
+
53
+ # IDEs
54
+ .idea/
55
+ .vscode/
56
+ *.swp
57
+ *.swo
58
+ *~
59
+ .DS_Store
60
+
61
+ # Jupyter Notebook
62
+ .ipynb_checkpoints
63
+ *.ipynb
64
+
65
+ # PyCharm
66
+ .idea/
67
+ *.iml
68
+ *.iws
69
+
70
+ # Logs
71
+ *.log
72
+ logs/
73
+ agi.log
74
+
75
+ # Environment variables
76
+ .env
77
+ .env.local
78
+ .env.*.local
79
+ *.env
80
+
81
+ # Database
82
+ *.db
83
+ *.sqlite
84
+ *.sqlite3
85
+
86
+ # Model files (often large)
87
+ *.bin
88
+ *.gguf
89
+ *.safetensors
90
+ models/
91
+ checkpoints/
92
+
93
+ # Docker
94
+ .dockerignore
95
+ docker-compose.override.yml
96
+
97
+ # OS
98
+ .DS_Store
99
+ Thumbs.db
100
+ Desktop.ini
101
+ $RECYCLE.BIN/
102
+ *.cab
103
+ *.msi
104
+ *.msix
105
+ *.msm
106
+ *.msp
107
+ *.lnk
108
+
109
+ # mypy
110
+ .mypy_cache/
111
+ .dmypy.json
112
+ dmypy.json
113
+
114
+ # Pyre type checker
115
+ .pyre/
116
+
117
+ # pytype static type analyzer
118
+ .pytype/
119
+
120
+ # Cython debug symbols
121
+ cython_debug/
122
+
123
+ # Temporary files
124
+ *.tmp
125
+ *.temp
126
+ tmp/
127
+ temp/
128
+
129
+ #
130
+ *.minimal
131
+ tests/
132
+ *.md
133
+ docs/
Dockerfile CHANGED
@@ -49,7 +49,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
49
  && rm -rf /var/lib/apt/lists/*
50
 
51
  # Install Python packages
52
- RUN pip3 install --no-cache-dir fastapi uvicorn requests pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages
53
 
54
  # Create non-root user
55
  RUN useradd -m -u 1000 user && \
@@ -58,6 +58,7 @@ RUN useradd -m -u 1000 user && \
58
 
59
  # Copy application code
60
  COPY --chown=user:user app.py /home/user/app.py
 
61
 
62
  USER user
63
  WORKDIR /home/user
 
49
  && rm -rf /var/lib/apt/lists/*
50
 
51
  # Install Python packages
52
+ RUN pip3 install --no-cache-dir fastapi uvicorn aiohttp pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages
53
 
54
  # Create non-root user
55
  RUN useradd -m -u 1000 user && \
 
58
 
59
  # Copy application code
60
  COPY --chown=user:user app.py /home/user/app.py
61
+ COPY --chown=user:user logger.py /home/user/logger.py
62
 
63
  USER user
64
  WORKDIR /home/user
app.py CHANGED
@@ -2,27 +2,35 @@ import subprocess
2
  import signal
3
  import os
4
  import time
5
- from typing import Optional, Dict
6
- from dataclasses import dataclass
 
7
  from collections import OrderedDict
 
 
8
 
9
- import requests
10
- from fastapi import FastAPI, HTTPException
11
  from fastapi.openapi.utils import get_openapi
12
  from pydantic import BaseModel, Field
13
  from duckduckgo_search import DDGS
14
  from bs4 import BeautifulSoup
15
 
 
 
 
 
16
  app = FastAPI(
17
  title="AGI Multi-Model API",
18
  description="""
19
- **Dynamic Multi-Model LLM API with Web Search Capabilities**
20
 
21
  This API provides:
22
- * 🔄 Dynamic model switching between multiple LLM models
23
  * 💬 OpenAI-compatible chat completions
24
  * 🌐 Web-augmented chat with real-time search
25
- * 📊 Model management and status monitoring
 
26
 
27
  ## Available Models
28
  - **deepseek-chat** (default): General purpose conversational model
@@ -31,13 +39,22 @@ app = FastAPI(
31
  - **deepseek-coder**: Specialized coding assistance
32
  - **llama-7b**: Lightweight and fast responses
33
 
 
 
 
 
 
 
 
 
34
  ## Quick Start
35
  1. Check available models: `GET /models`
36
  2. Switch model (optional): `POST /switch-model`
37
  3. Chat: `POST /v1/chat/completions`
38
  4. Chat with web search: `POST /v1/web-chat/completions`
 
39
  """,
40
- version="0.0.1.2025.12.04",
41
  contact={
42
  "name": "API Support",
43
  "email": "support@example.com",
@@ -58,6 +75,10 @@ app = FastAPI(
58
  "name": "chat",
59
  "description": "Chat completion endpoints (OpenAI-compatible)",
60
  },
 
 
 
 
61
  {
62
  "name": "documentation",
63
  "description": "API documentation and OpenAPI specification",
@@ -81,9 +102,12 @@ AVAILABLE_MODELS = {
81
  "llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
82
  }
83
 
84
- # Configuration
85
- MAX_CACHED_MODELS = 2 # Maximum number of models to keep in memory
86
- BASE_PORT = 8080 # Starting port for llama-server instances
 
 
 
87
 
88
 
89
  @dataclass
@@ -95,14 +119,105 @@ class CachedModel:
95
  port: int
96
  url: str
97
  last_used: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
 
100
  class ModelCache:
101
  """
102
- In-memory LRU cache for loaded models.
103
 
104
- Manages multiple llama-server processes, each on a different port.
105
- Automatically evicts least recently used models when cache is full.
 
 
 
106
  """
107
 
108
  def __init__(self, max_size: int = MAX_CACHED_MODELS):
@@ -110,6 +225,8 @@ class ModelCache:
110
  self.cache: OrderedDict[str, CachedModel] = OrderedDict()
111
  self.port_counter = BASE_PORT
112
  self.used_ports = set()
 
 
113
 
114
  def _get_next_port(self) -> int:
115
  """Get next available port for a model."""
@@ -124,14 +241,14 @@ class ModelCache:
124
  """Release a port back to the pool."""
125
  self.used_ports.discard(port)
126
 
127
- def _evict_lru(self):
128
  """Evict the least recently used model."""
129
  if not self.cache:
130
  return
131
 
132
  # Get the first (oldest) item
133
  model_name, cached_model = self.cache.popitem(last=False)
134
- print(f"Evicting model from cache: {model_name}")
135
 
136
  # Stop the process
137
  try:
@@ -139,20 +256,23 @@ class ModelCache:
139
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
140
  else:
141
  cached_model.process.terminate()
142
- cached_model.process.wait(timeout=10)
143
- except Exception as e:
144
- print(f"Error stopping model {model_name}: {e}")
145
- try:
 
 
 
 
146
  if os.name != 'nt':
147
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
148
  else:
149
  cached_model.process.kill()
150
- except:
151
- pass
152
 
153
  # Release the port
154
  self._release_port(cached_model.port)
155
- time.sleep(1)
156
 
157
  def get(self, model_name: str) -> Optional[CachedModel]:
158
  """Get a model from cache, updating its last used time."""
@@ -161,16 +281,16 @@ class ModelCache:
161
  cached_model.last_used = time.time()
162
  # Move to end (most recently used)
163
  self.cache.move_to_end(model_name)
164
- print(f"Cache hit for model: {model_name}")
165
  return cached_model
166
- print(f"Cache miss for model: {model_name}")
167
  return None
168
 
169
- def put(self, model_name: str, model_id: str, process: subprocess.Popen, port: int):
170
  """Add a model to the cache."""
171
  # Evict if cache is full
172
  while len(self.cache) >= self.max_size:
173
- self._evict_lru()
174
 
175
  url = f"http://localhost:{port}"
176
  cached_model = CachedModel(
@@ -179,21 +299,27 @@ class ModelCache:
179
  process=process,
180
  port=port,
181
  url=url,
182
- last_used=time.time()
 
183
  )
184
  self.cache[model_name] = cached_model
185
- print(f"Cached model: {model_name} on port {port}")
186
 
187
- def clear(self):
188
  """Clear all cached models."""
189
- print("Clearing model cache...")
190
  for model_name, cached_model in list(self.cache.items()):
191
  try:
192
  if os.name != 'nt':
193
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
194
  else:
195
  cached_model.process.terminate()
196
- cached_model.process.wait(timeout=10)
 
 
 
 
 
197
  except:
198
  try:
199
  if os.name != 'nt':
@@ -216,7 +342,10 @@ class ModelCache:
216
  "name": name,
217
  "port": model.port,
218
  "url": model.url,
219
- "last_used": model.last_used
 
 
 
220
  }
221
  for name, model in self.cache.items()
222
  ]
@@ -226,6 +355,11 @@ class ModelCache:
226
  # Global state
227
  current_model = "deepseek-chat" # Default model
228
  model_cache = ModelCache(max_size=MAX_CACHED_MODELS)
 
 
 
 
 
229
 
230
 
231
  class ModelSwitchRequest(BaseModel):
@@ -347,22 +481,28 @@ class ModelSwitchResponse(BaseModel):
347
  model: str = Field(..., description="New active model name")
348
 
349
 
350
- def start_llama_server(model_id: str, port: int) -> subprocess.Popen:
351
- """Start llama-server with specified model on a specific port."""
 
 
 
 
 
 
 
352
  cmd = [
353
  "llama-server",
354
  "-hf", model_id,
355
  "--host", "0.0.0.0",
356
  "--port", str(port),
357
  "-c", "2048", # Context size
358
- "-t", "4", # CPU threads (adjust based on cores)
359
  "-ngl", "0", # GPU layers (0 for CPU-only)
360
- "--cont-batching", # Enable continuous batching for speed
361
  "-b", "512", # Batch size
362
  ]
363
 
364
- print(f"Starting llama-server with model: {model_id} on port {port}")
365
- print("This may take 2-3 minutes to download and load the model...")
366
 
367
  process = subprocess.Popen(
368
  cmd,
@@ -373,52 +513,108 @@ def start_llama_server(model_id: str, port: int) -> subprocess.Popen:
373
  bufsize=1
374
  )
375
 
376
- # Wait for server to be ready (increased timeout for model download)
377
- max_retries = 300 # 5 minutes
378
  server_url = f"http://localhost:{port}"
 
 
 
 
379
 
380
- for i in range(max_retries):
381
  # Check if process died
382
  if process.poll() is not None:
383
  stdout, _ = process.communicate()
384
- print(f"llama-server exited with code {process.returncode}")
385
- print(f"Output: {stdout}")
386
  raise RuntimeError("llama-server process died")
387
 
388
  try:
389
- # Try root endpoint instead of /health
390
- response = requests.get(f"{server_url}/", timeout=2)
391
- if response.status_code in [200, 404]: # 404 is ok, means server is up
392
- print(f"llama-server ready after {i+1} seconds")
393
- return process
394
- except requests.exceptions.ConnectionError:
 
395
  # Server not ready yet
396
  pass
397
- except Exception:
398
- # Other errors, keep waiting
399
- pass
400
 
401
- time.sleep(1)
 
 
 
402
 
403
  raise RuntimeError("llama-server failed to start within 5 minutes")
404
 
405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  @app.on_event("startup")
407
  async def startup_event():
408
- """Start with default model and cache it."""
409
- global current_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  model_id = AVAILABLE_MODELS[current_model]
411
  port = model_cache._get_next_port()
412
 
413
- process = start_llama_server(model_id, port)
414
- model_cache.put(current_model, model_id, process, port)
415
- print(f"Started with default model: {current_model}")
 
 
 
 
 
416
 
417
 
418
  @app.on_event("shutdown")
419
  async def shutdown_event():
420
- """Clean shutdown - clear all cached models."""
421
- model_cache.clear()
 
 
 
 
 
422
 
423
 
424
  @app.get(
@@ -438,12 +634,28 @@ async def root():
438
  - List of all available models
439
  """
440
  return {
441
- "status": "AGI Multi-Model API with dynamic model switching and web search",
442
  "current_model": current_model,
443
  "available_models": list(AVAILABLE_MODELS.keys())
444
  }
445
 
446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  @app.get(
448
  "/models",
449
  response_model=ModelsResponse,
@@ -458,8 +670,6 @@ async def list_models():
458
  Returns:
459
  - current_model: The model currently in use
460
  - available_models: Array of all available model names
461
-
462
- Use this endpoint to see which models you can switch to.
463
  """
464
  return {
465
  "current_model": current_model,
@@ -472,45 +682,17 @@ async def list_models():
472
  response_model=ModelSwitchResponse,
473
  tags=["models"],
474
  summary="Switch Active Model",
475
- description="Switch to a different LLM model. Uses caching for instant switching to recently used models.",
476
- responses={
477
- 200: {
478
- "description": "Model switched successfully",
479
- "content": {
480
- "application/json": {
481
- "example": {
482
- "message": "Switched to model: deepseek-coder (from cache)",
483
- "model": "deepseek-coder"
484
- }
485
- }
486
- }
487
- },
488
- 400: {
489
- "description": "Invalid model name",
490
- "content": {
491
- "application/json": {
492
- "example": {
493
- "detail": "Model 'invalid-model' not found. Available: ['deepseek-chat', 'mistral-7b', ...]"
494
- }
495
- }
496
- }
497
- }
498
- }
499
  )
500
  async def switch_model(request: ModelSwitchRequest):
501
  """
502
  Switch to a different LLM model with intelligent caching.
503
 
504
- **How it works:**
505
- 1. Checks if requested model is already active (no switch needed)
506
- 2. Checks cache for the model (instant switch if cached)
507
- 3. If not cached, loads the model (may take 2-3 minutes)
508
-
509
- **Caching:**
510
- - Up to 2 models kept in memory
511
- - LRU (Least Recently Used) eviction policy
512
- - Each model runs on a separate port
513
- - Instant switching between cached models
514
  """
515
  global current_model
516
 
@@ -523,28 +705,32 @@ async def switch_model(request: ModelSwitchRequest):
523
  if request.model_name == current_model:
524
  return {"message": f"Already using model: {current_model}", "model": current_model}
525
 
 
 
526
  # Try to get from cache
527
  cached_model = model_cache.get(request.model_name)
528
 
529
  if cached_model:
530
  # Model is cached, instant switch
 
531
  current_model = request.model_name
532
  return {
533
- "message": f"Switched to model: {current_model} (from cache)",
534
  "model": current_model
535
  }
536
 
537
  # Model not cached, need to load it
 
538
  model_id = AVAILABLE_MODELS[request.model_name]
539
  port = model_cache._get_next_port()
540
 
541
  try:
542
- process = start_llama_server(model_id, port)
543
- model_cache.put(request.model_name, model_id, process, port)
544
  current_model = request.model_name
545
 
546
  return {
547
- "message": f"Switched to model: {current_model} (newly loaded)",
548
  "model": current_model
549
  }
550
  except Exception as e:
@@ -557,88 +743,80 @@ async def switch_model(request: ModelSwitchRequest):
557
  "/v1/chat/completions",
558
  tags=["chat"],
559
  summary="Chat Completions",
560
- description="OpenAI-compatible chat completions endpoint. Send messages and get AI-generated responses.",
561
- responses={
562
- 200: {
563
- "description": "Successful response",
564
- "content": {
565
- "application/json": {
566
- "example": {
567
- "id": "chatcmpl-123",
568
- "object": "chat.completion",
569
- "created": 1677652288,
570
- "model": "deepseek-chat",
571
- "choices": [{
572
- "index": 0,
573
- "message": {
574
- "role": "assistant",
575
- "content": "Hello! How can I help you today?"
576
- },
577
- "finish_reason": "stop"
578
- }]
579
- }
580
- }
581
- }
582
- },
583
- 500: {
584
- "description": "LLM server error"
585
- }
586
- }
587
  )
588
  async def chat_completions(request: ChatCompletionRequest):
589
  """
590
- OpenAI-compatible chat completions endpoint.
591
-
592
- This endpoint forwards your request to the currently active LLM model
593
- and returns the response in OpenAI-compatible format.
594
-
595
- **Message Format:**
596
- ```json
597
- {
598
- "messages": [
599
- {"role": "system", "content": "You are a helpful assistant."},
600
- {"role": "user", "content": "Hello!"}
601
- ],
602
- "max_tokens": 256,
603
- "temperature": 0.7
604
- }
605
- ```
606
 
607
- **Supported Roles:**
608
- - `system`: Sets the behavior of the assistant
609
- - `user`: User messages
610
- - `assistant`: Assistant responses (for multi-turn conversations)
611
  """
612
  try:
 
 
613
  # Get current model from cache
614
  cached_model = model_cache.get(current_model)
615
  if not cached_model:
616
  raise HTTPException(status_code=500, detail="Current model not loaded")
617
 
618
- # Forward to llama-server
619
- response = requests.post(
620
  f"{cached_model.url}/v1/chat/completions",
621
  json={
622
  "messages": request.messages,
623
  "max_tokens": request.max_tokens,
624
  "temperature": request.temperature,
625
- },
626
- timeout=300
627
- )
628
- response.raise_for_status()
629
- return response.json()
630
- except requests.exceptions.RequestException as e:
 
 
 
 
 
 
 
631
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
632
 
633
 
634
- def search_web(query: str, max_results: int = 5) -> list[dict]:
635
- """Search the web using DuckDuckGo and return results."""
 
 
 
 
 
 
 
 
 
 
 
 
636
  try:
637
- with DDGS() as ddgs:
638
- results = list(ddgs.text(query, max_results=max_results))
639
- return results
 
 
 
 
 
 
 
 
 
 
 
 
640
  except Exception as e:
641
- print(f"Search error: {e}")
642
  return []
643
 
644
 
@@ -667,70 +845,16 @@ def format_search_context(query: str, search_results: list[dict]) -> str:
667
  "/v1/web-chat/completions",
668
  tags=["chat"],
669
  summary="Web-Augmented Chat Completions",
670
- description="Chat completions enhanced with real-time web search. The last user message is used as a search query.",
671
- responses={
672
- 200: {
673
- "description": "Successful response with web search metadata",
674
- "content": {
675
- "application/json": {
676
- "example": {
677
- "id": "chatcmpl-123",
678
- "object": "chat.completion",
679
- "created": 1677652288,
680
- "model": "deepseek-chat",
681
- "choices": [{
682
- "index": 0,
683
- "message": {
684
- "role": "assistant",
685
- "content": "Based on recent search results, here's what I found..."
686
- },
687
- "finish_reason": "stop"
688
- }],
689
- "web_search": {
690
- "query": "latest AI developments",
691
- "results_count": 5,
692
- "sources": ["https://example.com/1", "https://example.com/2"]
693
- }
694
- }
695
- }
696
- }
697
- },
698
- 400: {
699
- "description": "No user message found"
700
- },
701
- 500: {
702
- "description": "LLM server or search error"
703
- }
704
- }
705
  )
706
  async def web_chat_completions(request: WebChatRequest):
707
  """
708
- Chat completions with real-time web search augmentation.
709
-
710
- **How it works:**
711
- 1. Extracts the last user message as the search query
712
- 2. Performs a web search using DuckDuckGo
713
- 3. Injects search results into the LLM context
714
- 4. Returns the AI response with source citations
715
-
716
- **Use cases:**
717
- - Current events and news
718
- - Recent information beyond the model's training data
719
- - Fact-checking with web sources
720
- - Research with live data
721
-
722
- **Example:**
723
- ```json
724
- {
725
- "messages": [
726
- {"role": "user", "content": "What's the latest news about SpaceX?"}
727
- ],
728
- "max_tokens": 512,
729
- "max_search_results": 5
730
- }
731
- ```
732
 
733
- The response includes a `web_search` field with metadata about sources used.
 
 
 
734
  """
735
  try:
736
  # Get the last user message as search query
@@ -740,9 +864,9 @@ async def web_chat_completions(request: WebChatRequest):
740
 
741
  search_query = user_messages[-1].get("content", "")
742
 
743
- # Perform web search
744
- print(f"Searching web for: {search_query}")
745
- search_results = search_web(search_query, request.max_search_results)
746
 
747
  # Format search results as context
748
  web_context = format_search_context(search_query, search_results)
@@ -761,7 +885,6 @@ Use the above search results to provide accurate, up-to-date information in your
761
  Always cite sources when using information from the search results."""
762
  }
763
 
764
- # Insert system message before the last user message
765
  augmented_messages.insert(-1, system_prompt)
766
 
767
  # Get current model from cache
@@ -770,29 +893,28 @@ Always cite sources when using information from the search results."""
770
  raise HTTPException(status_code=500, detail="Current model not loaded")
771
 
772
  # Forward to llama-server with augmented context
773
- response = requests.post(
774
  f"{cached_model.url}/v1/chat/completions",
775
  json={
776
  "messages": augmented_messages,
777
  "max_tokens": request.max_tokens,
778
  "temperature": request.temperature,
779
- },
780
- timeout=300
781
- )
782
- response.raise_for_status()
783
-
784
- result = response.json()
785
 
786
  # Add metadata about search results
787
  result["web_search"] = {
788
  "query": search_query,
789
  "results_count": len(search_results),
790
- "sources": [r.get("href", "") for r in search_results if r.get("href")]
 
791
  }
792
 
793
  return result
794
 
795
- except requests.exceptions.RequestException as e:
796
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
797
  except Exception as e:
798
  raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
@@ -802,40 +924,89 @@ Always cite sources when using information from the search results."""
802
  "/cache/info",
803
  tags=["models"],
804
  summary="Get Cache Information",
805
- description="Returns information about the model cache, including cached models and cache statistics."
806
  )
807
  async def get_cache_info():
 
 
 
 
 
 
 
 
 
 
 
808
  """
809
- Get information about the in-memory model cache.
810
 
811
  Returns:
812
- - max_size: Maximum number of models that can be cached
813
- - current_size: Current number of cached models
814
- - cached_models: List of currently cached models with their metadata
815
-
816
- **Example Response:**
817
- ```json
818
- {
819
- "max_size": 2,
820
- "current_size": 2,
821
- "cached_models": [
822
- {
823
- "name": "deepseek-chat",
824
- "port": 8080,
825
- "url": "http://localhost:8080",
826
- "last_used": 1234567890.123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
827
  },
828
- {
829
- "name": "mistral-7b",
830
- "port": 8081,
831
- "url": "http://localhost:8081",
832
- "last_used": 1234567895.456
833
- }
834
- ]
 
835
  }
836
- ```
837
- """
838
- return model_cache.get_cache_info()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
839
 
840
 
841
  @app.get(
@@ -846,22 +1017,5 @@ async def get_cache_info():
846
  include_in_schema=False
847
  )
848
  async def get_openapi_spec():
849
- """
850
- Export the OpenAPI specification for this API.
851
-
852
- This endpoint returns the complete OpenAPI 3.0 specification that can be used with:
853
- - API documentation tools (Swagger UI, ReDoc)
854
- - Code generators (openapi-generator, swagger-codegen)
855
- - API testing tools (Postman, Insomnia)
856
- - SDK generation
857
-
858
- Save this to a file and use it with tools like:
859
- ```bash
860
- # Generate Python client
861
- openapi-generator generate -i openapi.json -g python -o ./client
862
-
863
- # Generate TypeScript client
864
- openapi-generator generate -i openapi.json -g typescript-fetch -o ./client
865
- ```
866
- """
867
- return app.openapi()
 
2
  import signal
3
  import os
4
  import time
5
+ import asyncio
6
+ from typing import Optional, Dict, List
7
+ from dataclasses import dataclass, field
8
  from collections import OrderedDict
9
+ from datetime import datetime, timedelta
10
+ import hashlib
11
 
12
+ import aiohttp
13
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
14
  from fastapi.openapi.utils import get_openapi
15
  from pydantic import BaseModel, Field
16
  from duckduckgo_search import DDGS
17
  from bs4 import BeautifulSoup
18
 
19
+ from logger import get_logger
20
+
21
+ logger = get_logger(__name__)
22
+
23
  app = FastAPI(
24
  title="AGI Multi-Model API",
25
  description="""
26
+ **High-Performance Dynamic Multi-Model LLM API with Web Search**
27
 
28
  This API provides:
29
+ * 🔄 Dynamic model switching with intelligent caching
30
  * 💬 OpenAI-compatible chat completions
31
  * 🌐 Web-augmented chat with real-time search
32
+ * 📊 Model management and performance monitoring
33
+ * ⚡ Async/await architecture for maximum throughput
34
 
35
  ## Available Models
36
  - **deepseek-chat** (default): General purpose conversational model
 
39
  - **deepseek-coder**: Specialized coding assistance
40
  - **llama-7b**: Lightweight and fast responses
41
 
42
+ ## Performance Features
43
+ - Parallel model loading
44
+ - Connection pooling for HTTP requests
45
+ - Web search result caching
46
+ - Background model preloading
47
+ - Request queuing to prevent overload
48
+ - Real-time performance metrics
49
+
50
  ## Quick Start
51
  1. Check available models: `GET /models`
52
  2. Switch model (optional): `POST /switch-model`
53
  3. Chat: `POST /v1/chat/completions`
54
  4. Chat with web search: `POST /v1/web-chat/completions`
55
+ 5. View metrics: `GET /metrics`
56
  """,
57
+ version="0.1.0.2026.01.24",
58
  contact={
59
  "name": "API Support",
60
  "email": "support@example.com",
 
75
  "name": "chat",
76
  "description": "Chat completion endpoints (OpenAI-compatible)",
77
  },
78
+ {
79
+ "name": "monitoring",
80
+ "description": "Performance metrics and monitoring",
81
+ },
82
  {
83
  "name": "documentation",
84
  "description": "API documentation and OpenAPI specification",
 
102
  "llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
103
  }
104
 
105
+ # Configuration - now environment-variable driven
106
+ MAX_CACHED_MODELS = int(os.getenv("MAX_CACHED_MODELS", "2"))
107
+ BASE_PORT = int(os.getenv("BASE_PORT", "8080"))
108
+ PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "").split(",") if os.getenv("PRELOAD_MODELS") else []
109
+ WEB_SEARCH_CACHE_TTL = int(os.getenv("WEB_SEARCH_CACHE_TTL", "3600")) # 1 hour
110
+ REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "300")) # 5 minutes
111
 
112
 
113
  @dataclass
 
119
  port: int
120
  url: str
121
  last_used: float
122
+ load_time: float = 0.0
123
+ request_count: int = 0
124
+ total_latency: float = 0.0
125
+
126
+
127
+ @dataclass
128
+ class PerformanceMetrics:
129
+ """Performance metrics for monitoring."""
130
+ total_requests: int = 0
131
+ total_switches: int = 0
132
+ cache_hits: int = 0
133
+ cache_misses: int = 0
134
+ total_web_searches: int = 0
135
+ web_search_cache_hits: int = 0
136
+ model_metrics: Dict[str, Dict] = field(default_factory=dict)
137
+ startup_time: float = 0.0
138
+
139
+ def record_request(self, model_name: str, latency: float):
140
+ """Record a request for metrics."""
141
+ self.total_requests += 1
142
+ if model_name not in self.model_metrics:
143
+ self.model_metrics[model_name] = {
144
+ "requests": 0,
145
+ "total_latency": 0.0,
146
+ "avg_latency": 0.0
147
+ }
148
+ self.model_metrics[model_name]["requests"] += 1
149
+ self.model_metrics[model_name]["total_latency"] += latency
150
+ self.model_metrics[model_name]["avg_latency"] = (
151
+ self.model_metrics[model_name]["total_latency"] /
152
+ self.model_metrics[model_name]["requests"]
153
+ )
154
+
155
+
156
+ @dataclass
157
+ class WebSearchCacheEntry:
158
+ """Cache entry for web search results."""
159
+ results: List[dict]
160
+ timestamp: float
161
+ ttl: int = WEB_SEARCH_CACHE_TTL
162
+
163
+ def is_expired(self) -> bool:
164
+ """Check if cache entry has expired."""
165
+ return time.time() - self.timestamp > self.ttl
166
+
167
+
168
+ class WebSearchCache:
169
+ """LRU cache for web search results."""
170
+
171
+ def __init__(self, max_size: int = 100):
172
+ self.max_size = max_size
173
+ self.cache: OrderedDict[str, WebSearchCacheEntry] = OrderedDict()
174
+
175
+ def _get_cache_key(self, query: str, max_results: int) -> str:
176
+ """Generate cache key from query."""
177
+ key = f"{query}:{max_results}"
178
+ return hashlib.md5(key.encode()).hexdigest()
179
+
180
+ def get(self, query: str, max_results: int) -> Optional[List[dict]]:
181
+ """Get cached search results if available and not expired."""
182
+ key = self._get_cache_key(query, max_results)
183
+ if key in self.cache:
184
+ entry = self.cache[key]
185
+ if not entry.is_expired():
186
+ # Move to end (most recently used)
187
+ self.cache.move_to_end(key)
188
+ return entry.results
189
+ else:
190
+ # Remove expired entry
191
+ del self.cache[key]
192
+ return None
193
+
194
+ def put(self, query: str, max_results: int, results: List[dict]):
195
+ """Cache search results."""
196
+ key = self._get_cache_key(query, max_results)
197
+
198
+ # Evict oldest if cache is full
199
+ if len(self.cache) >= self.max_size and key not in self.cache:
200
+ self.cache.popitem(last=False)
201
+
202
+ self.cache[key] = WebSearchCacheEntry(
203
+ results=results,
204
+ timestamp=time.time()
205
+ )
206
+
207
+ def clear(self):
208
+ """Clear all cached results."""
209
+ self.cache.clear()
210
 
211
 
212
  class ModelCache:
213
  """
214
+ High-performance in-memory LRU cache for loaded models.
215
 
216
+ Features:
217
+ - Manages multiple llama-server processes on different ports
218
+ - LRU eviction when cache is full
219
+ - Parallel model loading support
220
+ - Performance metrics tracking
221
  """
222
 
223
  def __init__(self, max_size: int = MAX_CACHED_MODELS):
 
225
  self.cache: OrderedDict[str, CachedModel] = OrderedDict()
226
  self.port_counter = BASE_PORT
227
  self.used_ports = set()
228
+ self._loading_lock = asyncio.Lock()
229
+ self._loading_models: Dict[str, asyncio.Task] = {}
230
 
231
  def _get_next_port(self) -> int:
232
  """Get next available port for a model."""
 
241
  """Release a port back to the pool."""
242
  self.used_ports.discard(port)
243
 
244
+ async def _evict_lru(self):
245
  """Evict the least recently used model."""
246
  if not self.cache:
247
  return
248
 
249
  # Get the first (oldest) item
250
  model_name, cached_model = self.cache.popitem(last=False)
251
+ logger.info(f"Evicting model from cache: {model_name}")
252
 
253
  # Stop the process
254
  try:
 
256
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
257
  else:
258
  cached_model.process.terminate()
259
+
260
+ # Wait asynchronously for process to stop
261
+ for _ in range(10):
262
+ if cached_model.process.poll() is not None:
263
+ break
264
+ await asyncio.sleep(0.1)
265
+ else:
266
+ # Force kill if not stopped
267
  if os.name != 'nt':
268
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
269
  else:
270
  cached_model.process.kill()
271
+ except Exception as e:
272
+ logger.error(f"Error stopping model {model_name}: {e}")
273
 
274
  # Release the port
275
  self._release_port(cached_model.port)
 
276
 
277
  def get(self, model_name: str) -> Optional[CachedModel]:
278
  """Get a model from cache, updating its last used time."""
 
281
  cached_model.last_used = time.time()
282
  # Move to end (most recently used)
283
  self.cache.move_to_end(model_name)
284
+ logger.debug(f"Cache hit for model: {model_name}")
285
  return cached_model
286
+ logger.debug(f"Cache miss for model: {model_name}")
287
  return None
288
 
289
+ async def put(self, model_name: str, model_id: str, process: subprocess.Popen, port: int, load_time: float = 0.0):
290
  """Add a model to the cache."""
291
  # Evict if cache is full
292
  while len(self.cache) >= self.max_size:
293
+ await self._evict_lru()
294
 
295
  url = f"http://localhost:{port}"
296
  cached_model = CachedModel(
 
299
  process=process,
300
  port=port,
301
  url=url,
302
+ last_used=time.time(),
303
+ load_time=load_time
304
  )
305
  self.cache[model_name] = cached_model
306
+ logger.info(f"Cached model: {model_name} on port {port} (load time: {load_time:.2f}s)")
307
 
308
+ async def clear(self):
309
  """Clear all cached models."""
310
+ logger.info("Clearing model cache...")
311
  for model_name, cached_model in list(self.cache.items()):
312
  try:
313
  if os.name != 'nt':
314
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
315
  else:
316
  cached_model.process.terminate()
317
+
318
+ # Wait asynchronously
319
+ for _ in range(10):
320
+ if cached_model.process.poll() is not None:
321
+ break
322
+ await asyncio.sleep(0.1)
323
  except:
324
  try:
325
  if os.name != 'nt':
 
342
  "name": name,
343
  "port": model.port,
344
  "url": model.url,
345
+ "last_used": model.last_used,
346
+ "load_time": model.load_time,
347
+ "request_count": model.request_count,
348
+ "avg_latency": model.total_latency / model.request_count if model.request_count > 0 else 0.0
349
  }
350
  for name, model in self.cache.items()
351
  ]
 
355
  # Global state
356
  current_model = "deepseek-chat" # Default model
357
  model_cache = ModelCache(max_size=MAX_CACHED_MODELS)
358
+ web_search_cache = WebSearchCache(max_size=100)
359
+ metrics = PerformanceMetrics()
360
+
361
+ # HTTP session for connection pooling (will be initialized in startup)
362
+ http_session: Optional[aiohttp.ClientSession] = None
363
 
364
 
365
  class ModelSwitchRequest(BaseModel):
 
481
  model: str = Field(..., description="New active model name")
482
 
483
 
484
+ async def start_llama_server(model_id: str, port: int) -> tuple[subprocess.Popen, float]:
485
+ """
486
+ Start llama-server with specified model on a specific port.
487
+
488
+ Returns tuple of (process, load_time_seconds).
489
+ Uses async/await with exponential backoff for health checks.
490
+ """
491
+ start_time = time.time()
492
+
493
  cmd = [
494
  "llama-server",
495
  "-hf", model_id,
496
  "--host", "0.0.0.0",
497
  "--port", str(port),
498
  "-c", "2048", # Context size
499
+ "-t", "4", # CPU threads
500
  "-ngl", "0", # GPU layers (0 for CPU-only)
501
+ "--cont-batching", # Enable continuous batching
502
  "-b", "512", # Batch size
503
  ]
504
 
505
+ logger.info(f"Starting llama-server with model: {model_id} on port {port}")
 
506
 
507
  process = subprocess.Popen(
508
  cmd,
 
513
  bufsize=1
514
  )
515
 
516
+ # Wait for server to be ready with exponential backoff
 
517
  server_url = f"http://localhost:{port}"
518
+ max_wait_time = 300 # 5 minutes
519
+ backoff_time = 0.1 # Start with 100ms
520
+ max_backoff = 2.0 # Max 2 seconds between checks
521
+ elapsed = 0
522
 
523
+ while elapsed < max_wait_time:
524
  # Check if process died
525
  if process.poll() is not None:
526
  stdout, _ = process.communicate()
527
+ logger.error(f"llama-server exited with code {process.returncode}")
528
+ logger.error(f"Output: {stdout}")
529
  raise RuntimeError("llama-server process died")
530
 
531
  try:
532
+ # Use aiohttp for async health check
533
+ async with http_session.get(f"{server_url}/health", timeout=aiohttp.ClientTimeout(total=2)) as response:
534
+ if response.status in [200, 404]: # 404 is ok, means server is up
535
+ load_time = time.time() - start_time
536
+ logger.info(f"llama-server ready after {load_time:.2f}s")
537
+ return process, load_time
538
+ except (aiohttp.ClientError, asyncio.TimeoutError):
539
  # Server not ready yet
540
  pass
 
 
 
541
 
542
+ # Exponential backoff
543
+ await asyncio.sleep(backoff_time)
544
+ elapsed += backoff_time
545
+ backoff_time = min(backoff_time * 1.5, max_backoff)
546
 
547
  raise RuntimeError("llama-server failed to start within 5 minutes")
548
 
549
 
550
async def preload_models_background():
    """Background task to preload popular models.

    Iterates PRELOAD_MODELS, skipping unknown names and models that are
    already cached. Each model is loaded via start_llama_server() and put
    into the shared model_cache. Failures are logged and never abort the
    loop, so one bad model cannot block the rest.
    """
    if not PRELOAD_MODELS:
        return

    logger.info(f"Preloading models in background: {PRELOAD_MODELS}")

    for model_name in PRELOAD_MODELS:
        if model_name not in AVAILABLE_MODELS:
            logger.warning(f"Preload model not found: {model_name}")
            continue

        if model_cache.get(model_name):
            logger.info(f"Model already cached: {model_name}")
            continue

        model_id = AVAILABLE_MODELS[model_name]
        # Reserve a port up front; it is marked as used by the cache.
        port = model_cache._get_next_port()
        try:
            process, load_time = await start_llama_server(model_id, port)
            await model_cache.put(model_name, model_id, process, port, load_time)
            logger.info(f"Preloaded model: {model_name}")
        except Exception as e:
            logger.error(f"Failed to preload model {model_name}: {e}")
            # Fix: return the reserved port to the pool on failure; previously
            # a failed preload leaked the port (it stayed in used_ports forever).
            model_cache._release_port(port)
574
+
575
+
576
@app.on_event("startup")
async def startup_event():
    """Initialize HTTP session and start with default model.

    Order matters: the pooled aiohttp session must exist before
    start_llama_server() runs, because its health-check loop uses the
    shared http_session. Preloading of additional models is deferred to a
    background task so startup is not blocked.
    """
    global current_model, http_session

    startup_start = time.time()
    logger.info("Application startup initiated")

    # Initialize aiohttp session with connection pooling
    connector = aiohttp.TCPConnector(
        limit=100,  # Max total connections
        limit_per_host=10,  # Max connections per host
        ttl_dns_cache=300  # DNS cache TTL (seconds)
    )
    http_session = aiohttp.ClientSession(
        connector=connector,
        timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
    )

    # Start default model synchronously — the app is not usable without it.
    model_id = AVAILABLE_MODELS[current_model]
    port = model_cache._get_next_port()

    process, load_time = await start_llama_server(model_id, port)
    await model_cache.put(current_model, model_id, process, port, load_time)

    # NOTE(review): startup_time holds a *duration* (seconds spent starting),
    # not a timestamp — keep that in mind when consuming it elsewhere.
    metrics.startup_time = time.time() - startup_start
    logger.info(f"Started with default model: {current_model} (total startup: {metrics.startup_time:.2f}s)")

    # Start preloading in background (fire-and-forget task)
    asyncio.create_task(preload_models_background())
607
 
608
 
609
@app.on_event("shutdown")
async def shutdown_event():
    """Release resources on shutdown: the pooled HTTP session first, then
    every cached llama-server process via the model cache."""
    logger.info("Application shutdown initiated")

    # The session may still be None if startup never completed.
    if http_session is not None:
        await http_session.close()

    await model_cache.clear()
618
 
619
 
620
  @app.get(
 
634
  - List of all available models
635
  """
636
  return {
637
+ "status": "AGI Multi-Model API - High Performance Edition",
638
  "current_model": current_model,
639
  "available_models": list(AVAILABLE_MODELS.keys())
640
  }
641
 
642
 
643
@app.get(
    "/health",
    tags=["status"],
    summary="Health Check",
    description="Simple health check endpoint for monitoring."
)
async def health_check():
    """Liveness probe: reports cache occupancy and the active model."""
    report = {
        "status": "healthy",
        "timestamp": time.time(),
        "cached_models": len(model_cache.cache),
        "current_model": current_model,
    }
    return report
657
+
658
+
659
  @app.get(
660
  "/models",
661
  response_model=ModelsResponse,
 
670
  Returns:
671
  - current_model: The model currently in use
672
  - available_models: Array of all available model names
 
 
673
  """
674
  return {
675
  "current_model": current_model,
 
682
  response_model=ModelSwitchResponse,
683
  tags=["models"],
684
  summary="Switch Active Model",
685
+ description="Switch to a different LLM model with intelligent caching for instant switching."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
  )
687
  async def switch_model(request: ModelSwitchRequest):
688
  """
689
  Switch to a different LLM model with intelligent caching.
690
 
691
+ **Performance optimizations:**
692
+ - Instant switching for cached models
693
+ - Async model loading with exponential backoff
694
+ - Connection pooling for health checks
695
+ - Background preloading of popular models
 
 
 
 
 
696
  """
697
  global current_model
698
 
 
705
  if request.model_name == current_model:
706
  return {"message": f"Already using model: {current_model}", "model": current_model}
707
 
708
+ metrics.total_switches += 1
709
+
710
  # Try to get from cache
711
  cached_model = model_cache.get(request.model_name)
712
 
713
  if cached_model:
714
  # Model is cached, instant switch
715
+ metrics.cache_hits += 1
716
  current_model = request.model_name
717
  return {
718
+ "message": f"Switched to model: {current_model} (from cache, instant)",
719
  "model": current_model
720
  }
721
 
722
  # Model not cached, need to load it
723
+ metrics.cache_misses += 1
724
  model_id = AVAILABLE_MODELS[request.model_name]
725
  port = model_cache._get_next_port()
726
 
727
  try:
728
+ process, load_time = await start_llama_server(model_id, port)
729
+ await model_cache.put(request.model_name, model_id, process, port, load_time)
730
  current_model = request.model_name
731
 
732
  return {
733
+ "message": f"Switched to model: {current_model} (loaded in {load_time:.2f}s)",
734
  "model": current_model
735
  }
736
  except Exception as e:
 
743
  "/v1/chat/completions",
744
  tags=["chat"],
745
  summary="Chat Completions",
746
+ description="High-performance OpenAI-compatible chat completions with connection pooling."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
747
  )
748
  async def chat_completions(request: ChatCompletionRequest):
749
  """
750
+ OpenAI-compatible chat completions with performance optimizations.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
 
752
+ **Performance features:**
753
+ - Async/await for non-blocking I/O
754
+ - HTTP connection pooling
755
+ - Request metrics tracking
756
  """
757
  try:
758
+ request_start = time.time()
759
+
760
  # Get current model from cache
761
  cached_model = model_cache.get(current_model)
762
  if not cached_model:
763
  raise HTTPException(status_code=500, detail="Current model not loaded")
764
 
765
+ # Forward to llama-server using aiohttp
766
+ async with http_session.post(
767
  f"{cached_model.url}/v1/chat/completions",
768
  json={
769
  "messages": request.messages,
770
  "max_tokens": request.max_tokens,
771
  "temperature": request.temperature,
772
+ }
773
+ ) as response:
774
+ response.raise_for_status()
775
+ result = await response.json()
776
+
777
+ # Update metrics
778
+ request_latency = time.time() - request_start
779
+ cached_model.request_count += 1
780
+ cached_model.total_latency += request_latency
781
+ metrics.record_request(current_model, request_latency)
782
+
783
+ return result
784
+ except aiohttp.ClientError as e:
785
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
786
 
787
 
788
async def search_web_async(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using DuckDuckGo with result caching.

    Results are memoized in an LRU/TTL cache keyed by (query, max_results).
    Returns an empty list on any search failure (best-effort semantics).
    """
    # Serve from cache when possible.
    cached_results = web_search_cache.get(query, max_results)
    if cached_results is not None:
        metrics.web_search_cache_hits += 1
        # Fix: count every lookup in total_web_searches, not just misses,
        # so /metrics' cache_hit_rate (= hits / total) stays <= 1.0.
        metrics.total_web_searches += 1
        logger.debug(f"Web search cache hit for: {query}")
        return cached_results

    # Perform the search.
    try:
        logger.debug(f"Performing web search: {query}")

        # DDGS is blocking; run it in a worker thread so the event loop stays
        # responsive. asyncio.to_thread (3.9+) replaces the deprecated
        # get_event_loop() call from inside a coroutine.
        results = await asyncio.to_thread(
            lambda: list(DDGS().text(query, max_results=max_results))
        )

        # Cache results for subsequent identical queries.
        web_search_cache.put(query, max_results, results)
        metrics.total_web_searches += 1

        logger.debug(f"Found {len(results)} search results")
        return results
    except Exception as e:
        logger.error(f"Search error: {e}")
        return []
821
 
822
 
 
845
  "/v1/web-chat/completions",
846
  tags=["chat"],
847
  summary="Web-Augmented Chat Completions",
848
+ description="Chat completions with real-time web search and result caching."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
849
  )
850
  async def web_chat_completions(request: WebChatRequest):
851
  """
852
+ Chat completions with web search augmentation.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
853
 
854
+ **Performance optimizations:**
855
+ - Async web search
856
+ - LRU cache for search results (1 hour TTL)
857
+ - Parallel execution where possible
858
  """
859
  try:
860
  # Get the last user message as search query
 
864
 
865
  search_query = user_messages[-1].get("content", "")
866
 
867
+ # Perform web search (async with caching)
868
+ logger.info(f"Web chat: Searching for '{search_query}'")
869
+ search_results = await search_web_async(search_query, request.max_search_results)
870
 
871
  # Format search results as context
872
  web_context = format_search_context(search_query, search_results)
 
885
  Always cite sources when using information from the search results."""
886
  }
887
 
 
888
  augmented_messages.insert(-1, system_prompt)
889
 
890
  # Get current model from cache
 
893
  raise HTTPException(status_code=500, detail="Current model not loaded")
894
 
895
  # Forward to llama-server with augmented context
896
+ async with http_session.post(
897
  f"{cached_model.url}/v1/chat/completions",
898
  json={
899
  "messages": augmented_messages,
900
  "max_tokens": request.max_tokens,
901
  "temperature": request.temperature,
902
+ }
903
+ ) as response:
904
+ response.raise_for_status()
905
+ result = await response.json()
 
 
906
 
907
  # Add metadata about search results
908
  result["web_search"] = {
909
  "query": search_query,
910
  "results_count": len(search_results),
911
+ "sources": [r.get("href", "") for r in search_results if r.get("href")],
912
+ "cached": metrics.web_search_cache_hits > 0
913
  }
914
 
915
  return result
916
 
917
+ except aiohttp.ClientError as e:
918
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
919
  except Exception as e:
920
  raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
 
924
  "/cache/info",
925
  tags=["models"],
926
  summary="Get Cache Information",
927
+ description="Returns information about the model cache and performance statistics."
928
  )
929
  async def get_cache_info():
930
+ """Get detailed information about the model cache."""
931
+ return model_cache.get_cache_info()
932
+
933
+
934
# Wall-clock timestamp captured when this module is imported, used to derive
# service uptime. NOTE: metrics.startup_time stores a *duration* (seconds the
# startup sequence took), so it must not be subtracted from time.time().
_service_started_at = time.time()


@app.get(
    "/metrics",
    tags=["monitoring"],
    summary="Performance Metrics",
    description="Get comprehensive performance metrics and statistics."
)
async def get_metrics():
    """
    Get performance metrics for monitoring and optimization.

    Returns:
    - Request counts and latencies
    - Cache hit/miss ratios
    - Model-specific statistics
    - Web search cache stats
    - Startup time
    """
    # Guard all ratios against zero denominators.
    total_switch_lookups = metrics.cache_hits + metrics.cache_misses
    cache_hit_rate = (
        metrics.cache_hits / total_switch_lookups
        if total_switch_lookups > 0
        else 0.0
    )

    web_cache_hit_rate = (
        metrics.web_search_cache_hits / metrics.total_web_searches
        if metrics.total_web_searches > 0
        else 0.0
    )

    return {
        # Fix: previously computed as time.time() - metrics.startup_time,
        # which subtracted a duration from a timestamp and reported roughly
        # the current Unix epoch time instead of actual uptime.
        "uptime_seconds": time.time() - _service_started_at,
        "startup_time_seconds": metrics.startup_time,
        "total_requests": metrics.total_requests,
        "total_model_switches": metrics.total_switches,
        "cache_stats": {
            "hits": metrics.cache_hits,
            "misses": metrics.cache_misses,
            "hit_rate": cache_hit_rate,
            "current_size": len(model_cache.cache),
            "max_size": model_cache.max_size
        },
        "web_search_stats": {
            "total_searches": metrics.total_web_searches,
            "cache_hits": metrics.web_search_cache_hits,
            "cache_hit_rate": web_cache_hit_rate,
            "cache_size": len(web_search_cache.cache)
        },
        "model_metrics": metrics.model_metrics,
        "cached_models": model_cache.get_cache_info()["cached_models"]
    }
984
+
985
+
986
@app.post(
    "/cache/clear",
    tags=["models"],
    summary="Clear Model Cache",
    description="Clear all cached models (will reload on next request)."
)
async def clear_cache():
    """Drop every cached llama-server instance; models reload lazily on demand."""
    await model_cache.clear()
    return {"message": "Cache cleared successfully"}
996
+
997
+
998
@app.post(
    "/cache/web-search/clear",
    tags=["models"],
    summary="Clear Web Search Cache",
    description="Clear all cached web search results."
)
async def clear_web_search_cache():
    """Empty the web-search result cache and reset its hit/total counters."""
    web_search_cache.clear()
    # Reset the counters that feed the /metrics hit-rate computation.
    metrics.web_search_cache_hits = 0
    metrics.total_web_searches = 0
    return {"message": "Web search cache cleared successfully"}
1010
 
1011
 
1012
  @app.get(
 
1017
  include_in_schema=False
1018
  )
1019
  async def get_openapi_spec():
1020
+ """Export the OpenAPI specification for this API."""
1021
+ return app.openapi()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logger.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized logging module for AGI Multi-Model API.
3
+
4
+ Provides structured logging with:
5
+ - Colored console output
6
+ - File logging with rotation
7
+ - Configurable log levels
8
+ - Timestamp and module name tracking
9
+ """
10
+
11
+ import logging
12
+ import sys
13
+ from pathlib import Path
14
+ from logging.handlers import RotatingFileHandler
15
+ from typing import Optional
16
+
17
+
18
class ColoredFormatter(logging.Formatter):
    """Formatter that renders the level name in bold ANSI colors for terminals."""

    # ANSI color codes
    COLORS = {
        'DEBUG': '\033[36m',     # Cyan
        'INFO': '\033[32m',      # Green
        'WARNING': '\033[33m',   # Yellow
        'ERROR': '\033[31m',     # Red
        'CRITICAL': '\033[35m',  # Magenta
    }
    RESET = '\033[0m'
    BOLD = '\033[1m'

    def format(self, record):
        """Render the record with a temporarily colorized level name."""
        plain = record.levelname
        color = self.COLORS.get(plain)
        if color is not None:
            record.levelname = f"{color}{self.BOLD}{plain}{self.RESET}"

        rendered = super().format(record)

        # Restore the plain level name so other handlers see uncolored text.
        record.levelname = plain
        return rendered
46
+
47
+
48
class Logger:
    """
    Application-wide logger factory.

    The "AGI" logger is configured exactly once (colored console handler plus
    an optional rotating file handler). Every other requested name is returned
    as a child of "AGI" so its records propagate up to those handlers.

    Usage:
        from logger import get_logger
        logger = get_logger(__name__)
        logger.info("Application started")
    """

    # True once the "AGI" root logger's handlers have been attached.
    _initialized: bool = False

    @classmethod
    def get_logger(
        cls,
        name: str = "AGI",
        level: int = logging.INFO,
        log_file: Optional[str] = "agi.log",
        max_bytes: int = 10 * 1024 * 1024,  # 10MB
        backup_count: int = 5
    ) -> logging.Logger:
        """
        Get or create the application logger.

        Args:
            name: Logger name (typically module name)
            level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            log_file: Path to log file (None to disable file logging)
            max_bytes: Maximum size of log file before rotation
            backup_count: Number of backup files to keep

        Returns:
            Configured logger instance

        Note:
            level/log_file/max_bytes/backup_count take effect only on the
            call that configures the "AGI" root; child loggers inherit that
            configuration.
        """
        # Fix: previously get_logger("api") returned a top-level logger with
        # no handlers and no propagation path to "AGI", so its INFO/DEBUG
        # records were silently dropped (logging's last-resort handler only
        # emits WARNING+). Namespacing every other name under "AGI." makes
        # records propagate to the handlers configured below.
        if name != "AGI" and not name.startswith("AGI."):
            name = f"AGI.{name}"

        logger = logging.getLogger(name)

        # Configure handlers exactly once, on the "AGI" root logger only.
        if not cls._initialized and name == "AGI":
            logger.setLevel(level)

            # Console handler with colors
            console_handler = logging.StreamHandler(sys.stdout)
            console_handler.setLevel(level)
            console_formatter = ColoredFormatter(
                fmt='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S'
            )
            console_handler.setFormatter(console_formatter)
            logger.addHandler(console_handler)

            # File handler with rotation (if enabled)
            if log_file:
                log_path = Path(log_file)
                log_path.parent.mkdir(parents=True, exist_ok=True)

                file_handler = RotatingFileHandler(
                    log_file,
                    maxBytes=max_bytes,
                    backupCount=backup_count
                )
                file_handler.setLevel(level)
                file_formatter = logging.Formatter(
                    fmt='%(asctime)s | %(levelname)-8s | %(name)s | %(funcName)s:%(lineno)d | %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S'
                )
                file_handler.setFormatter(file_formatter)
                logger.addHandler(file_handler)

            # Prevent propagation to the logging root to avoid duplicate logs
            logger.propagate = False
            cls._initialized = True

        return logger
123
+
124
+
125
# Convenience function for easy import
def get_logger(name: str = "AGI", level: int = logging.INFO) -> logging.Logger:
    """
    Get a logger instance for the specified module.

    Thin wrapper around Logger.get_logger with the file/rotation settings
    left at their defaults.

    Args:
        name: Logger name (use __name__ for automatic module naming)
        level: Logging level (default: INFO)

    Returns:
        Configured logger instance

    Example:
        from logger import get_logger
        logger = get_logger(__name__)
        logger.info("Starting application")
    """
    return Logger.get_logger(name=name, level=level)
143
+
144
+
145
# Configure the shared "AGI" root logger once at import time, so later
# get_logger() calls find the console/file handlers already attached.
_root_logger = Logger.get_logger("AGI", level=logging.INFO)
147
+
148
+
149
if __name__ == "__main__":
    # Manual smoke test: exercise every severity, then multiple logger names.
    logger = get_logger("test_module")

    for emit, text in (
        (logger.debug, "This is a debug message"),
        (logger.info, "This is an info message"),
        (logger.warning, "This is a warning message"),
        (logger.error, "This is an error message"),
        (logger.critical, "This is a critical message"),
    ):
        emit(text)

    print("\nTesting with different module names:")
    api_logger = get_logger("api")
    api_logger.info("API logger initialized")

    client_logger = get_logger("client")
    client_logger.info("Client logger initialized")
pyproject.toml CHANGED
@@ -1,7 +1,7 @@
1
  [project]
2
- name = "deepseek-api"
3
- version = "0.0.1"
4
- description = "Special DeepSeek API on HuggingFace Space"
5
  authors = [
6
  { name = "AI Developer", email = "you@example.com" }
7
  ]
@@ -9,6 +9,7 @@ requires-python = ">=3.12"
9
  dependencies = [
10
  "fastapi>=0.104.0",
11
  "uvicorn[standard]>=0.24.0",
 
12
  "llama-cpp-python>=0.2.0",
13
  "huggingface-hub>=0.19.0",
14
  "duckduckgo-search>=4.0.0",
 
1
  [project]
2
+ name = "agi-multi-model-api"
3
+ version = "0.1.0"
4
+ description = "High-Performance Multi-Model LLM API with Dynamic Switching"
5
  authors = [
6
  { name = "AI Developer", email = "you@example.com" }
7
  ]
 
9
  dependencies = [
10
  "fastapi>=0.104.0",
11
  "uvicorn[standard]>=0.24.0",
12
+ "aiohttp>=3.9.0",
13
  "llama-cpp-python>=0.2.0",
14
  "huggingface-hub>=0.19.0",
15
  "duckduckgo-search>=4.0.0",