Dmitry Beresnev commited on
Commit
7763bf4
·
1 Parent(s): c384ef1

fix gitignore, app and logger, etc

Browse files
Files changed (5) hide show
  1. .gitignore +133 -0
  2. Dockerfile +2 -1
  3. app.py +436 -282
  4. logger.py +164 -0
  5. pyproject.toml +4 -3
.gitignore ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ MANIFEST
24
+ pip-log.txt
25
+ pip-delete-this-directory.txt
26
+
27
+ # Virtual Environment
28
+ .venv/
29
+ venv/
30
+ ENV/
31
+ env/
32
+ .virtualenv
33
+
34
+ # PyInstaller
35
+ *.manifest
36
+ *.spec
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+ .coverage
43
+ .coverage.*
44
+ .cache
45
+ nosetests.xml
46
+ coverage.xml
47
+ *.cover
48
+ *.py,cover
49
+ .hypothesis/
50
+ .pytest_cache/
51
+ cover/
52
+
53
+ # IDEs
54
+ .idea/
55
+ .vscode/
56
+ *.swp
57
+ *.swo
58
+ *~
59
+ .DS_Store
60
+
61
+ # Jupyter Notebook
62
+ .ipynb_checkpoints
63
+ *.ipynb
64
+
65
+ # PyCharm
66
+ .idea/
67
+ *.iml
68
+ *.iws
69
+
70
+ # Logs
71
+ *.log
72
+ logs/
73
+ agi.log
74
+
75
+ # Environment variables
76
+ .env
77
+ .env.local
78
+ .env.*.local
79
+ *.env
80
+
81
+ # Database
82
+ *.db
83
+ *.sqlite
84
+ *.sqlite3
85
+
86
+ # Model files (often large)
87
+ *.bin
88
+ *.gguf
89
+ *.safetensors
90
+ models/
91
+ checkpoints/
92
+
93
+ # Docker
94
+ .dockerignore
95
+ docker-compose.override.yml
96
+
97
+ # OS
98
+ .DS_Store
99
+ Thumbs.db
100
+ Desktop.ini
101
+ $RECYCLE.BIN/
102
+ *.cab
103
+ *.msi
104
+ *.msix
105
+ *.msm
106
+ *.msp
107
+ *.lnk
108
+
109
+ # mypy
110
+ .mypy_cache/
111
+ .dmypy.json
112
+ dmypy.json
113
+
114
+ # Pyre type checker
115
+ .pyre/
116
+
117
+ # pytype static type analyzer
118
+ .pytype/
119
+
120
+ # Cython debug symbols
121
+ cython_debug/
122
+
123
+ # Temporary files
124
+ *.tmp
125
+ *.temp
126
+ tmp/
127
+ temp/
128
+
129
+ #
130
+ *.minimal
131
+ tests/
132
+ *.md
133
+ docs/
Dockerfile CHANGED
@@ -49,7 +49,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
49
  && rm -rf /var/lib/apt/lists/*
50
 
51
  # Install Python packages
52
- RUN pip3 install --no-cache-dir fastapi uvicorn requests pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages
53
 
54
  # Create non-root user
55
  RUN useradd -m -u 1000 user && \
@@ -58,6 +58,7 @@ RUN useradd -m -u 1000 user && \
58
 
59
  # Copy application code
60
  COPY --chown=user:user app.py /home/user/app.py
 
61
 
62
  USER user
63
  WORKDIR /home/user
 
49
  && rm -rf /var/lib/apt/lists/*
50
 
51
  # Install Python packages
52
+ RUN pip3 install --no-cache-dir fastapi uvicorn aiohttp pydantic duckduckgo-search beautifulsoup4 lxml --break-system-packages
53
 
54
  # Create non-root user
55
  RUN useradd -m -u 1000 user && \
 
58
 
59
  # Copy application code
60
  COPY --chown=user:user app.py /home/user/app.py
61
+ COPY --chown=user:user logger.py /home/user/logger.py
62
 
63
  USER user
64
  WORKDIR /home/user
app.py CHANGED
@@ -2,27 +2,35 @@ import subprocess
2
  import signal
3
  import os
4
  import time
5
- from typing import Optional, Dict
6
- from dataclasses import dataclass
 
7
  from collections import OrderedDict
 
 
8
 
9
- import requests
10
- from fastapi import FastAPI, HTTPException
11
  from fastapi.openapi.utils import get_openapi
12
  from pydantic import BaseModel, Field
13
  from duckduckgo_search import DDGS
14
  from bs4 import BeautifulSoup
15
 
 
 
 
 
16
  app = FastAPI(
17
  title="AGI Multi-Model API",
18
  description="""
19
- **Dynamic Multi-Model LLM API with Web Search Capabilities**
20
 
21
  This API provides:
22
- * 🔄 Dynamic model switching between multiple LLM models
23
  * 💬 OpenAI-compatible chat completions
24
  * 🌐 Web-augmented chat with real-time search
25
- * 📊 Model management and status monitoring
 
26
 
27
  ## Available Models
28
  - **deepseek-chat** (default): General purpose conversational model
@@ -31,13 +39,22 @@ app = FastAPI(
31
  - **deepseek-coder**: Specialized coding assistance
32
  - **llama-7b**: Lightweight and fast responses
33
 
 
 
 
 
 
 
 
 
34
  ## Quick Start
35
  1. Check available models: `GET /models`
36
  2. Switch model (optional): `POST /switch-model`
37
  3. Chat: `POST /v1/chat/completions`
38
  4. Chat with web search: `POST /v1/web-chat/completions`
 
39
  """,
40
- version="0.0.1.2025.12.04",
41
  contact={
42
  "name": "API Support",
43
  "email": "support@example.com",
@@ -58,6 +75,10 @@ app = FastAPI(
58
  "name": "chat",
59
  "description": "Chat completion endpoints (OpenAI-compatible)",
60
  },
 
 
 
 
61
  {
62
  "name": "documentation",
63
  "description": "API documentation and OpenAPI specification",
@@ -81,9 +102,12 @@ AVAILABLE_MODELS = {
81
  "llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
82
  }
83
 
84
- # Configuration
85
- MAX_CACHED_MODELS = 2 # Maximum number of models to keep in memory
86
- BASE_PORT = 8080 # Starting port for llama-server instances
 
 
 
87
 
88
 
89
  @dataclass
@@ -95,14 +119,105 @@ class CachedModel:
95
  port: int
96
  url: str
97
  last_used: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
 
100
  class ModelCache:
101
  """
102
- In-memory LRU cache for loaded models.
103
 
104
- Manages multiple llama-server processes, each on a different port.
105
- Automatically evicts least recently used models when cache is full.
 
 
 
106
  """
107
 
108
  def __init__(self, max_size: int = MAX_CACHED_MODELS):
@@ -110,6 +225,8 @@ class ModelCache:
110
  self.cache: OrderedDict[str, CachedModel] = OrderedDict()
111
  self.port_counter = BASE_PORT
112
  self.used_ports = set()
 
 
113
 
114
  def _get_next_port(self) -> int:
115
  """Get next available port for a model."""
@@ -124,14 +241,14 @@ class ModelCache:
124
  """Release a port back to the pool."""
125
  self.used_ports.discard(port)
126
 
127
- def _evict_lru(self):
128
  """Evict the least recently used model."""
129
  if not self.cache:
130
  return
131
 
132
  # Get the first (oldest) item
133
  model_name, cached_model = self.cache.popitem(last=False)
134
- print(f"Evicting model from cache: {model_name}")
135
 
136
  # Stop the process
137
  try:
@@ -139,20 +256,23 @@ class ModelCache:
139
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
140
  else:
141
  cached_model.process.terminate()
142
- cached_model.process.wait(timeout=10)
143
- except Exception as e:
144
- print(f"Error stopping model {model_name}: {e}")
145
- try:
 
 
 
 
146
  if os.name != 'nt':
147
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
148
  else:
149
  cached_model.process.kill()
150
- except:
151
- pass
152
 
153
  # Release the port
154
  self._release_port(cached_model.port)
155
- time.sleep(1)
156
 
157
  def get(self, model_name: str) -> Optional[CachedModel]:
158
  """Get a model from cache, updating its last used time."""
@@ -161,16 +281,16 @@ class ModelCache:
161
  cached_model.last_used = time.time()
162
  # Move to end (most recently used)
163
  self.cache.move_to_end(model_name)
164
- print(f"Cache hit for model: {model_name}")
165
  return cached_model
166
- print(f"Cache miss for model: {model_name}")
167
  return None
168
 
169
- def put(self, model_name: str, model_id: str, process: subprocess.Popen, port: int):
170
  """Add a model to the cache."""
171
  # Evict if cache is full
172
  while len(self.cache) >= self.max_size:
173
- self._evict_lru()
174
 
175
  url = f"http://localhost:{port}"
176
  cached_model = CachedModel(
@@ -179,21 +299,27 @@ class ModelCache:
179
  process=process,
180
  port=port,
181
  url=url,
182
- last_used=time.time()
 
183
  )
184
  self.cache[model_name] = cached_model
185
- print(f"Cached model: {model_name} on port {port}")
186
 
187
- def clear(self):
188
  """Clear all cached models."""
189
- print("Clearing model cache...")
190
  for model_name, cached_model in list(self.cache.items()):
191
  try:
192
  if os.name != 'nt':
193
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
194
  else:
195
  cached_model.process.terminate()
196
- cached_model.process.wait(timeout=10)
 
 
 
 
 
197
  except:
198
  try:
199
  if os.name != 'nt':
@@ -216,7 +342,10 @@ class ModelCache:
216
  "name": name,
217
  "port": model.port,
218
  "url": model.url,
219
- "last_used": model.last_used
 
 
 
220
  }
221
  for name, model in self.cache.items()
222
  ]
@@ -226,6 +355,11 @@ class ModelCache:
226
  # Global state
227
  current_model = "deepseek-chat" # Default model
228
  model_cache = ModelCache(max_size=MAX_CACHED_MODELS)
 
 
 
 
 
229
 
230
 
231
  class ModelSwitchRequest(BaseModel):
@@ -347,22 +481,28 @@ class ModelSwitchResponse(BaseModel):
347
  model: str = Field(..., description="New active model name")
348
 
349
 
350
- def start_llama_server(model_id: str, port: int) -> subprocess.Popen:
351
- """Start llama-server with specified model on a specific port."""
 
 
 
 
 
 
 
352
  cmd = [
353
  "llama-server",
354
  "-hf", model_id,
355
  "--host", "0.0.0.0",
356
  "--port", str(port),
357
  "-c", "2048", # Context size
358
- "-t", "4", # CPU threads (adjust based on cores)
359
  "-ngl", "0", # GPU layers (0 for CPU-only)
360
- "--cont-batching", # Enable continuous batching for speed
361
  "-b", "512", # Batch size
362
  ]
363
 
364
- print(f"Starting llama-server with model: {model_id} on port {port}")
365
- print("This may take 2-3 minutes to download and load the model...")
366
 
367
  process = subprocess.Popen(
368
  cmd,
@@ -373,52 +513,108 @@ def start_llama_server(model_id: str, port: int) -> subprocess.Popen:
373
  bufsize=1
374
  )
375
 
376
- # Wait for server to be ready (increased timeout for model download)
377
- max_retries = 300 # 5 minutes
378
  server_url = f"http://localhost:{port}"
 
 
 
 
379
 
380
- for i in range(max_retries):
381
  # Check if process died
382
  if process.poll() is not None:
383
  stdout, _ = process.communicate()
384
- print(f"llama-server exited with code {process.returncode}")
385
- print(f"Output: {stdout}")
386
  raise RuntimeError("llama-server process died")
387
 
388
  try:
389
- # Try root endpoint instead of /health
390
- response = requests.get(f"{server_url}/", timeout=2)
391
- if response.status_code in [200, 404]: # 404 is ok, means server is up
392
- print(f"llama-server ready after {i+1} seconds")
393
- return process
394
- except requests.exceptions.ConnectionError:
 
395
  # Server not ready yet
396
  pass
397
- except Exception:
398
- # Other errors, keep waiting
399
- pass
400
 
401
- time.sleep(1)
 
 
 
402
 
403
  raise RuntimeError("llama-server failed to start within 5 minutes")
404
 
405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  @app.on_event("startup")
407
  async def startup_event():
408
- """Start with default model and cache it."""
409
- global current_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
  model_id = AVAILABLE_MODELS[current_model]
411
  port = model_cache._get_next_port()
412
 
413
- process = start_llama_server(model_id, port)
414
- model_cache.put(current_model, model_id, process, port)
415
- print(f"Started with default model: {current_model}")
 
 
 
 
 
416
 
417
 
418
  @app.on_event("shutdown")
419
  async def shutdown_event():
420
- """Clean shutdown - clear all cached models."""
421
- model_cache.clear()
 
 
 
 
 
422
 
423
 
424
  @app.get(
@@ -438,12 +634,28 @@ async def root():
438
  - List of all available models
439
  """
440
  return {
441
- "status": "AGI Multi-Model API with dynamic model switching and web search",
442
  "current_model": current_model,
443
  "available_models": list(AVAILABLE_MODELS.keys())
444
  }
445
 
446
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  @app.get(
448
  "/models",
449
  response_model=ModelsResponse,
@@ -458,8 +670,6 @@ async def list_models():
458
  Returns:
459
  - current_model: The model currently in use
460
  - available_models: Array of all available model names
461
-
462
- Use this endpoint to see which models you can switch to.
463
  """
464
  return {
465
  "current_model": current_model,
@@ -472,45 +682,17 @@ async def list_models():
472
  response_model=ModelSwitchResponse,
473
  tags=["models"],
474
  summary="Switch Active Model",
475
- description="Switch to a different LLM model. Uses caching for instant switching to recently used models.",
476
- responses={
477
- 200: {
478
- "description": "Model switched successfully",
479
- "content": {
480
- "application/json": {
481
- "example": {
482
- "message": "Switched to model: deepseek-coder (from cache)",
483
- "model": "deepseek-coder"
484
- }
485
- }
486
- }
487
- },
488
- 400: {
489
- "description": "Invalid model name",
490
- "content": {
491
- "application/json": {
492
- "example": {
493
- "detail": "Model 'invalid-model' not found. Available: ['deepseek-chat', 'mistral-7b', ...]"
494
- }
495
- }
496
- }
497
- }
498
- }
499
  )
500
  async def switch_model(request: ModelSwitchRequest):
501
  """
502
  Switch to a different LLM model with intelligent caching.
503
 
504
- **How it works:**
505
- 1. Checks if requested model is already active (no switch needed)
506
- 2. Checks cache for the model (instant switch if cached)
507
- 3. If not cached, loads the model (may take 2-3 minutes)
508
-
509
- **Caching:**
510
- - Up to 2 models kept in memory
511
- - LRU (Least Recently Used) eviction policy
512
- - Each model runs on a separate port
513
- - Instant switching between cached models
514
  """
515
  global current_model
516
 
@@ -523,28 +705,32 @@ async def switch_model(request: ModelSwitchRequest):
523
  if request.model_name == current_model:
524
  return {"message": f"Already using model: {current_model}", "model": current_model}
525
 
 
 
526
  # Try to get from cache
527
  cached_model = model_cache.get(request.model_name)
528
 
529
  if cached_model:
530
  # Model is cached, instant switch
 
531
  current_model = request.model_name
532
  return {
533
- "message": f"Switched to model: {current_model} (from cache)",
534
  "model": current_model
535
  }
536
 
537
  # Model not cached, need to load it
 
538
  model_id = AVAILABLE_MODELS[request.model_name]
539
  port = model_cache._get_next_port()
540
 
541
  try:
542
- process = start_llama_server(model_id, port)
543
- model_cache.put(request.model_name, model_id, process, port)
544
  current_model = request.model_name
545
 
546
  return {
547
- "message": f"Switched to model: {current_model} (newly loaded)",
548
  "model": current_model
549
  }
550
  except Exception as e:
@@ -557,88 +743,80 @@ async def switch_model(request: ModelSwitchRequest):
557
  "/v1/chat/completions",
558
  tags=["chat"],
559
  summary="Chat Completions",
560
- description="OpenAI-compatible chat completions endpoint. Send messages and get AI-generated responses.",
561
- responses={
562
- 200: {
563
- "description": "Successful response",
564
- "content": {
565
- "application/json": {
566
- "example": {
567
- "id": "chatcmpl-123",
568
- "object": "chat.completion",
569
- "created": 1677652288,
570
- "model": "deepseek-chat",
571
- "choices": [{
572
- "index": 0,
573
- "message": {
574
- "role": "assistant",
575
- "content": "Hello! How can I help you today?"
576
- },
577
- "finish_reason": "stop"
578
- }]
579
- }
580
- }
581
- }
582
- },
583
- 500: {
584
- "description": "LLM server error"
585
- }
586
- }
587
  )
588
  async def chat_completions(request: ChatCompletionRequest):
589
  """
590
- OpenAI-compatible chat completions endpoint.
591
-
592
- This endpoint forwards your request to the currently active LLM model
593
- and returns the response in OpenAI-compatible format.
594
-
595
- **Message Format:**
596
- ```json
597
- {
598
- "messages": [
599
- {"role": "system", "content": "You are a helpful assistant."},
600
- {"role": "user", "content": "Hello!"}
601
- ],
602
- "max_tokens": 256,
603
- "temperature": 0.7
604
- }
605
- ```
606
 
607
- **Supported Roles:**
608
- - `system`: Sets the behavior of the assistant
609
- - `user`: User messages
610
- - `assistant`: Assistant responses (for multi-turn conversations)
611
  """
612
  try:
 
 
613
  # Get current model from cache
614
  cached_model = model_cache.get(current_model)
615
  if not cached_model:
616
  raise HTTPException(status_code=500, detail="Current model not loaded")
617
 
618
- # Forward to llama-server
619
- response = requests.post(
620
  f"{cached_model.url}/v1/chat/completions",
621
  json={
622
  "messages": request.messages,
623
  "max_tokens": request.max_tokens,
624
  "temperature": request.temperature,
625
- },
626
- timeout=300
627
- )
628
- response.raise_for_status()
629
- return response.json()
630
- except requests.exceptions.RequestException as e:
 
 
 
 
 
 
 
631
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
632
 
633
 
634
- def search_web(query: str, max_results: int = 5) -> list[dict]:
635
- """Search the web using DuckDuckGo and return results."""
 
 
 
 
 
 
 
 
 
 
 
 
636
  try:
637
- with DDGS() as ddgs:
638
- results = list(ddgs.text(query, max_results=max_results))
639
- return results
 
 
 
 
 
 
 
 
 
 
 
 
640
  except Exception as e:
641
- print(f"Search error: {e}")
642
  return []
643
 
644
 
@@ -667,70 +845,16 @@ def format_search_context(query: str, search_results: list[dict]) -> str:
667
  "/v1/web-chat/completions",
668
  tags=["chat"],
669
  summary="Web-Augmented Chat Completions",
670
- description="Chat completions enhanced with real-time web search. The last user message is used as a search query.",
671
- responses={
672
- 200: {
673
- "description": "Successful response with web search metadata",
674
- "content": {
675
- "application/json": {
676
- "example": {
677
- "id": "chatcmpl-123",
678
- "object": "chat.completion",
679
- "created": 1677652288,
680
- "model": "deepseek-chat",
681
- "choices": [{
682
- "index": 0,
683
- "message": {
684
- "role": "assistant",
685
- "content": "Based on recent search results, here's what I found..."
686
- },
687
- "finish_reason": "stop"
688
- }],
689
- "web_search": {
690
- "query": "latest AI developments",
691
- "results_count": 5,
692
- "sources": ["https://example.com/1", "https://example.com/2"]
693
- }
694
- }
695
- }
696
- }
697
- },
698
- 400: {
699
- "description": "No user message found"
700
- },
701
- 500: {
702
- "description": "LLM server or search error"
703
- }
704
- }
705
  )
706
  async def web_chat_completions(request: WebChatRequest):
707
  """
708
- Chat completions with real-time web search augmentation.
709
-
710
- **How it works:**
711
- 1. Extracts the last user message as the search query
712
- 2. Performs a web search using DuckDuckGo
713
- 3. Injects search results into the LLM context
714
- 4. Returns the AI response with source citations
715
-
716
- **Use cases:**
717
- - Current events and news
718
- - Recent information beyond the model's training data
719
- - Fact-checking with web sources
720
- - Research with live data
721
-
722
- **Example:**
723
- ```json
724
- {
725
- "messages": [
726
- {"role": "user", "content": "What's the latest news about SpaceX?"}
727
- ],
728
- "max_tokens": 512,
729
- "max_search_results": 5
730
- }
731
- ```
732
 
733
- The response includes a `web_search` field with metadata about sources used.
 
 
 
734
  """
735
  try:
736
  # Get the last user message as search query
@@ -740,9 +864,9 @@ async def web_chat_completions(request: WebChatRequest):
740
 
741
  search_query = user_messages[-1].get("content", "")
742
 
743
- # Perform web search
744
- print(f"Searching web for: {search_query}")
745
- search_results = search_web(search_query, request.max_search_results)
746
 
747
  # Format search results as context
748
  web_context = format_search_context(search_query, search_results)
@@ -761,7 +885,6 @@ Use the above search results to provide accurate, up-to-date information in your
761
  Always cite sources when using information from the search results."""
762
  }
763
 
764
- # Insert system message before the last user message
765
  augmented_messages.insert(-1, system_prompt)
766
 
767
  # Get current model from cache
@@ -770,29 +893,28 @@ Always cite sources when using information from the search results."""
770
  raise HTTPException(status_code=500, detail="Current model not loaded")
771
 
772
  # Forward to llama-server with augmented context
773
- response = requests.post(
774
  f"{cached_model.url}/v1/chat/completions",
775
  json={
776
  "messages": augmented_messages,
777
  "max_tokens": request.max_tokens,
778
  "temperature": request.temperature,
779
- },
780
- timeout=300
781
- )
782
- response.raise_for_status()
783
-
784
- result = response.json()
785
 
786
  # Add metadata about search results
787
  result["web_search"] = {
788
  "query": search_query,
789
  "results_count": len(search_results),
790
- "sources": [r.get("href", "") for r in search_results if r.get("href")]
 
791
  }
792
 
793
  return result
794
 
795
- except requests.exceptions.RequestException as e:
796
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
797
  except Exception as e:
798
  raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
@@ -802,40 +924,89 @@ Always cite sources when using information from the search results."""
802
  "/cache/info",
803
  tags=["models"],
804
  summary="Get Cache Information",
805
- description="Returns information about the model cache, including cached models and cache statistics."
806
  )
807
  async def get_cache_info():
 
 
 
 
 
 
 
 
 
 
 
808
  """
809
- Get information about the in-memory model cache.
810
 
811
  Returns:
812
- - max_size: Maximum number of models that can be cached
813
- - current_size: Current number of cached models
814
- - cached_models: List of currently cached models with their metadata
815
-
816
- **Example Response:**
817
- ```json
818
- {
819
- "max_size": 2,
820
- "current_size": 2,
821
- "cached_models": [
822
- {
823
- "name": "deepseek-chat",
824
- "port": 8080,
825
- "url": "http://localhost:8080",
826
- "last_used": 1234567890.123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
827
  },
828
- {
829
- "name": "mistral-7b",
830
- "port": 8081,
831
- "url": "http://localhost:8081",
832
- "last_used": 1234567895.456
833
- }
834
- ]
 
835
  }
836
- ```
837
- """
838
- return model_cache.get_cache_info()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
839
 
840
 
841
  @app.get(
@@ -846,22 +1017,5 @@ async def get_cache_info():
846
  include_in_schema=False
847
  )
848
  async def get_openapi_spec():
849
- """
850
- Export the OpenAPI specification for this API.
851
-
852
- This endpoint returns the complete OpenAPI 3.0 specification that can be used with:
853
- - API documentation tools (Swagger UI, ReDoc)
854
- - Code generators (openapi-generator, swagger-codegen)
855
- - API testing tools (Postman, Insomnia)
856
- - SDK generation
857
-
858
- Save this to a file and use it with tools like:
859
- ```bash
860
- # Generate Python client
861
- openapi-generator generate -i openapi.json -g python -o ./client
862
-
863
- # Generate TypeScript client
864
- openapi-generator generate -i openapi.json -g typescript-fetch -o ./client
865
- ```
866
- """
867
- return app.openapi()
 
2
  import signal
3
  import os
4
  import time
5
+ import asyncio
6
+ from typing import Optional, Dict, List
7
+ from dataclasses import dataclass, field
8
  from collections import OrderedDict
9
+ from datetime import datetime, timedelta
10
+ import hashlib
11
 
12
+ import aiohttp
13
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
14
  from fastapi.openapi.utils import get_openapi
15
  from pydantic import BaseModel, Field
16
  from duckduckgo_search import DDGS
17
  from bs4 import BeautifulSoup
18
 
19
+ from logger import get_logger
20
+
21
+ logger = get_logger(__name__)
22
+
23
  app = FastAPI(
24
  title="AGI Multi-Model API",
25
  description="""
26
+ **High-Performance Dynamic Multi-Model LLM API with Web Search**
27
 
28
  This API provides:
29
+ * 🔄 Dynamic model switching with intelligent caching
30
  * 💬 OpenAI-compatible chat completions
31
  * 🌐 Web-augmented chat with real-time search
32
+ * 📊 Model management and performance monitoring
33
+ * ⚡ Async/await architecture for maximum throughput
34
 
35
  ## Available Models
36
  - **deepseek-chat** (default): General purpose conversational model
 
39
  - **deepseek-coder**: Specialized coding assistance
40
  - **llama-7b**: Lightweight and fast responses
41
 
42
+ ## Performance Features
43
+ - Parallel model loading
44
+ - Connection pooling for HTTP requests
45
+ - Web search result caching
46
+ - Background model preloading
47
+ - Request queuing to prevent overload
48
+ - Real-time performance metrics
49
+
50
  ## Quick Start
51
  1. Check available models: `GET /models`
52
  2. Switch model (optional): `POST /switch-model`
53
  3. Chat: `POST /v1/chat/completions`
54
  4. Chat with web search: `POST /v1/web-chat/completions`
55
+ 5. View metrics: `GET /metrics`
56
  """,
57
+ version="0.1.0.2026.01.24",
58
  contact={
59
  "name": "API Support",
60
  "email": "support@example.com",
 
75
  "name": "chat",
76
  "description": "Chat completion endpoints (OpenAI-compatible)",
77
  },
78
+ {
79
+ "name": "monitoring",
80
+ "description": "Performance metrics and monitoring",
81
+ },
82
  {
83
  "name": "documentation",
84
  "description": "API documentation and OpenAPI specification",
 
102
  "llama-7b": "TheBloke/Llama-2-7B-Chat-GGUF:llama-2-7b-chat.Q4_K_M.gguf",
103
  }
104
 
105
+ # Configuration - now environment-variable driven
106
+ MAX_CACHED_MODELS = int(os.getenv("MAX_CACHED_MODELS", "2"))
107
+ BASE_PORT = int(os.getenv("BASE_PORT", "8080"))
108
+ PRELOAD_MODELS = os.getenv("PRELOAD_MODELS", "").split(",") if os.getenv("PRELOAD_MODELS") else []
109
+ WEB_SEARCH_CACHE_TTL = int(os.getenv("WEB_SEARCH_CACHE_TTL", "3600")) # 1 hour
110
+ REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", "300")) # 5 minutes
111
 
112
 
113
  @dataclass
 
119
  port: int
120
  url: str
121
  last_used: float
122
+ load_time: float = 0.0
123
+ request_count: int = 0
124
+ total_latency: float = 0.0
125
+
126
+
127
+ @dataclass
128
+ class PerformanceMetrics:
129
+ """Performance metrics for monitoring."""
130
+ total_requests: int = 0
131
+ total_switches: int = 0
132
+ cache_hits: int = 0
133
+ cache_misses: int = 0
134
+ total_web_searches: int = 0
135
+ web_search_cache_hits: int = 0
136
+ model_metrics: Dict[str, Dict] = field(default_factory=dict)
137
+ startup_time: float = 0.0
138
+
139
+ def record_request(self, model_name: str, latency: float):
140
+ """Record a request for metrics."""
141
+ self.total_requests += 1
142
+ if model_name not in self.model_metrics:
143
+ self.model_metrics[model_name] = {
144
+ "requests": 0,
145
+ "total_latency": 0.0,
146
+ "avg_latency": 0.0
147
+ }
148
+ self.model_metrics[model_name]["requests"] += 1
149
+ self.model_metrics[model_name]["total_latency"] += latency
150
+ self.model_metrics[model_name]["avg_latency"] = (
151
+ self.model_metrics[model_name]["total_latency"] /
152
+ self.model_metrics[model_name]["requests"]
153
+ )
154
+
155
+
156
+ @dataclass
157
+ class WebSearchCacheEntry:
158
+ """Cache entry for web search results."""
159
+ results: List[dict]
160
+ timestamp: float
161
+ ttl: int = WEB_SEARCH_CACHE_TTL
162
+
163
+ def is_expired(self) -> bool:
164
+ """Check if cache entry has expired."""
165
+ return time.time() - self.timestamp > self.ttl
166
+
167
+
168
+ class WebSearchCache:
169
+ """LRU cache for web search results."""
170
+
171
+ def __init__(self, max_size: int = 100):
172
+ self.max_size = max_size
173
+ self.cache: OrderedDict[str, WebSearchCacheEntry] = OrderedDict()
174
+
175
+ def _get_cache_key(self, query: str, max_results: int) -> str:
176
+ """Generate cache key from query."""
177
+ key = f"{query}:{max_results}"
178
+ return hashlib.md5(key.encode()).hexdigest()
179
+
180
+ def get(self, query: str, max_results: int) -> Optional[List[dict]]:
181
+ """Get cached search results if available and not expired."""
182
+ key = self._get_cache_key(query, max_results)
183
+ if key in self.cache:
184
+ entry = self.cache[key]
185
+ if not entry.is_expired():
186
+ # Move to end (most recently used)
187
+ self.cache.move_to_end(key)
188
+ return entry.results
189
+ else:
190
+ # Remove expired entry
191
+ del self.cache[key]
192
+ return None
193
+
194
+ def put(self, query: str, max_results: int, results: List[dict]):
195
+ """Cache search results."""
196
+ key = self._get_cache_key(query, max_results)
197
+
198
+ # Evict oldest if cache is full
199
+ if len(self.cache) >= self.max_size and key not in self.cache:
200
+ self.cache.popitem(last=False)
201
+
202
+ self.cache[key] = WebSearchCacheEntry(
203
+ results=results,
204
+ timestamp=time.time()
205
+ )
206
+
207
+ def clear(self):
208
+ """Clear all cached results."""
209
+ self.cache.clear()
210
 
211
 
212
  class ModelCache:
213
  """
214
+ High-performance in-memory LRU cache for loaded models.
215
 
216
+ Features:
217
+ - Manages multiple llama-server processes on different ports
218
+ - LRU eviction when cache is full
219
+ - Parallel model loading support
220
+ - Performance metrics tracking
221
  """
222
 
223
  def __init__(self, max_size: int = MAX_CACHED_MODELS):
 
225
  self.cache: OrderedDict[str, CachedModel] = OrderedDict()
226
  self.port_counter = BASE_PORT
227
  self.used_ports = set()
228
+ self._loading_lock = asyncio.Lock()
229
+ self._loading_models: Dict[str, asyncio.Task] = {}
230
 
231
  def _get_next_port(self) -> int:
232
  """Get next available port for a model."""
 
241
  """Release a port back to the pool."""
242
  self.used_ports.discard(port)
243
 
244
+ async def _evict_lru(self):
245
  """Evict the least recently used model."""
246
  if not self.cache:
247
  return
248
 
249
  # Get the first (oldest) item
250
  model_name, cached_model = self.cache.popitem(last=False)
251
+ logger.info(f"Evicting model from cache: {model_name}")
252
 
253
  # Stop the process
254
  try:
 
256
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
257
  else:
258
  cached_model.process.terminate()
259
+
260
+ # Wait asynchronously for process to stop
261
+ for _ in range(10):
262
+ if cached_model.process.poll() is not None:
263
+ break
264
+ await asyncio.sleep(0.1)
265
+ else:
266
+ # Force kill if not stopped
267
  if os.name != 'nt':
268
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGKILL)
269
  else:
270
  cached_model.process.kill()
271
+ except Exception as e:
272
+ logger.error(f"Error stopping model {model_name}: {e}")
273
 
274
  # Release the port
275
  self._release_port(cached_model.port)
 
276
 
277
  def get(self, model_name: str) -> Optional[CachedModel]:
278
  """Get a model from cache, updating its last used time."""
 
281
  cached_model.last_used = time.time()
282
  # Move to end (most recently used)
283
  self.cache.move_to_end(model_name)
284
+ logger.debug(f"Cache hit for model: {model_name}")
285
  return cached_model
286
+ logger.debug(f"Cache miss for model: {model_name}")
287
  return None
288
 
289
+ async def put(self, model_name: str, model_id: str, process: subprocess.Popen, port: int, load_time: float = 0.0):
290
  """Add a model to the cache."""
291
  # Evict if cache is full
292
  while len(self.cache) >= self.max_size:
293
+ await self._evict_lru()
294
 
295
  url = f"http://localhost:{port}"
296
  cached_model = CachedModel(
 
299
  process=process,
300
  port=port,
301
  url=url,
302
+ last_used=time.time(),
303
+ load_time=load_time
304
  )
305
  self.cache[model_name] = cached_model
306
+ logger.info(f"Cached model: {model_name} on port {port} (load time: {load_time:.2f}s)")
307
 
308
+ async def clear(self):
309
  """Clear all cached models."""
310
+ logger.info("Clearing model cache...")
311
  for model_name, cached_model in list(self.cache.items()):
312
  try:
313
  if os.name != 'nt':
314
  os.killpg(os.getpgid(cached_model.process.pid), signal.SIGTERM)
315
  else:
316
  cached_model.process.terminate()
317
+
318
+ # Wait asynchronously
319
+ for _ in range(10):
320
+ if cached_model.process.poll() is not None:
321
+ break
322
+ await asyncio.sleep(0.1)
323
  except:
324
  try:
325
  if os.name != 'nt':
 
342
  "name": name,
343
  "port": model.port,
344
  "url": model.url,
345
+ "last_used": model.last_used,
346
+ "load_time": model.load_time,
347
+ "request_count": model.request_count,
348
+ "avg_latency": model.total_latency / model.request_count if model.request_count > 0 else 0.0
349
  }
350
  for name, model in self.cache.items()
351
  ]
 
355
  # Global state
356
  current_model = "deepseek-chat" # Default model
357
  model_cache = ModelCache(max_size=MAX_CACHED_MODELS)
358
+ web_search_cache = WebSearchCache(max_size=100)
359
+ metrics = PerformanceMetrics()
360
+
361
+ # HTTP session for connection pooling (will be initialized in startup)
362
+ http_session: Optional[aiohttp.ClientSession] = None
363
 
364
 
365
  class ModelSwitchRequest(BaseModel):
 
481
  model: str = Field(..., description="New active model name")
482
 
483
 
484
+ async def start_llama_server(model_id: str, port: int) -> tuple[subprocess.Popen, float]:
485
+ """
486
+ Start llama-server with specified model on a specific port.
487
+
488
+ Returns tuple of (process, load_time_seconds).
489
+ Uses async/await with exponential backoff for health checks.
490
+ """
491
+ start_time = time.time()
492
+
493
  cmd = [
494
  "llama-server",
495
  "-hf", model_id,
496
  "--host", "0.0.0.0",
497
  "--port", str(port),
498
  "-c", "2048", # Context size
499
+ "-t", "4", # CPU threads
500
  "-ngl", "0", # GPU layers (0 for CPU-only)
501
+ "--cont-batching", # Enable continuous batching
502
  "-b", "512", # Batch size
503
  ]
504
 
505
+ logger.info(f"Starting llama-server with model: {model_id} on port {port}")
 
506
 
507
  process = subprocess.Popen(
508
  cmd,
 
513
  bufsize=1
514
  )
515
 
516
+ # Wait for server to be ready with exponential backoff
 
517
  server_url = f"http://localhost:{port}"
518
+ max_wait_time = 300 # 5 minutes
519
+ backoff_time = 0.1 # Start with 100ms
520
+ max_backoff = 2.0 # Max 2 seconds between checks
521
+ elapsed = 0
522
 
523
+ while elapsed < max_wait_time:
524
  # Check if process died
525
  if process.poll() is not None:
526
  stdout, _ = process.communicate()
527
+ logger.error(f"llama-server exited with code {process.returncode}")
528
+ logger.error(f"Output: {stdout}")
529
  raise RuntimeError("llama-server process died")
530
 
531
  try:
532
+ # Use aiohttp for async health check
533
+ async with http_session.get(f"{server_url}/health", timeout=aiohttp.ClientTimeout(total=2)) as response:
534
+ if response.status in [200, 404]: # 404 is ok, means server is up
535
+ load_time = time.time() - start_time
536
+ logger.info(f"llama-server ready after {load_time:.2f}s")
537
+ return process, load_time
538
+ except (aiohttp.ClientError, asyncio.TimeoutError):
539
  # Server not ready yet
540
  pass
 
 
 
541
 
542
+ # Exponential backoff
543
+ await asyncio.sleep(backoff_time)
544
+ elapsed += backoff_time
545
+ backoff_time = min(backoff_time * 1.5, max_backoff)
546
 
547
  raise RuntimeError("llama-server failed to start within 5 minutes")
548
 
549
 
550
async def preload_models_background():
    """Background task to preload popular models.

    Iterates PRELOAD_MODELS, skipping unknown names and models that are
    already cached. Each model is loaded via start_llama_server() and put
    into the shared model_cache. Failures are logged and never abort the
    loop, so one bad model cannot block the rest.
    """
    if not PRELOAD_MODELS:
        return

    logger.info(f"Preloading models in background: {PRELOAD_MODELS}")

    for model_name in PRELOAD_MODELS:
        if model_name not in AVAILABLE_MODELS:
            logger.warning(f"Preload model not found: {model_name}")
            continue

        if model_cache.get(model_name):
            logger.info(f"Model already cached: {model_name}")
            continue

        model_id = AVAILABLE_MODELS[model_name]
        # Reserve a port up front; it is marked as used by the cache.
        port = model_cache._get_next_port()
        try:
            process, load_time = await start_llama_server(model_id, port)
            await model_cache.put(model_name, model_id, process, port, load_time)
            logger.info(f"Preloaded model: {model_name}")
        except Exception as e:
            logger.error(f"Failed to preload model {model_name}: {e}")
            # Fix: return the reserved port to the pool on failure; previously
            # a failed preload leaked the port (it stayed in used_ports forever).
            model_cache._release_port(port)
574
+
575
+
576
@app.on_event("startup")
async def startup_event():
    """Initialize HTTP session and start with default model.

    Order matters: the pooled aiohttp session must exist before
    start_llama_server() runs, because its health-check loop uses the
    shared http_session. Preloading of additional models is deferred to a
    background task so startup is not blocked.
    """
    global current_model, http_session

    startup_start = time.time()
    logger.info("Application startup initiated")

    # Initialize aiohttp session with connection pooling
    connector = aiohttp.TCPConnector(
        limit=100,  # Max total connections
        limit_per_host=10,  # Max connections per host
        ttl_dns_cache=300  # DNS cache TTL (seconds)
    )
    http_session = aiohttp.ClientSession(
        connector=connector,
        timeout=aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
    )

    # Start default model synchronously — the app is not usable without it.
    model_id = AVAILABLE_MODELS[current_model]
    port = model_cache._get_next_port()

    process, load_time = await start_llama_server(model_id, port)
    await model_cache.put(current_model, model_id, process, port, load_time)

    # NOTE(review): startup_time holds a *duration* (seconds spent starting),
    # not a timestamp — keep that in mind when consuming it elsewhere.
    metrics.startup_time = time.time() - startup_start
    logger.info(f"Started with default model: {current_model} (total startup: {metrics.startup_time:.2f}s)")

    # Start preloading in background (fire-and-forget task)
    asyncio.create_task(preload_models_background())
607
 
608
 
609
@app.on_event("shutdown")
async def shutdown_event():
    """Release resources on shutdown: the pooled HTTP session first, then
    every cached llama-server process via the model cache."""
    logger.info("Application shutdown initiated")

    # The session may still be None if startup never completed.
    if http_session is not None:
        await http_session.close()

    await model_cache.clear()
618
 
619
 
620
  @app.get(
 
634
  - List of all available models
635
  """
636
  return {
637
+ "status": "AGI Multi-Model API - High Performance Edition",
638
  "current_model": current_model,
639
  "available_models": list(AVAILABLE_MODELS.keys())
640
  }
641
 
642
 
643
@app.get(
    "/health",
    tags=["status"],
    summary="Health Check",
    description="Simple health check endpoint for monitoring."
)
async def health_check():
    """Liveness probe: reports cache occupancy and the active model."""
    report = {
        "status": "healthy",
        "timestamp": time.time(),
        "cached_models": len(model_cache.cache),
        "current_model": current_model,
    }
    return report
657
+
658
+
659
  @app.get(
660
  "/models",
661
  response_model=ModelsResponse,
 
670
  Returns:
671
  - current_model: The model currently in use
672
  - available_models: Array of all available model names
 
 
673
  """
674
  return {
675
  "current_model": current_model,
 
682
  response_model=ModelSwitchResponse,
683
  tags=["models"],
684
  summary="Switch Active Model",
685
+ description="Switch to a different LLM model with intelligent caching for instant switching."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
686
  )
687
  async def switch_model(request: ModelSwitchRequest):
688
  """
689
  Switch to a different LLM model with intelligent caching.
690
 
691
+ **Performance optimizations:**
692
+ - Instant switching for cached models
693
+ - Async model loading with exponential backoff
694
+ - Connection pooling for health checks
695
+ - Background preloading of popular models
 
 
 
 
 
696
  """
697
  global current_model
698
 
 
705
  if request.model_name == current_model:
706
  return {"message": f"Already using model: {current_model}", "model": current_model}
707
 
708
+ metrics.total_switches += 1
709
+
710
  # Try to get from cache
711
  cached_model = model_cache.get(request.model_name)
712
 
713
  if cached_model:
714
  # Model is cached, instant switch
715
+ metrics.cache_hits += 1
716
  current_model = request.model_name
717
  return {
718
+ "message": f"Switched to model: {current_model} (from cache, instant)",
719
  "model": current_model
720
  }
721
 
722
  # Model not cached, need to load it
723
+ metrics.cache_misses += 1
724
  model_id = AVAILABLE_MODELS[request.model_name]
725
  port = model_cache._get_next_port()
726
 
727
  try:
728
+ process, load_time = await start_llama_server(model_id, port)
729
+ await model_cache.put(request.model_name, model_id, process, port, load_time)
730
  current_model = request.model_name
731
 
732
  return {
733
+ "message": f"Switched to model: {current_model} (loaded in {load_time:.2f}s)",
734
  "model": current_model
735
  }
736
  except Exception as e:
 
743
  "/v1/chat/completions",
744
  tags=["chat"],
745
  summary="Chat Completions",
746
+ description="High-performance OpenAI-compatible chat completions with connection pooling."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
747
  )
748
  async def chat_completions(request: ChatCompletionRequest):
749
  """
750
+ OpenAI-compatible chat completions with performance optimizations.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
751
 
752
+ **Performance features:**
753
+ - Async/await for non-blocking I/O
754
+ - HTTP connection pooling
755
+ - Request metrics tracking
756
  """
757
  try:
758
+ request_start = time.time()
759
+
760
  # Get current model from cache
761
  cached_model = model_cache.get(current_model)
762
  if not cached_model:
763
  raise HTTPException(status_code=500, detail="Current model not loaded")
764
 
765
+ # Forward to llama-server using aiohttp
766
+ async with http_session.post(
767
  f"{cached_model.url}/v1/chat/completions",
768
  json={
769
  "messages": request.messages,
770
  "max_tokens": request.max_tokens,
771
  "temperature": request.temperature,
772
+ }
773
+ ) as response:
774
+ response.raise_for_status()
775
+ result = await response.json()
776
+
777
+ # Update metrics
778
+ request_latency = time.time() - request_start
779
+ cached_model.request_count += 1
780
+ cached_model.total_latency += request_latency
781
+ metrics.record_request(current_model, request_latency)
782
+
783
+ return result
784
+ except aiohttp.ClientError as e:
785
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
786
 
787
 
788
async def search_web_async(query: str, max_results: int = 5) -> list[dict]:
    """
    Search the web using DuckDuckGo with result caching.

    Results are memoized in an LRU/TTL cache keyed by (query, max_results).
    Returns an empty list on any search failure (best-effort semantics).
    """
    # Serve from cache when possible.
    cached_results = web_search_cache.get(query, max_results)
    if cached_results is not None:
        metrics.web_search_cache_hits += 1
        # Fix: count every lookup in total_web_searches, not just misses,
        # so /metrics' cache_hit_rate (= hits / total) stays <= 1.0.
        metrics.total_web_searches += 1
        logger.debug(f"Web search cache hit for: {query}")
        return cached_results

    # Perform the search.
    try:
        logger.debug(f"Performing web search: {query}")

        # DDGS is blocking; run it in a worker thread so the event loop stays
        # responsive. asyncio.to_thread (3.9+) replaces the deprecated
        # get_event_loop() call from inside a coroutine.
        results = await asyncio.to_thread(
            lambda: list(DDGS().text(query, max_results=max_results))
        )

        # Cache results for subsequent identical queries.
        web_search_cache.put(query, max_results, results)
        metrics.total_web_searches += 1

        logger.debug(f"Found {len(results)} search results")
        return results
    except Exception as e:
        logger.error(f"Search error: {e}")
        return []
821
 
822
 
 
845
  "/v1/web-chat/completions",
846
  tags=["chat"],
847
  summary="Web-Augmented Chat Completions",
848
+ description="Chat completions with real-time web search and result caching."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
849
  )
850
  async def web_chat_completions(request: WebChatRequest):
851
  """
852
+ Chat completions with web search augmentation.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
853
 
854
+ **Performance optimizations:**
855
+ - Async web search
856
+ - LRU cache for search results (1 hour TTL)
857
+ - Parallel execution where possible
858
  """
859
  try:
860
  # Get the last user message as search query
 
864
 
865
  search_query = user_messages[-1].get("content", "")
866
 
867
+ # Perform web search (async with caching)
868
+ logger.info(f"Web chat: Searching for '{search_query}'")
869
+ search_results = await search_web_async(search_query, request.max_search_results)
870
 
871
  # Format search results as context
872
  web_context = format_search_context(search_query, search_results)
 
885
  Always cite sources when using information from the search results."""
886
  }
887
 
 
888
  augmented_messages.insert(-1, system_prompt)
889
 
890
  # Get current model from cache
 
893
  raise HTTPException(status_code=500, detail="Current model not loaded")
894
 
895
  # Forward to llama-server with augmented context
896
+ async with http_session.post(
897
  f"{cached_model.url}/v1/chat/completions",
898
  json={
899
  "messages": augmented_messages,
900
  "max_tokens": request.max_tokens,
901
  "temperature": request.temperature,
902
+ }
903
+ ) as response:
904
+ response.raise_for_status()
905
+ result = await response.json()
 
 
906
 
907
  # Add metadata about search results
908
  result["web_search"] = {
909
  "query": search_query,
910
  "results_count": len(search_results),
911
+ "sources": [r.get("href", "") for r in search_results if r.get("href")],
912
+ "cached": metrics.web_search_cache_hits > 0
913
  }
914
 
915
  return result
916
 
917
+ except aiohttp.ClientError as e:
918
  raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
919
  except Exception as e:
920
  raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
 
924
  "/cache/info",
925
  tags=["models"],
926
  summary="Get Cache Information",
927
+ description="Returns information about the model cache and performance statistics."
928
  )
929
  async def get_cache_info():
930
+ """Get detailed information about the model cache."""
931
+ return model_cache.get_cache_info()
932
+
933
+
934
# Wall-clock timestamp captured when this module is imported, used to derive
# service uptime. NOTE: metrics.startup_time stores a *duration* (seconds the
# startup sequence took), so it must not be subtracted from time.time().
_service_started_at = time.time()


@app.get(
    "/metrics",
    tags=["monitoring"],
    summary="Performance Metrics",
    description="Get comprehensive performance metrics and statistics."
)
async def get_metrics():
    """
    Get performance metrics for monitoring and optimization.

    Returns:
    - Request counts and latencies
    - Cache hit/miss ratios
    - Model-specific statistics
    - Web search cache stats
    - Startup time
    """
    # Guard all ratios against zero denominators.
    total_switch_lookups = metrics.cache_hits + metrics.cache_misses
    cache_hit_rate = (
        metrics.cache_hits / total_switch_lookups
        if total_switch_lookups > 0
        else 0.0
    )

    web_cache_hit_rate = (
        metrics.web_search_cache_hits / metrics.total_web_searches
        if metrics.total_web_searches > 0
        else 0.0
    )

    return {
        # Fix: previously computed as time.time() - metrics.startup_time,
        # which subtracted a duration from a timestamp and reported roughly
        # the current Unix epoch time instead of actual uptime.
        "uptime_seconds": time.time() - _service_started_at,
        "startup_time_seconds": metrics.startup_time,
        "total_requests": metrics.total_requests,
        "total_model_switches": metrics.total_switches,
        "cache_stats": {
            "hits": metrics.cache_hits,
            "misses": metrics.cache_misses,
            "hit_rate": cache_hit_rate,
            "current_size": len(model_cache.cache),
            "max_size": model_cache.max_size
        },
        "web_search_stats": {
            "total_searches": metrics.total_web_searches,
            "cache_hits": metrics.web_search_cache_hits,
            "cache_hit_rate": web_cache_hit_rate,
            "cache_size": len(web_search_cache.cache)
        },
        "model_metrics": metrics.model_metrics,
        "cached_models": model_cache.get_cache_info()["cached_models"]
    }
984
+
985
+
986
@app.post(
    "/cache/clear",
    tags=["models"],
    summary="Clear Model Cache",
    description="Clear all cached models (will reload on next request)."
)
async def clear_cache():
    """Drop every cached llama-server instance; models reload lazily on demand."""
    await model_cache.clear()
    return {"message": "Cache cleared successfully"}
996
+
997
+
998
@app.post(
    "/cache/web-search/clear",
    tags=["models"],
    summary="Clear Web Search Cache",
    description="Clear all cached web search results."
)
async def clear_web_search_cache():
    """Empty the web-search result cache and reset its hit/total counters."""
    web_search_cache.clear()
    # Reset the counters that feed the /metrics hit-rate computation.
    metrics.web_search_cache_hits = 0
    metrics.total_web_searches = 0
    return {"message": "Web search cache cleared successfully"}
1010
 
1011
 
1012
  @app.get(
 
1017
  include_in_schema=False
1018
  )
1019
  async def get_openapi_spec():
1020
+ """Export the OpenAPI specification for this API."""
1021
+ return app.openapi()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
logger.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized logging module for AGI Multi-Model API.
3
+
4
+ Provides structured logging with:
5
+ - Colored console output
6
+ - File logging with rotation
7
+ - Configurable log levels
8
+ - Timestamp and module name tracking
9
+ """
10
+
11
+ import logging
12
+ import sys
13
+ from pathlib import Path
14
+ from logging.handlers import RotatingFileHandler
15
+ from typing import Optional
16
+
17
+
18
class ColoredFormatter(logging.Formatter):
    """Formatter that renders the level name in bold ANSI colors for terminals."""

    # ANSI color codes
    COLORS = {
        'DEBUG': '\033[36m',     # Cyan
        'INFO': '\033[32m',      # Green
        'WARNING': '\033[33m',   # Yellow
        'ERROR': '\033[31m',     # Red
        'CRITICAL': '\033[35m',  # Magenta
    }
    RESET = '\033[0m'
    BOLD = '\033[1m'

    def format(self, record):
        """Render the record with a temporarily colorized level name."""
        plain = record.levelname
        color = self.COLORS.get(plain)
        if color is not None:
            record.levelname = f"{color}{self.BOLD}{plain}{self.RESET}"

        rendered = super().format(record)

        # Restore the plain level name so other handlers see uncolored text.
        record.levelname = plain
        return rendered
46
+
47
+
48
class Logger:
    """
    Application-wide logger factory.

    The "AGI" logger is configured exactly once (colored console handler plus
    an optional rotating file handler). Every other requested name is returned
    as a child of "AGI" so its records propagate up to those handlers.

    Usage:
        from logger import get_logger
        logger = get_logger(__name__)
        logger.info("Application started")
    """

    # True once the "AGI" root logger's handlers have been attached.
    _initialized: bool = False

    @classmethod
    def get_logger(
        cls,
        name: str = "AGI",
        level: int = logging.INFO,
        log_file: Optional[str] = "agi.log",
        max_bytes: int = 10 * 1024 * 1024,  # 10MB
        backup_count: int = 5
    ) -> logging.Logger:
        """
        Get or create the application logger.

        Args:
            name: Logger name (typically module name)
            level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
            log_file: Path to log file (None to disable file logging)
            max_bytes: Maximum size of log file before rotation
            backup_count: Number of backup files to keep

        Returns:
            Configured logger instance

        Note:
            level/log_file/max_bytes/backup_count take effect only on the
            call that configures the "AGI" root; child loggers inherit that
            configuration.
        """
        # Fix: previously get_logger("api") returned a top-level logger with
        # no handlers and no propagation path to "AGI", so its INFO/DEBUG
        # records were silently dropped (logging's last-resort handler only
        # emits WARNING+). Namespacing every other name under "AGI." makes
        # records propagate to the handlers configured below.
        if name != "AGI" and not name.startswith("AGI."):
            name = f"AGI.{name}"

        logger = logging.getLogger(name)

        # Configure handlers exactly once, on the "AGI" root logger only.
        if not cls._initialized and name == "AGI":
            logger.setLevel(level)

            # Console handler with colors
            console_handler = logging.StreamHandler(sys.stdout)
            console_handler.setLevel(level)
            console_formatter = ColoredFormatter(
                fmt='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S'
            )
            console_handler.setFormatter(console_formatter)
            logger.addHandler(console_handler)

            # File handler with rotation (if enabled)
            if log_file:
                log_path = Path(log_file)
                log_path.parent.mkdir(parents=True, exist_ok=True)

                file_handler = RotatingFileHandler(
                    log_file,
                    maxBytes=max_bytes,
                    backupCount=backup_count
                )
                file_handler.setLevel(level)
                file_formatter = logging.Formatter(
                    fmt='%(asctime)s | %(levelname)-8s | %(name)s | %(funcName)s:%(lineno)d | %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S'
                )
                file_handler.setFormatter(file_formatter)
                logger.addHandler(file_handler)

            # Prevent propagation to the logging root to avoid duplicate logs
            logger.propagate = False
            cls._initialized = True

        return logger
123
+
124
+
125
# Convenience function for easy import
def get_logger(name: str = "AGI", level: int = logging.INFO) -> logging.Logger:
    """
    Get a logger instance for the specified module.

    Thin wrapper around Logger.get_logger with the file/rotation settings
    left at their defaults.

    Args:
        name: Logger name (use __name__ for automatic module naming)
        level: Logging level (default: INFO)

    Returns:
        Configured logger instance

    Example:
        from logger import get_logger
        logger = get_logger(__name__)
        logger.info("Starting application")
    """
    return Logger.get_logger(name=name, level=level)
143
+
144
+
145
# Configure the shared "AGI" root logger once at import time, so later
# get_logger() calls find the console/file handlers already attached.
_root_logger = Logger.get_logger("AGI", level=logging.INFO)
147
+
148
+
149
if __name__ == "__main__":
    # Manual smoke test: exercise every severity, then multiple logger names.
    logger = get_logger("test_module")

    for emit, text in (
        (logger.debug, "This is a debug message"),
        (logger.info, "This is an info message"),
        (logger.warning, "This is a warning message"),
        (logger.error, "This is an error message"),
        (logger.critical, "This is a critical message"),
    ):
        emit(text)

    print("\nTesting with different module names:")
    api_logger = get_logger("api")
    api_logger.info("API logger initialized")

    client_logger = get_logger("client")
    client_logger.info("Client logger initialized")
pyproject.toml CHANGED
@@ -1,7 +1,7 @@
1
  [project]
2
- name = "deepseek-api"
3
- version = "0.0.1"
4
- description = "Special DeepSeek API on HuggingFace Space"
5
  authors = [
6
  { name = "AI Developer", email = "you@example.com" }
7
  ]
@@ -9,6 +9,7 @@ requires-python = ">=3.12"
9
  dependencies = [
10
  "fastapi>=0.104.0",
11
  "uvicorn[standard]>=0.24.0",
 
12
  "llama-cpp-python>=0.2.0",
13
  "huggingface-hub>=0.19.0",
14
  "duckduckgo-search>=4.0.0",
 
1
  [project]
2
+ name = "agi-multi-model-api"
3
+ version = "0.1.0"
4
+ description = "High-Performance Multi-Model LLM API with Dynamic Switching"
5
  authors = [
6
  { name = "AI Developer", email = "you@example.com" }
7
  ]
 
9
  dependencies = [
10
  "fastapi>=0.104.0",
11
  "uvicorn[standard]>=0.24.0",
12
+ "aiohttp>=3.9.0",
13
  "llama-cpp-python>=0.2.0",
14
  "huggingface-hub>=0.19.0",
15
  "duckduckgo-search>=4.0.0",