from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import subprocess
import signal
import os
import requests
import time
from typing import Optional

app = FastAPI()

# Predefined list of available models
AVAILABLE_MODELS = {
    # === Financial & Summarization Models (Recommended) ===
    "qwen-2.5-7b": "bartowski/Qwen2.5-7B-Instruct-GGUF:Qwen2.5-7B-Instruct-Q4_K_M.gguf",  # Best for financial + multilingual
    "kimi-k2-9b": "bartowski/k2-chat-GGUF:k2-chat-Q4_K_M.gguf",  # Kimi K2 - long context, good reasoning
    "yi-1.5-9b": "bartowski/Yi-1.5-9B-Chat-GGUF:Yi-1.5-9B-Chat-Q4_K_M.gguf",  # Excellent for finance
    "llama-3.1-8b": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",  # Great reasoning
    "mistral-7b": "TheBloke/Mistral-7B-Instruct-v0.3-GGUF:mistral-7b-instruct-v0.3.Q4_K_M.gguf",  # Reliable summarization

    # === Coding Models ===
    "deepseek-coder": "TheBloke/deepseek-coder-6.7B-instruct-GGUF:deepseek-coder-6.7b-instruct.Q4_K_M.gguf",

    # === General Purpose ===
    "deepseek-chat": "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
    "llama-3.2-3b": "bartowski/Llama-3.2-3B-Instruct-GGUF:Llama-3.2-3B-Instruct-Q4_K_M.gguf",  # Fast & lightweight
}
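# Each value above is passed verbatim to llama-server's "-hf" flag in
# start_llama_server(), so new models can be registered by adding another
# "repo:gguf-file" entry. Hypothetical example (repo/file names not verified):
#     "my-model": "someuser/Some-Model-GGUF:Some-Model-Q4_K_M.gguf",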

# Global state
current_model = "deepseek-chat"  # Default model
llama_process: Optional[subprocess.Popen] = None
LLAMA_SERVER_PORT = 8080
LLAMA_SERVER_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
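# Note: LLAMA_SERVER_PORT is the port of the internal llama-server backend;
# the FastAPI app itself is served separately (e.g. by uvicorn on another
# port, see the usage notes at the bottom of this file).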


class ModelSwitchRequest(BaseModel):
    model_name: str


class ChatCompletionRequest(BaseModel):
    messages: list[dict]
    max_tokens: int = 256
    temperature: float = 0.7
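    # Example body (OpenAI-style messages), e.g.:
    #   {"messages": [{"role": "user", "content": "Summarize this filing."}],
    #    "max_tokens": 128, "temperature": 0.2}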


def start_llama_server(model_id: str) -> subprocess.Popen:
    """Start llama-server with specified model (optimized for speed)."""
    cmd = [
        "llama-server",
        "-hf", model_id,
        "--host", "0.0.0.0",
        "--port", str(LLAMA_SERVER_PORT),
        "-c", "2048",           # Context size
        "-t", "4",              # CPU threads (adjust based on cores)
        "-ngl", "0",            # GPU layers (0 for CPU-only)
        "--cont-batching",      # Enable continuous batching for speed
        "-b", "512",            # Batch size
    ]

    print(f"Starting llama-server with model: {model_id}")
    print("This may take 2-3 minutes to download and load the model...")

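    # On POSIX, run llama-server in its own process group (via os.setsid) so
    # stop_llama_server() can terminate the whole group, including any child
    # processes; Windows has no setsid, so we fall back to terminate()/kill().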
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        preexec_fn=os.setsid if os.name != 'nt' else None,
        text=True,
        bufsize=1
    )

    # Wait for server to be ready (increased timeout for model download)
    max_retries = 300  # roughly 5 minutes (1 s sleep per attempt, plus request timeouts)
    for i in range(max_retries):
        # Check if process died
        if process.poll() is not None:
            stdout, _ = process.communicate()
            print(f"llama-server exited with code {process.returncode}")
            print(f"Output: {stdout}")
            raise RuntimeError("llama-server process died")

        try:
            # Try root endpoint instead of /health
            response = requests.get(f"{LLAMA_SERVER_URL}/", timeout=2)
            if response.status_code in [200, 404]:  # 404 is ok, means server is up
                print(f"llama-server ready after {i+1} seconds")
                return process
        except requests.exceptions.ConnectionError:
            # Server not ready yet
            pass
        except Exception:
            # Other transient errors; keep waiting
            pass

        time.sleep(1)

    raise RuntimeError("llama-server failed to start within 5 minutes")


def stop_llama_server():
    """Stop the running llama-server."""
    global llama_process
    if llama_process:
        print("Stopping llama-server...")
        try:
            if os.name != 'nt':
                os.killpg(os.getpgid(llama_process.pid), signal.SIGTERM)
            else:
                llama_process.terminate()
            llama_process.wait(timeout=10)
        except Exception:
            # Graceful shutdown failed or timed out; escalate to SIGKILL
            if os.name != 'nt':
                os.killpg(os.getpgid(llama_process.pid), signal.SIGKILL)
            else:
                llama_process.kill()
        llama_process = None
        time.sleep(2)  # Give it time to fully shut down


@app.on_event("startup")
async def startup_event():
    """Start with default model."""
    global llama_process
    model_id = AVAILABLE_MODELS[current_model]
    llama_process = start_llama_server(model_id)


@app.on_event("shutdown")
async def shutdown_event():
    """Clean shutdown."""
    stop_llama_server()


@app.get("/")
async def root():
    return {
        "status": "DeepSeek API with dynamic model switching",
        "current_model": current_model,
        "available_models": list(AVAILABLE_MODELS.keys())
    }


@app.get("/models")
async def list_models():
    """List all available models."""
    return {
        "current_model": current_model,
        "available_models": list(AVAILABLE_MODELS.keys())
    }


@app.post("/switch-model")
async def switch_model(request: ModelSwitchRequest):
    """Switch to a different model."""
    global current_model, llama_process

    if request.model_name not in AVAILABLE_MODELS:
        raise HTTPException(
            status_code=400,
            detail=f"Model '{request.model_name}' not found. Available: {list(AVAILABLE_MODELS.keys())}"
        )

    if request.model_name == current_model:
        return {"message": f"Already using model: {current_model}"}

    # Stop current server
    stop_llama_server()

    # Start with new model
    model_id = AVAILABLE_MODELS[request.model_name]
    llama_process = start_llama_server(model_id)
    current_model = request.model_name

    return {
        "message": f"Switched to model: {current_model}",
        "model": current_model
    }


@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint."""
    try:
        # Forward to llama-server
        response = requests.post(
            f"{LLAMA_SERVER_URL}/v1/chat/completions",
            json={
                "messages": request.messages,
                "max_tokens": request.max_tokens,
                "temperature": request.temperature,
            },
            timeout=300
        )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
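

# --- Usage sketch (assumptions: this file is saved as main.py and uvicorn is
# installed; the FastAPI port 8000 below is arbitrary) ---
#
#   uvicorn main:app --host 0.0.0.0 --port 8000
#
#   # List the registered models and switch to another one:
#   curl http://localhost:8000/models
#   curl -X POST http://localhost:8000/switch-model \
#        -H "Content-Type: application/json" \
#        -d '{"model_name": "qwen-2.5-7b"}'
#
#   # OpenAI-compatible chat completion (forwarded to llama-server):
#   curl -X POST http://localhost:8000/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 64}'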