AGI / app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import subprocess
import signal
import os
import requests
import time
from typing import Optional
app = FastAPI()
# Predefined list of available models
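# Each value is "<Hugging Face repo>:<GGUF filename>", passed to llama-server via its -hf flag below.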
AVAILABLE_MODELS = {
    # === Financial & Summarization Models (Recommended) ===
    "qwen-2.5-7b": "bartowski/Qwen2.5-7B-Instruct-GGUF:Qwen2.5-7B-Instruct-Q4_K_M.gguf",  # Best for financial + multilingual
    "kimi-k2-9b": "bartowski/k2-chat-GGUF:k2-chat-Q4_K_M.gguf",  # Kimi K2 - long context, good reasoning
    "yi-1.5-9b": "bartowski/Yi-1.5-9B-Chat-GGUF:Yi-1.5-9B-Chat-Q4_K_M.gguf",  # Excellent for finance
    "llama-3.1-8b": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",  # Great reasoning
    "mistral-7b": "TheBloke/Mistral-7B-Instruct-v0.3-GGUF:mistral-7b-instruct-v0.3.Q4_K_M.gguf",  # Reliable summarization
    # === Coding Models ===
    "deepseek-coder": "TheBloke/deepseek-coder-6.7B-instruct-GGUF:deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    # === General Purpose ===
    "deepseek-chat": "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
    "llama-3.2-3b": "bartowski/Llama-3.2-3B-Instruct-GGUF:Llama-3.2-3B-Instruct-Q4_K_M.gguf",  # Fast & lightweight
}
# Global state
current_model = "deepseek-chat" # Default model
llama_process: Optional[subprocess.Popen] = None
LLAMA_SERVER_PORT = 8080
LLAMA_SERVER_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
class ModelSwitchRequest(BaseModel):
    model_name: str


class ChatCompletionRequest(BaseModel):
    messages: list[dict]
    max_tokens: int = 256
    temperature: float = 0.7
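# Illustrative request body for /v1/chat/completions (field names match the model above;
# the values are only an example):
#   {"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 128, "temperature": 0.2}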
def start_llama_server(model_id: str) -> subprocess.Popen:
    """Start llama-server with specified model (optimized for speed)."""
    cmd = [
        "llama-server",
        "-hf", model_id,
        "--host", "0.0.0.0",
        "--port", str(LLAMA_SERVER_PORT),
        "-c", "2048",        # Context size
        "-t", "4",           # CPU threads (adjust based on cores)
        "-ngl", "0",         # GPU layers (0 for CPU-only)
        "--cont-batching",   # Enable continuous batching for speed
        "-b", "512",         # Batch size
    ]
    print(f"Starting llama-server with model: {model_id}")
    print("This may take 2-3 minutes to download and load the model...")
    # On POSIX systems the server runs in its own session (and process group),
    # so stop_llama_server() can signal the whole group later.
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        preexec_fn=os.setsid if os.name != 'nt' else None,
        text=True,
        bufsize=1,
    )
    # Wait for the server to become reachable (generous timeout to allow for model download)
    max_retries = 300  # ~5 minutes at 1-second intervals
    for i in range(max_retries):
        # Bail out early if the process died
        if process.poll() is not None:
            stdout, _ = process.communicate()
            print(f"llama-server exited with code {process.returncode}")
            print(f"Output: {stdout}")
            raise RuntimeError("llama-server process died")
        try:
            # Probe the root endpoint instead of /health
            response = requests.get(f"{LLAMA_SERVER_URL}/", timeout=2)
            if response.status_code in (200, 404):  # 404 is ok, means the server is up
                print(f"llama-server ready after {i + 1} seconds")
                return process
        except requests.exceptions.ConnectionError:
            # Server not listening yet
            pass
        except Exception:
            # Other transient errors; keep waiting
            pass
        time.sleep(1)
    raise RuntimeError("llama-server failed to start within 5 minutes")
def stop_llama_server():
    """Stop the running llama-server."""
    global llama_process
    if llama_process:
        print("Stopping llama-server...")
        try:
            if os.name != 'nt':
                # Terminate the whole process group created via os.setsid above
                os.killpg(os.getpgid(llama_process.pid), signal.SIGTERM)
            else:
                llama_process.terminate()
            llama_process.wait(timeout=10)
        except Exception:
            # Graceful shutdown failed or timed out; force-kill
            if os.name != 'nt':
                os.killpg(os.getpgid(llama_process.pid), signal.SIGKILL)
            else:
                llama_process.kill()
        llama_process = None
        time.sleep(2)  # Give it time to fully shut down
@app.on_event("startup")
async def startup_event():
"""Start with default model."""
global llama_process
model_id = AVAILABLE_MODELS[current_model]
llama_process = start_llama_server(model_id)
@app.on_event("shutdown")
async def shutdown_event():
"""Clean shutdown."""
stop_llama_server()
@app.get("/")
async def root():
return {
"status": "DeepSeek API with dynamic model switching",
"current_model": current_model,
"available_models": list(AVAILABLE_MODELS.keys())
}
@app.get("/models")
async def list_models():
"""List all available models."""
return {
"current_model": current_model,
"available_models": list(AVAILABLE_MODELS.keys())
}
@app.post("/switch-model")
async def switch_model(request: ModelSwitchRequest):
"""Switch to a different model."""
global current_model, llama_process
if request.model_name not in AVAILABLE_MODELS:
raise HTTPException(
status_code=400,
detail=f"Model '{request.model_name}' not found. Available: {list(AVAILABLE_MODELS.keys())}"
)
if request.model_name == current_model:
return {"message": f"Already using model: {current_model}"}
# Stop current server
stop_llama_server()
# Start with new model
model_id = AVAILABLE_MODELS[request.model_name]
llama_process = start_llama_server(model_id)
current_model = request.model_name
return {
"message": f"Switched to model: {current_model}",
"model": current_model
}
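# Illustrative client call for /switch-model. The FastAPI port is not set in this file,
# so 8000 below is only an assumption about how uvicorn is launched:
#   curl -X POST http://localhost:8000/switch-model \
#        -H "Content-Type: application/json" \
#        -d '{"model_name": "qwen-2.5-7b"}'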
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
"""OpenAI-compatible chat completions endpoint."""
try:
# Forward to llama-server
response = requests.post(
f"{LLAMA_SERVER_URL}/v1/chat/completions",
json={
"messages": request.messages,
"max_tokens": request.max_tokens,
"temperature": request.temperature,
},
timeout=300
)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
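# Illustrative chat completion request against this proxy (again assuming port 8000):
#   curl http://localhost:8000/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Summarize this quarter in one line."}], "max_tokens": 64}'

# Minimal local-run hook, a sketch only: the Space's Dockerfile is expected to launch the app
# itself, and the port here (7860, the usual Hugging Face Spaces default) is an assumption.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)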