AGI / app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import subprocess
import signal
import os
import requests
import time
from typing import Optional
app = FastAPI()
# Predefined list of available models
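# Each value is "<Hugging Face repo>:<GGUF filename>", passed to llama-server via its -hf flag below.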
AVAILABLE_MODELS = {
    # === Financial & Summarization Models (Recommended) ===
    "qwen-2.5-7b": "bartowski/Qwen2.5-7B-Instruct-GGUF:Qwen2.5-7B-Instruct-Q4_K_M.gguf",  # Best for financial + multilingual
    "kimi-k2-9b": "bartowski/k2-chat-GGUF:k2-chat-Q4_K_M.gguf",  # Kimi K2 - long context, good reasoning
    "yi-1.5-9b": "bartowski/Yi-1.5-9B-Chat-GGUF:Yi-1.5-9B-Chat-Q4_K_M.gguf",  # Excellent for finance
    "llama-3.1-8b": "bartowski/Meta-Llama-3.1-8B-Instruct-GGUF:Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",  # Great reasoning
    "mistral-7b": "TheBloke/Mistral-7B-Instruct-v0.3-GGUF:mistral-7b-instruct-v0.3.Q4_K_M.gguf",  # Reliable summarization
    # === Coding Models ===
    "deepseek-coder": "TheBloke/deepseek-coder-6.7B-instruct-GGUF:deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    # === General Purpose ===
    "deepseek-chat": "TheBloke/deepseek-llm-7B-chat-GGUF:deepseek-llm-7b-chat.Q4_K_M.gguf",
    "llama-3.2-3b": "bartowski/Llama-3.2-3B-Instruct-GGUF:Llama-3.2-3B-Instruct-Q4_K_M.gguf",  # Fast & lightweight
}
# Global state
current_model = "deepseek-chat" # Default model
llama_process: Optional[subprocess.Popen] = None
LLAMA_SERVER_PORT = 8080
LLAMA_SERVER_URL = f"http://localhost:{LLAMA_SERVER_PORT}"
class ModelSwitchRequest(BaseModel):
    model_name: str


class ChatCompletionRequest(BaseModel):
    messages: list[dict]
    max_tokens: int = 256
    temperature: float = 0.7
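# Illustrative request body for /v1/chat/completions (field names match the model above;
# the values are only an example):
#   {"messages": [{"role": "user", "content": "Hello"}], "max_tokens": 128, "temperature": 0.2}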
def start_llama_server(model_id: str) -> subprocess.Popen:
    """Start llama-server with specified model (optimized for speed)."""
    cmd = [
        "llama-server",
        "-hf", model_id,
        "--host", "0.0.0.0",
        "--port", str(LLAMA_SERVER_PORT),
        "-c", "2048",        # Context size
        "-t", "4",           # CPU threads (adjust based on cores)
        "-ngl", "0",         # GPU layers (0 for CPU-only)
        "--cont-batching",   # Enable continuous batching for speed
        "-b", "512",         # Batch size
    ]
    print(f"Starting llama-server with model: {model_id}")
    print("This may take 2-3 minutes to download and load the model...")
    # On POSIX systems the server runs in its own session (and process group),
    # so stop_llama_server() can signal the whole group later.
    process = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        preexec_fn=os.setsid if os.name != 'nt' else None,
        text=True,
        bufsize=1,
    )
    # Wait for the server to become reachable (generous timeout to allow for model download)
    max_retries = 300  # ~5 minutes at 1-second intervals
    for i in range(max_retries):
        # Bail out early if the process died
        if process.poll() is not None:
            stdout, _ = process.communicate()
            print(f"llama-server exited with code {process.returncode}")
            print(f"Output: {stdout}")
            raise RuntimeError("llama-server process died")
        try:
            # Probe the root endpoint instead of /health
            response = requests.get(f"{LLAMA_SERVER_URL}/", timeout=2)
            if response.status_code in (200, 404):  # 404 is ok, means the server is up
                print(f"llama-server ready after {i + 1} seconds")
                return process
        except requests.exceptions.ConnectionError:
            # Server not listening yet
            pass
        except Exception:
            # Other transient errors; keep waiting
            pass
        time.sleep(1)
    raise RuntimeError("llama-server failed to start within 5 minutes")
def stop_llama_server():
    """Stop the running llama-server."""
    global llama_process
    if llama_process:
        print("Stopping llama-server...")
        try:
            if os.name != 'nt':
                # Terminate the whole process group created via os.setsid above
                os.killpg(os.getpgid(llama_process.pid), signal.SIGTERM)
            else:
                llama_process.terminate()
            llama_process.wait(timeout=10)
        except Exception:
            # Graceful shutdown failed or timed out; force-kill
            if os.name != 'nt':
                os.killpg(os.getpgid(llama_process.pid), signal.SIGKILL)
            else:
                llama_process.kill()
        llama_process = None
        time.sleep(2)  # Give it time to fully shut down
@app.on_event("startup")
async def startup_event():
"""Start with default model."""
global llama_process
model_id = AVAILABLE_MODELS[current_model]
llama_process = start_llama_server(model_id)
@app.on_event("shutdown")
async def shutdown_event():
"""Clean shutdown."""
stop_llama_server()
@app.get("/")
async def root():
return {
"status": "DeepSeek API with dynamic model switching",
"current_model": current_model,
"available_models": list(AVAILABLE_MODELS.keys())
}
@app.get("/models")
async def list_models():
"""List all available models."""
return {
"current_model": current_model,
"available_models": list(AVAILABLE_MODELS.keys())
}
@app.post("/switch-model")
async def switch_model(request: ModelSwitchRequest):
"""Switch to a different model."""
global current_model, llama_process
if request.model_name not in AVAILABLE_MODELS:
raise HTTPException(
status_code=400,
detail=f"Model '{request.model_name}' not found. Available: {list(AVAILABLE_MODELS.keys())}"
)
if request.model_name == current_model:
return {"message": f"Already using model: {current_model}"}
# Stop current server
stop_llama_server()
# Start with new model
model_id = AVAILABLE_MODELS[request.model_name]
llama_process = start_llama_server(model_id)
current_model = request.model_name
return {
"message": f"Switched to model: {current_model}",
"model": current_model
}
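# Illustrative client call for /switch-model. The FastAPI port is not set in this file,
# so 8000 below is only an assumption about how uvicorn is launched:
#   curl -X POST http://localhost:8000/switch-model \
#        -H "Content-Type: application/json" \
#        -d '{"model_name": "qwen-2.5-7b"}'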
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
"""OpenAI-compatible chat completions endpoint."""
try:
# Forward to llama-server
response = requests.post(
f"{LLAMA_SERVER_URL}/v1/chat/completions",
json={
"messages": request.messages,
"max_tokens": request.max_tokens,
"temperature": request.temperature,
},
timeout=300
)
response.raise_for_status()
return response.json()
except requests.exceptions.RequestException as e:
raise HTTPException(status_code=500, detail=f"llama-server error: {str(e)}")
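# Illustrative chat completion request against this proxy (again assuming port 8000):
#   curl http://localhost:8000/v1/chat/completions \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Summarize this quarter in one line."}], "max_tokens": 64}'

# Minimal local-run hook, a sketch only: the Space's Dockerfile is expected to launch the app
# itself, and the port here (7860, the usual Hugging Face Spaces default) is an assumption.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)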