"""HTTP API serving Qwen2.5-3B-Instruct (GGUF) through llama-cpp-python.

Importing this module fetches the model file with ``hf_hub_download`` and
loads it into a module-level ``Llama`` instance, so import is slow and needs
network access unless the file is already available locally.

Endpoints:
    GET  /        -- banner message.
    GET  /health  -- liveness probe.
    POST /api     -- generate a reply for ``{"prompt": "..."}``.
"""

import logging
import os
import threading

from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

logger = logging.getLogger(__name__)

app = FastAPI()

# Optimized Model Configuration (Qwen2.5-3B is faster than Phi-3)
MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

# ChatML turn delimiters used by the prompt template below. They must never
# reach the model from user-supplied text, otherwise a client could close the
# user turn and inject its own system/assistant turns.
_CHATML_MARKERS = ("<|im_start|>", "<|im_end|>")

print("Downloading Qwen2.5-3B model (Faster & Smarter)...")
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)

print("Loading model for fast CPU inference...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,  # Context window
    n_threads=2,  # HF Free tier has 2 vCPUs
    n_batch=512,  # Process 512 tokens at once for speed
    verbose=False,
)

# The shared Llama object is treated as unsafe for concurrent use: every
# generation takes this lock so overlapping requests run one after another.
_llm_lock = threading.Lock()


class PromptRequest(BaseModel):
    """Request body for ``POST /api``."""

    # Raw user message; it is wrapped in a ChatML template before inference.
    prompt: str


def _build_chatml_prompt(user_text: str) -> str:
    """Wrap *user_text* in the ChatML template with a fixed system message.

    ChatML delimiters found inside *user_text* are replaced by a space.
    Replacing (rather than deleting) guarantees that removing one marker can
    never splice the surrounding characters into a new marker.
    """
    for marker in _CHATML_MARKERS:
        user_text = user_text.replace(marker, " ")
    return (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n{user_text}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )


def _generate(user_text: str) -> str:
    """Run one blocking completion for *user_text* and return the reply text.

    Intended to be called from a worker thread, never from the event loop.
    """
    # Qwen2.5 uses ChatML format for best results
    formatted_prompt = _build_chatml_prompt(user_text)
    with _llm_lock:
        output = llm(
            formatted_prompt,
            max_tokens=1024,  # Increased limit
            stop=["<|im_end|>"],
            echo=False,
        )
    return output["choices"][0]["text"].strip()


@app.get("/")
def read_root():
    """Return a banner pointing clients at ``POST /api``."""
    return {"message": "High-Speed Qwen2.5-3B API is running. Use POST /api."}


@app.get("/health")
def health_check():
    """Liveness probe; performs no model work."""
    return {"status": "alive"}


@app.post("/api")
async def generate_response(request: PromptRequest):
    """Generate a model reply for ``request.prompt``.

    Returns:
        ``{"status": "success", "text": <reply>}``.

    Raises:
        HTTPException: status 500 with the error text when inference fails.
    """
    try:
        # Inference is CPU-bound and blocking; running it in the thread pool
        # keeps the event loop free so other routes stay responsive.
        response_text = await run_in_threadpool(_generate, request.prompt)
    except Exception as exc:
        logger.exception("Error: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    return {
        "status": "success",
        "text": response_text,
    }


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)