"""Minimal FastAPI service exposing the Aevis medical SLM through llama.cpp.

The GGUF weights are downloaded from Hugging Face and loaded once at import
time. ``POST /generate`` runs a single completion and ``GET /`` is a health
check.
"""

import os
import threading
from typing import Optional

from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

app = FastAPI()

# CORS (allow all for now)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global model variable; stays None when loading failed.
llm: Optional[Llama] = None

# Inference now runs in worker threads (see ``generate``). The original
# handler ran it on the event loop, which implicitly serialised every call to
# the shared model; this lock keeps that one-at-a-time guarantee.
_llm_lock = threading.Lock()


def load_model() -> None:
    """Download the model file and initialise the global ``llm``.

    Any failure is printed and leaves ``llm`` set to ``None`` instead of
    raising, so the app still starts and ``/generate`` answers with
    "Model not initialized".
    """
    global llm
    try:
        print("📥 Downloading model from Hugging Face...")
        model_path = hf_hub_download(
            repo_id="devray11/Aevis-Medical-SLM",
            filename="Aevis.Q4_K_M.gguf",
        )

        print("⚙️ Initializing model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=128,  # Reduced for low RAM
            n_threads=2,  # HF free CPU = 2 cores
            n_batch=16,
            use_mmap=True,
            use_mlock=False,
        )
        print("✅ Model Loaded Successfully")
    except Exception as e:
        # Deliberate best-effort: report the problem and keep serving.
        print(f"❌ Model Load Error: {e}")
        llm = None


# Load model at startup
load_model()


class Query(BaseModel):
    """Request body for ``/generate``."""

    prompt: str


def _run_inference(prompt: str) -> str:
    """Run one blocking completion for *prompt* and return the stripped text.

    Meant to be called from a worker thread; the caller must have checked
    that ``llm`` is not ``None``.
    """
    with _llm_lock:
        output = llm(
            f"### Instruction:\n{prompt}\n\n### Response:\n",
            max_tokens=64,  # Reduced for speed
            stop=["###"],
            echo=False,
        )
    return output["choices"][0]["text"].strip()


@app.post("/generate")
async def generate(query: Query):
    """Generate a completion for ``query.prompt``.

    Returns ``{"response": <text>}`` on success, or ``{"error": <message>}``
    when the model is not loaded or inference raises.
    """
    if llm is None:
        return {"error": "Model not initialized"}

    try:
        # The llama.cpp call is blocking and CPU-bound. Awaiting it in the
        # threadpool keeps the event loop free, so other requests (e.g. the
        # health check) are still served while a completion is running.
        text = await run_in_threadpool(_run_inference, query.prompt)
    except Exception as e:
        return {"error": str(e)}

    return {"response": text}


@app.get("/")
def health():
    """Health check: report that the API process is up."""
    return {"status": "Aevis API is running 🚀"}