Spaces:
Sleeping
Sleeping
import os
import threading

from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
# ASGI application object that the rest of this module configures.
app = FastAPI()

# CORS (allow all for now): any origin, any method and any header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)

# Global model variable; stays None until load_model() succeeds.
llm = None
def load_model():
    """Download the GGUF weights and initialise the module-level ``llm``.

    Fetches ``Aevis.Q4_K_M.gguf`` from the ``devray11/Aevis-Medical-SLM``
    repository with ``hf_hub_download`` and builds a ``Llama`` instance from
    the returned path. Progress and failures are reported with ``print``.

    Any ``Exception`` raised while downloading or initialising is caught and
    reported, and ``llm`` is reset to ``None``.

    Returns:
        None. The outcome is published through the global ``llm``.
    """
    global llm
    try:
        print("📥 Downloading model from Hugging Face...")
        model_path = hf_hub_download(
            repo_id="devray11/Aevis-Medical-SLM",
            filename="Aevis.Q4_K_M.gguf",
        )
        print("⚙️ Initializing model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=128,  # Reduced for low RAM
            n_threads=2,  # HF free CPU = 2 cores
            n_batch=16,
            use_mmap=True,
            use_mlock=False,
        )
        print("✅ Model Loaded Successfully")
    except Exception as e:
        # Best-effort startup: report the problem and leave the model unset.
        print(f"❌ Model Load Error: {e}")
        llm = None
# Populate the global model once, when this module is first executed.
load_model()
class Query(BaseModel):
    # Request payload: the instruction text that generate() inserts into its
    # "### Instruction / ### Response" prompt template.
    prompt: str
# Keeps completions strictly one-at-a-time on the single shared model object
# once inference is executed on worker threads.
_inference_lock = threading.Lock()


def _run_inference(model, prompt_text):
    """Run one blocking completion on *model* while holding the inference lock."""
    with _inference_lock:
        return model(
            prompt_text,
            max_tokens=64,  # Reduced for speed
            stop=["###"],
            echo=False,
        )


async def generate(query: Query):
    """Generate a completion for ``query.prompt`` with the global model.

    The prompt is wrapped in the ``### Instruction`` / ``### Response``
    template and passed to the model with ``max_tokens=64`` and ``"###"`` as
    the stop sequence.

    Args:
        query: Request body carrying the instruction text in ``prompt``.

    Returns:
        ``{"response": <stripped text of the first choice>}`` on success, or
        ``{"error": <message>}`` when the model is not loaded or the
        completion raises.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered on ``app``.
    model = llm  # read-only access, so no ``global`` statement is needed
    if model is None:
        return {"error": "Model not initialized"}

    prompt_text = f"### Instruction:\n{query.prompt}\n\n### Response:\n"
    try:
        # The completion call is blocking; running it on a worker thread keeps
        # the event loop free to serve other requests in the meantime.
        output = await run_in_threadpool(_run_inference, model, prompt_text)
        return {"response": output["choices"][0]["text"].strip()}
    except Exception as e:
        return {"error": str(e)}
def health():
    """Return a static status payload indicating the API process is up.

    Returns:
        A dict with a single ``"status"`` key holding a fixed message.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered on ``app``.
    return {"status": "Aevis API is running 🚀"}