Spaces:
Sleeping
Sleeping
File size: 1,742 Bytes
import os
import threading

from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
app = FastAPI()

# CORS (allow all for now): every origin, method, and header is accepted.
_cors_settings = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_settings)
# Global model variable; remains None until load_model() succeeds.
llm = None


def load_model(
    *,
    repo_id: str = "devray11/Aevis-Medical-SLM",
    filename: str = "Aevis.Q4_K_M.gguf",
    n_ctx: int = 128,
    n_threads: int = 2,
    n_batch: int = 16,
) -> None:
    """Download a GGUF model from Hugging Face and bind it to the global ``llm``.

    Calling ``load_model()`` with no arguments behaves exactly as before; the
    keyword-only parameters only expose the values that used to be hard-coded.

    Args:
        repo_id: Hugging Face repository that hosts the model file.
        filename: Name of the GGUF file inside that repository.
        n_ctx: Context size passed to ``Llama`` (default reduced for low RAM).
        n_threads: Thread count passed to ``Llama`` (HF free CPU = 2 cores).
        n_batch: Batch size passed to ``Llama``.

    Returns:
        None. The result is published through the module-level ``llm``.
        Any exception during download or initialization is caught and
        printed, and ``llm`` is set to ``None``, so this function never raises
        for those failures.
    """
    global llm
    try:
        print("📥 Downloading model from Hugging Face...")
        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
        print("⚙️ Initializing model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=n_ctx,  # Reduced for low RAM
            n_threads=n_threads,  # HF free CPU = 2 cores
            n_batch=n_batch,
            use_mmap=True,
            use_mlock=False,
        )
        print("✅ Model Loaded Successfully")
    except Exception as e:
        # Deliberate best-effort: a failed load must not crash the process;
        # request handlers check for `llm is None` instead.
        print(f"❌ Model Load Error: {e}")
        llm = None
# Load model at startup: this call runs once when the module is imported.
# load_model() catches its own errors, so a failed load leaves `llm` as None
# rather than aborting the import.
load_model()
# Request body accepted by POST /generate.
class Query(BaseModel):
    # User text that the handler inserts into the instruction template.
    prompt: str
# Inference runs on worker threads (see generate), so access to the single
# shared model object is serialized here. NOTE(review): llama-cpp-python does
# not advertise Llama as safe for concurrent calls — confirm before removing.
_INFERENCE_LOCK = threading.Lock()


def _run_inference(prompt: str) -> str:
    """Run one blocking completion on the global model and return its text.

    Args:
        prompt: User text placed in the "### Instruction:" section.

    Returns:
        The first choice's generated text with surrounding whitespace removed.
    """
    with _INFERENCE_LOCK:
        completion = llm(
            f"### Instruction:\n{prompt}\n\n### Response:\n",
            max_tokens=64,  # Reduced for speed
            stop=["###"],
            echo=False,
        )
    return completion["choices"][0]["text"].strip()


# POST /generate: returns {"response": <text>} on success, or {"error": <msg>}
# when the model is not loaded or inference raises. Both shapes are sent as
# ordinary JSON bodies, exactly as before.
@app.post("/generate")
async def generate(query: Query):
    if llm is None:
        return {"error": "Model not initialized"}
    try:
        # The model call is blocking and CPU-bound; awaiting it on a worker
        # thread keeps the event loop free to serve other requests meanwhile.
        text = await run_in_threadpool(_run_inference, query.prompt)
    except Exception as e:
        return {"error": str(e)}
    return {"response": text}
# Health check: GET / answers with a fixed status payload.
@app.get("/")
def health():
    status_payload = {"status": "Aevis API is running 🚀"}
    return status_payload