from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from ctransformers import AutoModelForCausalLM
import os

# Model configuration for ctransformers (CPU-friendly)
MODEL_REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
MODEL_FILE = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# Ensure the Hugging Face cache directory exists and is writable in the container.
# setdefault also exports the path so huggingface_hub actually uses it when the
# variable is not already set (os.environ.get alone would create an unused dir).
cache_dir = os.environ.setdefault("HUGGINGFACE_HUB_CACHE", "/code/.cache/huggingface")
os.makedirs(cache_dir, exist_ok=True)

# Load the quantized model once at startup; the GGUF file is downloaded from
# the Hugging Face Hub on first run and served from the cache afterwards.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO_ID,
    model_file=MODEL_FILE,
    model_type="mistral",
    gpu_layers=0,         # keep inference entirely on CPU
    context_length=2048,  # token budget shared by prompt and generation
)

app = FastAPI(title="Mistral GGUF LLM API (ctransformers)", version="1.0.0")

class InferenceRequest(BaseModel):
    prompt: str
    max_tokens: int = 256

class InferenceResponse(BaseModel):
    output: str

@app.post("/infer", response_model=InferenceResponse)
def infer(req: InferenceRequest):
    try:
        generated_text = llm(req.prompt, max_new_tokens=req.max_tokens)
        return InferenceResponse(output=str(generated_text).strip())
    except Exception as e:
        # Surface generation failures as a proper HTTP error instead of a
        # 200 response carrying the error text in the output field.
        raise HTTPException(status_code=500, detail=f"Error generating response: {e}")

@app.get("/")
def health():
    return {"status": "LLM is running. Visit /docs for Swagger UI"}
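
# A minimal sketch of how this service might be run and exercised locally,
# assuming uvicorn and curl are available. The module name "main" assumes
# this file is saved as main.py, and port 7860 is the conventional Hugging
# Face Spaces port; both are illustrative assumptions, not part of this app:
#
#   uvicorn main:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/infer \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF in one sentence.", "max_tokens": 64}'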