Update app.py
app.py
CHANGED
@@ -1,29 +1,21 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
-from llama_cpp import Llama
-
-
-
-
-
-
-
-
-
-
-
-# Load the model
-llm = Llama(
-    model_path=MODEL_PATH,
-    n_ctx=2048,
-    n_threads=2,
-    n_batch=64,
-    use_mlock=True
+from ctransformers import AutoModelForCausalLM
+
+# Model configuration for ctransformers (CPU-friendly)
+MODEL_REPO_ID = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
+MODEL_FILE = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"
+
+# Load the model once at startup
+llm = AutoModelForCausalLM.from_pretrained(
+    MODEL_REPO_ID,
+    model_file=MODEL_FILE,
+    model_type="mistral",
+    gpu_layers=0,
+    context_length=2048,
 )
 
-
-app = FastAPI(title="Mistral GGUF LLM API", version="1.0.0")
+app = FastAPI(title="Mistral GGUF LLM API (ctransformers)", version="1.0.0")
 
 class InferenceRequest(BaseModel):
     prompt: str
@@ -35,8 +27,8 @@ class InferenceResponse(BaseModel):
 @app.post("/infer", response_model=InferenceResponse)
 def infer(req: InferenceRequest):
     try:
-
-        return InferenceResponse(output=
+        generated_text = llm(req.prompt, max_new_tokens=req.max_tokens)
+        return InferenceResponse(output=str(generated_text).strip())
     except Exception as e:
         return InferenceResponse(output=f"Error generating response: {str(e)}")
 
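Note: the new handler reads req.max_tokens, but the hunks above only show prompt: str; the request model presumably gains that field in new lines 22-26, which this diff does not display. A plausible sketch of the two Pydantic models, in which the max_tokens field and its default are assumptions rather than content of this commit:

from pydantic import BaseModel

class InferenceRequest(BaseModel):
    prompt: str
    max_tokens: int = 256  # assumed field and default; not visible in the diff

class InferenceResponse(BaseModel):
    output: str  # matches InferenceResponse(output=...) in the handler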
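For reference, once the app is served (for example with uvicorn app:app --host 0.0.0.0 --port 8000; the serve command is not part of this commit), the endpoint can be exercised with a minimal client sketch like the one below. The host, port, and sample prompt are assumptions; the [INST] ... [/INST] wrapper follows the usual Mistral-7B-Instruct prompt format:

import requests

resp = requests.post(
    "http://localhost:8000/infer",
    json={"prompt": "[INST] Explain GGUF in one sentence. [/INST]", "max_tokens": 128},
)
resp.raise_for_status()  # surface HTTP-level errors early
print(resp.json()["output"])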