Simple

Sleeping

Waheeb2001 committed on Jun 25, 2025

Commit

df2f11e

verified ·

1 Parent(s): 8b98824

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -2,8 +2,14 @@ from ctransformers import AutoModelForCausalLM
 from fastapi import FastAPI, Form
 from pydantic import BaseModel
 import logging
 logging.basicConfig(level=logging.INFO)
 try:
     llm = AutoModelForCausalLM.from_pretrained(
         "zephyr-7b-beta.Q4_K_S.gguf",
@@ -15,26 +21,26 @@ try:
 except Exception as e:
     logging.error(f"Model failed to load: {e}")
     raise e
-#Model loading
-llm = AutoModelForCausalLM.from_pretrained("zephyr-7b-beta.Q4_K_S.gguf",
-model_type='mistral',
-max_new_tokens = 1096,
-threads = 3,
-)
-#Pydantic object
-class validation(BaseModel):
     prompt: str
-#Fast API
-app = FastAPI()
-#Zephyr completion
 @app.post("/llm_on_cpu")
-async def stream(item: validation):
     system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
     E_INST = "</s>"
     user, assistant = "<|user|>", "<|assistant|>"
     prompt = f"{system_prompt}{E_INST}\n{user}\n{item.prompt.strip()}{E_INST}\n{assistant}\n"
-    return llm(prompt)

 from fastapi import FastAPI, Form
 from pydantic import BaseModel
 import logging
+# Set up logging
 logging.basicConfig(level=logging.INFO)
+# Initialize FastAPI app
+app = FastAPI()
+# Load the GGUF model once
 try:
     llm = AutoModelForCausalLM.from_pretrained(
         "zephyr-7b-beta.Q4_K_S.gguf",
 except Exception as e:
     logging.error(f"Model failed to load: {e}")
     raise e
+# Define Pydantic model for input validation
+class ValidationModel(BaseModel):
     prompt: str
+# Root endpoint for health checks and UI
+@app.get("/")
+def read_root():
+    return {
+        "status": "running",
+        "message": "Zephyr LLM API is active",
+        "endpoints": ["/llm_on_cpu (POST)"]
+    }
+# LLM inference endpoint
 @app.post("/llm_on_cpu")
+async def stream(item: ValidationModel):
     system_prompt = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.'
     E_INST = "</s>"
     user, assistant = "<|user|>", "<|assistant|>"
     prompt = f"{system_prompt}{E_INST}\n{user}\n{item.prompt.strip()}{E_INST}\n{assistant}\n"
+    response = llm(prompt)
+    return {"response": response}