Update app.py

app.py CHANGED
@@ -1,42 +1,99 @@
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
-import uvicorn
+import asyncio
+import time
 
 app = FastAPI()
 
+# =========================
+# MODEL LOADING
+# =========================
+
 MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
 MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
 
 print("Downloading model...")
-model_path = hf_hub_download(
-    repo_id=MODEL_REPO,
-    filename=MODEL_FILE
-)
+model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
 
 print("Loading model...")
-llm = Llama(
-    model_path=model_path,
-    n_ctx=2048,
-    n_threads=2
-)
-
+llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
 print("Model loaded successfully!")
 
+# =========================
+# QUEUE SYSTEM
+# =========================
+
+request_queue = asyncio.Queue()
+MAX_CONCURRENT = 1  # Balanced mode: 1 worker for stability
+
+# =========================
+# REQUEST MODEL
+# =========================
+
+class PromptRequest(BaseModel):
+    prompt: str
+    max_tokens: int = 200
+
+# =========================
+# WORKER FUNCTION
+# =========================
+
+async def worker():
+    while True:
+        request, future = await request_queue.get()
+        try:
+            start = time.time()
+
+            result = llm(
+                request.prompt,
+                max_tokens=request.max_tokens,
+                stop=["</s>"]
+            )
+
+            response = result["choices"][0]["text"]
+
+            future.set_result({
+                "response": response,
+                "processing_time": round(time.time() - start, 2)
+            })
+
+        except Exception as e:
+            future.set_exception(e)
+
+        request_queue.task_done()
+
+# =========================
+# START WORKER ON STARTUP
+# =========================
+
+@app.on_event("startup")
+async def startup_event():
+    for _ in range(MAX_CONCURRENT):
+        asyncio.create_task(worker())
+
+# =========================
+# API ENDPOINTS
+# =========================
+
 @app.get("/")
-def
-    return {
+def health():
+    return {
+        "status": "AI Gateway Running",
+        "queue_size": request_queue.qsize(),
+        "mode": "Balanced"
+    }
 
-@app.
-def generate(
-
-    prompt,
-    max_tokens=200,
-    temperature=0.7
-)
-    return {"response": output}
+@app.post("/generate")
+async def generate(request: PromptRequest):
+    future = asyncio.get_event_loop().create_future()
 
+    await request_queue.put((request, future))
 
+    try:
+        result = await asyncio.wait_for(future, timeout=120)
+        return result
+    except asyncio.TimeoutError:
+        raise HTTPException(status_code=504, detail="Request timed out")
 
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)
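For quick testing, here is a minimal client sketch for the two endpoints this commit defines. The base URL is a placeholder (the Space's actual URL is not shown in this diff; port 7860 only mirrors the removed uvicorn.run() line), and the use of the requests library is an illustrative assumption, not part of the commit.

import requests

# Hypothetical base URL: substitute the real Space endpoint. Port 7860 only
# mirrors the uvicorn.run() call removed in this commit.
BASE_URL = "http://localhost:7860"

# GET / is the health check: status, current queue depth, and mode.
print(requests.get(f"{BASE_URL}/", timeout=10).json())

# POST /generate takes a JSON body matching the PromptRequest model.
resp = requests.post(
    f"{BASE_URL}/generate",
    json={"prompt": "Say hello.", "max_tokens": 64},
    timeout=130,  # slightly above the server's 120 s asyncio.wait_for limit
)
resp.raise_for_status()
data = resp.json()
print(data["response"], data["processing_time"])

With MAX_CONCURRENT = 1, concurrent POSTs simply queue: each caller awaits its own future until the single worker reaches it, or the 120-second timeout returns a 504.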
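The worker wiring above is the standard asyncio queue-plus-future hand-off, independent of FastAPI or llama.cpp. A stripped-down sketch of just that pattern (all names here are illustrative):

import asyncio

async def worker(queue: asyncio.Queue) -> None:
    # Consume (job, future) pairs forever; the commit's worker() does the
    # same, with a blocking llm() call in place of this stand-in.
    while True:
        job, future = await queue.get()
        try:
            future.set_result(f"done: {job}")
        except Exception as e:
            future.set_exception(e)
        queue.task_done()

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    asyncio.create_task(worker(queue))
    future = asyncio.get_running_loop().create_future()
    await queue.put(("hello", future))
    # Mirrors the endpoint: enqueue, then await the future with a timeout.
    print(await asyncio.wait_for(future, timeout=5))

asyncio.run(main())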