SharmaGroups07 committed on
Commit
3a784e6
·
verified ·
1 Parent(s): 6e1a924

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -24
app.py CHANGED
@@ -1,42 +1,99 @@
1
- from fastapi import FastAPI
 
2
  from llama_cpp import Llama
3
  from huggingface_hub import hf_hub_download
4
- import uvicorn
 
5
 
6
  app = FastAPI()
7
 
 
 
 
 
8
  MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
9
  MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
10
 
11
  print("Downloading model...")
12
- model_path = hf_hub_download(
13
- repo_id=MODEL_REPO,
14
- filename=MODEL_FILE
15
- )
16
 
17
  print("Loading model...")
18
- llm = Llama(
19
- model_path=model_path,
20
- n_ctx=2048,
21
- n_threads=2
22
- )
23
-
24
  print("Model loaded successfully!")
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  @app.get("/")
27
- def root():
28
- return {"status": "AI engine running"}
 
 
 
 
29
 
30
- @app.get("/generate")
31
- def generate(prompt: str):
32
- output = llm(
33
- prompt,
34
- max_tokens=200,
35
- temperature=0.7
36
- )
37
- return {"response": output}
38
 
 
39
 
 
 
 
 
 
40
 
41
- if __name__ == "__main__":
42
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ from fastapi import FastAPI, HTTPException
2
+ from pydantic import BaseModel
3
  from llama_cpp import Llama
4
  from huggingface_hub import hf_hub_download
5
+ import asyncio
6
+ import time
7
 
8
  app = FastAPI()
9
 
10
+ # =========================
11
+ # MODEL LOADING
12
+ # =========================
13
+
14
  MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
15
  MODEL_FILE = "Phi-3-mini-4k-instruct-q4.gguf"
16
 
17
  print("Downloading model...")
18
+ model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
 
 
 
19
 
20
  print("Loading model...")
21
+ llm = Llama(model_path=model_path, n_ctx=2048, n_threads=2)
 
 
 
 
 
22
  print("Model loaded successfully!")
23
 
24
+ # =========================
25
+ # QUEUE SYSTEM
26
+ # =========================
27
+
28
+ request_queue = asyncio.Queue()
29
+ MAX_CONCURRENT = 1 # Balanced mode: 1 worker for stability
30
+
31
+ # =========================
32
+ # REQUEST MODEL
33
+ # =========================
34
+
35
+ class PromptRequest(BaseModel):
36
+ prompt: str
37
+ max_tokens: int = 200
38
+
39
+ # =========================
40
+ # WORKER FUNCTION
41
+ # =========================
42
+
43
+ async def worker():
44
+ while True:
45
+ request, future = await request_queue.get()
46
+ try:
47
+ start = time.time()
48
+
49
+ result = llm(
50
+ request.prompt,
51
+ max_tokens=request.max_tokens,
52
+ stop=["</s>"]
53
+ )
54
+
55
+ response = result["choices"][0]["text"]
56
+
57
+ future.set_result({
58
+ "response": response,
59
+ "processing_time": round(time.time() - start, 2)
60
+ })
61
+
62
+ except Exception as e:
63
+ future.set_exception(e)
64
+
65
+ request_queue.task_done()
66
+
67
+ # =========================
68
+ # START WORKER ON STARTUP
69
+ # =========================
70
+
71
+ @app.on_event("startup")
72
+ async def startup_event():
73
+ for _ in range(MAX_CONCURRENT):
74
+ asyncio.create_task(worker())
75
+
76
+ # =========================
77
+ # API ENDPOINTS
78
+ # =========================
79
+
80
  @app.get("/")
81
+ def health():
82
+ return {
83
+ "status": "AI Gateway Running",
84
+ "queue_size": request_queue.qsize(),
85
+ "mode": "Balanced"
86
+ }
87
 
88
+ @app.post("/generate")
89
+ async def generate(request: PromptRequest):
90
+ future = asyncio.get_event_loop().create_future()
 
 
 
 
 
91
 
92
+ await request_queue.put((request, future))
93
 
94
+ try:
95
+ result = await asyncio.wait_for(future, timeout=120)
96
+ return result
97
+ except asyncio.TimeoutError:
98
+ raise HTTPException(status_code=504, detail="Request timed out")
99