AnatoliiG committed on
Commit
1f21c8c
·
1 Parent(s): 19b570f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -0
app.py CHANGED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ import gradio as gr
4
+ import uvicorn
5
+ from fastapi import FastAPI, Request
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
8
+ from gradio import mount_gradio_app
9
+ from huggingface_hub import hf_hub_download
10
+ from llama_cpp import Llama
11
+
12
# Hugging Face Hub repository and file name of the quantized GGUF weights.
REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"

# Fetch the weights file via the Hub client; the result is passed to Llama as
# its model path below.
print(f"Loading model {REPO_ID}...")
model_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)

# One module-level Llama instance, created at import time with an 8192-token
# context window and two threads.
print("Initializing Llama...")
llm = Llama(verbose=False, n_threads=2, n_ctx=8192, model_path=model_path)
25
+
26
app = FastAPI()

# Open CORS policy: any origin, method and header may call the API.
# Credentials stay disabled because a wildcard origin must not be combined
# with allow_credentials=True (browsers reject that combination, and the
# FastAPI CORS documentation forbids it). The endpoints in this file do not
# read cookies or auth headers, so nothing relies on credentialed requests.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
35
+
36
+
37
def _invalid_request(message):
    """Build a 400 response carrying *message* in an ``error`` object."""
    return JSONResponse(
        status_code=400,
        content={"error": {"message": message, "type": "invalid_request_error"}},
    )


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Run a chat completion for the JSON body of *request*.

    Recognised body keys: ``messages`` (required, non-empty list),
    ``stream`` (default ``False``), ``temperature`` (default ``0.2``) and
    ``max_tokens`` (default ``2048``).

    Returns:
        A ``StreamingResponse`` of server-sent events terminated by
        ``data: [DONE]`` when ``stream`` is truthy, otherwise a
        ``JSONResponse`` with the completion. Malformed requests get a
        400 ``JSONResponse`` instead of an unhandled server error.
    """
    try:
        data = await request.json()
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass.
        return _invalid_request("Request body must be valid JSON.")

    if not isinstance(data, dict):
        return _invalid_request("Request body must be a JSON object.")

    messages = data.get("messages", [])
    if not isinstance(messages, list) or not messages:
        return _invalid_request("'messages' must be a non-empty list.")

    stream = bool(data.get("stream", False))
    temperature = data.get("temperature", 0.2)
    max_tokens = data.get("max_tokens", 2048)

    # NOTE(review): this call is synchronous; for non-streaming requests it
    # runs to completion inside the async handler. Confirm whether that is
    # acceptable for the deployment before moving it to a worker thread.
    output = llm.create_chat_completion(
        messages=messages, max_tokens=max_tokens, temperature=temperature, stream=stream
    )

    if not stream:
        return JSONResponse(content=output)

    def iter_content():
        try:
            for chunk in output:
                yield f"data: {json.dumps(chunk)}\n\n"
        except Exception as e:
            # Tell the client the stream failed rather than ending it as if
            # generation had finished normally.
            print(f"Streaming error: {e}")
            error = {"error": {"message": str(e), "type": "server_error"}}
            yield f"data: {json.dumps(error)}\n\n"
        # Deliberately not in a ``finally`` block: yielding while the
        # generator is being closed (GeneratorExit) raises RuntimeError.
        yield "data: [DONE]\n\n"

    return StreamingResponse(
        iter_content(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
    )
67
+
68
+
69
def gradio_interface(message, history):
    """Stream an assistant reply for the Gradio chat UI.

    Args:
        message: The latest user message.
        history: Previous turns, either as ``(user, assistant)`` pairs or as
            dicts with ``"role"`` and ``"content"`` keys. ``None`` or an
            empty list means there are no previous turns.

    Yields:
        The reply accumulated so far, growing with every streamed chunk
        that carries content.
    """
    messages = [{"role": "system", "content": "You are an expert coding assistant."}]

    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-style history: keep plain-text user/assistant turns.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
            continue

        # Pair-style history; either side may be None and is then skipped.
        user_text, assistant_text = turn
        if user_text is not None:
            messages.append({"role": "user", "content": user_text})
        if assistant_text is not None:
            messages.append({"role": "assistant", "content": assistant_text})

    messages.append({"role": "user", "content": message})

    response_stream = llm.create_chat_completion(
        messages=messages, max_tokens=2048, temperature=0.4, stream=True
    )

    partial_text = ""
    for chunk in response_stream:
        # Chunks without a "content" entry (or with a None one) add no text.
        content = chunk["choices"][0]["delta"].get("content")
        if content is not None:
            partial_text += content
            yield partial_text
86
+
87
+
88
# Chat UI backed by gradio_interface; the description shows the API path.
demo = gr.ChatInterface(
    gradio_interface,
    description="API endpoint: /v1/chat/completions",
    title="Qwen 2.5 Coder API",
)

# Mount the UI on the FastAPI app under /ui and keep the returned app.
app = mount_gradio_app(app=app, blocks=demo, path="/ui")
95
+
96
+
97
@app.get("/")
async def root():
    """Redirect the site root to the UI mounted at ``/ui``."""
    ui_path = "/ui"
    return RedirectResponse(ui_path)
100
+
101
+
102
# Start the server on all interfaces, port 7860, when run as a script.
if __name__ == "__main__":
    uvicorn.run(app, port=7860, host="0.0.0.0")