AnatoliiG committed on
Commit
97ce0ea
·
1 Parent(s): 23a3fca

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -53
app.py CHANGED
@@ -1,27 +1,37 @@
1
  import json
 
2
 
3
  import gradio as gr
4
  import uvicorn
5
  from fastapi import FastAPI, Request
6
  from fastapi.middleware.cors import CORSMiddleware
7
- from fastapi.responses import JSONResponse, RedirectResponse, StreamingResponse
8
  from gradio import mount_gradio_app
9
  from huggingface_hub import hf_hub_download
10
  from llama_cpp import Llama
11
 
 
12
  REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
13
  FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
14
 
 
 
 
15
  print(f"Loading model {REPO_ID}...")
16
- model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
17
-
18
- print("Initializing Llama...")
19
- llm = Llama(
20
- model_path=model_path,
21
- n_ctx=8192,
22
- n_threads=2,
23
- verbose=True,
24
- )
 
 
 
 
 
25
 
26
  app = FastAPI()
27
 
@@ -36,66 +46,105 @@ app.add_middleware(
36
 
37
  @app.post("/v1/chat/completions")
38
  async def chat_completions(request: Request):
39
- data = await request.json()
40
- messages = data.get("messages", [])
41
- stream = data.get("stream", False)
42
- temperature = data.get("temperature", 0.2)
43
- max_tokens = data.get("max_tokens", 2048)
44
-
45
- if not messages:
46
- return JSONResponse(content={"error": "No messages provided"}, status_code=400)
47
-
48
- output = llm.create_chat_completion(
49
- messages=messages, max_tokens=max_tokens, temperature=temperature, stream=stream
50
- )
51
-
52
- if stream:
53
-
54
- def iter_content():
55
- try:
56
- for chunk in output:
57
- yield f"data: {json.dumps(chunk)}\n\n"
58
- except Exception as e:
59
- print(f"Streaming error: {e}")
60
- finally:
61
- yield "data: [DONE]\n\n"
62
-
63
- return StreamingResponse(
64
- iter_content(),
65
- media_type="text/event-stream",
66
- headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
67
  )
68
 
69
- return JSONResponse(content=output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
 
72
  def gradio_interface(message, history):
73
- messages = [{"role": "system", "content": "You are an expert coding assistant."}]
74
- for u, a in history:
 
 
 
 
 
 
 
 
75
  messages.append({"role": "user", "content": u})
76
  messages.append({"role": "assistant", "content": a})
77
  messages.append({"role": "user", "content": message})
78
 
79
- response_stream = llm.create_chat_completion(
80
- messages=messages, max_tokens=2048, temperature=0.4, stream=True
81
- )
82
-
83
  partial_text = ""
84
- for chunk in response_stream:
85
- delta = chunk["choices"][0]["delta"]
86
- if "content" in delta:
87
- partial_text += delta["content"]
88
- yield partial_text
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
 
91
  demo = gr.ChatInterface(
92
  fn=gradio_interface,
93
- title="Qwen 2.5 Coder API",
94
- description="API endpoint: /v1/chat/completions",
 
 
 
 
95
  )
96
 
97
  app = mount_gradio_app(app, demo, path="/")
98
 
99
-
100
  if __name__ == "__main__":
101
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
  import json
2
+ import traceback
3
 
4
  import gradio as gr
5
  import uvicorn
6
  from fastapi import FastAPI, Request
7
  from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.responses import JSONResponse, StreamingResponse
9
  from gradio import mount_gradio_app
10
  from huggingface_hub import hf_hub_download
11
  from llama_cpp import Llama
12
 
# Model configuration: the GGUF file to fetch from the Hugging Face Hub.
REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"

# CONTEXT_SIZE is passed to Llama as n_ctx; MAX_OUTPUT_TOKENS is the default
# max_tokens used by both the HTTP endpoint and the Gradio chat handler.
CONTEXT_SIZE = 8192
MAX_OUTPUT_TOKENS = 4096

print(f"Loading model {REPO_ID}...")
try:
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

    print("Initializing Llama...")
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_SIZE,
        n_threads=2,
        n_batch=512,
        verbose=True,
    )
except Exception as e:
    # Nothing can be served without the model: log the reason and abort
    # startup. A bare ``raise`` re-raises the active exception unchanged.
    print(f"Critical Error loading model: {e}")
    raise

app = FastAPI()
37
 
 
46
 
def _sse_events(chunks):
    """Yield Server-Sent-Events frames for completion chunks, then ``[DONE]``.

    Each chunk is JSON-encoded into a ``data: ...`` frame. If iterating
    ``chunks`` raises, the error is logged and reported to the client as one
    extra chunk with ``finish_reason`` set to ``"error"``.
    """
    try:
        for chunk in chunks:
            yield f"data: {json.dumps(chunk)}\n\n"
    except Exception as e:
        print(f"Streaming error: {e}")
        err_chunk = {
            "choices": [
                {
                    "delta": {"content": f"\n[ERROR]: {str(e)}"},
                    "finish_reason": "error",
                }
            ]
        }
        yield f"data: {json.dumps(err_chunk)}\n\n"
    # The terminator is deliberately NOT in a ``finally`` block: when the
    # generator is closed early (for example after the client disconnects),
    # Python raises GeneratorExit at the suspended ``yield``, and yielding
    # again from ``finally`` would raise RuntimeError. Placed here it is
    # still sent after both normal completion and a reported error.
    yield "data: [DONE]\n\n"


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Chat-completion endpoint backed by the local llama.cpp model.

    Reads a JSON object from the request body and uses these keys:
    ``messages`` (required, non-empty), ``stream`` (default ``False``),
    ``temperature`` (default ``0.4``) and ``max_tokens`` (default
    ``MAX_OUTPUT_TOKENS``).

    Returns:
        * 400 JSON error when the body is not valid JSON, is not a JSON
          object, or carries no messages;
        * a ``text/event-stream`` response when ``stream`` is truthy;
        * the completion as JSON otherwise;
        * 500 JSON error for any other failure.
    """
    try:
        # A malformed body is the client's fault: answer 400, not 500.
        try:
            data = await request.json()
        except ValueError:  # json.JSONDecodeError / UnicodeDecodeError
            return JSONResponse(
                content={"error": "Request body must be valid JSON"},
                status_code=400,
            )
        if not isinstance(data, dict):
            return JSONResponse(
                content={"error": "Request body must be a JSON object"},
                status_code=400,
            )

        messages = data.get("messages", [])
        stream = data.get("stream", False)
        temperature = data.get("temperature", 0.4)
        max_tokens = data.get("max_tokens", MAX_OUTPUT_TOKENS)

        if not messages:
            return JSONResponse(
                content={"error": "No messages provided"}, status_code=400
            )

        output = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream,
        )

        if stream:
            return StreamingResponse(
                _sse_events(output),
                media_type="text/event-stream",
                headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
            )

        return JSONResponse(content=output)

    except Exception as e:
        # Last-resort boundary so the client always gets a JSON error body.
        print(f"API Error: {e}")
        return JSONResponse(content={"error": str(e)}, status_code=500)
99
 
100
 
def _history_to_messages(history, max_turns=10):
    """Convert Gradio chat history into a list of role/content message dicts.

    Accepts both history layouts: a list of ``(user, assistant)`` pairs, or a
    list of ``{"role": ..., "content": ...}`` dicts. Only the most recent
    ``max_turns`` exchanges are kept, and entries whose content is not a
    non-empty string (e.g. ``None`` for a missing reply) are skipped.
    """
    if not history or max_turns <= 0:
        return []

    converted = []
    if isinstance(history[0], dict):
        # One dict per message, so a user/assistant exchange is two entries.
        for entry in history[-2 * max_turns:]:
            role = entry.get("role")
            content = entry.get("content")
            if role in ("user", "assistant") and isinstance(content, str) and content:
                converted.append({"role": role, "content": content})
        return converted

    for user_text, assistant_text in history[-max_turns:]:
        if isinstance(user_text, str) and user_text:
            converted.append({"role": "user", "content": user_text})
        if isinstance(assistant_text, str) and assistant_text:
            converted.append({"role": "assistant", "content": assistant_text})
    return converted


def gradio_interface(message, history):
    """Stream a model reply for the Gradio chat UI.

    Args:
        message: The new user message.
        history: Previous conversation as supplied by Gradio, either as
            ``(user, assistant)`` pairs or as role/content dicts.

    Yields:
        The reply text accumulated so far, once per received content piece.
        If generation fails, one final value is yielded: the partial text
        followed by a formatted error notice.
    """
    messages = [
        {
            "role": "system",
            "content": "You are an expert coding assistant. Write clean, efficient code.",
        }
    ]
    # Only recent turns are forwarded, to limit how much history reaches the model.
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    partial_text = ""
    try:
        response_stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=MAX_OUTPUT_TOKENS,
            temperature=0.4,
            stream=True,
        )

        for chunk in response_stream:
            # Not every streamed delta carries text, so skip those without it.
            piece = chunk["choices"][0]["delta"].get("content")
            if piece:
                partial_text += piece
                yield partial_text

    except Exception as e:
        traceback.print_exc()
        error_msg = f"\n\n🚫 **Error:** {str(e)}\nTry refreshing the page or shortening the context."
        yield partial_text + error_msg
135
 
136
 
# Chat UI built around gradio_interface, with two example prompts.
demo = gr.ChatInterface(
    fn=gradio_interface,
    title="Qwen 2.5 Coder (7B-Instruct)",
    description="Running on CPU. Generation might be slow. Please be patient.",
    examples=[
        "Write a Python script to scrape a website.",
        "Explain how asyncio works in Python.",
    ],
)

# Mount the Gradio UI at the root path of the FastAPI application.
app = mount_gradio_app(app, demo, path="/")


if __name__ == "__main__":
    # Direct execution: serve on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)