hsuwill000 committed on
Commit
34ec8a9
·
verified ·
1 Parent(s): 6651044

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -125
app.py CHANGED
@@ -1,10 +1,6 @@
1
  import socket
2
  import gradio as gr
3
- import requests
4
- import json
5
- import time
6
- import threading
7
- import queue
8
 
9
  def get_local_ip():
10
  s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
@@ -19,146 +15,90 @@ def get_local_ip():
19
 
20
  print("本機 IP:", get_local_ip())
21
 
22
def llama_http_stream_worker(message, history, system_message, max_tokens, temperature, top_p, output_queue):
    """Stream a chat completion from llama.cpp over raw HTTP, bypassing the OpenAI library.

    Intended to run on a daemon worker thread. Results are pushed to *output_queue*
    as ``(kind, payload)`` tuples, where kind is:
      - "chunk"    — cumulative assistant text so far,
      - "complete" — final assistant text,
      - "error"    — human-readable error message.
    The function never raises: every failure is converted into an "error" item.
    """
    try:
        url = "http://0.0.0.0:8000/v1/chat/completions"
        headers = {
            "Content-Type": "application/json",
            "Authorization": "Bearer sk-local"  # llama.cpp ignores the token value; the header just has to exist
        }

        # OpenAI-style message list: system prompt, prior turns, then the new user turn.
        messages = [{"role": "system", "content": system_message}]
        messages.extend(history)
        messages.append({"role": "user", "content": message})

        payload = {
            "model": "qwen3",
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            "stream": True
        }

        print(f"[Request] Sending request to llama.cpp...")

        # 60 s connect timeout, 3600 s read timeout (first token can take a long time).
        # FIX: use the response as a context manager so the streamed connection is
        # always released, even if parsing raises or we break out of the loop early —
        # the original never closed it, leaking the pooled connection per request.
        with requests.post(
            url,
            json=payload,
            headers=headers,
            stream=True,
            timeout=(60, 3600)
        ) as response:
            if response.status_code == 200:
                output = ""
                for line in response.iter_lines(decode_unicode=True, chunk_size=1):
                    # SSE frames look like "data: {...}"; the stream ends with "data: [DONE]".
                    if line and line.startswith('data: '):
                        data = line[6:].strip()
                        if data == '[DONE]':
                            break

                        try:
                            chunk = json.loads(data)
                            if 'choices' in chunk and chunk['choices']:
                                delta = chunk['choices'][0].get('delta', {})
                                if delta and delta.get('content'):
                                    content = delta['content']
                                    output += content
                                    output_queue.put(("chunk", output))
                                    print(f"[Chunk]: {content}", end="", flush=True)
                        except json.JSONDecodeError as e:
                            # Skip malformed frames instead of killing the whole stream.
                            print(f"[JSON Error] {e}, line: {line}")
                            continue

                output_queue.put(("complete", output))
                print(f"[Request] Completed successfully")

            else:
                error_msg = f"⚠️ HTTP錯誤: {response.status_code} - {response.text}"
                print(f"[Error] {error_msg}")
                output_queue.put(("error", error_msg))

    except requests.exceptions.Timeout:
        error_msg = "⚠️ 請求超時(第一個token生成時間太長)"
        print(f"[Error] {error_msg}")
        output_queue.put(("error", error_msg))
    except requests.exceptions.ConnectionError:
        error_msg = "⚠️ 連接錯誤(請檢查llama.cpp伺服器是否運行)"
        print(f"[Error] {error_msg}")
        output_queue.put(("error", error_msg))
    except Exception as e:
        # Boundary handler: this runs on a worker thread, so an uncaught exception
        # would vanish silently — report it through the queue instead.
        error_msg = f"⚠️ 未知錯誤: {str(e)}"
        print(f"[Error] {error_msg}")
        output_queue.put(("error", error_msg))
96
 
97
def respond(message, history, system_message, max_tokens, temperature, top_p):
    """Gradio streaming handler.

    Spawns the llama.cpp HTTP worker on a daemon thread, then relays its queue
    items to Gradio as assistant messages, re-yielding the current partial text
    as a heartbeat while the model stays silent so the connection is kept alive.
    """
    results = queue.Queue()
    worker = threading.Thread(
        target=llama_http_stream_worker,
        args=(message, history, system_message, max_tokens, temperature, top_p, results),
        daemon=True,
    )
    worker.start()

    partial = ""
    heartbeat_every = 2.0  # seconds of silence before re-yielding current text
    last_emit = time.time()

    while True:
        try:
            kind, text = results.get(timeout=0.5)
        except queue.Empty:
            # Worker died without signalling completion -> report and stop.
            if not worker.is_alive():
                yield {"role": "assistant", "content": "⚠️ 伺服器處理異常中斷"}
                return
            # Heartbeat: re-send what we have so the client link stays active.
            now = time.time()
            if now - last_emit > heartbeat_every:
                if partial:
                    yield {"role": "assistant", "content": partial}
                last_emit = now
            continue

        if kind == "chunk":
            partial = text
            yield {"role": "assistant", "content": partial}
            last_emit = time.time()
        elif kind in ("complete", "error"):
            # Both terminal items carry the final text to show; then stop.
            yield {"role": "assistant", "content": text}
            return
144
 
145
- # Gradio 介面
 
 
 
 
146
# Chat UI: OpenAI-style "messages" history plus the sampling controls
# that are forwarded to respond() as additional inputs.
_extra_controls = [
    gr.Textbox(value="You are a friendly assistant.", label="System message"),
    gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
]
demo = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=_extra_controls,
    title="Llama.cpp Chat Interface",
    description="直接連接llama.cpp伺服器,避免OpenAI library超時問題",
)
158
 
159
if __name__ == "__main__":
    # Serve on all interfaces, port 7860, local only (no public share link).
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
1
  import socket
2
  import gradio as gr
3
+ from openai import OpenAI
 
 
 
 
4
 
5
  def get_local_ip():
6
  s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
 
15
 
16
  print("本機 IP:", get_local_ip())
17
 
18
# Client for the local llama.cpp server exposed through the OpenAI-compatible API.
# FIX: connect to 127.0.0.1 — "0.0.0.0" is a *bind* address, not a connect target;
# using it as a client destination only works by accident on some platforms.
client = OpenAI(
    base_url="http://127.0.0.1:8000/v1",
    api_key="sk-local",  # llama.cpp does not validate the key; it just has to be present
    timeout=1200         # generous client timeout: first-token latency can be very long
)
24
+
25
# Streaming chat handler (generator): builds the OpenAI-style message list and
# relays tokens from the local llama.cpp server as they arrive.
def respond(message, history, system_message, max_tokens, temperature, top_p):
    # With type="messages", history is already a list of
    # {"role": "user"/"assistant", "content": "..."} dicts.
    conversation = [
        {"role": "system", "content": system_message},
        *history,
        {"role": "user", "content": message},
    ]

    try:
        # Emit a placeholder immediately so the connection stays active while
        # the model works on its first token.
        yield {"role": "assistant", "content": "⏳ 正在處理您的請求,這可能需要較長時間..."}

        completion_stream = client.chat.completions.create(
            model="qwen3",  # must match the general.name of the model loaded in llama.cpp
            messages=conversation,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=True,
        )

        answer = ""
        for event in completion_stream:
            if not event.choices:
                continue
            piece = event.choices[0].delta
            if piece and piece.content:
                answer += piece.content
                yield {"role": "assistant", "content": answer}

    except Exception as e:
        # Boundary handler: surface a friendly message instead of a traceback.
        print(f"[Error] {e}")
        yield {"role": "assistant", "content": "⚠️ Llama.cpp server 沒有回應,請稍後再試。"}
59
+
60
# Gradio chat UI — type="messages" makes the history use OpenAI-style dicts.
_controls = [
    gr.Textbox(value="You are a friendly assistant.", label="System message"),
    gr.Slider(minimum=1, maximum=4096, value=1024, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
]
demo = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=_controls,
)
71
 
72
if __name__ == "__main__":
    # Patch Gradio's App class so the ASGI server tolerates very long requests.
    from gradio.routes import App

    class CustomApp(App):
        # Subclass that forces long keep-alive timeouts onto the app instance.
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # Stretch the key timeout attributes to 30 minutes.
            # NOTE(review): uvicorn reads timeout_keep_alive from its own Config,
            # not from attributes on the app object — confirm these are honored.
            self.keepalive_timeout = 1800  # 30 minutes
            if hasattr(self, 'timeout_keep_alive'):
                self.timeout_keep_alive = 1800  # 30 minutes

    # Swap in the custom class so Gradio instantiates it instead of the default.
    # NOTE(review): Gradio builds its app via App.create_app(); verify this
    # module-attribute swap is actually picked up by demo.launch().
    gr.routes.App = CustomApp

    # Launch the app with extended timeout settings.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        # Forward long keep-alive timeouts to the underlying app.
        # NOTE(review): app_kwargs is passed to the FastAPI constructor — confirm
        # it accepts these keys; unexpected kwargs may raise TypeError at startup.
        app_kwargs={
            "keepalive_timeout": 1800,  # 30 minutes
            "timeout_keep_alive": 1800,  # 30 minutes
        },
        # Disable Gradio's client heartbeat check.
        # NOTE(review): `heartbeat` is not a documented launch() parameter in
        # current Gradio releases — confirm the installed version supports it.
        heartbeat=False,
        # Show detailed error messages in the UI.
        show_error=True,
        # Allow more concurrent worker threads.
        max_threads=20,
        # No public gradio.live share link.
        share=False
    )