hsuwill000 committed on
Commit 214e263 · verified · 1 Parent(s): df53ff4

Update app.py

Files changed (1)
  1. app.py +206 -127
app.py CHANGED
@@ -3,172 +3,251 @@
  import os
  import sys
  import subprocess
- import gradio as gr
- from typing import List, Dict
- from huggingface_hub import hf_hub_download

- # --- 0. Inline installation of llama-cpp-python ---
- # Warning: this is a non-standard workaround that may fail.
- # Installing dependencies through requirements.txt is recommended in a Gradio Space.
- try:
-     print("--- Attempting to install llama-cpp-python dynamically ---")
-     # Run the pip install command
-     # Use sys.executable to make sure the current Python interpreter is used
-     subprocess.check_call([
-         sys.executable,
-         "-m",
-         "pip",
-         "install",
-         "llama-cpp-python",
-         "--upgrade"  # make sure it is the latest version
-     ])
-     print("llama-cpp-python installed/updated successfully.")
- except subprocess.CalledProcessError as e:
-     print(f"**Fatal error**: llama-cpp-python installation failed. Check environment permissions or system dependencies. Error: {e}")
-     # Installation failed, so we cannot continue
-     sys.exit(1)
- except Exception as e:
-     print(f"**Fatal error**: an unknown error occurred. Error: {e}")
-     sys.exit(1)

- # --- 1. Import llama_cpp ---
- # This import must come after the installation attempt
  try:
      from llama_cpp import Llama
- except ImportError:
-     print("**Fatal error**: llama_cpp still cannot be imported even after the installation attempt. Check the pip install logs.")
      sys.exit(1)


- # --- 2. Model settings and download ---

- # The model information you specified
  MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"
  MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"

- # Fixed system prompt
- DEFAULT_SYSTEM_MESSAGE = "You are a friendly and helpful assistant. Please answer the user's questions concisely and accurately."

- # Step 1: download the GGUF model
- try:
-     print(f"Attempting to download {MODEL_NAME} from {MODEL_REPO}...")
-     model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
-     print(f"Model downloaded, path: {model_path}")
- except Exception as e:
-     print(f"**Error**: unable to download the model. Error: {e}")
-     sys.exit(1)  # exit if the model cannot be downloaded

- # --- 3. Llama.cpp initialization ---

- # Step 2: initialize the Llama.cpp instance
- try:
-     print("Initializing the Llama.cpp instance...")
-     llm = Llama(
-         model_path=model_path,
-         n_ctx=4096,   # context length
-         n_batch=512,  # batch size
-         # Use only a few CPU cores for Gradio Space stability
-         n_threads=os.cpu_count() // 2 or 1,
-         n_gpu_layers=0,  # CPU inference
-         verbose=False    # disable internal log output
-     )
-     print("Llama.cpp model loaded successfully.")
- except Exception as e:
-     print(f"**Error**: Llama.cpp instance initialization failed. Error: {e}")
-     sys.exit(1)

- # --- 4. Core inference function ---

- def llama_inference(
-     message: str,
-     chat_history: List[List[str]],
-     system_message: str = DEFAULT_SYSTEM_MESSAGE,
-     max_tokens: int = 4096,
-     temperature: float = 0.7,
      top_p: float = 0.95
  ) -> str:
-     """
-     Run inference with the Llama.cpp instance and return the response.
-     """
-
-     # Build the message list with the system prompt and the chat history
-     messages = [{"role": "system", "content": system_message}]
-
-     for human, assistant in chat_history:
-         messages.append({"role": "user", "content": human})
-         messages.append({"role": "assistant", "content": assistant})
-
-     messages.append({"role": "user", "content": message})

      try:
-         # Call Llama.cpp's create_chat_completion interface
-         response = llm.create_chat_completion(
-             messages=messages,
              max_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p,
          )

-         # Parse the response
-         if response.get('choices') and response['choices'][0].get('message'):
-             content = response['choices'][0]['message'].get('content', "⚠️ The LLM service returned empty content.")
              return content

          return "⚠️ The LLM service returned empty content."

      except Exception as e:
-         print(f"[Error] Llama Inference failed: {e}")
-         return f"❌ Server error (Llama.cpp inference failed): {e}"

- # --- 5. Gradio interface setup ---

- def chat_interface(message: str, history: List[List[str]]):
-     """Function invoked by the Gradio interface."""
-
-     response = llama_inference(
-         message=message,
-         chat_history=history,
-     )
-
-     return response

- # Build the Gradio interface
- with gr.Blocks(title="Qwen3-0.6B-GGUF Chatbot") as demo:
-     gr.Markdown(
-         f"""
-         # Qwen3-0.6B-GGUF Chatbot
-         Runs the **{MODEL_NAME}** model using the **llama-cpp-python** module.
-         """
-     )
-
-     chatbot = gr.Chatbot(
-         label="Chat history",
-         height=500
-     )
-
-     chat_input = gr.Textbox(
-         show_label=False,
-         placeholder="Please enter your question...",
-         container=False
-     )

-     chat_input.submit(
-         fn=chat_interface,
-         inputs=[chat_input, chatbot],
-         outputs=chatbot
-     ).then(
-         fn=lambda: "",
-         inputs=None,
-         outputs=chat_input,
-         queue=False
-     )

- # Launch the application
  if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)

  import os
  import sys
  import subprocess
+ from typing import List, Dict, Any, Optional
+
+ # --- 0. Inline module installation (force-install all dependencies from within the code) ---
+
+ def install_required_modules():
+     """Install all required Python modules at runtime using pip."""
+     required_packages = [
+         "fastapi",
+         "uvicorn",
+         "pydantic",
+         "huggingface-hub",
+         "llama-cpp-python"  # this one usually takes a long time to compile
+     ]
+
+     print("--- Attempting to install/upgrade the required Python modules dynamically ---")
+
+     try:
+         # Run the pip install command
+         # Use sys.executable to make sure the current Python interpreter is used
+         subprocess.check_call([
+             sys.executable,
+             "-m",
+             "pip",
+             "install",
+             *required_packages,  # expand all package names in the list
+             "--upgrade"
+         ])
+         print("All modules installed/updated successfully.")
+     except subprocess.CalledProcessError as e:
+         print(f"**Fatal error**: module installation failed. Check environment permissions or system dependencies (especially llama-cpp-python). Error: {e}")
+         sys.exit(1)
+     except Exception as e:
+         print(f"**Fatal error**: an unknown error occurred. Error: {e}")
+         sys.exit(1)
+
+ # Run the installation
+ install_required_modules()

+ # --- 1. Module imports (must come after installation) ---

  try:
+     # FastAPI-related modules
+     from pydantic import BaseModel, Field
+     from fastapi import FastAPI, HTTPException
+     from fastapi.responses import JSONResponse, HTMLResponse
+     from fastapi.middleware.cors import CORSMiddleware
+     import uvicorn
+
+     # Model download helper
+     from huggingface_hub import hf_hub_download
+
+     # Llama.cpp module
      from llama_cpp import Llama
+ except ImportError as e:
+     print(f"**Fatal error**: module import failed even after the installation attempt. Error: {e}")
      sys.exit(1)

+ # --- 2. Model settings and initialization ---

  MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"
  MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"
+ LLAMA_INSTANCE: Optional[Llama] = None  # global Llama instance variable

+ def initialize_llm():
+     """Download the model and initialize the Llama instance."""
+     global LLAMA_INSTANCE
+
+     if LLAMA_INSTANCE is not None:
+         return

+     print(f"--- 1. Downloading model {MODEL_NAME} ---")
+     try:
+         model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
+         print(f"Model downloaded, path: {model_path}")
+     except Exception as e:
+         print(f"**Fatal error**: unable to download the model. Error: {e}")
+         raise RuntimeError(f"Unable to download the model: {e}")

+     print("--- 2. Initializing the Llama.cpp instance ---")
+     try:
+         LLAMA_INSTANCE = Llama(
+             model_path=model_path,
+             n_ctx=4096,
+             n_batch=512,
+             n_threads=os.cpu_count() // 2 or 1,
+             n_gpu_layers=0,  # CPU inference (adjust to the environment as needed)
+             verbose=False
+         )
+         print("Llama.cpp model loaded successfully.")
+     except Exception as e:
+         print(f"**Fatal error**: Llama.cpp instance initialization failed. Error: {e}")
+         raise RuntimeError(f"Llama instance initialization failed: {e}")

+ # --- 3. FastAPI setup and middleware ---

+ app = FastAPI(
+     title="LLM Inference API (Llama.cpp)",
+     description="An API service that performs inference directly with Llama.cpp."
+ )

+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )

+
+ # --- 4. Pydantic request models ---
+
+ class InferenceRequest(BaseModel):
+     """Request schema for inference, based on the OpenAI Chat Completion format."""
+     messages: List[Dict[str, str]]
+     system_message: str = "You are a friendly assistant."
+     max_tokens: int = 4096
+     temperature: float = 0.7
      top_p: float = 0.95
+     extra_params: Optional[Dict[str, Any]] = {}
+
+ class InferenceRequestMinimal(BaseModel):
+     """Minimal request schema that accepts only a question."""
+     question: str = Field(..., description="The question or prompt entered by the user.")
+
+ # --- 5. Core inference function (non-streaming) ---
+
+ def get_inference_response(
+     messages: List[Dict[str, str]],
+     system_message: str,
+     max_tokens: int,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     extra_params: Dict[str, Any] = {}
  ) -> str:
+     """Call the Llama.cpp instance and return a single text response."""
+
+     if LLAMA_INSTANCE is None:
+         raise HTTPException(status_code=503, detail="The LLM service has not been initialized yet.")

+     full_messages = [{"role": "system", "content": system_message}]
+     full_messages.extend(messages)

      try:
+         response = LLAMA_INSTANCE.create_chat_completion(
+             messages=full_messages,
              max_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p,
          )

+         if response.get('choices') and response['choices'][0].get('message') and response['choices'][0]['message'].get('content'):
+             content = response['choices'][0]['message']['content']
              return content

          return "⚠️ The LLM service returned empty content."

      except Exception as e:
+         print(f"[Error] LLM Inference failed: {e}")
+         raise HTTPException(
+             status_code=503,
+             detail=f"LLM Server Response Error: {e}"
+         )

+ # --- 6. FastAPI routes: health check / home page ---

+ @app.on_event("startup")
+ async def startup_event():
+     """Initialize the model when FastAPI starts up."""
+     try:
+         initialize_llm()
+     except Exception as e:
+         print(f"Application startup failed: {e}")
+         # Let the application start anyway; the LLM service will simply be unavailable (a 503 will be raised)

+ @app.get("/", summary="Home page / health check")
+ async def root():
+     status = "running" if LLAMA_INSTANCE else "starting/failed (LLM unavailable)"
+     return HTMLResponse(content=f"<html><body><h1>LLM API Status: {status}</h1></body></html>", status_code=200)

+
+ # --- 7. FastAPI route: inference endpoint v1 (full version, corresponding to your original /infer) ---
+
+ @app.post("/infer", summary="Run LLM inference (v1)")
+ async def infer_endpoint(request: InferenceRequest):
+     try:
+         content = get_inference_response(
+             messages=request.messages,
+             system_message=request.system_message,
+             max_tokens=request.max_tokens,
+             temperature=request.temperature,
+             top_p=request.top_p,
+             extra_params=request.extra_params
+         )
+         return JSONResponse(content={
+             "status": "success",
+             "response": content
+         })
+     except HTTPException as http_ex:
+         raise http_ex
+     except Exception as e:
+         print(f"[Fatal Error] During API call: {e}")
+         raise HTTPException(
+             status_code=500,
+             detail="Internal Server Error."
+         )

+ # --- 8. FastAPI route: inference endpoint v4 (minimal version, corresponding to your original /infer4) ---
+
+ @app.post("/infer4", summary="Run LLM inference (v4: minimal input / returns only the response field)")
+ async def infer4_endpoint(request: InferenceRequestMinimal):
+     FIXED_SYSTEM_MESSAGE = "You are a friendly and concise assistant."
+     FIXED_MAX_TOKENS = 4096
+
+     try:
+         messages = [{"role": "user", "content": request.question}]
+
+         content = get_inference_response(
+             messages=messages,
+             system_message=FIXED_SYSTEM_MESSAGE,
+             max_tokens=FIXED_MAX_TOKENS,
+         )
+
+         return JSONResponse(content={
+             "response": content
+         })
+
+     except HTTPException as http_ex:
+         raise http_ex
+     except Exception as e:
+         print(f"[Fatal Error] During API call: {e}")
+         raise HTTPException(
+             status_code=500,
+             detail="Internal Server Error."
+         )
+
+
+ # --- 9. Launch the application ---
+
  if __name__ == "__main__":
+     print("The FastAPI service is starting...")
+     # In a Gradio Space, if nothing else is configured, this may be your application entry point
+     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
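The commit itself does not include a client, but the new routes follow the request schemas defined above. As a minimal sketch of how the /infer and /infer4 endpoints could be exercised once the service is running (assuming it is reachable at http://localhost:7860, matching the uvicorn.run call, and that the requests package is installed; neither assumption is part of this commit):

# Hypothetical client sketch; base URL and the `requests` dependency are assumptions.
import requests

BASE_URL = "http://localhost:7860"  # adjust to your Space's URL

# /infer4: minimal request, response JSON contains only the "response" field
r = requests.post(f"{BASE_URL}/infer4", json={"question": "What is GGUF?"})
print(r.json()["response"])

# /infer: OpenAI-style message list plus sampling parameters
payload = {
    "messages": [{"role": "user", "content": "Summarize what llama.cpp does."}],
    "system_message": "You are a friendly assistant.",
    "max_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.95,
}
r = requests.post(f"{BASE_URL}/infer", json=payload)
print(r.json())  # expected shape: {"status": "success", "response": "..."}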