hsuwill000 committed on
Commit b13a7ae · verified · 1 Parent(s): 844f989

Update app.py

Files changed (1)
  1. app.py +91 -122
app.py CHANGED
@@ -1,138 +1,115 @@
  import os
  import sys
  import subprocess
  from typing import List, Dict, Any, Optional
 
- # --- 0. Inline module installation ---
- # Warning: this may fail in many hosted environments due to insufficient permissions. Using requirements.txt is recommended.
 
  def install_required_modules():
-     """Install all required Python modules at runtime with pip, forcing an AVX-512 build."""
      required_packages = [
-         "fastapi",
-         "uvicorn",
-         "pydantic",
-         "huggingface-hub",
-         "llama-cpp-python",
-         "gradio_client"  # <-- added gradio_client
      ]
 
      # ----------------------------------------------------
-     # **Core modification: set Llama.cpp compile options**
      # ----------------------------------------------------
      compile_env = os.environ.copy()
-
-     # 1. Force the use of CMake
      compile_env["FORCE_CMAKE"] = "1"
-
-     # 2. Set CMake arguments to enable AVX512 and AVX512_VNNI
-     # Note: if your CPU does not support AVX512, this will cause a runtime error (Illegal instruction).
-     # It is recommended to read this from an environment variable, e.g. os.environ.get("LLAMA_COMPILER_FLAGS", "-DLLAMA_AVX512=ON -DLLAMA_AVX512_VNNI=ON")
      compile_env["CMAKE_ARGS"] = "-DLLAMA_AVX512=ON -DLLAMA_AVX512_VNNI=ON"
      # ----------------------------------------------------
 
-     print("--- Attempting to dynamically install/upgrade required Python modules (AVX-512 build enabled) ---")
 
      try:
-         subprocess.check_call([
-             sys.executable,
-             "-m",
-             "pip",
-             "install",
-             *required_packages,
-             "--upgrade",
-             "--no-cache-dir",  # ensure recompilation
-             "--force-reinstall"  # ensure recompilation
-         ],
-         # pass the prepared environment variables to the subprocess
-         env=compile_env)
-
-         print("All modules installed/updated successfully; llama-cpp-python was built with AVX-512.")
      except subprocess.CalledProcessError as e:
-         print(f"**FATAL ERROR**: module installation failed. Error message: {e}")
-         print("Check whether your CPU supports AVX-512, or try removing the CMAKE_ARGS environment variable.")
          sys.exit(1)
      except Exception as e:
-         print(f"**FATAL ERROR**: an unknown error occurred. Error message: {e}")
          sys.exit(1)
 
  install_required_modules()
 
 
- # --- 1. Module imports (must come after installation) ---
-
  try:
-     # Import FastAPI-related modules
      from pydantic import BaseModel, Field
      from fastapi import FastAPI, HTTPException
      from fastapi.responses import JSONResponse, HTMLResponse
      from fastapi.middleware.cors import CORSMiddleware
-     import uvicorn
-
-     # Import the model download helper
      from huggingface_hub import hf_hub_download
-
-     # Import the Llama.cpp module
-     from llama_cpp import Llama, llama_print_system_info  # added for the system info check
-
-     # Import the gradio_client module
-     from gradio_client import Client
-
  except ImportError as e:
-     print(f"**FATAL ERROR**: module import failed. Error: {e}")
      sys.exit(1)
 
-
- # --- 2. Model configuration and initialization ---
-
- #MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"
- #MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"
- MODEL_NAME = "Qwen3-0.6B-IQ4_XS.gguf"
- MODEL_REPO = "unsloth/Qwen3-0.6B-GGUF"
- LLAMA_INSTANCE: Optional[Llama] = None  # global Llama instance
-
- # Gradio Client configuration variables
- AMD_SPACE_ID = "amd/gpt-oss-120b-chatbot"  # <-- added Gradio Space ID variable
 
  def initialize_llm():
-     """Download the model and initialize the Llama instance."""
      global LLAMA_INSTANCE
 
      if LLAMA_INSTANCE is not None:
          return
 
-     # Check whether AVX-512 is enabled
      print("--- Llama.cpp System Info ---")
      print(llama_print_system_info())
      print("-----------------------------")
 
-
-     print(f"--- 1. Starting download of model {MODEL_NAME} ---")
      try:
          model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
      except Exception as e:
-         raise RuntimeError(f"Unable to download model: {e}")
 
-     print("--- 2. Initializing the Llama.cpp instance ---")
      try:
          LLAMA_INSTANCE = Llama(
              model_path=model_path,
              n_ctx=4096,
              n_batch=512,
-             n_threads=os.cpu_count() // 2 or 1,
              n_gpu_layers=0,
              verbose=False
          )
-         print("Llama.cpp model loaded successfully.")
      except Exception as e:
-         raise RuntimeError(f"Llama instance initialization failed: {e}")
-
 
- # --- 3. FastAPI setup and middleware ---
 
  app = FastAPI(
-     title="LLM Inference API (Llama.cpp)",
-     description="API service that performs inference directly with Llama.cpp."
  )
 
  app.add_middleware(
@@ -143,16 +120,13 @@ app.add_middleware(
      allow_headers=["*"],
  )
 
-
- # --- 4. Pydantic request model (minimal version only) ---
-
  class InferenceRequestMinimal(BaseModel):
-     """Data structure for a minimal inference request; accepts only the question."""
-     question: str = Field(..., description="The user's input question or prompt.")
 
 
- # --- 5. Core inference function (non-streaming) ---
-
  def get_inference_response(
      messages: List[Dict[str, str]],
      system_message: str,
@@ -160,11 +134,11 @@ def get_inference_response(
      temperature: float = 0.7,
      top_p: float = 0.95,
  ) -> str:
-     """Call the Llama.cpp instance and return a single text response."""
-
      if LLAMA_INSTANCE is None:
-         raise HTTPException(status_code=503, detail="The LLM service has not been initialized yet.")
 
      full_messages = [{"role": "system", "content": system_message}]
      full_messages.extend(messages)
 
@@ -176,11 +150,13 @@ def get_inference_response(
              top_p=top_p,
          )
 
-         if response.get('choices') and response['choices'][0].get('message') and response['choices'][0]['message'].get('content'):
-             content = response['choices'][0]['message']['content']
              return content
 
-         return "⚠️ The LLM service returned empty content."
 
      except Exception as e:
          print(f"[Error] LLM Inference failed: {e}")
@@ -190,27 +166,29 @@ def get_inference_response(
          )
 
 
- # --- 6. FastAPI route: / (health check / home page) ---
 
  @app.on_event("startup")
  async def startup_event():
-     """Run model initialization when FastAPI starts up."""
      try:
          initialize_llm()
      except Exception as e:
-         print(f"Application startup failed: {e}")
-         # If initialization fails, the LLM instance stays None and inference raises a 503 error
 
- @app.get("/", summary="Home page / health check")
  async def root():
      status = "running" if LLAMA_INSTANCE else "starting/failed (LLM unavailable)"
      return HTMLResponse(content=f"<html><body><h1>LLM API Status: {status}</h1></body></html>", status_code=200)
 
 
- # --- 7. FastAPI route: /infer4 (minimal version) ---
-
- @app.post("/local/qwen-0-6b", summary="Run LLM inference (v4: minimal input / returns only the response field)")
- async def infer4_endpoint(request: InferenceRequestMinimal):
      FIXED_SYSTEM_MESSAGE = "You are a friendly and concise assistant."
      FIXED_MAX_TOKENS = 4096
 
@@ -223,60 +201,51 @@ async def infer4_endpoint(request: InferenceRequestMinimal):
              max_tokens=FIXED_MAX_TOKENS,
          )
 
-         return JSONResponse(content={
-             "response": content
-         })
 
-     except HTTPException as http_ex:
-         raise http_ex
      except Exception as e:
-         print(f"[Fatal Error] During API call: {e}")
-         raise HTTPException(
-             status_code=500,
-             detail="Internal Server Error."
-         )
 
 
- # --- 8. FastAPI route: /infer_amd (via Gradio Client) ---
-
- @app.post("/remote/amd", summary="Call the external AMD LLM Space via Gradio Client")
  async def infer_amd_endpoint(request: InferenceRequestMinimal):
      """
-     Use gradio_client to call the /chat API of the Space specified by AMD_SPACE_ID.
-     The input/output format is the same as /infer4.
      """
      try:
-         # Initialize the Gradio Client with the globally defined AMD_SPACE_ID
          client = Client(AMD_SPACE_ID)
 
-         # Call the Space API
          result = client.predict(
-             message=request.question,  # use the question from the request
              system_prompt="You are a helpful assistant.",
              temperature=0.7,
              api_name="/chat"
          )
 
-         # Process the result and return it in the /infer4 format
          if isinstance(result, str):
-             return JSONResponse(content={
-                 "response": result
-             })
          else:
-             # If the return value is not a string, raise an internal error
-             raise ValueError("The external API returned an unexpected non-string result.")
 
      except Exception as e:
          print(f"[Fatal Error] Gradio Client API call failed: {e}")
-         # Return 503 Service Unavailable for external API errors
          raise HTTPException(
              status_code=503,
              detail=f"External AMD LLM Service Error: {e}"
          )
 
 
- # --- 9. Application startup ---
-
  if __name__ == "__main__":
-     print("The FastAPI service is starting...")
      uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
 
  import os
  import sys
  import subprocess
+ import uvicorn
  from typing import List, Dict, Any, Optional
 
+ # --- Configuration ---
+ MODEL_NAME = "Qwen3-0.6B-IQ4_XS.gguf"
+ MODEL_REPO = "unsloth/Qwen3-0.6B-GGUF"
+ AMD_SPACE_ID = "amd/gpt-oss-120b-chatbot"  # Gradio Space ID for remote inference
+
+ # --- 0. Dynamic Module Installation ---
+ # WARNING: This may fail in many hosted environments due to permission issues.
+ # A `requirements.txt` is generally recommended for production.
 
  def install_required_modules():
+     """
+     Installs necessary Python modules at runtime using pip,
+     forcing compilation with AVX-512 flags for llama-cpp-python.
+     """
      required_packages = [
+         "fastapi", "uvicorn", "pydantic", "huggingface-hub",
+         "llama-cpp-python", "gradio_client"
      ]
 
      # ----------------------------------------------------
+     # **Core Modification: Llama.cpp Compile Options**
      # ----------------------------------------------------
      compile_env = os.environ.copy()
      compile_env["FORCE_CMAKE"] = "1"
+     # Note: If your CPU does not support AVX512, this will cause a runtime error (Illegal instruction).
      compile_env["CMAKE_ARGS"] = "-DLLAMA_AVX512=ON -DLLAMA_AVX512_VNNI=ON"
      # ----------------------------------------------------
 
+     print("--- Attempting Dynamic Installation/Upgrade (AVX-512 Compilation) ---")
 
      try:
+         subprocess.check_call(
+             [
+                 sys.executable, "-m", "pip", "install",
+                 *required_packages,
+                 "--upgrade", "--no-cache-dir", "--force-reinstall"  # Ensure recompile
+             ],
+             env=compile_env
+         )
+         print("All modules successfully installed/updated. llama-cpp-python compiled with AVX-512.")
      except subprocess.CalledProcessError as e:
+         print(f"**FATAL ERROR**: Module installation failed. Error: {e}")
+         print("Check if your CPU supports AVX-512 or try removing the CMAKE_ARGS environment variable.")
          sys.exit(1)
      except Exception as e:
+         print(f"**FATAL ERROR**: An unknown error occurred. Error: {e}")
          sys.exit(1)
 
  install_required_modules()
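Note (not part of this commit): the `-DLLAMA_AVX512=ON` flags above are forced unconditionally, so the resulting wheel will crash with `Illegal instruction` on CPUs without AVX-512, exactly as the warning comment says. A minimal pre-check could gate the flags on the detected CPU features; the sketch below is illustrative only, assumes a Linux host that exposes `/proc/cpuinfo`, and uses a hypothetical helper name.

import os

def cpu_supports_avx512() -> bool:
    """Best-effort check for the avx512f feature flag on Linux; False if undetectable."""
    try:
        with open("/proc/cpuinfo", "r", encoding="utf-8") as f:
            return "avx512f" in f.read()
    except OSError:
        return False

compile_env = os.environ.copy()
compile_env["FORCE_CMAKE"] = "1"
# Only request the AVX-512 build when the host CPU actually advertises the feature.
if cpu_supports_avx512():
    compile_env["CMAKE_ARGS"] = "-DLLAMA_AVX512=ON -DLLAMA_AVX512_VNNI=ON"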
 
 
+ # --- 1. Module Imports (Must be after installation) ---
  try:
      from pydantic import BaseModel, Field
      from fastapi import FastAPI, HTTPException
      from fastapi.responses import JSONResponse, HTMLResponse
      from fastapi.middleware.cors import CORSMiddleware
      from huggingface_hub import hf_hub_download
+     from llama_cpp import Llama, llama_print_system_info
+     from gradio_client import Client
  except ImportError as e:
+     print(f"**FATAL ERROR**: Failed to import modules. Error: {e}")
      sys.exit(1)
 
+ # --- 2. Global State ---
+ LLAMA_INSTANCE: Optional[Llama] = None
 
  def initialize_llm():
+     """Downloads the model and initializes the global Llama instance."""
      global LLAMA_INSTANCE
 
      if LLAMA_INSTANCE is not None:
          return
 
+     # Check AVX-512 status
      print("--- Llama.cpp System Info ---")
      print(llama_print_system_info())
      print("-----------------------------")
 
+     print(f"--- 1. Starting model download: {MODEL_NAME} ---")
      try:
          model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
      except Exception as e:
+         raise RuntimeError(f"Failed to download model: {e}")
 
+     print("--- 2. Initializing Llama.cpp instance ---")
      try:
+         # Use half of the available CPU cores for threads, minimum 1
+         n_threads = os.cpu_count() // 2 or 1
          LLAMA_INSTANCE = Llama(
              model_path=model_path,
              n_ctx=4096,
              n_batch=512,
+             n_threads=n_threads,
              n_gpu_layers=0,
              verbose=False
          )
+         print("Llama.cpp model successfully loaded.")
      except Exception as e:
+         raise RuntimeError(f"Llama instance initialization failed: {e}")
 
 
+ # --- 3. FastAPI Setup and Middleware ---
  app = FastAPI(
+     title="LLM Inference API (Llama.cpp)",
+     description="API service for direct inference using Llama.cpp."
  )
 
  app.add_middleware(
  ...
      allow_headers=["*"],
  )
 
+ # --- 4. Pydantic Request Model ---
  class InferenceRequestMinimal(BaseModel):
+     """Data structure for a minimal inference request, accepting only a question."""
+     question: str = Field(..., description="The user's input question or prompt.")
 
 
+ # --- 5. Core Inference Function (Non-Streaming) ---
  def get_inference_response(
      messages: List[Dict[str, str]],
      system_message: str,
  ...
      temperature: float = 0.7,
      top_p: float = 0.95,
  ) -> str:
+     """Calls the Llama.cpp instance and returns a single text response."""
      if LLAMA_INSTANCE is None:
+         raise HTTPException(status_code=503, detail="LLM Service not initialized.")
 
+     # Prepend the system message to the conversation history
      full_messages = [{"role": "system", "content": system_message}]
      full_messages.extend(messages)
 
  ...
              top_p=top_p,
          )
 
+         # Safely extract the content
+         content = response.get('choices', [{}])[0].get('message', {}).get('content')
+
+         if content:
              return content
 
+         return "⚠️ LLM service returned empty content."
 
      except Exception as e:
          print(f"[Error] LLM Inference failed: {e}")
  ...
          )
 
 
+ # --- 6. FastAPI Routes ---
 
  @app.on_event("startup")
  async def startup_event():
+     """Execute model initialization when FastAPI starts up."""
      try:
          initialize_llm()
      except Exception as e:
+         print(f"Application startup failed: {e}")
+         # If initialization fails, LLAMA_INSTANCE stays None and inference will return 503.
 
+ @app.get("/", summary="Home/Health Check")
  async def root():
      status = "running" if LLAMA_INSTANCE else "starting/failed (LLM unavailable)"
      return HTMLResponse(content=f"<html><body><h1>LLM API Status: {status}</h1></body></html>", status_code=200)
 
 
+ @app.post("/local/qwen-0-6b", summary="Execute Local LLM Inference (Minimal Input)")
+ async def infer_local_endpoint(request: InferenceRequestMinimal):
+     """
+     Executes inference using the local Llama.cpp instance.
+     Returns a JSON with the 'response' field.
+     """
      FIXED_SYSTEM_MESSAGE = "You are a friendly and concise assistant."
      FIXED_MAX_TOKENS = 4096
 
  ...
              max_tokens=FIXED_MAX_TOKENS,
          )
 
+         return JSONResponse(content={"response": content})
 
+     except HTTPException:
+         raise
      except Exception as e:
+         print(f"[Fatal Error] During local API call: {e}")
+         raise HTTPException(status_code=500, detail="Internal Server Error.")
 
 
+ @app.post("/remote/amd", summary="Call External AMD LLM Space via Gradio Client")
  async def infer_amd_endpoint(request: InferenceRequestMinimal):
      """
+     Uses gradio_client to call the /chat API of the AMD_SPACE_ID Space.
+     Input/output format is consistent with the local endpoint.
      """
      try:
+         # Initialize Gradio Client using the global AMD_SPACE_ID
          client = Client(AMD_SPACE_ID)
 
+         # Call the Space API
          result = client.predict(
+             message=request.question,
              system_prompt="You are a helpful assistant.",
              temperature=0.7,
              api_name="/chat"
          )
 
+         # Process and return result in the required format
          if isinstance(result, str):
+             return JSONResponse(content={"response": result})
          else:
+             raise ValueError("External API returned unexpected non-string format.")
 
      except Exception as e:
          print(f"[Fatal Error] Gradio Client API call failed: {e}")
+         # Return 503 Service Unavailable for external API errors
          raise HTTPException(
              status_code=503,
              detail=f"External AMD LLM Service Error: {e}"
          )
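Note (not part of this commit): `Client.predict` is a blocking call while `infer_amd_endpoint` is declared `async def`, so a slow upstream Space holds up FastAPI's event loop for the duration of the request. One possible mitigation, sketched below with an illustrative wrapper name, is to push the call onto a worker thread with Starlette's `run_in_threadpool`, which FastAPI re-exports.

from fastapi.concurrency import run_in_threadpool
from gradio_client import Client

async def call_amd_space(question: str, space_id: str) -> str:
    """Run the blocking gradio_client call in a worker thread so the event loop stays responsive."""
    client = Client(space_id)  # note: constructing the client also performs blocking I/O
    return await run_in_threadpool(
        client.predict,
        message=question,
        system_prompt="You are a helpful assistant.",
        temperature=0.7,
        api_name="/chat",
    )

Declaring the endpoint as a plain `def` instead of `async def` would have a similar effect, since FastAPI runs synchronous path functions in its threadpool automatically.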
 
 
+ # --- 9. Application Startup ---
  if __name__ == "__main__":
+     print("FastAPI service is starting...")
+     # The 'app:app' string tells uvicorn to look for the 'app' object
+     # inside the current module (which is also named 'app' when run directly).
      uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
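For reference, both routes above accept the same minimal JSON body and return a single `response` field. A quick smoke test against a locally running instance, assuming the default host/port passed to `uvicorn.run` above and the `requests` package on the client side, might look like:

import requests

BASE_URL = "http://localhost:7860"
payload = {"question": "Summarize what AVX-512 is in one sentence."}

# Local Llama.cpp inference
r = requests.post(f"{BASE_URL}/local/qwen-0-6b", json=payload, timeout=300)
print(r.json()["response"])

# Remote inference routed through the AMD Gradio Space
r = requests.post(f"{BASE_URL}/remote/amd", json=payload, timeout=300)
print(r.json()["response"])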