# app.py
import os
import sys
import subprocess
import gradio as gr
from typing import List
from huggingface_hub import hf_hub_download

# --- 0. Inline installation of llama-cpp-python ---
# Warning: this is a non-standard approach and may fail.
# In a Gradio Space, installing dependencies via requirements.txt is recommended.
try:
    print("--- Attempting to install llama-cpp-python dynamically ---")
    # Run the pip install command.
    # Use sys.executable to make sure the current Python interpreter is used.
    subprocess.check_call([
        sys.executable,
        "-m",
        "pip",
        "install",
        "llama-cpp-python",
        "--upgrade"  # make sure the latest version is installed
    ])
    print("llama-cpp-python installed/updated successfully.")
except subprocess.CalledProcessError as e:
    print(f"**Fatal error**: llama-cpp-python installation failed. Check environment permissions or system dependencies. Error: {e}")
    # Installation failed, so we cannot continue.
    sys.exit(1)
except Exception as e:
    print(f"**Fatal error**: an unexpected error occurred. Error: {e}")
    sys.exit(1)
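
# The recommended alternative, sketched here for reference: declare the same
# dependencies in a requirements.txt at the Space root so they are installed
# at build time, before app.py runs (versions deliberately left unpinned):
#
#   gradio
#   huggingface_hub
#   llama-cpp-python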

# --- 1. Import llama_cpp ---
# The import must happen after the installation attempt above.
try:
    from llama_cpp import Llama
except ImportError:
    print("**Fatal error**: llama_cpp still cannot be imported after the installation attempt. Check the pip install logs.")
    sys.exit(1)

# --- 2. Model configuration and download ---
# The target model
MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"
MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"
# Fixed system prompt
DEFAULT_SYSTEM_MESSAGE = "You are a friendly and helpful assistant. Please answer the user's questions concisely and accurately."

# Step 1: download the GGUF model
try:
    print(f"Attempting to download {MODEL_NAME} from {MODEL_REPO}...")
    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
    print(f"Model download complete, path: {model_path}")
except Exception as e:
    print(f"**Error**: failed to download the model. Error: {e}")
    sys.exit(1)  # exit if the model cannot be downloaded
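
# Note: hf_hub_download stores the file in the local Hugging Face cache
# (~/.cache/huggingface/hub by default) and returns the cached path, so a
# restarted Space reuses the file instead of downloading it again.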

# --- 3. Llama.cpp initialization ---
# Step 2: initialize the Llama.cpp instance
try:
    print("Initializing the Llama.cpp instance...")
    llm = Llama(
        model_path=model_path,
        n_ctx=4096,   # context length
        n_batch=512,  # batch size
        # Use half of the CPU cores for Gradio Space stability;
        # os.cpu_count() may return None, hence the fallback.
        n_threads=max(1, (os.cpu_count() or 2) // 2),
        n_gpu_layers=0,  # CPU-only inference
        verbose=False    # silence llama.cpp's internal logging
    )
    print("Llama.cpp model loaded successfully.")
except Exception as e:
    print(f"**Error**: failed to initialize the Llama.cpp instance. Error: {e}")
    sys.exit(1)
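
# If the Space had a GPU and llama-cpp-python were built with GPU support,
# raising n_gpu_layers (e.g. -1 to offload every layer) would move the model
# onto the GPU; with the default CPU wheel it must stay at 0.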

# --- 4. Core inference function ---
def llama_inference(
    message: str,
    chat_history: List[List[str]],
    system_message: str = DEFAULT_SYSTEM_MESSAGE,
    max_tokens: int = 2048,  # leave room in the 4096-token context for the prompt
    temperature: float = 0.7,
    top_p: float = 0.95
) -> str:
    """
    Run inference with the Llama.cpp instance and return the response.
    """
    # Build the message list from the system prompt and the chat history.
    messages = [{"role": "system", "content": system_message}]
    for human, assistant in chat_history:
        messages.append({"role": "user", "content": human})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    try:
        # Call Llama.cpp's create_chat_completion interface.
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        # Parse the response.
        if response.get('choices') and response['choices'][0].get('message'):
            return response['choices'][0]['message'].get('content', "⚠️ The LLM service returned empty content.")
        return "⚠️ The LLM service returned empty content."
    except Exception as e:
        print(f"[Error] Llama inference failed: {e}")
        return f"❌ Server error (Llama.cpp inference failed): {e}"
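
# A streaming variant is also possible: create_chat_completion(stream=True)
# yields OpenAI-style chunks whose "delta" payloads can be accumulated.
# A sketch (not wired into the UI below):
#
#   partial = ""
#   for chunk in llm.create_chat_completion(messages=messages, stream=True):
#       delta = chunk["choices"][0]["delta"]
#       if "content" in delta:
#           partial += delta["content"]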

# --- 5. Gradio interface setup ---
def chat_interface(message: str, history: List[List[str]]):
    """Gradio callback: run inference and append the new turn to the history."""
    response = llama_inference(
        message=message,
        chat_history=history,
    )
    # gr.Chatbot expects the full list of [user, assistant] pairs rather
    # than a bare string, so return the updated history.
    return history + [[message, response]]

# Build the Gradio interface
with gr.Blocks(title="Qwen3-0.6B-GGUF Chatbot") as demo:
    gr.Markdown(
        f"""
        # Qwen3-0.6B-GGUF Chatbot
        Runs the **{MODEL_NAME}** model via the **llama-cpp-python** module.
        """
    )
    chatbot = gr.Chatbot(
        label="Chat history",
        height=500
    )
    chat_input = gr.Textbox(
        show_label=False,
        placeholder="Type your question...",
        container=False
    )
    chat_input.submit(
        fn=chat_interface,
        inputs=[chat_input, chatbot],
        outputs=chatbot
    ).then(
        fn=lambda: "",
        inputs=None,
        outputs=chat_input,
        queue=False
    )
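
# The submit(...).then(...) chain first runs chat_interface to update the
# chatbot, then clears the textbox; queue=False lets the clearing step run
# immediately instead of passing through the request queue.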

# Launch the application; Hugging Face Spaces serves Gradio apps on 0.0.0.0:7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)