hsuwill000 committed on
Commit 214e263 · verified · 1 Parent(s): df53ff4

Update app.py

Files changed (1)
  1. app.py +206 -127
app.py CHANGED
@@ -3,172 +3,251 @@
  import os
  import sys
  import subprocess
- import gradio as gr
- from typing import List, Dict
- from huggingface_hub import hf_hub_download

- # --- 0. Inline installation of llama-cpp-python ---
- # Warning: this is a non-standard workaround that may fail.
- # Installing dependencies through requirements.txt is recommended in a Gradio Space.
- try:
-     print("--- Attempting to install llama-cpp-python dynamically ---")
-     # Run the pip install command
-     # Use sys.executable to make sure the current Python interpreter is used
-     subprocess.check_call([
-         sys.executable,
-         "-m",
-         "pip",
-         "install",
-         "llama-cpp-python",
-         "--upgrade"  # make sure it is the latest version
-     ])
-     print("llama-cpp-python installed/updated successfully.")
- except subprocess.CalledProcessError as e:
-     print(f"**Fatal error**: llama-cpp-python installation failed. Check environment permissions or system dependencies. Error: {e}")
-     # Installation failed, so we cannot continue
-     sys.exit(1)
- except Exception as e:
-     print(f"**Fatal error**: an unknown error occurred. Error: {e}")
-     sys.exit(1)

- # --- 1. Import llama_cpp ---
- # This import must come after the installation attempt
  try:
      from llama_cpp import Llama
- except ImportError:
-     print("**Fatal error**: llama_cpp still cannot be imported even after the installation attempt. Check the pip install logs.")
      sys.exit(1)


- # --- 2. Model settings and download ---

- # The model information you specified
  MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"
  MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"

- # Fixed system prompt
- DEFAULT_SYSTEM_MESSAGE = "You are a friendly and helpful assistant. Please answer the user's questions concisely and accurately."

- # Step 1: download the GGUF model
- try:
-     print(f"Attempting to download {MODEL_NAME} from {MODEL_REPO}...")
-     model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
-     print(f"Model downloaded, path: {model_path}")
- except Exception as e:
-     print(f"**Error**: unable to download the model. Error: {e}")
-     sys.exit(1)  # exit if the model cannot be downloaded

- # --- 3. Llama.cpp initialization ---

- # Step 2: initialize the Llama.cpp instance
- try:
-     print("Initializing the Llama.cpp instance...")
-     llm = Llama(
-         model_path=model_path,
-         n_ctx=4096,   # context length
-         n_batch=512,  # batch size
-         # Use only a few CPU cores for Gradio Space stability
-         n_threads=os.cpu_count() // 2 or 1,
-         n_gpu_layers=0,  # CPU inference
-         verbose=False    # disable internal log output
-     )
-     print("Llama.cpp model loaded successfully.")
- except Exception as e:
-     print(f"**Error**: Llama.cpp instance initialization failed. Error: {e}")
-     sys.exit(1)

- # --- 4. Core inference function ---

- def llama_inference(
-     message: str,
-     chat_history: List[List[str]],
-     system_message: str = DEFAULT_SYSTEM_MESSAGE,
-     max_tokens: int = 4096,
-     temperature: float = 0.7,
      top_p: float = 0.95
  ) -> str:
-     """
-     Run inference with the Llama.cpp instance and return the response.
-     """
-
-     # Build the message list with the system prompt and the chat history
-     messages = [{"role": "system", "content": system_message}]
-
-     for human, assistant in chat_history:
-         messages.append({"role": "user", "content": human})
-         messages.append({"role": "assistant", "content": assistant})
-
-     messages.append({"role": "user", "content": message})

      try:
-         # Call Llama.cpp's create_chat_completion interface
-         response = llm.create_chat_completion(
-             messages=messages,
              max_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p,
          )

-         # Parse the response
-         if response.get('choices') and response['choices'][0].get('message'):
-             content = response['choices'][0]['message'].get('content', "⚠️ The LLM service returned empty content.")
              return content

          return "⚠️ The LLM service returned empty content."

      except Exception as e:
-         print(f"[Error] Llama Inference failed: {e}")
-         return f"❌ Server error (Llama.cpp inference failed): {e}"

- # --- 5. Gradio interface setup ---

- def chat_interface(message: str, history: List[List[str]]):
-     """Function invoked by the Gradio interface."""
-
-     response = llama_inference(
-         message=message,
-         chat_history=history,
-     )
-
-     return response

- # Build the Gradio interface
- with gr.Blocks(title="Qwen3-0.6B-GGUF Chatbot") as demo:
-     gr.Markdown(
-         f"""
-         # Qwen3-0.6B-GGUF Chatbot
-         Runs the **{MODEL_NAME}** model using the **llama-cpp-python** module.
-         """
-     )
-
-     chatbot = gr.Chatbot(
-         label="Chat history",
-         height=500
-     )
-
-     chat_input = gr.Textbox(
-         show_label=False,
-         placeholder="Please enter your question...",
-         container=False
-     )

-     chat_input.submit(
-         fn=chat_interface,
-         inputs=[chat_input, chatbot],
-         outputs=chatbot
-     ).then(
-         fn=lambda: "",
-         inputs=None,
-         outputs=chat_input,
-         queue=False
-     )

- # Launch the application
  if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)

  import os
  import sys
  import subprocess
+ from typing import List, Dict, Any, Optional
+
+ # --- 0. Inline module installation (force-install all dependencies from within the code) ---
+
+ def install_required_modules():
+     """Install all required Python modules at runtime using pip."""
+     required_packages = [
+         "fastapi",
+         "uvicorn",
+         "pydantic",
+         "huggingface-hub",
+         "llama-cpp-python"  # this one usually takes a long time to compile
+     ]
+
+     print("--- Attempting to install/upgrade the required Python modules dynamically ---")
+
+     try:
+         # Run the pip install command
+         # Use sys.executable to make sure the current Python interpreter is used
+         subprocess.check_call([
+             sys.executable,
+             "-m",
+             "pip",
+             "install",
+             *required_packages,  # expand all package names in the list
+             "--upgrade"
+         ])
+         print("All modules installed/updated successfully.")
+     except subprocess.CalledProcessError as e:
+         print(f"**Fatal error**: module installation failed. Check environment permissions or system dependencies (especially llama-cpp-python). Error: {e}")
+         sys.exit(1)
+     except Exception as e:
+         print(f"**Fatal error**: an unknown error occurred. Error: {e}")
+         sys.exit(1)
+
+ # Run the installation
+ install_required_modules()

+ # --- 1. Module imports (must come after installation) ---

  try:
+     # FastAPI-related modules
+     from pydantic import BaseModel, Field
+     from fastapi import FastAPI, HTTPException
+     from fastapi.responses import JSONResponse, HTMLResponse
+     from fastapi.middleware.cors import CORSMiddleware
+     import uvicorn
+
+     # Model download helper
+     from huggingface_hub import hf_hub_download
+
+     # Llama.cpp module
      from llama_cpp import Llama
+ except ImportError as e:
+     print(f"**Fatal error**: module import failed even after the installation attempt. Error: {e}")
      sys.exit(1)

+ # --- 2. Model settings and initialization ---

  MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"
  MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"
+ LLAMA_INSTANCE: Optional[Llama] = None  # global Llama instance variable

+ def initialize_llm():
+     """Download the model and initialize the Llama instance."""
+     global LLAMA_INSTANCE
+
+     if LLAMA_INSTANCE is not None:
+         return

+     print(f"--- 1. Downloading model {MODEL_NAME} ---")
+     try:
+         model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
+         print(f"Model downloaded, path: {model_path}")
+     except Exception as e:
+         print(f"**Fatal error**: unable to download the model. Error: {e}")
+         raise RuntimeError(f"Unable to download the model: {e}")

+     print("--- 2. Initializing the Llama.cpp instance ---")
+     try:
+         LLAMA_INSTANCE = Llama(
+             model_path=model_path,
+             n_ctx=4096,
+             n_batch=512,
+             n_threads=os.cpu_count() // 2 or 1,
+             n_gpu_layers=0,  # CPU inference (adjust to the environment as needed)
+             verbose=False
+         )
+         print("Llama.cpp model loaded successfully.")
+     except Exception as e:
+         print(f"**Fatal error**: Llama.cpp instance initialization failed. Error: {e}")
+         raise RuntimeError(f"Llama instance initialization failed: {e}")

+ # --- 3. FastAPI setup and middleware ---

+ app = FastAPI(
+     title="LLM Inference API (Llama.cpp)",
+     description="An API service that performs inference directly with Llama.cpp."
+ )

+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )

+
+ # --- 4. Pydantic request models ---
+
+ class InferenceRequest(BaseModel):
+     """Request schema for inference, based on the OpenAI Chat Completion format."""
+     messages: List[Dict[str, str]]
+     system_message: str = "You are a friendly assistant."
+     max_tokens: int = 4096
+     temperature: float = 0.7
      top_p: float = 0.95
+     extra_params: Optional[Dict[str, Any]] = {}
+
+ class InferenceRequestMinimal(BaseModel):
+     """Minimal request schema that accepts only a question."""
+     question: str = Field(..., description="The question or prompt entered by the user.")
+
+ # --- 5. Core inference function (non-streaming) ---
+
+ def get_inference_response(
+     messages: List[Dict[str, str]],
+     system_message: str,
+     max_tokens: int,
+     temperature: float = 0.7,
+     top_p: float = 0.95,
+     extra_params: Dict[str, Any] = {}
  ) -> str:
+     """Call the Llama.cpp instance and return a single text response."""
+
+     if LLAMA_INSTANCE is None:
+         raise HTTPException(status_code=503, detail="The LLM service has not been initialized yet.")

+     full_messages = [{"role": "system", "content": system_message}]
+     full_messages.extend(messages)

      try:
+         response = LLAMA_INSTANCE.create_chat_completion(
+             messages=full_messages,
              max_tokens=max_tokens,
              temperature=temperature,
              top_p=top_p,
          )

+         if response.get('choices') and response['choices'][0].get('message') and response['choices'][0]['message'].get('content'):
+             content = response['choices'][0]['message']['content']
              return content

          return "⚠️ The LLM service returned empty content."

      except Exception as e:
+         print(f"[Error] LLM Inference failed: {e}")
+         raise HTTPException(
+             status_code=503,
+             detail=f"LLM Server Response Error: {e}"
+         )

+ # --- 6. FastAPI routes: health check / home page ---

+ @app.on_event("startup")
+ async def startup_event():
+     """Initialize the model when FastAPI starts up."""
+     try:
+         initialize_llm()
+     except Exception as e:
+         print(f"Application startup failed: {e}")
+         # Let the application start anyway; the LLM service will simply be unavailable (a 503 will be raised)

+ @app.get("/", summary="Home page / health check")
+ async def root():
+     status = "running" if LLAMA_INSTANCE else "starting/failed (LLM unavailable)"
+     return HTMLResponse(content=f"<html><body><h1>LLM API Status: {status}</h1></body></html>", status_code=200)

+
+ # --- 7. FastAPI route: inference endpoint v1 (full version, corresponding to your original /infer) ---
+
+ @app.post("/infer", summary="Run LLM inference (v1)")
+ async def infer_endpoint(request: InferenceRequest):
+     try:
+         content = get_inference_response(
+             messages=request.messages,
+             system_message=request.system_message,
+             max_tokens=request.max_tokens,
+             temperature=request.temperature,
+             top_p=request.top_p,
+             extra_params=request.extra_params
+         )
+         return JSONResponse(content={
+             "status": "success",
+             "response": content
+         })
+     except HTTPException as http_ex:
+         raise http_ex
+     except Exception as e:
+         print(f"[Fatal Error] During API call: {e}")
+         raise HTTPException(
+             status_code=500,
+             detail="Internal Server Error."
+         )

+ # --- 8. FastAPI route: inference endpoint v4 (minimal version, corresponding to your original /infer4) ---
+
+ @app.post("/infer4", summary="Run LLM inference (v4: minimal input / returns only the response field)")
+ async def infer4_endpoint(request: InferenceRequestMinimal):
+     FIXED_SYSTEM_MESSAGE = "You are a friendly and concise assistant."
+     FIXED_MAX_TOKENS = 4096
+
+     try:
+         messages = [{"role": "user", "content": request.question}]
+
+         content = get_inference_response(
+             messages=messages,
+             system_message=FIXED_SYSTEM_MESSAGE,
+             max_tokens=FIXED_MAX_TOKENS,
+         )
+
+         return JSONResponse(content={
+             "response": content
+         })
+
+     except HTTPException as http_ex:
+         raise http_ex
+     except Exception as e:
+         print(f"[Fatal Error] During API call: {e}")
+         raise HTTPException(
+             status_code=500,
+             detail="Internal Server Error."
+         )
+
+
+ # --- 9. Launch the application ---
+
  if __name__ == "__main__":
+     print("The FastAPI service is starting...")
+     # In a Gradio Space, if nothing else is configured, this may be your application entry point
+     uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)
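The commit itself does not include a client, but the new routes follow the request schemas defined above. As a minimal sketch of how the /infer and /infer4 endpoints could be exercised once the service is running (assuming it is reachable at http://localhost:7860, matching the uvicorn.run call, and that the requests package is installed; neither assumption is part of this commit):

# Hypothetical client sketch; base URL and the `requests` dependency are assumptions.
import requests

BASE_URL = "http://localhost:7860"  # adjust to your Space's URL

# /infer4: minimal request, response JSON contains only the "response" field
r = requests.post(f"{BASE_URL}/infer4", json={"question": "What is GGUF?"})
print(r.json()["response"])

# /infer: OpenAI-style message list plus sampling parameters
payload = {
    "messages": [{"role": "user", "content": "Summarize what llama.cpp does."}],
    "system_message": "You are a friendly assistant.",
    "max_tokens": 256,
    "temperature": 0.7,
    "top_p": 0.95,
}
r = requests.post(f"{BASE_URL}/infer", json=payload)
print(r.json())  # expected shape: {"status": "success", "response": "..."}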