File size: 6,160 Bytes
defa84e
 
 
df53ff4
 
214e263
 
08ac672
 
214e263
 
 
 
 
 
 
 
08ac672
214e263
 
 
 
 
 
 
 
 
 
08ac672
214e263
 
 
 
08ac672
214e263
 
 
 
 
 
defa84e
df53ff4
214e263
df53ff4
 
214e263
 
 
 
 
 
 
 
 
 
 
df53ff4
214e263
08ac672
df53ff4
 
 
214e263
defa84e
 
 
08ac672
defa84e
214e263
 
 
 
 
 
defa84e
214e263
 
 
 
 
defa84e
214e263
 
 
 
 
 
 
08ac672
214e263
 
 
 
 
defa84e
 
214e263
defa84e
214e263
 
 
 
defa84e
214e263
 
 
 
 
 
 
defa84e
214e263
08ac672
214e263
 
 
 
 
 
 
 
 
 
 
 
 
 
defa84e
214e263
 
 
 
defa84e
214e263
 
defa84e
 
214e263
 
defa84e
 
 
 
 
214e263
 
defa84e
 
 
5fc3b1a
defa84e
214e263
 
 
 
 
5fc3b1a
 
08ac672
defa84e
214e263
 
 
 
 
 
 
08ac672
5fc3b1a
214e263
 
 
 
5fc3b1a
214e263
08ac672
214e263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1706d4
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
# app.py

import os
import sys
import subprocess
from typing import List, Dict, Any, Optional

# --- 0. 內嵌模組安裝 ---
# 警告: 這在許多託管環境中可能因權限不足而失敗。建議使用 requirements.txt。

def install_required_modules():
    """Install or upgrade every required third-party package via pip at runtime.

    Exits the process with status 1 when installation fails. This runs before
    the third-party imports below, so the packages must not be imported yet.
    """
    packages = (
        "fastapi",
        "uvicorn",
        "pydantic",
        "huggingface-hub",
        "llama-cpp-python",
    )

    print("--- 嘗試動態安裝/升級必要的 Python 模組 ---")

    # Invoke pip through the current interpreter so the packages land in the
    # exact environment this script is running under.
    command = [sys.executable, "-m", "pip", "install", *packages, "--upgrade"]
    try:
        subprocess.check_call(command)
    except subprocess.CalledProcessError as e:
        print(f"**致命錯誤**:模組安裝失敗。錯誤訊息: {e}")
        sys.exit(1)
    except Exception as e:
        print(f"**致命錯誤**:發生未知錯誤。錯誤訊息: {e}")
        sys.exit(1)
    else:
        print("所有模組安裝/更新成功。")

# Bootstrap at import time: packages must be installed before the imports below.
install_required_modules()


# --- 1. Module imports (must come after the installation step above) ---

try:
    # FastAPI and related web-framework modules
    from pydantic import BaseModel, Field
    from fastapi import FastAPI, HTTPException
    from fastapi.responses import JSONResponse, HTMLResponse
    from fastapi.middleware.cors import CORSMiddleware
    import uvicorn
    
    # Model download helper
    from huggingface_hub import hf_hub_download
    
    # Llama.cpp Python bindings
    from llama_cpp import Llama
except ImportError as e:
    print(f"**致命錯誤**:模組引入失敗。錯誤: {e}")
    sys.exit(1)


# --- 2. Model configuration and initialization ---

MODEL_NAME = "Qwen3-0.6B-Q8_0.gguf"  # GGUF file name inside the HF repo
MODEL_REPO = "Qwen/Qwen3-0.6B-GGUF"  # Hugging Face repository id
LLAMA_INSTANCE: Optional[Llama] = None  # global Llama instance, set by initialize_llm()

def initialize_llm():
    """Download the GGUF model and initialize the global Llama instance.

    Idempotent: returns immediately if the instance already exists.

    Raises:
        RuntimeError: when the model download or Llama initialization fails.
    """
    global LLAMA_INSTANCE

    if LLAMA_INSTANCE is not None:
        return

    print(f"--- 1. 開始下載模型 {MODEL_NAME} ---")
    try:
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
    except Exception as e:
        raise RuntimeError(f"無法下載模型: {e}") from e

    print("--- 2. 初始化 Llama.cpp 實例 ---")
    try:
        # BUGFIX: os.cpu_count() may return None, and `None // 2` raises a
        # TypeError. Fall back to 1 thread in that case; otherwise use half
        # the cores (minimum 1), matching the original intent.
        n_threads = max(1, (os.cpu_count() or 1) // 2)
        LLAMA_INSTANCE = Llama(
            model_path=model_path,
            n_ctx=4096,        # context window size
            n_batch=512,
            n_threads=n_threads,
            n_gpu_layers=0,    # CPU-only inference
            verbose=False
        )
        print("Llama.cpp 模型加載成功。")
    except Exception as e:
        raise RuntimeError(f"Llama 實例初始化失敗: {e}") from e


# --- 3. FastAPI setup and middleware ---

app = FastAPI(
    title="LLM 推論 API (Llama.cpp)",
    description="直接使用 Llama.cpp 進行推論的 API 服務。"
)

# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# disallowed by the CORS spec (browsers will not honor a wildcard origin for
# credentialed requests) — confirm whether credentials are actually needed.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],       # accept requests from any origin
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


# --- 4. Pydantic request model (minimal version only) ---

class InferenceRequestMinimal(BaseModel):
    """Minimal inference request payload: carries only the user's question."""
    question: str = Field(..., description="使用者輸入的問題或提示。")


# --- 5. 推論核心函式 (非流式) ---

def get_inference_response(
    messages: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    """Run a non-streaming chat completion against the global Llama instance.

    Args:
        messages: chat history as [{"role": ..., "content": ...}] dicts.
        system_message: system prompt prepended to the history.
        max_tokens: generation cap passed to llama.cpp.
        temperature: sampling temperature.
        top_p: nucleus-sampling threshold.

    Returns:
        The assistant's reply text, or a warning string when the model
        returns empty/missing content.

    Raises:
        HTTPException: 503 when the model is not initialized or inference fails.
    """
    if LLAMA_INSTANCE is None:
        raise HTTPException(status_code=503, detail="LLM 服務尚未初始化。")

    full_messages = [{"role": "system", "content": system_message}]
    full_messages.extend(messages)

    try:
        response = LLAMA_INSTANCE.create_chat_completion(
            messages=full_messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
    except Exception as e:
        print(f"[Error] LLM Inference failed: {e}")
        # Chain the original exception so server logs keep the root cause.
        raise HTTPException(
            status_code=503,
            detail=f"LLM Server Response Error: {e}"
        ) from e

    # EAFP extraction replaces the original chained .get() checks; a missing
    # or empty content falls through to the warning string, as before.
    try:
        content = response['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):
        content = None
    if content:
        return content
    return "⚠️ LLM 服務回傳空內容。"


# --- 6. FastAPI 路由: / (健康檢查/首頁) ---

# NOTE(review): @app.on_event is deprecated in current FastAPI in favor of
# lifespan handlers — consider migrating when upgrading the dependency.
@app.on_event("startup")
async def startup_event():
    """Initialize the LLM when FastAPI starts; a failure is logged, not fatal."""
    try:
        initialize_llm()
    except Exception as e:
        print(f"應用程式啟動失敗: {e}")
        # On failure LLAMA_INSTANCE stays None, so inference endpoints
        # will respond with 503 instead of crashing the server.

@app.get("/", summary="首頁/健康檢查")
async def root():
    """Health-check landing page reporting whether the LLM is loaded."""
    if LLAMA_INSTANCE:
        status = "running"
    else:
        status = "starting/failed (LLM unavailable)"
    page = f"<html><body><h1>LLM API Status: {status}</h1></body></html>"
    return HTMLResponse(content=page, status_code=200)


# --- 7. FastAPI 路由: /infer4 (極簡版) ---

@app.post("/infer4", summary="執行 LLM 推論 (v4: 極簡輸入/僅回傳 response 欄位)")
async def infer4_endpoint(request: InferenceRequestMinimal):
    """Minimal inference endpoint: takes a question, returns {"response": text}."""
    # Fixed generation settings for this simplified endpoint.
    system_prompt = "You are a friendly and concise assistant."
    token_limit = 4096

    try:
        answer = get_inference_response(
            messages=[{"role": "user", "content": request.question}],
            system_message=system_prompt,
            max_tokens=token_limit,
        )
        return JSONResponse(content={
            "response": answer
        })
    except HTTPException:
        # Let deliberate HTTP errors (e.g. 503 from the helper) pass through.
        raise
    except Exception as e:
        print(f"[Fatal Error] During API call: {e}")
        raise HTTPException(
            status_code=500,
            detail="Internal Server Error."
        )
        
        
# --- 8. Application entry point ---

if __name__ == "__main__":
    print("FastAPI 服務正在啟動...")
    # Serve on all interfaces, port 7860, without auto-reload.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)