Update app.py
Browse files
app.py
CHANGED
|
@@ -1,46 +1,59 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
|
| 3 |
-
import
|
| 4 |
|
| 5 |
-
#
|
| 6 |
-
|
| 7 |
-
tokenizer = BertTokenizer.from_pretrained(model_name)
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
encodings = tokenizer.encode_plus(
|
| 16 |
text,
|
| 17 |
-
add_special_tokens=True,
|
| 18 |
-
max_length=25,
|
| 19 |
-
padding='max_length',
|
|
|
|
| 20 |
return_tensors="pt"
|
| 21 |
)
|
| 22 |
-
|
| 23 |
-
# 取得編碼後的數字 (input_ids)
|
| 24 |
input_ids = encodings['input_ids'].tolist()[0]
|
| 25 |
|
| 26 |
-
#
|
| 27 |
result = classifier(text)[0]
|
| 28 |
-
label = result['label']
|
| 29 |
-
score = result['score']
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
return {
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
|
|
|
| 35 |
}
|
| 36 |
|
| 37 |
-
#
|
| 38 |
demo = gr.Interface(
|
| 39 |
-
fn=
|
| 40 |
-
inputs=gr.Textbox(placeholder="
|
| 41 |
-
outputs="
|
| 42 |
-
title="Hugging Face
|
| 43 |
-
description="
|
| 44 |
)
|
| 45 |
|
| 46 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import gradio as gr
import os
from transformers import BertTokenizer, pipeline

# Read the Hugging Face access token from the Space's secrets.
# May be None for public models; from_pretrained/pipeline accept None.
hf_token = os.getenv("HF_TOKEN")

# 1. Load the tokenizer (corresponds to book section 2.3.1).
# A stable Chinese sentiment-analysis model, used because the official
# model referenced in the book can be temporarily unavailable.
model_name = "shibing624/bert-chinese-sentiment"
# FIX: `use_auth_token=` is deprecated (removed in transformers v5);
# the supported keyword is `token=`.
tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)

# 2. Vocabulary operations from the book (section 2.3.6):
# register new words so the tokenizer keeps them as single tokens
# instead of splitting them (e.g. 明月 "bright moon").
new_tokens = ['明月', '裝飾', '窗子']
tokenizer.add_tokens(new_tokens)

# Load the inference pipeline. NOTE(review): the pipeline builds its own
# tokenizer internally, so the tokens added above affect only the
# encode/decode demonstration, not the classifier — this keeps the model's
# embedding matrix consistent (no resize needed).
classifier = pipeline("sentiment-analysis", model=model_name, token=hf_token)
|
| 20 |
+
|
| 21 |
+
def nlp_workflow(text):
    """Run the full book workflow on one Chinese sentence.

    Encodes *text* with the customised tokenizer, classifies its sentiment
    with the pipeline, decodes the ids back to text, and returns everything
    as a dict for the JSON output component.
    """
    # Guard clause: reject empty / whitespace-only input early.
    if not text.strip():
        return "請輸入文字"

    # 3. Advanced encoding (book section 2.3.4, table 2-2).
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,   # insert [CLS] / [SEP]
        max_length=25,             # example length used in the book
        padding='max_length',      # pad up to 25 tokens
        truncation=True,           # cut off anything longer
        return_tensors="pt"
    )
    token_ids = encoded['input_ids'].tolist()[0]

    # 4. Inference — the "test" stage of the standard workflow.
    prediction = classifier(text)[0]

    # 5. Round-trip check: decode the ids back to text (book's decode demo).
    restored = tokenizer.decode(token_ids)

    report = {
        "情感分析結果": f"{prediction['label']} (信心度: {prediction['score']:.4f})",
        "編碼 ID (input_ids)": token_ids,
        "解碼還原文字": restored,
        "注意": "若輸入包含「明月」,您會發現它被視為單一 Token 而非拆散。"
    }
    return report
|
| 49 |
|
| 50 |
+
# Assemble the Gradio UI: one text input, one JSON output.
sentence_box = gr.Textbox(label="輸入中文句子", placeholder="例如:明月裝飾了你的窗子")
json_view = gr.JSON(label="Hugging Face 標準流程輸出")

demo = gr.Interface(
    fn=nlp_workflow,
    inputs=sentence_box,
    outputs=json_view,
    title="李福林《Hugging Face 自然語言處理實戰》流程實作",
    description="本程式實作了書中第一章的標準流程與第二章的編碼工具細節。"
)
|
| 58 |
|
| 59 |
if __name__ == "__main__":
|