Update app.py
Browse files
app.py
CHANGED
|
@@ -1,67 +1,75 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
|
|
|
| 3 |
from transformers import BertTokenizer, pipeline
|
| 4 |
|
| 5 |
-
# 1. 讀取安全金鑰
|
| 6 |
hf_token = os.getenv("HF_TOKEN")
|
| 7 |
-
|
| 8 |
-
# 定義模型名稱:改用官方最常用的中文 BERT 基礎模型以確保相容性
|
| 9 |
model_name = "google-bert/bert-base-chinese"
|
| 10 |
|
| 11 |
try:
|
| 12 |
-
# 2.
|
| 13 |
-
# 注意:最新版 transformers 建議使用 token 參數而非 use_auth_token
|
| 14 |
tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
|
| 15 |
-
|
| 16 |
-
# 3. 實作書中提到的字典操作 (對應 2.3.6 節)
|
| 17 |
-
# 增加新詞「明月」,使其在分詞時不被拆散
|
| 18 |
tokenizer.add_tokens(['明月', '裝飾', '窗子'])
|
| 19 |
-
|
| 20 |
-
#
|
| 21 |
-
# 這裡載入一個通用的情感分析模型
|
| 22 |
classifier = pipeline("sentiment-analysis", model="LiYuan/amazon-review-sentiment-analysis", token=hf_token)
|
| 23 |
except Exception as e:
|
| 24 |
print(f"初始化失敗: {e}")
|
| 25 |
classifier = None
|
| 26 |
|
| 27 |
-
def
|
| 28 |
-
if classifier is None:
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
-
#
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
add_special_tokens=True,
|
| 37 |
-
|
| 38 |
-
padding='max_length',
|
| 39 |
-
|
| 40 |
return_tensors="pt"
|
| 41 |
)
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
return
|
| 52 |
-
"情感分析結果": f"{result['label']} (信心度: {result['score']:.4f})",
|
| 53 |
-
"編碼 ID (input_ids)": input_ids,
|
| 54 |
-
"解碼還原文字": decoded_text,
|
| 55 |
-
"說明": "已成功套用書中第 2 章的 Tokenizer 實作流程。"
|
| 56 |
-
}
|
| 57 |
|
| 58 |
-
# 建立 Gradio 介面
|
| 59 |
demo = gr.Interface(
|
| 60 |
-
fn=
|
| 61 |
-
inputs=gr.Textbox(
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
)
|
| 66 |
|
| 67 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import gradio as gr
import os
import torch
from transformers import BertTokenizer, pipeline

# 1. Read the Hugging Face access token from the environment (Space secret).
hf_token = os.getenv("HF_TOKEN")

# Official Chinese BERT base model — chosen for broad compatibility.
model_name = "google-bert/bert-base-chinese"

try:
    # 2. Load the tokenizer and extend its vocabulary (book section 2.3.6):
    #    the added words will no longer be split apart during tokenization.
    tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.add_tokens(['明月', '裝飾', '窗子'])

    # 3. Load the sentiment-analysis inference pipeline.
    classifier = pipeline("sentiment-analysis", model="LiYuan/amazon-review-sentiment-analysis", token=hf_token)
except Exception as e:
    # Any failure (network, auth, missing model) leaves classifier as None;
    # the workflow function checks for this and reports it to the user.
    print(f"初始化失敗: {e}")
    classifier = None
|
| 20 |
|
| 21 |
+
def enhanced_nlp_workflow(input_text):
    """Batch-tokenize and sentiment-classify multi-line Chinese input.

    Each non-empty line of *input_text* is treated as one sentence and
    processed as a batch (book section 2.3.5).

    Args:
        input_text: Raw textbox contents from the Gradio UI.

    Returns:
        A list of dicts — one per input line — containing the sentiment
        label, decoded text, input ids and a prefix of the attention
        mask; or a Chinese error string when initialization failed or
        the input is empty.
    """
    if classifier is None:
        return "系統初始化失敗"

    # Split the input by line to implement batch processing.
    lines = [line.strip() for line in input_text.split('\n') if line.strip()]
    if not lines:
        return "請輸入文字"

    # 4. Batch encoding (book section 2.3.5).
    batch_out = tokenizer.batch_encode_plus(
        lines,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=20,  # short length so the ids are easy to inspect in the UI
        return_tensors="pt"
    )

    # 5. Batch inference over all sentences at once.
    results = classifier(lines)

    # FIX: don't hard-code 0 as the padding id — ask the tokenizer.
    # (0 happens to be [PAD] for bert-base-chinese, but relying on that
    # magic number would silently break with a different checkpoint.)
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    # Assemble the per-sentence report (book table 2-2).
    output = []
    for i, line in enumerate(lines):
        res = results[i]
        ids = batch_out['input_ids'][i].tolist()
        mask = batch_out['attention_mask'][i].tolist()

        # Strip [PAD] ids so the decoded text stays readable.
        clean_ids = [idx for idx in ids if idx != pad_id]
        decoded = tokenizer.decode(clean_ids)

        output.append({
            "原始句子": line,
            "情感標籤": f"{res['label']} ({res['score']:.2f})",
            "還原結果": decoded,
            "Input IDs": clean_ids,
            "Attention Mask (前10位)": mask[:10]  # shows the PAD/real-token logic
        })

    return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
# 6. Build the Gradio interface.
# The input box is built first and given a name so the Interface call
# below stays flat and easy to scan.
_input_box = gr.Textbox(
    label="請輸入中文句子 (支援多行批次輸入)",
    lines=3,
    placeholder="第一行:明月裝飾了你的窗子\n第二行:這本書真的非常實用"
)

demo = gr.Interface(
    fn=enhanced_nlp_workflow,
    inputs=_input_box,
    outputs=gr.JSON(label="強化版標準流程輸出結果"),
    title="CNLP 強化實作:Hugging Face 批次處理與編碼解密",
    description="本程式完整實作李福林老師書中第一、二章的所有關鍵功能:批次編碼、字典擴充與解碼驗證。"
)
|
| 74 |
|
| 75 |
if __name__ == "__main__":
|