Update app.py
Browse files
app.py
CHANGED
|
@@ -1,67 +1,75 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
|
|
|
| 3 |
from transformers import BertTokenizer, pipeline
|
| 4 |
|
| 5 |
-
# 1. 讀取安全金鑰
|
| 6 |
hf_token = os.getenv("HF_TOKEN")
|
| 7 |
-
|
| 8 |
-
# 定義模型名稱:改用官方最常用的中文 BERT 基礎模型以確保相容性
|
| 9 |
model_name = "google-bert/bert-base-chinese"
|
| 10 |
|
| 11 |
try:
|
| 12 |
-
# 2.
|
| 13 |
-
# 注意:最新版 transformers 建議使用 token 參數而非 use_auth_token
|
| 14 |
tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
|
| 15 |
-
|
| 16 |
-
# 3. 實作書中提到的字典操作 (對應 2.3.6 節)
|
| 17 |
-
# 增加新詞「明月」,使其在分詞時不被拆散
|
| 18 |
tokenizer.add_tokens(['明月', '裝飾', '窗子'])
|
| 19 |
-
|
| 20 |
-
#
|
| 21 |
-
# 這裡載入一個通用的情感分析模型
|
| 22 |
classifier = pipeline("sentiment-analysis", model="LiYuan/amazon-review-sentiment-analysis", token=hf_token)
|
| 23 |
except Exception as e:
|
| 24 |
print(f"初始化失敗: {e}")
|
| 25 |
classifier = None
|
| 26 |
|
| 27 |
-
def
|
| 28 |
-
if classifier is None:
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
-
#
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
add_special_tokens=True,
|
| 37 |
-
|
| 38 |
-
padding='max_length',
|
| 39 |
-
|
| 40 |
return_tensors="pt"
|
| 41 |
)
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
-
return
|
| 52 |
-
"情感分析結果": f"{result['label']} (信心度: {result['score']:.4f})",
|
| 53 |
-
"編碼 ID (input_ids)": input_ids,
|
| 54 |
-
"解碼還原文字": decoded_text,
|
| 55 |
-
"說明": "已成功套用書中第 2 章的 Tokenizer 實作流程。"
|
| 56 |
-
}
|
| 57 |
|
| 58 |
-
# 建立 Gradio 介面
|
| 59 |
demo = gr.Interface(
|
| 60 |
-
fn=
|
| 61 |
-
inputs=gr.Textbox(
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
)
|
| 66 |
|
| 67 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import gradio as gr
import os
import torch
from transformers import BertTokenizer, pipeline

# 1. Read the Hugging Face access token from the environment (Space secret).
hf_token = os.getenv("HF_TOKEN")

# Official Chinese BERT base model — chosen for broad compatibility.
model_name = "google-bert/bert-base-chinese"

try:
    # 2. Load the tokenizer and extend its vocabulary (book section 2.3.6):
    #    the added words will no longer be split apart during tokenization.
    tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.add_tokens(['明月', '裝飾', '窗子'])

    # 3. Load the sentiment-analysis inference pipeline.
    classifier = pipeline("sentiment-analysis", model="LiYuan/amazon-review-sentiment-analysis", token=hf_token)
except Exception as e:
    # Any failure (network, auth, missing model) leaves classifier as None;
    # the workflow function checks for this and reports it to the user.
    print(f"初始化失敗: {e}")
    classifier = None
|
| 20 |
|
| 21 |
+
def enhanced_nlp_workflow(input_text):
    """Batch-tokenize and sentiment-classify multi-line Chinese input.

    Each non-empty line of *input_text* is treated as one sentence and
    processed as a batch (book section 2.3.5).

    Args:
        input_text: Raw textbox contents from the Gradio UI.

    Returns:
        A list of dicts — one per input line — containing the sentiment
        label, decoded text, input ids and a prefix of the attention
        mask; or a Chinese error string when initialization failed or
        the input is empty.
    """
    if classifier is None:
        return "系統初始化失敗"

    # Split the input by line to implement batch processing.
    lines = [line.strip() for line in input_text.split('\n') if line.strip()]
    if not lines:
        return "請輸入文字"

    # 4. Batch encoding (book section 2.3.5).
    batch_out = tokenizer.batch_encode_plus(
        lines,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=20,  # short length so the ids are easy to inspect in the UI
        return_tensors="pt"
    )

    # 5. Batch inference over all sentences at once.
    results = classifier(lines)

    # FIX: don't hard-code 0 as the padding id — ask the tokenizer.
    # (0 happens to be [PAD] for bert-base-chinese, but relying on that
    # magic number would silently break with a different checkpoint.)
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    # Assemble the per-sentence report (book table 2-2).
    output = []
    for i, line in enumerate(lines):
        res = results[i]
        ids = batch_out['input_ids'][i].tolist()
        mask = batch_out['attention_mask'][i].tolist()

        # Strip [PAD] ids so the decoded text stays readable.
        clean_ids = [idx for idx in ids if idx != pad_id]
        decoded = tokenizer.decode(clean_ids)

        output.append({
            "原始句子": line,
            "情感標籤": f"{res['label']} ({res['score']:.2f})",
            "還原結果": decoded,
            "Input IDs": clean_ids,
            "Attention Mask (前10位)": mask[:10]  # shows the PAD/real-token logic
        })

    return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
+
# 6. Build the Gradio interface.
# The input box is built first and given a name so the Interface call
# below stays flat and easy to scan.
_input_box = gr.Textbox(
    label="請輸入中文句子 (支援多行批次輸入)",
    lines=3,
    placeholder="第一行:明月裝飾了你的窗子\n第二行:這本書真的非常實用"
)

demo = gr.Interface(
    fn=enhanced_nlp_workflow,
    inputs=_input_box,
    outputs=gr.JSON(label="強化版標準流程輸出結果"),
    title="CNLP 強化實作:Hugging Face 批次處理與編碼解密",
    description="本程式完整實作李福林老師書中第一、二章的所有關鍵功能:批次編碼、字典擴充與解碼驗證。"
)
|
| 74 |
|
| 75 |
if __name__ == "__main__":
|