Spaces:

cwadayi
/

Hf1

Sleeping

App Files Community

cwadayi committed on 17 days ago

Commit

2af716e

verified ·

1 Parent(s): 881be67

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -46

app.py CHANGED Viewed

@@ -1,86 +1,83 @@
 import gradio as gr
 import os
-import torch
 from transformers import BertTokenizer, pipeline
-# 1. Secret token and model initialization
-# The token is read from the HF_TOKEN environment variable (None when unset).
 hf_token = os.getenv("HF_TOKEN")
 model_name = "google-bert/bert-base-chinese"
 try:
-    # Load the tokenizer and extend its vocabulary (implements sections 2.3.6 and 2.4 of the book)
     tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
-    tokenizer.add_tokens(['明月', '裝飾', '窗子', '夢境'])
-    # Load the sentiment-analysis pipeline (corresponds to the concepts in chapters 5 and 6 of the book)
     classifier = pipeline("sentiment-analysis", model="LiYuan/amazon-review-sentiment-analysis", token=hf_token)
 except Exception as e:
-    # On any failure both handles are set to None; nlp_creative_lab checks for that.
-    tokenizer = None
-    classifier = None
-    print(f"Error: {e}")
-# Build one report dict per non-blank input line: sentiment label/score,
-# per-token decoding, the first ten input ids / attention-mask / token-type
-# values, and the full decoded string.
-# Returns the list of report dicts, or a plain message string when
-# initialization failed or the input contains no non-blank line.
-# NOTE(review): the result is wired to a gr.JSON output; confirm that
-# component accepts the plain (non-JSON) message strings returned below.
-def nlp_creative_lab(input_text):
-    if not tokenizer or not classifier: return "⚠️ 系統初始化失敗，請檢查 Secret 設定。"
     lines = [line.strip() for line in input_text.split('\n') if line.strip()]
-    if not lines: return "💡 請在左側輸入文字開始實驗！"
-    # 2. Advanced batch encoding (corresponds to sections 2.3.4 and 2.3.5 of the book)
     batch_out = tokenizer.batch_encode_plus(
         lines,
-        add_special_tokens=True,   # add [CLS] and [SEP]
-        padding='max_length',      # pad every row to the same length
-        max_length=20,             # fixed length that is easy to inspect in the UI
-        truncation=True,           # truncate over-long input
         return_tensors="pt"
     )
-    # The classifier is called once with the whole list of lines.
-    results = classifier(lines)
     lab_reports = []
     for i, line in enumerate(lines):
-        ids = batch_out['input_ids'][i].tolist()
-        mask = batch_out['attention_mask'][i].tolist()
-        type_ids = batch_out['token_type_ids'][i].tolist()
-        # Decode check and tokenization visualization from the book
-        # Ids equal to 0 are skipped (presumably the [PAD] id of this vocabulary; confirm).
         tokens = [tokenizer.decode([idx]) for idx in ids if idx != 0]
         lab_reports.append({
             "📝 原始句子": line,
-            "🎭 情感分析": f"{results[i]['label']} (信心: {results[i]['score']:.2f})",
-            "🧩 詞元分開 (Tokens)": " | ".join(tokens),
-            "🔢 進階編碼數據 (表 2-2)": {
-                "Input IDs (前10位)": ids[:10],
-                "Attention Mask": mask[:10],
-                "Token Type IDs": type_ids[:10]
-            },
-            "🛠️ 底層解碼還原": tokenizer.decode(ids)
         })
     return lab_reports
-# 3. Build the Blocks UI
-# Layout: one row with an input column (textbox + button) and a wider
-# output column (JSON viewer); the button runs nlp_creative_lab on the text.
-with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
-    gr.Markdown("## 🏛️ Hugging Face 中文 NLP 標準流程實驗室")
-    gr.Markdown("本程式實作李福林著《Hugging Face 自然語言處理實戰》中的核心技術：編碼器矩陣解析與批次處理流程。")
     with gr.Row():
-        with gr.Column(scale=1):
-            input_area = gr.Textbox(
-                label="🔮 靈感輸入區 (支援多行)",
-                lines=5,
-                placeholder="輸入詩句或評論...\n例如：明月裝飾了你的窗子"
-            )
-            run_btn = gr.Button("🔥 執行標準流程測試", variant="primary")
-        with gr.Column(scale=2):
-            output_json = gr.JSON(label="📊 實驗報告 (包含 Input IDs, Mask, Type IDs)")
-    run_btn.click(fn=nlp_creative_lab, inputs=input_area, outputs=output_json)
-    # Example inputs offered for the textbox; the first example is two lines.
     gr.Examples(
-        examples=[["明月裝飾了你的窗子\n你裝飾了別人的夢"], ["Hugging Face 工具集大幅降低了研發成本"]],
         inputs=input_area
     )

 import gradio as gr
 import os
 from transformers import BertTokenizer, pipeline
+# 1. Secret token and tool initialization (corresponds to Figure 1-3 in the book)
+# The token is read from the HF_TOKEN environment variable (None when unset).
 hf_token = os.getenv("HF_TOKEN")
 model_name = "google-bert/bert-base-chinese"
 try:
+    # Load the tokenizer (encoder) and extend its vocabulary (implements section 2.3.6 of the book)
     tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
+    tokenizer.add_tokens(['明月', '裝飾', '窗子', '李福林'])
+    # Load the sentiment-analysis pipeline (chapters 5-6)
     classifier = pipeline("sentiment-analysis", model="LiYuan/amazon-review-sentiment-analysis", token=hf_token)
+    # Load the named-entity-recognition pipeline (hands-on task from chapters 10-11)
+    # Uses a model dedicated to Chinese entities
+    ner_tagger = pipeline("ner", model="ckiplab/bert-base-chinese-ner", token=hf_token)
 except Exception as e:
+    # Any failure above resets all three handles to None together, so
+    # checking a single one of them is enough to detect a failed start-up.
+    print(f"初始化錯誤: {e}")
+    tokenizer = classifier = ner_tagger = None
+# Build one report dict per non-blank input line: detected entities,
+# sentiment label/score, the decoded tokens and the non-zero input ids.
+# Returns the list of report dicts, or a plain message string when
+# initialization failed or the input contains no non-blank line.
+# NOTE(review): the result is wired to a gr.JSON output; confirm that
+# component accepts the plain (non-JSON) message strings returned below.
+def advanced_nlp_lab(input_text):
+    # Only `tokenizer` is tested; the start-up `except` branch sets
+    # tokenizer, classifier and ner_tagger to None in one statement.
+    if not tokenizer: return "⚠️ 系統初始化失敗"
     lines = [line.strip() for line in input_text.split('\n') if line.strip()]
+    if not lines: return "💡 請輸入文字開始偵測！"
+    # 2. Advanced batch encoding (corresponds to section 2.3.5 of the book)
+    # Every row is padded or truncated to 25 ids, so the token/ID fields of
+    # a long line cover only its beginning; NER and sentiment below are
+    # called with the full line text.
     batch_out = tokenizer.batch_encode_plus(
         lines,
+        add_special_tokens=True,
+        padding='max_length',
+        max_length=25,
+        truncation=True,
         return_tensors="pt"
     )
     lab_reports = []
     for i, line in enumerate(lines):
+        # 3. NER entity detection (corresponds to the chapter 10 task in the book)
+        ner_results = ner_tagger(line)
+        entities = [f"{entity['word']}({entity['entity']})" for entity in ner_results]
+        # 4. Sentiment analysis (corresponds to the chapter 7 task in the book)
+        # NOTE(review): the loader comment cites chapters 5-6 for the same
+        # pipeline; confirm which chapter reference is intended.
+        sentiment = classifier(line)[0]
+        ids = batch_out['input_ids'][i].tolist()
+        # Ids equal to 0 are skipped (presumably the [PAD] id of this vocabulary; confirm).
         tokens = [tokenizer.decode([idx]) for idx in ids if idx != 0]
         lab_reports.append({
             "📝 原始句子": line,
+            "🔍 實體偵測 (NER)": entities if entities else "未偵測到實體",
+            "🎭 情感分析": f"{sentiment['label']} (信心: {sentiment['score']:.2f})",
+            "🧩 分詞結構": " | ".join(tokens),
+            "🔢 機器編碼 IDs": [idx for idx in ids if idx != 0]
         })
     return lab_reports
+# 5. Build the professional-style Blocks UI
+# Layout: a row holding the input textbox, then the run button and the JSON
+# viewer below it; the button runs advanced_nlp_lab on the textbox content.
+# NOTE(review): no demo.launch() call is visible in this diff hunk; confirm
+# it exists elsewhere in app.py.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🕵️‍♂️ Hugging Face 中文語義全解析實驗室")
+    gr.Markdown("本程式整合了李福林著《Hugging Face 自然語言處理實戰》中的多項任務：從編碼矩陣到情感分類，再到命名實體識別。")
     with gr.Row():
+        input_area = gr.Textbox(
+            label="🔮 請輸入包含人名、地名或情感的中文 (支援多行)",
+            lines=4,
+            placeholder="例如：李福林在北京寫下了這本 Hugging Face 實戰書。"
+        )
+    run_btn = gr.Button("🚀 啟動多維度語義解析", variant="primary")
+    output_json = gr.JSON(label="📊 深度解析報告 (實作書中第 2, 7, 10 章核心)")
+    run_btn.click(fn=advanced_nlp_lab, inputs=input_area, outputs=output_json)
+    # Example inputs offered for the textbox.
     gr.Examples(
+        examples=[["李福林在台北展示了 Transformer 的威力"], ["明月裝飾了你的窗子"]],
         inputs=input_area
     )