Spaces:

cwadayi
/

Hf1

Sleeping

App Files Community

cwadayi committed 27 days ago

Commit

6ceffd5

verified ·

1 Parent(s): 1793f82

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -27

app.py CHANGED Viewed

@@ -1,46 +1,59 @@
 import gradio as gr
-from transformers import BertTokenizer, BertForSequenceClassification, pipeline
-import torch
-# 1. 載入編碼工具與預訓練模型 (對應書中 2.3 節)
-model_name = "bert-base-chinese"
-tokenizer = BertTokenizer.from_pretrained(model_name)
-# 使用 pipeline 簡化測試流程 (對應書中第 5 章概念)
-classifier = pipeline("sentiment-analysis", model="liam168/chat-gpt-detector-roberta-chinese")
-def process_text(text):
-    # 2. 實作編碼工作流程 (對應書中表 2-1)
-    # 取得 input_ids, attention_mask 等資訊
     encodings = tokenizer.encode_plus(
         text,
-        add_special_tokens=True,
-        max_length=25,
-        padding='max_length',
         return_tensors="pt"
     )
-    # 取得編碼後的數字 (input_ids)
     input_ids = encodings['input_ids'].tolist()[0]
-    # 3. 執行推理 (對應測試階段)
     result = classifier(text)[0]
-    label = result['label']
-    score = result['score']
     return {
-        "編碼結果 (Input IDs)": str(input_ids),
-        "情感分析結果": f"{label} (信心程度: {score:.2f})",
-        "還原文字 (Decode)": tokenizer.decode(input_ids) # 對應書中 decode 測試
     }
-# 4. 建立 Gradio 介面
 demo = gr.Interface(
-    fn=process_text,
-    inputs=gr.Textbox(placeholder="請輸入中文句子，例如：你站在橋上看風景"),
-    outputs="json",
-    title="Hugging Face 標準流程實作 - 中文編碼與分析",
-    description="本程式根據李福林老師著作，實作編碼器 (Tokenizer) 流程與模型測試。"
 )
 if __name__ == "__main__":

 import gradio as gr
+import os
+from transformers import BertTokenizer, pipeline
+# 從 Space Secrets 中讀取 Token
+hf_token = os.getenv("HF_TOKEN")
+# 1. 載入編碼器與模型 (對應書中 2.3.1 節)
+# 若官方模型暫時無法讀取，建議使用此穩定的中文情感分析模型
+model_name = "shibing624/bert-chinese-sentiment"
+tokenizer = BertTokenizer.from_pretrained(model_name, use_auth_token=hf_token)
+# 2. 實作書中提到的字典操作 (對應 2.3.6 節)
+# 增加新詞，使其不被拆散，例如「明月」
+new_tokens = ['明月', '裝飾', '窗子']
+tokenizer.add_tokens(new_tokens)
+# 載入推理管線
+classifier = pipeline("sentiment-analysis", model=model_name, use_auth_token=hf_token)
+def nlp_workflow(text):
+    if not text.strip():
+        return "請輸入文字"
+    # 3. 進階編碼函數實作 (對應書中 2.3.4 節與表 2-2)
     encodings = tokenizer.encode_plus(
         text,
+        add_special_tokens=True,   # 增加 [CLS], [SEP]
+        max_length=25,             # 參考書中範例長度
+        padding='max_length',      # 補齊長度至 25
+        truncation=True,           # 超長截斷
         return_tensors="pt"
     )
     input_ids = encodings['input_ids'].tolist()[0]
+    # 4. 測試推理結果 (對應標準流程中的「測試」階段)
     result = classifier(text)[0]
+    # 5. 還原文字驗證 (對應書中 decode 測試)
+    decoded_text = tokenizer.decode(input_ids)
     return {
+        "情感分析結果": f"{result['label']} (信心度: {result['score']:.4f})",
+        "編碼 ID (input_ids)": input_ids,
+        "解碼還原文字": decoded_text,
+        "注意": "若輸入包含「明月」，您會發現它被視為單一 Token 而非拆散。"
     }
+# 建立 Gradio 介面
 demo = gr.Interface(
+    fn=nlp_workflow,
+    inputs=gr.Textbox(label="輸入中文句子", placeholder="例如：明月裝飾了你的窗子"),
+    outputs=gr.JSON(label="Hugging Face 標準流程輸出"),
+    title="李福林《Hugging Face 自然語言處理實戰》流程實作",
+    description="本程式實作了書中第一章的標準流程與第二章的編碼工具細節。"
 )
 if __name__ == "__main__":