Spaces:

cwadayi
/

Hf1

Sleeping

Hf1

File size: 3,394 Bytes

1793f82
6ceffd5
 
1793f82
2af716e
6ceffd5
a7278e8
1793f82
a7278e8
2af716e
a7278e8
2af716e
fc2fc31
2af716e
a7278e8
2af716e
 
 
 
a7278e8
2af716e
 
6ceffd5
2af716e
 
fc2fc31
 
2af716e
6ceffd5
2af716e
fc2fc31
 
2af716e
 
 
 
1793f82
 
6ceffd5
dc8b5ff
fc2fc31
2af716e
 
 
 
 
 
dc8b5ff
2af716e
881be67
dc8b5ff
 
881be67
2af716e
 
 
 
fc2fc31
6ceffd5
dc8b5ff
1793f82
2af716e
 
 
 
dc8b5ff
 
2af716e
 
 
 
 
 
 
 
dc8b5ff
2af716e
dc8b5ff
 
2af716e
dc8b5ff
 
1793f82

import gradio as gr
import os
from transformers import BertTokenizer, pipeline

# 1. Access token and tool initialisation (cf. Figure 1-3 in the book)
model_name = "google-bert/bert-base-chinese"
hf_token = os.getenv("HF_TOKEN")

# Start from the "unavailable" state; the handles are only populated when
# every loading step below succeeds.
tokenizer = None
classifier = None
ner_tagger = None

try:
    # Load the tokenizer and extend its vocabulary (book section 2.3.6).
    tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.add_tokens(['明月', '裝飾', '窗子', '李福林'])

    # Sentiment-analysis pipeline (chapters 5-6).
    classifier = pipeline(
        "sentiment-analysis",
        model="LiYuan/amazon-review-sentiment-analysis",
        token=hf_token,
    )

    # Named-entity-recognition pipeline (hands-on task of chapters 10-11),
    # backed by a model dedicated to Chinese entities.
    ner_tagger = pipeline(
        "ner",
        model="ckiplab/bert-base-chinese-ner",
        token=hf_token,
    )
except Exception as e:
    # Any failure disables all three tools together, so the rest of the file
    # never sees a partially initialised set.
    print(f"初始化錯誤: {e}")
    tokenizer = classifier = ner_tagger = None

def advanced_nlp_lab(input_text):
    """Analyse each non-blank line of the input with the loaded NLP tools.

    For every line the function runs named-entity recognition, sentiment
    classification and tokenisation, and collects the results in a report.

    Args:
        input_text: Raw text, possibly spanning several lines. ``None`` is
            treated like empty input.

    Returns:
        A list with one report dict per non-blank line (original sentence,
        NER tags, sentiment label with confidence, decoded tokens and token
        ids, both with padding removed). When the tools are unavailable or
        there is nothing to analyse, a single ``{"訊息": ...}`` dict carrying
        the notice is returned instead.
    """
    # Notices are wrapped in a dict rather than returned as a bare string:
    # the result is rendered by a gr.JSON component, which parses a ``str``
    # value as JSON text and would choke on plain prose.
    if tokenizer is None or classifier is None or ner_tagger is None:
        return {"訊息": "⚠️ 系統初始化失敗"}

    lines = [line.strip() for line in (input_text or "").split('\n') if line.strip()]
    if not lines:
        return {"訊息": "💡 請輸入文字開始偵測！"}

    # 2. Batch encoding (book section 2.3.5). Calling the tokenizer directly
    #    replaces the deprecated ``batch_encode_plus``.
    batch_out = tokenizer(
        lines,
        add_special_tokens=True,
        padding='max_length',
        max_length=25,
        truncation=True,
        return_tensors="pt",
    )
    # Ask the tokenizer for its padding id instead of assuming it is 0.
    pad_id = tokenizer.pad_token_id

    lab_reports = []
    for line, encoded_row in zip(lines, batch_out['input_ids']):
        # 3. Named-entity recognition (book chapter 10 task).
        ner_results = ner_tagger(line)
        entities = [f"{entity['word']}({entity['entity']})" for entity in ner_results]

        # 4. Sentiment analysis (book chapter 7 task); the pipeline returns a
        #    list, of which the first entry is used.
        sentiment = classifier(line)[0]

        # Drop padding once and reuse the result for both views.
        ids = [idx for idx in encoded_row.tolist() if idx != pad_id]
        tokens = [tokenizer.decode([idx]) for idx in ids]

        lab_reports.append({
            "📝 原始句子": line,
            "🔍 實體偵測 (NER)": entities if entities else "未偵測到實體",
            "🎭 情感分析": f"{sentiment['label']} (信心: {sentiment['score']:.2f})",
            "🧩 分詞結構": " | ".join(tokens),
            "🔢 機器編碼 IDs": ids,
        })

    return lab_reports

# 5. Build the Blocks-based UI: a multi-line text input, a trigger button and
#    a JSON panel that displays whatever advanced_nlp_lab returns.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Page title and a short description, both given as Markdown.
    gr.Markdown("## 🕵️‍♂️ Hugging Face 中文語義全解析實驗室")
    gr.Markdown("本程式整合了李福林著《Hugging Face 自然語言處理實戰》中的多項任務：從編碼矩陣到情感分類，再到命名實體識別。")
    
    with gr.Row():
        # Free-text input; advanced_nlp_lab splits it on newlines, so each
        # non-blank line is analysed separately.
        input_area = gr.Textbox(
            label="🔮 請輸入包含人名、地名或情感的中文 (支援多行)", 
            lines=4, 
            placeholder="例如：李福林在北京寫下了這本 Hugging Face 實戰書。"
        )
    
    run_btn = gr.Button("🚀 啟動多維度語義解析", variant="primary")
    output_json = gr.JSON(label="📊 深度解析報告 (實作書中第 2, 7, 10 章核心)")

    # Clicking the button passes the textbox content to advanced_nlp_lab and
    # routes its return value to the JSON panel.
    run_btn.click(fn=advanced_nlp_lab, inputs=input_area, outputs=output_json)
    
    # Sample sentences wired to the textbox.
    gr.Examples(
        examples=[["李福林在台北展示了 Transformer 的威力"], ["明月裝飾了你的窗子"]],
        inputs=input_area
    )

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()