cwadayi committed on
Commit
dc8b5ff
·
verified ·
1 Parent(s): fc2fc31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -40
app.py CHANGED
@@ -3,74 +3,83 @@ import os
3
  import torch
4
  from transformers import BertTokenizer, pipeline
5
 
6
# Step 1: read the Hugging Face access token from the environment.
hf_token = os.environ.get("HF_TOKEN")
model_name = "google-bert/bert-base-chinese"

try:
    # Step 2: load the tokenizer and extend its vocabulary with whole-word
    # entries (the original comment ties this to section 2.3.6 of the book).
    tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.add_tokens(['明月', '裝飾', '窗子'])

    # Step 3: build the sentiment-analysis inference pipeline.
    classifier = pipeline(
        "sentiment-analysis",
        model="LiYuan/amazon-review-sentiment-analysis",
        token=hf_token,
    )
except Exception as init_error:
    # Keep the module importable when loading fails; the request handler
    # tests `classifier is None` before touching the tokenizer.
    print(f"初始化失敗: {init_error}")
    classifier = None
20
 
21
def enhanced_nlp_workflow(input_text):
    """Encode and classify every non-empty line of ``input_text``.

    Args:
        input_text: Multi-line text from the UI; each non-blank line is one
            sentence. ``None`` is treated like empty text.

    Returns:
        A list with one dict per sentence (original sentence, sentiment
        label with score, decoded text, input ids without padding, and the
        first ten attention-mask values), or a ``{"error": message}`` dict
        when the models failed to load or no text was supplied.
    """
    # Status messages are wrapped in a dict: gr.JSON treats a returned str
    # as serialized JSON and fails to parse a plain sentence.
    if classifier is None:
        return {"error": "系統初始化失敗"}

    # One sentence per line; blank lines are dropped.
    lines = [line.strip() for line in (input_text or "").split('\n') if line.strip()]
    if not lines:
        return {"error": "請輸入文字"}

    # Batch-encode through the tokenizer's __call__ API. Plain Python lists
    # are all the report needs, so no tensors are requested.
    batch_out = tokenizer(
        lines,
        add_special_tokens=True,
        truncation=True,
        padding='max_length',
        max_length=20,  # short on purpose so the padding is easy to inspect
    )

    # Batch inference on the raw sentences.
    results = classifier(lines)

    output = []
    for line, res, ids, mask in zip(
        lines, results, batch_out['input_ids'], batch_out['attention_mask']
    ):
        # Strip padding via the attention mask rather than assuming that
        # the [PAD] token id is 0.
        clean_ids = [idx for idx, keep in zip(ids, mask) if keep]
        output.append({
            "原始句子": line,
            "情感標籤": f"{res['label']} ({res['score']:.2f})",
            "還原結果": tokenizer.decode(clean_ids),
            "Input IDs": clean_ids,
            "Attention Mask (前10位)": list(mask[:10]),  # shows the padding pattern
        })

    return output
61
 
62
# Step 6: assemble the Gradio interface from named components.
_text_input = gr.Textbox(
    label="請輸入中文句子 (支援多行批次輸入)",
    lines=3,
    placeholder="第一行:明月裝飾了你的窗子\n第二行:這本書真的非常實用",
)
_json_output = gr.JSON(label="強化版標準流程輸出結果")

demo = gr.Interface(
    fn=enhanced_nlp_workflow,
    inputs=_text_input,
    outputs=_json_output,
    title="CNLP 強化實作:Hugging Face 批次處理與編碼解密",
    description="本程式完整實作李福林老師書中第一、二章的所有關鍵功能:批次編碼、字典擴充與解碼驗證。",
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()
 
3
  import torch
4
  from transformers import BertTokenizer, pipeline
5
 
6
# 1. Access token and model initialisation.
hf_token = os.getenv("HF_TOKEN")
model_name = "google-bert/bert-base-chinese"

try:
    # Load the tokenizer and extend its vocabulary (the original comment
    # ties this to section 2.3.6 of the book).
    tokenizer = BertTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.add_tokens(['明月', '裝飾', '窗子', '夢境'])  # extra vocabulary entries

    # Load the sentiment-analysis inference pipeline.
    classifier = pipeline(
        "sentiment-analysis",
        model="LiYuan/amazon-review-sentiment-analysis",
        token=hf_token,
    )
except Exception as e:
    # Report the cause instead of failing silently; otherwise the UI only
    # shows a generic "initialisation failed" message with nothing to debug.
    print(f"初始化失敗: {e!r}")
    # Both globals become None so the request handler can detect the failure.
    tokenizer = None
    classifier = None
20
 
21
def creative_nlp_lab(input_text):
    """Build one "lab report" per non-empty line of ``input_text``.

    Args:
        input_text: Multi-line text from the UI; each non-blank line is one
            sentence. ``None`` is treated like empty text.

    Returns:
        A list with one dict per sentence (the sentence, a keyword-based
        style tag, the sentiment label with score, the tokens joined by
        " | ", and the input ids without padding), or a
        ``{"error": message}`` dict when the models failed to load or no
        text was supplied.
    """
    # Compare with None explicitly rather than relying on object truthiness.
    # Status messages are wrapped in a dict: gr.JSON treats a returned str
    # as serialized JSON and fails to parse a plain sentence.
    if tokenizer is None or classifier is None:
        return {"error": "系統初始化失敗,請檢查 Secret 設定。"}

    # One sentence per line; blank lines are dropped.
    lines = [line.strip() for line in (input_text or "").split('\n') if line.strip()]
    if not lines:
        return {"error": "請輸入文字來開啟實驗!"}

    # 2. Batch-encode through the tokenizer's __call__ API. Plain Python
    # lists are all the report needs, so no tensors are requested.
    batch_out = tokenizer(
        lines,
        add_special_tokens=True,
        padding='max_length',
        max_length=15,
        truncation=True,
    )

    results = classifier(lines)

    lab_reports = []
    for line, result, ids, mask in zip(
        lines, results, batch_out['input_ids'], batch_out['attention_mask']
    ):
        # Strip padding once, via the attention mask, rather than assuming
        # that the [PAD] token id is 0.
        real_ids = [idx for idx, keep in zip(ids, mask) if keep]

        # Decode each id on its own to show how the sentence was split.
        visual_tokens = " | ".join(tokenizer.decode([idx]) for idx in real_ids)

        # Keyword-based style tag.
        style = "經典詩意" if ("明月" in line or "窗" in line) else "現代散文"

        lab_reports.append({
            "🔬 實驗對象": line,
            "🎨 語境風格": style,
            "🎭 情感色彩": f"{result['label']} (強度: {result['score']:.2f})",
            "🧩 詞元拆解 (Tokens)": visual_tokens,
            "🔢 機器編碼 (Input IDs)": real_ids,
        })

    return lab_reports
61
 
62
# 3. Build the Blocks-based user interface.
# NOTE(review): the nesting below was reconstructed from a flattened diff;
# confirm that only the textbox belongs inside the Row.
_soft_theme = gr.themes.Soft()

with gr.Blocks(theme=_soft_theme) as demo:
    # Page heading and introduction.
    gr.Markdown("# 🚀 Hugging Face 中文 NLP 創意實驗室")
    gr.Markdown(
        "本實驗室基於《Hugging Face 自然語言處理實戰》架構,展示編碼器如何將感性的文字轉化為理性的數據。"
    )

    with gr.Row():
        input_area = gr.Textbox(
            label="輸入靈感(支援多行批次輸入)",
            lines=4,
            placeholder="例如:\n明月裝飾了你的窗子\n這本書讓 AI 變得簡單",
        )

    run_btn = gr.Button("開始實驗", variant="primary")
    output_json = gr.JSON(label="實驗報告(實作書中表 2-2 數據架構)")

    # Clicking the button sends the textbox content to the handler and
    # shows its result in the JSON panel.
    run_btn.click(fn=creative_nlp_lab, inputs=input_area, outputs=output_json)

    # Sample inputs that fill the textbox.
    gr.Examples(
        examples=[
            ["明月裝飾了你的窗子\n你裝飾了別人的夢"],
            ["HuggingFace 工具集真的好用"],
        ],
        inputs=input_area,
    )

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()