jerrynnms committed on
Commit
310d6ab
·
verified ·
1 Parent(s): dab03c4

Update bert_explainer.py

Browse files
Files changed (1) hide show
  1. bert_explainer.py +73 -67
bert_explainer.py CHANGED
@@ -1,67 +1,73 @@
1
- import torch
2
- from AI_Model_architecture import BertLSTM_CNN_Classifier, BertPreprocessor
3
- from transformers import BertTokenizer
4
- import re
5
- import requests
6
- import os
7
-
8
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9
-
10
- # Google Drive 載入 model.pth
11
- def load_model_from_drive():
12
- model_url = "https://drive.google.com/uc?export=download&id=1UXkOqMPUiPUIbsy8iENHUqbNFLEHcFFg" # 替換為你的檔案 ID
13
- response = requests.get(model_url)
14
- if response.status_code == 200:
15
- with open("model.pth", "wb") as f:
16
- f.write(response.content)
17
- return True
18
- return False
19
-
20
- if not os.path.exists("model.pth"):
21
- if not load_model_from_drive():
22
- raise FileNotFoundError("無法從 Google Drive 載入 model.pth")
23
-
24
- model = BertLSTM_CNN_Classifier()
25
- model.load_state_dict(torch.load("model.pth", map_location=device))
26
- model.to(device)
27
- model.eval()
28
-
29
- tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese")
30
-
31
- def predict_single_sentence(model, tokenizer, sentence, max_len=256):
32
- model.eval()
33
- with torch.no_grad():
34
- sentence = re.sub(r"\s+", "", sentence)
35
- sentence = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。,!?:/.\-]", "", sentence)
36
-
37
- encoded = tokenizer(sentence, return_tensors="pt", truncation=True, padding="max_length", max_length=max_len)
38
- input_ids = encoded["input_ids"].to(device)
39
- attention_mask = encoded["attention_mask"].to(device)
40
- token_type_ids = encoded["token_type_ids"].to(device)
41
-
42
- output = model(input_ids, attention_mask, token_type_ids)
43
- prob = output.item()
44
- label = int(prob > 0.5)
45
-
46
- if prob > 0.9:
47
- risk = "🔴 高風險(極可能是詐騙)"
48
- elif prob > 0.5:
49
- risk = "🟡 中風險(可疑)"
50
- else:
51
- risk = "🟢 低風險(正常)"
52
-
53
- pre_label = "詐騙" if label == 1 else "正常"
54
-
55
- print(f"\n📩 訊息內容:{sentence}")
56
- print(f"✅ 預測結果:{pre_label}")
57
- print(f"📊 信心值:{round(prob*100, 2)}")
58
- print(f"⚠️ 風險等級:{risk}")
59
- return pre_label, prob, risk
60
-
61
- def analyze_text(text):
62
- label, prob, risk = predict_single_sentence(model, tokenizer, text)
63
- return {
64
- "status": label,
65
- "confidence": round(prob*100, 2),
66
- "suspicious_keywords": [risk]
67
- }
 
 
 
 
 
 
 
1
+ import torch
2
+ from AI_Model_architecture import BertLSTM_CNN_Classifier, BertPreprocessor
3
+ from transformers import BertTokenizer
4
+ import re
5
+ import requests
6
+ import os
7
+
8
# Prefer the GPU when one is visible to torch; otherwise run on CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Writable destination for the downloaded checkpoint
# (/tmp is the writable scratch area on Hugging Face Spaces).
model_path = "/tmp/model.pth"
12
+
13
+ # Google Drive 載入 model.pth
14
# Fetch model.pth from Google Drive into the writable model_path.
def load_model_from_drive():
    """Download the model checkpoint from Google Drive to ``model_path``.

    Returns:
        bool: True if the file was downloaded and written successfully;
        False on a non-200 response or any network error.
    """
    model_url = "https://drive.google.com/uc?export=download&id=1UXkOqMPUiPUIbsy8iENHUqbNFLEHcFFg"
    try:
        # Stream so the (potentially large) checkpoint is never held fully in
        # memory, and set a timeout so a stalled request cannot hang the app.
        with requests.get(model_url, stream=True, timeout=60) as response:
            if response.status_code != 200:
                return False
            with open(model_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        return True
    except requests.RequestException:
        # Network failure: report as "not loaded" so the caller can decide
        # (the module-level check raises FileNotFoundError on False).
        return False
22
+
23
# Ensure the checkpoint exists locally; download it from Google Drive on first run.
if not os.path.exists(model_path):
    if not load_model_from_drive():
        raise FileNotFoundError("❌ 無法從 Google Drive 載入 model.pth")

# Build the classifier, load the downloaded weights onto the selected device,
# and switch to inference mode (disables dropout/batch-norm updates).
model = BertLSTM_CNN_Classifier()
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# Chinese BERT tokenizer matching the pretrained backbone used by the classifier.
tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese")
36
+
37
def predict_single_sentence(model, tokenizer, sentence, max_len=256):
    """Classify one Chinese message and report its scam probability.

    Args:
        model: classifier returning a single scalar score per input.
        tokenizer: BERT tokenizer producing input_ids / attention_mask /
            token_type_ids tensors.
        sentence: raw message text; whitespace and unsupported characters
            are stripped before tokenization.
        max_len: padded/truncated sequence length (default 256).

    Returns:
        tuple: (predicted label string, raw probability, risk-level string).
    """
    model.eval()
    with torch.no_grad():
        # Drop all whitespace, then keep only CJK, alphanumerics and common punctuation.
        sentence = re.sub(
            r"[^\u4e00-\u9fffA-Za-z0-9。,!?:/.\-]", "", re.sub(r"\s+", "", sentence)
        )

        enc = tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_len,
        )
        ids, mask, types = (
            enc[key].to(device)
            for key in ("input_ids", "attention_mask", "token_type_ids")
        )

        # Model output is treated as a scalar probability in [0, 1].
        prob = model(ids, mask, types).item()
        label = 1 if prob > 0.5 else 0

        # Map the probability onto a three-tier risk level.
        risk = (
            "🔴 高風險(極可能是詐騙)" if prob > 0.9
            else "🟡 中風險(可疑)" if prob > 0.5
            else "🟢 低風險(正常)"
        )
        pre_label = "正常" if label == 0 else "詐騙"

        print(f"\n📩 訊息內容:{sentence}")
        print(f"✅ 預測結果:{pre_label}")
        print(f"📊 信心值:{round(prob*100, 2)}")
        print(f"⚠️ 風險等級:{risk}")
        return pre_label, prob, risk
66
+
67
def analyze_text(text):
    """Run scam detection on ``text`` and package the result for the API layer.

    Returns:
        dict: ``status`` (predicted label), ``confidence`` (probability as a
        percentage rounded to 2 decimals), and ``suspicious_keywords``
        (single-element list carrying the risk-level string).
    """
    pre_label, probability, risk_level = predict_single_sentence(model, tokenizer, text)
    result = {
        "status": pre_label,
        "confidence": round(probability * 100, 2),
        "suspicious_keywords": [risk_level],
    }
    return result