import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import io
import re
import requests
import torch
import jieba
import numpy as np
import cv2
import pytesseract
from PIL import Image
from transformers import BertTokenizer, BertModel
from AI_Model_architecture import BertLSTM_CNN_Classifier
# ─────────────────────────────────────────────────────────────────────────────
# 1. Device selection: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2. Download the custom classifier weights (BertLSTM_CNN_Classifier) if absent.
model_path = "/tmp/model.pth"
model_url = "https://huggingface.co/jerrynnms/scam-model/resolve/main/model.pth"
if not os.path.exists(model_path):
    print("📦 下載 model.pth 中...")
    # Fix: a timeout so a stalled connection cannot hang module import forever.
    response = requests.get(model_url, timeout=60)
    if response.status_code == 200:
        with open(model_path, "wb") as f:
            f.write(response.content)
        print("✅ 模型下載完成")
    else:
        raise FileNotFoundError("❌ 無法下載 model.pth")

# 3. Tokenizer and the custom classification model (inference mode).
tokenizer = BertTokenizer.from_pretrained("ckiplab/bert-base-chinese")
model = BertLSTM_CNN_Classifier()
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# 4. Plain BERT model with attention outputs enabled, used only for
#    attention-based keyword extraction (not for classification).
bert_model = BertModel.from_pretrained("ckiplab/bert-base-chinese", output_attentions=True)
bert_model.to(device)
bert_model.eval()
# ─────────────────────────────────────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────────────────────
# 5. 預測單句文字函式
def predict_single_sentence(text: str, max_len=256):
    """Classify one sentence with the custom scam classifier.

    Returns a tuple ``(label, prob)`` where ``prob`` is the model's scam
    probability and ``label`` is 1 when ``prob > 0.5``, else 0.
    """
    # Light cleanup: drop whitespace, keep CJK/Latin/digits plus a few marks.
    text = re.sub(r"\s+", "", text)
    text = re.sub(r"[^\u4e00-\u9fffA-Za-z0-9。,!?:/.\-]", "", text)

    # Encode to fixed-length tensors and move them onto the model's device.
    batch = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_len,
    )
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    segments = batch["token_type_ids"].to(device)

    # Gradient-free inference; the classifier emits a single probability.
    with torch.no_grad():
        prob = model(ids, mask, segments).item()
    return int(prob > 0.5), prob
# 6. 抽取高 attention token 並轉換為自然語意詞句
def extract_attention_keywords(text, top_k=5):
    """Pick up to ``top_k`` suspicious words via last-layer BERT attention.

    High-attention subword tokens are mapped back onto jieba-segmented words
    of the original text; falls back to the raw tokens when no word matches.
    """
    # Remove whitespace only; punctuation is left for the tokenizer.
    cleaned = re.sub(r"\s+", "", text)

    # Only attention is needed here, so the plain BERT model is used.
    encoded = tokenizer(cleaned, return_tensors="pt", truncation=True,
                        padding="max_length", max_length=128)
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    with torch.no_grad():
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
    attentions = outputs.attentions  # tuple: one tensor per transformer layer

    # Last layer, averaged over heads then query positions -> (seq_len,) score.
    attn = attentions[-1][0].mean(dim=0).mean(dim=0)

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

    # Fix: exclude special/padding tokens BEFORE topk. [CLS]/[SEP] usually
    # absorb the most attention, so filtering after topk could return fewer
    # than top_k tokens — or none at all.
    special = {"[CLS]", "[SEP]", "[PAD]"}
    scores = attn.clone()
    for i, tok in enumerate(tokens):
        if tok in special:
            scores[i] = float("-inf")
    k = min(top_k, sum(tok not in special for tok in tokens))
    top_tokens = []
    if k > 0:
        top_indices = scores.topk(k).indices.tolist()
        top_tokens = [tokens[i] for i in top_indices]

    # Map high-attention subword tokens back to jieba words of the input.
    suspicious = []
    for word in jieba.cut(text):
        if len(word.strip()) < 2 or word in suspicious:
            continue
        if any(token in word for token in top_tokens):
            suspicious.append(word)

    # Prefer mapped words; otherwise fall back to the raw tokens.
    return suspicious[:top_k] if suspicious else top_tokens[:top_k]
# ─────────────────────────────────────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────────────────────
# 7. 文字分析主函式:回傳完整結構
def analyze_text(text: str, explain_mode="cnn"):
    """
    Analyze a plain-text message for scam content.

    Parameters
    ----------
    text : str
        The message to classify.
    explain_mode : str, optional
        Accepted for interface compatibility — analyze_image() forwards it as
        a keyword argument. Only attention-based keywords are produced today,
        so the value is currently unused.

    Returns
    -------
    dict with keys:
        "status": "詐騙" / "正常",
        "confidence": float (percentage, two decimals),
        "suspicious_keywords": list of extracted words (placeholder when empty)
    """
    # Fix: previously this function took only `text`, so the
    # analyze_text(..., explain_mode=...) call in analyze_image() raised
    # TypeError on every successful OCR pass.
    label, prob = predict_single_sentence(text)
    prob_percent = round(prob * 100, 2)
    status = "詐騙" if label == 1 else "正常"
    suspicious = extract_attention_keywords(text)
    return {
        "status": status,
        "confidence": prob_percent,
        "suspicious_keywords": suspicious or ["(模型未聚焦可疑詞)"]
    }
# ─────────────────────────────────────────────────────────────────────────────
# ─────────────────────────────────────────────────────────────────────────────
# 以下新增:OCR 前處理+圖片分析相關函式
# 8. 前處理:將圖片做灰階→CLAHE→HSV過濾→二值化→放大→模糊,回傳可供 pytesseract 的 PIL.Image
def preprocess_for_pytesseract(pil_image: Image.Image) -> Image.Image:
    """
    Prepare an image for pytesseract OCR.

    Pipeline: PIL -> BGR array, grayscale + CLAHE contrast boost, HSV mask
    that whitens an orange poster background, inverted fixed-threshold
    binarization (dark text -> white), 2x upscale, light Gaussian smoothing.
    Returns a white-text-on-black PIL image.
    """
    # PIL RGB -> OpenCV BGR by reversing the channel axis.
    bgr = np.array(pil_image.convert("RGB"))[:, :, ::-1]

    # Contrast-limited adaptive histogram equalization on the gray image.
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    contrast = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)).apply(gray)

    # Whiten pixels inside the orange hue band (sample poster background).
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
    orange_mask = cv2.inRange(hsv,
                              np.array([5, 100, 100]),
                              np.array([20, 255, 255]))
    cleaned = contrast.copy()
    cleaned[orange_mask > 0] = 255

    # Inverse binary threshold: dark text becomes white, background black.
    _, binary = cv2.threshold(cleaned, 200, 255, cv2.THRESH_BINARY_INV)

    # Double the resolution, then smooth the jagged upscaled edges.
    upscaled = cv2.resize(binary, None, fx=2.0, fy=2.0,
                          interpolation=cv2.INTER_CUBIC)
    blurred = cv2.GaussianBlur(upscaled, (3, 3), 0)

    # Back to PIL (white text on black) for pytesseract.
    return Image.fromarray(blurred)
# 9. 圖片分析:OCR 擷取文字 → BERT 分析
def analyze_image(file_bytes, explain_mode="cnn"):
    """
    OCR an image and run the extracted text through the BERT analyzer.

    Parameters
    ----------
    file_bytes : bytes
        Raw image bytes (e.g. an uploaded file's content).
    explain_mode : str, optional
        Kept for interface compatibility; the text analysis currently uses
        attention keywords regardless of this value.

    Returns
    -------
    dict with keys "status" ("詐騙" / "正常" / "無法辨識文字"),
    "confidence" (float) and "suspicious_keywords" (list).
    """
    # 1. bytes -> PIL Image.
    image = Image.open(io.BytesIO(file_bytes))

    # 2. Preprocess into a black-background / white-text image.
    processed_img = preprocess_for_pytesseract(image)
    # Optional debug dump of the preprocessed image:
    # processed_img.save("/tmp/debug_processed.png")

    # 3. OCR with traditional Chinese + English. On Spaces the tesseract
    #    binary usually lives at /usr/bin/tesseract.
    pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
    custom_config = r"-l chi_tra+eng --oem 3 --psm 6"
    extracted_text = pytesseract.image_to_string(processed_img, config=custom_config).strip()

    # 4. Nothing recognizable -> report as unreadable.
    if not extracted_text:
        return {
            "status": "無法辨識文字",
            "confidence": 0.0,
            "suspicious_keywords": ["圖片中無可辨識的中文英文"]
        }

    # 5. Fix: analyze_text() takes only the text, so forwarding
    #    explain_mode as a keyword raised TypeError on every successful OCR.
    return analyze_text(extracted_text)
# ─────────────────────────────────────────────────────────────────────────────