import os
# ─────── Fix cache write-permission problems for the various ML libraries ───────
# Hugging Face Spaces containers only allow writes under /tmp, so every
# library that caches to $HOME is redirected to a writable directory.
_CACHE_DIR = "/tmp/.cache"
for _cache_var in (
    "XDG_CACHE_HOME",
    "HF_HOME",
    "TRANSFORMERS_CACHE",
    "TORCH_HOME",
    "HF_DATASETS_CACHE",
):
    os.environ[_cache_var] = _CACHE_DIR
os.makedirs(_CACHE_DIR, exist_ok=True)
# ─────── Point pytesseract at the Tesseract OCR executable ───────
import pytesseract
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
import io
import json
import requests
import torch
import pytz
import cv2
import numpy as np
from PIL import Image
from datetime import datetime
from typing import Optional, List
from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse, JSONResponse
from pydantic import BaseModel
from firebase_admin import credentials, firestore
import firebase_admin
from AI_Model_architecture import BertLSTM_CNN_Classifier
from bert_explainer import analyze_text as bert_analyze_text
# FastAPI application: scam-message detection API (BERT classifier + OCR).
app = FastAPI(
    title="詐騙訊息辨識 API",
    description="使用 BERT 模型與 OCR 圖像前處理,辨識文字並做詐騙判斷",
    version="1.0.0"
)
# Allow cross-origin requests so the static front-end can call the API.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# overly permissive — restrict the origin list if the API ever handles
# authenticated requests.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Serve the working directory under /static (index.html and its assets).
# NOTE(review): directory="." also exposes the application source files.
app.mount("/static", StaticFiles(directory="."), name="static")
@app.get("/", response_class=FileResponse)
async def serve_index():
    """Serve the front-end entry page (index.html from the working directory)."""
    return FileResponse("index.html")
# ──────────────────────────────────────────────────────────────────────────
# Firebase initialization.
# Credentials come from the FIREBASE_CREDENTIALS environment variable (a JSON
# service-account blob without the "type" field, which is injected here).
# `db` is pre-set to None so that a failed initialization leaves a defined
# name — later Firestore writes are wrapped in try/except and degrade to
# best-effort no-ops instead of raising NameError.
db = None
try:
    cred_data = os.getenv("FIREBASE_CREDENTIALS")
    if not cred_data:
        raise ValueError("FIREBASE_CREDENTIALS 未設置")
    firebase_cred = credentials.Certificate({"type": "service_account", **json.loads(cred_data)})
    firebase_admin.initialize_app(firebase_cred)
    db = firestore.client()
except Exception as e:
    print(f"Firebase 初始化錯誤: {e}")
# ──────────────────────────────────────────────────────────────────────────
# Load the BERT+LSTM+CNN model.
# The weights are fetched from Hugging Face on first start and cached under
# /tmp so container restarts skip the download.
model_path = "/tmp/model.pth"
model_url = "https://huggingface.co/jerrynnms/scam-model/resolve/main/model.pth"
if not os.path.exists(model_path):
    # Stream the download with a (connect, read) timeout so a hung CDN cannot
    # block startup forever, and write in 1 MiB chunks instead of buffering
    # the whole checkpoint in memory.
    response = requests.get(model_url, timeout=(10, 300), stream=True)
    if response.status_code == 200:
        with open(model_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
        print("✅ 模型下載完成")
    else:
        raise FileNotFoundError("❌ 無法從 Hugging Face 下載 model.pth")
model = BertLSTM_CNN_Classifier()
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()
# ──────────────────────────────────────────────────────────────────────────
# ──────────────────────────────────────────────────────────────────────────
# Pydantic schemas for /predict
class TextAnalysisRequest(BaseModel):
    # Raw message text to classify.
    text: str
    # Optional caller identifier; stored with the analysis record when given.
    user_id: Optional[str] = None
class TextAnalysisResponse(BaseModel):
    # Classification label — exact values are produced by
    # bert_explainer.analyze_text; confirm against that module.
    status: str
    # Model confidence score for the label.
    confidence: float
    # Keywords the explainer flagged as suspicious.
    suspicious_keywords: List[str]
    # Server-side (Asia/Taipei) time of the analysis.
    analysis_timestamp: datetime
    # Firestore document id for this record (format: YYYYMMDDTHHMMSS).
    text_id: str
# ──────────────────────────────────────────────────────────────────────────
@app.post("/predict", response_model=TextAnalysisResponse)
async def analyze_text_api(request: TextAnalysisRequest):
    """
    Plain-text analysis: classify the text as scam/not-scam with the BERT
    model and return the suspicious keywords found.

    The result is also persisted to a per-day Firestore collection as a
    best-effort side effect — a storage failure is logged but never blocks
    the response.
    """
    try:
        tz = pytz.timezone("Asia/Taipei")
        now = datetime.now(tz)
        doc_id = now.strftime("%Y%m%dT%H%M%S")        # document id, e.g. 20240101T120000
        date_str = now.strftime("%Y-%m-%d %H:%M:%S")  # human-readable timestamp
        collection = now.strftime("%Y%m%d")           # one Firestore collection per day
        result = bert_analyze_text(request.text)
        record = {
            "text_id": doc_id,
            "text": request.text,
            "user_id": request.user_id,
            "analysis_result": result,
            "timestamp": date_str,
            "type": "text_analysis"
        }
        try:
            db.collection(collection).document(doc_id).set(record)
        except Exception as store_err:
            # Best-effort persistence: log the failure instead of silently
            # swallowing it, but still return the analysis to the caller.
            print(f"Firestore write failed: {store_err}")
        return TextAnalysisResponse(
            status=result["status"],
            confidence=result["confidence"],
            suspicious_keywords=result["suspicious_keywords"],
            analysis_timestamp=now,
            text_id=doc_id
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/feedback")
async def save_user_feedback(feedback: dict):
    """
    Store user feedback in Firestore (best-effort).

    Adds a Taipei-time timestamp and a used_in_training flag (presumably
    marking feedback not yet consumed by retraining — confirm against the
    training pipeline) before writing. A Firestore failure is logged but
    does not fail the request.
    """
    try:
        tz = pytz.timezone("Asia/Taipei")
        timestamp_str = datetime.now(tz).strftime("%Y-%m-%d %H:%M:%S")
        feedback["timestamp"] = timestamp_str
        feedback["used_in_training"] = False
        try:
            db.collection("user_feedback").add(feedback)
        except Exception as store_err:
            # Best-effort persistence: log the failure instead of hiding it.
            print(f"Firestore write failed: {store_err}")
        return {"message": "✅ 已記錄使用者回饋"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# ──────────────────────────────────────────────────────────────────────────
# Enhanced OCR preprocessing, with debug-image output at each stage
def preprocess_image_for_ocr(pil_image: Image.Image) -> Image.Image:
    """
    Preprocessing pipeline:
    1. PIL Image -> NumPy BGR
    2. Grayscale + CLAHE (contrast enhancement)
    3. Orange-background mask -> turn the orange background white
    4. Fixed-threshold inverse binarization
    5. Upscale & GaussianBlur smoothing
    Each intermediate result is saved to /tmp/debug_*.png for debugging.
    """
    # 1. PIL -> NumPy; the [::-1] channel flip converts RGB to BGR for OpenCV
    img = np.array(pil_image.convert("RGB"))[:, :, ::-1]
    # 2. Grayscale + CLAHE contrast enhancement
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced = clahe.apply(gray)
    # Debug: grayscale after CLAHE enhancement
    Image.fromarray(enhanced).save("/tmp/debug_clahe.png")
    # 3. HSV colour separation (mask out the orange background —
    #    hue 5-20 targets orange; tuned for a specific chat-app screenshot style)
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower_orange = np.array([5, 100, 100])
    upper_orange = np.array([20, 255, 255])
    mask_orange = cv2.inRange(hsv, lower_orange, upper_orange)
    # Debug: orange mask
    Image.fromarray(mask_orange).save("/tmp/debug_mask_orange.png")
    # Pixels inside the mask become white (255); the rest keep their grayscale value
    filtered = enhanced.copy()
    filtered[mask_orange > 0] = 255
    # Debug: grayscale after orange removal
    Image.fromarray(filtered).save("/tmp/debug_filtered.png")
    # 4. Fixed-threshold inverse binarization (threshold 200)
    _, thresh = cv2.threshold(filtered, 200, 255, cv2.THRESH_BINARY_INV)
    # Debug: after binarization
    Image.fromarray(thresh).save("/tmp/debug_thresh.png")
    # 5. Upscale 3x & GaussianBlur smoothing (helps Tesseract on small glyphs)
    scaled = cv2.resize(thresh, None, fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC)
    smoothed = cv2.GaussianBlur(scaled, (5, 5), 0)
    # Debug: final preprocessed result
    Image.fromarray(smoothed).save("/tmp/debug_processed.png")
    return Image.fromarray(smoothed)
# ──────────────────────────────────────────────────────────────────────────
@app.post("/analyze-image")
async def analyze_uploaded_image(file: UploadFile = File(...)):
    """
    Upload an image and run OCR on it:
    1. Read the UploadFile into a PIL Image
    2. Call preprocess_image_for_ocr (also writes debug images)
    3. Extract text with pytesseract
    4. If any text was extracted, run the BERT scam classifier on it
    5. Return JSON containing extracted_text and analysis_result
    """
    # 1) Confirm the file arrived
    print("🔍 [DEBUG] 收到 analyze-image,檔名 =", file.filename)
    try:
        # 2) Read the raw image bytes, then open them as a PIL Image
        image_bytes = await file.read()
        print("🔍 [DEBUG] 圖片 bytes 長度 =", len(image_bytes))
        pil_img = Image.open(io.BytesIO(image_bytes))
        print("🔍 [DEBUG] PIL 成功開啟圖片,格式 =", pil_img.format, "大小 =", pil_img.size)
        # 3) Enhanced preprocessing (also produces /tmp/debug_*.png)
        processed_image = preprocess_image_for_ocr(pil_img)
        # 4) Tesseract OCR: Traditional Chinese + English, default/LSTM engine
        #    (--oem 3), assume a uniform block of text (--psm 6)
        custom_config = r"-l chi_tra+eng --oem 3 --psm 6"
        extracted_text = pytesseract.image_to_string(
            processed_image,
            config=custom_config
        ).strip()
        print("🔍 [DEBUG] Tesseract 擷取文字 =", repr(extracted_text))
        # 5) No text extracted: return a "cannot recognize" placeholder result
        if not extracted_text:
            return JSONResponse({
                "extracted_text": "",
                "analysis_result": {
                    "status": "無法辨識",
                    "confidence": 0.0,
                    "suspicious_keywords": ["無法擷取分析結果"]
                }
            })
        # 6) Text extracted: run the BERT scam classifier on it
        result = bert_analyze_text(extracted_text)
        return JSONResponse({
            "extracted_text": extracted_text,
            "analysis_result": result
        })
    except Exception as e:
        # Print the full traceback for debugging before surfacing a 500
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"圖片辨識失敗:{str(e)}")
# ──────────────────────────────────────────────────────────────────────────
# Local / container entry point: run the API with uvicorn.
if __name__ == "__main__":
    import uvicorn
    # Hugging Face Spaces injects the listening port via $PORT; 7860 is the
    # platform's conventional default.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)