File size: 10,979 Bytes
8099de0
f550f9d
 
 
 
 
 
 
 
 
fc2fb82
f550f9d
 
 
e52dffb
561a7b3
 
 
 
 
 
 
 
 
 
c2ba359
 
 
 
1632676
c2ba359
561a7b3
c2ba359
 
561a7b3
d70bc76
 
546d7be
561a7b3
546d7be
 
561a7b3
d70bc76
546d7be
 
 
 
 
 
 
 
 
 
768506f
546d7be
f550f9d
768506f
 
 
546d7be
f550f9d
 
fc2fb82
546d7be
 
 
fc2fb82
561a7b3
 
546d7be
 
 
f550f9d
 
546d7be
f550f9d
1632676
d70bc76
 
 
546d7be
 
768506f
546d7be
1632676
d70bc76
561a7b3
546d7be
 
768506f
546d7be
f550f9d
 
546d7be
f550f9d
fc2fb82
768506f
 
 
 
 
 
 
 
 
 
f550f9d
 
546d7be
 
 
1632676
 
 
546d7be
 
768506f
 
 
 
546d7be
 
 
 
768506f
546d7be
 
768506f
 
546d7be
 
f550f9d
 
1632676
 
f550f9d
546d7be
 
 
 
 
768506f
 
546d7be
 
 
768506f
f550f9d
546d7be
 
1632676
 
 
546d7be
 
768506f
546d7be
768506f
f550f9d
 
1632676
f550f9d
546d7be
 
 
bdc5ad6
f550f9d
 
fc2fb82
561a7b3
1632676
 
 
 
 
 
 
 
 
 
561a7b3
f550f9d
1632676
561a7b3
f550f9d
 
1632676
fc2fb82
 
1632676
f550f9d
 
 
 
1632676
fc2fb82
 
1632676
f550f9d
fc2fb82
1632676
fc2fb82
f550f9d
1632676
f550f9d
1632676
fc2fb82
f550f9d
1632676
f550f9d
 
1632676
fc2fb82
 
561a7b3
f550f9d
 
d70bc76
bdc5ad6
 
561a7b3
1632676
 
 
 
 
 
561a7b3
1632676
 
 
bdc5ad6
1632676
bdc5ad6
1632676
 
 
bdc5ad6
1632676
 
077b7a9
1632676
561a7b3
 
 
 
 
1632676
561a7b3
1632676
077b7a9
1632676
077b7a9
 
 
 
d70bc76
077b7a9
1632676
077b7a9
1632676
bdc5ad6
1632676
077b7a9
bdc5ad6
1632676
 
bdc5ad6
1632676
 
 
fc2fb82
561a7b3
f550f9d
 
561a7b3
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
import os

# ─────── Redirect every library cache to a writable location ───────
# The deployment target mounts the app directory read-only, so all cache
# roots (XDG, Hugging Face hub/transformers/datasets, Torch) are pointed
# at /tmp before any of those libraries are imported.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in newer transformers
# releases in favour of HF_HOME — harmless to keep both.
os.environ["XDG_CACHE_HOME"]      = "/tmp/.cache"
os.environ["HF_HOME"]             = "/tmp/.cache"
os.environ["TRANSFORMERS_CACHE"]   = "/tmp/.cache"
os.environ["TORCH_HOME"]          = "/tmp/.cache"
os.environ["HF_DATASETS_CACHE"]    = "/tmp/.cache"
os.makedirs("/tmp/.cache", exist_ok=True)

# ─────── Point pytesseract at the Tesseract OCR binary ───────
import pytesseract
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


import io
import json
import requests
import torch
import pytz
import cv2
import numpy as np
from PIL import Image
from datetime import datetime
from typing import Optional, List

from fastapi import FastAPI, HTTPException, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse, JSONResponse
from pydantic import BaseModel

from firebase_admin import credentials, firestore
import firebase_admin

from AI_Model_architecture import BertLSTM_CNN_Classifier
from bert_explainer import analyze_text as bert_analyze_text


# FastAPI application object; title/description/version show up in /docs.
app = FastAPI(
    title="詐騙訊息辨識 API",
    description="使用 BERT 模型與 OCR 圖像前處理,辨識文字並做詐騙判斷",
    version="1.0.0"
)

# Allow the front end to call this API from any origin.
# NOTE(review): allow_origins=["*"] together with allow_credentials=True is
# broader than the CORS spec intends (Starlette echoes the caller's origin);
# consider pinning concrete origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Expose every file in the working directory under /static
# (used by index.html for its assets).
app.mount("/static", StaticFiles(directory="."), name="static")


@app.get("/", response_class=FileResponse)
async def serve_index():
    """Serve the front-end entry page (index.html) at the site root."""
    index_page = "index.html"
    return FileResponse(index_page)


# ──────────────────────────────────────────────────────────────────────────
# Firebase initialisation.
# Service-account credentials come from the FIREBASE_CREDENTIALS env var
# (a JSON string without the "type" field, which is added here).
# `db` is pre-bound to None so that a failed initialisation leaves a
# defined name — endpoint handlers wrap their Firestore writes in
# try/except and degrade gracefully instead of raising NameError.
db = None
try:
    cred_data = os.getenv("FIREBASE_CREDENTIALS")
    if not cred_data:
        raise ValueError("FIREBASE_CREDENTIALS 未設置")
    firebase_cred = credentials.Certificate({"type": "service_account", **json.loads(cred_data)})
    firebase_admin.initialize_app(firebase_cred)
    db = firestore.client()
except Exception as e:
    # Startup must not crash on a missing/invalid credential; writes will
    # simply be skipped by the handlers' own guards.
    print(f"Firebase 初始化錯誤: {e}")
# ──────────────────────────────────────────────────────────────────────────


# ──────────────────────────────────────────────────────────────────────────
# Load the BERT+LSTM+CNN classifier weights.
# The checkpoint is downloaded once from Hugging Face into /tmp (the only
# writable path) and reused on later restarts of the same container.
model_path = "/tmp/model.pth"
model_url = "https://huggingface.co/jerrynnms/scam-model/resolve/main/model.pth"
if not os.path.exists(model_path):
    # timeout prevents app startup from hanging forever on a stalled
    # connection (requests has no default timeout).
    response = requests.get(model_url, timeout=300)
    if response.status_code == 200:
        with open(model_path, "wb") as f:
            f.write(response.content)
        print("✅ 模型下載完成")
    else:
        raise FileNotFoundError("❌ 無法從 Hugging Face 下載 model.pth")

model = BertLSTM_CNN_Classifier()
# map_location="cpu" so the checkpoint loads on CPU-only hosts.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model.eval()  # inference mode: disables dropout/batch-norm updates
# ──────────────────────────────────────────────────────────────────────────


# ──────────────────────────────────────────────────────────────────────────
# Pydantic request/response schemas for /predict
class TextAnalysisRequest(BaseModel):
    # Raw message text to classify.
    text: str
    # Optional caller identifier; stored alongside the Firestore record.
    user_id: Optional[str] = None

class TextAnalysisResponse(BaseModel):
    # Verdict label produced by bert_analyze_text (e.g. scam / not scam).
    status: str
    # Model confidence score for the verdict.
    confidence: float
    # Keywords the explainer flagged as suspicious.
    suspicious_keywords: List[str]
    # Analysis time (Asia/Taipei, timezone-aware).
    analysis_timestamp: datetime
    # Document id used when persisting to Firestore (timestamp-derived).
    text_id: str
# ──────────────────────────────────────────────────────────────────────────


@app.post("/predict", response_model=TextAnalysisResponse)
async def analyze_text_api(request: TextAnalysisRequest):
    """
    Text-only analysis: classify the message with the BERT model and
    return verdict, confidence and suspicious keywords.

    The request/result pair is also written best-effort to a Firestore
    collection named after the current date (Asia/Taipei).
    """
    try:
        taipei_now = datetime.now(pytz.timezone("Asia/Taipei"))
        doc_id = taipei_now.strftime("%Y%m%dT%H%M%S")

        result = bert_analyze_text(request.text)

        record = {
            "text_id": doc_id,
            "text": request.text,
            "user_id": request.user_id,
            "analysis_result": result,
            "timestamp": taipei_now.strftime("%Y-%m-%d %H:%M:%S"),
            "type": "text_analysis",
        }
        try:
            daily_collection = taipei_now.strftime("%Y%m%d")
            db.collection(daily_collection).document(doc_id).set(record)
        except Exception:
            # A Firestore outage must not break the API response.
            pass

        return TextAnalysisResponse(
            status=result["status"],
            confidence=result["confidence"],
            suspicious_keywords=result["suspicious_keywords"],
            analysis_timestamp=taipei_now,
            text_id=doc_id,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.post("/feedback")
async def save_user_feedback(feedback: dict):
    """
    Persist a user-feedback payload to the "user_feedback" Firestore
    collection, stamped with Taipei-local time and flagged as not yet
    used for model retraining.
    """
    try:
        now_taipei = datetime.now(pytz.timezone("Asia/Taipei"))
        feedback["timestamp"] = now_taipei.strftime("%Y-%m-%d %H:%M:%S")
        feedback["used_in_training"] = False
        try:
            db.collection("user_feedback").add(feedback)
        except Exception:
            # Best-effort write: still acknowledge the feedback to the user.
            pass
        return {"message": "✅ 已記錄使用者回饋"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# ──────────────────────────────────────────────────────────────────────────
# Hardened OCR pre-processing with debug snapshots
def preprocess_image_for_ocr(pil_image: Image.Image) -> Image.Image:
    """
    Prepare a screenshot for Tesseract:

      1. PIL Image -> NumPy BGR array
      2. grayscale + CLAHE contrast boost
      3. mask the orange background and force it to white
      4. fixed-threshold inverse binarisation
      5. 3x upscale followed by Gaussian smoothing

    Every intermediate stage is saved to /tmp/debug_*.png for inspection.
    """
    # 1. PIL (RGB) -> OpenCV (BGR)
    bgr = cv2.cvtColor(np.array(pil_image.convert("RGB")), cv2.COLOR_RGB2BGR)

    # 2. contrast-enhanced grayscale via CLAHE
    contrast = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8)).apply(
        cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    )
    Image.fromarray(contrast).save("/tmp/debug_clahe.png")

    # 3. isolate the orange background in HSV space
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
    orange_mask = cv2.inRange(
        hsv, np.array([5, 100, 100]), np.array([20, 255, 255])
    )
    Image.fromarray(orange_mask).save("/tmp/debug_mask_orange.png")

    # paint every masked pixel white (255); leave the rest of the grayscale
    cleaned = contrast.copy()
    cleaned[orange_mask > 0] = 255
    Image.fromarray(cleaned).save("/tmp/debug_filtered.png")

    # 4. inverse binarisation at a fixed threshold of 200
    binary = cv2.threshold(cleaned, 200, 255, cv2.THRESH_BINARY_INV)[1]
    Image.fromarray(binary).save("/tmp/debug_thresh.png")

    # 5. upscale 3x and blur to smooth jagged binarised edges
    upscaled = cv2.resize(binary, None, fx=3.0, fy=3.0, interpolation=cv2.INTER_CUBIC)
    final = cv2.GaussianBlur(upscaled, (5, 5), 0)
    Image.fromarray(final).save("/tmp/debug_processed.png")

    return Image.fromarray(final)
# ──────────────────────────────────────────────────────────────────────────


@app.post("/analyze-image")
async def analyze_uploaded_image(file: UploadFile = File(...)):
    """
    Upload an image, OCR it, and run the scam classifier on the result:

      1. read the UploadFile into a PIL Image
      2. run preprocess_image_for_ocr (also emits /tmp/debug_*.png)
      3. extract text with Tesseract (traditional Chinese + English)
      4. if any text was found, classify it with the BERT model
      5. return JSON with extracted_text and analysis_result
    """
    # Confirm the upload arrived.
    print("🔍 [DEBUG] 收到 analyze-image,檔名 =", file.filename)

    try:
        # Read raw bytes and decode them into a PIL Image.
        raw_bytes = await file.read()
        print("🔍 [DEBUG] 圖片 bytes 長度 =", len(raw_bytes))
        pil_img = Image.open(io.BytesIO(raw_bytes))
        print("🔍 [DEBUG] PIL 成功開啟圖片,格式 =", pil_img.format, "大小 =", pil_img.size)

        # Enhanced pre-processing (writes the debug snapshots).
        ocr_ready = preprocess_image_for_ocr(pil_img)

        # Tesseract: traditional Chinese + English, LSTM engine, block mode.
        ocr_config = r"-l chi_tra+eng --oem 3 --psm 6"
        extracted_text = pytesseract.image_to_string(
            ocr_ready, config=ocr_config
        ).strip()
        print("🔍 [DEBUG] Tesseract 擷取文字 =", repr(extracted_text))

        # Nothing recognisable on the image: report an "unreadable" verdict.
        if not extracted_text:
            return JSONResponse({
                "extracted_text": "",
                "analysis_result": {
                    "status": "無法辨識",
                    "confidence": 0.0,
                    "suspicious_keywords": ["無法擷取分析結果"]
                }
            })

        # Text found: hand it to the BERT scam classifier.
        return JSONResponse({
            "extracted_text": extracted_text,
            "analysis_result": bert_analyze_text(extracted_text)
        })

    except Exception as e:
        # Dump the full stack trace to the server log before failing.
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"圖片辨識失敗:{str(e)}")


# ──────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Local/dev entry point; hosting platforms inject the port via $PORT.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))