harshitmahour360 committed on
Commit
ae1a5c5
·
verified ·
1 Parent(s): ef661e9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +307 -0
app.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import traceback
3
+ from typing import Dict, Any, Tuple
4
+
5
+ import gradio as gr
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
7
+
8
+ # ---------------------------------------------------------
9
+ # 1. CONFIG
10
+ # ---------------------------------------------------------
11
+
12
# Hugging Face checkpoint used for the model-based half of the score
# (a tiny BERT variant fine-tuned for SMS spam detection).
MODEL_NAME = "mrm8488/bert-tiny-finetuned-sms-spam-detection"

# Module-level state written by _load_model():
#   clf_pipe          -> the text-classification pipeline, or None on failure
#   model_load_error  -> human-readable failure message, or None on success
clf_pipe = None
model_load_error = None

# Phrase -> weight added to the heuristic score when the phrase occurs in the
# lower-cased email text. Insertion order is the order hits are reported in.
SUSPICIOUS_KEYWORDS = {
    # Credential / account prompts
    "verify your account": 0.25,
    "reset your password": 0.25,
    "confirm your password": 0.2,
    # Calls to action
    "click the link below": 0.15,
    "click here": 0.15,
    # Pressure wording
    "urgent": 0.1,
    "immediately": 0.1,
    "limited time": 0.1,
    # Account / billing threats
    "suspend your account": 0.25,
    "update your billing": 0.2,
    "unusual activity": 0.2,
    # Financial details and one-time codes
    "bank account": 0.15,
    "card details": 0.15,
    "one time password": 0.15,
    "otp": 0.1,
}

# Link-shortening domains; a URL pointing at one of these raises the score.
URL_SHORTENERS = [
    "bit.ly",
    "tinyurl.com",
    "t.co",
    "is.gd",
    "ow.ly",
    "buff.ly",
    "cutt.ly",
]

# Matches an http:// or https:// URL up to the next whitespace character.
URL_REGEX = re.compile(r"https?://\S+")
42
+
43
+ # ---------------------------------------------------------
44
+ # 2. LOAD MODEL ON STARTUP
45
+ # ---------------------------------------------------------
46
+
47
def _load_model():
    """
    Build the text-classification pipeline for MODEL_NAME.

    On success the pipeline is stored in the module-level ``clf_pipe``.
    On any failure ``clf_pipe`` is reset to None and a description of the
    error is stored in ``model_load_error``; the exception is not re-raised,
    so importing this module never fails because of the model.
    """
    global clf_pipe, model_load_error
    try:
        # Keyword arguments are evaluated left to right, so the tokenizer is
        # still loaded before the model weights.
        clf_pipe = pipeline(
            "text-classification",
            tokenizer=AutoTokenizer.from_pretrained(MODEL_NAME),
            model=AutoModelForSequenceClassification.from_pretrained(MODEL_NAME),
            # Ask for a score per label instead of only the top label.
            return_all_scores=True,
        )
        print(f"[INFO] Loaded HF model: {MODEL_NAME}")
    except Exception as exc:
        clf_pipe = None
        model_load_error = (
            f"Failed to load model {MODEL_NAME}: {type(exc).__name__}: {exc}"
        )
        print("[ERROR]", model_load_error)


# Load once at import time so the first request does not pay the cost.
_load_model()
66
+
67
+ # ---------------------------------------------------------
68
+ # 3. HELPER FUNCTIONS
69
+ # ---------------------------------------------------------
70
+
71
# Labels such as "not_spam", "non-phishing" or "No Spam" mention the positive
# class but mean the opposite, so they must not be counted as spam.
_NEGATED_SPAM_LABEL = re.compile(r"^(?:not|non|no)[\W_]*(?:spam|phish)")

# Whole-word label tokens that denote the benign class.
_BENIGN_LABEL_TOKENS = frozenset({"ham", "legit", "legitimate", "benign"})


def _classify_label(label: str) -> str:
    """Return "spam", "benign" or "unknown" for a classifier label name."""
    lowered = label.strip().lower()
    if "spam" in lowered or "phish" in lowered:
        return "benign" if _NEGATED_SPAM_LABEL.match(lowered) else "spam"
    if _BENIGN_LABEL_TOKENS.intersection(re.split(r"[^a-z0-9]+", lowered)):
        return "benign"
    return "unknown"


def compute_model_spam_prob(text: str, outputs) -> Tuple[float, Dict[str, float]]:
    """
    Convert HF pipeline outputs into a spam / phishing probability.

    Args:
        text: The classified text. Not used by the computation; kept so the
            signature stays compatible with existing callers.
        outputs: Iterable of ``{"label": str, "score": number}`` mappings,
            one per label.

    Returns:
        ``(spam_prob, label_scores)`` where ``spam_prob`` is clamped to
        [0, 1] and ``label_scores`` maps every label to its float score.

    Label resolution, in priority order:
        1. Sum the scores of labels that name spam/phishing and are not
           negated ("not_spam", "non-phishing", ... are treated as benign).
        2. With exactly two labels and no explicit spam label:
           a. if exactly one label is recognisably benign, use the other;
           b. otherwise use the first label containing "1" (LABEL_1 style);
           c. otherwise fall back to the higher-scoring label.
        3. Otherwise the probability is 0.0.
    """
    label_scores = {o["label"]: float(o["score"]) for o in outputs}
    kinds = {label: _classify_label(label) for label in label_scores}

    spam_labels = [label for label, kind in kinds.items() if kind == "spam"]
    if spam_labels:
        spam_prob = sum((label_scores[label] for label in spam_labels), 0.0)
    elif len(label_scores) == 2:
        benign_labels = [label for label, kind in kinds.items() if kind == "benign"]
        if len(benign_labels) == 1:
            # Binary classifier with a known benign class: spam is the other one.
            spam_label = next(
                label for label in label_scores if label != benign_labels[0]
            )
        else:
            # Heuristic for LABEL_0 / LABEL_1 style names: the "1" label is spam.
            spam_label = next(
                (label for label in label_scores if "1" in label), None
            )
            if spam_label is None:
                # Last resort: nothing identifies either class, so take the
                # label with the higher score.
                spam_label = max(label_scores, key=label_scores.get)
        spam_prob = label_scores[spam_label]
    else:
        spam_prob = 0.0

    # Clamp
    spam_prob = max(0.0, min(1.0, float(spam_prob)))
    return spam_prob, label_scores
102
+
103
+
104
def compute_heuristic_score(text: str) -> Tuple[float, Dict[str, Any]]:
    """
    Very lightweight rule-based scoring: keywords + URLs.

    Rules (weights are added to a raw score):
        * each phrase of SUSPICIOUS_KEYWORDS found in the text adds its weight;
        * each URL whose host is (a subdomain of) a URL_SHORTENERS entry: +0.2;
        * each plain ``http://`` URL: +0.1;
        * a message shorter than 60 characters that contains a URL: +0.1.

    The raw score is divided by 1.5 and clamped to [0, 1].

    Args:
        text: The email body to inspect.

    Returns:
        ``(probability, info)`` where ``info`` holds the triggered keywords,
        the URLs found, the URL warnings and the raw (pre-normalisation) score.
    """
    # Imported locally so this function needs no new file-level import.
    from urllib.parse import urlsplit

    lowered = text.lower()
    raw_score = 0.0
    keyword_hits = []

    for phrase, weight in SUSPICIOUS_KEYWORDS.items():
        # Require the phrase to start at a word boundary so that e.g. "otp"
        # does not fire on "footprint" nor "urgent" on "insurgent", while
        # inflected forms such as "urgently" still match. Words of a phrase
        # may be separated by any whitespace (emails are often line-wrapped).
        pattern = r"(?<!\w)" + r"\s+".join(
            re.escape(word) for word in phrase.split()
        )
        if re.search(pattern, lowered):
            raw_score += weight
            keyword_hits.append(phrase)

    urls = URL_REGEX.findall(text)
    url_flags = []
    for url in urls:
        # Compare against the parsed host rather than the whole URL: a plain
        # substring test makes "t.co" match "microsoft.com".
        try:
            host = urlsplit(url).hostname or ""
        except ValueError:
            # Malformed authority (e.g. an unbalanced IPv6 bracket).
            host = ""
        # URL_REGEX stops only at whitespace, so sentence punctuation can
        # trail the host when the URL has no path.
        host = host.rstrip(".,;:!?)]}>\"'")
        if any(
            host == short or host.endswith("." + short)
            for short in URL_SHORTENERS
        ):
            raw_score += 0.2
            url_flags.append(f"URL shortener detected: {url}")
        if url.lower().startswith("http://"):
            raw_score += 0.1
            url_flags.append(f"Insecure (http) URL: {url}")

    # Slight boost if email is extremely short AND contains a link
    if len(text) < 60 and urls:
        raw_score += 0.1
        url_flags.append("Very short message that mainly contains a link")

    # Normalize and clamp to [0, 1]
    # Empirically, 0.7 is already very suspicious, so divide by 1.5
    score = max(0.0, min(1.0, raw_score / 1.5))

    info = {
        "keywords_triggered": keyword_hits,
        "urls_found": urls,
        "url_warnings": url_flags,
        # The sum of rule weights before normalisation (rounded only to hide
        # float noise); the normalised value is the first return element.
        "raw_heuristic_score": round(raw_score, 4),
    }
    return score, info
148
+
149
+ # ---------------------------------------------------------
150
+ # 4. MAIN INFERENCE FUNCTION
151
+ # ---------------------------------------------------------
152
+
153
def analyze_email(email_text: str):
    """
    Classify an email body and explain the decision.

    The final probability blends the model's spam probability (70%) with the
    rule-based heuristic probability (30%); a value of 0.5 or more is reported
    as suspicious.

    Args:
        email_text: Raw email body; None or whitespace-only counts as empty.

    Returns:
        A ``(verdict, probability, details)`` tuple. Errors (empty input,
        model not loaded, any exception during analysis) are reported through
        the same tuple with probability 0.0 instead of being raised.
    """
    try:
        body = (email_text or "").strip()

        # Guard: nothing to analyse.
        if not body:
            no_input_details = {"error": "Please paste the full email body first."}
            return ("❌ No email text provided", 0.0, no_input_details)

        # Guard: the model never loaded at startup.
        if clf_pipe is None:
            load_failure_details = {
                "error": model_load_error,
                "hint": "Check the Space logs or requirements.txt.",
            }
            return ("❌ Model failed to load", 0.0, load_failure_details)

        # Model-based score: the pipeline returns one result list per input.
        per_input_outputs = clf_pipe(body, truncation=True, max_length=512)
        model_spam_prob, label_scores = compute_model_spam_prob(
            body, per_input_outputs[0]
        )

        # Rule-based score.
        heuristic_prob, heuristic_info = compute_heuristic_score(body)

        # Weighted blend (70% model, 30% heuristics), clamped to [0, 1].
        blended = 0.7 * model_spam_prob + 0.3 * heuristic_prob
        final_prob = max(0.0, min(1.0, float(blended)))

        if final_prob >= 0.5:
            verdict = "⚠️ Likely phishing / suspicious"
        else:
            verdict = "✅ Likely not phishing (still be cautious)"

        details = {
            "model_name": MODEL_NAME,
            "model_spam_probability": model_spam_prob,
            "model_label_scores": label_scores,
            "heuristic_probability": heuristic_prob,
            "heuristics": heuristic_info,
            "final_combined_probability": final_prob,
        }
        return verdict, round(final_prob, 4), details

    except Exception as exc:
        # Never crash the Space; always return something
        failure_details = {
            "exception": f"{type(exc).__name__}: {exc}",
            "traceback": traceback.format_exc(),
        }
        return ("❌ Internal error during analysis", 0.0, failure_details)
212
+
213
+ # ---------------------------------------------------------
214
+ # 5. GRADIO UI
215
+ # ---------------------------------------------------------
216
+
217
# Build the Gradio interface: an input column (email text + buttons) next to
# an output column (verdict, probability, JSON details), followed by example
# emails and the button event wiring.
with gr.Blocks(title="Phishing / Spam Email Detector (Hybrid)") as demo:
    # Introductory text shown at the top of the page.
    gr.Markdown(
        """
        # 🛡️ Phishing / Spam Email Detector (Hybrid)

        This tool combines:

        1. A **Hugging Face spam/phishing classifier** (`bert-tiny` – fast on CPU), and
        2. A lightweight **rule-based engine** (keywords + URL checks).

        It outputs a final phishing probability and a structured JSON explanation.
        """
    )

    with gr.Row():
        # Input column: the email body and the two action buttons.
        with gr.Column(scale=3):
            gr.Markdown("### ✉️ Email Content")
            email_input = gr.Textbox(
                lines=16,
                placeholder="Paste the full email text here...",
                label="Email body",
            )

            with gr.Row():
                analyze_btn = gr.Button("🔍 Analyze", variant="primary")
                clear_btn = gr.Button("🧹 Clear")

        # Output column: receives the three values returned by analyze_email.
        with gr.Column(scale=2):
            gr.Markdown("### 🧾 Result")
            verdict_out = gr.Textbox(
                label="Overall verdict",
                interactive=False,
            )
            prob_out = gr.Number(
                label="Phishing probability (0–1)",
                precision=4,
            )
            details_out = gr.JSON(
                label="Details (model + heuristics)",
            )

    # Two sample emails: one phishing-style message and one benign newsletter.
    # Each inner list holds the value for the single input component.
    examples = [
        [
            """Subject: Important – Verify Your Account Now

Dear User,

We have detected unusual activity on your account. To avoid suspension, please verify your account immediately by clicking the link below:

http://secure-update.example-login.com/verify

Failure to do so will result in permanent closure of your account.

Thank you,
Security Team"""
        ],
        [
            """Subject: Your Monthly Newsletter is Here!

Hello Harshit,

We’re excited to share this month’s updates, new features, and upcoming events with you.
No action is required—just click below to explore:

https://example.com/newsletter

Have a great day!
Team Example"""
        ],
    ]

    gr.Examples(
        examples=examples,
        inputs=[email_input],
        label="Try some example emails",
    )

    # "Analyze" runs analyze_email on the textbox content and fills the
    # verdict, probability and details components with its return tuple.
    analyze_btn.click(
        fn=analyze_email,
        inputs=email_input,
        outputs=[verdict_out, prob_out, details_out],
    )

    # "Clear" resets the input and all three outputs to empty / zero values.
    clear_btn.click(
        fn=lambda: ("", "", 0.0, {}),
        inputs=None,
        outputs=[email_input, verdict_out, prob_out, details_out],
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()