MrUtakata committed on
Commit
cdf1735
·
verified ·
1 Parent(s): 71b4366

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -0
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import re
4
+ import joblib
5
+ import pandas as pd
6
+ import streamlit as st
7
+
8
+ # ———————————————————————————
9
+ # 1) Text cleaning & feature functions
10
+ # ———————————————————————————
11
+
12
def clean_text(text: str) -> str:
    """Normalize raw message text into a single-spaced token string.

    Steps, in order:
      1. Runs of CR / LF / TAB are collapsed into one space.
      2. Every ``http://`` or ``https://`` link is replaced by the
         placeholder token ``URL``.
      3. Every character that is not ``a-z``, ``0-9`` or whitespace is
         replaced by a space. Upper-case letters are stripped as well, so
         callers that want to keep letters must lower-case the text first.
      4. Repeated whitespace is squeezed to one space and both ends trimmed.

    The character filter in step 3 is applied only to the text *between*
    links. Previously it ran over the whole string after the placeholder had
    been inserted, which erased the upper-case ``URL`` token again and made
    it impossible for callers to detect or count links.

    NOTE(review): if a model was fitted on text produced by the old
    behaviour (placeholder erased), its inputs will now differ for messages
    containing links -- confirm against the training-time preprocessing.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text; an empty string if nothing survives the cleaning.
    """
    text = re.sub(r'[\r\n\t]+', ' ', text)
    # Split on links so the placeholder is never exposed to the character
    # filter below; joining with ' URL ' puts one token where each link was.
    segments = re.split(r'https?://\S+', text)
    text = ' URL '.join(
        re.sub(r'[^a-z0-9\s]', ' ', segment) for segment in segments
    )
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()
18
+
19
def featurize(title: str, body: str) -> pd.DataFrame:
    """Build the one-row feature frame for a single message.

    The subject and body are joined with a space (a falsy value is treated
    as an empty string), lower-cased and passed through ``clean_text``.

    Args:
        title: Message subject; may be empty or ``None``.
        body: Message body; may be empty or ``None``.

    Returns:
        A ``pandas.DataFrame`` with exactly one row and these columns:
          * ``content``     -- the cleaned text;
          * ``msg_len``     -- number of characters in the cleaned text;
          * ``digit_count`` -- number of digit characters in it;
          * ``url_count``   -- occurrences of the ``URL`` placeholder in it;
          * ``key_flag``    -- 1 when the text contains one of the keywords
            (opportunity / reward / service) AND also contains a digit or
            the ``URL`` placeholder, otherwise 0.
    """
    subject_part = title if title else ''
    body_part = body if body else ''
    cleaned = clean_text(f"{subject_part} {body_part}".lower())

    digits = re.findall(r'\d', cleaned)
    has_keyword = re.search(r'(opportunity|reward|service)', cleaned) is not None
    has_digit_or_url = bool(digits) or ('URL' in cleaned)

    row = {
        'content': cleaned,
        'msg_len': len(cleaned),
        'digit_count': len(digits),
        'url_count': cleaned.count('URL'),
        'key_flag': int(has_keyword and has_digit_or_url),
    }
    return pd.DataFrame([row])
32
+
33
+ # ———————————————————————————
34
+ # 2) Load models/artifacts
35
+ # ———————————————————————————
36
+
37
# ``st.cache`` is deprecated in favour of ``st.cache_resource`` for objects
# such as fitted models that should be created once and shared. Prefer the
# new decorator when the installed Streamlit provides it and fall back to the
# legacy one otherwise, so the app runs on both old and new releases.
_cache_models = (
    getattr(st, "cache_resource", None)
    or st.cache(allow_output_mutation=True)
)


@_cache_models
def load_models():
    """Load the serialized pipeline and decision threshold from disk.

    Both artifacts are read with ``joblib.load`` from paths relative to the
    current working directory; adjust the paths if the files live elsewhere.
    The result is cached by Streamlit so the files are not re-read on every
    script rerun.

    SECURITY: ``joblib.load`` unpickles its input and can execute arbitrary
    code -- only load artifact files that come from a trusted source.

    Returns:
        A ``(prep_clf_pipe, threshold)`` tuple: the object stored in
        ``spam_deploy_pipeline.pkl`` and the object stored in
        ``spam_threshold.pkl``.

    Raises:
        FileNotFoundError: If either artifact file is missing.
    """
    prep_clf_pipe = joblib.load('spam_deploy_pipeline.pkl')
    threshold = joblib.load('spam_threshold.pkl')
    return prep_clf_pipe, threshold
43
+
44
# Page configuration goes first: Streamlit documents ``set_page_config`` as
# needing to be the first Streamlit command on a page, and the cached model
# loader below may itself emit UI (a spinner) on a cache miss.
st.set_page_config(page_title="E-mail Spam Detection", layout="centered")

pipe, thresh = load_models()

# ———————————————————————————
# 3) Streamlit UI
# ———————————————————————————

st.title("📧 E-mail Spam Detector")
st.markdown(
    "Enter an e-mail subject and body below, then hit **Predict** "
    "to see the spam probability and label."
)

with st.form("input_form"):
    subj = st.text_input("Subject / Title")
    body = st.text_area("Body text", height=200)
    submitted = st.form_submit_button("Predict")

if submitted and not (subj.strip() or body.strip()):
    # Nothing to classify: ask for input instead of scoring an empty string.
    st.warning("Please enter a subject or body text before predicting.")
elif submitted:
    # Build the one-row feature frame for this message.
    X = featurize(subj, body)
    # Column 1 of predict_proba is used as the spam probability.
    proba = pipe.predict_proba(X)[0, 1]
    # Decide once, as a boolean, rather than re-deriving the verdict from
    # the display label's emoji prefix.
    is_spam = proba >= thresh

    st.metric("Spam probability", f"{proba:.1%}", delta=None)
    st.subheader("🚫 SPAM" if is_spam else "✅ Not Spam")
    if is_spam:
        st.warning("This message is classified as spam. Proceed with caution!")
    else:
        st.success("This message looks clean.")

    # NOTE(review): the original indentation of this footer was not
    # recoverable; it is shown together with the prediction result here --
    # dedent it to module level if it should always be visible.
    st.write("---")
    st.markdown(
        "Threshold for spam vs not-spam was set to "
        f"**{thresh:.2f}** (optimized for F₂ score)."
    )