Spaces:

MrUtakata
/

Email_spam_detection

Sleeping

App Files Files Community

Email_spam_detection / app.py

MrUtakata

Update app.py

e9422ac verified 8 months ago

raw

history blame contribute delete

2.89 kB

	# app.py

	import re
	import joblib
	import pandas as pd
	import streamlit as st

	# ———————————————————————————
	# 0) Page configuration: issued ahead of every other Streamlit call
	# ———————————————————————————
	st.set_page_config(
	    layout="centered",
	    page_title="E-mail Spam Detection",
	)

	# ———————————————————————————
	# 1) Text cleaning & feature functions
	# ———————————————————————————

	def clean_text(text: str) -> str:
	    """Normalize raw e-mail text into a compact, space-separated token string.

	    Steps, in order:
	      1. Runs of CR / LF / TAB become a single space.
	      2. Every ``http://`` or ``https://`` URL is replaced by the
	         placeholder token ``URL``.
	      3. In the remaining (non-URL) text, every character other than
	         ``a-z``, ``0-9`` or whitespace becomes a space.
	      4. Runs of whitespace are collapsed and the ends are trimmed.

	    Only lowercase letters survive step 3, so callers are expected to
	    lowercase the text before passing it in.

	    Args:
	        text: Raw (already lowercased) message text.

	    Returns:
	        The cleaned text; an empty string if nothing survives cleaning.
	    """
	    text = re.sub(r'[\r\n\t]+', ' ', text)

	    # Scrub only the text *between* URLs and re-insert the placeholder
	    # afterwards.  Running the a-z/0-9 filter over the whole string after
	    # the substitution would erase the uppercase "URL" token itself, which
	    # leaves every downstream URL-based feature stuck at zero.
	    # NOTE(review): confirm the training pipeline also kept the URL token,
	    # otherwise this changes the features the fitted model sees.
	    segments = re.split(r'https?://\S+', text)
	    scrubbed = [re.sub(r'[^a-z0-9\s]', ' ', segment) for segment in segments]
	    text = ' URL '.join(scrubbed)

	    return re.sub(r'\s{2,}', ' ', text).strip()

	def featurize(title: str, body: str) -> pd.DataFrame:
	    """Build the one-row feature frame passed to the model pipeline.

	    The subject and body are joined with a space, lowercased, and run
	    through ``clean_text``; the features are derived from that cleaned
	    string.

	    Args:
	        title: E-mail subject; ``None`` or ``""`` is treated as empty.
	        body: E-mail body; ``None`` or ``""`` is treated as empty.

	    Returns:
	        A single-row ``DataFrame`` with the columns:
	          * ``content``     - the cleaned text,
	          * ``msg_len``     - length of the cleaned text in characters,
	          * ``digit_count`` - number of digit characters,
	          * ``url_count``   - occurrences of the ``URL`` placeholder token,
	          * ``key_flag``    - 1 when one of the keywords (opportunity,
	            reward, service) appears together with a digit or a ``URL``
	            token, else 0.
	    """
	    raw = f"{title or ''} {body or ''}"
	    txt = clean_text(raw.lower())

	    has_digit = bool(re.search(r'\d', txt))
	    has_url = 'URL' in txt
	    # Plain `|` is regex alternation.  The escaped form `\|` matches a
	    # literal pipe character, which never occurs in cleaned text, so the
	    # flag could never be set.
	    # NOTE(review): confirm the model was trained with the alternation
	    # form, otherwise this changes the feature the fitted model sees.
	    has_keyword = bool(re.search(r'(opportunity|reward|service)', txt))

	    return pd.DataFrame([{
	        'content': txt,
	        'msg_len': len(txt),
	        'digit_count': len(re.findall(r'\d', txt)),
	        'url_count': txt.count('URL'),
	        'key_flag': int(has_keyword and (has_digit or has_url)),
	    }])

	# ———————————————————————————
	# 2) Load models/artifacts once
	# ———————————————————————————
	@st.cache_resource
	def load_models():
	    """Read the two serialized artifacts the app needs from the working directory.

	    ``spam_deploy_pipeline.pkl`` is the object later used for
	    ``predict_proba`` (described by the original author as a preprocessor
	    plus calibrated random forest; not verifiable from this file), and
	    ``spam_threshold.pkl`` is the decision cut-off compared against the
	    predicted probability.

	    The function is wrapped in ``st.cache_resource`` so that Streamlit keeps
	    the returned objects and later script reruns reuse them instead of
	    reading the files again.

	    Returns:
	        A ``(pipeline, threshold)`` tuple.
	    """
	    pipeline_path = 'spam_deploy_pipeline.pkl'
	    threshold_path = 'spam_threshold.pkl'
	    # joblib.load unpickles its input; these must remain trusted, app-owned files.
	    return joblib.load(pipeline_path), joblib.load(threshold_path)

	# Fetch the model pipeline and decision threshold; both are used by the
	# prediction step further down.
	pipe, thresh = load_models()

	# ———————————————————————————
	# 3) Streamlit UI
	# ———————————————————————————
	# NOTE(review): indentation was lost in this copy of the file, so the exact
	# nesting of the statements below (what sits inside the form, and whether the
	# divider/caption belong to the `if submitted:` branch) must be confirmed
	# against the original app.py.

	# Page heading and short usage instructions.
	st.title("📧 E-mail Spam Detector")
	st.markdown(
	"Enter an e-mail Subject and Body below, then click Predict "
	"to see the spam probability and label."
	)

	# Input form: subject line, body text, and the submit button whose return
	# value gates the prediction step.
	with st.form("input_form"):
	subj = st.text_input("Subject / Title")
	body = st.text_area("Body text", height=200)
	submitted = st.form_submit_button("Predict")

	if submitted:
	# 1) featurize: build the one-row feature frame from the two inputs
	X = featurize(subj, body)

	# 2) predict: take row 0, column 1 of predict_proba (presumably the
	#    spam-class probability; verify against the pipeline's class order)
	#    and compare it with the threshold; `>=` makes the cut-off inclusive
	proba = pipe.predict_proba(X)[0, 1]
	is_spam = (proba >= thresh)

	# 3) display: probability as a percentage with one decimal, then the label
	st.metric("Spam probability", f"{proba:.1%}")
	if is_spam:
	st.subheader("🚫 SPAM")
	st.warning("This message is classified as spam. Proceed with caution!")
	else:
	st.subheader("✅ Not Spam")
	st.success("This message looks clean.")

	# Footer: show the threshold in use, formatted to two decimals.
	st.divider()
	st.caption(f"Decision threshold (F₂-optimized): {thresh:.2f}")