# Anwaree's picture
# train_spam_model.py
# 483e6f5 verified
import gradio as gr
import joblib
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
# -----------------------------
# 1️⃣ Preprocessing
# -----------------------------
# Fetch the NLTK stop-word corpus at import time so stopwords.words() below
# can find it.
nltk.download("stopwords")

# Shared by preprocess_message: a Porter stemmer and the English stop-word
# list held as a set for constant-time membership tests.
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))
def preprocess_message(text):
    """Normalize a raw message before it is vectorized for prediction.

    The text is lower-cased, then URLs, e-mail addresses, phone-like digit
    sequences and any remaining digits are removed. Every character other
    than ``a-z``, whitespace and the punctuation ``! / + >`` (kept because it
    is considered typical of spam) is dropped. Finally, tokens found in the
    module-level ``stop_words`` set are discarded and the rest are reduced
    with the module-level ``stemmer``.

    Args:
        text: The message to clean. A missing value (as judged by
            ``pd.isna``) is accepted.

    Returns:
        The stemmed tokens joined by single spaces, or ``""`` when ``text``
        is missing.
    """
    if pd.isna(text):
        return ""

    cleaned = text.lower()

    # The substitutions are order-sensitive: phone-like sequences must be
    # removed before the generic digit pattern, and the character whitelist
    # runs last.
    removal_patterns = (
        r'http\S+|www\S+',       # URLs
        r'\S+@\S+',              # e-mail addresses
        r'\+?\d[\d -]{8,}\d',    # phone-like digit sequences
        r'\d+',                  # any remaining digits
        r'[^a-z\s!/+>]',         # everything outside a-z, whitespace, ! / + >
    )
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    return " ".join(
        stemmer.stem(token)
        for token in cleaned.split()
        if token not in stop_words
    )
# -----------------------------
# 2️⃣ Model loading
# -----------------------------
# Both artifacts are read once at import time from the current working
# directory.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files
# from a trusted source.
vectorizer = joblib.load("tfidf_vectorizer.pkl")
model = joblib.load("spam_model.pkl")
# -----------------------------
# 3️⃣ Prediction function
# -----------------------------
def predict_message(message):
    """Classify a single message and report the model's verdict.

    The message is cleaned with ``preprocess_message``, turned into features
    by the module-level ``vectorizer`` and scored by the module-level
    ``model``.

    Args:
        message: Raw message text to classify.

    Returns:
        A dict with three keys:

        * ``"Message"`` - the input, unchanged.
        * ``"Prediction"`` - the first element returned by ``model.predict``,
          converted to a native Python value.
        * ``"Spam Probability"`` - column 1 of ``model.predict_proba`` for
          this message, rounded to 4 decimals, or ``None`` when the model
          does not expose ``predict_proba``.
    """
    cleaned = preprocess_message(message)
    features = vectorizer.transform([cleaned])

    prediction = model.predict(features)[0]
    # model.predict yields NumPy scalars; unwrap them so the returned dict
    # holds plain Python values and can be JSON-encoded without relying on
    # NumPy-aware serializers.
    if hasattr(prediction, "item"):
        prediction = prediction.item()

    spam_probability = None
    if hasattr(model, "predict_proba"):
        # NOTE(review): column 1 is assumed to be the spam class — confirm
        # against model.classes_ for the loaded model.
        spam_probability = round(float(model.predict_proba(features)[0][1]), 4)

    return {
        "Message": message,
        "Prediction": prediction,
        "Spam Probability": spam_probability,
    }
# -----------------------------
# 4️⃣ Gradio interface
# -----------------------------
# Three-line text input for the message to classify.
_message_input = gr.Textbox(
    lines=3,
    placeholder="Entrez votre message ici...",
)

# Wire the text box to predict_message; the returned dict is shown as JSON.
iface = gr.Interface(
    fn=predict_message,
    inputs=_message_input,
    outputs="json",
    title="📩 Spam Detector",
    description="Un modèle ML qui détecte si un message est SPAM ou HAM.",
)

# Start the web UI only when the file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()