# -----------------------------
# 1️⃣ Library imports
# -----------------------------
import re

import joblib
import nltk
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

# -----------------------------
# 2️⃣ Message preprocessing
# -----------------------------
# Download the stopwords corpus only when it is actually missing, instead of
# calling nltk.download() on every run.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

stemmer = PorterStemmer()

# Patterns are compiled once because preprocess_message runs for every row.
_URL_RE = re.compile(r'http\S+|www\S+')
_EMAIL_RE = re.compile(r'\S+@\S+')
_PHONE_RE = re.compile(r'\+?\d[\d -]{8,}\d')
_DIGITS_RE = re.compile(r'\d+')
_DISALLOWED_RE = re.compile(r'[^a-z\s!/+>]')


def preprocess_message(text) -> str:
    """Normalize one raw message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; strip URLs, e-mail addresses, phone-like
    digit runs and any remaining digits; drop every character other than
    a-z, whitespace and the symbols ``! / + >``; split on whitespace;
    remove English stop words; Porter-stem what is left.

    Args:
        text: The raw message. Missing values (as detected by ``pd.isna``)
            yield an empty string; any other non-string value is converted
            with ``str()`` before processing.

    Returns:
        The cleaned message, tokens separated by single spaces.
    """
    if pd.isna(text):
        return ""

    # str() guards against non-string cells (e.g. a purely numeric message).
    text = str(text).lower()
    text = _URL_RE.sub('', text)         # remove URLs
    text = _EMAIL_RE.sub('', text)       # remove e-mail addresses
    text = _PHONE_RE.sub('', text)       # remove phone-like numbers
    text = _DIGITS_RE.sub('', text)      # remove remaining digits
    text = _DISALLOWED_RE.sub('', text)  # keep letters plus ! / + >

    # NOTE(review): tokens are split on whitespace only, so a word with
    # attached punctuation (e.g. "the!") is compared to the stop-word list
    # and stemmed as a whole token -- confirm this is intended.
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return " ".join(words)


# -----------------------------
# 3️⃣ Load the data
# -----------------------------
# data must have the columns "Message" and "Category" ('spam'/'ham')
data = pd.read_csv("data.csv")
data['cleaned'] = data['Message'].apply(preprocess_message)

X = data['cleaned']
y = data['Category']

# Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 4️⃣ TF-IDF vectorization
# -----------------------------
tfidf = TfidfVectorizer(
    max_features=5000,
    min_df=2,
    max_df=0.95,
    ngram_range=(1, 2),
    token_pattern=r'(?u)\b\w+\b|[!/+>]'  # capture words and the kept punctuation symbols
)

# Fit on the training split only; the test split is transformed with the
# vocabulary learned from training data.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -----------------------------
# 5️⃣ Class balancing with SMOTE
# -----------------------------
# Resampling is applied to the training features only.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_tfidf, y_train)

# -----------------------------
# 6️⃣ Train the Logistic Regression model
# -----------------------------
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_balanced, y_train_balanced)

# -----------------------------
# 7️⃣ Quick evaluation
# -----------------------------
y_pred = model.predict(X_test_tfidf)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Matrice de confusion:\n", confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

if hasattr(model, 'predict_proba'):
    y_test_binary = (y_test == 'spam').astype(int)
    # Look up the 'spam' probability column from the fitted class order
    # rather than assuming it is column 1.
    spam_column = list(model.classes_).index('spam')
    auc = roc_auc_score(y_test_binary, model.predict_proba(X_test_tfidf)[:, spam_column])
    print(f"AUC-ROC: {auc:.4f}")

# -----------------------------
# 8️⃣ Save the model and the TF-IDF vectorizer
# -----------------------------
joblib.dump(model, "spam_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("✅ Modèle Logistic Regression et TF-IDF vectorizer sauvegardés avec succès !")