Spaces:
Runtime error
Runtime error
| # ----------------------------- | |
| # 1️⃣ Import des librairies | |
| # ----------------------------- | |
| import pandas as pd | |
| import re | |
| import joblib | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.stem import PorterStemmer | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score | |
| from imblearn.over_sampling import SMOTE | |
# Load the English stop-word list. The NLTK corpus is downloaded only when it
# is not already installed, so a run with the corpus present needs no network.
try:
    _english_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    _english_stopwords = stopwords.words('english')

# -----------------------------
# 2️⃣ Message preprocessing
# -----------------------------
# Kept as a set for constant-time membership tests in preprocess_message.
stop_words = set(_english_stopwords)
stemmer = PorterStemmer()
def preprocess_message(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Steps, in order: lowercase; strip URLs, e-mail addresses, phone-like
    digit runs and any remaining digits; drop every character other than
    ``a-z``, whitespace and ``! / + >``; remove English stop words; apply
    Porter stemming to what is left.

    Args:
        text: The message to clean. Missing values (as detected by
            ``pd.isna``) yield an empty string; other non-string values
            are converted with ``str()`` before cleaning.

    Returns:
        The cleaned tokens joined by single spaces ("" for missing input).
    """
    if pd.isna(text):
        return ""
    # Non-string scalars (e.g. a numeric cell) have no .lower(); coerce them
    # instead of raising AttributeError.
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    text = re.sub(r'\S+@\S+', '', text)  # remove e-mail addresses
    # Phone-like runs go first so their spaces/dashes disappear with them.
    text = re.sub(r'\+?\d[\d -]{8,}\d', '', text)
    text = re.sub(r'\d+', '', text)  # remove remaining digits
    # Keep letters, whitespace and the punctuation marks ! / + >
    text = re.sub(r'[^a-z\s!/+>]', '', text)
    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    return " ".join(words)
# -----------------------------
# 3️⃣ Load the data
# -----------------------------
# The CSV must provide the columns "Message" and "Category" ('spam'/'ham').
data = pd.read_csv("data.csv")
data['cleaned'] = data['Message'].map(preprocess_message)

# Features are the cleaned messages; targets are the category labels.
X, y = data['cleaned'], data['Category']

# Stratified train/test split (20% held out for testing).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)
# -----------------------------
# 4️⃣ TF-IDF vectorization
# -----------------------------
tfidf_options = {
    "max_features": 5000,
    "min_df": 2,
    "max_df": 0.95,
    "ngram_range": (1, 2),
    # Tokens are whole words plus the punctuation marks ! / + >
    "token_pattern": r'(?u)\b\w+\b|[!/+>]',
}
tfidf = TfidfVectorizer(**tfidf_options)

# The vocabulary is fitted on the training split; the test split is only
# transformed with it.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
# -----------------------------
# 5️⃣ Class balancing with SMOTE
# -----------------------------
# Resampling is applied to the training matrix only.
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(
    X_train_tfidf,
    y_train,
)

# -----------------------------
# 6️⃣ Train the Logistic Regression model
# -----------------------------
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_balanced,
    y_train_balanced,
)
# -----------------------------
# 7️⃣ Quick evaluation
# -----------------------------
# Predict on the held-out test matrix and print the standard metrics.
y_pred = model.predict(X_test_tfidf)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Matrice de confusion:\n", confusion_matrix(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

if hasattr(model, 'predict_proba'):
    # AUC needs binary targets: 1 for 'spam', 0 for everything else.
    y_test_binary = (y_test == 'spam').astype(int)
    # predict_proba columns follow model.classes_; look up the 'spam' column
    # instead of assuming it is at index 1.
    spam_column = list(model.classes_).index('spam')
    spam_scores = model.predict_proba(X_test_tfidf)[:, spam_column]
    auc = roc_auc_score(y_test_binary, spam_scores)
    print(f"AUC-ROC: {auc:.4f}")
# -----------------------------
# 8️⃣ Save the model and the TF-IDF vectorizer
# -----------------------------
# Persist each fitted object to its own file in the working directory.
for fitted_object, output_path in (
    (model, "spam_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
):
    joblib.dump(fitted_object, output_path)

print("✅ Modèle Logistic Regression et TF-IDF vectorizer sauvegardés avec succès !")