Spaces:

Anwaree
/

message-classification

Runtime error

App Files Files Community

message-classification / train_spam_model.py

Anwaree

Create train_spam_model.py

d2cd1b6 verified 4 months ago

raw

history blame contribute delete

3.41 kB

	# -----------------------------
	# 1️⃣ Library imports
	# -----------------------------
	import pandas as pd
	import re
	import joblib
	import nltk
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.model_selection import train_test_split
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
	from imblearn.over_sampling import SMOTE

	# Download the NLTK stopwords corpus only when it is not installed yet,
	# so repeated runs do not have to contact the NLTK download server.
	try:
	    nltk.data.find('corpora/stopwords')
	except LookupError:
	    nltk.download('stopwords')

	# -----------------------------
	# 2️⃣ Message preprocessing
	# -----------------------------
	# English stopword set and Porter stemmer used by preprocess_message.
	stop_words = set(stopwords.words('english'))
	stemmer = PorterStemmer()

	def preprocess_message(text):
	    """Normalize a raw message into a space-joined string of stemmed tokens.

	    The text is lowercased; URLs, e-mail addresses, phone-like digit runs
	    and any remaining digits are removed; every character other than a-z,
	    whitespace and the punctuation ``! / + >`` is dropped; finally English
	    stopwords are filtered out and the remaining words are Porter-stemmed.

	    Args:
	        text: Raw message. Missing values (``None``/NaN) yield ``""``;
	            other non-string scalars are converted with ``str()``.

	    Returns:
	        The cleaned message as a single string (possibly empty).
	    """
	    if pd.isna(text):
	        return ""
	    # Coerce first so numeric cells (e.g. a message parsed as an int) do not
	    # fail on .lower().
	    text = str(text).lower()
	    # Bare ``|`` alternation: an escaped ``\|`` only matches a literal pipe,
	    # which would leave ordinary URLs in the text.
	    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
	    text = re.sub(r'\S+@\S+', '', text)  # remove e-mail addresses
	    text = re.sub(r'\+?\d[\d -]{8,}\d', '', text)  # remove phone-like numbers
	    text = re.sub(r'\d+', '', text)  # remove remaining digits
	    text = re.sub(r'[^a-z\s!/+>]', '', text)  # keep punctuation useful for spam
	    words = [stemmer.stem(word) for word in text.split() if word not in stop_words]
	    return " ".join(words)

	# -----------------------------
	# 3️⃣ Load the data
	# -----------------------------
	# data.csv must provide the columns "Message" and "Category" ('spam'/'ham')
	data = pd.read_csv("data.csv").assign(
	    cleaned=lambda frame: frame['Message'].apply(preprocess_message)
	)

	# Features are the cleaned messages, targets are the category labels.
	X, y = data['cleaned'], data['Category']

	# Stratified train/test split (80% / 20%)
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    stratify=y,
	    test_size=0.2,
	    random_state=42,
	)

	# -----------------------------
	# 4️⃣ TF-IDF vectorization
	# -----------------------------
	tfidf = TfidfVectorizer(
	    max_features=5000,
	    min_df=2,
	    max_df=0.95,
	    ngram_range=(1, 2),
	    # A token is either a whole word or one of the punctuation marks kept by
	    # preprocessing. The alternation must be a bare ``|``: an escaped ``\|``
	    # would require a literal pipe between a word and a punctuation mark.
	    token_pattern=r'(?u)\b\w+\b|[!/+>]',
	)
	# Fit the vocabulary on the training split only, then reuse it for the test split.
	X_train_tfidf = tfidf.fit_transform(X_train)
	X_test_tfidf = tfidf.transform(X_test)

	# -----------------------------
	# 5️⃣ Class balancing with SMOTE
	# -----------------------------
	# Only the training matrix is resampled.
	smote = SMOTE(random_state=42)
	X_train_balanced, y_train_balanced = smote.fit_resample(
	    X_train_tfidf,
	    y_train,
	)

	# -----------------------------
	# 6️⃣ Train the Logistic Regression model
	# -----------------------------
	# fit() returns the estimator itself, so construction and training chain.
	model = LogisticRegression(max_iter=1000, random_state=42).fit(
	    X_train_balanced,
	    y_train_balanced,
	)

	# -----------------------------
	# 7️⃣ Quick evaluation
	# -----------------------------
	y_pred = model.predict(X_test_tfidf)
	print("Classification Report:\n", classification_report(y_test, y_pred))
	print("Matrice de confusion:\n", confusion_matrix(y_test, y_pred))
	accuracy = accuracy_score(y_test, y_pred)
	print(f"Accuracy: {accuracy:.4f}")

	if hasattr(model, 'predict_proba'):
	    # Look up the 'spam' probability column in model.classes_ instead of
	    # assuming it is column 1, so the AUC does not depend on label ordering.
	    class_labels = list(model.classes_)
	    if 'spam' in class_labels:
	        spam_column = class_labels.index('spam')
	        y_test_binary = (y_test == 'spam').astype(int)
	        auc = roc_auc_score(
	            y_test_binary,
	            model.predict_proba(X_test_tfidf)[:, spam_column],
	        )
	        print(f"AUC-ROC: {auc:.4f}")
	    else:
	        # Without a 'spam' class there is no positive label to score against.
	        print("AUC-ROC skipped: label 'spam' not found in model classes")

	# -----------------------------
	# 8️⃣ Save the model and the TF-IDF vectorizer
	# -----------------------------
	# Persist both fitted objects in the same order as before: model first, vectorizer second.
	for _artifact, _filename in (
	    (model, "spam_model.pkl"),
	    (tfidf, "tfidf_vectorizer.pkl"),
	):
	    joblib.dump(_artifact, _filename)
	print("✅ Modèle Logistic Regression et TF-IDF vectorizer sauvegardés avec succès !")