"""Train a TF-IDF + Multinomial Naive Bayes sentiment classifier.

Pipeline: load the CSV, clean the text, split into train/test, fit the
TF-IDF vectorizer on the training fold, balance the training fold with
SMOTE, train the model, persist model + vectorizer, and report metrics on
the untouched test fold.
"""

import re
import string
from functools import lru_cache

import joblib
import nltk
import numpy as np
import pandas as pd
from googletrans import Translator
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Default locations; pass a different dataset path to main() to override.
DATASET_PATH = '/Users/caasidev/development/AI/datasets/train.csv'
MODEL_PATH = "naive_bayes_model.pkl"
VECTORIZER_PATH = "tfidf_vectorizer.pkl"

# string.punctuation contains regex metacharacters (\, ], ^, -); without
# re.escape the backslash is swallowed as an escape and never stripped.
_PUNCT_RE = re.compile(f"[{re.escape(string.punctuation)}]")

# NOTE(review): not used by the training pipeline below; kept in case other
# code relies on this module-level name.
translator = Translator()


@lru_cache(maxsize=1)
def _load_stop_words():
    """Return the combined English + French NLTK stop-word set.

    The NLTK resources are fetched on first use (instead of at import time)
    and the resulting set is cached for subsequent calls.
    """
    nltk.download('stopwords')
    nltk.download('punkt')
    return frozenset(stopwords.words('english') + stopwords.words('french'))


def clean_text(text):
    """Normalise a raw text value for vectorisation.

    Lower-cases the text, drops stop words and removes all ASCII
    punctuation. Tokens are produced by whitespace splitting.

    Args:
        text: The raw value from the dataset. Any non-string value
            (e.g. NaN floats, None) is treated as missing.

    Returns:
        The cleaned text with tokens separated by single spaces, or an
        empty string for missing/non-string input.
    """
    if not isinstance(text, str):
        return ""

    stop_words = _load_stop_words()
    kept = []
    for token in text.lower().split():
        # Compare against the stop list *before* removing inner punctuation
        # so contractions such as "don't" match their stop-word entry; only
        # surrounding punctuation ("don't," / "(the)") is ignored here.
        if token.strip(string.punctuation) in stop_words:
            continue
        word = _PUNCT_RE.sub("", token)
        # Second check covers tokens that only become stop words once inner
        # punctuation is removed; empty results (pure punctuation) are dropped.
        if word and word not in stop_words:
            kept.append(word)
    return " ".join(kept)


def main(dataset_path=DATASET_PATH):
    """Train, persist and evaluate the sentiment model.

    Args:
        dataset_path: CSV file with at least 'text' and 'sentiment' columns.
    """
    # Load dataset
    data = pd.read_csv(dataset_path, encoding='ISO-8859-1')

    # Drop missing values
    data = data.dropna(subset=['text', 'sentiment'])

    # Apply text cleaning
    data['Cleaned_Text'] = data['text'].apply(clean_text)

    # Split FIRST so that neither the TF-IDF statistics nor the SMOTE
    # synthetic samples can leak information from the test fold into
    # training. Stratify to keep the (imbalanced) class ratios in both folds.
    text_train, text_test, y_train, y_test = train_test_split(
        data['Cleaned_Text'],
        data['sentiment'],
        test_size=0.2,
        random_state=42,
        stratify=data['sentiment'],
    )

    # Fit the vocabulary / IDF weights on training text only.
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2), max_df=0.85, min_df=2, max_features=10000
    )
    X_train = vectorizer.fit_transform(text_train)
    X_test = vectorizer.transform(text_test)

    # Oversample the training fold only; the test fold stays real data so
    # the reported metrics reflect performance on genuine samples.
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

    # Train Naive Bayes
    model = MultinomialNB(alpha=0.5)
    model.fit(X_train_balanced, y_train_balanced)

    # Save model and vectorizer
    joblib.dump(model, MODEL_PATH)
    joblib.dump(vectorizer, VECTORIZER_PATH)
    print("Model and vectorizer saved successfully!")

    # Predictions
    y_pred = model.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Improved Accuracy: {accuracy * 100:.2f}%")
    print("\nClassification Report:\n",
          classification_report(y_test, y_pred))


if __name__ == "__main__":
    main()