# SocialMediaFoci / test.py
# Author: Bismark
# Commit: Update Space (5ab54b7)
import nltk
import string
import re
import pandas as pd
import numpy as np
import joblib
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from googletrans import Translator
from imblearn.over_sampling import SMOTE
# Download the NLTK resources requested by this script.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# Module-level googletrans client.
translator = Translator()

# Load the training CSV, decoding it as ISO-8859-1.
data = pd.read_csv(
    '/Users/caasidev/development/AI/datasets/train.csv',
    encoding='ISO-8859-1',
)

# Discard rows that lack either the text or its sentiment label.
required_columns = ['text', 'sentiment']
data = data.dropna(subset=required_columns)

# Combined English + French stop-word set used by the text cleaner.
stop_words = set(stopwords.words('english')).union(stopwords.words('french'))
# Function to clean text
# Translation table that deletes every character of string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize a raw text value before vectorization.

    The text is lower-cased, every character listed in
    ``string.punctuation`` is deleted, and whitespace-separated tokens that
    appear in the module-level ``stop_words`` set are dropped.

    Args:
        text: The raw value to clean. Any value that is not a ``str``
            (such as a float NaN from a missing cell) is treated as empty.

    Returns:
        The remaining tokens joined by single spaces, or ``""`` when the
        input is not a string.
    """
    if not isinstance(text, str):
        return ""
    # str.translate deletes each punctuation character literally. An
    # unescaped regex character class built from string.punctuation reads
    # "\]" as an escaped bracket, which leaves backslashes in the text.
    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(word for word in stripped.split() if word not in stop_words)
# Apply text cleaning
data['Cleaned_Text'] = data['text'].apply(clean_text)
y = data['sentiment']

# Split the cleaned text FIRST. Fitting the vectorizer or SMOTE on the full
# dataset would leak information from the held-out rows into training (and
# put synthetic samples into the test set), inflating the reported scores.
# Stratifying keeps the class proportions the same in both partitions.
text_train, text_test, y_train, y_test = train_test_split(
    data['Cleaned_Text'],
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit TF-IDF on the training text only; the test text is only transformed.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, min_df=2, max_features=10000)
X_train_tfidf = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Oversample the minority classes in the training partition only, so the
# test set contains nothing but real, unseen samples.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train_tfidf, y_train)

# Train Naive Bayes
model = MultinomialNB(alpha=0.5)
model.fit(X_train, y_train)

# Save model and vectorizer
joblib.dump(model, "naive_bayes_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("Model and vectorizer saved successfully!")

# Predictions on the untouched hold-out set
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Improved Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))