File size: 2,052 Bytes
5ab54b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import nltk
import string
import re
import pandas as pd
import numpy as np
import joblib
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from googletrans import Translator
from imblearn.over_sampling import SMOTE

# Fetch the NLTK resources this script relies on (no-op when already cached).
nltk.download('stopwords')
nltk.download('punkt')

# googletrans client; not referenced by the training steps below.
translator = Translator()

# Load the training CSV and discard rows lacking either the raw text or its
# sentiment label, since neither can be used for supervised training.
data = pd.read_csv(
    '/Users/caasidev/development/AI/datasets/train.csv',
    encoding='ISO-8859-1',
).dropna(subset=['text', 'sentiment'])

# Combined English + French stop-word vocabulary, stored as a set so the
# per-token membership test during cleaning is O(1).
stop_words = set(stopwords.words('english')).union(stopwords.words('french'))

# Translation table that deletes every character in string.punctuation.
# Built once at import time so it is not recomputed for every row.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


# Function to clean text
def clean_text(text):
    """Normalize one raw text value before vectorization.

    The text is lower-cased, every ``string.punctuation`` character is
    removed, and whitespace-separated tokens found in the module-level
    ``stop_words`` set are dropped. Remaining tokens are re-joined with
    single spaces.

    Args:
        text: The raw value from the ``text`` column. Anything that is not
            a ``str`` (e.g. a float NaN from pandas, or ``None``) is treated
            as missing.

    Returns:
        The cleaned string, or ``""`` for missing / non-string input.
    """
    if not isinstance(text, str):
        return ""
    # str.translate removes all punctuation in one pass. A regex character
    # class built from string.punctuation silently skips the backslash,
    # because the embedded "\]" is parsed as an escaped bracket.
    text = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(word for word in text.split() if word not in stop_words)

# Apply text cleaning
data['Cleaned_Text'] = data['text'].apply(clean_text)
y = data['sentiment']

# Train-test split FIRST, on the cleaned text. Splitting after SMOTE leaks
# information: synthetic samples interpolated from rows that end up in the
# test set would be trained on, and the test set itself would contain
# synthetic points, so the reported accuracy would be over-optimistic.
# stratify=y keeps the class proportions the same in both folds.
text_train, text_test, y_train, y_test = train_test_split(
    data['Cleaned_Text'], y, test_size=0.2, random_state=42, stratify=y
)

# Fit TF-IDF on the training text only, so vocabulary and IDF weights are not
# influenced by the held-out rows; the test text is only transformed.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, min_df=2, max_features=10000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Apply SMOTE to the training fold only; the test fold keeps its real samples
# and class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train Naive Bayes on the balanced training data
model = MultinomialNB(alpha=0.5)
model.fit(X_resampled, y_resampled)

# Save model and vectorizer
joblib.dump(model, "naive_bayes_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
print("Model and vectorizer saved successfully!")

# Predictions on the untouched held-out fold
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f"Improved Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))