Spaces:

hansche
/

SocialMediaFoci

Sleeping

Bismark

Update Space

5ab54b7 3 months ago

2.05 kB

	import nltk
	import string
	import re
	import pandas as pd
	import numpy as np
	import joblib
	from nltk.corpus import stopwords
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB
	from sklearn.metrics import accuracy_score, classification_report
	from googletrans import Translator
	from imblearn.over_sampling import SMOTE

	# Fetch the NLTK resources this script requests. Only the stop-word lists are
	# consumed below; no tokenizer call that would need 'punkt' is visible here.
	nltk.download('stopwords')
	nltk.download('punkt')

	# NOTE(review): nothing later in this script references `translator`.
	translator = Translator()

	# Read the training CSV (ISO-8859-1 encoded) and keep only the rows that have
	# both a text and a sentiment value.
	data = pd.read_csv(
	    '/Users/caasidev/development/AI/datasets/train.csv',
	    encoding='ISO-8859-1',
	).dropna(subset=['text', 'sentiment'])

	# English and French stop words merged into a single lookup set.
	stop_words = set(stopwords.words('english')) | set(stopwords.words('french'))

	# Translation table that deletes every ASCII punctuation character in one pass.
	# A regex character class built from string.punctuation without escaping reads
	# the "\]" pair as an escaped bracket, so backslashes would never be removed.
	_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


	def clean_text(text, stopword_set=None):
	    """Normalize one raw text value for vectorization.

	    The text is lower-cased, stripped of ASCII punctuation, split on
	    whitespace, filtered against a stop-word set, and re-joined with single
	    spaces.

	    Args:
	        text: The raw value to clean. Anything that is not a ``str``
	            (for example the float NaN pandas uses for missing cells, or
	            ``None``) is treated as empty.
	        stopword_set: Optional collection of lower-case words to drop.
	            When omitted, the module-level ``stop_words`` set is used.

	    Returns:
	        The cleaned text, or ``""`` for non-string input.
	    """
	    if not isinstance(text, str):
	        return ""
	    if stopword_set is None:
	        stopword_set = stop_words
	    stripped = text.lower().translate(_PUNCTUATION_TABLE)
	    return " ".join(word for word in stripped.split() if word not in stopword_set)

	# Apply text cleaning
	data['Cleaned_Text'] = data['text'].apply(clean_text)
	y = data['sentiment']

	# Hold out the test set FIRST. Fitting TF-IDF or SMOTE on the full data set
	# leaks test information into training, and SMOTE would also place synthetic
	# rows in the test set, so the reported accuracy would be inflated.
	# Stratifying keeps the class proportions the same in both splits.
	text_train, text_test, y_train, y_test = train_test_split(
	    data['Cleaned_Text'], y, test_size=0.2, random_state=42, stratify=y
	)

	# Learn the vocabulary and IDF weights from the training texts only, then
	# apply the fitted vectorizer unchanged to the held-out texts.
	vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.85, min_df=2, max_features=10000)
	X_train = vectorizer.fit_transform(text_train)
	X_test = vectorizer.transform(text_test)

	# Oversample the minority classes in the training split only
	smote = SMOTE(random_state=42)
	X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

	# Train Naive Bayes on the balanced training data
	model = MultinomialNB(alpha=0.5)
	model.fit(X_resampled, y_resampled)

	# Save model and vectorizer
	joblib.dump(model, "naive_bayes_model.pkl")
	joblib.dump(vectorizer, "tfidf_vectorizer.pkl")
	print("Model and vectorizer saved successfully!")

	# Predictions on the untouched test split (real rows only, no synthetic ones)
	y_pred = model.predict(X_test)

	# Evaluation
	accuracy = accuracy_score(y_test, y_pred)
	print(f"Improved Accuracy: {accuracy * 100:.2f}%")
	print("\nClassification Report:\n", classification_report(y_test, y_pred))