Spaces:
Build error
Build error
| import re | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.metrics import accuracy_score | |
| import pandas as pd | |
| import warnings | |
| import pickle | |
| warnings.filterwarnings('ignore') | |
# Load the raw CSV (it has no header row; the first column becomes the index),
# keep only the columns labelled 2 and 3, give them descriptive names, and
# discard every row that is missing either value.
df = (
    pd.read_csv('./twitter_sentiment.csv', header=None, index_col=0)[[2, 3]]
    .reset_index(drop=True)
    .rename(columns={2: "sentiments", 3: "text"})
    .dropna()
)
| # Preprocess text | |
# Patterns are compiled once at import time instead of being looked up on
# every call; process_text runs once per row of the dataset.
_URL_RE = re.compile(r'http\S+')
_MENTION_RE = re.compile(r'@[a-zA-Z0-9_]+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')


def process_text(text):
    """Normalise a raw tweet to lowercase ASCII letters and whitespace.

    Steps, in order: lowercase the text, remove URLs (``http`` followed by
    any run of non-whitespace), remove ``@mentions``, then delete every
    character that is not an ASCII letter or whitespace. The last step also
    strips ``#`` so hashtags keep their word (``#Happy`` -> ``happy``).
    Whitespace is left as-is, so removed tokens may leave extra spaces.

    Args:
        text: The raw tweet. ``None`` and float NaN are treated as empty
            text; any other non-string value is converted with ``str()``
            first instead of raising ``AttributeError``.

    Returns:
        The cleaned string (possibly empty).
    """
    # `text != text` is only true for NaN, which is how a missing cell shows
    # up when it has not been dropped beforehand.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    # No separate '#' removal is needed: '#' is not a letter or whitespace,
    # so this final filter already deletes it.
    return _NON_LETTER_RE.sub('', text)
df['clean_text'] = df['text'].apply(process_text)

# Split data
# The split is done on the cleaned text, before vectorisation, so that the
# vocabulary below is learned from the training rows only. Fitting the
# vectorizer on the full corpus would let the held-out rows influence which
# 5000 terms are kept and make the reported accuracy optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], df['sentiments'], test_size=0.2, random_state=42
)

# Vectorize text
count_vectorizer = CountVectorizer(max_features=5000)
X_train = count_vectorizer.fit_transform(text_train)
# The held-out text is only transformed with the already-fitted vocabulary.
X_test = count_vectorizer.transform(text_test)

# Train model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# Write the fitted vectorizer and the trained classifier to disk, one pickle
# file each, in the current working directory.
for artifact_path, artifact in (
    ('count_vectorizer.pkl', count_vectorizer),
    ('nb_classifier.pkl', nb_classifier),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)