sentiment_analysis / train_model.py
nadish1210's picture
Upload 8 files
f75d9fd verified
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import preprocess
def train(
    dataset_path: str = "dataset.csv",
    model_path: str = "sentiment_model_best.pkl",
) -> None:
    """Train, evaluate and save a TF-IDF + LinearSVC text classifier.

    The dataset is read as CSV and must provide a ``text`` column (raw
    input) and a ``label`` column (target class). Each text is passed
    through ``preprocess.preprocess_text``, the data is split 80/20 with a
    fixed seed, a small hyper-parameter grid is searched with 3-fold
    cross-validation, and the best pipeline is scored on the held-out split
    and written to disk with ``joblib``.

    Args:
        dataset_path: Path of the CSV file to train on.
        model_path: Destination file for the fitted pipeline.

    Returns:
        None. Progress, metrics and problems with the dataset (file not
        found, missing columns, no labelled rows) are reported on stdout;
        in the problem cases the function returns early without training
        or writing a model file.
    """
    print("Loading dataset...")
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: {dataset_path} not found.")
        return

    # Fail with a readable message instead of a bare KeyError further down.
    missing_columns = [col for col in ("text", "label") if col not in df.columns]
    if missing_columns:
        print(
            f"Error: {dataset_path} is missing required column(s): "
            f"{', '.join(missing_columns)}"
        )
        return

    print("Preprocessing data...")
    # A row without a label cannot be used for supervised training, and a
    # NaN in the target makes the estimator's fit() raise, so drop such rows.
    has_label = df["label"].notna()
    n_unlabelled = int((~has_label).sum())
    if n_unlabelled:
        print(f"Dropping {n_unlabelled} row(s) with a missing label.")
        df = df[has_label].copy()
    if df.empty:
        print(f"Error: {dataset_path} contains no labelled rows.")
        return

    # Fill NaN with empty string so the preprocessing step always gets a str.
    df["text"] = df["text"].fillna("")
    df["clean_text"] = df["text"].apply(preprocess.preprocess_text)
    X = df["clean_text"]
    y = df["label"]

    print("Splitting data...")
    # Fixed seed keeps the train/test partition reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("Setting up pipeline and grid search...")
    # Pipeline: TF-IDF features feeding a linear SVM.
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC(dual="auto")),
    ])
    # Parameters to tune; keys use the "<step>__<param>" pipeline convention.
    param_grid = {
        "tfidf__ngram_range": [(1, 1), (1, 2)],  # unigrams, or unigrams + bigrams
        "tfidf__max_df": [0.9, 1.0],
        "clf__C": [0.1, 1, 10],
    }
    # No explicit scoring is given, so the estimator's own score() is used,
    # which for a classifier is mean accuracy.
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

    print("Training model...")
    grid_search.fit(X_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_

    print("Evaluating model...")
    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    print("Saving model...")
    joblib.dump(best_model, model_path)
    print(f"Model saved to {model_path}")
# Run the training routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train()