"""Train a TF-IDF + LinearSVC text classifier and save the best model.

Run as a script to train on ``dataset.csv`` and write
``sentiment_model_best.pkl`` into the current working directory.
"""

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

import preprocess


def train(
    dataset_path: str = "dataset.csv",
    model_path: str = "sentiment_model_best.pkl",
) -> None:
    """Fit a grid-searched TF-IDF + LinearSVC pipeline and save the best one.

    The CSV at ``dataset_path`` must provide a ``text`` column (raw
    documents) and a ``label`` column (targets). Missing texts are treated
    as empty strings; rows without a label are dropped because they cannot
    be used for supervised training. Progress, the best hyper-parameters,
    and the hold-out metrics are printed to stdout.

    Args:
        dataset_path: Path of the CSV file to train on.
        model_path: Path the best estimator is written to with ``joblib``.

    Returns:
        None. Problems with the input data (file not found, empty file,
        missing columns, no labeled rows) are reported on stdout and the
        function returns early without writing a model.
    """
    print("Loading dataset...")
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: {dataset_path} not found.")
        return
    except pd.errors.EmptyDataError:
        # read_csv raises this when the file has no columns to parse.
        print(f"Error: {dataset_path} is empty.")
        return

    # Fail with a readable message instead of a KeyError further down.
    missing_columns = [col for col in ("text", "label") if col not in df.columns]
    if missing_columns:
        print(
            f"Error: {dataset_path} is missing required column(s): "
            f"{', '.join(missing_columns)}"
        )
        return

    print("Preprocessing data...")
    # A row without a label has no target to learn from; drop it up front
    # rather than letting the NaN reach the split / estimator.
    labeled = df.dropna(subset=["label"])
    dropped = len(df) - len(labeled)
    if dropped:
        print(f"Warning: dropped {dropped} row(s) with a missing label.")
    if labeled.empty:
        print(f"Error: {dataset_path} contains no labeled rows.")
        return

    # Fill NaN with empty string just in case, then clean every document.
    # Working on derived Series leaves the loaded DataFrame unmodified.
    X = labeled["text"].fillna("").apply(preprocess.preprocess_text)
    y = labeled["label"]

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("Setting up pipeline and grid search...")
    # Pipeline: TF-IDF -> LinearSVC (often best for text)
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC(dual="auto")),
    ])

    # Parameters to tune
    param_grid = {
        "tfidf__ngram_range": [(1, 1), (1, 2)],  # Unigrams or Bigrams
        "tfidf__max_df": [0.9, 1.0],
        "clf__C": [0.1, 1, 10],
    }

    # Grid Search for best accuracy
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

    print("Training model...")
    grid_search.fit(X_train, y_train)

    print(f"Best Parameters: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_

    print("Evaluating model...")
    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    print("Saving model...")
    joblib.dump(best_model, model_path)
    print(f"Model saved to {model_path}")


if __name__ == "__main__":
    train()