Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import joblib | |
| from sklearn.model_selection import train_test_split, GridSearchCV | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.svm import LinearSVC | |
| from sklearn.metrics import classification_report, accuracy_score | |
| import preprocess | |
def train(dataset_path="dataset.csv", model_path="sentiment_model_best.pkl"):
    """Train a TF-IDF + LinearSVC text classifier and save the best model.

    Reads a CSV that must contain ``text`` and ``label`` columns, cleans the
    text with ``preprocess.preprocess_text``, holds out 20% of the rows for
    evaluation, grid-searches the pipeline with 3-fold cross-validation on
    the remaining rows, prints the held-out accuracy and classification
    report, and dumps the best estimator with joblib.

    Unusable input (missing file, empty file, missing columns, no labelled
    rows) is reported on stdout and the function returns without training.

    Args:
        dataset_path: Path of the CSV file to read.
        model_path: Path the fitted pipeline is written to.

    Returns:
        None.
    """
    print("Loading dataset...")
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: {dataset_path} not found.")
        return
    except pd.errors.EmptyDataError:
        print(f"Error: {dataset_path} is empty.")
        return

    # Report a malformed file the same way as a missing one rather than
    # failing later with a bare KeyError.
    missing_columns = [col for col in ("text", "label") if col not in df.columns]
    if missing_columns:
        print(f"Error: {dataset_path} is missing column(s): {', '.join(missing_columns)}")
        return

    # A row without a label cannot be used for supervised training and
    # would make the classifier's fit() fail.
    has_label = df['label'].notna()
    dropped = int((~has_label).sum())
    if dropped:
        print(f"Dropping {dropped} row(s) with a missing label.")
        df = df[has_label].copy()
    if df.empty:
        print(f"Error: {dataset_path} has no labelled rows.")
        return

    print("Preprocessing data...")
    # Fill NaN with empty string just in case
    df['text'] = df['text'].fillna('')
    df['clean_text'] = df['text'].apply(preprocess.preprocess_text)
    X = df['clean_text']
    y = df['label']

    print("Splitting data...")
    # Fixed random_state keeps the train/test split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Setting up pipeline and grid search...")
    # Pipeline: TF-IDF -> LinearSVC (often best for text)
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC(dual='auto')),
    ])
    # Parameters to tune
    param_grid = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams or Bigrams
        'tfidf__max_df': [0.9, 1.0],
        'clf__C': [0.1, 1, 10],
    }
    # Grid search over the parameters above using 3-fold cross-validation
    # on the training split only; the test split stays untouched.
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

    print("Training model...")
    grid_search.fit(X_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_

    print("Evaluating model...")
    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    print("Saving model...")
    joblib.dump(best_model, model_path)
    print(f"Model saved to {model_path}")
if __name__ == "__main__":
    # Run the training workflow only when this file is executed as a
    # script, not when it is imported as a module.
    train()