Spaces:
Sleeping
Sleeping
File size: 2,022 Bytes
f75d9fd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import preprocess
def train(dataset_path: str = "dataset.csv",
          model_path: str = "sentiment_model_best.pkl") -> None:
    """Train a TF-IDF + LinearSVC text classifier and save the best model.

    Reads a CSV with a ``text`` column and a ``label`` column, cleans the
    text with ``preprocess.preprocess_text``, holds out 20% of the rows for
    evaluation, tunes the pipeline with a 3-fold grid search, prints the
    accuracy and classification report on the held-out rows, and dumps the
    best estimator with ``joblib``.

    Problems with the dataset file (not found, empty, missing a required
    column, no labelled rows) are reported on stdout and the function
    returns early instead of raising.

    Args:
        dataset_path: Path of the CSV file to train on.
        model_path: Path the fitted pipeline is written to.
    """
    print("Loading dataset...")
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: {dataset_path} not found.")
        return
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header row for pandas to parse.
        print(f"Error: {dataset_path} is empty.")
        return

    # Fail with a readable message rather than a bare KeyError further down.
    missing_columns = [col for col in ("text", "label") if col not in df.columns]
    if missing_columns:
        print(
            f"Error: {dataset_path} is missing required column(s): "
            f"{', '.join(missing_columns)}"
        )
        return

    print("Preprocessing data...")
    # Rows without a label cannot be used for supervised training, and a NaN
    # in the target would make the estimator's fit() reject the whole set.
    labelled = df.dropna(subset=["label"])
    if labelled.empty:
        print(f"Error: {dataset_path} contains no labelled rows.")
        return

    # Fill NaN with empty string just in case, and coerce any non-string
    # cells (e.g. numbers) so the preprocessor always receives a str.
    raw_text = labelled["text"].fillna("").astype(str)
    X = raw_text.apply(preprocess.preprocess_text)
    y = labelled["label"]

    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("Setting up pipeline and grid search...")
    # Pipeline: TF-IDF -> LinearSVC (often best for text)
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC(dual="auto")),
    ])

    # Parameters to tune
    param_grid = {
        "tfidf__ngram_range": [(1, 1), (1, 2)],  # Unigrams or Bigrams
        "tfidf__max_df": [0.9, 1.0],
        "clf__C": [0.1, 1, 10],
    }

    # Grid Search for best accuracy
    grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

    print("Training model...")
    grid_search.fit(X_train, y_train)
    print(f"Best Parameters: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_

    print("Evaluating model...")
    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    print("Saving model...")
    joblib.dump(best_model, model_path)
    print(f"Model saved to {model_path}")
# Script entry point: run the training workflow only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train()
|