Spaces:

nadish1210
/

sentiment_analysis

Sleeping

App Files Files Community

sentiment_analysis / train_model.py

nadish1210

Upload 8 files

f75d9fd verified 2 months ago

raw

history blame contribute delete

2.02 kB

	import pandas as pd
	import joblib
	from sklearn.model_selection import train_test_split, GridSearchCV
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.pipeline import Pipeline
	from sklearn.svm import LinearSVC
	from sklearn.metrics import classification_report, accuracy_score
	import preprocess

	def train(dataset_path: str = "dataset.csv",
	          model_path: str = "sentiment_model_best.pkl") -> None:
	    """Train, evaluate and save a TF-IDF + LinearSVC sentiment classifier.

	    The dataset is read from a CSV file that must contain a ``text`` column
	    (raw input) and a ``label`` column (target class). Text is cleaned with
	    ``preprocess.preprocess_text``, the data is split 80/20, and a grid
	    search with 3-fold cross-validation tunes the n-gram range, ``max_df``
	    and the SVM regularisation strength ``C``. The best pipeline is scored
	    on the held-out split and written to disk with ``joblib``.

	    Args:
	        dataset_path: Path of the CSV file to train on.
	        model_path: Destination path of the serialized best pipeline.

	    Returns:
	        None. Progress and metrics are printed to stdout. If the dataset
	        file does not exist, an error is printed and the function returns
	        without training.

	    Raises:
	        KeyError: If the ``text`` or ``label`` column is missing.
	    """
	    print("Loading dataset...")
	    try:
	        df = pd.read_csv(dataset_path)
	    except FileNotFoundError:
	        print(f"Error: {dataset_path} not found.")
	        return

	    print("Preprocessing data...")
	    # A row without a label cannot be used for supervised training, and NaN
	    # targets make the classifier's fit() fail, so drop such rows up front.
	    labelled = df.dropna(subset=['label'])
	    dropped = len(df) - len(labelled)
	    if dropped:
	        print(f"Dropped {dropped} rows with missing labels.")
	    df = labelled.copy()

	    # Missing text becomes an empty string; astype(str) additionally guards
	    # against cells that pandas parsed as numbers rather than strings.
	    df['text'] = df['text'].fillna('').astype(str)
	    df['clean_text'] = df['text'].apply(preprocess.preprocess_text)

	    X = df['clean_text']
	    y = df['label']

	    print("Splitting data...")
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=0.2, random_state=42
	    )

	    print("Setting up pipeline and grid search...")
	    # Pipeline: TF-IDF -> LinearSVC (often best for text).
	    # random_state makes the solver reproducible, matching the seeded split.
	    pipeline = Pipeline([
	        ('tfidf', TfidfVectorizer()),
	        ('clf', LinearSVC(dual='auto', random_state=42)),
	    ])

	    # Parameters to tune
	    param_grid = {
	        'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams, or unigrams + bigrams
	        'tfidf__max_df': [0.9, 1.0],
	        'clf__C': [0.1, 1, 10],
	    }

	    # Exhaustive search over the grid using 3-fold CV on all available cores.
	    grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)

	    print("Training model...")
	    grid_search.fit(X_train, y_train)

	    print(f"Best Parameters: {grid_search.best_params_}")
	    best_model = grid_search.best_estimator_

	    print("Evaluating model...")
	    y_pred = best_model.predict(X_test)
	    print("Accuracy:", accuracy_score(y_test, y_pred))
	    print("\nClassification Report:\n", classification_report(y_test, y_pred))

	    print("Saving model...")
	    joblib.dump(best_model, model_path)
	    print(f"Model saved to {model_path}")

	# Run the training workflow only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    train()