"""Train a TF-IDF + XGBoost sentiment classifier and save it with joblib.

The script reads ``dummy_sentiment_dataset.csv`` (columns ``text`` and
``label``), holds out 20% of the rows for evaluation, fits a TF-IDF
vectorizer and an ``XGBClassifier`` on the remaining rows, prints the
hold-out accuracy, and writes the fitted model and vectorizer to
``model.joblib`` and ``tfidf_vectorizer.joblib`` in the working directory.
"""
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Input / output locations, relative to the current working directory.
DATASET_PATH = "dummy_sentiment_dataset.csv"
MODEL_PATH = "model.joblib"
VECTORIZER_PATH = "tfidf_vectorizer.joblib"

# Split and feature-extraction settings.
TEST_SIZE = 0.2
RANDOM_STATE = 42
MAX_FEATURES = 5000

# Load dataset
df = pd.read_csv(DATASET_PATH)

# TfidfVectorizer rejects NaN documents, so rows with a missing text or label
# are dropped up front instead of failing later during vectorization/fitting.
df = df.dropna(subset=["text", "label"])

# Split (fixed random_state keeps the hold-out set reproducible between runs)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# TF-IDF: fit on the training texts only, then reuse the fitted vocabulary
# and IDF weights for the test texts so no test information leaks into them.
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Model
# NOTE(review): recent XGBoost releases expect integer class labels
# 0..n_classes-1 -- confirm the "label" column is already encoded that way.
model = XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="logloss",
)
model.fit(X_train_tfidf, y_train)

# Evaluate
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Save model + vectorizer (both are needed to score new raw text later)
joblib.dump(model, MODEL_PATH)
joblib.dump(tfidf, VECTORIZER_PATH)

print("✅ Model and vectorizer saved!")