# train.py
"""Train a multi-label emotion classifier on tweets.

Pipeline (run via ``python train.py``):

1. Load the labelled tweets from ``DATASET_PATH``.
2. Split them 80/20 into train and test sets (fixed seed).
3. Write the held-out test rows to ``TEST_SET_PATH``.
4. Fit TF-IDF features + one-vs-rest logistic regression on the train split.
5. Print the macro-averaged F1 score on the test split.
6. Persist vectorizer, classifier and label list to ``MODEL_PATH``.

Importing this module has no side effects; only ``main()`` touches disk.
"""
import numpy as np
import pandas as pd

# Order matters: position i of every binary label vector refers to LABELS[i].
LABELS = ['admiration', 'anger', 'disgust', 'fear', 'hope', 'joy', 'love',
          'pride', 'sadness']

DATASET_PATH = "Multi-Labeled_Emotions_Modified.xlsx"
TEST_SET_PATH = "test_set.xlsx"
MODEL_PATH = "model.pkl"

TEXT_COLUMN = 'Tweets (text)'
LABEL_COLUMN = 'Emotions (Multi-labeled)'


def to_binary(label_string):
    """Convert a comma-separated emotion string into a 0/1 indicator list.

    Args:
        label_string: Value such as ``"joy, love"``. Non-string values are
            converted with ``str()`` first.

    Returns:
        A list of ``len(LABELS)`` ints; element ``i`` is 1 when ``LABELS[i]``
        occurs in ``label_string`` and 0 otherwise. Names that are not in
        ``LABELS`` are ignored.
    """
    # Strip and case-fold so "Joy" or " joy " are not silently dropped;
    # a set gives O(1) membership tests.
    present = {part.strip().casefold() for part in str(label_string).split(',')}
    return [1 if label in present else 0 for label in LABELS]


def to_label_string(row):
    """Convert a 0/1 indicator vector back into a ``", "``-joined label string.

    Args:
        row: Sequence aligned with ``LABELS`` (as produced by ``to_binary``).

    Returns:
        The names of the labels whose flag equals 1, in ``LABELS`` order,
        joined by ``", "``; an empty string when no flag is set.
    """
    return ', '.join(label for label, flag in zip(LABELS, row) if flag == 1)


def main():
    """Run the full load / split / train / evaluate / save pipeline."""
    # Imported here rather than at module level so that importing this file
    # for ``to_binary`` / ``LABELS`` does not require scikit-learn or joblib.
    import joblib
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.multiclass import OneVsRestClassifier

    # ── load ──────────────────────────────────────────────────────────
    print("Loading dataset...")
    df = pd.read_excel(DATASET_PATH)
    df = df[[TEXT_COLUMN, LABEL_COLUMN]].dropna()
    print(f"Total rows: {len(df)}")

    # Excel may store purely numeric tweets as numbers; the vectorizer
    # lowercases its input and therefore needs real strings.
    X = df[TEXT_COLUMN].astype(str).tolist()
    y = [to_binary(row) for row in df[LABEL_COLUMN]]

    # ── split ─────────────────────────────────────────────────────────
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Train: {len(X_train)} rows")
    print(f"Test: {len(X_test)} rows")

    # ── save test set as hidden test data ─────────────────────────────
    test_df = pd.DataFrame({
        TEXT_COLUMN: X_test,
        LABEL_COLUMN: [to_label_string(row) for row in y_test],
    })
    test_df.to_excel(TEST_SET_PATH, index=False)
    print("Saved test_set.xlsx")

    # ── train ─────────────────────────────────────────────────────────
    print("Training...")
    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    # Fit the vocabulary on the training split only, then reuse it for test.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    classifier = OneVsRestClassifier(
        LogisticRegression(max_iter=1000, C=1.0)
    )
    classifier.fit(X_train_tfidf, np.array(y_train))
    print("Training done.")

    # ── quick check ───────────────────────────────────────────────────
    y_pred = classifier.predict(X_test_tfidf)
    f1 = f1_score(np.array(y_test), y_pred, average='macro', zero_division=0)
    print(f"F1 score: {f1:.4f}")

    # ── save ──────────────────────────────────────────────────────────
    model_bundle = {
        "vectorizer": vectorizer,
        "classifier": classifier,
        "labels": LABELS,
    }
    joblib.dump(model_bundle, MODEL_PATH)
    print("Saved model.pkl")


if __name__ == "__main__":
    main()