| |
|
|
| import pandas as pd |
| import joblib |
| import numpy as np |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.feature_extraction.text import TfidfVectorizer |
| from sklearn.multiclass import OneVsRestClassifier |
| from sklearn.model_selection import train_test_split |
| from sklearn.metrics import f1_score |
|
|
# Emotion categories in the fixed order used for every multi-hot label vector.
# All names are lowercase; to_binary() relies on that when matching.
LABELS = [
    'admiration',
    'anger',
    'disgust',
    'fear',
    'hope',
    'joy',
    'love',
    'pride',
    'sadness',
]


def to_binary(label_string):
    """Convert a comma-separated emotion string into a multi-hot vector.

    The input is coerced with ``str()``, split on commas, and each piece is
    stripped of surrounding whitespace and lowercased before being compared
    with ``LABELS``. Matching is therefore case-insensitive, so ``"Joy"`` and
    ``"joy"`` are treated as the same emotion instead of the former being
    silently dropped.

    Args:
        label_string: Comma-separated emotion names, e.g. ``"joy, love"``.
            Non-string values are converted with ``str()`` first.

    Returns:
        A list of ``len(LABELS)`` ints, with 1 where the corresponding label
        occurs in ``label_string`` and 0 elsewhere. Names that are not in
        ``LABELS`` are ignored.
    """
    # Build the lookup set once so each membership test below is O(1).
    present = {part.strip().lower() for part in str(label_string).split(',')}
    return [1 if label in present else 0 for label in LABELS]
|
|
| |
# Load the labelled tweets, keep only the text and label columns, and drop
# any row where either of the two is missing.
print("Loading dataset...")
df = pd.read_excel("Multi-Labeled_Emotions_Modified.xlsx")
df = df[['Tweets (text)', 'Emotions (Multi-labeled)']].dropna()
print(f"Total rows: {len(df)}")

# Cast the text column to str: a spreadsheet cell that holds a bare number
# may load as a non-string value, and TfidfVectorizer expects string
# documents (it lowercases each one by default).
X = df['Tweets (text)'].astype(str).tolist()
# One multi-hot vector per row, produced by to_binary().
y = [to_binary(row) for row in df['Emotions (Multi-labeled)']]
|
|
| |
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print(f"Train: {len(X_train)} rows")
print(f"Test: {len(X_test)} rows")
|
|
| |
# Turn each held-out multi-hot vector back into a comma-separated label
# string so the exported test set uses the same two-column layout as the
# input spreadsheet.
test_emotions = [
    ', '.join(name for name, flag in zip(LABELS, vector) if flag == 1)
    for vector in y_test
]
test_df = pd.DataFrame(
    {
        'Tweets (text)': X_test,
        'Emotions (Multi-labeled)': test_emotions,
    }
)
# index=False: do not write the DataFrame's row index as an extra column.
test_df.to_excel("test_set.xlsx", index=False)
print("Saved test_set.xlsx")
|
|
| |
print("Training...")
# TF-IDF features over unigrams and bigrams, vocabulary capped at 10,000
# entries. The vectorizer is fitted on the training split only; the test
# split is merely transformed with the fitted vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# One-vs-rest wrapper: a separate logistic-regression model is fitted for
# each column of the label matrix.
classifier = OneVsRestClassifier(LogisticRegression(C=1.0, max_iter=1000))
classifier.fit(X_train_tfidf, np.asarray(y_train))
print("Training done.")
|
|
| |
# Score the held-out split with macro-averaged F1; zero_division=0 is passed
# so that an undefined per-label score is reported as 0.
y_pred = classifier.predict(X_test_tfidf)
f1 = f1_score(
    np.asarray(y_test),
    y_pred,
    zero_division=0,
    average='macro',
)
print(f"F1 score: {f1:.4f}")
|
|
| |
# Persist the fitted vectorizer, the fitted classifier and the label order
# together in a single file.
model_bundle = dict(
    vectorizer=vectorizer,
    classifier=classifier,
    labels=LABELS,
)
joblib.dump(model_bundle, "model.pkl")
print("Saved model.pkl")