Upload README.md with huggingface_hub
Browse files
README.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
---
|
| 3 |
+
license: mit
|
| 4 |
+
language: en
|
| 5 |
+
pipeline_tag: text-classification
|
| 6 |
+
tags:
|
| 7 |
+
- text-classification
|
| 8 |
+
- multi-label
|
| 9 |
+
- emotion-classification
|
| 10 |
+
- ensemble
|
| 11 |
+
- deberta
|
| 12 |
+
- roberta
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
# Peringkat 1: Ensemble Model untuk Klasifikasi Emosi Multi-Label
|
| 16 |
+
|
| 17 |
+
Ini adalah repositori untuk sistem model *ensemble* yang meraih peringkat pertama dalam tugas klasifikasi emosi multi-label.
|
| 18 |
+
Sistem ini menggabungkan dua model kuat, **DeBERTa-v3-Large** dan **RoBERTa-Large**, yang dilatih dengan teknik LLRD (Layer-wise Learning Rate Decay) dan Focal Loss.
|
| 19 |
+
|
| 20 |
+
## Komponen Ensemble
|
| 21 |
+
- **`deberta_model`**: Model `microsoft/deberta-v3-large` yang telah di-fine-tune.
|
| 22 |
+
- **`roberta_model`**: Model `roberta-large` yang telah di-fine-tune.
|
| 23 |
+
- **`best_thresholds.json`**: Array berisi 14 nilai *threshold* optimal, satu untuk setiap label, yang diterapkan pada rata-rata probabilitas kedua model.
|
| 24 |
+
|
| 25 |
+
## Cara Menggunakan
|
| 26 |
+
|
| 27 |
+
Berikut adalah contoh kode untuk memuat semua komponen dan melakukan prediksi dengan *ensemble* ini:
|
| 28 |
+
|
| 29 |
+
```python
|
| 30 |
+
import torch
|
| 31 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 32 |
+
from scipy.special import expit as sigmoid
|
| 33 |
+
import json
|
| 34 |
+
import requests
|
| 35 |
+
import numpy as np
|
| 36 |
+
|
| 37 |
+
# -- Repository information --
# Hub repository id passed to the ensemble loader.
REPO_ID = "Trentz/emotion-classification-ensemble"
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -- Label mapping --
# Index i of this list names the emotion at index i of the model scores.
LABELS = [
    "amusement",
    "anger",
    "annoyance",
    "caring",
    "confusion",
    "disappointment",
    "disgust",
    "embarrassment",
    "excitement",
    "fear",
    "gratitude",
    "joy",
    "love",
    "sadness",
]
|
| 43 |
+
|
| 44 |
+
class EmotionEnsemble:
    """Two-model ensemble for multi-label emotion classification.

    Loads a DeBERTa and a RoBERTa sequence classifier (each with its own
    tokenizer) from the ``deberta_model`` and ``roberta_model`` subfolders of
    a Hugging Face Hub repository, together with the per-label decision
    thresholds stored in ``best_thresholds.json``. Predictions are made on
    the mean of the two models' sigmoid probabilities.
    """

    def __init__(self, repo_id, device="cpu"):
        """Load both models, both tokenizers and the thresholds.

        Args:
            repo_id: Hub repository id that holds the two model subfolders
                and ``best_thresholds.json``.
            device: Torch device string the models and the threshold tensor
                are placed on.

        Raises:
            requests.HTTPError: If the thresholds file request returns an
                error status.
        """
        self.device = device
        print("Memuat semua komponen model...")

        # Load DeBERTa
        self.deberta_tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder="deberta_model")
        self.deberta_model = (
            AutoModelForSequenceClassification.from_pretrained(repo_id, subfolder="deberta_model")
            .to(self.device)
            .eval()
        )

        # Load RoBERTa
        self.roberta_tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder="roberta_model")
        self.roberta_model = (
            AutoModelForSequenceClassification.from_pretrained(repo_id, subfolder="roberta_model")
            .to(self.device)
            .eval()
        )

        # Load thresholds. The URL is a plain https address built from
        # repo_id so the thresholds come from the same repository as the
        # models (the previous literal contained Markdown link syntax and
        # could not be fetched).
        thresholds_url = f"https://huggingface.co/{repo_id}/resolve/main/best_thresholds.json"
        response = requests.get(thresholds_url, timeout=30)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        self.thresholds = torch.tensor(response.json(), device=self.device)

        print("Semua komponen berhasil dimuat.")

    def predict(self, text: str):
        """Classify one text with the averaged ensemble.

        Args:
            text: The input string to classify.

        Returns:
            A dict with ``"text"`` (the input), ``"predicted_emotions"``
            (labels whose averaged probability exceeds their threshold, or
            the single highest-scoring label when none does) and
            ``"scores"`` (the averaged probabilities as a list).
        """
        with torch.no_grad():
            # DeBERTa prediction. squeeze(0) drops only the batch axis.
            deberta_inputs = self.deberta_tokenizer(
                text, return_tensors="pt", truncation=True, padding=True
            ).to(self.device)
            deberta_probs = torch.sigmoid(self.deberta_model(**deberta_inputs).logits).squeeze(0)

            # RoBERTa prediction
            roberta_inputs = self.roberta_tokenizer(
                text, return_tensors="pt", truncation=True, padding=True
            ).to(self.device)
            roberta_probs = torch.sigmoid(self.roberta_model(**roberta_inputs).logits).squeeze(0)

            # Average the probabilities of the two models
            avg_probs = (deberta_probs + roberta_probs) / 2.0

            # Apply the thresholds; fall back to the best guess when no
            # label clears its threshold so the result is never empty.
            preds = (avg_probs > self.thresholds).int()
            if preds.sum().item() == 0:
                best_guess_idx = torch.argmax(avg_probs).item()
                final_labels = [LABELS[best_guess_idx]]
            else:
                final_labels = [LABELS[i] for i, hit in enumerate(preds.tolist()) if hit == 1]

        return {"text": text, "predicted_emotions": final_labels, "scores": avg_probs.cpu().tolist()}
|
| 86 |
+
|
| 87 |
+
# -- Usage example --
# Initialise the ensemble model
ensemble_model = EmotionEnsemble(REPO_ID, device=DEVICE)

# Texts to classify
example_text = "This is amazing! Thank you so much for everything, I really love it."
example_text_2 = "I can't believe you would do that. It's so annoying and disappointing."

# Expected output to contain: 'amusement', 'excitement', 'joy', 'love', 'gratitude'
result = ensemble_model.predict(example_text)
print(result)

# Expected output to contain: 'annoyance', 'disappointment', 'anger'
result_2 = ensemble_model.predict(example_text_2)
print(result_2)
|
| 101 |
+
|