---
license: mit
language: en
pipeline_tag: text-classification
tags:
- text-classification
- multi-label
- emotion-classification
- ensemble
- deberta
- roberta
---
|
|
|
|
|
# Ensemble Model untuk Klasifikasi Emosi Multi-Label
|
|
|
|
|
Ini adalah repositori untuk sistem model *ensemble* yang meraih peringkat pertama dalam tugas klasifikasi emosi multi-label.
Sistem ini menggabungkan dua model kuat, **DeBERTa-v3-Large** dan **RoBERTa-Large**, yang dilatih dengan teknik LLRD (Layer-wise Learning Rate Decay) dan Focal Loss.
|
|
|
|
|
## Komponen Ensemble

- **`deberta_model`**: Model `microsoft/deberta-v3-large` yang telah di-fine-tune.
- **`roberta_model`**: Model `roberta-large` yang telah di-fine-tune.
- **`best_thresholds.json`**: Array berisi 14 nilai *threshold* optimal untuk setiap label, yang digunakan pada hasil rata-rata probabilitas kedua model.
|
|
|
|
|
## Cara Menggunakan
|
|
|
|
|
Berikut adalah contoh kode untuk memuat semua komponen dan melakukan prediksi dengan *ensemble* ini:

```python
|
|
import json
import os

import numpy as np
import requests
import torch
from scipy.special import expit as sigmoid
from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
|
|
|
|
# -- Repository information --
REPO_ID = "Trentz/emotion-classification-ensemble"
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -- Label mapping --
# Position i in this list names the emotion at index i of the ensemble's
# averaged probability vector (see EmotionEnsemble.predict).
LABELS = [
    'amusement',
    'anger',
    'annoyance',
    'caring',
    'confusion',
    'disappointment',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'joy',
    'love',
    'sadness',
]
|
|
|
|
|
class EmotionEnsemble:
    """Two-model ensemble for multi-label emotion classification.

    A DeBERTa and a RoBERTa sequence-classification model are run on the same
    text; their sigmoid probabilities are averaged and compared against one
    threshold per label. If no label clears its threshold, the single most
    probable label is returned instead.
    """

    THRESHOLDS_FILENAME = "best_thresholds.json"

    def __init__(self, repo_id, device="cpu"):
        """Load both models, their tokenizers and the per-label thresholds.

        Args:
            repo_id: Hugging Face Hub repository id, or a local directory,
                holding the ``deberta_model`` and ``roberta_model`` subfolders
                and ``best_thresholds.json``.
            device: Torch device string the models and thresholds are moved to.

        Raises:
            requests.HTTPError: If the thresholds file cannot be downloaded.
        """
        self.device = device
        print("Memuat semua komponen model...")

        # Load DeBERTa
        self.deberta_tokenizer = AutoTokenizer.from_pretrained(
            repo_id, subfolder="deberta_model"
        )
        self.deberta_model = (
            AutoModelForSequenceClassification.from_pretrained(
                repo_id, subfolder="deberta_model"
            )
            .to(self.device)
            .eval()
        )

        # Load RoBERTa
        self.roberta_tokenizer = AutoTokenizer.from_pretrained(
            repo_id, subfolder="roberta_model"
        )
        self.roberta_model = (
            AutoModelForSequenceClassification.from_pretrained(
                repo_id, subfolder="roberta_model"
            )
            .to(self.device)
            .eval()
        )

        # Load thresholds
        self.thresholds = torch.tensor(
            self._load_thresholds(repo_id), device=self.device
        )

        print("Semua komponen berhasil dimuat.")

    @staticmethod
    def _load_thresholds(repo_id):
        """Return the parsed contents of ``best_thresholds.json`` for *repo_id*."""
        # A local checkout is read from disk so that the thresholds come from
        # the same place as the model weights.
        if os.path.isdir(repo_id):
            local_path = os.path.join(repo_id, EmotionEnsemble.THRESHOLDS_FILENAME)
            with open(local_path, encoding="utf-8") as handle:
                return json.load(handle)

        # Build the URL from repo_id rather than hard-coding one repository.
        thresholds_url = (
            f"https://huggingface.co/{repo_id}/resolve/main/"
            f"{EmotionEnsemble.THRESHOLDS_FILENAME}"
        )
        response = requests.get(thresholds_url, timeout=30)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()

    def _probabilities(self, tokenizer, model, text):
        """Return one model's per-label sigmoid probabilities for *text*."""
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        # Drop only the batch dimension (a single text is encoded per call).
        return torch.sigmoid(model(**inputs).logits).squeeze(0)

    def predict(self, text: str):
        """Predict the emotions expressed in *text*.

        Args:
            text: The input text to classify.

        Returns:
            A dict with the keys ``"text"`` (the input), ``"predicted_emotions"``
            (list of label names) and ``"scores"`` (the averaged probabilities
            as a plain list, in ``LABELS`` order).
        """
        with torch.no_grad():
            deberta_probs = self._probabilities(
                self.deberta_tokenizer, self.deberta_model, text
            )
            roberta_probs = self._probabilities(
                self.roberta_tokenizer, self.roberta_model, text
            )

            # Average the probabilities of the two models
            avg_probs = (deberta_probs + roberta_probs) / 2.0

            # Apply the per-label thresholds (strictly greater than)
            hits = (avg_probs > self.thresholds).tolist()
            final_labels = [label for label, hit in zip(LABELS, hits) if hit]

            # "Best guess" fallback: never return an empty prediction
            if not final_labels:
                final_labels = [LABELS[torch.argmax(avg_probs).item()]]

        return {
            "text": text,
            "predicted_emotions": final_labels,
            "scores": avg_probs.cpu().tolist(),
        }
|
|
|
|
|
# -- Usage example --
# Initialise the ensemble (loads both models and the thresholds)
ensemble_model = EmotionEnsemble(REPO_ID, device=DEVICE)

# Predict on a text
example_text = "This is amazing! Thank you so much for everything, I really love it."
result = ensemble_model.predict(example_text)
print(result)
# Output is expected to contain: 'amusement', 'excitement', 'joy', 'love', 'gratitude'

example_text_2 = "I can't believe you would do that. It's so annoying and disappointing."
result_2 = ensemble_model.predict(example_text_2)
print(result_2)
# Output is expected to contain: 'annoyance', 'disappointment', 'anger'
|
|
|
|
|
|