---
license: mit
pipeline_tag: text-classification
tags:
- argument-detection
- stance-detection
- multi-task-learning
language:
- en
base_model:
- answerdotai/ModernBERT-large
---

This model has been pushed to the Hub using the [PyTorchModelHubMixin](https://huggingface.co/docs/huggingface_hub/package_reference/mixins#huggingface_hub.PyTorchModelHubMixin) integration.

---

## Model Description

This is a multi-task learning (MTL) model built on top of `answerdotai/ModernBERT-large`. The model is designed to perform two distinct text classification tasks using a shared feature representation, enhanced by a Mixture-of-Experts (MoE) layer.

The model can be used for:

1. **Argumentativeness Classification:** Classifying a text as either "Argumentative" or "Non-argumentative."
2. **Stance Classification:** Classifying the relationship between two claims as "Same-side" or "Opposing-side."
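
For intuition, the MoE layer uses standard top-k gating (sketched here; the `MoELayer` class below is the exact implementation): a linear gating network scores the experts, the scores are softmax-normalized, and only the `top_k` highest-weighted experts are evaluated and mixed:

$$\mathrm{MoE}(x) = \sum_{i \in \mathrm{TopK}(p,\,k)} p_i \, E_i(x), \qquad p = \mathrm{softmax}(W_g x)$$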

## How to use

You can run inference by instantiating the architecture defined below and loading the trained weights through `PyTorchModelHubMixin.from_pretrained`; the tokenizer is loaded with the `transformers` library. The following code demonstrates how to make a prediction:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from huggingface_hub import PyTorchModelHubMixin


class MoELayer(nn.Module):
    def __init__(self, input_dim, num_experts, top_k=2):
        super(MoELayer, self).__init__()
        self.num_experts = num_experts
        self.top_k = top_k

        # Define experts as independent feed-forward layers
        self.experts = nn.ModuleList([nn.Sequential(
            nn.Linear(input_dim, input_dim * 2),
            nn.ReLU(),
            nn.Linear(input_dim * 2, input_dim)
        ) for _ in range(num_experts)])

        self.gating_network = nn.Linear(input_dim, num_experts)

    def forward(self, x):
        gate_logits = self.gating_network(x)
        gate_probs = F.softmax(gate_logits, dim=-1)

        # Get top-k experts for each input
        topk_vals, topk_indices = torch.topk(gate_probs, self.top_k, dim=-1)

        # Compute contributions from top-k experts
        output = torch.zeros_like(x)
        for i in range(self.top_k):
            expert_idx = topk_indices[:, i]
            expert_weight = topk_vals[:, i].unsqueeze(-1)

            # Route each sample in the batch to its i-th selected expert
            expert_outputs = torch.stack([self.experts[j](x[b]) for b, j in enumerate(expert_idx)], dim=0)

            output += expert_weight * expert_outputs

        return output


class SentenceClassificationMoeMTLModel(
    nn.Module,
    PyTorchModelHubMixin,
):
    def __init__(self) -> None:
        super(SentenceClassificationMoeMTLModel, self).__init__()
        self.base_model = AutoModel.from_pretrained("answerdotai/ModernBERT-large")

        self.moe_layer = MoELayer(input_dim=self.base_model.config.hidden_size, num_experts=8, top_k=2)

        # Task 1: argumentativeness classification head
        self.task_1_classifier = nn.Sequential(
            nn.Linear(in_features=self.base_model.config.hidden_size, out_features=768, bias=False),
            nn.GELU(),
            nn.LayerNorm(768, eps=1e-05, elementwise_affine=True),
            nn.Linear(768, 2)
        )

        # Task 2: same-side stance classification head
        self.task_2_classifier = nn.Sequential(
            nn.Linear(in_features=self.base_model.config.hidden_size, out_features=768, bias=False),
            nn.GELU(),
            nn.LayerNorm(768, eps=1e-05, elementwise_affine=True),
            nn.Linear(768, 2),
        )

    def forward(self, task, input_ids, attention_mask):
        x = self.base_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Use the [CLS] representation as the shared sentence embedding
        cls_r = x[:, 0]

        x = self.moe_layer(cls_r)

        if task == "arg":
            x = self.task_1_classifier(x)
        elif task == "stance":
            x = self.task_2_classifier(x)

        return x, cls_r


model_name = "azza1625/argument-same-side-stance-classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = SentenceClassificationMoeMTLModel.from_pretrained(model_name)
model.eval()

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def classify_sequence(seq, task, label_map):
    # Stance classification takes a (claim_1, claim_2) pair;
    # argumentativeness classification takes a single text
    enc = tokenizer(
        *(seq if task == 'stance' else (seq,)),
        return_tensors="pt",
        truncation=True,
        max_length=1024
    ).to(device)

    with torch.no_grad():
        logits, _ = model(task=task, **enc)
        probs = torch.softmax(logits, dim=-1).squeeze()
        pred_idx = probs.argmax().item()
        confidence = probs[pred_idx].item()

    return label_map[pred_idx], confidence


# Example input for task 1: argumentativeness classification
text = "A fetus or embryo is not a person; therefore, abortion should not be considered murder."

label_map = {0: "Non-argumentative", 1: "Argumentative"}
label, confidence = classify_sequence(text, 'arg', label_map)

print(f"Prediction: {label} (Confidence: {confidence:.2f})")

# Example input for task 2: same-side stance classification
claim_1 = "A fetus or embryo is not a person; therefore, abortion should not be considered murder."
claim_2 = "Since death is the intention, such procedures should be considered murder."

label_map = {0: "Same-side", 1: "Opposing-side"}
label, confidence = classify_sequence([claim_1, claim_2], 'stance', label_map)

print(f"Prediction: {label} (Confidence: {confidence:.2f})")
```