rev 1 implementation
Browse files
- README.md +0 -88
- config.json +32 -7
- pytorch_model.bin → model.safetensors +2 -2
- module.py +0 -54
- special_tokens_map.json +0 -7
- tokenizer.json +0 -0
- tokenizer_config.json +0 -16
- vocab.txt +0 -0
README.md
DELETED
|
@@ -1,88 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
language:
|
| 4 |
-
- de
|
| 5 |
-
pipeline_tag: text-classification
|
| 6 |
-
metrics:
|
| 7 |
-
- f1
|
| 8 |
-
library_name: transformers
|
| 9 |
-
---
|
| 10 |
-
|
| 11 |
-
# PopBERT
|
| 12 |
-
|
| 13 |
-
PopBERT is a model for German-language populism detection in political speeches within the German Bundestag, based on the deepset/gbert-large model: https://huggingface.co/deepset/gbert-large
|
| 14 |
-
|
| 15 |
-
It is a multilabel model trained on a manually curated dataset of sentences from the 18th and 19th legislative periods.
|
| 16 |
-
In addition to capturing the foundational dimensions of populism, namely "anti-elitism" and "people-centrism," the model was also fine-tuned to identify the underlying ideological orientation as either "left-wing" or "right-wing."
|
| 17 |
-
|
| 18 |
-
# Prediction
|
| 19 |
-
|
| 20 |
-
For each input text, the model outputs a tensor of length 4 containing one predicted probability per dimension.
|
| 21 |
-
The table connects the position of the predicted probability to its dimension.
|
| 22 |
-
|
| 23 |
-
| **Index** | **Dimension** |
|
| 24 |
-
|-----------|--------------------------|
|
| 25 |
-
| 0 | Anti-Elitism |
|
| 26 |
-
| 1 | People-Centrism |
|
| 27 |
-
| 2 | Left-Wing Host-Ideology |
|
| 28 |
-
| 3 | Right-Wing Host-Ideology |
|
| 29 |
-
|
| 30 |
-
# Usage Example
|
| 31 |
-
|
| 32 |
-
```python
|
| 33 |
-
import torch
|
| 34 |
-
from transformers import AutoModel
|
| 35 |
-
from transformers import AutoTokenizer
|
| 36 |
-
|
| 37 |
-
# optional commit_hash to ensure a consistent version of the model
|
| 38 |
-
commit_hash = "2354335caedc36df44da926291786f0159a502f0"
|
| 39 |
-
|
| 40 |
-
# load tokenizer
|
| 41 |
-
tokenizer = AutoTokenizer.from_pretrained("luerhard/PopBERT", revision=commit_hash)
|
| 42 |
-
|
| 43 |
-
# load model
|
| 44 |
-
# trust_remote_code is necessary to use the custom architecture of this model (module.py)
|
| 45 |
-
model = AutoModel.from_pretrained("luerhard/PopBERT", trust_remote_code=True, revision=commit_hash)
|
| 46 |
-
|
| 47 |
-
# define text to be predicted
|
| 48 |
-
text = (
|
| 49 |
-
"Das ist Klassenkampf von oben, das ist Klassenkampf im Interesse von "
|
| 50 |
-
"Vermögenden und Besitzenden gegen die Mehrheit der Steuerzahlerinnen und "
|
| 51 |
-
"Steuerzahler auf dieser Erde."
|
| 52 |
-
)
|
| 53 |
-
|
| 54 |
-
# encode text with tokenizer
|
| 55 |
-
encodings = tokenizer(text, padding=True, return_tensors="pt")
|
| 56 |
-
|
| 57 |
-
# predict
|
| 58 |
-
with torch.inference_mode():
|
| 59 |
-
_, prediction_tensor = model(**encodings)
|
| 60 |
-
|
| 61 |
-
# convert prediction from torch tensor to numpy array
|
| 62 |
-
prediction = prediction_tensor.numpy()
|
| 63 |
-
print(prediction)
|
| 64 |
-
```
|
| 65 |
-
|
| 66 |
-
```
|
| 67 |
-
[[0.84803474 0.9991047 0.9919584 0.19843338]]
|
| 68 |
-
```
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
# Performance
|
| 72 |
-
|
| 73 |
-
This table presents the classification report for a 5-fold cross-validation of our model.
|
| 74 |
-
The hyperparameters are consistent across all 5 runs. The final and published model was then trained on all data with the same hyperparameters.
|
| 75 |
-
On average, the model performs best at detecting anti-elitism and worst at detecting right-wing host ideology.
|
| 76 |
-
The relatively small standard deviations suggest that the split into training and test data has minimal impact on model performance.
|
| 77 |
-
Therefore, it is expected that the performance of the final model will be comparable to what is
|
| 78 |
-
depicted here.
|
| 79 |
-
|
| 80 |
-
| Dimension | Precision | Recall | F1 |
|
| 81 |
-
|---------------------|---------------|---------------|---------------|
|
| 82 |
-
| Anti-Elitism | 0.812 (0.013) | 0.885 (0.006) | 0.847 (0.007) |
|
| 83 |
-
| People-Centrism | 0.670 (0.011) | 0.725 (0.040) | 0.696 (0.019) |
|
| 84 |
-
| Left-Wing Ideology | 0.664 (0.023) | 0.771 (0.024) | 0.713 (0.010) |
|
| 85 |
-
| Right-Wing Ideology | 0.654 (0.029) | 0.698 (0.050) | 0.674 (0.031) |
|
| 86 |
-
| --- | --- | --- | --- |
|
| 87 |
-
| micro avg | 0.732 (0.009) | 0.805 (0.006) | 0.767 (0.007) |
|
| 88 |
-
| macro avg | 0.700 (0.011) | 0.770 (0.010) | 0.733 (0.010) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.json
CHANGED
|
@@ -1,13 +1,38 @@
|
|
| 1 |
{
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
-
"
|
| 4 |
],
|
| 5 |
-
"
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
},
|
| 9 |
-
"
|
| 10 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
"torch_dtype": "float32",
|
| 12 |
-
"transformers_version": "4.
|
|
|
|
|
|
|
|
|
|
| 13 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"_name_or_path": "deepset/gbert-large",
|
| 3 |
"architectures": [
|
| 4 |
+
"BertForSequenceClassification"
|
| 5 |
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"hidden_act": "gelu",
|
| 9 |
+
"hidden_dropout_prob": 0.1,
|
| 10 |
+
"hidden_size": 1024,
|
| 11 |
+
"id2label": {
|
| 12 |
+
"0": "LABEL_0",
|
| 13 |
+
"1": "LABEL_1",
|
| 14 |
+
"2": "LABEL_2",
|
| 15 |
+
"3": "LABEL_3"
|
| 16 |
},
|
| 17 |
+
"initializer_range": 0.02,
|
| 18 |
+
"intermediate_size": 4096,
|
| 19 |
+
"label2id": {
|
| 20 |
+
"LABEL_0": 0,
|
| 21 |
+
"LABEL_1": 1,
|
| 22 |
+
"LABEL_2": 2,
|
| 23 |
+
"LABEL_3": 3
|
| 24 |
+
},
|
| 25 |
+
"layer_norm_eps": 1e-12,
|
| 26 |
+
"max_position_embeddings": 512,
|
| 27 |
+
"model_type": "bert",
|
| 28 |
+
"num_attention_heads": 16,
|
| 29 |
+
"num_hidden_layers": 24,
|
| 30 |
+
"pad_token_id": 0,
|
| 31 |
+
"position_embedding_type": "absolute",
|
| 32 |
+
"problem_type": "multi_label_classification",
|
| 33 |
"torch_dtype": "float32",
|
| 34 |
+
"transformers_version": "4.35.2",
|
| 35 |
+
"type_vocab_size": 2,
|
| 36 |
+
"use_cache": true,
|
| 37 |
+
"vocab_size": 31102
|
| 38 |
}
|
pytorch_model.bin → model.safetensors
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81ff862f8b02ad8406e71033adf39897251f3da27b7632a2050b9527f36b159e
|
| 3 |
+
size 1343006640
|
module.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import torch
|
| 4 |
-
from torch import nn
|
| 5 |
-
from torch.nn import BCEWithLogitsLoss
|
| 6 |
-
from transformers import AutoModelForSequenceClassification
|
| 7 |
-
from transformers import PretrainedConfig
|
| 8 |
-
from transformers import PreTrainedModel
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
class PopBERTConfig(PretrainedConfig):
|
| 12 |
-
model_type = "popbert"
|
| 13 |
-
|
| 14 |
-
def __init__(self, num_classes: int = 4, **kwargs):
|
| 15 |
-
super().__init__(**kwargs)
|
| 16 |
-
self.num_classes = num_classes
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
class PopBERT(PreTrainedModel):
|
| 20 |
-
config_class = PopBERTConfig
|
| 21 |
-
|
| 22 |
-
def __init__(self, config):
|
| 23 |
-
super().__init__(config)
|
| 24 |
-
self.sigmoid = nn.Sigmoid()
|
| 25 |
-
self.bert = AutoModelForSequenceClassification.from_pretrained(
|
| 26 |
-
"deepset/gbert-large",
|
| 27 |
-
num_labels=config.num_classes,
|
| 28 |
-
)
|
| 29 |
-
|
| 30 |
-
def forward(
|
| 31 |
-
self,
|
| 32 |
-
input_ids: torch.Tensor | None = None,
|
| 33 |
-
attention_mask: torch.Tensor | None = None,
|
| 34 |
-
token_type_ids: torch.Tensor | None = None,
|
| 35 |
-
position_ids: torch.Tensor | None = None,
|
| 36 |
-
head_mask: torch.Tensor | None = None,
|
| 37 |
-
inputs_embeds: torch.Tensor | None = None,
|
| 38 |
-
labels: torch.Tensor | None = None,
|
| 39 |
-
):
|
| 40 |
-
pred = self.bert(
|
| 41 |
-
input_ids,
|
| 42 |
-
attention_mask=attention_mask,
|
| 43 |
-
token_type_ids=token_type_ids,
|
| 44 |
-
position_ids=position_ids,
|
| 45 |
-
head_mask=head_mask,
|
| 46 |
-
inputs_embeds=inputs_embeds,
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
-
loss = None
|
| 50 |
-
if labels is not None:
|
| 51 |
-
loss_fn = BCEWithLogitsLoss()
|
| 52 |
-
loss = loss_fn(pred.logits, labels.float())
|
| 53 |
-
|
| 54 |
-
return loss, self.sigmoid(pred.logits)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
special_tokens_map.json
DELETED
|
@@ -1,7 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cls_token": "[CLS]",
|
| 3 |
-
"mask_token": "[MASK]",
|
| 4 |
-
"pad_token": "[PAD]",
|
| 5 |
-
"sep_token": "[SEP]",
|
| 6 |
-
"unk_token": "[UNK]"
|
| 7 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
tokenizer.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"clean_up_tokenization_spaces": true,
|
| 3 |
-
"cls_token": "[CLS]",
|
| 4 |
-
"do_basic_tokenize": true,
|
| 5 |
-
"do_lower_case": false,
|
| 6 |
-
"mask_token": "[MASK]",
|
| 7 |
-
"max_len": 512,
|
| 8 |
-
"model_max_length": 512,
|
| 9 |
-
"never_split": null,
|
| 10 |
-
"pad_token": "[PAD]",
|
| 11 |
-
"sep_token": "[SEP]",
|
| 12 |
-
"strip_accents": false,
|
| 13 |
-
"tokenize_chinese_chars": true,
|
| 14 |
-
"tokenizer_class": "BertTokenizer",
|
| 15 |
-
"unk_token": "[UNK]"
|
| 16 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vocab.txt
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|