rev 1 implementation

Browse files

Files changed (8) hide show

README.md +0 -88
config.json +32 -7
pytorch_model.bin → model.safetensors +2 -2
module.py +0 -54
special_tokens_map.json +0 -7
tokenizer.json +0 -0
tokenizer_config.json +0 -16
vocab.txt +0 -0

README.md DELETED Viewed

@@ -1,88 +0,0 @@
----
-license: mit
-language:
-- de
-pipeline_tag: text-classification
-metrics:
-- f1
-library_name: transformers
----
-# PopBERT
-PopBERT is a model for German-language populism detection in political speeches within the German Bundestag, based on the deepset/gbert-large model: https://huggingface.co/deepset/gbert-large
-It is a multilabel model trained on a manually curated dataset of sentences from the 18th and 19th legislative periods.
-In addition to capturing the foundational dimensions of populism, namely "anti-elitism" and "people-centrism," the model was also fine-tuned to identify the underlying ideological orientation as either "left-wing" or "right-wing."
-# Prediction
-For each input text, the model outputs a tensor of 4 probabilities (sigmoid-activated scores), one per dimension.
-The table below maps each index in that tensor to its dimension.
-| **Index** | **Dimension**            |
-|-----------|--------------------------|
-| 0         | Anti-Elitism             |
-| 1         | People-Centrism          |
-| 2         | Left-Wing Host-Ideology  |
-| 3         | Right-Wing Host-Ideology |
-# Usage Example
-```python
-import torch
-from transformers import AutoModel
-from transformers import AutoTokenizer
-# optional commit_hash to ensure a consistent version of the model
-commit_hash = "2354335caedc36df44da926291786f0159a502f0"
-# load tokenizer
-tokenizer = AutoTokenizer.from_pretrained("luerhard/PopBERT", revision=commit_hash)
-# load model
-# trust_remote_code is necessary to use the custom architecture of this model (module.py)
-model = AutoModel.from_pretrained("luerhard/PopBERT", trust_remote_code=True, revision=commit_hash)
-# define text to be predicted
-text = (
-    "Das ist Klassenkampf von oben, das ist Klassenkampf im Interesse von "
-    "Vermögenden und Besitzenden gegen die Mehrheit der Steuerzahlerinnen und "
-    "Steuerzahler auf dieser Erde."
-)
-# encode text with tokenizer
-encodings = tokenizer(text, padding=True, return_tensors="pt")
-# predict
-with torch.inference_mode():
-    _, prediction_tensor = model(**encodings)
-# convert prediction from torch tensor to numpy array
-prediction = prediction_tensor.numpy()
-print(prediction)
-```
-```
-[[0.84803474 0.9991047  0.9919584  0.19843338]]
-```
-# Performance
-This table presents the classification report for a 5-fold cross-validation of our model; each cell shows the mean across folds, with the standard deviation in parentheses.
-The hyperparameters are consistent across all 5 runs. The final and published model was then trained on all data with the same hyperparameters.
-On average, the model performs best at detecting anti-elitism and worst at detecting right-wing host ideology.
-The relatively small standard deviations suggest that the split into training and test data has minimal impact on model performance.
-Therefore, the performance of the final model is expected to be comparable to the results
-reported here.
-| Dimension           | Precision     | Recall        | F1            |
-|---------------------|---------------|---------------|---------------|
-| Anti-Elitism        | 0.812 (0.013) | 0.885 (0.006) | 0.847 (0.007) |
-| People-Centrism     | 0.670 (0.011) | 0.725 (0.040) | 0.696 (0.019) |
-| Left-Wing Ideology  | 0.664 (0.023) | 0.771 (0.024) | 0.713 (0.010) |
-| Right-Wing Ideology | 0.654 (0.029) | 0.698 (0.050) | 0.674 (0.031) |
-| ---                 | ---           | ---           | ---           |
-| micro avg           | 0.732 (0.009) | 0.805 (0.006) | 0.767 (0.007) |
-| macro avg           | 0.700 (0.011) | 0.770 (0.010) | 0.733 (0.010) |

config.json CHANGED Viewed

@@ -1,13 +1,38 @@
 {
   "architectures": [
-    "PopBERT"
   ],
-  "auto_map": {
-    "AutoConfig": "module.PopBERTConfig",
-    "AutoModel": "module.PopBERT"
   },
-  "model_type": "popbert",
-  "num_classes": 4,
   "torch_dtype": "float32",
-  "transformers_version": "4.29.2"
 }

 {
+  "_name_or_path": "deepset/gbert-large",
   "architectures": [
+    "BertForSequenceClassification"
   ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1",
+    "2": "LABEL_2",
+    "3": "LABEL_3"
   },
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1,
+    "LABEL_2": 2,
+    "LABEL_3": 3
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "multi_label_classification",
   "torch_dtype": "float32",
+  "transformers_version": "4.35.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 31102
 }

pytorch_model.bin → model.safetensors RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6483ab1821245614459ff691f00ad8dd291e37e39c6952c3e67b5427f2c6ea22
-size 1343094965

 version https://git-lfs.github.com/spec/v1
+oid sha256:81ff862f8b02ad8406e71033adf39897251f3da27b7632a2050b9527f36b159e
+size 1343006640

module.py DELETED Viewed

@@ -1,54 +0,0 @@
-from __future__ import annotations
-import torch
-from torch import nn
-from torch.nn import BCEWithLogitsLoss
-from transformers import AutoModelForSequenceClassification
-from transformers import PretrainedConfig
-from transformers import PreTrainedModel
-class PopBERTConfig(PretrainedConfig):
-    model_type = "popbert"
-    def __init__(self, num_classes: int = 4, **kwargs):
-        super().__init__(**kwargs)
-        self.num_classes = num_classes
-class PopBERT(PreTrainedModel):
-    config_class = PopBERTConfig
-    def __init__(self, config):
-        super().__init__(config)
-        self.sigmoid = nn.Sigmoid()
-        self.bert = AutoModelForSequenceClassification.from_pretrained(
-            "deepset/gbert-large",
-            num_labels=config.num_classes,
-        )
-    def forward(
-        self,
-        input_ids: torch.Tensor | None = None,
-        attention_mask: torch.Tensor | None = None,
-        token_type_ids: torch.Tensor | None = None,
-        position_ids: torch.Tensor | None = None,
-        head_mask: torch.Tensor | None = None,
-        inputs_embeds: torch.Tensor | None = None,
-        labels: torch.Tensor | None = None,
-    ):
-        pred = self.bert(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-        )
-        loss = None
-        if labels is not None:
-            loss_fn = BCEWithLogitsLoss()
-            loss = loss_fn(pred.logits, labels.float())
-        return loss, self.sigmoid(pred.logits)

special_tokens_map.json DELETED Viewed

@@ -1,7 +0,0 @@
-{
-  "cls_token": "[CLS]",
-  "mask_token": "[MASK]",
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "unk_token": "[UNK]"
-}

tokenizer.json DELETED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json DELETED Viewed

@@ -1,16 +0,0 @@
-{
-  "clean_up_tokenization_spaces": true,
-  "cls_token": "[CLS]",
-  "do_basic_tokenize": true,
-  "do_lower_case": false,
-  "mask_token": "[MASK]",
-  "max_len": 512,
-  "model_max_length": 512,
-  "never_split": null,
-  "pad_token": "[PAD]",
-  "sep_token": "[SEP]",
-  "strip_accents": false,
-  "tokenize_chinese_chars": true,
-  "tokenizer_class": "BertTokenizer",
-  "unk_token": "[UNK]"
-}

vocab.txt DELETED Viewed

The diff for this file is too large to render. See raw diff