Train & push dual-head DM (final model only) + README
Files changed:
- README.md +24 -0
- config.json +44 -0
- model.safetensors +3 -0
- modeling_dual_head_dm.py +25 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +58 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,24 @@
# MariaOls/RussianDMRecognizer_dual

Dual-head model (BIO token tagging + sentence-level discourse-marker (DM) detection) fine-tuned from **viktoroo/sberbank-rubert-base-collection3**.
Ready for inference with `AutoModel(..., trust_remote_code=True)` or via the `probar.py` script (sentence by sentence).

## Eval metrics

| metric | value |
|---|---|
| eval_bio_accuracy | 0.993706 |
| eval_bio_precision | 0.925270 |
| eval_bio_recall | 0.936318 |
| eval_bio_f1 | 0.930762 |
| eval_cls_accuracy | 0.951408 |
| eval_cls_precision | 0.966135 |
| eval_cls_recall | 0.965174 |
| eval_cls_f1 | 0.965655 |

## Usage (Python)

```python
from transformers import AutoModel, AutoTokenizer

m = AutoModel.from_pretrained("MariaOls/RussianDMRecognizer_dual", trust_remote_code=True)
t = AutoTokenizer.from_pretrained("MariaOls/RussianDMRecognizer_dual", use_fast=True)
# m(...) returns a dict with "logits": (logits_tok [B, L, 3], logits_seq [B, 2])
```
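The README's usage snippet stops at the raw logits. Below is a minimal decoding sketch built only on what the README and config.json state (three BIO labels O/B-DM/I-DM, a two-way sentence head); the example sentence is arbitrary, and the reading of sentence-head index 1 as "contains a DM" is an assumption.

```python
import torch
from transformers import AutoModel, AutoTokenizer

repo = "MariaOls/RussianDMRecognizer_dual"
m = AutoModel.from_pretrained(repo, trust_remote_code=True)
t = AutoTokenizer.from_pretrained(repo, use_fast=True)

sent = "Однако мы продолжили работу."  # hypothetical example sentence
enc = t(sent, return_tensors="pt")
with torch.no_grad():
    logits_tok, logits_seq = m(**enc)["logits"]

# Token head: label ids follow config.json (0=O, 1=B-DM, 2=I-DM).
tags = [m.config.id2label[i] for i in logits_tok.argmax(-1)[0].tolist()]
print(list(zip(t.convert_ids_to_tokens(enc["input_ids"][0]), tags)))

# Sentence head: binary; assumption here is that index 1 means "contains a DM".
print("contains DM:", bool(logits_seq.argmax(-1).item()))
```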
config.json
ADDED
@@ -0,0 +1,44 @@
{
  "_name_or_path": "viktoroo/sberbank-rubert-base-collection3",
  "architectures": [
    "DualHeadDMModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "auto_map": {
    "AutoModel": "modeling_dual_head_dm.DualHeadDMModel"
  },
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-DM",
    "2": "I-DM"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-DM": 1,
    "I-DM": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 120138
}
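The `auto_map` entry above is what lets `AutoModel` resolve the custom class shipped in this commit. A small sketch of the load-time behavior it implies (repo id as in the README, nothing else assumed):

```python
from transformers import AutoConfig, AutoModel

repo = "MariaOls/RussianDMRecognizer_dual"
cfg = AutoConfig.from_pretrained(repo)
print(cfg.id2label)  # {0: 'O', 1: 'B-DM', 2: 'I-DM'}

# Without trust_remote_code=True, transformers refuses to import
# modeling_dual_head_dm.DualHeadDMModel from the hub repo.
m = AutoModel.from_pretrained(repo, trust_remote_code=True)
print(type(m).__name__)  # DualHeadDMModel
```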
model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3eccb37ffcf391416393720edaabfd8b15d2b30846991435712284d296e47d2
size 713269028
modeling_dual_head_dm.py
ADDED
@@ -0,0 +1,25 @@
import torch.nn as nn
from transformers import PreTrainedModel, BertModel, BertConfig


class DualHeadDMModel(PreTrainedModel):
    """BERT encoder with two heads: per-token BIO tagging and sentence-level DM detection."""

    config_class = BertConfig
    base_model_prefix = "encoder"

    def __init__(self, config, num_token_labels=3, num_seq_labels=2, seq_loss_weight=0.5):
        # Coerce foreign config subclasses to BertConfig *before* PreTrainedModel
        # stores the config, so self.config and the encoder agree.
        if not isinstance(config, BertConfig):
            config = BertConfig.from_dict(config.to_dict())
        super().__init__(config)
        self.hidden_size = config.hidden_size
        self.encoder = BertModel(config)
        self.dropout = nn.Dropout(0.1)
        self.token_classifier = nn.Linear(self.hidden_size, num_token_labels)  # BIO head (O / B-DM / I-DM)
        self.seq_classifier = nn.Linear(self.hidden_size, num_seq_labels)      # sentence-level head
        self.seq_loss_weight = seq_loss_weight  # kept for training-time loss weighting; unused in forward()

    def forward(self, input_ids=None, attention_mask=None, candidate_mask=None, **kwargs):
        # candidate_mask is accepted for dataloader compatibility but unused at inference.
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        H = self.dropout(out.last_hidden_state)               # [B, L, hidden]
        logits_tok = self.token_classifier(H)                 # [B, L, num_token_labels]
        cls = H[:, 0, :]                                      # [CLS] representation
        logits_seq = self.seq_classifier(self.dropout(cls))   # [B, num_seq_labels]
        return {"logits": (logits_tok, logits_seq)}
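As a quick sanity check of the contract above, a sketch that runs a two-sentence batch through the published weights and asserts the shapes the README comment promises (repo id from the README; the sentences are arbitrary):

```python
import torch
from transformers import AutoModel, AutoTokenizer

repo = "MariaOls/RussianDMRecognizer_dual"
model = AutoModel.from_pretrained(repo, trust_remote_code=True)
tok = AutoTokenizer.from_pretrained(repo)

batch = tok(["Итак, начнем.", "Просто текст."], padding=True, return_tensors="pt")
with torch.no_grad():
    logits_tok, logits_seq = model(input_ids=batch["input_ids"],
                                   attention_mask=batch["attention_mask"])["logits"]

B, L = batch["input_ids"].shape
assert logits_tok.shape == (B, L, 3)   # per-token BIO logits
assert logits_seq.shape == (B, 2)      # sentence-level DM logits
```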
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
{
  "cls_token": {
    "content": "[CLS]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "mask_token": {
    "content": "[MASK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "[PAD]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "sep_token": {
    "content": "[SEP]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "[UNK]",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,58 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "[PAD]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "100": {
      "content": "[UNK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "101": {
      "content": "[CLS]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "102": {
      "content": "[SEP]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "103": {
      "content": "[MASK]",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "clean_up_tokenization_spaces": false,
  "cls_token": "[CLS]",
  "do_basic_tokenize": true,
  "do_lower_case": true,
  "extra_special_tokens": {},
  "mask_token": "[MASK]",
  "model_max_length": 1000000000000000019884624838656,
  "never_split": null,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
  "strip_accents": null,
  "tokenize_chinese_chars": true,
  "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
}
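One quirk worth flagging: `model_max_length` above is the library's "unset" sentinel, so this tokenizer will not truncate on its own, while config.json caps `max_position_embeddings` at 512. A minimal sketch of pinning the limit at load time (the 512 comes from config.json; explicit `from_pretrained` kwargs overriding the saved tokenizer_config is standard transformers behavior):

```python
from transformers import AutoTokenizer

# Explicit kwargs override values from tokenizer_config.json.
t = AutoTokenizer.from_pretrained("MariaOls/RussianDMRecognizer_dual", model_max_length=512)
enc = t("очень длинный текст " * 500, truncation=True)
assert len(enc["input_ids"]) <= 512
```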
vocab.txt
ADDED
The diff for this file is too large to render. See raw diff.