permutans committed on
Commit 47ff542 · verified · 1 Parent(s): 84f7e31

Upload folder using huggingface_hub
README.md CHANGED
@@ -2,7 +2,7 @@
  license: mit
  tags:
  - token-classification
- - bert
  - orality
  - linguistics
  - multi-label
@@ -11,7 +11,7 @@ language:
  metrics:
  - f1
  base_model:
- - google-bert/bert-base-uncased
  pipeline_tag: token-classification
  library_name: transformers
  datasets:
@@ -20,7 +20,7 @@ datasets:

  # Havelock Orality Token Classifier

- BERT-based token classifier for detecting **oral and literate markers** in text, based on Walter Ong's "Orality and Literacy" (1982).

  This model performs multi-label span-level detection of 53 rhetorical marker types, where each token independently carries B/I/O labels per type — allowing overlapping spans (e.g. a token that is simultaneously part of a concessive and a nested clause).

@@ -28,13 +28,13 @@ This model performs multi-label span-level detection of 53 rhetorical marker typ

  | Property | Value |
  |----------|-------|
- | Base model | `bert-base-uncased` |
  | Task | Multi-label token classification (independent B/I/O per type) |
  | Marker types | 53 (22 oral, 31 literate) |
- | Test macro F1 | **0.386** (per-type detection, binary positive = B or I) |
- | Training | 20 epochs, batch 24, lr 3e-5, fp16 |
  | Regularization | Mixout (p=0.1) — stochastic L2 anchor to pretrained weights |
- | Loss | Per-type weighted cross-entropy with inverse-frequency type weights |
  | Min examples | 150 (types below this threshold excluded) |

  ## Usage
@@ -118,61 +118,61 @@ Per-type detection F1 on test set (binary: B or I = positive, O = negative):
  ```
  Type Prec Rec F1 Sup
  ========================================================================
- literate_abstract_noun 0.209 0.329 0.255 420
- literate_additive_formal 0.243 0.479 0.322 71
- literate_agent_demoted 0.468 0.664 0.549 414
- literate_agentless_passive 0.555 0.648 0.598 1168
- literate_aside 0.481 0.469 0.475 469
- literate_categorical_statement 0.084 0.263 0.128 118
- literate_causal_explicit 0.314 0.386 0.347 272
- literate_citation 0.468 0.431 0.449 255
- literate_conceptual_metaphor 0.370 0.397 0.383 517
- literate_concessive 0.456 0.503 0.478 533
- literate_concessive_connector 0.250 0.603 0.353 63
- literate_concrete_setting 0.186 0.322 0.236 298
- literate_conditional 0.519 0.548 0.533 1514
- literate_contrastive 0.391 0.462 0.424 424
- literate_cross_reference 0.825 0.316 0.457 253
- literate_definitional_move 0.443 0.432 0.438 236
- literate_enumeration 0.147 0.306 0.198 297
- literate_epistemic_hedge 0.236 0.431 0.305 255
- literate_evidential 0.269 0.472 0.342 106
- literate_institutional_subject 0.157 0.450 0.233 111
- literate_list_structure 0.528 0.614 0.567 295
- literate_metadiscourse 0.355 0.407 0.379 447
- literate_nested_clauses 0.143 0.093 0.113 2044
- literate_nominalization 0.433 0.538 0.480 1013
- literate_objectifying_stance 0.451 0.575 0.506 113
- literate_probability 0.439 0.720 0.545 50
- literate_qualified_assertion 0.186 0.077 0.109 142
- literate_relative_chain 0.344 0.606 0.439 1456
- literate_technical_abbreviation 0.500 0.705 0.585 139
- literate_technical_term 0.278 0.423 0.336 825
- literate_temporal_embedding 0.174 0.253 0.206 400
- oral_anaphora 0.500 0.303 0.377 297
- oral_antithesis 0.298 0.339 0.317 561
- oral_discourse_formula 0.373 0.461 0.413 492
- oral_embodied_action 0.295 0.368 0.327 454
- oral_everyday_example 0.279 0.307 0.293 420
- oral_imperative 0.359 0.600 0.449 110
- oral_inclusive_we 0.579 0.668 0.620 681
- oral_intensifier_doubling 0.429 0.220 0.290 82
- oral_lexical_repetition 0.328 0.382 0.353 275
- oral_named_individual 0.359 0.712 0.478 573
- oral_parallelism 0.111 0.114 0.112 202
- oral_phatic_check 0.288 0.436 0.347 39
- oral_phatic_filler 0.389 0.527 0.448 146
- oral_rhetorical_question 0.581 0.892 0.703 1006
- oral_second_person 0.555 0.528 0.541 718
- oral_self_correction 0.293 0.357 0.322 115
- oral_sensory_detail 0.194 0.402 0.262 246
- oral_simple_conjunction 0.174 0.229 0.198 131
- oral_specific_place 0.453 0.751 0.565 406
- oral_temporal_anchor 0.223 0.704 0.339 257
- oral_tricolon 0.470 0.293 0.361 907
- oral_vocative 0.386 0.942 0.547 52
  ========================================================================
- Macro avg (types w/ support) 0.386
  ```

  </details>
@@ -180,17 +180,17 @@ Macro avg (types w/ support) 0.386
  **Missing labels (test set):** 0/53 — all types detected at least once.

  Notable patterns:
- - **Strong performers** (F1 > 0.5): rhetorical_question (0.703), inclusive_we (0.620), agentless_passive (0.598), technical_abbreviation (0.585), list_structure (0.567), specific_place (0.565), agent_demoted (0.549), vocative (0.547), probability (0.545), second_person (0.541), conditional (0.533), objectifying_stance (0.506)
- - **Weak performers** (F1 < 0.2): qualified_assertion (0.109), parallelism (0.112), nested_clauses (0.113), categorical_statement (0.128), enumeration (0.198), simple_conjunction (0.198)
- - **Precision-recall tradeoff**: Most types show higher recall than precision, indicating the model over-predicts markers. Notable exceptions include `cross_reference` (0.825 precision / 0.316 recall), `anaphora` (0.500 / 0.303), and `tricolon` (0.470 / 0.293), which remain high-precision but low-recall.

  ## Architecture

  Custom `MultiLabelTokenClassifier` with independent B/I/O heads per marker type:
  ```
- BertModel (bert-base-uncased)
  └── Dropout (p=0.1)
- └── Linear (768 → num_types × 3)
  └── Reshape to (batch, seq, num_types, 3)
  ```

@@ -199,13 +199,14 @@ Each marker type gets an independent 3-way O/B/I classification, so a token can
  ### Regularization

  - **Mixout** (p=0.1): During training, each backbone weight element has a 10% chance of being replaced by its pretrained value per forward pass, acting as a stochastic L2 anchor that prevents representation drift (Lee et al., 2020)
  - **Inverse-frequency type weights**: Rare marker types receive higher loss weighting
  - **Inverse-frequency OBI weights**: B and I classes upweighted relative to dominant O class
  - **Weighted random sampling**: Examples containing rarer markers sampled more frequently
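
The Mixout anchor described above can be sketched as a self-contained module. This is a minimal illustration of the idea from Lee et al. (2020), not the repository's training code; `MixoutLinear` is a hypothetical name:

```python
import torch
import torch.nn as nn


class MixoutLinear(nn.Module):
    """Linear layer whose weights are stochastically mixed with a frozen
    pretrained copy during training (Mixout). Illustrative sketch only."""

    def __init__(self, pretrained: nn.Linear, p: float = 0.1):
        super().__init__()
        self.p = p
        self.weight = nn.Parameter(pretrained.weight.detach().clone())
        self.bias = nn.Parameter(pretrained.bias.detach().clone())
        # Frozen anchor copy of the pretrained weights
        self.register_buffer("anchor", pretrained.weight.detach().clone())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w = self.weight
        if self.training and self.p > 0:
            # Each weight element reverts to its pretrained value with
            # probability p, then is rescaled (the dropout-style correction
            # from the Mixout paper) so the expectation stays at w.
            mask = torch.rand_like(w) < self.p
            w = torch.where(mask, self.anchor, w)
            w = (w - self.p * self.anchor) / (1.0 - self.p)
        return nn.functional.linear(x, w, self.bias)
```

In eval mode the layer behaves exactly like the underlying `nn.Linear`; the stochastic anchoring only applies during training.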

  ### Initialization

- Fine-tuned from `bert-base-uncased`. Backbone linear layers wrapped with Mixout during training (frozen pretrained copy used as anchor). The classification head is randomly initialized:
  ```
  backbone.* layers → loaded from pretrained, anchored via Mixout
  classifier.weight → randomly initialized
@@ -214,9 +215,8 @@ classifier.bias → randomly initialized

  ## Limitations

- - **Recall-dominated errors**: Most types over-predict (recall > precision), producing false positives; downstream applications may need confidence thresholding
- - **Near-zero recall types**: `qualified_assertion` (0.077 recall), `nested_clauses` (0.093), and `parallelism` (0.114) are rarely detected despite being present in training data
- - **Low-precision types**: `categorical_statement` (0.084), `parallelism` (0.111), and `nested_clauses` (0.143) have precision below 0.15, meaning most predictions for those types are false positives
  - **Context window**: 128 tokens max; longer spans may be truncated
  - **Domain**: Trained primarily on historical/literary texts; may underperform on modern social media
  - **Subjectivity**: Some marker boundaries are inherently ambiguous
@@ -235,6 +235,7 @@ classifier.bias → randomly initialized

  - Ong, Walter J. *Orality and Literacy: The Technologizing of the Word*. Routledge, 1982.
  - Lee, C. et al. "Mixout: Effective Regularization to Finetune Large-scale Pretrained Language Models." ICLR 2020.

  ---

  license: mit
  tags:
  - token-classification
+ - modernbert
  - orality
  - linguistics
  - multi-label

  metrics:
  - f1
  base_model:
+ - answerdotai/ModernBERT-base
  pipeline_tag: token-classification
  library_name: transformers
  datasets:

  # Havelock Orality Token Classifier

+ ModernBERT-based token classifier for detecting **oral and literate markers** in text, based on Walter Ong's "Orality and Literacy" (1982).

  This model performs multi-label span-level detection of 53 rhetorical marker types, where each token independently carries B/I/O labels per type — allowing overlapping spans (e.g. a token that is simultaneously part of a concessive and a nested clause).
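
The overlapping-span labeling can be made concrete with a toy tensor. This is an illustration only; the O/B/I integer encoding is an assumption:

```python
import torch

# Assumed label encoding per type: O = 0, B = 1, I = 2.
O, B, I = 0, 1, 2
types = ["concessive", "nested_clauses"]

# labels[token, type]: each column is an independent B/I/O sequence,
# so one token can sit inside two different spans at once.
labels = torch.tensor([
    [B, O],   # token 0 opens a concessive span
    [I, B],   # token 1 continues it AND opens a nested clause
    [I, I],   # token 2 is inside both spans simultaneously
    [O, I],   # token 3 only continues the nested clause
])
```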

  | Property | Value |
  |----------|-------|
+ | Base model | `answerdotai/ModernBERT-base` |
  | Task | Multi-label token classification (independent B/I/O per type) |
  | Marker types | 53 (22 oral, 31 literate) |
+ | Test macro F1 | **0.378** (per-type detection, binary positive = B or I) |
+ | Training | 20 epochs, fp16 |
  | Regularization | Mixout (p=0.1) — stochastic L2 anchor to pretrained weights |
+ | Loss | Per-type focal loss (γ=2.0) with inverse-frequency OBI and type weights |
  | Min examples | 150 (types below this threshold excluded) |

  ## Usage
 
  ```
  Type Prec Rec F1 Sup
  ========================================================================
+ literate_abstract_noun 0.190 0.325 0.240 381
+ literate_additive_formal 0.246 0.556 0.341 27
+ literate_agent_demoted 0.404 0.368 0.386 304
+ literate_agentless_passive 0.575 0.607 0.591 1133
+ literate_aside 0.379 0.429 0.403 436
+ literate_categorical_statement 0.267 0.146 0.189 514
+ literate_causal_explicit 0.227 0.279 0.251 190
+ literate_citation 0.639 0.556 0.595 372
+ literate_conceptual_metaphor 0.310 0.364 0.335 415
+ literate_concessive 0.499 0.470 0.484 502
+ literate_concessive_connector 0.455 0.408 0.430 49
+ literate_concrete_setting 0.241 0.125 0.165 407
+ literate_conditional 0.369 0.630 0.466 760
+ literate_contrastive 0.310 0.428 0.360 341
+ literate_cross_reference 0.386 0.524 0.444 42
+ literate_definitional_move 0.395 0.185 0.252 81
+ literate_enumeration 0.495 0.483 0.489 775
+ literate_epistemic_hedge 0.421 0.481 0.449 445
+ literate_evidential 0.625 0.360 0.457 472
+ literate_institutional_subject 0.332 0.326 0.329 282
+ literate_list_structure 0.338 0.523 0.411 86
+ literate_metadiscourse 0.140 0.393 0.206 135
+ literate_nested_clauses 0.091 0.246 0.133 1169
+ literate_nominalization 0.499 0.612 0.549 991
+ literate_objectifying_stance 0.635 0.365 0.464 167
+ literate_probability 0.432 0.593 0.500 27
+ literate_qualified_assertion 0.143 0.100 0.118 40
+ literate_relative_chain 0.382 0.507 0.436 1424
+ literate_technical_abbreviation 0.667 0.711 0.688 225
+ literate_technical_term 0.280 0.375 0.321 715
+ literate_temporal_embedding 0.228 0.259 0.242 526
+ oral_anaphora 0.800 0.028 0.054 287
+ oral_antithesis 0.249 0.238 0.243 412
+ oral_discourse_formula 0.340 0.408 0.371 557
+ oral_embodied_action 0.280 0.391 0.326 425
+ oral_everyday_example 0.333 0.156 0.212 404
+ oral_imperative 0.591 0.662 0.625 293
+ oral_inclusive_we 0.516 0.632 0.568 622
+ oral_intensifier_doubling 0.680 0.200 0.309 85
+ oral_lexical_repetition 0.404 0.254 0.312 173
+ oral_named_individual 0.441 0.749 0.556 770
+ oral_parallelism 0.741 0.110 0.191 182
+ oral_phatic_check 0.611 0.733 0.667 30
+ oral_phatic_filler 0.174 0.409 0.244 93
+ oral_rhetorical_question 0.509 0.692 0.586 905
+ oral_second_person 0.576 0.552 0.564 811
+ oral_self_correction 0.158 0.235 0.189 51
+ oral_sensory_detail 0.285 0.169 0.212 461
+ oral_simple_conjunction 0.179 0.102 0.130 98
+ oral_specific_place 0.556 0.705 0.622 424
+ oral_temporal_anchor 0.410 0.559 0.473 546
+ oral_tricolon 0.299 0.119 0.171 553
+ oral_vocative 0.652 0.747 0.696 158
  ========================================================================
+ Macro avg (types w/ support) 0.378
  ```

  </details>
 
  **Missing labels (test set):** 0/53 — all types detected at least once.

  Notable patterns:
+ - **Strong performers** (F1 > 0.5): vocative (0.696), technical_abbreviation (0.688), phatic_check (0.667), imperative (0.625), specific_place (0.622), citation (0.595), agentless_passive (0.591), rhetorical_question (0.586), inclusive_we (0.568), second_person (0.564), named_individual (0.556), nominalization (0.549), probability (0.500)
+ - **Weak performers** (F1 < 0.2): anaphora (0.054), qualified_assertion (0.118), simple_conjunction (0.130), nested_clauses (0.133), concrete_setting (0.165), tricolon (0.171), categorical_statement (0.189), self_correction (0.189), parallelism (0.191)
+ - **Precision-recall tradeoff**: Most types show balanced precision/recall. Notable exceptions include `anaphora` (0.800 precision / 0.028 recall), `parallelism` (0.741 / 0.110), and `intensifier_doubling` (0.680 / 0.200), which remain high-precision but very low-recall.
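
Given the high-precision/low-recall types above, downstream users may want to threshold per-type span probabilities rather than take raw argmax decisions. A minimal sketch under assumed shapes and an assumed threshold value (not part of the released model):

```python
import torch
import torch.nn.functional as F

def spans_above_threshold(logits: torch.Tensor, threshold: float = 0.5) -> torch.Tensor:
    """Mark a token as inside a span for a type only when the combined
    B+I probability clears the threshold.

    logits: (batch, seq, num_types, 3) with O/B/I in the last dimension.
    Returns a boolean mask of shape (batch, seq, num_types).
    """
    probs = F.softmax(logits, dim=-1)
    bi_prob = probs[..., 1] + probs[..., 2]   # P(B) + P(I) per type
    return bi_prob > threshold
```

Raising the threshold trades recall for precision on a per-type basis, which is useful for the over-predicting types.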

  ## Architecture

  Custom `MultiLabelTokenClassifier` with independent B/I/O heads per marker type:
  ```
+ ModernBERT (answerdotai/ModernBERT-base)
  └── Dropout (p=0.1)
+ └── Linear (hidden_size → num_types × 3)
  └── Reshape to (batch, seq, num_types, 3)
  ```
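
The head above is just a linear projection reshaped into per-type logits. A minimal sketch with the hidden size and type count from the table (random tensors stand in for the ModernBERT backbone output):

```python
import torch
import torch.nn as nn

num_types, hidden_size = 53, 768

dropout = nn.Dropout(p=0.1)
classifier = nn.Linear(hidden_size, num_types * 3)  # O/B/I logits per type

# Stand-in for the backbone's last hidden state: (batch, seq, hidden_size)
hidden = torch.randn(2, 16, hidden_size)
logits = classifier(dropout(hidden))
logits = logits.view(2, 16, num_types, 3)  # (batch, seq, num_types, 3)

# Each type is an independent 3-way decision, so per-type argmax
# can yield overlapping spans across types.
tags = logits.argmax(dim=-1)  # (batch, seq, num_types)
```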

  ### Regularization

  - **Mixout** (p=0.1): During training, each backbone weight element has a 10% chance of being replaced by its pretrained value per forward pass, acting as a stochastic L2 anchor that prevents representation drift (Lee et al., 2020)
+ - **Per-type focal loss** (γ=2.0): Focuses learning on hard examples, reducing the contribution of easy negatives
  - **Inverse-frequency type weights**: Rare marker types receive higher loss weighting
  - **Inverse-frequency OBI weights**: B and I classes upweighted relative to dominant O class
  - **Weighted random sampling**: Examples containing rarer markers sampled more frequently
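
A per-type focal loss of the kind listed above can be sketched as follows. This is a minimal illustration under assumed shapes and label encoding; the repository's exact weighting scheme may differ:

```python
import torch
import torch.nn.functional as F

def per_type_focal_loss(logits, labels, type_weights, obi_weights, gamma=2.0):
    """Focal cross-entropy over (batch, seq, num_types, 3) logits.

    labels: (batch, seq, num_types) with assumed encoding O=0, B=1, I=2.
    type_weights: (num_types,) inverse-frequency weights per marker type.
    obi_weights: (3,) weights upweighting B/I relative to the dominant O.
    """
    batch, seq, num_types, _ = logits.shape
    flat_logits = logits.reshape(-1, 3)
    flat_labels = labels.reshape(-1)
    log_p = F.log_softmax(flat_logits, dim=-1)
    log_pt = log_p.gather(1, flat_labels.unsqueeze(1)).squeeze(1)
    pt = log_pt.exp()
    # Focal term down-weights easy examples (pt near 1)
    focal = (1.0 - pt) ** gamma * (-log_pt)
    focal = focal * obi_weights[flat_labels]
    # Flattening puts the type index fastest, so per-type weights tile cleanly
    tw = type_weights.repeat(batch * seq)
    return (focal * tw).mean()
```

With γ=0 and unit weights this reduces to plain mean cross-entropy, which makes the focal term easy to sanity-check.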

  ### Initialization

+ Fine-tuned from `answerdotai/ModernBERT-base`. Backbone linear layers wrapped with Mixout during training (frozen pretrained copy used as anchor). The classification head is randomly initialized:
  ```
  backbone.* layers → loaded from pretrained, anchored via Mixout
  classifier.weight → randomly initialized

  ## Limitations

+ - **Near-zero recall types**: `anaphora` (0.028 recall), `simple_conjunction` (0.102), `parallelism` (0.110), and `tricolon` (0.119) are rarely detected despite being present in training data
+ - **Low-precision types**: `nested_clauses` (0.091), `metadiscourse` (0.140), and `qualified_assertion` (0.143) have precision below 0.15, meaning most predictions for those types are false positives
  - **Context window**: 128 tokens max; longer spans may be truncated
  - **Domain**: Trained primarily on historical/literary texts; may underperform on modern social media
  - **Subjectivity**: Some marker boundaries are inherently ambiguous

  - Ong, Walter J. *Orality and Literacy: The Technologizing of the Word*. Routledge, 1982.
  - Lee, C. et al. "Mixout: Effective Regularization to Finetune Large-scale Pretrained Language Models." ICLR 2020.
+ - Warner, A. et al. "Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference." 2024.

  ---

config.json CHANGED
@@ -1,19 +1,26 @@
  {
- "add_cross_attention": false,
  "architectures": [
- "BertForMaskedLM"
  ],
- "attention_probs_dropout_prob": 0.1,
  "auto_map": {
  "AutoModel": "modeling_havelock.HavelockTokenClassifier"
  },
- "bos_token_id": null,
- "classifier_dropout": null,
  "dtype": "float32",
- "eos_token_id": null,
  "gradient_checkpointing": false,
- "hidden_act": "gelu",
- "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
  "0": "O-literate_abstract_noun",
@@ -176,9 +183,9 @@
  "98": "I-oral_antithesis",
  "99": "O-oral_discourse_formula"
  },
  "initializer_range": 0.02,
- "intermediate_size": 3072,
- "is_decoder": false,
  "label2id": {
  "B-literate_abstract_noun": 1,
  "B-literate_additive_formal": 4,
@@ -340,18 +347,59 @@
  "O-oral_tricolon": 153,
  "O-oral_vocative": 156
  },
- "layer_norm_eps": 1e-12,
- "max_position_embeddings": 512,
- "model_type": "bert",
  "num_attention_heads": 12,
- "num_hidden_layers": 12,
  "num_types": 53,
- "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "tie_word_embeddings": true,
  "transformers_version": "5.0.0",
- "type_vocab_size": 2,
- "use_cache": true,
  "use_crf": true,
- "vocab_size": 30522
  }

  {
  "architectures": [
+ "ModernBertForMaskedLM"
  ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
  "auto_map": {
  "AutoModel": "modeling_havelock.HavelockTokenClassifier"
  },
+ "bos_token_id": 50281,
+ "classifier_activation": "gelu",
+ "classifier_bias": false,
+ "classifier_dropout": 0.0,
+ "classifier_pooling": "mean",
+ "cls_token_id": 50281,
+ "decoder_bias": true,
+ "deterministic_flash_attn": false,
  "dtype": "float32",
+ "embedding_dropout": 0.0,
+ "eos_token_id": 50282,
+ "global_attn_every_n_layers": 3,
  "gradient_checkpointing": false,
+ "hidden_activation": "gelu",
  "hidden_size": 768,
  "id2label": {
  "0": "O-literate_abstract_noun",
  "98": "I-oral_antithesis",
  "99": "O-oral_discourse_formula"
  },
+ "initializer_cutoff_factor": 2.0,
  "initializer_range": 0.02,
+ "intermediate_size": 1152,
  "label2id": {
  "B-literate_abstract_noun": 1,
  "B-literate_additive_formal": 4,
  "O-oral_tricolon": 153,
  "O-oral_vocative": 156
  },
+ "layer_norm_eps": 1e-05,
+ "layer_types": [
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention",
+ "sliding_attention",
+ "sliding_attention",
+ "full_attention"
+ ],
+ "local_attention": 128,
+ "max_position_embeddings": 8192,
+ "mlp_bias": false,
+ "mlp_dropout": 0.0,
+ "model_type": "modernbert",
+ "norm_bias": false,
+ "norm_eps": 1e-05,
  "num_attention_heads": 12,
+ "num_hidden_layers": 22,
  "num_types": 53,
+ "pad_token_id": 50283,
  "position_embedding_type": "absolute",
+ "repad_logits_with_grad": false,
+ "rope_parameters": {
+ "full_attention": {
+ "rope_theta": 160000.0,
+ "rope_type": "default"
+ },
+ "sliding_attention": {
+ "rope_theta": 10000.0,
+ "rope_type": "default"
+ }
+ },
+ "sep_token_id": 50282,
+ "sparse_pred_ignore_index": -100,
+ "sparse_prediction": false,
  "tie_word_embeddings": true,
  "transformers_version": "5.0.0",
  "use_crf": true,
+ "vocab_size": 50368
  }
head_config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "model_name": "bert-base-uncased",
  "num_types": 53,
  "hidden_size": 768
  }

  {
+ "model_name": "answerdotai/ModernBERT-base",
  "num_types": 53,
  "hidden_size": 768
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:37d9c74b122fa304421948d1f1bc5ad1d686fb33eab36ae82079c1e8f4a03282
- size 436082548

  version https://git-lfs.github.com/spec/v1
+ oid sha256:5048514ce9b2156eb090a211c30847979362fe5372cb94865373a00b5970726d
+ size 596563588
modeling_havelock.py CHANGED
@@ -1,143 +1,81 @@
- """Custom multi-label token classifier for HuggingFace Hub."""

  import torch
  import torch.nn as nn
- from transformers import BertModel, BertPreTrainedModel


- class MultiLabelCRF(nn.Module):
-     """Independent CRF per marker type for multi-label BIO tagging."""
-
-     def __init__(self, num_types: int) -> None:
-         super().__init__()
          self.num_types = num_types
-         self.transitions = nn.Parameter(torch.empty(num_types, 3, 3))
-         self.start_transitions = nn.Parameter(torch.empty(num_types, 3))
-         self.end_transitions = nn.Parameter(torch.empty(num_types, 3))
-         # Placeholder — will be overwritten by loaded weights if present
-         self.register_buffer("emission_bias", torch.zeros(1, 1, 1, 3))
-         self._reset_parameters()
-
-     def _reset_parameters(self) -> None:
-         nn.init.uniform_(self.transitions, -0.1, 0.1)
-         nn.init.uniform_(self.start_transitions, -0.1, 0.1)
-         nn.init.uniform_(self.end_transitions, -0.1, 0.1)
-         with torch.no_grad():
-             self.transitions.data[:, 0, 2] = -10000.0
-             self.start_transitions.data[:, 2] = -10000.0
-
-     def _apply_emission_bias(self, emissions: torch.Tensor) -> torch.Tensor:
-         if self.emission_bias is not None:
-             return emissions + self.emission_bias
-         return emissions
-
-     def decode(self, emissions: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
-         """Viterbi decoding.
-
-         Args:
-             emissions: (batch, seq, num_types, 3)
-             mask: (batch, seq) boolean
-
-         Returns: (batch, seq, num_types) best tag sequences
-         """
-         # Apply emission bias before decoding
-         emissions = self._apply_emission_bias(emissions)
-
-         batch, seq, num_types, _ = emissions.shape
-
-         # Reshape to (batch*num_types, seq, 3)
-         em = emissions.permute(0, 2, 1, 3).reshape(batch * num_types, seq, 3)
-         mk = mask.unsqueeze(1).expand(-1, num_types, -1).reshape(batch * num_types, seq)
-
-         BT = batch * num_types
-
-         # Expand params across batch
-         trans = (
-             self.transitions.unsqueeze(0).expand(batch, -1, -1, -1).reshape(BT, 3, 3)
-         )
-         start = self.start_transitions.unsqueeze(0).expand(batch, -1, -1).reshape(BT, 3)
-         end = self.end_transitions.unsqueeze(0).expand(batch, -1, -1).reshape(BT, 3)
-
-         arange = torch.arange(BT, device=em.device)
-         score = start + em[:, 0]
-         history: list[torch.Tensor] = []
-
-         for i in range(1, seq):
-             broadcast = score.unsqueeze(2) + trans + em[:, i].unsqueeze(1)
-             best_score, best_prev = broadcast.max(dim=1)
-             score = torch.where(mk[:, i].unsqueeze(1), best_score, score)
-             history.append(best_prev)
-
-         score = score + end
-         _, best_last = score.max(dim=1)
-
-         best_paths = torch.zeros(BT, seq, dtype=torch.long, device=em.device)
-         seq_lengths = mk.sum(dim=1).long()
-         best_paths[arange, seq_lengths - 1] = best_last
-
-         for i in range(seq - 2, -1, -1):
-             prev_tag = history[i][arange, best_paths[:, i + 1]]
-             should_update = i < (seq_lengths - 1)
-             best_paths[:, i] = torch.where(should_update, prev_tag, best_paths[:, i])
-
-         return best_paths.reshape(batch, num_types, seq).permute(0, 2, 1)
-
-
- class HavelockTokenClassifier(BertPreTrainedModel):
-     """Multi-label BIO token classifier with independent O/B/I heads per marker type.
-
-     Each token gets num_types independent 3-way classifications, allowing
-     overlapping spans (e.g. a token simultaneously B-anaphora and I-concessive).
-
-     Output logits shape: (batch, seq_len, num_types, 3)
-     """
-
-     def __init__(self, config):
          super().__init__(config)
          self.num_types = config.num_types
-         self.use_crf = getattr(config, "use_crf", False)
-         self.bert = BertModel(config, add_pooling_layer=False)
          self.dropout = nn.Dropout(config.hidden_dropout_prob)
          self.classifier = nn.Linear(config.hidden_size, config.num_types * 3)

          if self.use_crf:
              self.crf = MultiLabelCRF(config.num_types)

-         self.post_init()

      def forward(self, input_ids, attention_mask=None, **kwargs):
-         hidden = self.bert(
              input_ids=input_ids, attention_mask=attention_mask
          ).last_hidden_state
          hidden = self.dropout(hidden)
          logits = self.classifier(hidden)
          batch, seq, _ = logits.shape
-         logits = logits.view(batch, seq, self.num_types, 3)
-
-         # If CRF is available and we're not training, return decoded tags
-         # stacked with logits so callers can access either
-         if self.use_crf and not self.training:
-             mask = (
-                 attention_mask.bool()
-                 if attention_mask is not None
-                 else torch.ones(batch, seq, dtype=torch.bool, device=logits.device)
-             )
-             # Return logits — callers use .decode() or we add a decode method
-             # For HF pipeline compat, return logits; users call decode separately
-             pass
-
-         return logits

      def decode(self, input_ids, attention_mask=None):
-         """Run forward pass and return Viterbi-decoded tags."""
          logits = self.forward(input_ids, attention_mask)
          if self.use_crf:
              mask = (
                  attention_mask.bool()
                  if attention_mask is not None
-                 else torch.ones(
-                     logits.shape[:2], dtype=torch.bool, device=logits.device
-                 )
              )
              return self.crf.decode(logits, mask)
-         return logits.argmax(dim=-1)
+ """Custom multi-label token classifier, backbone-agnostic."""

  import torch
  import torch.nn as nn
+ from transformers import AutoConfig, AutoModel, PreTrainedModel, PretrainedConfig


+ class HavelockTokenConfig(PretrainedConfig):
+     """Config that wraps any backbone config + our custom fields."""
+     model_type = "havelock_token_classifier"

+     def __init__(self, num_types: int = 1, use_crf: bool = False, **kwargs):
+         super().__init__(**kwargs)
          self.num_types = num_types
+         self.use_crf = use_crf


+ class HavelockTokenClassifier(PreTrainedModel):
+     config_class = HavelockTokenConfig

+     def __init__(self, config: HavelockTokenConfig, backbone: PreTrainedModel | None = None):
          super().__init__(config)
          self.num_types = config.num_types
+         self.use_crf = config.use_crf
+
+         # Accept injected backbone (from_pretrained path) or build from config
+         if backbone is not None:
+             self.backbone = backbone
+         else:
+             self.backbone = AutoModel.from_config(config)
+
          self.dropout = nn.Dropout(config.hidden_dropout_prob)
          self.classifier = nn.Linear(config.hidden_size, config.num_types * 3)

          if self.use_crf:
              self.crf = MultiLabelCRF(config.num_types)

+     @classmethod
+     def from_backbone(
+         cls,
+         model_name: str,
+         num_types: int,
+         use_crf: bool = False,
+         obi_bias: torch.Tensor | None = None,
+     ) -> "HavelockTokenClassifier":
+         """Build from a pretrained backbone name — the training entrypoint."""
+         backbone = AutoModel.from_pretrained(model_name)
+         backbone_config = backbone.config
+
+         config = HavelockTokenConfig(
+             num_types=num_types,
+             use_crf=use_crf,
+             **backbone_config.to_dict(),
+         )
+
+         model = cls(config, backbone=backbone)
+
+         if use_crf and obi_bias is not None:
+             model.crf.emission_bias = obi_bias.reshape(1, 1, 1, 3)
+
+         return model

      def forward(self, input_ids, attention_mask=None, **kwargs):
+         hidden = self.backbone(
              input_ids=input_ids, attention_mask=attention_mask
          ).last_hidden_state
          hidden = self.dropout(hidden)
          logits = self.classifier(hidden)
          batch, seq, _ = logits.shape
+         return logits.view(batch, seq, self.num_types, 3)

      def decode(self, input_ids, attention_mask=None):
          logits = self.forward(input_ids, attention_mask)
          if self.use_crf:
              mask = (
                  attention_mask.bool()
                  if attention_mask is not None
+                 else torch.ones(logits.shape[:2], dtype=torch.bool, device=logits.device)
              )
              return self.crf.decode(logits, mask)
+         return logits.argmax(dim=-1)
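
The `MultiLabelCRF.decode` method shown in this file batches Viterbi over every (example, type) pair at once; the recurrence is easier to follow on a single sequence. A minimal reference sketch (not the repository's code) using the same 3-tag layout, with the O→I transition blocked as in `_reset_parameters`:

```python
import torch

def viterbi_single(emissions, transitions, start, end):
    """Reference Viterbi for one sequence of 3-tag (O/B/I) emissions.

    emissions: (seq, 3); transitions[i, j]: score of moving tag i -> j.
    Returns the highest-scoring tag sequence as a list of ints.
    """
    seq = emissions.shape[0]
    score = start + emissions[0]          # (3,) best score ending in each tag
    backptr = []
    for t in range(1, seq):
        # score[i] + transitions[i, j] + emissions[t, j], maximized over i
        total = score.unsqueeze(1) + transitions + emissions[t]
        score, best_prev = total.max(dim=0)
        backptr.append(best_prev)
    score = score + end
    best = int(score.argmax())
    path = [best]
    for best_prev in reversed(backptr):   # follow back-pointers
        best = int(best_prev[best])
        path.append(best)
    return path[::-1]
```

With O→I and start-at-I penalized by -10000 (as the model's `_reset_parameters` does), the decoder can never open a span on an I tag.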
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,14 +1,16 @@
  {
  "backend": "tokenizers",
  "cls_token": "[CLS]",
- "do_lower_case": true,
  "is_local": false,
  "mask_token": "[MASK]",
- "model_max_length": 512,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
- "strip_accents": null,
- "tokenize_chinese_chars": true,
- "tokenizer_class": "BertTokenizer",
  "unk_token": "[UNK]"
  }

  {
  "backend": "tokenizers",
+ "clean_up_tokenization_spaces": true,
  "cls_token": "[CLS]",
  "is_local": false,
  "mask_token": "[MASK]",
+ "model_input_names": [
+ "input_ids",
+ "attention_mask"
+ ],
+ "model_max_length": 8192,
  "pad_token": "[PAD]",
  "sep_token": "[SEP]",
+ "tokenizer_class": "TokenizersBackend",
  "unk_token": "[UNK]"
  }