permutans committed on
Commit
17f1925
·
verified ·
1 Parent(s): 4becc12

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +14 -9
  2. config.json +4 -0
  3. modeling_havelock.py +32 -0
README.md CHANGED
@@ -38,26 +38,29 @@ This model performs multi-label span-level detection of 53 rhetorical marker typ
38
  | Min examples | 150 (types below this threshold excluded) |
39
 
40
  ## Usage
 
41
  ```python
42
  import json
43
  import torch
44
- from transformers import AutoTokenizer
45
- from estimators.tokens.model import MultiLabelTokenClassifier
46
 
47
- model_path = "models/bert_token_classifier"
48
- tokenizer = AutoTokenizer.from_pretrained(model_path)
49
- model = MultiLabelTokenClassifier.load(model_path, device="cpu")
50
  model.eval()
51
 
52
- type_to_idx = json.loads((model_path / "type_to_idx.json").read_text())
 
 
53
  idx_to_type = {v: k for k, v in type_to_idx.items()}
54
 
55
  text = "Tell me, O Muse, of that ingenious hero who travelled far and wide"
56
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
57
 
58
  with torch.no_grad():
59
- logits = model(inputs["input_ids"], inputs["attention_mask"])
60
- preds = logits.argmax(dim=-1) # (1, seq, num_types)
61
 
62
  tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
63
  for i, token in enumerate(tokens):
@@ -70,6 +73,8 @@ for i, token in enumerate(tokens):
70
  print(f"{token:15} {', '.join(active)}")
71
  ```
72
 
 
 
73
  ## Training Data
74
 
75
  - Sources: Project Gutenberg, textfiles.com, Reddit, Wikipedia talk pages
@@ -233,4 +238,4 @@ classifier.bias → randomly initialized
233
 
234
  ---
235
 
236
- *Trained: February 2026*
 
38
  | Min examples | 150 (types below this threshold excluded) |
39
 
40
  ## Usage
41
+
42
  ```python
43
  import json
44
  import torch
45
+ from transformers import AutoModel, AutoTokenizer
46
+ from huggingface_hub import hf_hub_download
47
 
48
+ model_name = "HavelockAI/bert-token-classifier"
49
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
50
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
51
  model.eval()
52
 
53
+ # Load marker type map
54
+ type_map_path = hf_hub_download(model_name, "type_to_idx.json")
55
+ with open(type_map_path) as f:
+     type_to_idx = json.load(f)
56
  idx_to_type = {v: k for k, v in type_to_idx.items()}
57
 
58
  text = "Tell me, O Muse, of that ingenious hero who travelled far and wide"
59
  inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
60
 
61
  with torch.no_grad():
62
+ logits = model(**inputs) # (1, seq_len, num_types, 3)
63
+ preds = logits.argmax(dim=-1) # (1, seq_len, num_types)
64
 
65
  tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
66
  for i, token in enumerate(tokens):
 
73
  print(f"{token:15} {', '.join(active)}")
74
  ```
75
 
76
+ > **Note:** This model uses a custom architecture (`HavelockTokenClassifier`) with independent B/I/O heads per marker type, enabling overlapping span detection. Loading requires `trust_remote_code=True`.
77
+
78
  ## Training Data
79
 
80
  - Sources: Project Gutenberg, textfiles.com, Reddit, Wikipedia talk pages
 
238
 
239
  ---
240
 
241
+ *Trained: February 2026*
config.json CHANGED
@@ -349,5 +349,9 @@
349
  "O-oral_vocative": 156,
350
  "B-oral_vocative": 157,
351
  "I-oral_vocative": 158
 
 
 
 
352
  }
353
  }
 
349
  "O-oral_vocative": 156,
350
  "B-oral_vocative": 157,
351
  "I-oral_vocative": 158
352
+ },
353
+ "num_types": 53,
354
+ "auto_map": {
355
+ "AutoModel": "modeling_havelock.HavelockTokenClassifier"
356
  }
357
  }
modeling_havelock.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Custom multi-label token classifier for HuggingFace Hub."""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import BertPreTrainedModel, AutoModel
6
+
7
+
8
class HavelockTokenClassifier(BertPreTrainedModel):
    """Token classifier emitting an independent O/B/I decision per marker type.

    Rather than one softmax over a flat BIO label set, every token receives
    ``num_types`` separate 3-way (O/B/I) classifications, which lets spans of
    different marker types overlap on the same token (e.g. a token that is
    simultaneously B-anaphora and I-concessive).

    Forward output logits shape: ``(batch, seq_len, num_types, 3)``.
    """

    def __init__(self, config):
        """Build the backbone encoder and the fused per-type O/B/I head.

        Args:
            config: HF model config; must supply ``num_types``,
                ``hidden_size`` and ``hidden_dropout_prob``.
        """
        super().__init__(config)
        self.num_types = config.num_types
        # Backbone is resolved from the config's model type (BertConfig -> BertModel).
        self.bert = AutoModel.from_config(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Single fused projection: 3 logits (O/B/I) for each of num_types heads.
        self.classifier = nn.Linear(config.hidden_size, config.num_types * 3)
        self.post_init()

    def forward(self, input_ids, attention_mask=None, **kwargs):
        """Return per-token, per-type O/B/I logits, shape (batch, seq, num_types, 3)."""
        encoder_out = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        )
        states = self.dropout(encoder_out.last_hidden_state)
        flat_logits = self.classifier(states)
        n_batch, n_seq = flat_logits.shape[0], flat_logits.shape[1]
        # Unfold the fused head dimension into an explicit per-type 3-way axis.
        return flat_logits.view(n_batch, n_seq, self.num_types, 3)