DiegoRossini committed on
Commit
e308c1a
·
verified ·
1 Parent(s): f13ec27

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +23 -107
  2. config.json +13 -0
  3. configuration_mwe.py +21 -0
  4. modeling_mwe.py +43 -0
README.md CHANGED
@@ -30,128 +30,44 @@ DeBERTa-v3-large fine-tuned for multiword expression identification using binary
30
  ## Usage
31
 
32
  ```python
 
33
  import torch
34
- import spacy
35
- from transformers import AutoTokenizer, AutoModel
36
- from torch import nn
37
- from safetensors.torch import load_file
38
-
39
- # Model definition (required)
40
- class StartEndInsideModel(nn.Module):
41
- def __init__(self, model_name="microsoft/deberta-v3-large", dropout=0.3):
42
- super().__init__()
43
- self.encoder = AutoModel.from_pretrained(model_name)
44
- h = self.encoder.config.hidden_size
45
- self.drop = nn.Dropout(dropout)
46
- self.layer_norm = nn.LayerNorm(h)
47
- self.chunk_emb = nn.Embedding(2, 16)
48
- self.fc = nn.Linear(h, h // 2)
49
- self.head_start = nn.Linear(h // 2 + 16, 1)
50
- self.head_end = nn.Linear(h // 2 + 16, 1)
51
- self.head_inside = nn.Linear(h // 2 + 16, 1)
52
-
53
- def forward(self, input_ids, attention_mask, chunk_feat):
54
- out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
55
- x = out.last_hidden_state
56
- x = self.layer_norm(x)
57
- x = self.drop(x)
58
- h = torch.relu(self.fc(x))
59
- h = self.drop(h)
60
- x_cat = torch.cat([h, self.chunk_emb(chunk_feat)], dim=-1)
61
- return (torch.sigmoid(self.head_start(x_cat)).squeeze(-1),
62
- torch.sigmoid(self.head_end(x_cat)).squeeze(-1),
63
- torch.sigmoid(self.head_inside(x_cat)).squeeze(-1))
64
 
65
  # Load model
66
- device = "cuda" if torch.cuda.is_available() else "cpu"
67
- model = StartEndInsideModel().to(device)
68
- model.load_state_dict(load_file("model.safetensors", device=device))
 
69
  model.eval()
70
 
71
- # Load tokenizer and spaCy
72
- tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
73
- nlp = spacy.load("en_core_web_lg")
74
-
75
- # Inference function
76
- def detect_mwes(text, thresholds=(0.5, 0.6, 0.2)):
77
- doc = nlp(text)
78
- words = [t.text for t in doc]
79
- chunk_tags = ["O"] * len(doc)
80
- for chunk in doc.noun_chunks:
81
- for i in range(chunk.start, chunk.end):
82
- chunk_tags[i] = "NP"
83
-
84
- enc = tokenizer(words, is_split_into_words=True, truncation=True,
85
- max_length=256, add_special_tokens=True)
86
-
87
- input_ids = torch.tensor(enc["input_ids"]).unsqueeze(0).to(device)
88
- attention_mask = torch.tensor(enc["attention_mask"]).unsqueeze(0).to(device)
89
-
90
- chunk_map = {"O": 0, "NP": 1}
91
- chunk_feat = torch.zeros(len(enc["input_ids"]), dtype=torch.long)
92
- for ti, wid in enumerate(enc.word_ids()):
93
- if wid is not None:
94
- chunk_feat[ti] = chunk_map[chunk_tags[wid]]
95
- chunk_feat = chunk_feat.unsqueeze(0).to(device)
96
-
97
- with torch.no_grad():
98
- start_p, end_p, inside_p = model(input_ids, attention_mask, chunk_feat)
99
-
100
- # Map back to words
101
- scores = {"start": [0.0]*len(words), "end": [0.0]*len(words), "inside": [0.0]*len(words)}
102
- used = set()
103
- for ti, wid in enumerate(enc.word_ids()):
104
- if wid is not None and wid not in used:
105
- scores["start"][wid] = float(start_p[0, ti])
106
- scores["end"][wid] = float(end_p[0, ti])
107
- scores["inside"][wid] = float(inside_p[0, ti])
108
- used.add(wid)
109
-
110
- # Reconstruct MWEs
111
- th_s, th_e, th_i = thresholds
112
- starts = [i for i, v in enumerate(scores["start"]) if v >= th_s]
113
- ends = [i for i, v in enumerate(scores["end"]) if v >= th_e]
114
-
115
- mwes = []
116
- for s in starts:
117
- for e in ends:
118
- if e <= s or (e - s + 1) > 13:
119
- continue
120
- members = {s, e}
121
- for t in range(s + 1, e):
122
- if scores["inside"][t] >= th_i:
123
- members.add(t)
124
- if 2 <= len(members) <= 6:
125
- mwe = " ".join(words[i] for i in sorted(members))
126
- mwes.append(mwe)
127
-
128
- return list(set(mwes))
129
-
130
- # Example
131
- text = "I'm looking forward to the meeting."
132
- print(detect_mwes(text)) # ['looking forward']
133
- ```
134
 
135
- ## Requirements
 
136
 
137
- ```
138
- torch>=2.0.0
139
- transformers>=4.30.0
140
- safetensors>=0.3.0
141
- spacy>=3.5.0
142
- ```
143
 
144
- ```bash
145
- python -m spacy download en_core_web_lg
 
146
  ```
147
 
 
 
 
 
 
 
 
148
  ## Training
149
 
150
  Trained on [CoAM](https://huggingface.co/datasets/yusuke196/CoAM) with:
151
  - Encoder: DeBERTa-v3-large
152
  - Linguistic features: NP chunking
153
  - Data augmentation: 30% oversampling
154
- - Thresholds: τ_start=0.5, τ_end=0.6, τ_inside=0.2
155
 
156
  Code: [github.com/DiegoRossini/binary-mwe-detection](https://github.com/DiegoRossini/binary-mwe-detection)
157
 
@@ -164,4 +80,4 @@ Code: [github.com/DiegoRossini/binary-mwe-detection](https://github.com/DiegoRos
164
  booktitle = "Findings of EACL 2026",
165
  year = "2026"
166
  }
167
- ```
 
30
  ## Usage
31
 
32
  ```python
33
+ from transformers import AutoModel, AutoConfig, AutoTokenizer
34
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  # Load model
37
+ config = AutoConfig.from_pretrained("DiegoRossini/mwe-detection-deberta", trust_remote_code=True)
38
+ model = AutoModel.from_pretrained("DiegoRossini/mwe-detection-deberta", trust_remote_code=True)
39
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
40
+
41
  model.eval()
42
 
43
+ # Example inference
44
+ text = ["I", "'m", "looking", "forward", "to", "the", "meeting", "."]
45
+ enc = tokenizer(text, is_split_into_words=True, return_tensors="pt")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ # chunk_feat: 0=O, 1=NP (noun phrase) - use spaCy to get these
48
+ chunk_feat = torch.zeros(enc["input_ids"].shape, dtype=torch.long)
49
 
50
+ with torch.no_grad():
51
+ out = model(enc["input_ids"], enc["attention_mask"], chunk_feat)
 
 
 
 
52
 
53
+ print("Start scores:", out["start"])
54
+ print("End scores:", out["end"])
55
+ print("Inside scores:", out["inside"])
56
  ```
57
 
58
+ ## Thresholds
59
+
60
+ Use these thresholds to reconstruct MWEs from scores:
61
+ - `start`: 0.5
62
+ - `end`: 0.6
63
+ - `inside`: 0.2
64
+
65
  ## Training
66
 
67
  Trained on [CoAM](https://huggingface.co/datasets/yusuke196/CoAM) with:
68
  - Encoder: DeBERTa-v3-large
69
  - Linguistic features: NP chunking
70
  - Data augmentation: 30% oversampling
 
71
 
72
  Code: [github.com/DiegoRossini/binary-mwe-detection](https://github.com/DiegoRossini/binary-mwe-detection)
73
 
 
80
  booktitle = "Findings of EACL 2026",
81
  year = "2026"
82
  }
83
+ ```
config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "mwe-deberta",
3
+ "auto_map": {
4
+ "AutoConfig": "configuration_mwe.MWEConfig",
5
+ "AutoModel": "modeling_mwe.MWEModel"
6
+ },
7
+ "base_model_name": "microsoft/deberta-v3-large",
8
+ "hidden_size": 1024,
9
+ "dropout": 0.3,
10
+ "chunk_vocab_size": 2,
11
+ "chunk_embedding_dim": 16,
12
+ "architectures": ["MWEModel"]
13
+ }
configuration_mwe.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
+
4
+ class MWEConfig(PretrainedConfig):
5
+ model_type = "mwe-deberta"
6
+
7
+ def __init__(
8
+ self,
9
+ base_model_name: str = "microsoft/deberta-v3-large",
10
+ hidden_size: int = 1024,
11
+ dropout: float = 0.3,
12
+ chunk_vocab_size: int = 2,
13
+ chunk_embedding_dim: int = 16,
14
+ **kwargs
15
+ ):
16
+ super().__init__(**kwargs)
17
+ self.base_model_name = base_model_name
18
+ self.hidden_size = hidden_size
19
+ self.dropout = dropout
20
+ self.chunk_vocab_size = chunk_vocab_size
21
+ self.chunk_embedding_dim = chunk_embedding_dim
modeling_mwe.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+ from transformers import PreTrainedModel, AutoModel
4
+ from .configuration_mwe import MWEConfig
5
+
6
+
7
+ class MWEModel(PreTrainedModel):
8
+ config_class = MWEConfig
9
+
10
+ def __init__(self, config: MWEConfig):
11
+ super().__init__(config)
12
+ self.encoder = AutoModel.from_pretrained(config.base_model_name)
13
+ h = config.hidden_size
14
+
15
+ self.drop = nn.Dropout(config.dropout)
16
+ self.layer_norm = nn.LayerNorm(h)
17
+ self.chunk_emb = nn.Embedding(config.chunk_vocab_size, config.chunk_embedding_dim)
18
+
19
+ self.fc = nn.Linear(h, h // 2)
20
+ self.head_start = nn.Linear(h // 2 + config.chunk_embedding_dim, 1)
21
+ self.head_end = nn.Linear(h // 2 + config.chunk_embedding_dim, 1)
22
+ self.head_inside = nn.Linear(h // 2 + config.chunk_embedding_dim, 1)
23
+
24
+ self.post_init()
25
+
26
+ def forward(self, input_ids, attention_mask, chunk_feat):
27
+ out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
28
+ x = out.last_hidden_state
29
+
30
+ x = self.layer_norm(x)
31
+ x = self.drop(x)
32
+
33
+ h = torch.relu(self.fc(x))
34
+ h = self.drop(h)
35
+
36
+ chunk_emb = self.chunk_emb(chunk_feat)
37
+ x_cat = torch.cat([h, chunk_emb], dim=-1)
38
+
39
+ start = torch.sigmoid(self.head_start(x_cat)).squeeze(-1)
40
+ end = torch.sigmoid(self.head_end(x_cat)).squeeze(-1)
41
+ inside = torch.sigmoid(self.head_inside(x_cat)).squeeze(-1)
42
+
43
+ return {"start": start, "end": end, "inside": inside}