Upload 4 files

Browse files

Files changed (4) hide show

README.md +23 -107
config.json +13 -0
configuration_mwe.py +21 -0
modeling_mwe.py +43 -0

README.md CHANGED Viewed

@@ -30,128 +30,44 @@ DeBERTa-v3-large fine-tuned for multiword expression identification using binary
 ## Usage
 ```python
 import torch
-import spacy
-from transformers import AutoTokenizer, AutoModel
-from torch import nn
-from safetensors.torch import load_file
-# Model definition (required)
-class StartEndInsideModel(nn.Module):
-    def __init__(self, model_name="microsoft/deberta-v3-large", dropout=0.3):
-        super().__init__()
-        self.encoder = AutoModel.from_pretrained(model_name)
-        h = self.encoder.config.hidden_size
-        self.drop = nn.Dropout(dropout)
-        self.layer_norm = nn.LayerNorm(h)
-        self.chunk_emb = nn.Embedding(2, 16)
-        self.fc = nn.Linear(h, h // 2)
-        self.head_start = nn.Linear(h // 2 + 16, 1)
-        self.head_end = nn.Linear(h // 2 + 16, 1)
-        self.head_inside = nn.Linear(h // 2 + 16, 1)
-    def forward(self, input_ids, attention_mask, chunk_feat):
-        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
-        x = out.last_hidden_state
-        x = self.layer_norm(x)
-        x = self.drop(x)
-        h = torch.relu(self.fc(x))
-        h = self.drop(h)
-        x_cat = torch.cat([h, self.chunk_emb(chunk_feat)], dim=-1)
-        return (torch.sigmoid(self.head_start(x_cat)).squeeze(-1),
-                torch.sigmoid(self.head_end(x_cat)).squeeze(-1),
-                torch.sigmoid(self.head_inside(x_cat)).squeeze(-1))
 # Load model
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = StartEndInsideModel().to(device)
-model.load_state_dict(load_file("model.safetensors", device=device))
 model.eval()
-# Load tokenizer and spaCy
-tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
-nlp = spacy.load("en_core_web_lg")
-# Inference function
-def detect_mwes(text, thresholds=(0.5, 0.6, 0.2)):
-    doc = nlp(text)
-    words = [t.text for t in doc]
-    chunk_tags = ["O"] * len(doc)
-    for chunk in doc.noun_chunks:
-        for i in range(chunk.start, chunk.end):
-            chunk_tags[i] = "NP"
-    enc = tokenizer(words, is_split_into_words=True, truncation=True,
-                    max_length=256, add_special_tokens=True)
-    input_ids = torch.tensor(enc["input_ids"]).unsqueeze(0).to(device)
-    attention_mask = torch.tensor(enc["attention_mask"]).unsqueeze(0).to(device)
-    chunk_map = {"O": 0, "NP": 1}
-    chunk_feat = torch.zeros(len(enc["input_ids"]), dtype=torch.long)
-    for ti, wid in enumerate(enc.word_ids()):
-        if wid is not None:
-            chunk_feat[ti] = chunk_map[chunk_tags[wid]]
-    chunk_feat = chunk_feat.unsqueeze(0).to(device)
-    with torch.no_grad():
-        start_p, end_p, inside_p = model(input_ids, attention_mask, chunk_feat)
-    # Map back to words
-    scores = {"start": [0.0]*len(words), "end": [0.0]*len(words), "inside": [0.0]*len(words)}
-    used = set()
-    for ti, wid in enumerate(enc.word_ids()):
-        if wid is not None and wid not in used:
-            scores["start"][wid] = float(start_p[0, ti])
-            scores["end"][wid] = float(end_p[0, ti])
-            scores["inside"][wid] = float(inside_p[0, ti])
-            used.add(wid)
-    # Reconstruct MWEs
-    th_s, th_e, th_i = thresholds
-    starts = [i for i, v in enumerate(scores["start"]) if v >= th_s]
-    ends = [i for i, v in enumerate(scores["end"]) if v >= th_e]
-    mwes = []
-    for s in starts:
-        for e in ends:
-            if e <= s or (e - s + 1) > 13:
-                continue
-            members = {s, e}
-            for t in range(s + 1, e):
-                if scores["inside"][t] >= th_i:
-                    members.add(t)
-            if 2 <= len(members) <= 6:
-                mwe = " ".join(words[i] for i in sorted(members))
-                mwes.append(mwe)
-    return list(set(mwes))
-# Example
-text = "I'm looking forward to the meeting."
-print(detect_mwes(text))  # ['looking forward']
-```
-## Requirements
-```
-torch>=2.0.0
-transformers>=4.30.0
-safetensors>=0.3.0
-spacy>=3.5.0
-```
-```bash
-python -m spacy download en_core_web_lg
 ```
 ## Training
 Trained on [CoAM](https://huggingface.co/datasets/yusuke196/CoAM) with:
 - Encoder: DeBERTa-v3-large
 - Linguistic features: NP chunking
 - Data augmentation: 30% oversampling
-- Thresholds: τ_start=0.5, τ_end=0.6, τ_inside=0.2
 Code: [github.com/DiegoRossini/binary-mwe-detection](https://github.com/DiegoRossini/binary-mwe-detection)
@@ -164,4 +80,4 @@ Code: [github.com/DiegoRossini/binary-mwe-detection](https://github.com/DiegoRos
     booktitle = "Findings of EACL 2026",
     year = "2026"
 }
-```

 ## Usage
 ```python
+from transformers import AutoModel, AutoConfig, AutoTokenizer
 import torch
 # Load model
+config = AutoConfig.from_pretrained("DiegoRossini/mwe-detection-deberta", trust_remote_code=True)
+model = AutoModel.from_pretrained("DiegoRossini/mwe-detection-deberta", trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
 model.eval()
+# Example inference
+text = ["I", "'m", "looking", "forward", "to", "the", "meeting", "."]
+enc = tokenizer(text, is_split_into_words=True, return_tensors="pt")
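+# chunk_feat: one id per token, 0 = O (outside a noun phrase), 1 = NP (inside a noun phrase).
+# The all-zero tensor below is only a placeholder; derive real tags from spaCy noun chunks and align them to tokens via enc.word_ids().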
+chunk_feat = torch.zeros(enc["input_ids"].shape, dtype=torch.long)
+with torch.no_grad():
+    out = model(enc["input_ids"], enc["attention_mask"], chunk_feat)
+print("Start scores:", out["start"])
+print("End scores:", out["end"])
+print("Inside scores:", out["inside"])
 ```
+## Thresholds
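+The model returns a per-token probability for each of `start`, `end` and `inside`. When reconstructing MWEs, accept a token for a role if its score is at least the threshold below: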
+- `start`: 0.5
+- `end`: 0.6
+- `inside`: 0.2
 ## Training
 Trained on [CoAM](https://huggingface.co/datasets/yusuke196/CoAM) with:
 - Encoder: DeBERTa-v3-large
 - Linguistic features: NP chunking
 - Data augmentation: 30% oversampling
 Code: [github.com/DiegoRossini/binary-mwe-detection](https://github.com/DiegoRossini/binary-mwe-detection)
     booktitle = "Findings of EACL 2026",
     year = "2026"
 }
+```

config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "model_type": "mwe-deberta",
+  "auto_map": {
+    "AutoConfig": "configuration_mwe.MWEConfig",
+    "AutoModel": "modeling_mwe.MWEModel"
+  },
+  "base_model_name": "microsoft/deberta-v3-large",
+  "hidden_size": 1024,
+  "dropout": 0.3,
+  "chunk_vocab_size": 2,
+  "chunk_embedding_dim": 16,
+  "architectures": ["MWEModel"]
+}

configuration_mwe.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from transformers import PretrainedConfig
+class MWEConfig(PretrainedConfig):
+    model_type = "mwe-deberta"
+    def __init__(
+        self,
+        base_model_name: str = "microsoft/deberta-v3-large",
+        hidden_size: int = 1024,
+        dropout: float = 0.3,
+        chunk_vocab_size: int = 2,
+        chunk_embedding_dim: int = 16,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.base_model_name = base_model_name
+        self.hidden_size = hidden_size
+        self.dropout = dropout
+        self.chunk_vocab_size = chunk_vocab_size
+        self.chunk_embedding_dim = chunk_embedding_dim

modeling_mwe.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import torch
+from torch import nn
+from transformers import PreTrainedModel, AutoModel
+from .configuration_mwe import MWEConfig
+class MWEModel(PreTrainedModel):
+    config_class = MWEConfig
+    def __init__(self, config: MWEConfig):
+        super().__init__(config)
+        self.encoder = AutoModel.from_pretrained(config.base_model_name)
+        h = config.hidden_size
+        self.drop = nn.Dropout(config.dropout)
+        self.layer_norm = nn.LayerNorm(h)
+        self.chunk_emb = nn.Embedding(config.chunk_vocab_size, config.chunk_embedding_dim)
+        self.fc = nn.Linear(h, h // 2)
+        self.head_start = nn.Linear(h // 2 + config.chunk_embedding_dim, 1)
+        self.head_end = nn.Linear(h // 2 + config.chunk_embedding_dim, 1)
+        self.head_inside = nn.Linear(h // 2 + config.chunk_embedding_dim, 1)
+        self.post_init()
+    def forward(self, input_ids, attention_mask, chunk_feat):
+        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+        x = out.last_hidden_state
+        x = self.layer_norm(x)
+        x = self.drop(x)
+        h = torch.relu(self.fc(x))
+        h = self.drop(h)
+        chunk_emb = self.chunk_emb(chunk_feat)
+        x_cat = torch.cat([h, chunk_emb], dim=-1)
+        start = torch.sigmoid(self.head_start(x_cat)).squeeze(-1)
+        end = torch.sigmoid(self.head_end(x_cat)).squeeze(-1)
+        inside = torch.sigmoid(self.head_inside(x_cat)).squeeze(-1)
+        return {"start": start, "end": end, "inside": inside}