fortvivlan committed on
Commit
ea60368
·
verified ·
1 Parent(s): 85172bf

Model save

Browse files
README.md CHANGED
@@ -21,13 +21,13 @@ model-index:
21
  split: validation
22
  metrics:
23
  - type: f1
24
- value: 0.299975408003577
25
  name: Null F1
26
  - type: accuracy
27
- value: 0.7009560732003811
28
  name: Ud Jaccard
29
  - type: accuracy
30
- value: 0.7846158890455425
31
  name: Eud Jaccard
32
  ---
33
 
 
21
  split: validation
22
  metrics:
23
  - type: f1
24
+ value: 0.2499754084433992
25
  name: Null F1
26
  - type: accuracy
27
+ value: 0.32062444472648816
28
  name: Ud Jaccard
29
  - type: accuracy
30
+ value: 0.7903051003317022
31
  name: Eud Jaccard
32
  ---
33
 
dependency_classifier.py CHANGED
@@ -38,19 +38,21 @@ class DependencyHeadBase(nn.Module):
38
 
39
  def forward(
40
  self,
41
- h_arc_head: Tensor, # [batch_size, seq_len, hidden_size]
42
- h_arc_dep: Tensor, # ...
43
- h_rel_head: Tensor, # ...
44
- h_rel_dep: Tensor, # ...
45
- gold_arcs: LongTensor, # [batch_size, seq_len, seq_len]
46
- mask: BoolTensor # [batch_size, seq_len]
 
47
  ) -> dict[str, Tensor]:
48
-
49
  # Score arcs.
50
- # s_arc[:, i, j] = score of edge j -> i.
51
  s_arc = self.arc_attention(h_arc_head, h_arc_dep)
52
  # Mask undesirable values (padding, nulls, etc.) with -inf.
53
- replace_masked_values(s_arc, pairwise_mask(mask), replace_with=-1e8)
 
54
  # Score arcs' relations.
55
  # [batch_size, seq_len, seq_len, num_labels]
56
  s_rel = self.rel_attention(h_rel_head, h_rel_dep).permute(0, 2, 3, 1)
@@ -63,11 +65,11 @@ class DependencyHeadBase(nn.Module):
63
 
64
  # Predict arcs based on the scores.
65
  # [batch_size, seq_len, seq_len]
66
- pred_arcs_3d = self.predict_arcs(s_arc, mask)
67
  # [batch_size, seq_len, seq_len]
68
- pred_rels_3d = self.predict_rels(s_rel)
69
  # [n_pred_arcs, 4]
70
- preds_combined = self.combine_arcs_rels(pred_arcs_3d, pred_rels_3d)
71
  return {
72
  'preds': preds_combined,
73
  'loss': loss
@@ -91,8 +93,9 @@ class DependencyHeadBase(nn.Module):
91
 
92
  def predict_arcs(
93
  self,
94
- s_arc: Tensor, # [batch_size, seq_len, seq_len]
95
- mask: BoolTensor # [batch_size, seq_len]
 
96
  ) -> LongTensor:
97
  """Predict arcs from scores."""
98
  raise NotImplementedError
@@ -127,42 +130,40 @@ class DependencyHead(DependencyHeadBase):
127
  @override
128
  def predict_arcs(
129
  self,
130
- s_arc: Tensor, # [batch_size, seq_len, seq_len]
131
- mask: BoolTensor # [batch_size, seq_len]
 
132
  ) -> Tensor:
133
 
134
  if self.training:
135
  # During training, use fast greedy decoding.
136
  # - [batch_size, seq_len]
137
- pred_arcs_seq = s_arc.argmax(dim=-1)
138
  else:
139
- # During inference, diligently decode Maximum Spanning Tree.
140
- pred_arcs_seq = self._mst_decode(s_arc, mask)
141
- # FIXME
142
- # pred_arcs_seq = s_arc.argmax(dim=-1)
143
 
144
  # Upscale arcs sequence of shape [batch_size, seq_len]
145
  # to matrix of shape [batch_size, seq_len, seq_len].
146
- pred_arcs = F.one_hot(pred_arcs_seq, num_classes=pred_arcs_seq.size(1)).long()
 
 
 
 
147
  return pred_arcs
148
 
149
  def _mst_decode(
150
  self,
151
- s_arc: Tensor, # [batch_size, seq_len, seq_len]
152
- mask: Tensor # [batch_size, seq_len]
153
  ) -> tuple[Tensor, Tensor]:
154
-
155
  batch_size = s_arc.size(0)
156
  device = s_arc.device
157
  s_arc = s_arc.cpu()
158
 
159
  # Convert scores to probabilities, as `decode_mst` expects non-negative values.
160
- arc_probs = nn.functional.softmax(s_arc, dim=-1)
161
- # Transpose arcs, because decode_mst defines 'energy' matrix as
162
- # energy[i,j] = "Score that `i` is the head of `j`",
163
- # whereas
164
- # arc_probs[i,j] = "Probability that `j` is the head of `i`".
165
- arc_probs = arc_probs.transpose(1, 2)
166
 
167
  # `decode_mst` knows nothing about UD and ROOT, so we have to manually
168
  # zero probabilities of arcs leading to ROOT to make sure ROOT is a source node
@@ -177,11 +178,10 @@ class DependencyHead(DependencyHeadBase):
177
  pred_arcs = []
178
  for sample_idx in range(batch_size):
179
  energy = arc_probs[sample_idx]
180
- # has_labels=False because we will decode them manually later.
181
- lengths = mask[sample_idx].sum()
182
- heads, _ = decode_mst(energy, lengths, has_labels=False)
183
  # Some nodes may be isolated. Pick heads greedily in this case.
184
- heads[heads <= 0] = s_arc[sample_idx].argmax(dim=-1)[heads <= 0]
185
  pred_arcs.append(heads)
186
 
187
  # shape: [batch_size, seq_len]
@@ -195,7 +195,7 @@ class DependencyHead(DependencyHeadBase):
195
  gold_arcs: LongTensor # [n_arcs, 4]
196
  ) -> tuple[Tensor, Tensor]:
197
  batch_idxs, from_idxs, to_idxs, _ = gold_arcs.T
198
- return F.cross_entropy(s_arc[batch_idxs, from_idxs], to_idxs)
199
 
200
 
201
  class MultiDependencyHead(DependencyHeadBase):
@@ -206,8 +206,9 @@ class MultiDependencyHead(DependencyHeadBase):
206
  @override
207
  def predict_arcs(
208
  self,
209
- s_arc: Tensor, # [batch_size, seq_len, seq_len]
210
- mask: BoolTensor # [batch_size, seq_len]
 
211
  ) -> Tensor:
212
  # Convert scores to probabilities.
213
  arc_probs = torch.sigmoid(s_arc)
@@ -263,8 +264,8 @@ class DependencyClassifier(nn.Module):
263
  embeddings: Tensor, # [batch_size, seq_len, embedding_size]
264
  gold_ud: Tensor, # [n_ud_arcs, 4]
265
  gold_eud: Tensor, # [n_eud_arcs, 4]
266
- mask_ud: Tensor, # [batch_size, seq_len]
267
- mask_eud: Tensor # [batch_size, seq_len]
268
  ) -> dict[str, Tensor]:
269
 
270
  # - [batch_size, seq_len, hidden_size]
@@ -280,7 +281,8 @@ class DependencyClassifier(nn.Module):
280
  h_rel_head,
281
  h_rel_dep,
282
  gold_arcs=gold_ud,
283
- mask=mask_ud
 
284
  )
285
  output_eud = self.dependency_head_eud(
286
  h_arc_head,
@@ -288,7 +290,9 @@ class DependencyClassifier(nn.Module):
288
  h_rel_head,
289
  h_rel_dep,
290
  gold_arcs=gold_eud,
291
- mask=mask_eud
 
 
292
  )
293
 
294
  return {
 
38
 
39
  def forward(
40
  self,
41
+ h_arc_head: Tensor, # [batch_size, seq_len, hidden_size]
42
+ h_arc_dep: Tensor, # ...
43
+ h_rel_head: Tensor, # ...
44
+ h_rel_dep: Tensor, # ...
45
+ gold_arcs: LongTensor, # [batch_size, seq_len, seq_len]
46
+ null_mask: BoolTensor, # [batch_size, seq_len]
47
+ padding_mask: BoolTensor # [batch_size, seq_len]
48
  ) -> dict[str, Tensor]:
49
+
50
  # Score arcs.
51
+ # s_arc[:, i, j] = score of edge i -> j.
52
  s_arc = self.arc_attention(h_arc_head, h_arc_dep)
53
  # Mask undesirable values (padding, nulls, etc.) with -inf.
54
+ mask2d = pairwise_mask(null_mask & padding_mask)
55
+ replace_masked_values(s_arc, mask2d, replace_with=-1e8)
56
  # Score arcs' relations.
57
  # [batch_size, seq_len, seq_len, num_labels]
58
  s_rel = self.rel_attention(h_rel_head, h_rel_dep).permute(0, 2, 3, 1)
 
65
 
66
  # Predict arcs based on the scores.
67
  # [batch_size, seq_len, seq_len]
68
+ pred_arcs_matrix = self.predict_arcs(s_arc, null_mask, padding_mask)
69
  # [batch_size, seq_len, seq_len]
70
+ pred_rels_matrix = self.predict_rels(s_rel)
71
  # [n_pred_arcs, 4]
72
+ preds_combined = self.combine_arcs_rels(pred_arcs_matrix, pred_rels_matrix)
73
  return {
74
  'preds': preds_combined,
75
  'loss': loss
 
93
 
94
  def predict_arcs(
95
  self,
96
+ s_arc: Tensor, # [batch_size, seq_len, seq_len]
97
+ null_mask: BoolTensor, # [batch_size, seq_len]
98
+ padding_mask: BoolTensor # [batch_size, seq_len]
99
  ) -> LongTensor:
100
  """Predict arcs from scores."""
101
  raise NotImplementedError
 
130
  @override
131
  def predict_arcs(
132
  self,
133
+ s_arc: Tensor, # [batch_size, seq_len, seq_len]
134
+ null_mask: BoolTensor, # [batch_size, seq_len]
135
+ padding_mask: BoolTensor # [batch_size, seq_len, seq_len]
136
  ) -> Tensor:
137
 
138
  if self.training:
139
  # During training, use fast greedy decoding.
140
  # - [batch_size, seq_len]
141
+ pred_arcs_seq = s_arc.argmax(dim=1)
142
  else:
143
+ # During inference, decode Maximum Spanning Tree.
144
+ pred_arcs_seq = self._mst_decode(s_arc, padding_mask)
 
 
145
 
146
  # Upscale arcs sequence of shape [batch_size, seq_len]
147
  # to matrix of shape [batch_size, seq_len, seq_len].
148
+ pred_arcs = F.one_hot(pred_arcs_seq, num_classes=pred_arcs_seq.size(1)).long().transpose(1, 2)
149
+ # Apply mask one more time (even though s_arc is already masked),
150
+ # because argmax erases information about masked values.
151
+ mask2d = pairwise_mask(null_mask & padding_mask)
152
+ replace_masked_values(pred_arcs, mask2d, replace_with=0)
153
  return pred_arcs
154
 
155
  def _mst_decode(
156
  self,
157
+ s_arc: Tensor, # [batch_size, seq_len, seq_len]
158
+ padding_mask: Tensor
159
  ) -> tuple[Tensor, Tensor]:
160
+
161
  batch_size = s_arc.size(0)
162
  device = s_arc.device
163
  s_arc = s_arc.cpu()
164
 
165
  # Convert scores to probabilities, as `decode_mst` expects non-negative values.
166
+ arc_probs = nn.functional.softmax(s_arc, dim=1)
 
 
 
 
 
167
 
168
  # `decode_mst` knows nothing about UD and ROOT, so we have to manually
169
  # zero probabilities of arcs leading to ROOT to make sure ROOT is a source node
 
178
  pred_arcs = []
179
  for sample_idx in range(batch_size):
180
  energy = arc_probs[sample_idx]
181
+ length = padding_mask[sample_idx].sum()
182
+ heads = decode_mst(energy, length)
 
183
  # Some nodes may be isolated. Pick heads greedily in this case.
184
+ heads[heads <= 0] = s_arc[sample_idx].argmax(dim=1)[heads <= 0]
185
  pred_arcs.append(heads)
186
 
187
  # shape: [batch_size, seq_len]
 
195
  gold_arcs: LongTensor # [n_arcs, 4]
196
  ) -> tuple[Tensor, Tensor]:
197
  batch_idxs, from_idxs, to_idxs, _ = gold_arcs.T
198
+ return F.cross_entropy(s_arc[batch_idxs, :, to_idxs], from_idxs)
199
 
200
 
201
  class MultiDependencyHead(DependencyHeadBase):
 
206
  @override
207
  def predict_arcs(
208
  self,
209
+ s_arc: Tensor, # [batch_size, seq_len, seq_len]
210
+ null_mask: BoolTensor, # [batch_size, seq_len]
211
+ padding_mask: BoolTensor # [batch_size, seq_len]
212
  ) -> Tensor:
213
  # Convert scores to probabilities.
214
  arc_probs = torch.sigmoid(s_arc)
 
264
  embeddings: Tensor, # [batch_size, seq_len, embedding_size]
265
  gold_ud: Tensor, # [n_ud_arcs, 4]
266
  gold_eud: Tensor, # [n_eud_arcs, 4]
267
+ null_mask: Tensor, # [batch_size, seq_len]
268
+ padding_mask: Tensor # [batch_size, seq_len]
269
  ) -> dict[str, Tensor]:
270
 
271
  # - [batch_size, seq_len, hidden_size]
 
281
  h_rel_head,
282
  h_rel_dep,
283
  gold_arcs=gold_ud,
284
+ null_mask=null_mask,
285
+ padding_mask=padding_mask
286
  )
287
  output_eud = self.dependency_head_eud(
288
  h_arc_head,
 
290
  h_rel_head,
291
  h_rel_dep,
292
  gold_arcs=gold_eud,
293
+ # Ignore null mask in E-UD
294
+ null_mask=torch.ones_like(padding_mask),
295
+ padding_mask=padding_mask
296
  )
297
 
298
  return {
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d38a0bcc207933837c2e16a3507bb9a9e65fe9360e01a50c48bf256e0bc9a551
3
  size 1147244460
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c0c2327dbffac624222d10865069c3b63b26c65dd0c034a5d86210d080c8dc47
3
  size 1147244460
modeling_parser.py CHANGED
@@ -1,8 +1,6 @@
1
  from torch import nn
2
  from torch import LongTensor
3
  from transformers import PreTrainedModel
4
- from transformers.modeling_outputs import ModelOutput
5
- from dataclasses import dataclass
6
 
7
  from .configuration import CobaldParserConfig
8
  from .encoder import WordTransformerEncoder
@@ -17,23 +15,6 @@ from .utils import (
17
  )
18
 
19
 
20
- @dataclass
21
- class CobaldParserOutput(ModelOutput):
22
- """
23
- Output type for CobaldParser.
24
- """
25
- loss: float = None
26
- words: list = None
27
- counting_mask: LongTensor = None
28
- lemma_rules: LongTensor = None
29
- joint_feats: LongTensor = None
30
- deps_ud: LongTensor = None
31
- deps_eud: LongTensor = None
32
- miscs: LongTensor = None
33
- deepslots: LongTensor = None
34
- semclasses: LongTensor = None
35
-
36
-
37
  class CobaldParser(PreTrainedModel):
38
  """Morpho-Syntax-Semantic Parser."""
39
 
@@ -119,8 +100,8 @@ class CobaldParser(PreTrainedModel):
119
  sent_ids: list[str] = None,
120
  texts: list[str] = None,
121
  inference_mode: bool = False
122
- ) -> CobaldParserOutput:
123
- result = {}
124
 
125
  # Extra [CLS] token accounts for the case when #NULL is the first token in a sentence.
126
  words_with_cls = prepend_cls(words)
@@ -129,62 +110,62 @@ class CobaldParser(PreTrainedModel):
129
  embeddings_without_nulls = self.encoder(words_without_nulls)
130
  # Predict nulls.
131
  null_output = self.classifiers["null"](embeddings_without_nulls, counting_masks)
132
- result["counting_mask"] = null_output['preds']
133
- result["loss"] = null_output["loss"]
134
 
135
  # "Teacher forcing": during training, pass the original words (with gold nulls)
136
  # to the classification heads, so that they are trained upon correct sentences.
137
  if inference_mode:
138
  # Restore predicted nulls in the original sentences.
139
- result["words"] = add_nulls(words, null_output["preds"])
140
  else:
141
- result["words"] = words
142
 
143
  # Encode words with nulls.
144
  # [batch_size, seq_len, embedding_size]
145
- embeddings = self.encoder(result["words"])
146
 
147
  # Predict lemmas and morphological features.
148
  if "lemma_rule" in self.classifiers:
149
  lemma_output = self.classifiers["lemma_rule"](embeddings, lemma_rules)
150
- result["lemma_rules"] = lemma_output['preds']
151
- result["loss"] += lemma_output['loss']
152
 
153
  if "joint_feats" in self.classifiers:
154
  joint_feats_output = self.classifiers["joint_feats"](embeddings, joint_feats)
155
- result["joint_feats"] = joint_feats_output['preds']
156
- result["loss"] += joint_feats_output['loss']
157
 
158
  # Predict syntax.
159
  if "syntax" in self.classifiers:
160
- padding_mask = build_padding_mask(result["words"], self.device)
161
- null_mask = build_null_mask(result["words"], self.device)
162
  deps_output = self.classifiers["syntax"](
163
  embeddings,
164
  deps_ud,
165
  deps_eud,
166
- mask_ud=(padding_mask & ~null_mask),
167
- mask_eud=padding_mask
168
  )
169
- result["deps_ud"] = deps_output['preds_ud']
170
- result["deps_eud"] = deps_output['preds_eud']
171
- result["loss"] += deps_output['loss_ud'] + deps_output['loss_eud']
172
 
173
  # Predict miscellaneous features.
174
  if "misc" in self.classifiers:
175
  misc_output = self.classifiers["misc"](embeddings, miscs)
176
- result["miscs"] = misc_output['preds']
177
- result["loss"] += misc_output['loss']
178
 
179
  # Predict semantics.
180
  if "deepslot" in self.classifiers:
181
  deepslot_output = self.classifiers["deepslot"](embeddings, deepslots)
182
- result["deepslots"] = deepslot_output['preds']
183
- result["loss"] += deepslot_output['loss']
184
 
185
  if "semclass" in self.classifiers:
186
  semclass_output = self.classifiers["semclass"](embeddings, semclasses)
187
- result["semclasses"] = semclass_output['preds']
188
- result["loss"] += semclass_output['loss']
189
 
190
- return CobaldParserOutput(**result)
 
1
  from torch import nn
2
  from torch import LongTensor
3
  from transformers import PreTrainedModel
 
 
4
 
5
  from .configuration import CobaldParserConfig
6
  from .encoder import WordTransformerEncoder
 
15
  )
16
 
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  class CobaldParser(PreTrainedModel):
19
  """Morpho-Syntax-Semantic Parser."""
20
 
 
100
  sent_ids: list[str] = None,
101
  texts: list[str] = None,
102
  inference_mode: bool = False
103
+ ) -> dict:
104
+ output = {}
105
 
106
  # Extra [CLS] token accounts for the case when #NULL is the first token in a sentence.
107
  words_with_cls = prepend_cls(words)
 
110
  embeddings_without_nulls = self.encoder(words_without_nulls)
111
  # Predict nulls.
112
  null_output = self.classifiers["null"](embeddings_without_nulls, counting_masks)
113
+ output["counting_mask"] = null_output['preds']
114
+ output["loss"] = null_output["loss"]
115
 
116
  # "Teacher forcing": during training, pass the original words (with gold nulls)
117
  # to the classification heads, so that they are trained upon correct sentences.
118
  if inference_mode:
119
  # Restore predicted nulls in the original sentences.
120
+ output["words"] = add_nulls(words, null_output["preds"])
121
  else:
122
+ output["words"] = words
123
 
124
  # Encode words with nulls.
125
  # [batch_size, seq_len, embedding_size]
126
+ embeddings = self.encoder(output["words"])
127
 
128
  # Predict lemmas and morphological features.
129
  if "lemma_rule" in self.classifiers:
130
  lemma_output = self.classifiers["lemma_rule"](embeddings, lemma_rules)
131
+ output["lemma_rules"] = lemma_output['preds']
132
+ output["loss"] += lemma_output['loss']
133
 
134
  if "joint_feats" in self.classifiers:
135
  joint_feats_output = self.classifiers["joint_feats"](embeddings, joint_feats)
136
+ output["joint_feats"] = joint_feats_output['preds']
137
+ output["loss"] += joint_feats_output['loss']
138
 
139
  # Predict syntax.
140
  if "syntax" in self.classifiers:
141
+ padding_mask = build_padding_mask(output["words"], self.device)
142
+ null_mask = build_null_mask(output["words"], self.device)
143
  deps_output = self.classifiers["syntax"](
144
  embeddings,
145
  deps_ud,
146
  deps_eud,
147
+ null_mask,
148
+ padding_mask
149
  )
150
+ output["deps_ud"] = deps_output['preds_ud']
151
+ output["deps_eud"] = deps_output['preds_eud']
152
+ output["loss"] += deps_output['loss_ud'] + deps_output['loss_eud']
153
 
154
  # Predict miscellaneous features.
155
  if "misc" in self.classifiers:
156
  misc_output = self.classifiers["misc"](embeddings, miscs)
157
+ output["miscs"] = misc_output['preds']
158
+ output["loss"] += misc_output['loss']
159
 
160
  # Predict semantics.
161
  if "deepslot" in self.classifiers:
162
  deepslot_output = self.classifiers["deepslot"](embeddings, deepslots)
163
+ output["deepslots"] = deepslot_output['preds']
164
+ output["loss"] += deepslot_output['loss']
165
 
166
  if "semclass" in self.classifiers:
167
  semclass_output = self.classifiers["semclass"](embeddings, semclasses)
168
+ output["semclasses"] = semclass_output['preds']
169
+ output["loss"] += semclass_output['loss']
170
 
171
+ return output
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90e68ebab0dce801d995fc17021452bcf2f03e62fb90217462eec51fc660bf27
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e122e18ca0d9c5f65733c55e15f58827e494045ce872cd55b7379b88c8e83ee6
3
  size 5432
utils.py CHANGED
@@ -21,7 +21,7 @@ def build_padding_mask(sentences: list[list[str]], device) -> Tensor:
21
  return _build_condition_mask(sentences, condition_fn=lambda word: True, device=device)
22
 
23
  def build_null_mask(sentences: list[list[str]], device) -> Tensor:
24
- return _build_condition_mask(sentences, condition_fn=lambda word: word == "#NULL", device=device)
25
 
26
 
27
  def pairwise_mask(masks1d: Tensor) -> Tensor:
 
21
  return _build_condition_mask(sentences, condition_fn=lambda word: True, device=device)
22
 
23
  def build_null_mask(sentences: list[list[str]], device) -> Tensor:
24
+ return _build_condition_mask(sentences, condition_fn=lambda word: word != "#NULL", device=device)
25
 
26
 
27
  def pairwise_mask(masks1d: Tensor) -> Tensor: