dchaplinsky
/

punctuation_uk_bert

@@ -10,6 +10,17 @@ class PreTrainedPipeline:
         # This function is only called once, so do all the heavy processing I/O here"""
         self.model = PunctuationCapitalizationModel.from_pretrained("dchaplinsky/punctuation_uk_bert")
     def __call__(self, inputs: str) -> List[Dict[str, Any]]:
         """
         Args:
@@ -32,7 +43,15 @@ class PreTrainedPipeline:
         offset = 0
         for tok, lab in zip(tokens, labels):
             if lab != "OO":
-                res.append({"entity_group": lab, "word": tok, "start": offset, "end": offset + len(tok), "score": 0.99})
             offset += len(tok) + 1

         # This function is only called once, so do all the heavy processing I/O here"""
         self.model = PunctuationCapitalizationModel.from_pretrained("dchaplinsky/punctuation_uk_bert")
+    def apply_label_to_token(self, token: str, label: str) -> str:
+        """Render *token* with the punctuation and casing its label asks for.
+
+        Args:
+            token: A single word of the input text.
+            label: Two-character label. The first character is the
+                punctuation mark to append ("O" means none); the second is
+                "U" when the token's first letter should be upper-cased.
+
+        Returns:
+            The token with its first character upper-cased (if requested)
+            and the punctuation mark appended (if any).
+
+        Raises:
+            ValueError: If ``label`` is not exactly two characters long.
+        """
+        punct, upper = label
+        if upper == "U":
+            # Upper-case only the first character. ``str.title()`` would also
+            # capitalise after apostrophes/hyphens ("п'ять" -> "П'Ять") and
+            # lower-case the rest of the word ("НАТО" -> "Нато").
+            token = token[:1].upper() + token[1:]
+        if punct != "O":
+            token += punct
+        return token
     def __call__(self, inputs: str) -> List[Dict[str, Any]]:
         """
         Args:
         offset = 0
         for tok, lab in zip(tokens, labels):
             if lab != "OO":
+                res.append(
+                    {
+                        "entity_group": lab,
+                        "word": self.apply_label_to_token(tok, lab),
+                        "start": offset,
+                        "end": offset + len(tok),
+                        "score": 0.99,
+                    }
+                )
             offset += len(tok) + 1