aekupor
/

eliciting

@@ -4,6 +4,11 @@ import pandas as pd
 import webvtt
 from datetime import datetime
 import torch
 class Utterance(object):
@@ -27,8 +32,35 @@ class EndpointHandler():
     def utterance_to_str(self, utterance: Utterance) -> str:
         """Return the string form of ``utterance`` used for eliciting.

         Only the ``text`` attribute of the utterance is read; no other
         attribute contributes to the result.
         """
         text = utterance.text
         return text
     def convert_time(self, time_str):
         """Convert an ``HH:MM:SS.fraction`` timestamp string to milliseconds.

         The string is parsed with the ``%H:%M:%S.%f`` format. Hours, minutes
         and seconds are folded into whole seconds and scaled to
         milliseconds; the fractional part is added as ``microsecond / 1000``,
         so the result is always a float.
         """
         parsed = datetime.strptime(time_str, "%H:%M:%S.%f")
         whole_seconds = 3600 * parsed.hour + 60 * parsed.minute + parsed.second
         return 1000 * whole_seconds + parsed.microsecond / 1000

 import webvtt
 from datetime import datetime
 import torch
+import spacy
 # Module-level spaCy pipeline built from the "en_core_web_sm" package.
+nlp = spacy.load("en_core_web_sm")
 # Alias for the pipeline's tokenizer; not referenced by the methods
 # visible in this excerpt.
+tokenizer = nlp.tokenizer
 # Token-count threshold: utterance_to_str hands utterances with more tokens
 # than this to handle_long_utterances, which also uses it as the chunk size.
+token_limit = 200
 class Utterance(object):
     def utterance_to_str(self, utterance: Utterance) -> "Union[str, List[str]]":
         """Return the model input for ``utterance``.

         The utterance text is run through the module-level ``nlp`` pipeline
         to count its tokens.

         Returns:
             ``utterance.text`` unchanged when it has at most ``token_limit``
             tokens; otherwise the list of sentence-aligned chunks produced
             by ``handle_long_utterances``. Callers must therefore accept
             either a single string or a list of strings.
         """
         # eliciting only uses text
         doc = nlp(utterance.text)
         if len(doc) > token_limit:
             # Too long for a single input: split on sentence boundaries.
             return self.handle_long_utterances(doc)
         return utterance.text
+    def handle_long_utterances(self, doc: str) -> List[str]:
+        split_count = 1
+        total_sent = len([x for x in doc.sents])
+        sent_count = 0
+        token_count = 0
+        split_utterance = ''
+        utterances = []
+        for sent in doc.sents:
+            # add a sentence to split
+            split_utterance = split_utterance + ' ' + sent.text
+            token_count += len(sent)
+            sent_count +=1
+            if token_count >= token_limit or sent_count == total_sent:
+                # save utterance segment
+                utterances.append(split_utterance)
+                # restart count
+                split_utterance = ''
+                token_count = 0
+                split_count += 1
+        return utterances
     def convert_time(self, time_str):
         """Return the millisecond offset encoded by ``time_str``.

         ``time_str`` is parsed with the ``%H:%M:%S.%f`` format. The hour,
         minute and second fields are combined into whole seconds and
         multiplied by 1000; the sub-second part is then added as
         ``microsecond / 1000``, which makes the result a float.
         """
         stamp = datetime.strptime(time_str, "%H:%M:%S.%f")
         total_seconds = 3600 * stamp.hour + 60 * stamp.minute + stamp.second
         return 1000 * total_seconds + stamp.microsecond / 1000