Ashlee Kupor committed on
Commit ·
1690680
1
Parent(s): a2b60e7
Split up long utterances
Browse files- handler.py +32 -0
handler.py
CHANGED
|
@@ -4,6 +4,11 @@ import pandas as pd
|
|
| 4 |
import webvtt
|
| 5 |
from datetime import datetime
|
| 6 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
class Utterance(object):
|
| 9 |
|
|
@@ -27,8 +32,35 @@ class EndpointHandler():
|
|
| 27 |
|
| 28 |
def utterance_to_str(self, utterance: Utterance) -> str:
    """Return the string form of *utterance* that is fed to the model.

    Eliciting only looks at the spoken text, so no other attribute of the
    utterance is included.
    """
    text = utterance.text
    return text
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
def convert_time(self, time_str):
    """Convert an ``HH:MM:SS.fff`` timestamp string to milliseconds.

    The string is parsed with the ``%H:%M:%S.%f`` format; the result is a
    float because the sub-second part is obtained by true division.
    """
    parsed = datetime.strptime(time_str, "%H:%M:%S.%f")
    whole_seconds = parsed.hour * 3600 + parsed.minute * 60 + parsed.second
    return whole_seconds * 1000 + parsed.microsecond / 1000
|
|
|
|
| 4 |
import webvtt
|
| 5 |
from datetime import datetime
|
| 6 |
import torch
|
| 7 |
+
import spacy
|
| 8 |
+
|
| 9 |
+
# spaCy pipeline, loaded once when the module is imported.
nlp = spacy.load("en_core_web_sm")
|
| 10 |
+
# Tokenizer component of the pipeline above.
# NOTE(review): not referenced in the visible part of this change - confirm it is used elsewhere.
tokenizer = nlp.tokenizer
|
| 11 |
+
# Threshold, in spaCy tokens, above which utterance_to_str splits an utterance
# and at which handle_long_utterances closes a segment.
token_limit = 200
|
| 12 |
|
| 13 |
class Utterance(object):
|
| 14 |
|
|
|
|
| 32 |
|
| 33 |
def utterance_to_str(self, utterance: Utterance) -> str:
    """Return the model input for *utterance*.

    Eliciting only uses the utterance text. The text is parsed with the
    module-level spaCy pipeline; when it has at most ``token_limit`` tokens
    the raw text is returned unchanged, otherwise the parsed document is
    handed to ``handle_long_utterances`` and its result is returned.

    NOTE(review): the long-utterance branch returns a list of strings,
    which does not match the ``str`` return annotation - confirm callers
    handle both shapes.
    """
    parsed = nlp(utterance.text)
    # Short enough: pass the original text through untouched.
    if len(parsed) <= token_limit:
        return utterance.text
    return self.handle_long_utterances(parsed)
|
| 39 |
|
| 40 |
+
def handle_long_utterances(self, doc: "spacy.tokens.Doc", max_tokens=None) -> List[str]:
    """Split a parsed utterance into sentence-aligned text segments.

    Sentences are appended to the current segment one at a time; as soon as
    the segment's token count reaches the limit the segment is closed and a
    new one is started. Because the check happens after a sentence has been
    added, a segment can exceed the limit by up to one sentence, and a
    single sentence longer than the limit is emitted whole.

    Args:
        doc: Parsed document exposing ``.sents``; each sentence must provide
            ``.text`` and ``len()`` (its token count).
        max_tokens: Token count at which a segment is closed. Defaults to
            the module-level ``token_limit`` when omitted.

    Returns:
        The segments in document order, sentences joined by single spaces
        with no leading or trailing padding. Empty when ``doc`` has no
        sentences.
    """
    if max_tokens is None:
        max_tokens = token_limit

    utterances = []
    pending = []  # sentence texts of the segment currently being built
    token_count = 0
    for sent in doc.sents:
        pending.append(sent.text)
        token_count += len(sent)
        if token_count >= max_tokens:
            # " ".join avoids the stray leading space that prefix
            # concatenation would put at the start of every segment.
            utterances.append(" ".join(pending))
            pending = []
            token_count = 0

    # Flush the trailing sentences that never reached the limit.
    if pending:
        utterances.append(" ".join(pending))
    return utterances
|
| 62 |
+
|
| 63 |
+
|
| 64 |
def convert_time(self, time_str):
    """Convert an ``HH:MM:SS.fff`` timestamp string to milliseconds.

    The string is parsed with the ``%H:%M:%S.%f`` format; the result is a
    float because the sub-second part is obtained by true division.
    """
    parsed = datetime.strptime(time_str, "%H:%M:%S.%f")
    whole_seconds = parsed.hour * 3600 + parsed.minute * 60 + parsed.second
    return whole_seconds * 1000 + parsed.microsecond / 1000
|