Ashlee Kupor committed on
Commit
1690680
·
1 Parent(s): a2b60e7

Split up long utterances

Browse files
Files changed (1) hide show
  1. handler.py +32 -0
handler.py CHANGED
@@ -4,6 +4,11 @@ import pandas as pd
4
  import webvtt
5
  from datetime import datetime
6
  import torch
 
 
 
 
 
7
 
8
  class Utterance(object):
9
 
@@ -27,8 +32,35 @@ class EndpointHandler():
27
 
28
  def utterance_to_str(self, utterance: Utterance) -> str:
29
  # eliciting only uses text
 
 
 
30
  return utterance.text
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def convert_time(self, time_str):
33
  time = datetime.strptime(time_str, "%H:%M:%S.%f")
34
  return 1000 * (3600 * time.hour + 60 * time.minute + time.second) + time.microsecond / 1000
 
4
  import webvtt
5
  from datetime import datetime
6
  import torch
7
+ import spacy
8
+
9
+ nlp = spacy.load("en_core_web_sm")
10
+ tokenizer = nlp.tokenizer
11
+ token_limit = 200
12
 
13
  class Utterance(object):
14
 
 
32
 
33
  def utterance_to_str(self, utterance: Utterance) -> str:
34
  # eliciting only uses text
35
+ doc = nlp(utterance.text)
36
+ if len(doc) > token_limit:
37
+ return self.handle_long_utterances(doc)
38
  return utterance.text
39
 
40
def handle_long_utterances(self, doc: "spacy.tokens.Doc", max_tokens=None) -> List[str]:
    """Split a parsed document into segments made of whole sentences.

    Sentences from ``doc.sents`` are accumulated in order. A segment is
    closed as soon as its running token count reaches ``max_tokens``, so
    a segment can exceed the budget by up to one sentence; sentences are
    never cut in the middle.

    Args:
        doc: Parsed document exposing ``.sents``; each sentence must
            provide ``.text`` and ``len()`` (its token count).
        max_tokens: Token budget per segment. Defaults to the
            module-level ``token_limit`` when omitted.

    Returns:
        The segments in document order, each being its sentences joined
        by single spaces (no leading space). Empty list if ``doc`` has no
        sentences.
    """
    if max_tokens is None:
        max_tokens = token_limit

    utterances = []
    pending = []          # sentence texts of the segment being built
    pending_tokens = 0    # token count of the segment being built

    for sent in doc.sents:
        pending.append(sent.text)
        pending_tokens += len(sent)
        if pending_tokens >= max_tokens:
            # Budget reached: close this segment and start a new one.
            utterances.append(' '.join(pending))
            pending = []
            pending_tokens = 0

    # Flush whatever is left after the last sentence; this replaces the
    # need to count sentences up front to detect the final one.
    if pending:
        utterances.append(' '.join(pending))

    return utterances
62
+
63
+
64
  def convert_time(self, time_str):
65
  time = datetime.strptime(time_str, "%H:%M:%S.%f")
66
  return 1000 * (3600 * time.hour + 60 * time.minute + time.second) + time.microsecond / 1000