reverting
Browse files- handler.py +20 -139
handler.py
CHANGED
|
@@ -3,9 +3,6 @@ from scipy.special import softmax
|
|
| 3 |
import numpy as np
|
| 4 |
import weakref
|
| 5 |
import re
|
| 6 |
-
import nltk
|
| 7 |
-
from nltk.corpus import stopwords
|
| 8 |
-
nltk.download('stopwords')
|
| 9 |
|
| 10 |
from utils import clean_str, clean_str_nopunct
|
| 11 |
import torch
|
|
@@ -13,7 +10,7 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES
|
|
| 13 |
|
| 14 |
import transformers
|
| 15 |
from transformers import BertTokenizer, BertForSequenceClassification
|
| 16 |
-
|
| 17 |
|
| 18 |
transformers.logging.set_verbosity_debug()
|
| 19 |
|
|
@@ -33,15 +30,9 @@ class Utterance:
|
|
| 33 |
self.endtime = endtime
|
| 34 |
self.transcript = weakref.ref(transcript) if transcript else None
|
| 35 |
self.props = kwargs
|
| 36 |
-
self.role = None
|
| 37 |
-
self.word_count = self.get_num_words()
|
| 38 |
-
self.timestamp = [starttime, endtime]
|
| 39 |
-
self.unit_measure = None
|
| 40 |
-
self.aggregate_unit_measure = endtime
|
| 41 |
self.num_math_terms = None
|
| 42 |
self.math_terms = None
|
| 43 |
|
| 44 |
-
# moments
|
| 45 |
self.uptake = None
|
| 46 |
self.reasoning = None
|
| 47 |
self.question = None
|
|
@@ -71,20 +62,6 @@ class Utterance:
|
|
| 71 |
**self.props
|
| 72 |
}
|
| 73 |
|
| 74 |
-
def to_talk_timeline_dict(self):
|
| 75 |
-
return{
|
| 76 |
-
'speaker': self.speaker,
|
| 77 |
-
'text': self.text,
|
| 78 |
-
'role': self.role,
|
| 79 |
-
'timestamp': self.timestamp,
|
| 80 |
-
'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
|
| 81 |
-
'unitMeasure': self.unit_measure,
|
| 82 |
-
'aggregateUnitMeasure': self.aggregate_unit_measure,
|
| 83 |
-
'wordCount': self.word_count,
|
| 84 |
-
'numMathTerms': self.num_math_terms,
|
| 85 |
-
'mathTerms': self.math_terms
|
| 86 |
-
}
|
| 87 |
-
|
| 88 |
def __repr__(self):
|
| 89 |
return f"Utterance(speaker='{self.speaker}'," \
|
| 90 |
f"text='{self.text}', uid={self.uid}," \
|
|
@@ -114,86 +91,6 @@ class Transcript:
|
|
| 114 |
def length(self):
|
| 115 |
return len(self.utterances)
|
| 116 |
|
| 117 |
-
def update_utterance_roles(self, uptake_speaker):
|
| 118 |
-
for utt in self.utterances:
|
| 119 |
-
if (utt.speaker == uptake_speaker):
|
| 120 |
-
utt.role = 'teacher'
|
| 121 |
-
else:
|
| 122 |
-
utt.role = 'student'
|
| 123 |
-
|
| 124 |
-
def get_talk_distribution_and_length(self, uptake_speaker):
|
| 125 |
-
if ((uptake_speaker is None)):
|
| 126 |
-
return None
|
| 127 |
-
teacher_words = 0
|
| 128 |
-
teacher_utt_count = 0
|
| 129 |
-
student_words = 0
|
| 130 |
-
student_utt_count = 0
|
| 131 |
-
for utt in self.utterances:
|
| 132 |
-
if (utt.speaker == uptake_speaker):
|
| 133 |
-
utt.role = 'teacher'
|
| 134 |
-
teacher_words += utt.get_num_words()
|
| 135 |
-
teacher_utt_count += 1
|
| 136 |
-
else:
|
| 137 |
-
utt.role = 'student'
|
| 138 |
-
student_words += utt.get_num_words()
|
| 139 |
-
student_utt_count += 1
|
| 140 |
-
teacher_percentage = round(
|
| 141 |
-
(teacher_words / (teacher_words + student_words)) * 100)
|
| 142 |
-
student_percentage = 100 - teacher_percentage
|
| 143 |
-
avg_teacher_length = teacher_words / teacher_utt_count
|
| 144 |
-
avg_student_length = student_words / student_utt_count
|
| 145 |
-
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
| 146 |
-
|
| 147 |
-
def get_word_cloud_dicts(self):
|
| 148 |
-
teacher_dict = {}
|
| 149 |
-
student_dict = {}
|
| 150 |
-
uptake_teacher_dict = {}
|
| 151 |
-
stop_words = stopwords.words('english')
|
| 152 |
-
# stopwords = nltk.corpus.stopwords.word('english')
|
| 153 |
-
# print("stopwords: ", stopwords)
|
| 154 |
-
for utt in self.utterances:
|
| 155 |
-
words = (utt.get_clean_text(remove_punct=True)).split(' ')
|
| 156 |
-
for word in words:
|
| 157 |
-
if word in stop_words: continue
|
| 158 |
-
if utt.role == 'teacher':
|
| 159 |
-
if word not in teacher_dict:
|
| 160 |
-
teacher_dict[word] = 0
|
| 161 |
-
teacher_dict[word] += 1
|
| 162 |
-
if utt.uptake == 1:
|
| 163 |
-
if word not in uptake_teacher_dict:
|
| 164 |
-
uptake_teacher_dict[word] = 0
|
| 165 |
-
uptake_teacher_dict[word] += 1
|
| 166 |
-
else:
|
| 167 |
-
if word not in student_dict:
|
| 168 |
-
student_dict[word] = 0
|
| 169 |
-
student_dict[word] += 1
|
| 170 |
-
dict_list = []
|
| 171 |
-
uptake_dict_list = []
|
| 172 |
-
for word in uptake_teacher_dict.keys():
|
| 173 |
-
uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
|
| 174 |
-
for word in teacher_dict.keys():
|
| 175 |
-
dict_list.append(
|
| 176 |
-
{'text': word, 'value': teacher_dict[word], 'category': 'teacher'})
|
| 177 |
-
for word in student_dict.keys():
|
| 178 |
-
dict_list.append(
|
| 179 |
-
{'text': word, 'value': student_dict[word], 'category': 'student'})
|
| 180 |
-
sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
|
| 181 |
-
sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
|
| 182 |
-
return sorted_dict_list[:50], sorted_uptake_dict_list[:50]
|
| 183 |
-
|
| 184 |
-
def get_talk_timeline(self):
|
| 185 |
-
return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
|
| 186 |
-
|
| 187 |
-
def calculate_aggregate_word_count(self):
|
| 188 |
-
unit_measures = [utt.unit_measure for utt in self.utterances]
|
| 189 |
-
if None in unit_measures:
|
| 190 |
-
aggregate_word_count = 0
|
| 191 |
-
for utt in self.utterances:
|
| 192 |
-
aggregate_word_count += utt.get_num_words()
|
| 193 |
-
utt.unit_measure = utt.get_num_words()
|
| 194 |
-
utt.aggregate_unit_measure = aggregate_word_count
|
| 195 |
-
|
| 196 |
-
|
| 197 |
def to_dict(self):
|
| 198 |
return {
|
| 199 |
'utterances': [utterance.to_dict() for utterance in self.utterances],
|
|
@@ -321,6 +218,8 @@ class UptakeModel:
|
|
| 321 |
return_pooler_output=False)
|
| 322 |
return output
|
| 323 |
|
|
|
|
|
|
|
| 324 |
class FocusingQuestionModel:
|
| 325 |
def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
|
| 326 |
print("Loading models...")
|
|
@@ -355,7 +254,8 @@ class FocusingQuestionModel:
|
|
| 355 |
output = self.model(input_ids=instance["input_ids"],
|
| 356 |
attention_mask=instance["attention_mask"],
|
| 357 |
token_type_ids=instance["token_type_ids"])
|
| 358 |
-
return output
|
|
|
|
| 359 |
|
| 360 |
def load_math_terms():
|
| 361 |
math_terms = []
|
|
@@ -365,29 +265,23 @@ def load_math_terms():
|
|
| 365 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
| 366 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
| 367 |
else:
|
| 368 |
-
|
| 369 |
-
|
| 370 |
return math_terms, math_terms_dict
|
| 371 |
|
| 372 |
def run_math_density(transcript):
|
| 373 |
math_terms, math_terms_dict = load_math_terms()
|
| 374 |
-
|
| 375 |
-
|
| 376 |
text = utt.get_clean_text(remove_punct=False)
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
match_list.append(math_terms_dict[term])
|
| 386 |
-
# Update matched positions
|
| 387 |
-
matched_positions.update((match.start(), match.end()) for match in matches)
|
| 388 |
-
num_matches += len(matches)
|
| 389 |
-
utt.num_math_terms = num_matches
|
| 390 |
-
utt.math_terms = match_list
|
| 391 |
|
| 392 |
class EndpointHandler():
|
| 393 |
def __init__(self, path="."):
|
|
@@ -419,13 +313,13 @@ class EndpointHandler():
|
|
| 419 |
transcript.add_utterance(Utterance(**utt))
|
| 420 |
|
| 421 |
print("Running inference on %d examples..." % transcript.length())
|
| 422 |
-
|
| 423 |
# Uptake
|
| 424 |
uptake_model = UptakeModel(
|
| 425 |
self.device, self.tokenizer, self.input_builder)
|
| 426 |
-
uptake_speaker = params.pop("uptake_speaker", None)
|
| 427 |
uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
|
| 428 |
uptake_speaker=uptake_speaker)
|
|
|
|
| 429 |
# Reasoning
|
| 430 |
reasoning_model = ReasoningModel(
|
| 431 |
self.device, self.tokenizer, self.input_builder)
|
|
@@ -443,17 +337,4 @@ class EndpointHandler():
|
|
| 443 |
|
| 444 |
run_math_density(transcript)
|
| 445 |
|
| 446 |
-
transcript.
|
| 447 |
-
transcript.calculate_aggregate_word_count()
|
| 448 |
-
return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
|
| 449 |
-
talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
|
| 450 |
-
return_dict['talkDistribution'] = talk_dist
|
| 451 |
-
return_dict['talkLength'] = talk_len
|
| 452 |
-
talk_moments = transcript.get_talk_timeline()
|
| 453 |
-
return_dict['talkMoments'] = talk_moments
|
| 454 |
-
word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
|
| 455 |
-
return_dict['commonTopWords'] = word_cloud
|
| 456 |
-
return_dict['uptakeTopwords'] = uptake_word_cloud
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
return return_dict
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import weakref
|
| 5 |
import re
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
from utils import clean_str, clean_str_nopunct
|
| 8 |
import torch
|
|
|
|
| 10 |
|
| 11 |
import transformers
|
| 12 |
from transformers import BertTokenizer, BertForSequenceClassification
|
| 13 |
+
|
| 14 |
|
| 15 |
transformers.logging.set_verbosity_debug()
|
| 16 |
|
|
|
|
| 30 |
self.endtime = endtime
|
| 31 |
self.transcript = weakref.ref(transcript) if transcript else None
|
| 32 |
self.props = kwargs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
self.num_math_terms = None
|
| 34 |
self.math_terms = None
|
| 35 |
|
|
|
|
| 36 |
self.uptake = None
|
| 37 |
self.reasoning = None
|
| 38 |
self.question = None
|
|
|
|
| 62 |
**self.props
|
| 63 |
}
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def __repr__(self):
|
| 66 |
return f"Utterance(speaker='{self.speaker}'," \
|
| 67 |
f"text='{self.text}', uid={self.uid}," \
|
|
|
|
| 91 |
def length(self):
|
| 92 |
return len(self.utterances)
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
def to_dict(self):
|
| 95 |
return {
|
| 96 |
'utterances': [utterance.to_dict() for utterance in self.utterances],
|
|
|
|
| 218 |
return_pooler_output=False)
|
| 219 |
return output
|
| 220 |
|
| 221 |
+
|
| 222 |
+
|
| 223 |
class FocusingQuestionModel:
|
| 224 |
def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
|
| 225 |
print("Loading models...")
|
|
|
|
| 254 |
output = self.model(input_ids=instance["input_ids"],
|
| 255 |
attention_mask=instance["attention_mask"],
|
| 256 |
token_type_ids=instance["token_type_ids"])
|
| 257 |
+
return output
|
| 258 |
+
|
| 259 |
|
| 260 |
def load_math_terms():
|
| 261 |
math_terms = []
|
|
|
|
| 265 |
math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
|
| 266 |
math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
|
| 267 |
else:
|
| 268 |
+
math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
|
| 269 |
+
math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
|
| 270 |
return math_terms, math_terms_dict
|
| 271 |
|
| 272 |
def run_math_density(transcript):
|
| 273 |
math_terms, math_terms_dict = load_math_terms()
|
| 274 |
+
for i, utt in enumerate(transcript.utterances):
|
| 275 |
+
found_math_terms = set()
|
| 276 |
text = utt.get_clean_text(remove_punct=False)
|
| 277 |
+
num_math_terms = 0
|
| 278 |
+
for term in math_terms:
|
| 279 |
+
count = len(re.findall(term, text))
|
| 280 |
+
if count > 0:
|
| 281 |
+
found_math_terms.add(math_terms_dict[term])
|
| 282 |
+
num_math_terms += count
|
| 283 |
+
utt.num_math_terms = num_math_terms
|
| 284 |
+
utt.math_terms = list(found_math_terms)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
|
| 286 |
class EndpointHandler():
|
| 287 |
def __init__(self, path="."):
|
|
|
|
| 313 |
transcript.add_utterance(Utterance(**utt))
|
| 314 |
|
| 315 |
print("Running inference on %d examples..." % transcript.length())
|
| 316 |
+
uptake_speaker = params.pop("uptake_speaker", None)
|
| 317 |
# Uptake
|
| 318 |
uptake_model = UptakeModel(
|
| 319 |
self.device, self.tokenizer, self.input_builder)
|
|
|
|
| 320 |
uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
|
| 321 |
uptake_speaker=uptake_speaker)
|
| 322 |
+
|
| 323 |
# Reasoning
|
| 324 |
reasoning_model = ReasoningModel(
|
| 325 |
self.device, self.tokenizer, self.input_builder)
|
|
|
|
| 337 |
|
| 338 |
run_math_density(transcript)
|
| 339 |
|
| 340 |
+
return transcript.to_dict()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|