hylee
committed on
Commit
·
4a18cf3
1
Parent(s):
c2f7754
update with del
Browse files- handler.py +42 -27
handler.py
CHANGED
|
@@ -36,8 +36,11 @@ class Utterance:
|
|
| 36 |
self.role = None
|
| 37 |
self.word_count = self.get_num_words()
|
| 38 |
self.timestamp = [starttime, endtime]
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
self.num_math_terms = None
|
| 42 |
self.math_terms = None
|
| 43 |
|
|
@@ -75,6 +78,7 @@ class Utterance:
|
|
| 75 |
return{
|
| 76 |
'speaker': self.speaker,
|
| 77 |
'text': self.text,
|
|
|
|
| 78 |
'role': self.role,
|
| 79 |
'timestamp': self.timestamp,
|
| 80 |
'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
|
|
@@ -137,20 +141,21 @@ class Transcript:
|
|
| 137 |
utt.role = 'student'
|
| 138 |
student_words += utt.get_num_words()
|
| 139 |
student_utt_count += 1
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
|
|
|
|
|
|
|
|
|
| 145 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
| 146 |
|
| 147 |
-
def
|
| 148 |
teacher_dict = {}
|
| 149 |
student_dict = {}
|
| 150 |
uptake_teacher_dict = {}
|
| 151 |
stop_words = stopwords.words('english')
|
| 152 |
-
# stopwords = nltk.corpus.stopwords.word('english')
|
| 153 |
-
# print("stopwords: ", stopwords)
|
| 154 |
for utt in self.utterances:
|
| 155 |
words = (utt.get_clean_text(remove_punct=True)).split(' ')
|
| 156 |
for word in words:
|
|
@@ -227,7 +232,7 @@ class QuestionModel:
|
|
| 227 |
max_length=self.max_length,
|
| 228 |
input_str=True)
|
| 229 |
output = self.get_prediction(instance)
|
| 230 |
-
print(output)
|
| 231 |
utt.question = np.argmax(
|
| 232 |
output["is_question_logits"][0].tolist())
|
| 233 |
|
|
@@ -255,11 +260,11 @@ class ReasoningModel:
|
|
| 255 |
self.model = BertForSequenceClassification.from_pretrained(path)
|
| 256 |
self.model.to(self.device)
|
| 257 |
|
| 258 |
-
def run_inference(self, transcript, min_num_words=8):
|
| 259 |
self.model.eval()
|
| 260 |
with torch.no_grad():
|
| 261 |
for i, utt in enumerate(transcript.utterances):
|
| 262 |
-
if utt.get_num_words() >= min_num_words:
|
| 263 |
instance = self.input_builder.build_inputs([], utt.text,
|
| 264 |
max_length=self.max_length,
|
| 265 |
input_str=True)
|
|
@@ -372,6 +377,7 @@ def load_math_terms():
|
|
| 372 |
def run_math_density(transcript):
|
| 373 |
math_terms, math_terms_dict = load_math_terms()
|
| 374 |
sorted_terms = sorted(math_terms, key=len, reverse=True)
|
|
|
|
| 375 |
for i, utt in enumerate(transcript.utterances):
|
| 376 |
text = utt.get_clean_text(remove_punct=False)
|
| 377 |
num_matches = 0
|
|
@@ -382,12 +388,21 @@ def run_math_density(transcript):
|
|
| 382 |
# Filter out matches that share positions with longer terms
|
| 383 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
| 384 |
if len(matches) > 0:
|
|
|
|
|
|
|
|
|
|
| 385 |
match_list.append(math_terms_dict[term])
|
| 386 |
# Update matched positions
|
| 387 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
| 388 |
num_matches += len(matches)
|
| 389 |
utt.num_math_terms = num_matches
|
| 390 |
utt.math_terms = match_list
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
|
| 392 |
class EndpointHandler():
|
| 393 |
def __init__(self, path="."):
|
|
@@ -410,10 +425,6 @@ class EndpointHandler():
|
|
| 410 |
utterances = data.pop("inputs", data)
|
| 411 |
params = data.pop("parameters", None)
|
| 412 |
|
| 413 |
-
print("EXAMPLES")
|
| 414 |
-
for utt in utterances[:3]:
|
| 415 |
-
print("speaker %s: %s" % (utt["speaker"], utt["text"]))
|
| 416 |
-
|
| 417 |
transcript = Transcript(filename=params.pop("filename", None))
|
| 418 |
for utt in utterances:
|
| 419 |
transcript.add_utterance(Utterance(**utt))
|
|
@@ -426,34 +437,38 @@ class EndpointHandler():
|
|
| 426 |
uptake_speaker = params.pop("uptake_speaker", None)
|
| 427 |
uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
|
| 428 |
uptake_speaker=uptake_speaker)
|
|
|
|
|
|
|
| 429 |
# Reasoning
|
| 430 |
reasoning_model = ReasoningModel(
|
| 431 |
self.device, self.tokenizer, self.input_builder)
|
| 432 |
-
reasoning_model.run_inference(transcript)
|
| 433 |
-
|
|
|
|
| 434 |
# Question
|
| 435 |
question_model = QuestionModel(
|
| 436 |
self.device, self.tokenizer, self.input_builder)
|
| 437 |
question_model.run_inference(transcript)
|
| 438 |
-
|
|
|
|
| 439 |
# Focusing Question
|
| 440 |
focusing_question_model = FocusingQuestionModel(
|
| 441 |
self.device, self.tokenizer, self.input_builder)
|
| 442 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
transcript.update_utterance_roles(uptake_speaker)
|
| 447 |
transcript.calculate_aggregate_word_count()
|
| 448 |
-
return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
|
| 449 |
talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
|
| 450 |
return_dict['talkDistribution'] = talk_dist
|
| 451 |
return_dict['talkLength'] = talk_len
|
| 452 |
talk_moments = transcript.get_talk_timeline()
|
| 453 |
return_dict['talkMoments'] = talk_moments
|
| 454 |
-
word_cloud, uptake_word_cloud = transcript.
|
| 455 |
return_dict['commonTopWords'] = word_cloud
|
| 456 |
-
return_dict['
|
| 457 |
-
|
| 458 |
|
| 459 |
return return_dict
|
|
|
|
| 36 |
self.role = None
|
| 37 |
self.word_count = self.get_num_words()
|
| 38 |
self.timestamp = [starttime, endtime]
|
| 39 |
+
if starttime is not None and endtime is not None:
|
| 40 |
+
self.unit_measure = endtime - starttime
|
| 41 |
+
else:
|
| 42 |
+
self.unit_measure = None
|
| 43 |
+
self.aggregate_unit_measure = endtime
|
| 44 |
self.num_math_terms = None
|
| 45 |
self.math_terms = None
|
| 46 |
|
|
|
|
| 78 |
return{
|
| 79 |
'speaker': self.speaker,
|
| 80 |
'text': self.text,
|
| 81 |
+
'uid': self.uid,
|
| 82 |
'role': self.role,
|
| 83 |
'timestamp': self.timestamp,
|
| 84 |
'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
|
|
|
|
| 141 |
utt.role = 'student'
|
| 142 |
student_words += utt.get_num_words()
|
| 143 |
student_utt_count += 1
|
| 144 |
+
if teacher_words + student_words > 0:
|
| 145 |
+
teacher_percentage = round(
|
| 146 |
+
(teacher_words / (teacher_words + student_words)) * 100)
|
| 147 |
+
student_percentage = 100 - teacher_percentage
|
| 148 |
+
else:
|
| 149 |
+
teacher_percentage = student_percentage = 0
|
| 150 |
+
avg_teacher_length = teacher_words / teacher_utt_count if teacher_utt_count > 0 else 0
|
| 151 |
+
avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
|
| 152 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
| 153 |
|
| 154 |
+
def get_word_clouds(self):
|
| 155 |
teacher_dict = {}
|
| 156 |
student_dict = {}
|
| 157 |
uptake_teacher_dict = {}
|
| 158 |
stop_words = stopwords.words('english')
|
|
|
|
|
|
|
| 159 |
for utt in self.utterances:
|
| 160 |
words = (utt.get_clean_text(remove_punct=True)).split(' ')
|
| 161 |
for word in words:
|
|
|
|
| 232 |
max_length=self.max_length,
|
| 233 |
input_str=True)
|
| 234 |
output = self.get_prediction(instance)
|
| 235 |
+
# print(output)
|
| 236 |
utt.question = np.argmax(
|
| 237 |
output["is_question_logits"][0].tolist())
|
| 238 |
|
|
|
|
| 260 |
self.model = BertForSequenceClassification.from_pretrained(path)
|
| 261 |
self.model.to(self.device)
|
| 262 |
|
| 263 |
+
def run_inference(self, transcript, min_num_words=8, uptake_speaker=None):
|
| 264 |
self.model.eval()
|
| 265 |
with torch.no_grad():
|
| 266 |
for i, utt in enumerate(transcript.utterances):
|
| 267 |
+
if utt.get_num_words() >= min_num_words and utt.speaker != uptake_speaker:
|
| 268 |
instance = self.input_builder.build_inputs([], utt.text,
|
| 269 |
max_length=self.max_length,
|
| 270 |
input_str=True)
|
|
|
|
| 377 |
def run_math_density(transcript):
|
| 378 |
math_terms, math_terms_dict = load_math_terms()
|
| 379 |
sorted_terms = sorted(math_terms, key=len, reverse=True)
|
| 380 |
+
math_word_cloud = {}
|
| 381 |
for i, utt in enumerate(transcript.utterances):
|
| 382 |
text = utt.get_clean_text(remove_punct=False)
|
| 383 |
num_matches = 0
|
|
|
|
| 388 |
# Filter out matches that share positions with longer terms
|
| 389 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
| 390 |
if len(matches) > 0:
|
| 391 |
+
if math_terms_dict[term] not in math_word_cloud:
|
| 392 |
+
math_word_cloud[math_terms_dict[term]] = 0
|
| 393 |
+
math_word_cloud[math_terms_dict[term]] += len(matches)
|
| 394 |
match_list.append(math_terms_dict[term])
|
| 395 |
# Update matched positions
|
| 396 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
| 397 |
num_matches += len(matches)
|
| 398 |
utt.num_math_terms = num_matches
|
| 399 |
utt.math_terms = match_list
|
| 400 |
+
dict_list = []
|
| 401 |
+
for word in math_word_cloud.keys():
|
| 402 |
+
dict_list.append(
|
| 403 |
+
{'text': word, 'value': math_word_cloud[word]})
|
| 404 |
+
sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
|
| 405 |
+
return sorted_dict_list[:50]
|
| 406 |
|
| 407 |
class EndpointHandler():
|
| 408 |
def __init__(self, path="."):
|
|
|
|
| 425 |
utterances = data.pop("inputs", data)
|
| 426 |
params = data.pop("parameters", None)
|
| 427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
transcript = Transcript(filename=params.pop("filename", None))
|
| 429 |
for utt in utterances:
|
| 430 |
transcript.add_utterance(Utterance(**utt))
|
|
|
|
| 437 |
uptake_speaker = params.pop("uptake_speaker", None)
|
| 438 |
uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
|
| 439 |
uptake_speaker=uptake_speaker)
|
| 440 |
+
del uptake_model
|
| 441 |
+
|
| 442 |
# Reasoning
|
| 443 |
reasoning_model = ReasoningModel(
|
| 444 |
self.device, self.tokenizer, self.input_builder)
|
| 445 |
+
reasoning_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 446 |
+
del reasoning_model
|
| 447 |
+
|
| 448 |
# Question
|
| 449 |
question_model = QuestionModel(
|
| 450 |
self.device, self.tokenizer, self.input_builder)
|
| 451 |
question_model.run_inference(transcript)
|
| 452 |
+
del question_model
|
| 453 |
+
|
| 454 |
# Focusing Question
|
| 455 |
focusing_question_model = FocusingQuestionModel(
|
| 456 |
self.device, self.tokenizer, self.input_builder)
|
| 457 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 458 |
+
del focusing_question_model
|
| 459 |
+
|
| 460 |
+
math_cloud = run_math_density(transcript)
|
| 461 |
transcript.update_utterance_roles(uptake_speaker)
|
| 462 |
transcript.calculate_aggregate_word_count()
|
| 463 |
+
return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
|
| 464 |
talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
|
| 465 |
return_dict['talkDistribution'] = talk_dist
|
| 466 |
return_dict['talkLength'] = talk_len
|
| 467 |
talk_moments = transcript.get_talk_timeline()
|
| 468 |
return_dict['talkMoments'] = talk_moments
|
| 469 |
+
word_cloud, uptake_word_cloud = transcript.get_word_clouds()
|
| 470 |
return_dict['commonTopWords'] = word_cloud
|
| 471 |
+
return_dict['uptakeTopWords'] = uptake_word_cloud
|
| 472 |
+
return_dict['mathTopWords'] = math_cloud
|
| 473 |
|
| 474 |
return return_dict
|