merge new March changes
Browse files- handler.py +59 -27
handler.py
CHANGED
|
@@ -86,7 +86,7 @@ class Utterance:
|
|
| 86 |
'aggregateUnitMeasure': self.aggregate_unit_measure,
|
| 87 |
'wordCount': self.word_count,
|
| 88 |
'numMathTerms': self.num_math_terms,
|
| 89 |
-
'mathTerms': self.math_terms
|
| 90 |
}
|
| 91 |
|
| 92 |
def __repr__(self):
|
|
@@ -157,34 +157,45 @@ class Transcript:
|
|
| 157 |
uptake_teacher_dict = {}
|
| 158 |
stop_words = stopwords.words('english')
|
| 159 |
for utt in self.utterances:
|
| 160 |
-
words = (utt.get_clean_text(remove_punct=True)).split(' ')
|
| 161 |
for word in words:
|
| 162 |
-
if word in stop_words: continue
|
|
|
|
| 163 |
if utt.role == 'teacher':
|
| 164 |
-
if word not in teacher_dict:
|
| 165 |
-
teacher_dict[word] = 0
|
| 166 |
-
teacher_dict[word] += 1
|
| 167 |
if utt.uptake == 1:
|
| 168 |
if word not in uptake_teacher_dict:
|
| 169 |
uptake_teacher_dict[word] = 0
|
| 170 |
uptake_teacher_dict[word] += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
else:
|
| 172 |
if word not in student_dict:
|
| 173 |
student_dict[word] = 0
|
| 174 |
student_dict[word] += 1
|
| 175 |
dict_list = []
|
| 176 |
uptake_dict_list = []
|
|
|
|
|
|
|
| 177 |
for word in uptake_teacher_dict.keys():
|
| 178 |
uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
|
| 179 |
for word in teacher_dict.keys():
|
| 180 |
-
|
| 181 |
-
{'text': word, 'value': teacher_dict[word], 'category': '
|
|
|
|
| 182 |
for word in student_dict.keys():
|
| 183 |
-
|
| 184 |
-
{'text': word, 'value': student_dict[word], 'category': '
|
|
|
|
| 185 |
sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
|
| 186 |
sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
|
| 187 |
-
|
|
|
|
|
|
|
| 188 |
|
| 189 |
def get_talk_timeline(self):
|
| 190 |
return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
|
|
@@ -377,9 +388,10 @@ def load_math_terms():
|
|
| 377 |
def run_math_density(transcript):
|
| 378 |
math_terms, math_terms_dict = load_math_terms()
|
| 379 |
sorted_terms = sorted(math_terms, key=len, reverse=True)
|
| 380 |
-
|
|
|
|
| 381 |
for i, utt in enumerate(transcript.utterances):
|
| 382 |
-
text = utt.get_clean_text(remove_punct=
|
| 383 |
num_matches = 0
|
| 384 |
matched_positions = set()
|
| 385 |
match_list = []
|
|
@@ -387,22 +399,41 @@ def run_math_density(transcript):
|
|
| 387 |
matches = list(re.finditer(term, text, re.IGNORECASE))
|
| 388 |
# Filter out matches that share positions with longer terms
|
| 389 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
|
|
|
| 390 |
if len(matches) > 0:
|
| 391 |
-
if
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
# Update matched positions
|
| 396 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
| 397 |
num_matches += len(matches)
|
|
|
|
| 398 |
utt.num_math_terms = num_matches
|
| 399 |
utt.math_terms = match_list
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
dict_list = []
|
| 401 |
-
for word in
|
| 402 |
-
|
| 403 |
-
{'text': word, 'value':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
|
| 405 |
-
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
class EndpointHandler():
|
| 408 |
def __init__(self, path="."):
|
|
@@ -457,18 +488,19 @@ class EndpointHandler():
|
|
| 457 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 458 |
del focusing_question_model
|
| 459 |
|
| 460 |
-
math_cloud = run_math_density(transcript)
|
| 461 |
transcript.update_utterance_roles(uptake_speaker)
|
|
|
|
| 462 |
transcript.calculate_aggregate_word_count()
|
| 463 |
-
return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, '
|
| 464 |
talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
|
| 465 |
return_dict['talkDistribution'] = talk_dist
|
| 466 |
return_dict['talkLength'] = talk_len
|
| 467 |
talk_moments = transcript.get_talk_timeline()
|
| 468 |
return_dict['talkMoments'] = talk_moments
|
| 469 |
-
word_cloud, uptake_word_cloud = transcript.get_word_clouds()
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
return_dict['
|
|
|
|
| 473 |
|
| 474 |
return return_dict
|
|
|
|
| 86 |
'aggregateUnitMeasure': self.aggregate_unit_measure,
|
| 87 |
'wordCount': self.word_count,
|
| 88 |
'numMathTerms': self.num_math_terms,
|
| 89 |
+
'mathTerms': self.math_terms,
|
| 90 |
}
|
| 91 |
|
| 92 |
def __repr__(self):
|
|
|
|
| 157 |
uptake_teacher_dict = {}
|
| 158 |
stop_words = stopwords.words('english')
|
| 159 |
for utt in self.utterances:
|
| 160 |
+
words = (utt.get_clean_text(remove_punct=True)).split(' ')
|
| 161 |
for word in words:
|
| 162 |
+
if word in stop_words or word in ['inaudible', 'crosstalk']: continue
|
| 163 |
+
# handle uptake case
|
| 164 |
if utt.role == 'teacher':
|
|
|
|
|
|
|
|
|
|
| 165 |
if utt.uptake == 1:
|
| 166 |
if word not in uptake_teacher_dict:
|
| 167 |
uptake_teacher_dict[word] = 0
|
| 168 |
uptake_teacher_dict[word] += 1
|
| 169 |
+
# ignore math words so they don't get tagged as general
|
| 170 |
+
if any(math_word in word for math_word in utt.math_terms): continue
|
| 171 |
+
if utt.role == 'teacher':
|
| 172 |
+
if word not in teacher_dict:
|
| 173 |
+
teacher_dict[word] = 0
|
| 174 |
+
teacher_dict[word] += 1
|
| 175 |
+
|
| 176 |
else:
|
| 177 |
if word not in student_dict:
|
| 178 |
student_dict[word] = 0
|
| 179 |
student_dict[word] += 1
|
| 180 |
dict_list = []
|
| 181 |
uptake_dict_list = []
|
| 182 |
+
teacher_dict_list = []
|
| 183 |
+
student_dict_list = []
|
| 184 |
for word in uptake_teacher_dict.keys():
|
| 185 |
uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
|
| 186 |
for word in teacher_dict.keys():
|
| 187 |
+
teacher_dict_list.append(
|
| 188 |
+
{'text': word, 'value': teacher_dict[word], 'category': 'general'})
|
| 189 |
+
dict_list.append({'text': word, 'value': teacher_dict[word], 'category': 'general'})
|
| 190 |
for word in student_dict.keys():
|
| 191 |
+
student_dict_list.append(
|
| 192 |
+
{'text': word, 'value': student_dict[word], 'category': 'general'})
|
| 193 |
+
dict_list.append({'text': word, 'value': student_dict[word], 'category': 'general'})
|
| 194 |
sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
|
| 195 |
sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
|
| 196 |
+
sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
|
| 197 |
+
sorted_student_dict_list = sorted(student_dict_list, key=lambda x: x['value'], reverse=True)
|
| 198 |
+
return sorted_dict_list[:50], sorted_uptake_dict_list[:50], sorted_teacher_dict_list[:50], sorted_student_dict_list[:50]
|
| 199 |
|
| 200 |
def get_talk_timeline(self):
|
| 201 |
return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
|
|
|
|
| 388 |
def run_math_density(transcript):
|
| 389 |
math_terms, math_terms_dict = load_math_terms()
|
| 390 |
sorted_terms = sorted(math_terms, key=len, reverse=True)
|
| 391 |
+
teacher_math_word_cloud = {}
|
| 392 |
+
student_math_word_cloud = {}
|
| 393 |
for i, utt in enumerate(transcript.utterances):
|
| 394 |
+
text = utt.get_clean_text(remove_punct=True)
|
| 395 |
num_matches = 0
|
| 396 |
matched_positions = set()
|
| 397 |
match_list = []
|
|
|
|
| 399 |
matches = list(re.finditer(term, text, re.IGNORECASE))
|
| 400 |
# Filter out matches that share positions with longer terms
|
| 401 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
| 402 |
+
# matched_text = [match.group(0) for match in matches]
|
| 403 |
if len(matches) > 0:
|
| 404 |
+
if utt.role == "teacher":
|
| 405 |
+
if math_terms_dict[term] not in teacher_math_word_cloud:
|
| 406 |
+
teacher_math_word_cloud[math_terms_dict[term]] = 0
|
| 407 |
+
teacher_math_word_cloud[math_terms_dict[term]] += len(matches)
|
| 408 |
+
else:
|
| 409 |
+
if math_terms_dict[term] not in student_math_word_cloud:
|
| 410 |
+
student_math_word_cloud[math_terms_dict[term]] = 0
|
| 411 |
+
student_math_word_cloud[math_terms_dict[term]] += len(matches)
|
| 412 |
+
match_list.append(math_terms_dict[term])
|
| 413 |
# Update matched positions
|
| 414 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
| 415 |
num_matches += len(matches)
|
| 416 |
+
# print("match group list: ", [match.group(0) for match in matches])
|
| 417 |
utt.num_math_terms = num_matches
|
| 418 |
utt.math_terms = match_list
|
| 419 |
+
# utt.math_match_positions = list(matched_positions)
|
| 420 |
+
# utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
|
| 421 |
+
teacher_dict_list = []
|
| 422 |
+
student_dict_list = []
|
| 423 |
dict_list = []
|
| 424 |
+
for word in teacher_math_word_cloud.keys():
|
| 425 |
+
teacher_dict_list.append(
|
| 426 |
+
{'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
|
| 427 |
+
dict_list.append({'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
|
| 428 |
+
for word in student_math_word_cloud.keys():
|
| 429 |
+
student_dict_list.append(
|
| 430 |
+
{'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
|
| 431 |
+
dict_list.append({'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
|
| 432 |
sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
|
| 433 |
+
sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
|
| 434 |
+
sorted_student_dict_list = sorted(student_dict_list, key=lambda x: x['value'], reverse=True)
|
| 435 |
+
# return sorted_dict_list[:50]
|
| 436 |
+
return sorted_dict_list[:50], sorted_teacher_dict_list[:50], sorted_student_dict_list[:50]
|
| 437 |
|
| 438 |
class EndpointHandler():
|
| 439 |
def __init__(self, path="."):
|
|
|
|
| 488 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 489 |
del focusing_question_model
|
| 490 |
|
|
|
|
| 491 |
transcript.update_utterance_roles(uptake_speaker)
|
| 492 |
+
sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
|
| 493 |
transcript.calculate_aggregate_word_count()
|
| 494 |
+
return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'studentTopWords': None, 'teacherTopWords': None}
|
| 495 |
talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
|
| 496 |
return_dict['talkDistribution'] = talk_dist
|
| 497 |
return_dict['talkLength'] = talk_len
|
| 498 |
talk_moments = transcript.get_talk_timeline()
|
| 499 |
return_dict['talkMoments'] = talk_moments
|
| 500 |
+
word_cloud, uptake_word_cloud, teacher_general_cloud, student_general_cloud = transcript.get_word_clouds()
|
| 501 |
+
teacher_cloud = teacher_math_cloud + teacher_general_cloud
|
| 502 |
+
student_cloud = student_math_cloud + student_general_cloud
|
| 503 |
+
return_dict['teacherTopWords'] = teacher_cloud
|
| 504 |
+
return_dict['studentTopWords'] = student_cloud
|
| 505 |
|
| 506 |
return return_dict
|