hylee719
/

transcript-analysis-testing

Model card Files Files and versions

xet

Community

hylee719 committed on Feb 23, 2024

Commit

c83470a

verified ·

1 Parent(s): 7290b72

add teacherTopWords and studentTopWords

Browse files

Files changed (1) hide show

handler.py +36 -19

handler.py CHANGED Viewed

@@ -174,17 +174,19 @@ class Transcript:
                     student_dict[word] += 1
         dict_list = []
         uptake_dict_list = []
         for word in uptake_teacher_dict.keys():
             uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
         for word in teacher_dict.keys():
-            dict_list.append(
-                {'text': word, 'value': teacher_dict[word], 'category': 'teacher'})
         for word in student_dict.keys():
-            dict_list.append(
-                {'text': word, 'value': student_dict[word], 'category': 'student'})
         sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
         sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
-        return sorted_dict_list[:50], sorted_uptake_dict_list[:50]
     def get_talk_timeline(self):
         """Return a list holding ``to_talk_timeline_dict()`` of each utterance.

         Entries follow the iteration order of ``self.utterances``.
         """
         return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
@@ -377,7 +379,8 @@ def load_math_terms():
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
-    math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
@@ -388,21 +391,31 @@ def run_math_density(transcript):
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
-                if math_terms_dict[term] not in math_word_cloud:
-                    math_word_cloud[math_terms_dict[term]] = 0
-                math_word_cloud[math_terms_dict[term]] += len(matches)
-                match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
-    dict_list = []
-    for word in math_word_cloud.keys():
-        dict_list.append(
-            {'text': word, 'value': math_word_cloud[word]})
-    sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
-    return sorted_dict_list[:50]
 class EndpointHandler():
     def __init__(self, path="."):
@@ -457,18 +470,22 @@ class EndpointHandler():
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
         del focusing_question_model
-        math_cloud = run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.get_word_clouds()
         return_dict['commonTopWords'] = word_cloud
         return_dict['uptakeTopWords'] = uptake_word_cloud
         return_dict['mathTopWords'] = math_cloud
         return return_dict

                     student_dict[word] += 1
         dict_list = []
         uptake_dict_list = []
+        teacher_dict_list = []
+        student_dict_list = []
         for word in uptake_teacher_dict.keys():
             uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
         for word in teacher_dict.keys():
+            teacher_dict_list.append(
+                {'text': word, 'value': teacher_dict[word], 'category': 'general'})
         for word in student_dict.keys():
+            student_dict_list.append(
+                {'text': word, 'value': student_dict[word], 'category': 'general'})
         sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
         sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
+        return sorted_dict_list[:50], sorted_uptake_dict_list[:50], teacher_dict_list, student_dict_list
     def get_talk_timeline(self):
         """Return a list holding ``to_talk_timeline_dict()`` of each utterance.

         Entries follow the iteration order of ``self.utterances``.
         """
         return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
+    teacher_math_word_cloud = {}
+    student_math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
+                if utt.role == "teacher":
+                    if math_terms_dict[term] not in teacher_math_word_cloud:
+                        teacher_math_word_cloud[math_terms_dict[term]] = 0
+                    teacher_math_word_cloud[math_terms_dict[term]] += len(matches)
+                else:
+                    if math_terms_dict[term] not in student_math_word_cloud:
+                        student_math_word_cloud[math_terms_dict[term]] = 0
+                    student_math_word_cloud[math_terms_dict[term]] += len(matches)
+                match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
+    teacher_dict_list = []
+    student_dict_list = []
+    for word in teacher_math_word_cloud.keys():
+        teacher_dict_list.append(
+            {'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
+    for word in student_math_word_cloud.keys():
+        student_dict_list.append(
+            {'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
+    # sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
+    # return sorted_dict_list[:50]
+    return teacher_dict_list, student_dict_list
 class EndpointHandler():
     def __init__(self, path="."):
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
         del focusing_question_model
         transcript.update_utterance_roles(uptake_speaker)
+        teacher_math_cloud, student_math_cloud = run_math_density(transcript)
         transcript.calculate_aggregate_word_count()
+        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None, 'studentTopWords': None, 'teacherTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
+        word_cloud, uptake_word_cloud, teacher_general_cloud, student_general_cloud = transcript.get_word_clouds()
         return_dict['commonTopWords'] = word_cloud
         return_dict['uptakeTopWords'] = uptake_word_cloud
         return_dict['mathTopWords'] = math_cloud
+        teacher_cloud = teacher_math_cloud + teacher_general_cloud
+        student_cloud = student_math_cloud + student_general_cloud
+        return_dict['teacherTopWords'] = teacher_cloud
+        return_dict['studentTopWords'] = student_cloud
         return return_dict