stanford-nlpxed
/

transcript-analysis

Model card Files and versions

xet

Community

hylee719 committed on Mar 18, 2024

Commit

56773d6

verified ·

1 Parent(s): ee3321b

merge new March changes

Browse files

Files changed (1) hide show

handler.py +59 -27

handler.py CHANGED Viewed

@@ -86,7 +86,7 @@ class Utterance:
             'aggregateUnitMeasure': self.aggregate_unit_measure,
             'wordCount': self.word_count,
             'numMathTerms': self.num_math_terms,
-            'mathTerms': self.math_terms
         }
     def __repr__(self):
@@ -157,34 +157,45 @@ class Transcript:
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
         for utt in self.utterances:
-            words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
-                if word in stop_words: continue
                 if utt.role == 'teacher':
-                    if word not in teacher_dict:
-                        teacher_dict[word] = 0
-                    teacher_dict[word] += 1
                     if utt.uptake == 1:
                         if word not in uptake_teacher_dict:
                             uptake_teacher_dict[word] = 0
                         uptake_teacher_dict[word] += 1
                 else:
                     if word not in student_dict:
                         student_dict[word] = 0
                     student_dict[word] += 1
         dict_list = []
         uptake_dict_list = []
         for word in uptake_teacher_dict.keys():
             uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
         for word in teacher_dict.keys():
-            dict_list.append(
-                {'text': word, 'value': teacher_dict[word], 'category': 'teacher'})
         for word in student_dict.keys():
-            dict_list.append(
-                {'text': word, 'value': student_dict[word], 'category': 'student'})
         sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
         sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
-        return sorted_dict_list[:50], sorted_uptake_dict_list[:50]
     def get_talk_timeline(self):
         return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
@@ -377,9 +388,10 @@ def load_math_terms():
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
-    math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
-        text = utt.get_clean_text(remove_punct=False)
         num_matches = 0
         matched_positions = set()
         match_list = []
@@ -387,22 +399,41 @@ def run_math_density(transcript):
             matches = list(re.finditer(term, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
-                if math_terms_dict[term] not in math_word_cloud:
-                    math_word_cloud[math_terms_dict[term]] = 0
-                math_word_cloud[math_terms_dict[term]] += len(matches)
-                match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
     dict_list = []
-    for word in math_word_cloud.keys():
-        dict_list.append(
-            {'text': word, 'value': math_word_cloud[word]})
     sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
-    return sorted_dict_list[:50]
 class EndpointHandler():
     def __init__(self, path="."):
@@ -457,18 +488,19 @@ class EndpointHandler():
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
         del focusing_question_model
-        math_cloud = run_math_density(transcript)
         transcript.update_utterance_roles(uptake_speaker)
         transcript.calculate_aggregate_word_count()
-        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None, 'mathTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
-        word_cloud, uptake_word_cloud = transcript.get_word_clouds()
-        return_dict['commonTopWords'] = word_cloud
-        return_dict['uptakeTopWords'] = uptake_word_cloud
-        return_dict['mathTopWords'] = math_cloud
         return return_dict

             'aggregateUnitMeasure': self.aggregate_unit_measure,
             'wordCount': self.word_count,
             'numMathTerms': self.num_math_terms,
+            'mathTerms': self.math_terms,
         }
     def __repr__(self):
         uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
         for utt in self.utterances:
+            words = (utt.get_clean_text(remove_punct=True)).split(' ')
             for word in words:
+                if word in stop_words or word in ['inaudible', 'crosstalk']: continue
+                # handle uptake case
                 if utt.role == 'teacher':
                     if utt.uptake == 1:
                         if word not in uptake_teacher_dict:
                             uptake_teacher_dict[word] = 0
                         uptake_teacher_dict[word] += 1
+                # ignore math words so they don't get tagged as general
+                if any(math_word in word for math_word in utt.math_terms): continue
+                if utt.role == 'teacher':
+                    if word not in teacher_dict:
+                        teacher_dict[word] = 0
+                    teacher_dict[word] += 1
                 else:
                     if word not in student_dict:
                         student_dict[word] = 0
                     student_dict[word] += 1
         dict_list = []
         uptake_dict_list = []
+        teacher_dict_list = []
+        student_dict_list = []
         for word in uptake_teacher_dict.keys():
             uptake_dict_list.append({'text': word, 'value': uptake_teacher_dict[word], 'category': 'teacher'})
         for word in teacher_dict.keys():
+            teacher_dict_list.append(
+                {'text': word, 'value': teacher_dict[word], 'category': 'general'})
+            dict_list.append({'text': word, 'value': teacher_dict[word], 'category': 'general'})
         for word in student_dict.keys():
+            student_dict_list.append(
+                {'text': word, 'value': student_dict[word], 'category': 'general'})
+            dict_list.append({'text': word, 'value': student_dict[word], 'category': 'general'})
         sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
         sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
+        sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
+        sorted_student_dict_list = sorted(student_dict_list, key=lambda x: x['value'], reverse=True)
+        return sorted_dict_list[:50], sorted_uptake_dict_list[:50], sorted_teacher_dict_list[:50], sorted_student_dict_list[:50]
     def get_talk_timeline(self):
         return [utterance.to_talk_timeline_dict() for utterance in self.utterances]
 def run_math_density(transcript):
     math_terms, math_terms_dict = load_math_terms()
     sorted_terms = sorted(math_terms, key=len, reverse=True)
+    teacher_math_word_cloud = {}
+    student_math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
+        text = utt.get_clean_text(remove_punct=True)
         num_matches = 0
         matched_positions = set()
         match_list = []
             matches = list(re.finditer(term, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
+            # matched_text = [match.group(0) for match in matches]
             if len(matches) > 0:
+                if utt.role == "teacher":
+                    if math_terms_dict[term] not in teacher_math_word_cloud:
+                        teacher_math_word_cloud[math_terms_dict[term]] = 0
+                    teacher_math_word_cloud[math_terms_dict[term]] += len(matches)
+                else:
+                    if math_terms_dict[term] not in student_math_word_cloud:
+                        student_math_word_cloud[math_terms_dict[term]] = 0
+                    student_math_word_cloud[math_terms_dict[term]] += len(matches)
+                match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)
+            # print("match group list: ", [match.group(0) for match in matches])
         utt.num_math_terms = num_matches
         utt.math_terms = match_list
+        # utt.math_match_positions = list(matched_positions)
+        # utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
+    teacher_dict_list = []
+    student_dict_list = []
     dict_list = []
+    for word in teacher_math_word_cloud.keys():
+        teacher_dict_list.append(
+            {'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
+        dict_list.append({'text': word, 'value': teacher_math_word_cloud[word], 'category': "math"})
+    for word in student_math_word_cloud.keys():
+        student_dict_list.append(
+            {'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
+        dict_list.append({'text': word, 'value': student_math_word_cloud[word], 'category': "math"})
     sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
+    sorted_teacher_dict_list = sorted(teacher_dict_list, key=lambda x: x['value'], reverse=True)
+    sorted_student_dict_list = sorted(student_dict_list, key=lambda x: x['value'], reverse=True)
+    # return sorted_dict_list[:50]
+    return sorted_dict_list[:50], sorted_teacher_dict_list[:50], sorted_student_dict_list[:50]
 class EndpointHandler():
     def __init__(self, path="."):
         focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
         del focusing_question_model
         transcript.update_utterance_roles(uptake_speaker)
+        sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
         transcript.calculate_aggregate_word_count()
+        return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'studentTopWords': None, 'teacherTopWords': None}
         talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
         return_dict['talkDistribution'] = talk_dist
         return_dict['talkLength'] = talk_len
         talk_moments = transcript.get_talk_timeline()
         return_dict['talkMoments'] = talk_moments
+        word_cloud, uptake_word_cloud, teacher_general_cloud, student_general_cloud = transcript.get_word_clouds()
+        teacher_cloud = teacher_math_cloud + teacher_general_cloud
+        student_cloud = student_math_cloud + student_general_cloud
+        return_dict['teacherTopWords'] = teacher_cloud
+        return_dict['studentTopWords'] = student_cloud
         return return_dict