stanford-nlpxed
/

transcript-analysis

Model card Files and versions

xet

Community

ikarasz committed on Feb 22, 2025

Commit

f0d3fb6

1 Parent(s): 0d0084a

omit math words matched from general words

Browse files

Files changed (1) hide show

handler.py +31 -30

handler.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from typing import Dict, List, Any
 from scipy.special import softmax
 import numpy as np
 import weakref
 import re
@@ -152,31 +153,35 @@ class Transcript:
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
     def get_word_clouds(self):
-        teacher_dict = {}
-        student_dict = {}
-        uptake_teacher_dict = {}
         stop_words = stopwords.words('english')
-        for utt in self.utterances:
-            words = (utt.get_clean_text(remove_punct=True)).split(' ')
-            for word in words:
-                if word in stop_words or word in ['inaudible', 'crosstalk']: continue
-                # handle uptake case
-                if utt.role == 'teacher':
-                    if utt.uptake == 1:
-                        if word not in uptake_teacher_dict:
-                            uptake_teacher_dict[word] = 0
-                        uptake_teacher_dict[word] += 1
-                # ignore math words so they don't get tagged as general
-                if any(math_word in word for math_word in utt.math_terms): continue
-                if utt.role == 'teacher':
-                    if word not in teacher_dict:
-                        teacher_dict[word] = 0
-                    teacher_dict[word] += 1
-                else:
-                    if word not in student_dict:
-                        student_dict[word] = 0
-                    student_dict[word] += 1
         dict_list = []
         uptake_dict_list = []
         teacher_dict_list = []
@@ -395,12 +400,11 @@ def run_math_density(transcript):
         text = utt.get_clean_text(remove_punct=True)
         num_matches = 0
         matched_positions = set()
-        match_list = []
         for regex in sorted_regexes:
             matches = list(re.finditer(regex, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
-            # matched_text = [match.group(0) for match in matches]
             if len(matches) > 0:
                 if utt.role == "teacher":
                     if math_terms_dict[regex] not in teacher_math_word_cloud:
@@ -411,14 +415,11 @@ def run_math_density(transcript):
                         student_math_word_cloud[math_terms_dict[regex]] = 0
                     student_math_word_cloud[math_terms_dict[regex]] += len(matches)
             for match in matches:
-                match_list.append(match.group())
                 matched_positions.add((match.start(), match.end()))
             num_matches += len(matches)
-            # print("match group list: ", [match.group(0) for match in matches])
         utt.num_math_terms = num_matches
-        utt.math_terms = match_list
-        # utt.math_match_positions = list(matched_positions)
-        # utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
     teacher_dict_list = []
     student_dict_list = []
     dict_list = []

 from typing import Dict, List, Any
 from scipy.special import softmax
+from collections import Counter
 import numpy as np
 import weakref
 import re
         return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
     def get_word_clouds(self):
+        # Initialize dictionaries
+        teacher_dict = Counter()
+        student_dict = Counter()
+        uptake_teacher_dict = Counter()
         stop_words = stopwords.words('english')
+        # Go through the utterances
+        for utt in self.utterances:
+            # Get clean text
+            clean_text = utt.get_clean_text(remove_punct=True)
+            words = clean_text.split()
+            words = [word for word in words if word not in stop_words and word not in ['inaudible', 'crosstalk']]
+            # Handle uptake case
+            if utt.role == 'teacher' and utt.uptake == 1:
+                uptake_teacher_dict.update(words)
+            general_text = ' '.join(words)
+            # Replace math terms with empty strings
+            for math_term in utt.math_terms:
+                general_text = general_text.replace(math_term, '')
+                general_text = general_text.replace('  ', ' ')
+            general_words = general_text.split()
+            # Update the appropriate dictionary
+            if utt.role == 'teacher':
+                teacher_dict.update(general_words)
+            else:
+                student_dict.update(general_words)
         dict_list = []
         uptake_dict_list = []
         teacher_dict_list = []
         text = utt.get_clean_text(remove_punct=True)
         num_matches = 0
         matched_positions = set()
+        match_list = set()
         for regex in sorted_regexes:
             matches = list(re.finditer(regex, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             if len(matches) > 0:
                 if utt.role == "teacher":
                     if math_terms_dict[regex] not in teacher_math_word_cloud:
                         student_math_word_cloud[math_terms_dict[regex]] = 0
                     student_math_word_cloud[math_terms_dict[regex]] += len(matches)
             for match in matches:
+                match_list.add(match.group())
                 matched_positions.add((match.start(), match.end()))
             num_matches += len(matches)
         utt.num_math_terms = num_matches
+        utt.math_terms = list(match_list)
     teacher_dict_list = []
     student_dict_list = []
     dict_list = []