stanford-nlpxed
/

transcript-analysis

Model card Files and versions

xet

Community

math-words/fix-word-matching

by ikarasz - opened Jan 29, 2025

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+17

-17

Files changed (1) hide show

handler.py +17 -17

handler.py CHANGED Viewed

@@ -374,20 +374,20 @@ class FocusingQuestionModel:
         return output
 def load_math_terms():
-    math_terms = []
     math_terms_dict = {}
     for term in MATH_WORDS:
         if term in MATH_PREFIXES:
-            math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es|d|ed)?([^a-zA-Z]|$)"] = term
-            math_terms.append(f"(^|[^a-zA-Z]){term}(s|es|d|ed)?([^a-zA-Z]|$)")
         else:
-            math_terms.append(term)
-            math_terms_dict[term] = term
-    return math_terms, math_terms_dict
 def run_math_density(transcript):
-    math_terms, math_terms_dict = load_math_terms()
-    sorted_terms = sorted(math_terms, key=len, reverse=True)
     teacher_math_word_cloud = {}
     student_math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
@@ -395,21 +395,21 @@ def run_math_density(transcript):
         num_matches = 0
         matched_positions = set()
         match_list = []
-        for term in sorted_terms:
-            matches = list(re.finditer(term, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             # matched_text = [match.group(0) for match in matches]
             if len(matches) > 0:
                 if utt.role == "teacher":
-                    if math_terms_dict[term] not in teacher_math_word_cloud:
-                        teacher_math_word_cloud[math_terms_dict[term]] = 0
-                    teacher_math_word_cloud[math_terms_dict[term]] += len(matches)
                 else:
-                    if math_terms_dict[term] not in student_math_word_cloud:
-                        student_math_word_cloud[math_terms_dict[term]] = 0
-                    student_math_word_cloud[math_terms_dict[term]] += len(matches)
-                match_list.append(math_terms_dict[term])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)

         return output
 def load_math_terms():
+    math_regexes = []
     math_terms_dict = {}
     for term in MATH_WORDS:
         if term in MATH_PREFIXES:
+            math_terms_dict[rf"\b{term}(s|es|d|ed)?\b"] = term
+            math_regexes.append(rf"\b{term}(s|es|d|ed)?\b")
         else:
+            math_regexes.append(rf"\b{term}\b")
+            math_terms_dict[rf"\b{term}\b"] = term
+    return math_regexes, math_terms_dict
 def run_math_density(transcript):
+    math_regexes, math_terms_dict = load_math_terms()
+    sorted_regexes = sorted(math_regexes, key=len, reverse=True)
     teacher_math_word_cloud = {}
     student_math_word_cloud = {}
     for i, utt in enumerate(transcript.utterances):
         num_matches = 0
         matched_positions = set()
         match_list = []
+        for regex in sorted_regexes:
+            matches = list(re.finditer(regex, text, re.IGNORECASE))
             # Filter out matches that share positions with longer terms
             matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
             # matched_text = [match.group(0) for match in matches]
             if len(matches) > 0:
                 if utt.role == "teacher":
+                    if math_terms_dict[regex] not in teacher_math_word_cloud:
+                        teacher_math_word_cloud[math_terms_dict[regex]] = 0
+                    teacher_math_word_cloud[math_terms_dict[regex]] += len(matches)
                 else:
+                    if math_terms_dict[regex] not in student_math_word_cloud:
+                        student_math_word_cloud[math_terms_dict[regex]] = 0
+                    student_math_word_cloud[math_terms_dict[regex]] += len(matches)
+                match_list.append(math_terms_dict[regex])
             # Update matched positions
             matched_positions.update((match.start(), match.end()) for match in matches)
             num_matches += len(matches)