math-words/fix-word-matching
#2
by
ikarasz
- opened
- handler.py +17 -17
handler.py
CHANGED
|
@@ -374,20 +374,20 @@ class FocusingQuestionModel:
|
|
| 374 |
return output
|
| 375 |
|
| 376 |
def load_math_terms():
|
| 377 |
-
|
| 378 |
math_terms_dict = {}
|
| 379 |
for term in MATH_WORDS:
|
| 380 |
if term in MATH_PREFIXES:
|
| 381 |
-
math_terms_dict[
|
| 382 |
-
|
| 383 |
else:
|
| 384 |
-
|
| 385 |
-
math_terms_dict[term] = term
|
| 386 |
-
return
|
| 387 |
|
| 388 |
def run_math_density(transcript):
|
| 389 |
-
|
| 390 |
-
|
| 391 |
teacher_math_word_cloud = {}
|
| 392 |
student_math_word_cloud = {}
|
| 393 |
for i, utt in enumerate(transcript.utterances):
|
|
@@ -395,21 +395,21 @@ def run_math_density(transcript):
|
|
| 395 |
num_matches = 0
|
| 396 |
matched_positions = set()
|
| 397 |
match_list = []
|
| 398 |
-
for
|
| 399 |
-
matches = list(re.finditer(
|
| 400 |
# Filter out matches that share positions with longer terms
|
| 401 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
| 402 |
# matched_text = [match.group(0) for match in matches]
|
| 403 |
if len(matches) > 0:
|
| 404 |
if utt.role == "teacher":
|
| 405 |
-
if math_terms_dict[
|
| 406 |
-
teacher_math_word_cloud[math_terms_dict[
|
| 407 |
-
teacher_math_word_cloud[math_terms_dict[
|
| 408 |
else:
|
| 409 |
-
if math_terms_dict[
|
| 410 |
-
student_math_word_cloud[math_terms_dict[
|
| 411 |
-
student_math_word_cloud[math_terms_dict[
|
| 412 |
-
match_list.append(math_terms_dict[
|
| 413 |
# Update matched positions
|
| 414 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
| 415 |
num_matches += len(matches)
|
|
|
|
| 374 |
return output
|
| 375 |
|
| 376 |
def load_math_terms():
    """Build word-boundary regex patterns for every entry in MATH_WORDS.

    Terms that also appear in MATH_PREFIXES get an optional
    ``s``/``es``/``d``/``ed`` suffix group appended before the closing
    word boundary; all other terms are wrapped in plain ``\\b`` anchors.

    Returns:
        A ``(patterns, pattern_to_term)`` tuple: the list of pattern
        strings in MATH_WORDS iteration order, and a dict mapping each
        pattern string back to the term it was built from.
    """
    patterns = []
    pattern_to_term = {}
    for term in MATH_WORDS:
        # NOTE(review): the term is interpolated into the pattern unescaped;
        # confirm MATH_WORDS holds no regex metacharacters.
        suffix = "(s|es|d|ed)?" if term in MATH_PREFIXES else ""
        pattern = rf"\b{term}{suffix}\b"
        patterns.append(pattern)
        pattern_to_term[pattern] = term
    return patterns, pattern_to_term
|
| 387 |
|
| 388 |
def run_math_density(transcript):
|
| 389 |
+
math_regexes, math_terms_dict = load_math_terms()
|
| 390 |
+
sorted_regexes = sorted(math_regexes, key=len, reverse=True)
|
| 391 |
teacher_math_word_cloud = {}
|
| 392 |
student_math_word_cloud = {}
|
| 393 |
for i, utt in enumerate(transcript.utterances):
|
|
|
|
| 395 |
num_matches = 0
|
| 396 |
matched_positions = set()
|
| 397 |
match_list = []
|
| 398 |
+
for regex in sorted_regexes:
|
| 399 |
+
matches = list(re.finditer(regex, text, re.IGNORECASE))
|
| 400 |
# Filter out matches that share positions with longer terms
|
| 401 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
| 402 |
# matched_text = [match.group(0) for match in matches]
|
| 403 |
if len(matches) > 0:
|
| 404 |
if utt.role == "teacher":
|
| 405 |
+
if math_terms_dict[regex] not in teacher_math_word_cloud:
|
| 406 |
+
teacher_math_word_cloud[math_terms_dict[regex]] = 0
|
| 407 |
+
teacher_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
| 408 |
else:
|
| 409 |
+
if math_terms_dict[regex] not in student_math_word_cloud:
|
| 410 |
+
student_math_word_cloud[math_terms_dict[regex]] = 0
|
| 411 |
+
student_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
| 412 |
+
match_list.append(math_terms_dict[regex])
|
| 413 |
# Update matched positions
|
| 414 |
matched_positions.update((match.start(), match.end()) for match in matches)
|
| 415 |
num_matches += len(matches)
|