Omit matched math terms from the general word clouds
Browse files
- handler.py +31 -30
handler.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
from typing import Dict, List, Any
|
| 2 |
from scipy.special import softmax
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import weakref
|
| 5 |
import re
|
|
@@ -152,31 +153,35 @@ class Transcript:
|
|
| 152 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
| 153 |
|
| 154 |
def get_word_clouds(self):
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
| 158 |
stop_words = stopwords.words('english')
|
| 159 |
-
for utt in self.utterances:
|
| 160 |
-
words = (utt.get_clean_text(remove_punct=True)).split(' ')
|
| 161 |
-
for word in words:
|
| 162 |
-
if word in stop_words or word in ['inaudible', 'crosstalk']: continue
|
| 163 |
-
# handle uptake case
|
| 164 |
-
if utt.role == 'teacher':
|
| 165 |
-
if utt.uptake == 1:
|
| 166 |
-
if word not in uptake_teacher_dict:
|
| 167 |
-
uptake_teacher_dict[word] = 0
|
| 168 |
-
uptake_teacher_dict[word] += 1
|
| 169 |
-
# ignore math words so they don't get tagged as general
|
| 170 |
-
if any(math_word in word for math_word in utt.math_terms): continue
|
| 171 |
-
if utt.role == 'teacher':
|
| 172 |
-
if word not in teacher_dict:
|
| 173 |
-
teacher_dict[word] = 0
|
| 174 |
-
teacher_dict[word] += 1
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
dict_list = []
|
| 181 |
uptake_dict_list = []
|
| 182 |
teacher_dict_list = []
|
|
@@ -395,12 +400,11 @@ def run_math_density(transcript):
|
|
| 395 |
text = utt.get_clean_text(remove_punct=True)
|
| 396 |
num_matches = 0
|
| 397 |
matched_positions = set()
|
| 398 |
-
match_list =
|
| 399 |
for regex in sorted_regexes:
|
| 400 |
matches = list(re.finditer(regex, text, re.IGNORECASE))
|
| 401 |
# Filter out matches that share positions with longer terms
|
| 402 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
| 403 |
-
# matched_text = [match.group(0) for match in matches]
|
| 404 |
if len(matches) > 0:
|
| 405 |
if utt.role == "teacher":
|
| 406 |
if math_terms_dict[regex] not in teacher_math_word_cloud:
|
|
@@ -411,14 +415,11 @@ def run_math_density(transcript):
|
|
| 411 |
student_math_word_cloud[math_terms_dict[regex]] = 0
|
| 412 |
student_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
| 413 |
for match in matches:
|
| 414 |
-
match_list.
|
| 415 |
matched_positions.add((match.start(), match.end()))
|
| 416 |
num_matches += len(matches)
|
| 417 |
-
# print("match group list: ", [match.group(0) for match in matches])
|
| 418 |
utt.num_math_terms = num_matches
|
| 419 |
-
utt.math_terms = match_list
|
| 420 |
-
# utt.math_match_positions = list(matched_positions)
|
| 421 |
-
# utt.math_terms_raw = [text[start:end] for start, end in matched_positions]
|
| 422 |
teacher_dict_list = []
|
| 423 |
student_dict_list = []
|
| 424 |
dict_list = []
|
|
|
|
| 1 |
from typing import Dict, List, Any
|
| 2 |
from scipy.special import softmax
|
| 3 |
+
from collections import Counter
|
| 4 |
import numpy as np
|
| 5 |
import weakref
|
| 6 |
import re
|
|
|
|
| 153 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
| 154 |
|
| 155 |
def get_word_clouds(self):
|
| 156 |
+
# Initialize dictionaries
|
| 157 |
+
teacher_dict = Counter()
|
| 158 |
+
student_dict = Counter()
|
| 159 |
+
uptake_teacher_dict = Counter()
|
| 160 |
stop_words = stopwords.words('english')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
+
# Go through the utterances
|
| 163 |
+
for utt in self.utterances:
|
| 164 |
+
# Get clean text
|
| 165 |
+
clean_text = utt.get_clean_text(remove_punct=True)
|
| 166 |
+
words = clean_text.split()
|
| 167 |
+
words = [word for word in words if word not in stop_words and word not in ['inaudible', 'crosstalk']]
|
| 168 |
+
|
| 169 |
+
# Handle uptake case
|
| 170 |
+
if utt.role == 'teacher' and utt.uptake == 1:
|
| 171 |
+
uptake_teacher_dict.update(words)
|
| 172 |
+
|
| 173 |
+
general_text = ' '.join(words)
|
| 174 |
+
# Replace math terms with empty strings
|
| 175 |
+
for math_term in utt.math_terms:
|
| 176 |
+
general_text = general_text.replace(math_term, '')
|
| 177 |
+
general_text = general_text.replace(' ', ' ')
|
| 178 |
+
|
| 179 |
+
general_words = general_text.split()
|
| 180 |
+
# Update the appropriate dictionary
|
| 181 |
+
if utt.role == 'teacher':
|
| 182 |
+
teacher_dict.update(general_words)
|
| 183 |
+
else:
|
| 184 |
+
student_dict.update(general_words)
|
| 185 |
dict_list = []
|
| 186 |
uptake_dict_list = []
|
| 187 |
teacher_dict_list = []
|
|
|
|
| 400 |
text = utt.get_clean_text(remove_punct=True)
|
| 401 |
num_matches = 0
|
| 402 |
matched_positions = set()
|
| 403 |
+
match_list = set()
|
| 404 |
for regex in sorted_regexes:
|
| 405 |
matches = list(re.finditer(regex, text, re.IGNORECASE))
|
| 406 |
# Filter out matches that share positions with longer terms
|
| 407 |
matches = [match for match in matches if not any(match.start() in range(existing[0], existing[1]) for existing in matched_positions)]
|
|
|
|
| 408 |
if len(matches) > 0:
|
| 409 |
if utt.role == "teacher":
|
| 410 |
if math_terms_dict[regex] not in teacher_math_word_cloud:
|
|
|
|
| 415 |
student_math_word_cloud[math_terms_dict[regex]] = 0
|
| 416 |
student_math_word_cloud[math_terms_dict[regex]] += len(matches)
|
| 417 |
for match in matches:
|
| 418 |
+
match_list.add(match.group())
|
| 419 |
matched_positions.add((match.start(), match.end()))
|
| 420 |
num_matches += len(matches)
|
|
|
|
| 421 |
utt.num_math_terms = num_matches
|
| 422 |
+
utt.math_terms = list(match_list)
|
|
|
|
|
|
|
| 423 |
teacher_dict_list = []
|
| 424 |
student_dict_list = []
|
| 425 |
dict_list = []
|