hylee719 committed on
Commit
0bf6be0
·
1 Parent(s): da14a77

update math terms calculation and return format for visualizations

Browse files
Files changed (1) hide show
  1. handler.py +139 -20
handler.py CHANGED
@@ -3,6 +3,9 @@ from scipy.special import softmax
3
  import numpy as np
4
  import weakref
5
  import re
 
 
 
6
 
7
  from utils import clean_str, clean_str_nopunct
8
  import torch
@@ -10,7 +13,7 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES
10
 
11
  import transformers
12
  from transformers import BertTokenizer, BertForSequenceClassification
13
-
14
 
15
  transformers.logging.set_verbosity_debug()
16
 
@@ -30,9 +33,15 @@ class Utterance:
30
  self.endtime = endtime
31
  self.transcript = weakref.ref(transcript) if transcript else None
32
  self.props = kwargs
 
 
 
 
 
33
  self.num_math_terms = None
34
  self.math_terms = None
35
 
 
36
  self.uptake = None
37
  self.reasoning = None
38
  self.question = None
@@ -62,6 +71,20 @@ class Utterance:
62
  **self.props
63
  }
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def __repr__(self):
66
  return f"Utterance(speaker='{self.speaker}'," \
67
  f"text='{self.text}', uid={self.uid}," \
@@ -91,6 +114,86 @@ class Transcript:
91
  def length(self):
92
  return len(self.utterances)
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def to_dict(self):
95
  return {
96
  'utterances': [utterance.to_dict() for utterance in self.utterances],
@@ -218,8 +321,6 @@ class UptakeModel:
218
  return_pooler_output=False)
219
  return output
220
 
221
-
222
-
223
  class FocusingQuestionModel:
224
  def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
225
  print("Loading models...")
@@ -254,8 +355,7 @@ class FocusingQuestionModel:
254
  output = self.model(input_ids=instance["input_ids"],
255
  attention_mask=instance["attention_mask"],
256
  token_type_ids=instance["token_type_ids"])
257
- return output
258
-
259
 
260
  def load_math_terms():
261
  math_terms = []
@@ -265,23 +365,29 @@ def load_math_terms():
265
  math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
266
  math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
267
  else:
268
- math_terms_dict[f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)"] = term
269
- math_terms.append(f"(^|[^a-zA-Z]){term}([^a-zA-Z]|$)")
270
  return math_terms, math_terms_dict
271
 
272
  def run_math_density(transcript):
273
  math_terms, math_terms_dict = load_math_terms()
274
- for i, utt in enumerate(transcript.utterances):
275
- found_math_terms = set()
276
  text = utt.get_clean_text(remove_punct=False)
277
- num_math_terms = 0
278
- for term in math_terms:
279
- count = len(re.findall(term, text))
280
- if count > 0:
281
- found_math_terms.add(math_terms_dict[term])
282
- num_math_terms += count
283
- utt.num_math_terms = num_math_terms
284
- utt.math_terms = list(found_math_terms)
 
 
 
 
 
 
285
 
286
  class EndpointHandler():
287
  def __init__(self, path="."):
@@ -313,13 +419,13 @@ class EndpointHandler():
313
  transcript.add_utterance(Utterance(**utt))
314
 
315
  print("Running inference on %d examples..." % transcript.length())
316
- uptake_speaker = params.pop("uptake_speaker", None)
317
  # Uptake
318
  uptake_model = UptakeModel(
319
  self.device, self.tokenizer, self.input_builder)
 
320
  uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
321
  uptake_speaker=uptake_speaker)
322
-
323
  # Reasoning
324
  reasoning_model = ReasoningModel(
325
  self.device, self.tokenizer, self.input_builder)
@@ -337,4 +443,17 @@ class EndpointHandler():
337
 
338
  run_math_density(transcript)
339
 
340
- return transcript.to_dict()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import numpy as np
4
  import weakref
5
  import re
6
+ import nltk
7
+ from nltk.corpus import stopwords
8
+ nltk.download('stopwords')
9
 
10
  from utils import clean_str, clean_str_nopunct
11
  import torch
 
13
 
14
  import transformers
15
  from transformers import BertTokenizer, BertForSequenceClassification
16
+ from transformers.utils import logging
17
 
18
  transformers.logging.set_verbosity_debug()
19
 
 
33
  self.endtime = endtime
34
  self.transcript = weakref.ref(transcript) if transcript else None
35
  self.props = kwargs
36
+ self.role = None
37
+ self.word_count = self.get_num_words()
38
+ self.timestamp = [starttime, endtime]
39
+ self.unit_measure = None
40
+ self.aggregate_unit_measure = endtime
41
  self.num_math_terms = None
42
  self.math_terms = None
43
 
44
+ # moments
45
  self.uptake = None
46
  self.reasoning = None
47
  self.question = None
 
71
  **self.props
72
  }
73
 
74
def to_talk_timeline_dict(self):
    """Serialize this utterance into the payload shape used by the talk-timeline visualization."""
    # Collapse each model output to a plain boolean flag for the front end.
    moments = {
        'reasoning': bool(self.reasoning),
        'questioning': bool(self.question),
        'uptake': bool(self.uptake),
        'focusingQuestion': bool(self.focusing_question),
    }
    return {
        'speaker': self.speaker,
        'text': self.text,
        'role': self.role,
        'timestamp': self.timestamp,
        'moments': moments,
        'unitMeasure': self.unit_measure,
        'aggregateUnitMeasure': self.aggregate_unit_measure,
        'wordCount': self.word_count,
        'numMathTerms': self.num_math_terms,
        'mathTerms': self.math_terms,
    }
87
+
88
  def __repr__(self):
89
  return f"Utterance(speaker='{self.speaker}'," \
90
  f"text='{self.text}', uid={self.uid}," \
 
114
  def length(self):
115
  return len(self.utterances)
116
 
117
def update_utterance_roles(self, uptake_speaker):
    """Tag every utterance with a role: 'teacher' for the uptake speaker, 'student' otherwise."""
    for utterance in self.utterances:
        utterance.role = 'teacher' if utterance.speaker == uptake_speaker else 'student'
123
+
124
def get_talk_distribution_and_length(self, uptake_speaker):
    """Compute talk-share percentages and average utterance lengths per role.

    Side effect: assigns each utterance's ``role`` ('teacher' for the uptake
    speaker, 'student' otherwise), mirroring ``update_utterance_roles``.

    Returns:
        (distribution, length): two dicts keyed by 'teacher'/'student' —
        word-share percentages (ints summing to 100 when any words exist)
        and average words per utterance. Returns None when uptake_speaker
        is None.

    Fix: guards the divisions so an empty transcript or a transcript with
    only one role no longer raises ZeroDivisionError.
    """
    if uptake_speaker is None:
        return None
    teacher_words = 0
    teacher_utt_count = 0
    student_words = 0
    student_utt_count = 0
    for utt in self.utterances:
        if utt.speaker == uptake_speaker:
            utt.role = 'teacher'
            teacher_words += utt.get_num_words()
            teacher_utt_count += 1
        else:
            utt.role = 'student'
            student_words += utt.get_num_words()
            student_utt_count += 1
    total_words = teacher_words + student_words
    # Avoid division by zero on a silent/empty transcript.
    teacher_percentage = round((teacher_words / total_words) * 100) if total_words else 0
    student_percentage = (100 - teacher_percentage) if total_words else 0
    avg_teacher_length = teacher_words / teacher_utt_count if teacher_utt_count else 0
    avg_student_length = student_words / student_utt_count if student_utt_count else 0
    return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
146
+
147
def get_word_cloud_dicts(self):
    """Build word-frequency entries for the word-cloud visualizations.

    Returns:
        (top_words, top_uptake_words): the 50 most frequent non-stopword
        tokens across all utterances, each tagged with the speaker category
        ('teacher'/'student'), and the 50 most frequent teacher tokens from
        utterances the uptake model flagged (uptake == 1).

    Fixes: skips the empty tokens that ``split(' ')`` yields on repeated
    spaces (they were previously counted as words), uses a set for O(1)
    stopword lookups, and drops dead commented-out code.
    """
    teacher_counts = {}
    student_counts = {}
    uptake_teacher_counts = {}
    # Set lookup instead of scanning the stopword list per token.
    stop_words = set(stopwords.words('english'))
    for utt in self.utterances:
        for word in utt.get_clean_text(remove_punct=True).split(' '):
            # Skip stopwords and the '' tokens produced by consecutive spaces.
            if not word or word in stop_words:
                continue
            if utt.role == 'teacher':
                teacher_counts[word] = teacher_counts.get(word, 0) + 1
                if utt.uptake == 1:
                    uptake_teacher_counts[word] = uptake_teacher_counts.get(word, 0) + 1
            else:
                student_counts[word] = student_counts.get(word, 0) + 1
    dict_list = [
        {'text': w, 'value': c, 'category': 'teacher'} for w, c in teacher_counts.items()
    ] + [
        {'text': w, 'value': c, 'category': 'student'} for w, c in student_counts.items()
    ]
    uptake_dict_list = [
        {'text': w, 'value': c, 'category': 'teacher'} for w, c in uptake_teacher_counts.items()
    ]
    sorted_dict_list = sorted(dict_list, key=lambda x: x['value'], reverse=True)
    sorted_uptake_dict_list = sorted(uptake_dict_list, key=lambda x: x['value'], reverse=True)
    return sorted_dict_list[:50], sorted_uptake_dict_list[:50]
183
+
184
def get_talk_timeline(self):
    """Return the per-utterance timeline payload for the whole transcript."""
    timeline = []
    for utterance in self.utterances:
        timeline.append(utterance.to_talk_timeline_dict())
    return timeline
186
+
187
def calculate_aggregate_word_count(self):
    """Backfill word-count-based unit measures on the utterances.

    If any utterance is missing a unit_measure, every utterance gets its
    unit_measure set to its own word count and its aggregate_unit_measure
    set to the running word total up to and including it. No-op when all
    utterances already carry a unit_measure.
    """
    if any(utt.unit_measure is None for utt in self.utterances):
        running_total = 0
        for utt in self.utterances:
            word_count = utt.get_num_words()
            running_total += word_count
            utt.unit_measure = word_count
            utt.aggregate_unit_measure = running_total
195
+
196
+
197
  def to_dict(self):
198
  return {
199
  'utterances': [utterance.to_dict() for utterance in self.utterances],
 
321
  return_pooler_output=False)
322
  return output
323
 
 
 
324
  class FocusingQuestionModel:
325
  def __init__(self, device, tokenizer, input_builder, max_length=128, path=FOCUSING_QUESTION_MODEL):
326
  print("Loading models...")
 
355
  output = self.model(input_ids=instance["input_ids"],
356
  attention_mask=instance["attention_mask"],
357
  token_type_ids=instance["token_type_ids"])
358
+ return output
 
359
 
360
  def load_math_terms():
361
  math_terms = []
 
365
  math_terms_dict[f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)"] = term
366
  math_terms.append(f"(^|[^a-zA-Z]){term}(s|es)?([^a-zA-Z]|$)")
367
  else:
368
+ math_terms.append(term)
369
+ math_terms_dict[term] = term
370
  return math_terms, math_terms_dict
371
 
372
def run_math_density(transcript):
    """Count math-vocabulary occurrences in every utterance of the transcript.

    Terms are matched longest-first, and any candidate match whose start
    falls inside the span of an already-matched (longer) term is discarded,
    so a compound term does not additionally count its sub-terms. Writes
    ``num_math_terms`` (total match count) and ``math_terms`` (list of
    canonical terms found, via math_terms_dict) onto each utterance.

    Fix: patterns are compiled once up front instead of being recompiled
    for every term on every utterance; the unused enumerate index is gone.
    """
    math_terms, math_terms_dict = load_math_terms()
    # Longest patterns first so compound terms win over their sub-terms.
    sorted_terms = sorted(math_terms, key=len, reverse=True)
    # Hoist regex compilation out of the per-utterance loop.
    compiled_terms = [(term, re.compile(term, re.IGNORECASE)) for term in sorted_terms]
    for utt in transcript.utterances:
        text = utt.get_clean_text(remove_punct=False)
        num_matches = 0
        matched_positions = set()
        match_list = []
        for term, pattern in compiled_terms:
            # Drop matches that start inside a span claimed by a longer term.
            matches = [m for m in pattern.finditer(text)
                       if not any(start <= m.start() < end for start, end in matched_positions)]
            if matches:
                match_list.append(math_terms_dict[term])
                matched_positions.update((m.start(), m.end()) for m in matches)
                num_matches += len(matches)
        utt.num_math_terms = num_matches
        utt.math_terms = match_list
391
 
392
  class EndpointHandler():
393
  def __init__(self, path="."):
 
419
  transcript.add_utterance(Utterance(**utt))
420
 
421
  print("Running inference on %d examples..." % transcript.length())
422
+ logging.set_verbosity_info()
423
  # Uptake
424
  uptake_model = UptakeModel(
425
  self.device, self.tokenizer, self.input_builder)
426
+ uptake_speaker = params.pop("uptake_speaker", None)
427
  uptake_model.run_inference(transcript, min_prev_words=params['uptake_min_num_words'],
428
  uptake_speaker=uptake_speaker)
 
429
  # Reasoning
430
  reasoning_model = ReasoningModel(
431
  self.device, self.tokenizer, self.input_builder)
 
443
 
444
  run_math_density(transcript)
445
 
446
+ transcript.update_utterance_roles(uptake_speaker)
447
+ transcript.calculate_aggregate_word_count()
448
+ return_dict = {'talkDistribution': None, 'talkLength': None, 'talkMoments': None, 'commonTopWords': None, 'uptakeTopWords': None}
449
+ talk_dist, talk_len = transcript.get_talk_distribution_and_length(uptake_speaker)
450
+ return_dict['talkDistribution'] = talk_dist
451
+ return_dict['talkLength'] = talk_len
452
+ talk_moments = transcript.get_talk_timeline()
453
+ return_dict['talkMoments'] = talk_moments
454
+ word_cloud, uptake_word_cloud = transcript.get_word_cloud_dicts()
455
+ return_dict['commonTopWords'] = word_cloud
456
+ return_dict['uptakeTopwords'] = uptake_word_cloud
457
+
458
+
459
+ return return_dict