glossary.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ add, added, adding
2
+ divide, divided, dividing
handler.py CHANGED
@@ -1,6 +1,7 @@
1
  from typing import Dict, List, Any
 
2
  from scipy.special import softmax
3
- from collections import Counter
4
  import numpy as np
5
  import weakref
6
  import re
@@ -15,6 +16,7 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES
15
  import transformers
16
  from transformers import BertTokenizer, BertForSequenceClassification
17
  from transformers.utils import logging
 
18
 
19
  transformers.logging.set_verbosity_debug()
20
 
@@ -44,6 +46,7 @@ class Utterance:
44
  self.aggregate_unit_measure = endtime
45
  self.num_math_terms = None
46
  self.math_terms = None
 
47
 
48
  # moments
49
  self.uptake = None
@@ -72,6 +75,7 @@ class Utterance:
72
  'focusingQuestion': self.focusing_question,
73
  'numMathTerms': self.num_math_terms,
74
  'mathTerms': self.math_terms,
 
75
  **self.props
76
  }
77
 
@@ -82,12 +86,19 @@ class Utterance:
82
  'uid': self.uid,
83
  'role': self.role,
84
  'timestamp': self.timestamp,
85
- 'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
 
 
 
 
 
 
86
  'unitMeasure': self.unit_measure,
87
  'aggregateUnitMeasure': self.aggregate_unit_measure,
88
  'wordCount': self.word_count,
89
  'numMathTerms': self.num_math_terms,
90
  'mathTerms': self.math_terms,
 
91
  }
92
 
93
  def __repr__(self):
@@ -151,6 +162,34 @@ class Transcript:
151
  avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
152
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def get_word_clouds(self):
155
  # Initialize dictionaries
156
  teacher_dict = Counter()
@@ -491,6 +530,11 @@ class EndpointHandler():
491
  self.device, self.tokenizer, self.input_builder)
492
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
493
  del focusing_question_model
 
 
 
 
 
494
 
495
  transcript.update_utterance_roles(uptake_speaker)
496
  sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
@@ -506,5 +550,6 @@ class EndpointHandler():
506
  student_cloud = student_math_cloud + student_general_cloud
507
  return_dict['teacherTopWords'] = teacher_cloud
508
  return_dict['studentTopWords'] = student_cloud
 
509
 
510
  return return_dict
 
1
  from typing import Dict, List, Any
2
+ from measures.VocabularyAnalyser import VocabularyAnalyser
3
  from scipy.special import softmax
4
+ from collections import Counter, defaultdict
5
  import numpy as np
6
  import weakref
7
  import re
 
16
  import transformers
17
  from transformers import BertTokenizer, BertForSequenceClassification
18
  from transformers.utils import logging
19
+ from pathlib import Path
20
 
21
  transformers.logging.set_verbosity_debug()
22
 
 
46
  self.aggregate_unit_measure = endtime
47
  self.num_math_terms = None
48
  self.math_terms = None
49
+ self.vocabulary_terms = None
50
 
51
  # moments
52
  self.uptake = None
 
75
  'focusingQuestion': self.focusing_question,
76
  'numMathTerms': self.num_math_terms,
77
  'mathTerms': self.math_terms,
78
+ 'vocabularyTerms': self.vocabulary_terms,
79
  **self.props
80
  }
81
 
 
86
  'uid': self.uid,
87
  'role': self.role,
88
  'timestamp': self.timestamp,
89
+ 'moments': {
90
+ 'reasoning': True if self.reasoning else False,
91
+ 'questioning': True if self.question else False,
92
+ 'uptake': True if self.uptake else False,
93
+ 'focusingQuestion': True if self.focusing_question else False,
94
+ 'mathWord': bool(self.vocabulary_terms)
95
+ },
96
  'unitMeasure': self.unit_measure,
97
  'aggregateUnitMeasure': self.aggregate_unit_measure,
98
  'wordCount': self.word_count,
99
  'numMathTerms': self.num_math_terms,
100
  'mathTerms': self.math_terms,
101
+ 'vocabularyTerms': self.vocabulary_terms
102
  }
103
 
104
  def __repr__(self):
 
162
  avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
163
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
164
 
165
+ def get_vocabulary_table(self):
166
+ """
167
+ Build a summary of matched vocabulary terms by utterance role.
168
+ Returns a dict of the form:
169
+ {
170
+ 'add': {
171
+ 'teacher': {'count': 2, 'utterances': [1,5]},
172
+ 'student': {'count': 1, 'utterances': [2]}
173
+ },
174
+ 'divide': {
175
+ 'teacher': {...},
176
+ 'student': {...}
177
+ }
178
+ }
179
+ """
180
+ vocab_summary = defaultdict(lambda: {"teacher": {"count": 0, "utterances": []},
181
+ "student": {"count": 0, "utterances": []}})
182
+
183
+ for utt in self.utterances:
184
+ role = utt.role if utt.role in ("teacher", "student") else "student" # default
185
+ if utt.vocabulary_terms:
186
+ for term in utt.vocabulary_terms:
187
+ vocab_summary[term][role]["count"] += 1
188
+ vocab_summary[term][role]["utterances"].append(utt.uid)
189
+
190
+ return vocab_summary
191
+
192
+
193
  def get_word_clouds(self):
194
  # Initialize dictionaries
195
  teacher_dict = Counter()
 
530
  self.device, self.tokenizer, self.input_builder)
531
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
532
  del focusing_question_model
533
+
534
+ glossary_path = Path(__file__).resolve().parent / "glossary.csv"
535
+ vocabulary_analyser = VocabularyAnalyser(str(glossary_path))
536
+ vocabulary_analyser.run_analysis(transcript)
537
+ del vocabulary_analyser
538
 
539
  transcript.update_utterance_roles(uptake_speaker)
540
  sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
 
550
  student_cloud = student_math_cloud + student_general_cloud
551
  return_dict['teacherTopWords'] = teacher_cloud
552
  return_dict['studentTopWords'] = student_cloud
553
+ return_dict['vocabularyTable'] = transcript.get_vocabulary_table()
554
 
555
  return return_dict
measures/VocabularyAnalyser.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import unicodedata
4
+ import weakref
5
+
6
# --- helper normalization ---
def norm_txt(x: str) -> str:
    """Normalize text for glossary matching.

    NFKC-normalizes, lower-cases, maps hyphen/dash variants to spaces, and
    collapses runs of whitespace to a single space. Returns "" for None.
    """
    if x is None:
        return ""
    x = unicodedata.normalize("NFKC", str(x))
    x = x.lower()
    # Replace dashes BEFORE collapsing whitespace; the reverse order left
    # runs of spaces (e.g. "a - b" -> "a   b") that defeat exact
    # multi-word form matching against single-spaced glossary forms.
    x = re.sub(r"[-‐–—]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x


class VocabularyAnalyser:
    """Matches glossary vocabulary terms inside transcript utterances.

    The glossary CSV is headerless; each row lists a base term followed by
    its variant forms, comma-separated (e.g. "add, added, adding").
    """

    def __init__(self, glossary_file: str):
        # header=None: the glossary is pure data — the default header=0
        # silently consumed the first glossary row as column names, so its
        # terms were never matched. keep_default_na=False prevents empty
        # cells from becoming float NaN.
        raw = pd.read_csv(glossary_file, header=None, dtype=str,
                          keep_default_na=False)
        # Unquoted rows like "add, added, adding" are split across columns
        # by read_csv; re-join all cells so every variant form survives.
        # (A quoted single-cell row passes through unchanged.)
        rows = raw.apply(lambda r: ",".join(c for c in r if c), axis=1)
        terms_col = rows.apply(norm_txt)

        gloss_list = []
        for idx, cell in enumerate(terms_col, start=1):
            items = [t.strip() for t in cell.split(",") if t.strip()]
            if not items:
                continue
            base = items[0]  # first entry is the canonical/base form
            for form in items:
                gloss_list.append({
                    "row": idx,
                    "base": base,
                    "form": form,
                    "len": len(form),
                    "words": len(form.split())
                })

        if gloss_list:
            self.gloss_forms = (
                pd.DataFrame(gloss_list)
                .drop_duplicates(["base", "form"])
                # Most-specific forms first (more tokens, then longer) so
                # greedy matching prefers them over shorter substrings.
                .sort_values(["words", "len"], ascending=[False, False])
            )
        else:
            # Empty glossary: a column-less DataFrame would make
            # drop_duplicates/column access raise; keep a typed empty frame.
            self.gloss_forms = pd.DataFrame(
                columns=["row", "base", "form", "len", "words"])

    def match_one_utterance(self, text: str):
        """Return the sorted, de-duplicated list of matched base terms.

        Matching is whole-word (regex \\b boundaries) on normalized text;
        overlapping matches are resolved greedily in favour of forms with
        more tokens, then longer spans, then earlier starts.
        """
        s = norm_txt(text)
        if not s:
            return []

        locs = []
        for fm, bs, wd in zip(self.gloss_forms["form"],
                              self.gloss_forms["base"],
                              self.gloss_forms["words"]):
            if not fm:
                continue
            pattern = r"\b" + re.escape(fm) + r"\b"
            for m in re.finditer(pattern, s):
                locs.append({
                    "start": m.start(),
                    "end": m.end(),
                    "form": fm,
                    "base": bs,
                    "words": wd,
                    "len": m.end() - m.start()
                })

        if not locs:
            return []

        # prioritize: more tokens > longer span > earlier start
        locs_df = pd.DataFrame(locs).sort_values(
            ["words", "len", "start"], ascending=[False, False, True]
        )

        # Greedy non-overlapping selection over character positions.
        used = [False] * len(s)
        keep_bases = []
        for _, row in locs_df.iterrows():
            rng = range(row["start"], row["end"])
            if not any(used[i] for i in rng):
                keep_bases.append(row["base"])
                for i in rng:
                    used[i] = True

        return sorted(set(keep_bases))

    def run_analysis(self, transcript):
        """Mutate transcript utterances by adding a vocabulary_terms list.

        Returns the same transcript object for convenience.
        """
        for utt in transcript.utterances:
            utt.vocabulary_terms = self.match_one_utterance(utt.text)
        return transcript
measures/__init__.py ADDED
File without changes
requirements.txt CHANGED
@@ -7,3 +7,4 @@ torchvision==0.19.0
7
  transformers==4.46.1
8
  nltk==3.9.1
9
  inflect==7.5.0
 
 
7
  transformers==4.46.1
8
  nltk==3.9.1
9
  inflect==7.5.0
10
+ pandas==2.2.2