glossary.csv ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ add, added, adding
2
+ divide, divided, dividing
handler.py CHANGED
@@ -1,6 +1,7 @@
1
  from typing import Dict, List, Any
 
2
  from scipy.special import softmax
3
- from collections import Counter
4
  import numpy as np
5
  import weakref
6
  import re
@@ -15,6 +16,7 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES
15
  import transformers
16
  from transformers import BertTokenizer, BertForSequenceClassification
17
  from transformers.utils import logging
 
18
 
19
  transformers.logging.set_verbosity_debug()
20
 
@@ -44,6 +46,7 @@ class Utterance:
44
  self.aggregate_unit_measure = endtime
45
  self.num_math_terms = None
46
  self.math_terms = None
 
47
 
48
  # moments
49
  self.uptake = None
@@ -72,6 +75,7 @@ class Utterance:
72
  'focusingQuestion': self.focusing_question,
73
  'numMathTerms': self.num_math_terms,
74
  'mathTerms': self.math_terms,
 
75
  **self.props
76
  }
77
 
@@ -82,12 +86,19 @@ class Utterance:
82
  'uid': self.uid,
83
  'role': self.role,
84
  'timestamp': self.timestamp,
85
- 'moments': {'reasoning': True if self.reasoning else False, 'questioning': True if self.question else False, 'uptake': True if self.uptake else False, 'focusingQuestion': True if self.focusing_question else False},
 
 
 
 
 
 
86
  'unitMeasure': self.unit_measure,
87
  'aggregateUnitMeasure': self.aggregate_unit_measure,
88
  'wordCount': self.word_count,
89
  'numMathTerms': self.num_math_terms,
90
  'mathTerms': self.math_terms,
 
91
  }
92
 
93
  def __repr__(self):
@@ -151,6 +162,34 @@ class Transcript:
151
  avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
152
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def get_word_clouds(self):
155
  # Initialize dictionaries
156
  teacher_dict = Counter()
@@ -491,6 +530,11 @@ class EndpointHandler():
491
  self.device, self.tokenizer, self.input_builder)
492
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
493
  del focusing_question_model
 
 
 
 
 
494
 
495
  transcript.update_utterance_roles(uptake_speaker)
496
  sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
@@ -506,5 +550,6 @@ class EndpointHandler():
506
  student_cloud = student_math_cloud + student_general_cloud
507
  return_dict['teacherTopWords'] = teacher_cloud
508
  return_dict['studentTopWords'] = student_cloud
 
509
 
510
  return return_dict
 
1
  from typing import Dict, List, Any
2
+ from measures.VocabularyAnalyser import VocabularyAnalyser
3
  from scipy.special import softmax
4
+ from collections import Counter, defaultdict
5
  import numpy as np
6
  import weakref
7
  import re
 
16
  import transformers
17
  from transformers import BertTokenizer, BertForSequenceClassification
18
  from transformers.utils import logging
19
+ from pathlib import Path
20
 
21
  transformers.logging.set_verbosity_debug()
22
 
 
46
  self.aggregate_unit_measure = endtime
47
  self.num_math_terms = None
48
  self.math_terms = None
49
+ self.vocabulary_terms = None
50
 
51
  # moments
52
  self.uptake = None
 
75
  'focusingQuestion': self.focusing_question,
76
  'numMathTerms': self.num_math_terms,
77
  'mathTerms': self.math_terms,
78
+ 'vocabularyTerms': self.vocabulary_terms,
79
  **self.props
80
  }
81
 
 
86
  'uid': self.uid,
87
  'role': self.role,
88
  'timestamp': self.timestamp,
89
+ 'moments': {
90
+ 'reasoning': True if self.reasoning else False,
91
+ 'questioning': True if self.question else False,
92
+ 'uptake': True if self.uptake else False,
93
+ 'focusingQuestion': True if self.focusing_question else False,
94
+ 'mathWord': bool(self.vocabulary_terms)
95
+ },
96
  'unitMeasure': self.unit_measure,
97
  'aggregateUnitMeasure': self.aggregate_unit_measure,
98
  'wordCount': self.word_count,
99
  'numMathTerms': self.num_math_terms,
100
  'mathTerms': self.math_terms,
101
+ 'vocabularyTerms': self.vocabulary_terms
102
  }
103
 
104
  def __repr__(self):
 
162
  avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
163
  return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
164
 
165
+ def get_vocabulary_table(self):
166
+ """
167
+ Build a summary of matched vocabulary terms by utterance role.
168
+ Returns a dict of the form:
169
+ {
170
+ 'add': {
171
+ 'teacher': {'count': 2, 'utterances': [1,5]},
172
+ 'student': {'count': 1, 'utterances': [2]}
173
+ },
174
+ 'divide': {
175
+ 'teacher': {...},
176
+ 'student': {...}
177
+ }
178
+ }
179
+ """
180
+ vocab_summary = defaultdict(lambda: {"teacher": {"count": 0, "utterances": []},
181
+ "student": {"count": 0, "utterances": []}})
182
+
183
+ for utt in self.utterances:
184
+ role = utt.role if utt.role in ("teacher", "student") else "student" # default
185
+ if utt.vocabulary_terms:
186
+ for term in utt.vocabulary_terms:
187
+ vocab_summary[term][role]["count"] += 1
188
+ vocab_summary[term][role]["utterances"].append(utt.uid)
189
+
190
+ return vocab_summary
191
+
192
+
193
  def get_word_clouds(self):
194
  # Initialize dictionaries
195
  teacher_dict = Counter()
 
530
  self.device, self.tokenizer, self.input_builder)
531
  focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
532
  del focusing_question_model
533
+
534
+ glossary_path = Path(__file__).resolve().parent / "glossary.csv"
535
+ vocabulary_analyser = VocabularyAnalyser(str(glossary_path))
536
+ vocabulary_analyser.run_analysis(transcript)
537
+ del vocabulary_analyser
538
 
539
  transcript.update_utterance_roles(uptake_speaker)
540
  sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
 
550
  student_cloud = student_math_cloud + student_general_cloud
551
  return_dict['teacherTopWords'] = teacher_cloud
552
  return_dict['studentTopWords'] = student_cloud
553
+ return_dict['vocabularyTable'] = transcript.get_vocabulary_table()
554
 
555
  return return_dict
measures/VocabularyAnalyser.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import unicodedata
4
+ import weakref
5
+
6
# --- helper normalization ---
def norm_txt(x: str) -> str:
    """Normalize text for glossary matching.

    NFKC-normalizes, lower-cases, maps hyphen/dash variants to spaces, and
    collapses runs of whitespace to a single space. Returns "" for None.
    """
    if x is None:
        return ""
    x = unicodedata.normalize("NFKC", str(x))
    x = x.lower()
    # Replace dashes BEFORE collapsing whitespace; the reverse order left
    # runs of spaces (e.g. "a - b" -> "a   b") that defeat exact
    # multi-word form matching against single-spaced glossary forms.
    x = re.sub(r"[-‐–—]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x


class VocabularyAnalyser:
    """Matches glossary vocabulary terms inside transcript utterances.

    The glossary CSV is headerless; each row lists a base term followed by
    its variant forms, comma-separated (e.g. "add, added, adding").
    """

    def __init__(self, glossary_file: str):
        # header=None: the glossary is pure data — the default header=0
        # silently consumed the first glossary row as column names, so its
        # terms were never matched. keep_default_na=False prevents empty
        # cells from becoming float NaN.
        raw = pd.read_csv(glossary_file, header=None, dtype=str,
                          keep_default_na=False)
        # Unquoted rows like "add, added, adding" are split across columns
        # by read_csv; re-join all cells so every variant form survives.
        # (A quoted single-cell row passes through unchanged.)
        rows = raw.apply(lambda r: ",".join(c for c in r if c), axis=1)
        terms_col = rows.apply(norm_txt)

        gloss_list = []
        for idx, cell in enumerate(terms_col, start=1):
            items = [t.strip() for t in cell.split(",") if t.strip()]
            if not items:
                continue
            base = items[0]  # first entry is the canonical/base form
            for form in items:
                gloss_list.append({
                    "row": idx,
                    "base": base,
                    "form": form,
                    "len": len(form),
                    "words": len(form.split())
                })

        if gloss_list:
            self.gloss_forms = (
                pd.DataFrame(gloss_list)
                .drop_duplicates(["base", "form"])
                # Most-specific forms first (more tokens, then longer) so
                # greedy matching prefers them over shorter substrings.
                .sort_values(["words", "len"], ascending=[False, False])
            )
        else:
            # Empty glossary: a column-less DataFrame would make
            # drop_duplicates/column access raise; keep a typed empty frame.
            self.gloss_forms = pd.DataFrame(
                columns=["row", "base", "form", "len", "words"])

    def match_one_utterance(self, text: str):
        """Return the sorted, de-duplicated list of matched base terms.

        Matching is whole-word (regex \\b boundaries) on normalized text;
        overlapping matches are resolved greedily in favour of forms with
        more tokens, then longer spans, then earlier starts.
        """
        s = norm_txt(text)
        if not s:
            return []

        locs = []
        for fm, bs, wd in zip(self.gloss_forms["form"],
                              self.gloss_forms["base"],
                              self.gloss_forms["words"]):
            if not fm:
                continue
            pattern = r"\b" + re.escape(fm) + r"\b"
            for m in re.finditer(pattern, s):
                locs.append({
                    "start": m.start(),
                    "end": m.end(),
                    "form": fm,
                    "base": bs,
                    "words": wd,
                    "len": m.end() - m.start()
                })

        if not locs:
            return []

        # prioritize: more tokens > longer span > earlier start
        locs_df = pd.DataFrame(locs).sort_values(
            ["words", "len", "start"], ascending=[False, False, True]
        )

        # Greedy non-overlapping selection over character positions.
        used = [False] * len(s)
        keep_bases = []
        for _, row in locs_df.iterrows():
            rng = range(row["start"], row["end"])
            if not any(used[i] for i in rng):
                keep_bases.append(row["base"])
                for i in rng:
                    used[i] = True

        return sorted(set(keep_bases))

    def run_analysis(self, transcript):
        """Mutate transcript utterances by adding a vocabulary_terms list.

        Returns the same transcript object for convenience.
        """
        for utt in transcript.utterances:
            utt.vocabulary_terms = self.match_one_utterance(utt.text)
        return transcript
measures/__init__.py ADDED
File without changes
requirements.txt CHANGED
@@ -7,3 +7,4 @@ torchvision==0.19.0
7
  transformers==4.46.1
8
  nltk==3.9.1
9
  inflect==7.5.0
 
 
7
  transformers==4.46.1
8
  nltk==3.9.1
9
  inflect==7.5.0
10
+ pandas==2.2.2