Vocabulary Analysis (#7)
Browse files- first version of vocabulary analyser (ad89549aaa20ad814f127865af4f3245d73712ff)
- glossary.csv +2 -0
- handler.py +47 -2
- measures/VocabularyAnalyser.py +91 -0
- measures/__init__.py +0 -0
- requirements.txt +1 -0
glossary.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
add, added, adding
|
| 2 |
+
divide, divided, dividing
|
handler.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from typing import Dict, List, Any
|
|
|
|
| 2 |
from scipy.special import softmax
|
| 3 |
-
from collections import Counter
|
| 4 |
import numpy as np
|
| 5 |
import weakref
|
| 6 |
import re
|
|
@@ -15,6 +16,7 @@ from utils import MultiHeadModel, BertInputBuilder, get_num_words, MATH_PREFIXES
|
|
| 15 |
import transformers
|
| 16 |
from transformers import BertTokenizer, BertForSequenceClassification
|
| 17 |
from transformers.utils import logging
|
|
|
|
| 18 |
|
| 19 |
transformers.logging.set_verbosity_debug()
|
| 20 |
|
|
@@ -44,6 +46,7 @@ class Utterance:
|
|
| 44 |
self.aggregate_unit_measure = endtime
|
| 45 |
self.num_math_terms = None
|
| 46 |
self.math_terms = None
|
|
|
|
| 47 |
|
| 48 |
# moments
|
| 49 |
self.uptake = None
|
|
@@ -72,6 +75,7 @@ class Utterance:
|
|
| 72 |
'focusingQuestion': self.focusing_question,
|
| 73 |
'numMathTerms': self.num_math_terms,
|
| 74 |
'mathTerms': self.math_terms,
|
|
|
|
| 75 |
**self.props
|
| 76 |
}
|
| 77 |
|
|
@@ -82,12 +86,19 @@ class Utterance:
|
|
| 82 |
'uid': self.uid,
|
| 83 |
'role': self.role,
|
| 84 |
'timestamp': self.timestamp,
|
| 85 |
-
'moments': {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
'unitMeasure': self.unit_measure,
|
| 87 |
'aggregateUnitMeasure': self.aggregate_unit_measure,
|
| 88 |
'wordCount': self.word_count,
|
| 89 |
'numMathTerms': self.num_math_terms,
|
| 90 |
'mathTerms': self.math_terms,
|
|
|
|
| 91 |
}
|
| 92 |
|
| 93 |
def __repr__(self):
|
|
@@ -151,6 +162,34 @@ class Transcript:
|
|
| 151 |
avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
|
| 152 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
def get_word_clouds(self):
|
| 155 |
# Initialize dictionaries
|
| 156 |
teacher_dict = Counter()
|
|
@@ -491,6 +530,11 @@ class EndpointHandler():
|
|
| 491 |
self.device, self.tokenizer, self.input_builder)
|
| 492 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 493 |
del focusing_question_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
|
| 495 |
transcript.update_utterance_roles(uptake_speaker)
|
| 496 |
sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
|
|
@@ -506,5 +550,6 @@ class EndpointHandler():
|
|
| 506 |
student_cloud = student_math_cloud + student_general_cloud
|
| 507 |
return_dict['teacherTopWords'] = teacher_cloud
|
| 508 |
return_dict['studentTopWords'] = student_cloud
|
|
|
|
| 509 |
|
| 510 |
return return_dict
|
|
|
|
| 1 |
from typing import Dict, List, Any
|
| 2 |
+
from measures.VocabularyAnalyser import VocabularyAnalyser
|
| 3 |
from scipy.special import softmax
|
| 4 |
+
from collections import Counter, defaultdict
|
| 5 |
import numpy as np
|
| 6 |
import weakref
|
| 7 |
import re
|
|
|
|
| 16 |
import transformers
|
| 17 |
from transformers import BertTokenizer, BertForSequenceClassification
|
| 18 |
from transformers.utils import logging
|
| 19 |
+
from pathlib import Path
|
| 20 |
|
| 21 |
transformers.logging.set_verbosity_debug()
|
| 22 |
|
|
|
|
| 46 |
self.aggregate_unit_measure = endtime
|
| 47 |
self.num_math_terms = None
|
| 48 |
self.math_terms = None
|
| 49 |
+
self.vocabulary_terms = None
|
| 50 |
|
| 51 |
# moments
|
| 52 |
self.uptake = None
|
|
|
|
| 75 |
'focusingQuestion': self.focusing_question,
|
| 76 |
'numMathTerms': self.num_math_terms,
|
| 77 |
'mathTerms': self.math_terms,
|
| 78 |
+
'vocabularyTerms': self.vocabulary_terms,
|
| 79 |
**self.props
|
| 80 |
}
|
| 81 |
|
|
|
|
| 86 |
'uid': self.uid,
|
| 87 |
'role': self.role,
|
| 88 |
'timestamp': self.timestamp,
|
| 89 |
+
'moments': {
|
| 90 |
+
'reasoning': True if self.reasoning else False,
|
| 91 |
+
'questioning': True if self.question else False,
|
| 92 |
+
'uptake': True if self.uptake else False,
|
| 93 |
+
'focusingQuestion': True if self.focusing_question else False,
|
| 94 |
+
'mathWord': bool(self.vocabulary_terms)
|
| 95 |
+
},
|
| 96 |
'unitMeasure': self.unit_measure,
|
| 97 |
'aggregateUnitMeasure': self.aggregate_unit_measure,
|
| 98 |
'wordCount': self.word_count,
|
| 99 |
'numMathTerms': self.num_math_terms,
|
| 100 |
'mathTerms': self.math_terms,
|
| 101 |
+
'vocabularyTerms': self.vocabulary_terms
|
| 102 |
}
|
| 103 |
|
| 104 |
def __repr__(self):
|
|
|
|
| 162 |
avg_student_length = student_words / student_utt_count if student_utt_count > 0 else 0
|
| 163 |
return {'teacher': teacher_percentage, 'student': student_percentage}, {'teacher': avg_teacher_length, 'student': avg_student_length}
|
| 164 |
|
| 165 |
+
def get_vocabulary_table(self):
    """
    Build a summary of matched vocabulary terms by utterance role.

    Tallies, per glossary base term, how many utterances of each role
    used it and which utterance uids did so.

    Returns a plain dict of the form:
    {
        'add': {
            'teacher': {'count': 2, 'utterances': [1, 5]},
            'student': {'count': 1, 'utterances': [2]}
        },
        'divide': {
            'teacher': {...},
            'student': {...}
        }
    }
    """
    vocab_summary = defaultdict(lambda: {"teacher": {"count": 0, "utterances": []},
                                         "student": {"count": 0, "utterances": []}})

    for utt in self.utterances:
        # NOTE(review): any role other than 'teacher'/'student' (e.g. None)
        # is bucketed under 'student' — confirm this default is intended.
        role = utt.role if utt.role in ("teacher", "student") else "student"
        if utt.vocabulary_terms:
            for term in utt.vocabulary_terms:
                vocab_summary[term][role]["count"] += 1
                vocab_summary[term][role]["utterances"].append(utt.uid)

    # Return a plain dict: a defaultdict with a lambda factory would let
    # callers silently create entries just by reading a missing term, and
    # serializes less predictably.
    return dict(vocab_summary)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
def get_word_clouds(self):
|
| 194 |
# Initialize dictionaries
|
| 195 |
teacher_dict = Counter()
|
|
|
|
| 530 |
self.device, self.tokenizer, self.input_builder)
|
| 531 |
focusing_question_model.run_inference(transcript, uptake_speaker=uptake_speaker)
|
| 532 |
del focusing_question_model
|
| 533 |
+
|
| 534 |
+
glossary_path = Path(__file__).resolve().parent / "glossary.csv"
|
| 535 |
+
vocabulary_analyser = VocabularyAnalyser(str(glossary_path))
|
| 536 |
+
vocabulary_analyser.run_analysis(transcript)
|
| 537 |
+
del vocabulary_analyser
|
| 538 |
|
| 539 |
transcript.update_utterance_roles(uptake_speaker)
|
| 540 |
sorted_math_cloud, teacher_math_cloud, student_math_cloud = run_math_density(transcript)
|
|
|
|
| 550 |
student_cloud = student_math_cloud + student_general_cloud
|
| 551 |
return_dict['teacherTopWords'] = teacher_cloud
|
| 552 |
return_dict['studentTopWords'] = student_cloud
|
| 553 |
+
return_dict['vocabularyTable'] = transcript.get_vocabulary_table()
|
| 554 |
|
| 555 |
return return_dict
|
measures/VocabularyAnalyser.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import re
|
| 3 |
+
import unicodedata
|
| 4 |
+
import weakref
|
| 5 |
+
|
| 6 |
+
# --- helper normalization ---
def norm_txt(x: str) -> str:
    """Normalize free text for glossary matching.

    NFKC-normalizes, lowercases, maps hyphen/dash characters to spaces,
    then collapses whitespace runs to single spaces and strips.
    None is treated as empty text.
    """
    if x is None:
        return ""
    x = unicodedata.normalize("NFKC", str(x))
    x = x.lower()
    # Map dashes to spaces BEFORE collapsing whitespace; the reverse
    # order leaves double spaces (e.g. "a - b" -> "a   b"), which breaks
    # \b-delimited multi-word matching.
    x = re.sub(r"[-‐–—]", " ", x)
    x = re.sub(r"\s+", " ", x).strip()
    return x


class VocabularyAnalyser:
    """Detects glossary vocabulary terms in transcript utterances.

    The glossary CSV is header-less; each row lists a base term followed
    by its variant forms, comma-separated (e.g. "add, added, adding").
    Matching is whole-word, longest-span-first and non-overlapping; each
    matched variant is reported under its base term.
    """

    def __init__(self, glossary_file: str):
        """Load the glossary and build self.gloss_forms.

        self.gloss_forms holds one row per (base, form) pair, sorted so
        that multi-word / longer forms are tried first.
        """
        # header=None: the glossary has no header row — the pandas
        # default (infer a header) silently consumed the first glossary
        # entry as column names. dtype=str + keep_default_na=False keep
        # ragged rows as "" cells instead of NaN.
        raw = pd.read_csv(glossary_file, header=None, dtype=str,
                          keep_default_na=False)

        gloss_list = []
        for idx, cells in enumerate(raw.itertuples(index=False), start=1):
            # read_csv already split the comma-separated variants into
            # cells, so each non-empty cell is one candidate form.
            items = [norm_txt(cell) for cell in cells]
            items = [t for t in items if t]
            if not items:
                continue
            base = items[0]  # first form on the row names the term
            for form in items:
                gloss_list.append({
                    "row": idx,
                    "base": base,
                    "form": form,
                    "len": len(form),
                    "words": len(form.split())
                })

        if gloss_list:
            self.gloss_forms = (
                pd.DataFrame(gloss_list)
                .drop_duplicates(["base", "form"])
                # longer / multi-word forms first so they win overlaps
                .sort_values(["words", "len"], ascending=[False, False])
            )
        else:
            # Empty glossary: keep the expected columns so matching code
            # can iterate without special-casing (DataFrame([]) has no
            # columns and drop_duplicates would raise KeyError).
            self.gloss_forms = pd.DataFrame(
                columns=["row", "base", "form", "len", "words"])

    def match_one_utterance(self, text: str):
        """Return the sorted list of matched base terms for *text*.

        Finds every whole-word occurrence of every glossary form, then
        greedily keeps a non-overlapping subset preferring more tokens,
        longer spans, and earlier starts.
        """
        s = norm_txt(text)
        if not s:
            return []

        locs = []
        for fm, bs, wd in zip(self.gloss_forms["form"],
                              self.gloss_forms["base"],
                              self.gloss_forms["words"]):
            if not fm:
                continue
            # \b-delimited so e.g. "add" does not match inside "address"
            pattern = r"\b" + re.escape(fm) + r"\b"
            for m in re.finditer(pattern, s):
                locs.append({
                    "start": m.start(),
                    "end": m.end(),
                    "form": fm,
                    "base": bs,
                    "words": wd,
                    "len": m.end() - m.start()
                })

        if not locs:
            return []

        # prioritize: more tokens > longer span > earlier start
        locs_df = pd.DataFrame(locs).sort_values(
            ["words", "len", "start"], ascending=[False, False, True]
        )

        # Greedy selection over character positions: a candidate is kept
        # only if none of its characters were claimed by a higher-priority
        # match.
        used = [False] * len(s)
        keep_bases = []
        for _, row in locs_df.iterrows():
            rng = range(row["start"], row["end"])
            if not any(used[i] for i in rng):
                keep_bases.append(row["base"])
                for i in rng:
                    used[i] = True

        return sorted(set(keep_bases))

    def run_analysis(self, transcript):
        """Annotate each utterance with vocabulary_terms (list of matched
        base terms) and return the mutated transcript."""
        for utt in transcript.utterances:
            utt.vocabulary_terms = self.match_one_utterance(utt.text)
        return transcript
|
measures/__init__.py
ADDED
|
File without changes
|
requirements.txt
CHANGED
|
@@ -7,3 +7,4 @@ torchvision==0.19.0
|
|
| 7 |
transformers==4.46.1
|
| 8 |
nltk==3.9.1
|
| 9 |
inflect==7.5.0
|
|
|
|
|
|
| 7 |
transformers==4.46.1
|
| 8 |
nltk==3.9.1
|
| 9 |
inflect==7.5.0
|
| 10 |
+
pandas==2.2.2
|