list matched phrases

#9
by ikarasz - opened
__pycache__/handler.cpython-39.pyc DELETED
Binary file (8.82 kB)
 
__pycache__/utils.cpython-39.pyc DELETED
Binary file (6.53 kB)
 
handler.py CHANGED
@@ -76,6 +76,7 @@ class Utterance:
76
  'numMathTerms': self.num_math_terms,
77
  'mathTerms': self.math_terms,
78
  'vocabularyTerms': self.vocabulary_terms,
 
79
  **self.props
80
  }
81
 
@@ -98,7 +99,8 @@ class Utterance:
98
  'wordCount': self.word_count,
99
  'numMathTerms': self.num_math_terms,
100
  'mathTerms': self.math_terms,
101
- 'vocabularyTerms': self.vocabulary_terms
 
102
  }
103
 
104
  def __repr__(self):
 
76
  'numMathTerms': self.num_math_terms,
77
  'mathTerms': self.math_terms,
78
  'vocabularyTerms': self.vocabulary_terms,
79
+ 'vocabularyMatches': self.vocabulary_matches,
80
  **self.props
81
  }
82
 
 
99
  'wordCount': self.word_count,
100
  'numMathTerms': self.num_math_terms,
101
  'mathTerms': self.math_terms,
102
+ 'vocabularyTerms': self.vocabulary_terms,
103
+ 'vocabularyMatches': self.vocabulary_matches
104
  }
105
 
106
  def __repr__(self):
measures/VocabularyAnalyser.py CHANGED
@@ -14,8 +14,8 @@ def norm_txt(x: str) -> str:
14
 
15
  class VocabularyAnalyser:
16
  def __init__(self, glossary_file: str):
17
- # Load glossary CSV (first column = base + variants, comma-separated)
18
- raw = pd.read_csv(glossary_file)
19
 
20
  gloss_list = []
21
  for idx, row in raw.iterrows():
@@ -47,11 +47,11 @@ class VocabularyAnalyser:
47
  .sort_values(["words", "len"], ascending=[False, False])
48
  )
49
 
50
- def match_one_utterance(self, text: str):
51
- """Return list of matched base terms for a given utterance text."""
52
  s = norm_txt(text)
53
  if not s:
54
- return []
55
 
56
  locs = []
57
  for fm, bs, wd in zip(self.gloss_forms["form"],
@@ -72,7 +72,7 @@ class VocabularyAnalyser:
72
  })
73
 
74
  if not locs:
75
- return []
76
 
77
  # prioritize: more tokens > longer span > earlier start
78
  locs_df = pd.DataFrame(locs).sort_values(
@@ -81,18 +81,38 @@ class VocabularyAnalyser:
81
 
82
  used = [False] * len(s)
83
  keep_bases = []
 
84
  for _, row in locs_df.iterrows():
85
  rng = range(row["start"], row["end"])
86
  if not any(used[i] for i in rng):
87
  keep_bases.append(row["base"])
 
88
  for i in rng:
89
  used[i] = True
90
 
91
- return sorted(set(keep_bases))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  def run_analysis(self, transcript):
94
  """Mutate transcript utterances by adding vocabulary_terms list."""
95
  for utt in transcript.utterances:
96
- matches = self.match_one_utterance(utt.text)
97
- utt.vocabulary_terms = matches
 
98
  return transcript
 
14
 
15
  class VocabularyAnalyser:
16
  def __init__(self, glossary_file: str):
17
+ # Load glossary CSV (no header, each row base + variants, comma-separated)
18
+ raw = pd.read_csv(glossary_file, header=None)
19
 
20
  gloss_list = []
21
  for idx, row in raw.iterrows():
 
47
  .sort_values(["words", "len"], ascending=[False, False])
48
  )
49
 
50
+ def _collect_matches(self, text: str):
51
+ """Return (bases, matches_by_base) for a given utterance text."""
52
  s = norm_txt(text)
53
  if not s:
54
+ return [], {}
55
 
56
  locs = []
57
  for fm, bs, wd in zip(self.gloss_forms["form"],
 
72
  })
73
 
74
  if not locs:
75
+ return [], {}
76
 
77
  # prioritize: more tokens > longer span > earlier start
78
  locs_df = pd.DataFrame(locs).sort_values(
 
81
 
82
  used = [False] * len(s)
83
  keep_bases = []
84
+ keep_rows = []
85
  for _, row in locs_df.iterrows():
86
  rng = range(row["start"], row["end"])
87
  if not any(used[i] for i in rng):
88
  keep_bases.append(row["base"])
89
+ keep_rows.append(row)
90
  for i in rng:
91
  used[i] = True
92
 
93
+ matches = {}
94
+ for row in keep_rows:
95
+ entry = {
96
+ "form": row["form"],
97
+ "start": int(row["start"]),
98
+ "end": int(row["end"]),
99
+ }
100
+ matches.setdefault(row["base"], []).append(entry)
101
+
102
+ for base in matches:
103
+ matches[base].sort(key=lambda item: item["start"])
104
+
105
+ return sorted(set(keep_bases)), matches
106
+
107
+ def match_one_utterance(self, text: str):
108
+ """Return list of matched base terms for a given utterance text."""
109
+ bases, _ = self._collect_matches(text)
110
+ return bases
111
 
112
  def run_analysis(self, transcript):
113
  """Mutate transcript utterances by adding vocabulary_terms (sorted base terms) and vocabulary_matches (per-base match spans)."""
114
  for utt in transcript.utterances:
115
+ bases, match_map = self._collect_matches(utt.text)
116
+ utt.vocabulary_terms = bases
117
+ utt.vocabulary_matches = match_map
118
  return transcript
requirements.txt CHANGED
@@ -8,3 +8,4 @@ transformers==4.46.1
8
  nltk==3.9.1
9
  inflect==7.5.0
10
  pandas==2.2.2
 
 
8
  nltk==3.9.1
9
  inflect==7.5.0
10
  pandas==2.2.2
11
+ pytest==8.2.2
tests/test_vocabulary_analyser.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import textwrap
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+
7
+ PROJECT_ROOT = Path(__file__).resolve().parents[1]
8
+ if str(PROJECT_ROOT) not in sys.path:
9
+ sys.path.insert(0, str(PROJECT_ROOT))
10
+
11
+ from measures.VocabularyAnalyser import VocabularyAnalyser
12
+
13
+
14
+ @pytest.fixture
15
+ def glossary_file(tmp_path):
16
+ """Create a small glossary CSV for testing."""
17
+ csv_content = textwrap.dedent(
18
+ """\
19
+ acute,,,
20
+ acute angle, acute angles,,
21
+ acute triangle, acute triangles,,
22
+ add, added, adding, adds
23
+ addend, addends,,
24
+ """
25
+ )
26
+ path = tmp_path / "glossary.csv"
27
+ path.write_text(csv_content, encoding="utf-8")
28
+ return str(path)
29
+
30
+
31
+ class DummyUtterance:
32
+ def __init__(self, speaker, text):
33
+ self.speaker = speaker
34
+ self.text = text
35
+ self.vocabulary_terms = None
36
+ self.vocabulary_matches = None
37
+
38
+
39
+ class DummyTranscript:
40
+ def __init__(self, utterances):
41
+ self.utterances = utterances
42
+
43
+
44
+ @pytest.fixture
45
+ def analyser(glossary_file):
46
+ return VocabularyAnalyser(glossary_file)
47
+
48
+
49
+ def test_match_counts_base_once(analyser):
50
+ text = "Add add ADD adding added adds"
51
+ assert analyser.match_one_utterance(text) == ["add"]
52
+
53
+
54
+ def test_match_prefers_longest_phrase(analyser):
55
+ text = "An acute angle appears in this proof."
56
+ assert analyser.match_one_utterance(text) == ["acute angle"]
57
+
58
+
59
+ def test_match_handles_overlapping_and_distinct_terms(analyser):
60
+ text = (
61
+ "The class studied the properties of an acute triangle, then discussed an acute situation."
62
+ )
63
+ assert analyser.match_one_utterance(text) == [
64
+ "acute",
65
+ "acute triangle",
66
+ ]
67
+
68
+
69
+ def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
70
+ transcript = DummyTranscript(
71
+ [
72
+ DummyUtterance("Teacher", "We add addends in this acute triangle."),
73
+ DummyUtterance("Student", "Acute angles contrast with obtuse ones."),
74
+ DummyUtterance("Teacher", "No glossary matches"),
75
+ ]
76
+ )
77
+
78
+ result = analyser.run_analysis(transcript)
79
+
80
+ assert result is transcript
81
+ assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
82
+ assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
83
+ assert transcript.utterances[2].vocabulary_terms == []
84
+
85
+ assert transcript.utterances[0].vocabulary_matches == {
86
+ "acute triangle": [
87
+ {"form": "acute triangle", "start": 23, "end": 37},
88
+ ],
89
+ "add": [
90
+ {"form": "add", "start": 3, "end": 6},
91
+ ],
92
+ "addend": [
93
+ {"form": "addends", "start": 7, "end": 14},
94
+ ],
95
+ }
96
+ assert transcript.utterances[1].vocabulary_matches == {
97
+ "acute angle": [
98
+ {"form": "acute angles", "start": 0, "end": 12},
99
+ ]
100
+ }
101
+ assert transcript.utterances[2].vocabulary_matches == {}
102
+
103
+
104
+ def test_vocabulary_matches_capture_multiple_occurrences(analyser):
105
+ transcript = DummyTranscript([
106
+ DummyUtterance("Teacher", "Add adds add."),
107
+ ])
108
+
109
+ analyser.run_analysis(transcript)
110
+
111
+ matches = transcript.utterances[0].vocabulary_matches
112
+ assert transcript.utterances[0].vocabulary_terms == ["add"]
113
+ assert matches["add"] == [
114
+ {"form": "add", "start": 0, "end": 3},
115
+ {"form": "adds", "start": 4, "end": 8},
116
+ {"form": "add", "start": 9, "end": 12},
117
+ ]