ikarasz committed on
Commit
43d54f4
·
1 Parent(s): 272cb8d

extend response with matches

Browse files
__pycache__/handler.cpython-39.pyc DELETED
Binary file (8.82 kB)
 
__pycache__/utils.cpython-39.pyc DELETED
Binary file (6.53 kB)
 
measures/VocabularyAnalyser.py CHANGED
@@ -47,11 +47,11 @@ class VocabularyAnalyser:
47
  .sort_values(["words", "len"], ascending=[False, False])
48
  )
49
 
50
- def match_one_utterance(self, text: str):
51
- """Return list of matched base terms for a given utterance text."""
52
  s = norm_txt(text)
53
  if not s:
54
- return []
55
 
56
  locs = []
57
  for fm, bs, wd in zip(self.gloss_forms["form"],
@@ -72,7 +72,7 @@ class VocabularyAnalyser:
72
  })
73
 
74
  if not locs:
75
- return []
76
 
77
  # prioritize: more tokens > longer span > earlier start
78
  locs_df = pd.DataFrame(locs).sort_values(
@@ -81,18 +81,38 @@ class VocabularyAnalyser:
81
 
82
  used = [False] * len(s)
83
  keep_bases = []
 
84
  for _, row in locs_df.iterrows():
85
  rng = range(row["start"], row["end"])
86
  if not any(used[i] for i in rng):
87
  keep_bases.append(row["base"])
 
88
  for i in rng:
89
  used[i] = True
90
 
91
- return sorted(set(keep_bases))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  def run_analysis(self, transcript):
94
  """Mutate transcript utterances by adding vocabulary_terms list."""
95
  for utt in transcript.utterances:
96
- matches = self.match_one_utterance(utt.text)
97
- utt.vocabulary_terms = matches
 
98
  return transcript
 
47
  .sort_values(["words", "len"], ascending=[False, False])
48
  )
49
 
50
+ def _collect_matches(self, text: str):
51
+ """Return (bases, matches_by_base) for a given utterance text."""
52
  s = norm_txt(text)
53
  if not s:
54
+ return [], {}
55
 
56
  locs = []
57
  for fm, bs, wd in zip(self.gloss_forms["form"],
 
72
  })
73
 
74
  if not locs:
75
+ return [], {}
76
 
77
  # prioritize: more tokens > longer span > earlier start
78
  locs_df = pd.DataFrame(locs).sort_values(
 
81
 
82
  used = [False] * len(s)
83
  keep_bases = []
84
+ keep_rows = []
85
  for _, row in locs_df.iterrows():
86
  rng = range(row["start"], row["end"])
87
  if not any(used[i] for i in rng):
88
  keep_bases.append(row["base"])
89
+ keep_rows.append(row)
90
  for i in rng:
91
  used[i] = True
92
 
93
+ matches = {}
94
+ for row in keep_rows:
95
+ entry = {
96
+ "form": row["form"],
97
+ "start": int(row["start"]),
98
+ "end": int(row["end"]),
99
+ }
100
+ matches.setdefault(row["base"], []).append(entry)
101
+
102
+ for base in matches:
103
+ matches[base].sort(key=lambda item: item["start"])
104
+
105
+ return sorted(set(keep_bases)), matches
106
+
107
+ def match_one_utterance(self, text: str):
108
+ """Return list of matched base terms for a given utterance text."""
109
+ bases, _ = self._collect_matches(text)
110
+ return bases
111
 
112
  def run_analysis(self, transcript):
113
  """Mutate transcript utterances by adding vocabulary_terms list."""
114
  for utt in transcript.utterances:
115
+ bases, match_map = self._collect_matches(utt.text)
116
+ utt.vocabulary_terms = bases
117
+ utt.vocabulary_matches = match_map
118
  return transcript
measures/__pycache__/VocabularyAnalyser.cpython-310.pyc DELETED
Binary file (2.72 kB)
 
measures/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (161 Bytes)
 
tests/__pycache__/test_vocabulary_analyser.cpython-310-pytest-8.2.2.pyc DELETED
Binary file (4.71 kB)
 
tests/test_vocabulary_analyser.py CHANGED
@@ -28,6 +28,19 @@ def glossary_file(tmp_path):
28
  return str(path)
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  @pytest.fixture
32
  def analyser(glossary_file):
33
  return VocabularyAnalyser(glossary_file)
@@ -53,17 +66,7 @@ def test_match_handles_overlapping_and_distinct_terms(analyser):
53
  ]
54
 
55
 
56
- def test_run_analysis_adds_vocabulary_terms(analyser):
57
- class DummyUtterance:
58
- def __init__(self, speaker, text):
59
- self.speaker = speaker
60
- self.text = text
61
- self.vocabulary_terms = None
62
-
63
- class DummyTranscript:
64
- def __init__(self, utterances):
65
- self.utterances = utterances
66
-
67
  transcript = DummyTranscript(
68
  [
69
  DummyUtterance("Teacher", "We add addends in this acute triangle."),
@@ -78,3 +81,37 @@ def test_run_analysis_adds_vocabulary_terms(analyser):
78
  assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
79
  assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
80
  assert transcript.utterances[2].vocabulary_terms == []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  return str(path)
29
 
30
 
31
+ class DummyUtterance:
32
+ def __init__(self, speaker, text):
33
+ self.speaker = speaker
34
+ self.text = text
35
+ self.vocabulary_terms = None
36
+ self.vocabulary_matches = None
37
+
38
+
39
+ class DummyTranscript:
40
+ def __init__(self, utterances):
41
+ self.utterances = utterances
42
+
43
+
44
  @pytest.fixture
45
  def analyser(glossary_file):
46
  return VocabularyAnalyser(glossary_file)
 
66
  ]
67
 
68
 
69
+ def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
 
 
 
 
 
 
 
 
 
 
70
  transcript = DummyTranscript(
71
  [
72
  DummyUtterance("Teacher", "We add addends in this acute triangle."),
 
81
  assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
82
  assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
83
  assert transcript.utterances[2].vocabulary_terms == []
84
+
85
+ assert transcript.utterances[0].vocabulary_matches == {
86
+ "acute triangle": [
87
+ {"form": "acute triangle", "start": 23, "end": 37},
88
+ ],
89
+ "add": [
90
+ {"form": "add", "start": 3, "end": 6},
91
+ ],
92
+ "addend": [
93
+ {"form": "addends", "start": 7, "end": 14},
94
+ ],
95
+ }
96
+ assert transcript.utterances[1].vocabulary_matches == {
97
+ "acute angle": [
98
+ {"form": "acute angles", "start": 0, "end": 12},
99
+ ]
100
+ }
101
+ assert transcript.utterances[2].vocabulary_matches == {}
102
+
103
+
104
+ def test_vocabulary_matches_capture_multiple_occurrences(analyser):
105
+ transcript = DummyTranscript([
106
+ DummyUtterance("Teacher", "Add adds add."),
107
+ ])
108
+
109
+ analyser.run_analysis(transcript)
110
+
111
+ matches = transcript.utterances[0].vocabulary_matches
112
+ assert transcript.utterances[0].vocabulary_terms == ["add"]
113
+ assert matches["add"] == [
114
+ {"form": "add", "start": 0, "end": 3},
115
+ {"form": "adds", "start": 4, "end": 8},
116
+ {"form": "add", "start": 9, "end": 12},
117
+ ]