extend response with matches

Browse files

Files changed (7) hide show

__pycache__/handler.cpython-39.pyc +0 -0
__pycache__/utils.cpython-39.pyc +0 -0
measures/VocabularyAnalyser.py +27 -7
measures/__pycache__/VocabularyAnalyser.cpython-310.pyc +0 -0
measures/__pycache__/__init__.cpython-310.pyc +0 -0
tests/__pycache__/test_vocabulary_analyser.cpython-310-pytest-8.2.2.pyc +0 -0
tests/test_vocabulary_analyser.py +48 -11

__pycache__/handler.cpython-39.pyc DELETED Viewed

Binary file (8.82 kB)

__pycache__/utils.cpython-39.pyc DELETED Viewed

Binary file (6.53 kB)

measures/VocabularyAnalyser.py CHANGED Viewed

@@ -47,11 +47,11 @@ class VocabularyAnalyser:
             .sort_values(["words", "len"], ascending=[False, False])
         )
-    def match_one_utterance(self, text: str):
-        """Return list of matched base terms for a given utterance text."""
         s = norm_txt(text)
         if not s:
-            return []
         locs = []
         for fm, bs, wd in zip(self.gloss_forms["form"],
@@ -72,7 +72,7 @@ class VocabularyAnalyser:
                 })
         if not locs:
-            return []
         # prioritize: more tokens > longer span > earlier start
         locs_df = pd.DataFrame(locs).sort_values(
@@ -81,18 +81,38 @@ class VocabularyAnalyser:
         used = [False] * len(s)
         keep_bases = []
         for _, row in locs_df.iterrows():
             rng = range(row["start"], row["end"])
             if not any(used[i] for i in rng):
                 keep_bases.append(row["base"])
                 for i in rng:
                     used[i] = True
-        return sorted(set(keep_bases))
     def run_analysis(self, transcript):
         """Mutate transcript utterances by adding vocabulary_terms list."""
         for utt in transcript.utterances:
-            matches = self.match_one_utterance(utt.text)
-            utt.vocabulary_terms = matches
         return transcript

             .sort_values(["words", "len"], ascending=[False, False])
         )
+    def _collect_matches(self, text: str):
+        """Return (bases, matches_by_base) for a given utterance text."""
         s = norm_txt(text)
         if not s:
+            return [], {}
         locs = []
         for fm, bs, wd in zip(self.gloss_forms["form"],
                 })
         if not locs:
+            return [], {}
         # prioritize: more tokens > longer span > earlier start
         locs_df = pd.DataFrame(locs).sort_values(
         used = [False] * len(s)
         keep_bases = []
+        keep_rows = []
         for _, row in locs_df.iterrows():
             rng = range(row["start"], row["end"])
             if not any(used[i] for i in rng):
                 keep_bases.append(row["base"])
+                keep_rows.append(row)
                 for i in rng:
                     used[i] = True
+        matches = {}
+        for row in keep_rows:
+            entry = {
+                "form": row["form"],
+                "start": int(row["start"]),
+                "end": int(row["end"]),
+            }
+            matches.setdefault(row["base"], []).append(entry)
+        for base in matches:
+            matches[base].sort(key=lambda item: item["start"])
+        return sorted(set(keep_bases)), matches
+    def match_one_utterance(self, text: str):
+        """Return list of matched base terms for a given utterance text.
+
+        Thin wrapper over ``_collect_matches``: that helper returns a
+        ``(bases, matches_by_base)`` pair and only the first element is
+        passed on, so callers that just need the term names keep getting
+        a plain list.
+        """
+        # The per-base match details are deliberately discarded here.
+        bases, _ = self._collect_matches(text)
+        return bases
     def run_analysis(self, transcript):
+        """Mutate transcript utterances in place and return the transcript.
+
+        For every utterance in ``transcript.utterances`` two attributes
+        are set from ``_collect_matches(utt.text)``:
+
+        * ``vocabulary_terms`` -- the list of matched base terms.
+        * ``vocabulary_matches`` -- dict mapping each base term to a list
+          of matches, each a dict with ``form``, ``start`` and ``end``
+          keys, ordered by ``start``.
+        """
         for utt in transcript.utterances:
+            bases, match_map = self._collect_matches(utt.text)
+            utt.vocabulary_terms = bases
+            utt.vocabulary_matches = match_map
         return transcript

measures/__pycache__/VocabularyAnalyser.cpython-310.pyc DELETED Viewed

Binary file (2.72 kB)

measures/__pycache__/__init__.cpython-310.pyc DELETED Viewed

Binary file (161 Bytes)

tests/__pycache__/test_vocabulary_analyser.cpython-310-pytest-8.2.2.pyc DELETED Viewed

Binary file (4.71 kB)

tests/test_vocabulary_analyser.py CHANGED Viewed

@@ -28,6 +28,19 @@ def glossary_file(tmp_path):
     return str(path)
 @pytest.fixture
 def analyser(glossary_file):
     return VocabularyAnalyser(glossary_file)
@@ -53,17 +66,7 @@ def test_match_handles_overlapping_and_distinct_terms(analyser):
     ]
-def test_run_analysis_adds_vocabulary_terms(analyser):
-    class DummyUtterance:
-        def __init__(self, speaker, text):
-            self.speaker = speaker
-            self.text = text
-            self.vocabulary_terms = None
-    class DummyTranscript:
-        def __init__(self, utterances):
-            self.utterances = utterances
     transcript = DummyTranscript(
         [
             DummyUtterance("Teacher", "We add addends in this acute triangle."),
@@ -78,3 +81,37 @@ def test_run_analysis_adds_vocabulary_terms(analyser):
     assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
     assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
     assert transcript.utterances[2].vocabulary_terms == []

     return str(path)
+class DummyUtterance:
+    """Minimal stand-in for a transcript utterance, shared by the tests."""
+
+    def __init__(self, speaker, text):
+        self.speaker = speaker
+        self.text = text
+        # Both start as None; the tests check the values present after
+        # VocabularyAnalyser.run_analysis has processed the utterance.
+        self.vocabulary_terms = None
+        self.vocabulary_matches = None
+class DummyTranscript:
+    """Minimal stand-in for a transcript: just holds ``utterances``."""
+
+    def __init__(self, utterances):
+        # run_analysis iterates over this attribute.
+        self.utterances = utterances
 @pytest.fixture
 def analyser(glossary_file):
     return VocabularyAnalyser(glossary_file)
     ]
+def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
     transcript = DummyTranscript(
         [
             DummyUtterance("Teacher", "We add addends in this acute triangle."),
     assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
     assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
     assert transcript.utterances[2].vocabulary_terms == []
+    assert transcript.utterances[0].vocabulary_matches == {
+        "acute triangle": [
+            {"form": "acute triangle", "start": 23, "end": 37},
+        ],
+        "add": [
+            {"form": "add", "start": 3, "end": 6},
+        ],
+        "addend": [
+            {"form": "addends", "start": 7, "end": 14},
+        ],
+    }
+    assert transcript.utterances[1].vocabulary_matches == {
+        "acute angle": [
+            {"form": "acute angles", "start": 0, "end": 12},
+        ]
+    }
+    assert transcript.utterances[2].vocabulary_matches == {}
+def test_vocabulary_matches_capture_multiple_occurrences(analyser):
+    """Repeated hits on one base term are all kept, ordered by start."""
+    transcript = DummyTranscript([
+        DummyUtterance("Teacher", "Add adds add."),
+    ])
+    analyser.run_analysis(transcript)
+    matches = transcript.utterances[0].vocabulary_matches
+    # Three occurrences collapse to a single entry in the term list ...
+    assert transcript.utterances[0].vocabulary_terms == ["add"]
+    # ... while each occurrence keeps its own entry, with the matched
+    # form reported separately from the base ("adds" -> base "add").
+    # NOTE(review): the capitalised "Add" is expected back as "add";
+    # presumably norm_txt lower-cases the text -- confirm.
+    assert matches["add"] == [
+        {"form": "add", "start": 0, "end": 3},
+        {"form": "adds", "start": 4, "end": 8},
+        {"form": "add", "start": 9, "end": 12},
+    ]