list matched phrases

by ikarasz - opened Jan 17

base: refs/heads/main

←

from: refs/pr/9

Discussion Files changed

+150

-10

Files changed (6) hide show

__pycache__/handler.cpython-39.pyc +0 -0
__pycache__/utils.cpython-39.pyc +0 -0
handler.py +3 -1
measures/VocabularyAnalyser.py +29 -9
requirements.txt +1 -0
tests/test_vocabulary_analyser.py +117 -0

__pycache__/handler.cpython-39.pyc DELETED Viewed

Binary file (8.82 kB)

__pycache__/utils.cpython-39.pyc DELETED Viewed

Binary file (6.53 kB)

handler.py CHANGED Viewed

@@ -76,6 +76,7 @@ class Utterance:
             'numMathTerms': self.num_math_terms,
             'mathTerms': self.math_terms,
             'vocabularyTerms': self.vocabulary_terms,
             **self.props
         }
@@ -98,7 +99,8 @@ class Utterance:
             'wordCount': self.word_count,
             'numMathTerms': self.num_math_terms,
             'mathTerms': self.math_terms,
-            'vocabularyTerms': self.vocabulary_terms
         }
     def __repr__(self):

             'numMathTerms': self.num_math_terms,
             'mathTerms': self.math_terms,
             'vocabularyTerms': self.vocabulary_terms,
+            'vocabularyMatches': self.vocabulary_matches,
             **self.props
         }
             'wordCount': self.word_count,
             'numMathTerms': self.num_math_terms,
             'mathTerms': self.math_terms,
+            'vocabularyTerms': self.vocabulary_terms,
+            'vocabularyMatches': self.vocabulary_matches
         }
     def __repr__(self):

measures/VocabularyAnalyser.py CHANGED Viewed

@@ -14,8 +14,8 @@ def norm_txt(x: str) -> str:
 class VocabularyAnalyser:
     def __init__(self, glossary_file: str):
-        # Load glossary CSV (first column = base + variants, comma-separated)
-        raw = pd.read_csv(glossary_file)
         gloss_list = []
         for idx, row in raw.iterrows():
@@ -47,11 +47,11 @@ class VocabularyAnalyser:
             .sort_values(["words", "len"], ascending=[False, False])
         )
-    def match_one_utterance(self, text: str):
-        """Return list of matched base terms for a given utterance text."""
         s = norm_txt(text)
         if not s:
-            return []
         locs = []
         for fm, bs, wd in zip(self.gloss_forms["form"],
@@ -72,7 +72,7 @@ class VocabularyAnalyser:
                 })
         if not locs:
-            return []
         # prioritize: more tokens > longer span > earlier start
         locs_df = pd.DataFrame(locs).sort_values(
@@ -81,18 +81,38 @@ class VocabularyAnalyser:
         used = [False] * len(s)
         keep_bases = []
         for _, row in locs_df.iterrows():
             rng = range(row["start"], row["end"])
             if not any(used[i] for i in rng):
                 keep_bases.append(row["base"])
                 for i in rng:
                     used[i] = True
-        return sorted(set(keep_bases))
     def run_analysis(self, transcript):
         """Mutate transcript utterances by adding vocabulary_terms list."""
         for utt in transcript.utterances:
-            matches = self.match_one_utterance(utt.text)
-            utt.vocabulary_terms = matches
         return transcript

 class VocabularyAnalyser:
     def __init__(self, glossary_file: str):
+        # Load glossary CSV (no header, each row base + variants, comma-separated)
+        raw = pd.read_csv(glossary_file, header=None)
         gloss_list = []
         for idx, row in raw.iterrows():
             .sort_values(["words", "len"], ascending=[False, False])
         )
+    def _collect_matches(self, text: str):
+        """Return (bases, matches_by_base) for a given utterance text."""
         s = norm_txt(text)
         if not s:
+            return [], {}
         locs = []
         for fm, bs, wd in zip(self.gloss_forms["form"],
                 })
         if not locs:
+            return [], {}
         # prioritize: more tokens > longer span > earlier start
         locs_df = pd.DataFrame(locs).sort_values(
         used = [False] * len(s)
         keep_bases = []
+        keep_rows = []
         for _, row in locs_df.iterrows():
             rng = range(row["start"], row["end"])
             if not any(used[i] for i in rng):
                 keep_bases.append(row["base"])
+                keep_rows.append(row)
                 for i in rng:
                     used[i] = True
+        matches = {}
+        for row in keep_rows:
+            entry = {
+                "form": row["form"],
+                "start": int(row["start"]),
+                "end": int(row["end"]),
+            }
+            matches.setdefault(row["base"], []).append(entry)
+        for base in matches:
+            matches[base].sort(key=lambda item: item["start"])
+        return sorted(set(keep_bases)), matches
+    def match_one_utterance(self, text: str):
+        """Return list of matched base terms for a given utterance text."""
+        bases, _ = self._collect_matches(text)
+        return bases
     def run_analysis(self, transcript):
         """Mutate transcript utterances by adding vocabulary_terms list."""
         for utt in transcript.utterances:
+            bases, match_map = self._collect_matches(utt.text)
+            utt.vocabulary_terms = bases
+            utt.vocabulary_matches = match_map
         return transcript

requirements.txt CHANGED Viewed

@@ -8,3 +8,4 @@ transformers==4.46.1
 nltk==3.9.1
 inflect==7.5.0
 pandas==2.2.2

 nltk==3.9.1
 inflect==7.5.0
 pandas==2.2.2
+pytest==8.2.2

tests/test_vocabulary_analyser.py ADDED Viewed

@@ -0,0 +1,117 @@

+import textwrap
+import sys
+from pathlib import Path
+import pytest
+PROJECT_ROOT = Path(__file__).resolve().parents[1]
+if str(PROJECT_ROOT) not in sys.path:
+    sys.path.insert(0, str(PROJECT_ROOT))
+from measures.VocabularyAnalyser import VocabularyAnalyser
+@pytest.fixture
+def glossary_file(tmp_path):
+    """Create a small glossary CSV for testing."""
+    csv_content = textwrap.dedent(
+        """\
+        acute,,,
+        acute angle, acute angles,,
+        acute triangle, acute triangles,,
+        add, added, adding, adds
+        addend, addends,,
+        """
+    )
+    path = tmp_path / "glossary.csv"
+    path.write_text(csv_content, encoding="utf-8")
+    return str(path)
+class DummyUtterance:
+    def __init__(self, speaker, text):
+        self.speaker = speaker
+        self.text = text
+        self.vocabulary_terms = None
+        self.vocabulary_matches = None
+class DummyTranscript:
+    def __init__(self, utterances):
+        self.utterances = utterances
+@pytest.fixture
+def analyser(glossary_file):
+    return VocabularyAnalyser(glossary_file)
+def test_match_counts_base_once(analyser):
+    text = "Add add ADD adding added adds"
+    assert analyser.match_one_utterance(text) == ["add"]
+def test_match_prefers_longest_phrase(analyser):
+    text = "An acute angle appears in this proof."
+    assert analyser.match_one_utterance(text) == ["acute angle"]
+def test_match_handles_overlapping_and_distinct_terms(analyser):
+    text = (
+        "The class studied the properties of an acute triangle, then discussed an acute situation."
+    )
+    assert analyser.match_one_utterance(text) == [
+        "acute",
+        "acute triangle",
+    ]
+def test_run_analysis_adds_vocabulary_terms_and_matches(analyser):
+    transcript = DummyTranscript(
+        [
+            DummyUtterance("Teacher", "We add addends in this acute triangle."),
+            DummyUtterance("Student", "Acute angles contrast with obtuse ones."),
+            DummyUtterance("Teacher", "No glossary matches"),
+        ]
+    )
+    result = analyser.run_analysis(transcript)
+    assert result is transcript
+    assert transcript.utterances[0].vocabulary_terms == ["acute triangle", "add", "addend"]
+    assert transcript.utterances[1].vocabulary_terms == ["acute angle"]
+    assert transcript.utterances[2].vocabulary_terms == []
+    assert transcript.utterances[0].vocabulary_matches == {
+        "acute triangle": [
+            {"form": "acute triangle", "start": 23, "end": 37},
+        ],
+        "add": [
+            {"form": "add", "start": 3, "end": 6},
+        ],
+        "addend": [
+            {"form": "addends", "start": 7, "end": 14},
+        ],
+    }
+    assert transcript.utterances[1].vocabulary_matches == {
+        "acute angle": [
+            {"form": "acute angles", "start": 0, "end": 12},
+        ]
+    }
+    assert transcript.utterances[2].vocabulary_matches == {}
+def test_vocabulary_matches_capture_multiple_occurrences(analyser):
+    transcript = DummyTranscript([
+        DummyUtterance("Teacher", "Add adds add."),
+    ])
+    analyser.run_analysis(transcript)
+    matches = transcript.utterances[0].vocabulary_matches
+    assert transcript.utterances[0].vocabulary_terms == ["add"]
+    assert matches["add"] == [
+        {"form": "add", "start": 0, "end": 3},
+        {"form": "adds", "start": 4, "end": 8},
+        {"form": "add", "start": 9, "end": 12},
+    ]