# ground-zero/src/conversation/phrase_matcher.py
# Commit 6f4d8d0 (jefffffff9): Add phrase library + fuzzy matcher for general Bambara/Fula conversation
# File size at that commit: 7.85 kB
"""
Phrase-library-based conversational response system.

Matches transcribed text against curated Bambara/Fula phrase libraries.
No ML required — the knowledge base is plain JSON files that anyone can edit.

Library files:  data/phrases/{lang}.json
User additions: data/phrases/{lang}_additions.json  (saved to HF Hub for persistence)

Each phrase entry:
    {
        "patterns": ["i ni ce", "ini ce", ...],   <- what the user might say (incl. Whisper variants)
        "response": "I ni ce. N bɛ i dɛmɛ.",       <- what the app says back
        "english": "Hello! I am here to help.",   <- English translation shown in UI
        "tags": ["greeting"]
    }
"""
from __future__ import annotations
import json
import logging
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)

# Resolve project root relative to this file:
#   src/conversation/phrase_matcher.py → ../../data/phrases/
DATA_DIR = Path(__file__).parent.parent.parent / "data" / "phrases"

# rapidfuzz is an optional dependency: when it is missing the matcher still
# works, but only through its substring fast path.
try:
    from rapidfuzz import fuzz as _fuzz, process as _process
except ImportError:
    # Keep both names bound so the module namespace has the same shape whether
    # or not rapidfuzz is installed; callers must check _HAS_RAPIDFUZZ first.
    _fuzz = _process = None
    _HAS_RAPIDFUZZ = False
    logger.warning("rapidfuzz not installed — using substring matching only (install it for better matching)")
else:
    _HAS_RAPIDFUZZ = True
class PhraseMatcher:
    """
    Fuzzy-matches a transcription against a per-language phrase library.

    Phrases are read from ``DATA_DIR/{lang}.json`` (base library) and
    ``DATA_DIR/{lang}_additions.json`` (user additions) and cached per
    language until ``reload`` is called.
    Falls back to substring matching if rapidfuzz is not installed.
    """

    def __init__(self) -> None:
        # lang code -> phrase entries (base library followed by user additions)
        self._cache: dict[str, list[dict]] = {}

    # ── Library loading ───────────────────────────────────────────────────────

    @staticmethod
    def _read_entries(path: Path) -> list[dict]:
        """
        Read one JSON phrase file and return its dict entries.

        Non-dict items in the list are dropped because every consumer calls
        ``entry.get(...)``.

        Raises:
            OSError: the file cannot be opened or read.
            ValueError: the content is not valid UTF-8 JSON, or its top-level
                value is not a list.
        """
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError(f"expected a JSON list, got {type(data).__name__}")
        return [entry for entry in data if isinstance(entry, dict)]

    def _load(self, lang: str) -> list[dict]:
        """
        Load and cache all phrases for a language (base + user additions).

        A file that is missing is skipped; a file that cannot be read or
        parsed is logged and skipped, so one bad file never prevents the other
        from being used.
        """
        if lang not in self._cache:
            phrases: list[dict] = []
            sources = (
                # Base library (git-tracked)
                (DATA_DIR / f"{lang}.json", "phrase library"),
                # User additions (written locally, uploaded to HF Hub for persistence)
                (DATA_DIR / f"{lang}_additions.json", "phrase additions"),
            )
            for path, label in sources:
                if not path.exists():
                    continue
                try:
                    phrases.extend(self._read_entries(path))
                except (OSError, ValueError) as exc:
                    logger.warning("Could not load %s for %s: %s", label, lang, exc)
            self._cache[lang] = phrases
            logger.info("PhraseMatcher: loaded %d phrases for lang=%s", len(phrases), lang)
        return self._cache[lang]

    def reload(self, lang: str) -> None:
        """Force reload from disk on next access (call after adding new phrases)."""
        self._cache.pop(lang, None)

    def reload_from_hub_data(self, lang: str, additions_json: str) -> None:
        """
        Called at startup with JSON downloaded from HF Hub.

        Writes the payload to the local additions file so ``_load()`` picks it
        up. A payload that is not a JSON list is ignored (with a warning)
        instead of overwriting the local additions with unusable data.
        """
        try:
            parsed = json.loads(additions_json)
        except (TypeError, ValueError) as exc:
            logger.warning("Ignoring invalid hub phrase additions for %s: %s", lang, exc)
            return
        if not isinstance(parsed, list):
            logger.warning(
                "Ignoring hub phrase additions for %s: expected a JSON list, got %s",
                lang,
                type(parsed).__name__,
            )
            return

        DATA_DIR.mkdir(parents=True, exist_ok=True)
        additions_path = DATA_DIR / f"{lang}_additions.json"
        with open(additions_path, "w", encoding="utf-8") as f:
            f.write(additions_json)
        self.reload(lang)

    # ── Matching ──────────────────────────────────────────────────────────────

    @staticmethod
    def _normalised_patterns(entry: dict) -> list[str]:
        """
        Return the entry's patterns lower-cased and stripped, dropping blanks.

        Blank patterns must be removed because ``"" in text`` is always true
        and would make the entry match every input. A bare string is treated
        as a single pattern rather than iterated character by character.
        """
        raw = entry.get("patterns") or []
        if isinstance(raw, str):
            raw = [raw]
        elif not isinstance(raw, (list, tuple)):
            return []
        normalised = (pat.lower().strip() for pat in raw if isinstance(pat, str))
        return [pat for pat in normalised if pat]

    def match(self, text: str, lang: str, threshold: int = 60) -> Optional[dict]:
        """
        Find the best-matching phrase entry for the transcribed text.

        Returns the entry dict (with 'response' and 'english' keys) or None.

        Strategy:
          1. Exact: a pattern equals the normalised text.
          2. Substring: a pattern is a substring of the text (or vice-versa);
             the first such pattern in library order wins.
          3. Fuzzy partial_ratio via rapidfuzz (handles Whisper spelling
             variants), accepted when the score reaches ``threshold``.
        """
        phrases = self._load(lang)
        if not phrases:
            return None

        text_norm = (text or "").lower().strip()
        if not text_norm:
            return None

        # Flat candidate list: (normalised_pattern, entry)
        candidates: list[tuple[str, dict]] = [
            (pattern, entry)
            for entry in phrases
            for pattern in self._normalised_patterns(entry)
        ]
        if not candidates:
            return None

        # 1. Exact equality first, so a full match is never shadowed by an
        #    earlier entry whose longer pattern merely contains the text.
        for pattern, entry in candidates:
            if pattern == text_norm:
                logger.debug("Exact phrase match: %r", pattern)
                return entry

        # 2. Substring check (fast path)
        for pattern, entry in candidates:
            if pattern in text_norm or text_norm in pattern:
                logger.debug("Exact phrase match: %r", pattern)
                return entry

        # 3. Fuzzy match (handles Whisper mis-transcriptions)
        if _HAS_RAPIDFUZZ:
            pattern_strings = [pattern for pattern, _ in candidates]
            result = _process.extractOne(
                text_norm,
                pattern_strings,
                scorer=_fuzz.partial_ratio,
                score_cutoff=threshold,
            )
            if result:
                # For a list of choices extractOne yields (choice, score, index).
                _, score, idx = result
                logger.debug("Fuzzy phrase match score=%d: %r", score, candidates[idx][0])
                return candidates[idx][1]
        return None

    # ── Phrase addition ───────────────────────────────────────────────────────

    def _append_entries(self, lang: str, entries: list[dict]) -> None:
        """
        Append entries to the local additions file in one read/write cycle.

        An existing additions file that cannot be parsed is replaced, but the
        problem is logged rather than swallowed silently. The new content is
        written to a sibling temp file and renamed into place so an
        interrupted write cannot leave a truncated additions file behind.
        """
        if not entries:
            return
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        additions_path = DATA_DIR / f"{lang}_additions.json"

        additions: list[dict] = []
        if additions_path.exists():
            try:
                additions = self._read_entries(additions_path)
            except (OSError, ValueError) as exc:
                logger.warning(
                    "Existing phrase additions for %s are unreadable and will be replaced: %s",
                    lang,
                    exc,
                )
        additions.extend(entries)

        tmp_path = additions_path.with_name(additions_path.name + ".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(additions, f, ensure_ascii=False, indent=2)
        tmp_path.replace(additions_path)
        self.reload(lang)

    def add_phrase(
        self,
        lang: str,
        patterns: list[str],
        response: str,
        english: str,
        tags: Optional[list[str]] = None,
    ) -> None:
        """
        Add a single phrase to the local additions file.

        Patterns, response and english are stripped; blank patterns are
        dropped. ``tags`` defaults to ``["user_added"]``. The cached library
        for ``lang`` is invalidated so the next lookup sees the new phrase.
        """
        entry = {
            "patterns": [p.strip() for p in patterns if p.strip()],
            "response": response.strip(),
            "english": english.strip(),
            "tags": tags or ["user_added"],
        }
        self._append_entries(lang, [entry])

    def import_pairs(self, lang: str, text: str) -> int:
        """
        Bulk-import from pasted text.

        Format: one entry per line → native_phrase | english_translation
        Lines starting with # are ignored, as are lines without a ``|`` or
        with an empty side. The native phrase is used both as the pattern and
        as the response. All accepted lines are written in a single pass.

        Returns number of entries successfully added.
        """
        entries: list[dict] = []
        for raw_line in text.strip().splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            # Split on the first "|" only, so the translation may contain "|".
            native, separator, english = line.partition("|")
            native, english = native.strip(), english.strip()
            if not separator or not native or not english:
                continue
            entries.append({
                "patterns": [native],
                "response": native,
                "english": english,
                "tags": ["imported"],
            })
        if entries:
            self._append_entries(lang, entries)
        return len(entries)

    def get_additions_json(self, lang: str) -> str:
        """Return the raw JSON of user additions (for HF Hub upload), or "[]" if none exist."""
        additions_path = DATA_DIR / f"{lang}_additions.json"
        if not additions_path.exists():
            return "[]"
        with open(additions_path, encoding="utf-8") as f:
            return f.read()

    def phrase_count(self, lang: str) -> int:
        """Return the number of phrases (base + additions) available for ``lang``."""
        return len(self._load(lang))