""" Phrase-library-based conversational response system. Matches transcribed text against curated Bambara/Fula phrase libraries. No ML required — the knowledge base is plain JSON files that anyone can edit. Library files: data/phrases/{lang}.json User additions: data/phrases/{lang}_additions.json (saved to HF Hub for persistence) Each phrase entry: { "patterns": ["i ni ce", "ini ce", ...], <- what the user might say (incl. Whisper variants) "response": "I ni ce. N bɛ i dɛmɛ.", <- what the app says back "english": "Hello! I am here to help.", <- English translation shown in UI "tags": ["greeting"] } """ from __future__ import annotations import json import logging from pathlib import Path from typing import Optional logger = logging.getLogger(__name__) # Resolve project root relative to this file: # src/conversation/phrase_matcher.py → ../../data/phrases/ DATA_DIR = Path(__file__).parent.parent.parent / "data" / "phrases" try: from rapidfuzz import fuzz as _fuzz, process as _process _HAS_RAPIDFUZZ = True except ImportError: _HAS_RAPIDFUZZ = False logger.warning("rapidfuzz not installed — using substring matching only (install it for better matching)") class PhraseMatcher: """ Fuzzy-matches a transcription against a per-language phrase library. Falls back to substring matching if rapidfuzz is not installed. """ def __init__(self) -> None: self._cache: dict[str, list[dict]] = {} # ── Library loading ─────────────────────────────────────────────────────── def _load(self, lang: str) -> list[dict]: """Load and cache all phrases for a language (base + user additions).""" if lang not in self._cache: phrases: list[dict] = [] # Base library (git-tracked, always present) base_path = DATA_DIR / f"{lang}.json" if base_path.exists(): with open(base_path, encoding="utf-8") as f: phrases = json.load(f) # User additions (written locally, uploaded to HF Hub for persistence) additions_path = DATA_DIR / f"{lang}_additions.json" if additions_path.exists(): try: with open(additions_path, encoding="utf-8") as f: additions = json.load(f) phrases.extend(additions) except Exception as exc: logger.warning("Could not load phrase additions for %s: %s", lang, exc) self._cache[lang] = phrases logger.info("PhraseMatcher: loaded %d phrases for lang=%s", len(phrases), lang) return self._cache[lang] def reload(self, lang: str) -> None: """Force reload from disk (call after adding new phrases).""" self._cache.pop(lang, None) def reload_from_hub_data(self, lang: str, additions_json: str) -> None: """ Called at startup with JSON downloaded from HF Hub. Writes to the local additions file so _load() picks it up. """ DATA_DIR.mkdir(parents=True, exist_ok=True) additions_path = DATA_DIR / f"{lang}_additions.json" with open(additions_path, "w", encoding="utf-8") as f: f.write(additions_json) self.reload(lang) # ── Matching ────────────────────────────────────────────────────────────── def match(self, text: str, lang: str, threshold: int = 60) -> Optional[dict]: """ Find the best-matching phrase entry for the transcribed text. Returns the entry dict (with 'response' and 'english' keys) or None. Strategy: 1. Exact substring: any pattern is a substring of text (or vice-versa) 2. Fuzzy partial_ratio via rapidfuzz (handles Whisper spelling variants) """ phrases = self._load(lang) if not phrases: return None text_norm = text.lower().strip() if not text_norm: return None # Build flat candidate list: (normalised_pattern, entry) candidates: list[tuple[str, dict]] = [ (pat.lower(), entry) for entry in phrases for pat in entry.get("patterns", []) ] if not candidates: return None # 1. Exact substring check (fast path) for pattern, entry in candidates: if pattern in text_norm or text_norm in pattern: logger.debug("Exact phrase match: %r", pattern) return entry # 2. Fuzzy match (handles Whisper mis-transcriptions) if _HAS_RAPIDFUZZ: pattern_strings = [c[0] for c in candidates] result = _process.extractOne( text_norm, pattern_strings, scorer=_fuzz.partial_ratio, score_cutoff=threshold, ) if result: _, score, idx = result logger.debug("Fuzzy phrase match score=%d: %r", score, candidates[idx][0]) return candidates[idx][1] return None # ── Phrase addition ─────────────────────────────────────────────────────── def add_phrase( self, lang: str, patterns: list[str], response: str, english: str, tags: list[str] | None = None, ) -> None: """Add a single phrase to the local additions file.""" DATA_DIR.mkdir(parents=True, exist_ok=True) additions_path = DATA_DIR / f"{lang}_additions.json" additions: list[dict] = [] if additions_path.exists(): try: with open(additions_path, encoding="utf-8") as f: additions = json.load(f) except Exception: additions = [] additions.append({ "patterns": [p.strip() for p in patterns if p.strip()], "response": response.strip(), "english": english.strip(), "tags": tags or ["user_added"], }) with open(additions_path, "w", encoding="utf-8") as f: json.dump(additions, f, ensure_ascii=False, indent=2) self.reload(lang) def import_pairs(self, lang: str, text: str) -> int: """ Bulk-import from pasted text. Format: one entry per line → native_phrase | english_translation Lines starting with # are ignored. Returns number of entries successfully added. """ count = 0 for line in text.strip().splitlines(): line = line.strip() if not line or line.startswith("#"): continue if "|" not in line: continue parts = line.split("|", 1) if len(parts) != 2: continue native, english = parts[0].strip(), parts[1].strip() if not native or not english: continue self.add_phrase(lang, [native], native, english, tags=["imported"]) count += 1 return count def get_additions_json(self, lang: str) -> str: """Return the raw JSON of user additions (for HF Hub upload).""" additions_path = DATA_DIR / f"{lang}_additions.json" if not additions_path.exists(): return "[]" with open(additions_path, encoding="utf-8") as f: return f.read() def phrase_count(self, lang: str) -> int: return len(self._load(lang))