Spaces:
Sleeping
Sleeping
| """ | |
| Phrase-library-based conversational response system. | |
| Matches transcribed text against curated Bambara/Fula phrase libraries. | |
| No ML required — the knowledge base is plain JSON files that anyone can edit. | |
| Library files: data/phrases/{lang}.json | |
| User additions: data/phrases/{lang}_additions.json (saved to HF Hub for persistence) | |
| Each phrase entry: | |
| { | |
| "patterns": ["i ni ce", "ini ce", ...], <- what the user might say (incl. Whisper variants) | |
| "response": "I ni ce. N bɛ i dɛmɛ.", <- what the app says back | |
| "english": "Hello! I am here to help.", <- English translation shown in UI | |
| "tags": ["greeting"] | |
| } | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import logging | |
| from pathlib import Path | |
| from typing import Optional | |
logger = logging.getLogger(__name__)

# Phrase libraries are located relative to this file, three directory levels
# up and then into data/phrases:
#   src/conversation/phrase_matcher.py → ../../data/phrases/
DATA_DIR = Path(__file__).parent.parent.parent.joinpath("data", "phrases")

# rapidfuzz is an optional dependency: when the import fails the module still
# loads, and _HAS_RAPIDFUZZ records that only substring matching is available.
try:
    from rapidfuzz import fuzz as _fuzz, process as _process
except ImportError:
    _HAS_RAPIDFUZZ = False
    logger.warning("rapidfuzz not installed — using substring matching only (install it for better matching)")
else:
    _HAS_RAPIDFUZZ = True
class PhraseMatcher:
    """
    Fuzzy-matches a transcription against a per-language phrase library.

    Phrases come from ``DATA_DIR/{lang}.json`` (base library) plus
    ``DATA_DIR/{lang}_additions.json`` (user additions). The merged list is
    cached per language until :meth:`reload` drops it.
    Falls back to substring matching if rapidfuzz is not installed.
    """

    def __init__(self) -> None:
        # lang -> merged phrase entries (base library followed by additions)
        self._cache: dict[str, list[dict]] = {}

    # ── File helpers ──────────────────────────────────────────────────────────

    @staticmethod
    def _additions_path(lang: str) -> Path:
        """Return the path of the user-additions file for ``lang``."""
        return DATA_DIR / f"{lang}_additions.json"

    @staticmethod
    def _read_entries(path: Path, kind: str, lang: str) -> list[dict]:
        """
        Read a JSON phrase file and return its entries.

        Returns an empty list when the file is missing, unreadable, not valid
        JSON, or not a JSON list; every failure except "missing" is logged.
        Items that are not JSON objects are dropped so that callers can rely
        on ``entry.get(...)`` working for every returned element.
        """
        if not path.exists():
            return []
        try:
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            logger.warning("Could not load %s for %s: %s", kind, lang, exc)
            return []
        if not isinstance(data, list):
            logger.warning(
                "Could not load %s for %s: expected a JSON list, got %s",
                kind, lang, type(data).__name__,
            )
            return []
        return [entry for entry in data if isinstance(entry, dict)]

    @staticmethod
    def _make_entry(patterns: list[str], response: str, english: str, tags: list[str]) -> dict:
        """Build one phrase entry in the on-disk format."""
        return {
            "patterns": patterns,
            "response": response,
            "english": english,
            "tags": tags,
        }

    def _append_additions(self, lang: str, new_entries: list[dict]) -> None:
        """Append entries to the additions file in one read/write cycle, then drop the cache."""
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        additions_path = self._additions_path(lang)
        additions = self._read_entries(additions_path, "phrase additions", lang)
        additions.extend(new_entries)
        with open(additions_path, "w", encoding="utf-8") as f:
            json.dump(additions, f, ensure_ascii=False, indent=2)
        self.reload(lang)

    # ── Library loading ───────────────────────────────────────────────────────

    def _load(self, lang: str) -> list[dict]:
        """Load and cache all phrases for a language (base + user additions)."""
        cached = self._cache.get(lang)
        if cached is None:
            # Base library first, so its entries win the first-match fast path.
            phrases = self._read_entries(DATA_DIR / f"{lang}.json", "phrase library", lang)
            # User additions (written locally, uploaded to HF Hub for persistence)
            phrases.extend(self._read_entries(self._additions_path(lang), "phrase additions", lang))
            self._cache[lang] = cached = phrases
            logger.info("PhraseMatcher: loaded %d phrases for lang=%s", len(phrases), lang)
        return cached

    def reload(self, lang: str) -> None:
        """Force reload from disk on next access (call after adding new phrases)."""
        self._cache.pop(lang, None)

    def reload_from_hub_data(self, lang: str, additions_json: str) -> None:
        """
        Called at startup with JSON downloaded from HF Hub.

        Writes the payload to the local additions file so _load() picks it up.
        A payload that is not a valid JSON list is logged and ignored, leaving
        any existing local additions file untouched.
        """
        try:
            parsed = json.loads(additions_json)
        except (TypeError, ValueError) as exc:
            logger.warning("Ignoring invalid phrase additions from Hub for %s: %s", lang, exc)
            return
        if not isinstance(parsed, list):
            logger.warning(
                "Ignoring phrase additions from Hub for %s: expected a JSON list, got %s",
                lang, type(parsed).__name__,
            )
            return
        DATA_DIR.mkdir(parents=True, exist_ok=True)
        with open(self._additions_path(lang), "w", encoding="utf-8") as f:
            f.write(additions_json)
        self.reload(lang)

    # ── Matching ──────────────────────────────────────────────────────────────

    def match(self, text: str, lang: str, threshold: int = 60) -> Optional[dict]:
        """
        Find the best-matching phrase entry for the transcribed text.

        Args:
            text: Transcribed user utterance.
            lang: Language code selecting the phrase library.
            threshold: Minimum fuzzy score, passed to rapidfuzz as ``score_cutoff``.

        Returns:
            The entry dict (with 'response' and 'english' keys) or None.

        Strategy:
            1. Exact substring: any pattern is a substring of text (or vice-versa);
               the first such pattern in library order wins.
            2. Fuzzy partial_ratio via rapidfuzz (handles Whisper spelling variants)
        """
        phrases = self._load(lang)
        if not phrases:
            return None
        text_norm = text.lower().strip()
        if not text_norm:
            return None

        # Build flat candidate list: (normalised_pattern, entry).
        # Blank patterns are skipped: "" is a substring of every string and
        # would otherwise make its entry match any input.
        candidates: list[tuple[str, dict]] = []
        for entry in phrases:
            patterns = entry.get("patterns")
            if not isinstance(patterns, list):
                continue
            for pat in patterns:
                if not isinstance(pat, str):
                    continue
                pat_norm = pat.lower().strip()
                if pat_norm:
                    candidates.append((pat_norm, entry))
        if not candidates:
            return None

        # 1. Exact substring check (fast path)
        for pattern, entry in candidates:
            if pattern in text_norm or text_norm in pattern:
                logger.debug("Exact phrase match: %r", pattern)
                return entry

        # 2. Fuzzy match (handles Whisper mis-transcriptions)
        if _HAS_RAPIDFUZZ:
            result = _process.extractOne(
                text_norm,
                [pattern for pattern, _ in candidates],
                scorer=_fuzz.partial_ratio,
                score_cutoff=threshold,
            )
            if result is not None:
                _, score, idx = result
                logger.debug("Fuzzy phrase match score=%d: %r", score, candidates[idx][0])
                return candidates[idx][1]
        return None

    # ── Phrase addition ───────────────────────────────────────────────────────

    def add_phrase(
        self,
        lang: str,
        patterns: list[str],
        response: str,
        english: str,
        tags: list[str] | None = None,
    ) -> None:
        """
        Add a single phrase to the local additions file.

        Patterns are stripped and blank ones dropped; if none remain the call
        is logged and ignored, since an entry without patterns can never match.
        ``tags`` defaults to ``["user_added"]``.
        """
        cleaned = [p.strip() for p in patterns if p.strip()]
        if not cleaned:
            logger.warning("Ignoring phrase with no usable patterns for lang=%s", lang)
            return
        entry = self._make_entry(cleaned, response.strip(), english.strip(), tags or ["user_added"])
        self._append_additions(lang, [entry])

    def import_pairs(self, lang: str, text: str) -> int:
        """
        Bulk-import from pasted text.

        Format: one entry per line → native_phrase | english_translation
        (split at the first "|"). Blank lines, lines starting with # and lines
        without "|" or with an empty side are ignored.
        All accepted entries are written to the additions file in a single
        pass. Returns number of entries successfully added.
        """
        entries: list[dict] = []
        for raw_line in text.strip().splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            native, sep, english = line.partition("|")
            if not sep:
                continue
            native, english = native.strip(), english.strip()
            if not native or not english:
                continue
            # The native phrase serves as both the match pattern and the response.
            entries.append(self._make_entry([native], native, english, ["imported"]))
        if entries:
            self._append_additions(lang, entries)
        return len(entries)

    def get_additions_json(self, lang: str) -> str:
        """Return the raw JSON of user additions (for HF Hub upload), or "[]" if none exist."""
        additions_path = self._additions_path(lang)
        if not additions_path.exists():
            return "[]"
        with open(additions_path, encoding="utf-8") as f:
            return f.read()

    def phrase_count(self, lang: str) -> int:
        """Return the number of phrase entries (base + additions) loaded for ``lang``."""
        return len(self._load(lang))