Spaces:

MataStrategy
/

ground-zero

Sleeping

ground-zero / src /conversation /phrase_matcher.py

jefffffff9

Add phrase library + fuzzy matcher for general Bambara/Fula conversation

6f4d8d0 3 months ago

7.85 kB

	"""
	Phrase-library-based conversational response system.
	Matches transcribed text against curated Bambara/Fula phrase libraries.

	No ML required — the knowledge base is plain JSON files that anyone can edit.
	Library files: data/phrases/{lang}.json
	User additions: data/phrases/{lang}_additions.json (saved to HF Hub for persistence)

	Each phrase entry:
	    {
	        "patterns": ["i ni ce", "ini ce", ...],   <- what the user might say (incl. Whisper variants)
	        "response": "I ni ce. N bɛ i dɛmɛ.",      <- what the app says back
	        "english":  "Hello! I am here to help.",  <- English translation shown in UI
	        "tags":     ["greeting"]
	    }
	"""
	from __future__ import annotations

	import json
	import logging
	from pathlib import Path
	from typing import Optional

	# Module-level logger, named after this module.
	logger = logging.getLogger(__name__)

	# Resolve the phrase directory relative to this file:
	# src/conversation/phrase_matcher.py → ../../data/phrases/
	DATA_DIR = Path(__file__).parent.parent.parent / "data" / "phrases"

	# rapidfuzz is an optional dependency. When it cannot be imported the
	# matcher still works, but only through plain substring comparison.
	# The try body holds only the import so nothing else can be mistaken
	# for an ImportError; the success flag is set in the else branch.
	try:
	    from rapidfuzz import fuzz as _fuzz, process as _process
	except ImportError:
	    _HAS_RAPIDFUZZ = False
	    logger.warning("rapidfuzz not installed — using substring matching only (install it for better matching)")
	else:
	    _HAS_RAPIDFUZZ = True


	class PhraseMatcher:
	    """
	    Fuzzy-matches a transcription against a per-language phrase library.

	    Phrase entries are read from ``DATA_DIR / "{lang}.json"`` (base library)
	    and ``DATA_DIR / "{lang}_additions.json"`` (user additions), merged in
	    that order, and cached per language until :meth:`reload` is called.

	    Falls back to substring matching if rapidfuzz is not installed.
	    """

	    def __init__(self) -> None:
	        # lang -> merged phrase entries (base library followed by additions)
	        self._cache: dict[str, list[dict]] = {}

	    # ── Internal helpers ──────────────────────────────────────────────────────

	    @staticmethod
	    def _additions_path(lang: str) -> Path:
	        """Return the path of the user-additions file for ``lang``."""
	        return DATA_DIR / f"{lang}_additions.json"

	    @staticmethod
	    def _valid_entries(data: object, source: Path) -> list[dict]:
	        """Return the dict items of a parsed phrase file, dropping anything else.

	        The files are hand-editable, so a non-list top level or stray
	        non-dict items are logged and skipped instead of crashing later
	        in ``match()``.
	        """
	        if not isinstance(data, list):
	            logger.warning(
	                "Ignoring %s: expected a JSON list of phrase entries, got %s",
	                source, type(data).__name__,
	            )
	            return []
	        entries = [item for item in data if isinstance(item, dict)]
	        if len(entries) != len(data):
	            logger.warning(
	                "Skipped %d malformed phrase entries in %s",
	                len(data) - len(entries), source,
	            )
	        return entries

	    @staticmethod
	    def _make_entry(
	        patterns: list[str],
	        response: str,
	        english: str,
	        tags: list[str] | None,
	    ) -> dict:
	        """Build one normalised phrase entry (whitespace stripped, blank patterns dropped)."""
	        return {
	            "patterns": [p.strip() for p in patterns if p.strip()],
	            "response": response.strip(),
	            "english": english.strip(),
	            "tags": tags or ["user_added"],
	        }

	    @staticmethod
	    def _candidates(phrases: list[dict]) -> list[tuple[str, dict]]:
	        """Flatten entries into ``(normalised_pattern, entry)`` pairs.

	        Patterns are lower-cased and stripped. Empty patterns are skipped
	        because ``"" in text`` is always true and would make that entry
	        match every input. A bare string under "patterns" is treated as a
	        single pattern rather than being iterated character by character.
	        """
	        pairs: list[tuple[str, dict]] = []
	        for entry in phrases:
	            patterns = entry.get("patterns") or []
	            if isinstance(patterns, str):
	                patterns = [patterns]
	            elif not isinstance(patterns, (list, tuple)):
	                continue
	            for pat in patterns:
	                if not isinstance(pat, str):
	                    continue
	                pat_norm = pat.lower().strip()
	                if pat_norm:
	                    pairs.append((pat_norm, entry))
	        return pairs

	    def _append_additions(self, lang: str, entries: list[dict]) -> None:
	        """Append ``entries`` to the additions file with one read and one write, then drop the cache."""
	        DATA_DIR.mkdir(parents=True, exist_ok=True)
	        additions_path = self._additions_path(lang)

	        additions: list[dict] = []
	        if additions_path.exists():
	            try:
	                with open(additions_path, encoding="utf-8") as f:
	                    loaded = json.load(f)
	            except (OSError, ValueError) as exc:
	                # Best effort: keep going with a fresh list, but say so,
	                # because the write below replaces the unreadable file.
	                logger.warning(
	                    "Could not read existing phrase additions for %s (%s); starting a fresh list",
	                    lang, exc,
	                )
	            else:
	                if isinstance(loaded, list):
	                    additions = loaded
	                else:
	                    logger.warning(
	                        "Phrase additions for %s are not a JSON list; starting a fresh list",
	                        lang,
	                    )

	        additions.extend(entries)

	        with open(additions_path, "w", encoding="utf-8") as f:
	            json.dump(additions, f, ensure_ascii=False, indent=2)

	        self.reload(lang)

	    # ── Library loading ───────────────────────────────────────────────────────

	    def _load(self, lang: str) -> list[dict]:
	        """Load and cache all phrases for a language (base + user additions).

	        A missing file contributes no entries. An unreadable or invalid
	        additions file is logged and skipped; errors while reading the
	        base library propagate to the caller.
	        """
	        if lang not in self._cache:
	            phrases: list[dict] = []

	            # Base library (git-tracked, always present)
	            base_path = DATA_DIR / f"{lang}.json"
	            if base_path.exists():
	                with open(base_path, encoding="utf-8") as f:
	                    phrases = self._valid_entries(json.load(f), base_path)

	            # User additions (written locally, uploaded to HF Hub for persistence)
	            additions_path = self._additions_path(lang)
	            if additions_path.exists():
	                try:
	                    with open(additions_path, encoding="utf-8") as f:
	                        additions = json.load(f)
	                except (OSError, ValueError) as exc:
	                    # ValueError covers both JSON and UTF-8 decoding errors.
	                    logger.warning("Could not load phrase additions for %s: %s", lang, exc)
	                else:
	                    phrases.extend(self._valid_entries(additions, additions_path))

	            self._cache[lang] = phrases
	            logger.info("PhraseMatcher: loaded %d phrases for lang=%s", len(phrases), lang)
	        return self._cache[lang]

	    def reload(self, lang: str) -> None:
	        """Force reload from disk (call after adding new phrases)."""
	        self._cache.pop(lang, None)

	    def reload_from_hub_data(self, lang: str, additions_json: str) -> None:
	        """
	        Called at startup with JSON downloaded from HF Hub.
	        Writes to the local additions file so _load() picks it up.

	        Text that is not valid JSON is rejected with a warning and the
	        local additions file is left untouched, so a bad download cannot
	        replace additions that are already on disk.
	        """
	        try:
	            json.loads(additions_json)
	        except (TypeError, ValueError) as exc:
	            logger.warning("Ignoring invalid phrase additions from hub for %s: %s", lang, exc)
	            return

	        DATA_DIR.mkdir(parents=True, exist_ok=True)
	        with open(self._additions_path(lang), "w", encoding="utf-8") as f:
	            f.write(additions_json)
	        self.reload(lang)

	    # ── Matching ──────────────────────────────────────────────────────────────

	    def match(self, text: str, lang: str, threshold: int = 60) -> Optional[dict]:
	        """
	        Find the best-matching phrase entry for the transcribed text.
	        Returns the matching entry dict from the library, or None.

	        Strategy:
	        1. Exact substring: any pattern is a substring of text (or vice-versa);
	           the first such pattern in library order wins.
	        2. Fuzzy partial_ratio via rapidfuzz (handles Whisper spelling variants);
	           ``threshold`` is passed to rapidfuzz as the minimum accepted score.
	           Skipped when rapidfuzz is not installed.

	        Comparison is case-insensitive and ignores surrounding whitespace.
	        """
	        phrases = self._load(lang)
	        if not phrases:
	            return None

	        text_norm = text.lower().strip() if text else ""
	        if not text_norm:
	            return None

	        candidates = self._candidates(phrases)
	        if not candidates:
	            return None

	        # 1. Exact substring check (fast path)
	        for pattern, entry in candidates:
	            if pattern in text_norm or text_norm in pattern:
	                logger.debug("Exact phrase match: %r", pattern)
	                return entry

	        # 2. Fuzzy match (handles Whisper mis-transcriptions)
	        if _HAS_RAPIDFUZZ:
	            pattern_strings = [pattern for pattern, _ in candidates]
	            result = _process.extractOne(
	                text_norm,
	                pattern_strings,
	                scorer=_fuzz.partial_ratio,
	                score_cutoff=threshold,
	            )
	            if result:
	                # result is (matched_string, score, index_into_pattern_strings)
	                _, score, idx = result
	                logger.debug("Fuzzy phrase match score=%d: %r", score, candidates[idx][0])
	                return candidates[idx][1]

	        return None

	    # ── Phrase addition ───────────────────────────────────────────────────────

	    def add_phrase(
	        self,
	        lang: str,
	        patterns: list[str],
	        response: str,
	        english: str,
	        tags: list[str] | None = None,
	    ) -> None:
	        """Add a single phrase to the local additions file.

	        Patterns, response and english are stripped of surrounding
	        whitespace; blank patterns are dropped. ``tags`` defaults to
	        ``["user_added"]``. The cached library for ``lang`` is invalidated.
	        """
	        self._append_additions(lang, [self._make_entry(patterns, response, english, tags)])

	    def import_pairs(self, lang: str, text: str) -> int:
	        """
	        Bulk-import from pasted text.
	        Format: one entry per line → native_phrase | english_translation
	        Lines starting with # are ignored, as are lines without a "|" or
	        with an empty side. Each imported entry uses the native phrase as
	        both its only pattern and its response, and is tagged "imported".
	        Returns number of entries successfully added.
	        """
	        if not text:
	            return 0

	        entries: list[dict] = []
	        for line in text.strip().splitlines():
	            line = line.strip()
	            if not line or line.startswith("#"):
	                continue
	            # Split on the first "|" only, so the translation may contain "|".
	            native, sep, english = line.partition("|")
	            native, english = native.strip(), english.strip()
	            if not sep or not native or not english:
	                continue
	            entries.append(self._make_entry([native], native, english, ["imported"]))

	        # One read/write for the whole batch instead of one per line.
	        if entries:
	            self._append_additions(lang, entries)
	        return len(entries)

	    def get_additions_json(self, lang: str) -> str:
	        """Return the raw JSON of user additions (for HF Hub upload); "[]" if the file does not exist."""
	        additions_path = self._additions_path(lang)
	        if not additions_path.exists():
	            return "[]"
	        with open(additions_path, encoding="utf-8") as f:
	            return f.read()

	    def phrase_count(self, lang: str) -> int:
	        """Return the number of loaded phrase entries (base + additions) for ``lang``."""
	        return len(self._load(lang))