# nlu.py — update by Toadoum, commit ae6619f (verified)
"""
NLU — NLLB + Qwen pivot-through-English architecture with keyword fast-path.
Flow:
1. Deterministic structural extractors run FIRST on the original Hausa
text (digits, amounts, yes/no keywords). These MUST be deterministic
because "1234" → "provide_digits" with digits="1234" is non-negotiable
for banks, and regex is faster + more reliable than any model for
this sub-task.
2. Keyword fast-path for common Hausa + English intent phrases. Matches
"check balance", "duba ma'auni", "canjin kuɗi", etc. in <10ms without
loading any model. This is what real voice bots use for 90% of turns.
3. If structural + keyword layers don't match, the text is translated
Hausa → English via NLLB-200 (skipped if input is already English),
then classified by Qwen2.5-1.5B in English (where it is strong) into
one of a small fixed set of intent labels.
4. If NLLB or Qwen fails, we return "unknown" cleanly — the dialogue
manager routes to a vertical-specific fallback prompt.
All heavy models are lazy-loaded on first use. Cold-start downloads:
- NLLB-200-distilled-600M: ~2.4 GB
- Qwen2.5-1.5B-Instruct: ~3 GB
"""
from __future__ import annotations
import re
import json
import logging
from typing import Optional
logger = logging.getLogger("plotweaver.nlu")
# ---------------------------------------------------------------------------
# Deterministic structural extractors (run on raw Hausa text)
# ---------------------------------------------------------------------------
WORD_DIGITS = {
"sifili": "0", "daya": "1", "ɗaya": "1", "biyu": "2", "uku": "3",
"hudu": "4", "huɗu": "4", "biyar": "5", "shida": "6", "bakwai": "7",
"takwas": "8", "tara": "9",
}
WORD_AMOUNTS = {
"dubu goma": 10000, "dubu biyar": 5000, "dubu biyu": 2000,
"dubu": 1000, "ɗari biyar": 500, "dari biyar": 500,
"ɗari": 100, "dari": 100,
}
# Hausa yes/no keywords for the sole case where we short-circuit Qwen
HAUSA_YES = {"i", "eh", "haka ne", "haka", "ok", "okay", "yes"}
HAUSA_NO = {"a'a", "a'aa", "ba haka", "ba", "no"}
# Human-agent escape hatch
HUMAN_KEYWORDS = {"mutum", "wakili", "agent", "human"}
def _extract_digits(text: str) -> Optional[str]:
m = re.findall(r"\d+", text)
if m:
return "".join(m)
tokens = text.lower().split()
d = [WORD_DIGITS[tok] for tok in tokens if tok in WORD_DIGITS]
return "".join(d) if d else None
def _extract_amount(text: str) -> Optional[int]:
m = re.search(r"\d+", text)
if m:
return int(m.group())
t = text.lower()
for phrase in sorted(WORD_AMOUNTS.keys(), key=len, reverse=True):
if phrase in t:
return WORD_AMOUNTS[phrase]
return None
def _match_yesno(text: str) -> Optional[str]:
t = " " + text.lower().strip() + " "
for kw in HAUSA_YES:
if f" {kw} " in t or t.strip() == kw:
return "yes"
for kw in HAUSA_NO:
if f" {kw} " in t or t.strip() == kw:
return "no"
return None
def _contains_human_keyword(text: str) -> bool:
t = text.lower()
return any(kw in t for kw in HUMAN_KEYWORDS)
# Keyword fast-path for common intents. Runs BEFORE NLLB+Qwen so that the
# scripted demo flows don't require a 6GB LLM load. Phrases are Hausa and
# English pairs that customers actually use. When none match, we fall
# through to NLLB+Qwen for paraphrases.
INTENT_KEYWORDS = {
"check_balance": [
"duba ma'auni", "ma'auni", "balance", "check balance",
"account balance", "how much", "kudin asusu",
],
"block_card": [
"toshe kati", "block card", "cancel card", "freeze card",
"toshe", "lost card", "ɓatar da kati",
],
"transfer_money": [
"canjin kuɗi", "canjin kudi", "transfer", "transfer money",
"send money", "aiki kuɗi", "aiki kudi",
],
"buy_airtime": [
"saya airtime", "airtime", "buy airtime", "top up", "topup",
"recharge", "karɓi airtime",
],
"buy_bundle": [
"saya bundle", "bundle", "buy bundle", "buy data", "data",
"internet", "megabyte",
],
"complaint": [
"yin korafi", "korafi", "complaint", "complain", "problem",
"matsala", "file complaint",
],
"check_order": [
"bincika oda", "oda", "check order", "order status", "my order",
"where is my order", "track order",
],
"reschedule": [
"sake tsara", "reschedule", "change time", "another day",
"later", "tomorrow",
],
"return_item": [
"mayar da kaya", "return", "return item", "send back", "mayar",
],
}
def _match_intent_keyword(text: str) -> Optional[str]:
"""Keyword fast-path for common customer-service intents.
Returns the intent name if a keyword matches, else None."""
t = text.lower().strip()
# Check longer phrases first so "check balance" wins over "check order"
all_kw = [(intent, kw) for intent, kws in INTENT_KEYWORDS.items() for kw in kws]
all_kw.sort(key=lambda x: len(x[1]), reverse=True)
for intent, kw in all_kw:
if kw in t:
return intent
return None
def _looks_english(text: str) -> bool:
"""Heuristic: if text contains no Hausa-specific characters and is majority
ASCII, treat as English and skip NLLB translation. Hausa uses ɓ, ɗ, ƙ, ƴ
and the apostrophe in 'a'a', 'ma'auni', 'jumma'a' etc."""
hausa_chars = set("ɓɗƙƴƁƊƘƳ")
if any(c in hausa_chars for c in text):
return False
# Common Hausa words — if any match, treat as Hausa
hausa_markers = {
"duba", "ma'auni", "toshe", "kati", "canjin", "kuɗi", "kudi",
"saya", "airtime", "bundle", "korafi", "bincika", "oda",
"sake", "tsara", "mayar", "kaya", "wakili", "mutum",
"sannu", "nagode", "don", "allah", "ka", "yana", "tana",
"dubu", "ɗari", "dari", "biyar", "biyu", "uku", "hudu", "huɗu",
}
tokens = set(text.lower().split())
return not bool(tokens & hausa_markers)
# ---------------------------------------------------------------------------
# NLLB-200 Ha → En translation (lazy-loaded)
# ---------------------------------------------------------------------------
# Lazy-loaded NLLB state. _nllb_failed latches the first load failure so a
# broken environment fails fast instead of retrying a ~2.4 GB download on
# every turn.
_nllb_model = None
_nllb_tokenizer = None
_nllb_failed = False


def _load_nllb():
    """Lazy-load NLLB-200-distilled-600M.

    Returns (model, tokenizer) on success, or (None, None) when loading has
    failed — the failure latch short-circuits all subsequent calls.
    """
    global _nllb_model, _nllb_tokenizer, _nllb_failed
    if _nllb_failed:
        return None, None
    if _nllb_model is not None:
        return _nllb_model, _nllb_tokenizer
    try:
        import torch
        from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
        logger.info("Loading NLLB-200-distilled-600M…")
        model_id = "facebook/nllb-200-distilled-600M"
        _nllb_tokenizer = AutoTokenizer.from_pretrained(model_id)
        _nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
            model_id,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
        )
        _nllb_model.eval()
        logger.info("NLLB-200 ready.")
        return _nllb_model, _nllb_tokenizer
    except Exception as e:
        logger.warning(f"NLLB load failed: {e}")
        _nllb_failed = True
        return None, None


def translate_ha_to_en(text: str) -> Optional[str]:
    """Translate Hausa to English via NLLB. Returns None on failure
    (model unavailable, blank input, or a generation error)."""
    model, tokenizer = _load_nllb()
    if model is None or not text.strip():
        return None
    try:
        import torch
        # NLLB requires source language token set on tokenizer
        tokenizer.src_lang = "hau_Latn"
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        # Force English output via forced_bos_token_id
        forced_bos_id = tokenizer.convert_tokens_to_ids("eng_Latn")
        with torch.no_grad():
            out = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_id,
                max_new_tokens=128,
                num_beams=2,
            )
        translated = tokenizer.batch_decode(out, skip_special_tokens=True)[0].strip()
        # BUG FIX: the log line ran source and translation together
        # ("{text!r}{translated!r}"); restore the " → " separator used by
        # the other NLU log messages so the two values are distinguishable.
        logger.info(f"NLLB Ha→En: {text!r} → {translated!r}")
        return translated
    except Exception as e:
        logger.warning(f"NLLB translate failed: {e}")
        return None
# ---------------------------------------------------------------------------
# Qwen2.5-1.5B intent classifier (operates on English text)
# ---------------------------------------------------------------------------
# Lazy-loaded Qwen state; _llm_failed latches the first failure so a broken
# environment degrades to keyword-only NLU instead of retrying the ~3 GB
# download on every turn.
_llm_model = None
_llm_tokenizer = None
_llm_failed = False


def _load_llm():
    """Lazy-load Qwen2.5-1.5B-Instruct.

    Returns (model, tokenizer), or (None, None) once loading has failed.
    """
    global _llm_model, _llm_tokenizer, _llm_failed
    if _llm_failed:
        return None, None
    if _llm_model is None:
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
            logger.info("Loading Qwen2.5-1.5B-Instruct…")
            model_id = "Qwen/Qwen2.5-1.5B-Instruct"
            _llm_tokenizer = AutoTokenizer.from_pretrained(model_id)
            _llm_model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
            )
            _llm_model.eval()
            logger.info("Qwen2.5-1.5B ready.")
        except Exception as e:
            logger.warning(f"Qwen load failed: {e}")
            _llm_failed = True
            return None, None
    return _llm_model, _llm_tokenizer
# Full intent menu offered when the dialogue state has no expectation.
_ALL_INTENTS = [
    "check_balance", "block_card", "transfer_money",
    "buy_airtime", "buy_bundle", "complaint",
    "check_order", "reschedule", "return_item",
    "human_agent", "unknown",
]
# Candidate intents offered to Qwen per expected-slot key; None is the
# stateless default. "human_agent" and "unknown" are always present as
# escape hatches.
CANDIDATE_INTENTS = {
    None: _ALL_INTENTS,
    "intent": _ALL_INTENTS,
    "yesno": ["yes", "no", "human_agent", "unknown"],
    "name": ["provide_name", "human_agent", "unknown"],
    "date": ["provide_date", "human_agent", "unknown"],
    "bundle": ["provide_bundle", "human_agent", "unknown"],
    "text": ["provide_text", "human_agent", "unknown"],
}
# System prompt for the Qwen intent classifier. NOTE: this is a runtime
# string sent to the model verbatim — keep the final JSON example
# machine-parseable, since _qwen_classify regex-extracts a JSON object
# from the model's reply and validates the intent against the candidates.
SYSTEM_PROMPT = """You are an intent classifier for a customer-service voice bot.
You will be given an English-language utterance (translated from Hausa) and a list of candidate intents. Return JSON with the single best-matching intent and any entities you can extract.
Intent meanings:
- check_balance: user wants to check an account balance
- block_card: user wants to block, freeze, or cancel a bank card
- transfer_money: user wants to send or transfer money
- buy_airtime: user wants to buy phone airtime / top-up
- buy_bundle: user wants to buy a data bundle / internet package
- complaint: user wants to file a complaint or report a problem
- check_order: user wants to check the status of an order
- reschedule: user wants to reschedule a delivery
- return_item: user wants to return an item
- human_agent: user wants to speak to a human person
- yes / no: affirmative or negative reply
- provide_name / provide_date / provide_bundle / provide_text: user is supplying information
- unknown: cannot determine intent
Return ONLY valid JSON. No explanation, no markdown. Example: {"intent": "check_balance", "entities": {}}"""
def _qwen_classify(english_text: str, expected: Optional[str]) -> Optional[tuple[str, dict]]:
    """Classify an English utterance into an intent. Returns None on failure.

    Builds a constrained prompt from CANDIDATE_INTENTS[expected], greedily
    decodes up to 60 new tokens, extracts the first JSON object from the
    reply, and validates the returned intent against the candidate list.
    """
    model, tokenizer = _load_llm()
    if model is None:
        return None
    candidates = CANDIDATE_INTENTS.get(expected, CANDIDATE_INTENTS[None])
    user_prompt = (
        f'Utterance: "{english_text}"\n'
        f'Candidate intents: {", ".join(candidates)}\n\n'
        'Return JSON only.'
    )
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_prompt},
    ]
    try:
        import torch
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            out = model.generate(
                **inputs,
                max_new_tokens=60,
                do_sample=False,  # greedy decoding: deterministic output
                pad_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens (skip the prompt).
        generated = tokenizer.decode(out[0][inputs.input_ids.shape[1]:], skip_special_tokens=True).strip()
        logger.info(f"Qwen raw: {generated}")
        # BUG FIX: the original non-greedy r"\{.*?\}" stopped at the FIRST
        # closing brace, so the exact format the system prompt mandates
        # ('{"intent": ..., "entities": {}}') was truncated at the inner
        # "{}" and always failed json.loads. Greedy matching spans the
        # first "{" to the last "}", which parses any single JSON object
        # the model emits.
        m = re.search(r"\{.*\}", generated, re.DOTALL)
        if not m:
            return None
        parsed = json.loads(m.group())
        intent = parsed.get("intent", "unknown")
        entities = parsed.get("entities", {}) or {}
        if not isinstance(entities, dict):
            entities = {}
        if intent not in candidates:
            # Out-of-menu intents are treated as failure so the caller can
            # fall back to "unknown" rather than act on a hallucination.
            logger.info(f"Qwen returned out-of-candidate intent: {intent}")
            return None
        return intent, entities
    except Exception as e:
        logger.warning(f"Qwen inference failed: {e}")
        return None
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def _structural_parse(text: str, expected: Optional[str]) -> Optional[tuple[str, dict, str]]:
    """Layer 1: deterministic extractors for strict-format slots.

    Returns (intent, entities, "structural") when the expected slot is
    satisfied, else None so parse() falls through to the keyword/LLM layers.
    """
    entities: dict = {}
    if expected == "digits":
        d = _extract_digits(text)
        if d:
            entities["digits"] = d
            return "provide_digits", entities, "structural"
    elif expected == "amount":
        a = _extract_amount(text)
        if a is not None:
            entities["amount"] = a
            return "provide_amount", entities, "structural"
    elif expected == "yesno":
        yn = _match_yesno(text)
        if yn:
            return yn, entities, "structural"
    elif expected == "name":
        # Name is free-form; take the last token as a quick heuristic.
        name = text.strip().split()[-1] if text.strip() else ""
        if name:
            entities["name"] = name
            return "provide_name", entities, "structural"
    elif expected == "date":
        # Dates are accepted verbatim; downstream validation is the
        # dialogue manager's job.
        entities["date"] = text.strip()
        return "provide_date", entities, "structural"
    return None


def _fill_freeform_slots(text: str, expected: Optional[str], llm_entities: dict) -> None:
    """For free-text slots, pass the original Hausa text through
    (mutates llm_entities in place)."""
    if expected == "bundle":
        t = text.lower()
        # rana/mako/wata = day/week/month bundle durations.
        for b in ("rana", "mako", "wata"):
            if b in t:
                llm_entities["bundle"] = b
                break
    if expected == "text":
        llm_entities["text"] = text.strip()


def parse(text: str, expected: Optional[str] = None,
          use_llm: bool = True) -> tuple[str, dict, str]:
    """
    NLU. Returns (intent, entities, source) where source is one of:
    - 'structural': deterministic extractor caught digits/amount/yes-no
    - 'keyword': fast-path keyword matcher caught a common intent
    - 'qwen_en': input was English, classified directly by Qwen
    - 'nllb+qwen': translated via NLLB then classified via Qwen
    - 'human_keyword': caught human-agent escape hatch by keyword
    - 'unknown': nothing matched
    """
    entities: dict = {}
    if not text or not text.strip():
        return "unknown", entities, "unknown"
    # Always-on human-agent escape (safety)
    if _contains_human_keyword(text):
        return "human_agent", entities, "human_keyword"
    # Layer 1: deterministic structural extractors for strict-format slots
    structural = _structural_parse(text, expected)
    if structural is not None:
        return structural
    # Layer 1.5: Keyword fast-path for common intents (Hausa + English).
    # Runs in ANY state so users can pivot intent mid-flow ("actually I want
    # to transfer money instead"). Structural extractors above already
    # claimed strict-slot cases, so if we're in a slot-filling state and
    # the text didn't match the slot, it's fair game to re-interpret as a
    # new intent.
    kw_intent = _match_intent_keyword(text)
    if kw_intent:
        # BUG FIX: log line was missing the " → " separator between the
        # utterance and the matched intent (matching the Qwen log below).
        logger.info(f"NLU: keyword matched {text!r} → {kw_intent}")
        return kw_intent, entities, "keyword"
    # Layer 2: NLLB Ha → En (skip if input already English), then Qwen
    if not use_llm:
        logger.info(f"NLU: use_llm=False, returning unknown for {text!r}")
        return "unknown", entities, "unknown"
    if _looks_english(text):
        logger.info(f"NLU: input looks English, skipping NLLB: {text!r}")
        english_text = text
        source_tag = "qwen_en"
    else:
        logger.info(f"NLU: translating Hausa via NLLB: {text!r}")
        english_text = translate_ha_to_en(text)
        if english_text is None:
            logger.warning("NLU: NLLB failed, returning unknown")
            return "unknown", entities, "unknown"
        source_tag = "nllb+qwen"
    qwen_result = _qwen_classify(english_text, expected)
    if qwen_result is None:
        logger.warning(f"NLU: Qwen returned no valid intent for {english_text!r}")
        return "unknown", entities, "unknown"
    intent, llm_entities = qwen_result
    logger.info(f"NLU: Qwen classified {english_text!r} → intent={intent}")
    _fill_freeform_slots(text, expected, llm_entities)
    return intent, llm_entities, source_tag