Spaces:
Sleeping
Sleeping
Update nlu.py
Browse files
nlu.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
NLU — NLLB + Qwen pivot-through-English architecture.
|
| 3 |
|
| 4 |
Flow:
|
| 5 |
1. Deterministic structural extractors run FIRST on the original Hausa
|
|
@@ -8,15 +8,19 @@ Flow:
|
|
| 8 |
for banks, and regex is faster + more reliable than any model for
|
| 9 |
this sub-task.
|
| 10 |
|
| 11 |
-
2.
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
fixed set of intent labels.
|
| 15 |
|
| 16 |
-
3. If
|
| 17 |
-
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
| 20 |
- NLLB-200-distilled-600M: ~2.4 GB
|
| 21 |
- Qwen2.5-1.5B-Instruct: ~3 GB
|
| 22 |
"""
|
|
@@ -88,6 +92,81 @@ def _contains_human_keyword(text: str) -> bool:
|
|
| 88 |
return any(kw in t for kw in HUMAN_KEYWORDS)
|
| 89 |
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
# ---------------------------------------------------------------------------
|
| 92 |
# NLLB-200 Ha → En translation (lazy-loaded)
|
| 93 |
# ---------------------------------------------------------------------------
|
|
@@ -277,8 +356,10 @@ def parse(text: str, expected: Optional[str] = None,
|
|
| 277 |
use_llm: bool = True) -> tuple[str, dict, str]:
|
| 278 |
"""
|
| 279 |
NLU. Returns (intent, entities, source) where source is one of:
|
| 280 |
-
- 'structural': deterministic extractor caught
|
| 281 |
-
- '
|
|
|
|
|
|
|
| 282 |
- 'human_keyword': caught human-agent escape hatch by keyword
|
| 283 |
- 'unknown': nothing matched
|
| 284 |
"""
|
|
@@ -309,8 +390,7 @@ def parse(text: str, expected: Optional[str] = None,
|
|
| 309 |
return yn, entities, "structural"
|
| 310 |
|
| 311 |
if expected == "name":
|
| 312 |
-
# Name is free-form; take the last token as a quick heuristic.
|
| 313 |
-
# would not help here — names don't translate meaningfully.
|
| 314 |
name = text.strip().split()[-1] if text.strip() else ""
|
| 315 |
if name:
|
| 316 |
entities["name"] = name
|
|
@@ -320,22 +400,43 @@ def parse(text: str, expected: Optional[str] = None,
|
|
| 320 |
entities["date"] = text.strip()
|
| 321 |
return "provide_date", entities, "structural"
|
| 322 |
|
| 323 |
-
# Layer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
if not use_llm:
|
|
|
|
| 325 |
return "unknown", entities, "unknown"
|
| 326 |
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
|
| 331 |
qwen_result = _qwen_classify(english_text, expected)
|
| 332 |
if qwen_result is None:
|
|
|
|
| 333 |
return "unknown", entities, "unknown"
|
| 334 |
|
| 335 |
intent, llm_entities = qwen_result
|
|
|
|
| 336 |
|
| 337 |
-
# For free-text slots, pass the original Hausa text through
|
| 338 |
-
# English-translated complaint text stored as a Hausa complaint)
|
| 339 |
if expected == "bundle":
|
| 340 |
t = text.lower()
|
| 341 |
for b in ("rana", "mako", "wata"):
|
|
@@ -346,4 +447,4 @@ def parse(text: str, expected: Optional[str] = None,
|
|
| 346 |
if expected == "text":
|
| 347 |
llm_entities["text"] = text.strip()
|
| 348 |
|
| 349 |
-
return intent, llm_entities,
|
|
|
|
| 1 |
"""
|
| 2 |
+
NLU — NLLB + Qwen pivot-through-English architecture with keyword fast-path.
|
| 3 |
|
| 4 |
Flow:
|
| 5 |
1. Deterministic structural extractors run FIRST on the original Hausa
|
|
|
|
| 8 |
for banks, and regex is faster + more reliable than any model for
|
| 9 |
this sub-task.
|
| 10 |
|
| 11 |
+
2. Keyword fast-path for common Hausa + English intent phrases. Matches
|
| 12 |
+
"check balance", "duba ma'auni", "canjin kuɗi", etc. in <10ms without
|
| 13 |
+
loading any model. This is what real voice bots use for 90% of turns.
|
|
|
|
| 14 |
|
| 15 |
+
3. If structural + keyword layers don't match, the text is translated
|
| 16 |
+
Hausa → English via NLLB-200 (skipped if input is already English),
|
| 17 |
+
then classified by Qwen2.5-1.5B in English (where it is strong) into
|
| 18 |
+
one of a small fixed set of intent labels.
|
| 19 |
|
| 20 |
+
4. If NLLB or Qwen fails, we return "unknown" cleanly — the dialogue
|
| 21 |
+
manager routes to a vertical-specific fallback prompt.
|
| 22 |
+
|
| 23 |
+
All heavy models are lazy-loaded on first use. Cold-start downloads:
|
| 24 |
- NLLB-200-distilled-600M: ~2.4 GB
|
| 25 |
- Qwen2.5-1.5B-Instruct: ~3 GB
|
| 26 |
"""
|
|
|
|
| 92 |
return any(kw in t for kw in HUMAN_KEYWORDS)
|
| 93 |
|
| 94 |
|
| 95 |
+
# Keyword fast-path for common intents. Runs BEFORE NLLB+Qwen so that the
|
| 96 |
+
# scripted demo flows don't require a 6GB LLM load. Phrases are Hausa and
|
| 97 |
+
# English pairs that customers actually use. When none match, we fall
|
| 98 |
+
# through to NLLB+Qwen for paraphrases.
|
| 99 |
+
INTENT_KEYWORDS = {
|
| 100 |
+
"check_balance": [
|
| 101 |
+
"duba ma'auni", "ma'auni", "balance", "check balance",
|
| 102 |
+
"account balance", "how much", "kudin asusu",
|
| 103 |
+
],
|
| 104 |
+
"block_card": [
|
| 105 |
+
"toshe kati", "block card", "cancel card", "freeze card",
|
| 106 |
+
"toshe", "lost card", "ɓatar da kati",
|
| 107 |
+
],
|
| 108 |
+
"transfer_money": [
|
| 109 |
+
"canjin kuɗi", "canjin kudi", "transfer", "transfer money",
|
| 110 |
+
"send money", "aiki kuɗi", "aiki kudi",
|
| 111 |
+
],
|
| 112 |
+
"buy_airtime": [
|
| 113 |
+
"saya airtime", "airtime", "buy airtime", "top up", "topup",
|
| 114 |
+
"recharge", "karɓi airtime",
|
| 115 |
+
],
|
| 116 |
+
"buy_bundle": [
|
| 117 |
+
"saya bundle", "bundle", "buy bundle", "buy data", "data",
|
| 118 |
+
"internet", "megabyte",
|
| 119 |
+
],
|
| 120 |
+
"complaint": [
|
| 121 |
+
"yin korafi", "korafi", "complaint", "complain", "problem",
|
| 122 |
+
"matsala", "file complaint",
|
| 123 |
+
],
|
| 124 |
+
"check_order": [
|
| 125 |
+
"bincika oda", "oda", "check order", "order status", "my order",
|
| 126 |
+
"where is my order", "track order",
|
| 127 |
+
],
|
| 128 |
+
"reschedule": [
|
| 129 |
+
"sake tsara", "reschedule", "change time", "another day",
|
| 130 |
+
"later", "tomorrow",
|
| 131 |
+
],
|
| 132 |
+
"return_item": [
|
| 133 |
+
"mayar da kaya", "return", "return item", "send back", "mayar",
|
| 134 |
+
],
|
| 135 |
+
}
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _match_intent_keyword(text: str) -> Optional[str]:
|
| 139 |
+
"""Keyword fast-path for common customer-service intents.
|
| 140 |
+
Returns the intent name if a keyword matches, else None."""
|
| 141 |
+
t = text.lower().strip()
|
| 142 |
+
# Check longer phrases first so "check balance" wins over "check order"
|
| 143 |
+
all_kw = [(intent, kw) for intent, kws in INTENT_KEYWORDS.items() for kw in kws]
|
| 144 |
+
all_kw.sort(key=lambda x: len(x[1]), reverse=True)
|
| 145 |
+
for intent, kw in all_kw:
|
| 146 |
+
if kw in t:
|
| 147 |
+
return intent
|
| 148 |
+
return None
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _looks_english(text: str) -> bool:
|
| 152 |
+
"""Heuristic: if text contains no Hausa-specific characters and is majority
|
| 153 |
+
ASCII, treat as English and skip NLLB translation. Hausa uses ɓ, ɗ, ƙ, ƴ
|
| 154 |
+
and the apostrophe in 'a'a', 'ma'auni', 'jumma'a' etc."""
|
| 155 |
+
hausa_chars = set("ɓɗƙƴƁƊƘƳ")
|
| 156 |
+
if any(c in hausa_chars for c in text):
|
| 157 |
+
return False
|
| 158 |
+
# Common Hausa words — if any match, treat as Hausa
|
| 159 |
+
hausa_markers = {
|
| 160 |
+
"duba", "ma'auni", "toshe", "kati", "canjin", "kuɗi", "kudi",
|
| 161 |
+
"saya", "airtime", "bundle", "korafi", "bincika", "oda",
|
| 162 |
+
"sake", "tsara", "mayar", "kaya", "wakili", "mutum",
|
| 163 |
+
"sannu", "nagode", "don", "allah", "ka", "yana", "tana",
|
| 164 |
+
"dubu", "ɗari", "dari", "biyar", "biyu", "uku", "hudu", "huɗu",
|
| 165 |
+
}
|
| 166 |
+
tokens = set(text.lower().split())
|
| 167 |
+
return not bool(tokens & hausa_markers)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
# ---------------------------------------------------------------------------
|
| 171 |
# NLLB-200 Ha → En translation (lazy-loaded)
|
| 172 |
# ---------------------------------------------------------------------------
|
|
|
|
| 356 |
use_llm: bool = True) -> tuple[str, dict, str]:
|
| 357 |
"""
|
| 358 |
NLU. Returns (intent, entities, source) where source is one of:
|
| 359 |
+
- 'structural': deterministic extractor caught digits/amount/yes-no
|
| 360 |
+
- 'keyword': fast-path keyword matcher caught a common intent
|
| 361 |
+
- 'qwen_en': input was English, classified directly by Qwen
|
| 362 |
+
- 'nllb+qwen': translated via NLLB then classified via Qwen
|
| 363 |
- 'human_keyword': caught human-agent escape hatch by keyword
|
| 364 |
- 'unknown': nothing matched
|
| 365 |
"""
|
|
|
|
| 390 |
return yn, entities, "structural"
|
| 391 |
|
| 392 |
if expected == "name":
|
| 393 |
+
# Name is free-form; take the last token as a quick heuristic.
|
|
|
|
| 394 |
name = text.strip().split()[-1] if text.strip() else ""
|
| 395 |
if name:
|
| 396 |
entities["name"] = name
|
|
|
|
| 400 |
entities["date"] = text.strip()
|
| 401 |
return "provide_date", entities, "structural"
|
| 402 |
|
| 403 |
+
# Layer 1.5: Keyword fast-path for common intents (Hausa + English).
|
| 404 |
+
# Runs in ANY state so users can pivot intent mid-flow ("actually I want
|
| 405 |
+
# to transfer money instead"). Structural extractors above already
|
| 406 |
+
# claimed strict-slot cases, so if we're in a slot-filling state and
|
| 407 |
+
# the text didn't match the slot, it's fair game to re-interpret as a
|
| 408 |
+
# new intent.
|
| 409 |
+
kw_intent = _match_intent_keyword(text)
|
| 410 |
+
if kw_intent:
|
| 411 |
+
logger.info(f"NLU: keyword matched {text!r} → {kw_intent}")
|
| 412 |
+
return kw_intent, entities, "keyword"
|
| 413 |
+
|
| 414 |
+
# Layer 2: NLLB Ha → En (skip if input already English), then Qwen
|
| 415 |
if not use_llm:
|
| 416 |
+
logger.info(f"NLU: use_llm=False, returning unknown for {text!r}")
|
| 417 |
return "unknown", entities, "unknown"
|
| 418 |
|
| 419 |
+
if _looks_english(text):
|
| 420 |
+
logger.info(f"NLU: input looks English, skipping NLLB: {text!r}")
|
| 421 |
+
english_text = text
|
| 422 |
+
source_tag = "qwen_en"
|
| 423 |
+
else:
|
| 424 |
+
logger.info(f"NLU: translating Hausa via NLLB: {text!r}")
|
| 425 |
+
english_text = translate_ha_to_en(text)
|
| 426 |
+
if english_text is None:
|
| 427 |
+
logger.warning("NLU: NLLB failed, returning unknown")
|
| 428 |
+
return "unknown", entities, "unknown"
|
| 429 |
+
source_tag = "nllb+qwen"
|
| 430 |
|
| 431 |
qwen_result = _qwen_classify(english_text, expected)
|
| 432 |
if qwen_result is None:
|
| 433 |
+
logger.warning(f"NLU: Qwen returned no valid intent for {english_text!r}")
|
| 434 |
return "unknown", entities, "unknown"
|
| 435 |
|
| 436 |
intent, llm_entities = qwen_result
|
| 437 |
+
logger.info(f"NLU: Qwen classified {english_text!r} → intent={intent}")
|
| 438 |
|
| 439 |
+
# For free-text slots, pass the original Hausa text through
|
|
|
|
| 440 |
if expected == "bundle":
|
| 441 |
t = text.lower()
|
| 442 |
for b in ("rana", "mako", "wata"):
|
|
|
|
| 447 |
if expected == "text":
|
| 448 |
llm_entities["text"] = text.strip()
|
| 449 |
|
| 450 |
+
return intent, llm_entities, source_tag
|