Toadoum commited on
Commit
ae6619f
·
verified ·
1 Parent(s): a87bd84

Update nlu.py

Browse files
Files changed (1) hide show
  1. nlu.py +120 -19
nlu.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- NLU — NLLB + Qwen pivot-through-English architecture.
3
 
4
  Flow:
5
  1. Deterministic structural extractors run FIRST on the original Hausa
@@ -8,15 +8,19 @@ Flow:
8
  for banks, and regex is faster + more reliable than any model for
9
  this sub-task.
10
 
11
- 2. If structural extractors don't match the expected slot type, the text
12
- is translated Hausa → English via NLLB-200, then classified by
13
- Qwen2.5-1.5B in English (where it is strong) into one of a small
14
- fixed set of intent labels.
15
 
16
- 3. If NLLB or Qwen fails, we return "unknown" cleanly — the dialogue
17
- manager will re-prompt.
 
 
18
 
19
- All models are lazy-loaded on first use. Cold-start downloads:
 
 
 
20
  - NLLB-200-distilled-600M: ~2.4 GB
21
  - Qwen2.5-1.5B-Instruct: ~3 GB
22
  """
@@ -88,6 +92,81 @@ def _contains_human_keyword(text: str) -> bool:
88
  return any(kw in t for kw in HUMAN_KEYWORDS)
89
 
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  # ---------------------------------------------------------------------------
92
  # NLLB-200 Ha → En translation (lazy-loaded)
93
  # ---------------------------------------------------------------------------
@@ -277,8 +356,10 @@ def parse(text: str, expected: Optional[str] = None,
277
  use_llm: bool = True) -> tuple[str, dict, str]:
278
  """
279
  NLU. Returns (intent, entities, source) where source is one of:
280
- - 'structural': deterministic extractor caught it (digits, amount, yes/no)
281
- - 'nllb+qwen': translated via NLLB and classified via Qwen
 
 
282
  - 'human_keyword': caught human-agent escape hatch by keyword
283
  - 'unknown': nothing matched
284
  """
@@ -309,8 +390,7 @@ def parse(text: str, expected: Optional[str] = None,
309
  return yn, entities, "structural"
310
 
311
  if expected == "name":
312
- # Name is free-form; take the last token as a quick heuristic. Qwen
313
- # would not help here — names don't translate meaningfully.
314
  name = text.strip().split()[-1] if text.strip() else ""
315
  if name:
316
  entities["name"] = name
@@ -320,22 +400,43 @@ def parse(text: str, expected: Optional[str] = None,
320
  entities["date"] = text.strip()
321
  return "provide_date", entities, "structural"
322
 
323
- # Layer 2: NLLB Ha → En, then Qwen classification
 
 
 
 
 
 
 
 
 
 
 
324
  if not use_llm:
 
325
  return "unknown", entities, "unknown"
326
 
327
- english_text = translate_ha_to_en(text)
328
- if english_text is None:
329
- return "unknown", entities, "unknown"
 
 
 
 
 
 
 
 
330
 
331
  qwen_result = _qwen_classify(english_text, expected)
332
  if qwen_result is None:
 
333
  return "unknown", entities, "unknown"
334
 
335
  intent, llm_entities = qwen_result
 
336
 
337
- # For free-text slots, pass the original Hausa text through (don't want
338
- # English-translated complaint text stored as a Hausa complaint)
339
  if expected == "bundle":
340
  t = text.lower()
341
  for b in ("rana", "mako", "wata"):
@@ -346,4 +447,4 @@ def parse(text: str, expected: Optional[str] = None,
346
  if expected == "text":
347
  llm_entities["text"] = text.strip()
348
 
349
- return intent, llm_entities, "nllb+qwen"
 
1
  """
2
+ NLU — NLLB + Qwen pivot-through-English architecture with keyword fast-path.
3
 
4
  Flow:
5
  1. Deterministic structural extractors run FIRST on the original Hausa
 
8
  for banks, and regex is faster + more reliable than any model for
9
  this sub-task.
10
 
11
+ 2. Keyword fast-path for common Hausa + English intent phrases. Matches
12
+ "check balance", "duba ma'auni", "canjin kuɗi", etc. in <10ms without
13
+ loading any model. This is what real voice bots use for 90% of turns.
 
14
 
15
+ 3. If structural + keyword layers don't match, the text is translated
16
+ Hausa → English via NLLB-200 (skipped if input is already English),
17
+ then classified by Qwen2.5-1.5B in English (where it is strong) into
18
+ one of a small fixed set of intent labels.
19
 
20
+ 4. If NLLB or Qwen fails, we return "unknown" cleanly — the dialogue
21
+ manager routes to a vertical-specific fallback prompt.
22
+
23
+ All heavy models are lazy-loaded on first use. Cold-start downloads:
24
  - NLLB-200-distilled-600M: ~2.4 GB
25
  - Qwen2.5-1.5B-Instruct: ~3 GB
26
  """
 
92
  return any(kw in t for kw in HUMAN_KEYWORDS)
93
 
94
 
95
+ # Keyword fast-path for common intents. Runs BEFORE NLLB+Qwen so that the
96
+ # scripted demo flows don't require a 6GB LLM load. Phrases are Hausa and
97
+ # English pairs that customers actually use. When none match, we fall
98
+ # through to NLLB+Qwen for paraphrases.
99
+ INTENT_KEYWORDS = {
100
+ "check_balance": [
101
+ "duba ma'auni", "ma'auni", "balance", "check balance",
102
+ "account balance", "how much", "kudin asusu",
103
+ ],
104
+ "block_card": [
105
+ "toshe kati", "block card", "cancel card", "freeze card",
106
+ "toshe", "lost card", "ɓatar da kati",
107
+ ],
108
+ "transfer_money": [
109
+ "canjin kuɗi", "canjin kudi", "transfer", "transfer money",
110
+ "send money", "aiki kuɗi", "aiki kudi",
111
+ ],
112
+ "buy_airtime": [
113
+ "saya airtime", "airtime", "buy airtime", "top up", "topup",
114
+ "recharge", "karɓi airtime",
115
+ ],
116
+ "buy_bundle": [
117
+ "saya bundle", "bundle", "buy bundle", "buy data", "data",
118
+ "internet", "megabyte",
119
+ ],
120
+ "complaint": [
121
+ "yin korafi", "korafi", "complaint", "complain", "problem",
122
+ "matsala", "file complaint",
123
+ ],
124
+ "check_order": [
125
+ "bincika oda", "oda", "check order", "order status", "my order",
126
+ "where is my order", "track order",
127
+ ],
128
+ "reschedule": [
129
+ "sake tsara", "reschedule", "change time", "another day",
130
+ "later", "tomorrow",
131
+ ],
132
+ "return_item": [
133
+ "mayar da kaya", "return", "return item", "send back", "mayar",
134
+ ],
135
+ }
136
+
137
+
138
+ def _match_intent_keyword(text: str) -> Optional[str]:
139
+ """Keyword fast-path for common customer-service intents.
140
+ Returns the intent name if a keyword matches, else None."""
141
+ t = text.lower().strip()
142
+ # Check longer phrases first so "check balance" wins over "check order"
143
+ all_kw = [(intent, kw) for intent, kws in INTENT_KEYWORDS.items() for kw in kws]
144
+ all_kw.sort(key=lambda x: len(x[1]), reverse=True)
145
+ for intent, kw in all_kw:
146
+ if kw in t:
147
+ return intent
148
+ return None
149
+
150
+
151
+ def _looks_english(text: str) -> bool:
152
+ """Heuristic: if text contains no Hausa-specific characters and is majority
153
+ ASCII, treat as English and skip NLLB translation. Hausa uses ɓ, ɗ, ƙ, ƴ
154
+ and the apostrophe in 'a'a', 'ma'auni', 'jumma'a' etc."""
155
+ hausa_chars = set("ɓɗƙƴƁƊƘƳ")
156
+ if any(c in hausa_chars for c in text):
157
+ return False
158
+ # Common Hausa words — if any match, treat as Hausa
159
+ hausa_markers = {
160
+ "duba", "ma'auni", "toshe", "kati", "canjin", "kuɗi", "kudi",
161
+ "saya", "airtime", "bundle", "korafi", "bincika", "oda",
162
+ "sake", "tsara", "mayar", "kaya", "wakili", "mutum",
163
+ "sannu", "nagode", "don", "allah", "ka", "yana", "tana",
164
+ "dubu", "ɗari", "dari", "biyar", "biyu", "uku", "hudu", "huɗu",
165
+ }
166
+ tokens = set(text.lower().split())
167
+ return not bool(tokens & hausa_markers)
168
+
169
+
170
  # ---------------------------------------------------------------------------
171
  # NLLB-200 Ha → En translation (lazy-loaded)
172
  # ---------------------------------------------------------------------------
 
356
  use_llm: bool = True) -> tuple[str, dict, str]:
357
  """
358
  NLU. Returns (intent, entities, source) where source is one of:
359
+ - 'structural': deterministic extractor caught digits/amount/yes-no
360
+ - 'keyword': fast-path keyword matcher caught a common intent
361
+ - 'qwen_en': input was English, classified directly by Qwen
362
+ - 'nllb+qwen': translated via NLLB then classified via Qwen
363
  - 'human_keyword': caught human-agent escape hatch by keyword
364
  - 'unknown': nothing matched
365
  """
 
390
  return yn, entities, "structural"
391
 
392
  if expected == "name":
393
+ # Name is free-form; take the last token as a quick heuristic.
 
394
  name = text.strip().split()[-1] if text.strip() else ""
395
  if name:
396
  entities["name"] = name
 
400
  entities["date"] = text.strip()
401
  return "provide_date", entities, "structural"
402
 
403
+ # Layer 1.5: Keyword fast-path for common intents (Hausa + English).
404
+ # Runs in ANY state so users can pivot intent mid-flow ("actually I want
405
+ # to transfer money instead"). Structural extractors above already
406
+ # claimed strict-slot cases, so if we're in a slot-filling state and
407
+ # the text didn't match the slot, it's fair game to re-interpret as a
408
+ # new intent.
409
+ kw_intent = _match_intent_keyword(text)
410
+ if kw_intent:
411
+ logger.info(f"NLU: keyword matched {text!r} → {kw_intent}")
412
+ return kw_intent, entities, "keyword"
413
+
414
+ # Layer 2: NLLB Ha → En (skip if input already English), then Qwen
415
  if not use_llm:
416
+ logger.info(f"NLU: use_llm=False, returning unknown for {text!r}")
417
  return "unknown", entities, "unknown"
418
 
419
+ if _looks_english(text):
420
+ logger.info(f"NLU: input looks English, skipping NLLB: {text!r}")
421
+ english_text = text
422
+ source_tag = "qwen_en"
423
+ else:
424
+ logger.info(f"NLU: translating Hausa via NLLB: {text!r}")
425
+ english_text = translate_ha_to_en(text)
426
+ if english_text is None:
427
+ logger.warning("NLU: NLLB failed, returning unknown")
428
+ return "unknown", entities, "unknown"
429
+ source_tag = "nllb+qwen"
430
 
431
  qwen_result = _qwen_classify(english_text, expected)
432
  if qwen_result is None:
433
+ logger.warning(f"NLU: Qwen returned no valid intent for {english_text!r}")
434
  return "unknown", entities, "unknown"
435
 
436
  intent, llm_entities = qwen_result
437
+ logger.info(f"NLU: Qwen classified {english_text!r} → intent={intent}")
438
 
439
+ # For free-text slots, pass the original Hausa text through
 
440
  if expected == "bundle":
441
  t = text.lower()
442
  for b in ("rana", "mako", "wata"):
 
447
  if expected == "text":
448
  llm_entities["text"] = text.strip()
449
 
450
+ return intent, llm_entities, source_tag