| """tutor/lang_detect.py — lightweight keyword-based language detection.""" |
| from __future__ import annotations |
| import re |
|
|
# Marker words per language code, all lower-case.  ``detect`` scores a text by
# counting how many of its tokens appear in each list.  Note that "six" is
# listed under both "fr" and "en".
_MARKERS = {
    "kin": [
        "mbiri", "gatatu", "kane", "gatanu",
        "ikibazo", "igisubizo", "komeza",
        "byiza", "wabitsinze", "ntangaza",
        "umwe", "babiri", "batatu", "bane", "batanu",
    ],
    "sw": [
        "moja", "mbili", "tatu", "nne", "tano",
        "sita", "saba", "nane", "tisa", "kumi",
        "jibu", "swali", "hesabu", "vizuri",
        "hongera", "endelea", "jaribu",
    ],
    "fr": [
        "un", "deux", "trois", "quatre", "cinq",
        "six", "sept", "huit", "neuf", "dix",
        "bonjour", "merci", "oui", "non",
        "la", "le", "les", "est", "une", "je", "tu",
        "combien", "réponse", "bravo", "essaie", "encore",
    ],
    "en": [
        "one", "two", "three", "four", "five",
        "six", "seven", "eight", "nine", "ten",
        "hello", "yes", "no", "the", "answer",
        "how", "many", "count", "add", "minus",
        "great", "amazing", "correct", "wrong", "next",
    ],
}
|
|
|
|
# Runs of lower-case Latin letters (ASCII plus Latin-1 accented letters).
# The input is lower-cased before matching, so no upper-case ranges are needed.
# U+00D7 (multiplication sign) and U+00F7 (division sign) sit inside the
# accented-letter span and are deliberately left out, so "deux×trois" yields
# two tokens.  Apostrophes act as separators: no marker word contains one, so
# splitting on them lets elided or quoted words ("c'est", "'hello'") reach the
# marker lookup.
_TOKEN_RE = re.compile(r"[a-zß-öø-ÿ]+")


def detect(text: str) -> str:
    """Guess the language of *text* by counting marker words.

    The text is lower-cased and split into runs of Latin letters.  Each
    language in ``_MARKERS`` scores one point for every token found in its
    marker list (repeated tokens count each time).

    Args:
        text: Input string to classify.

    Returns:
        The code of the single highest-scoring language.  ``"en"`` is
        returned when the text has no tokens, when no token matches any
        marker, or when English is among several tied top scorers.
        ``"mix"`` is returned when the top score is tied between languages
        that do not include English.
    """
    tokens = _TOKEN_RE.findall(text.lower())
    if not tokens:
        return "en"

    scores = {
        lang: sum(token in markers for token in tokens)
        for lang, markers in _MARKERS.items()
    }
    # default=0 keeps an empty marker table on the "nothing matched" path.
    best = max(scores.values(), default=0)
    if best == 0:
        return "en"

    winners = [lang for lang, score in scores.items() if score == best]
    if len(winners) == 1:
        return winners[0]
    # Ties resolve to English when it is a candidate; otherwise the text is
    # reported as mixed.
    return "en" if "en" in winners else "mix"
|
|
|
|
def reply_lang(detected: str, fallback: str = "en") -> str:
    """Pick the language code to reply in.

    Args:
        detected: A language code, e.g. the result of ``detect``.
        fallback: Code to use when *detected* is not a supported reply
            language.

    Returns:
        *detected* when it is one of "en", "fr", "kin" or "sw"; otherwise
        *fallback*, returned as given.
    """
    supported = ("en", "fr", "kin", "sw")
    if detected in supported:
        return detected
    return fallback
|
|