Spaces:
Sleeping
Sleeping
Create parser.py
Browse files
parser.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from typing import List, Dict, Optional, Tuple
|
| 3 |
+
from slugify import slugify
|
| 4 |
+
from .config import get_db
|
| 5 |
+
|
| 6 |
+
# -------------------------
|
| 7 |
+
# Helpers
|
| 8 |
+
# -------------------------
|
| 9 |
+
def normalize_term(s: str) -> str:
    """Canonicalize a term: lowercase, trim, and collapse whitespace runs to single spaces.

    ``None`` (or any falsy string) normalizes to the empty string.
    """
    cleaned = (s or "").strip().lower()
    return re.sub(r"\s+", " ", cleaned)
|
| 11 |
+
|
| 12 |
+
def product_id_from_name(name: str) -> str:
    """Derive a stable product document id (URL-safe slug) from a display name."""
    normalized = normalize_term(name)
    return slugify(normalized)
|
| 14 |
+
|
| 15 |
+
def load_alias_map() -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """
    Reads all docs in 'products' and returns:
      alias2prod: { "amatamatisi": {"id": "tomatoes", "name": "Tomatoes"}, ... }
      prod_by_id: { "tomatoes": {"id": "tomatoes", "name": "Tomatoes"} }

    Every product's display name is also registered as one of its aliases.
    """
    db = get_db()
    alias2prod: Dict[str, Dict] = {}
    prod_by_id: Dict[str, Dict] = {}
    for snap in db.collection("products").stream():
        pid = snap.id
        doc = snap.to_dict() or {}
        display = doc.get("name") or pid
        # The display name counts as an alias alongside any explicit ones.
        for alias in {display, *doc.get("aliases", [])}:
            alias2prod[normalize_term(alias)] = {"id": pid, "name": display}
        prod_by_id[pid] = {"id": pid, "name": display}
    return alias2prod, prod_by_id
|
| 32 |
+
|
| 33 |
+
# English number words recognized as quantities (one through ten).
NUM_WORDS_EN = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
}

def parse_quantity(text: str) -> Optional[int]:
    """Extract a quantity from free text, or return ``None`` when nothing matches.

    Recognizes bare digit runs (commas stripped first), the isiZulu
    "ezi-<digits>" form, and the English words one..ten.
    """
    t = normalize_term(text)
    # isiZulu pattern like "ezi-3" or standalone digits
    digit_hit = re.search(r"(?:ezi-)?(\d{1,6})\b", t.replace(",", ""))
    if digit_hit:
        return int(digit_hit.group(1))
    for word, value in NUM_WORDS_EN.items():
        if re.search(rf"\b{word}\b", t):
            return value
    return None
|
| 46 |
+
|
| 47 |
+
def guess_product_from_phrase(phrase: str) -> str:
    """Heuristically pick the likely product words from an unmatched phrase.

    Prefers whatever follows "of" ("loaves of bread" -> "bread"); otherwise
    falls back to the last one or two alphabetic tokens of the phrase.
    """
    t = normalize_term(phrase)
    of_match = re.search(r"\bof\s+([a-zA-Z][a-zA-Z ]+)$", t)  # "loaves of bread" -> bread
    if of_match:
        return of_match.group(1).strip()
    words = [w for w in re.split(r"[^a-zA-Z]+", t) if w]
    if not words:
        return t
    if len(words) >= 2:
        return " ".join(words[-2:])
    return words[-1]
|
| 57 |
+
|
| 58 |
+
# Purchase/sale verb patterns in English, isiZulu, and Sesotho.
BUY_PAT = re.compile(
    r"\b(buy|bought|purchase|purchased|ngithenge|ke\s+rekile)\b", re.IGNORECASE
)
SELL_PAT = re.compile(
    r"\b(sell|sold|ngithengise|ke\s+rekisitse)\b", re.IGNORECASE
)

def detect_intent(text: str) -> str:
    """Classify *text* as "buy", "sell", or "unknown" by verb pattern (buy wins ties)."""
    for label, pattern in (("buy", BUY_PAT), ("sell", SELL_PAT)):
        if pattern.search(text):
            return label
    return "unknown"
|
| 69 |
+
|
| 70 |
+
def split_phrases(raw: str) -> List[str]:
    """Split a message into item phrases.

    Separators are commas and the conjunctions 'and' (EN), 'no' (zu),
    'le'/'na' (st); empty fragments are dropped.
    """
    normalized = normalize_term(raw)
    pieces = re.split(r"\s*,\s*|\s+and\s+|\s+no\s+|\s+le\s+|\s+na\s+", normalized)  # 'and' (EN), 'no' (zu), 'le/na' (st)
    return [piece for piece in pieces if piece]
|
| 74 |
+
|
| 75 |
+
def best_alias_match(part: str, alias2prod: Dict[str, Dict]) -> Optional[Dict]:
    """Return the product whose alias appears as a whole word in *part*.

    When several aliases match, the longest alias wins. Returns ``None``
    when no alias matches. Result shape: {"alias", "id", "name"}.
    """
    haystack = f" {normalize_term(part)} "
    winner = None
    for alias, meta in alias2prod.items():
        if not re.search(rf"\b{re.escape(alias)}\b", haystack):
            continue
        # Keep the longest matching alias ("cherry tomatoes" beats "tomatoes").
        if winner is None or len(alias) > len(winner["alias"]):
            winner = {"alias": alias, "id": meta["id"], "name": meta["name"]}
    return winner
|
| 83 |
+
|
| 84 |
+
def extract_items_from_text(text: str, alias2prod: Dict[str, Dict]) -> List[Dict]:
    """
    Returns [{ name, resolvedId (optional), quantity (optional) }]

    Strips a leading intent verb phrase ("i bought", "ngithenge", ...),
    splits the remainder into item phrases, and resolves each against the
    alias map; unresolved phrases fall back to a heuristic name guess.
    """
    items: List[Dict] = []
    t = normalize_term(text)
    # Strip the lead-in verb. Fix: the original only removed BUY verbs, so a
    # sell message ("i sold 3 tomatoes") leaked "sold" into the guessed product
    # name; the pattern now mirrors SELL_PAT's verbs as well.
    t = re.sub(
        r"^\b(i\s+)?(bought|buy|purchased|sold|sell|ngithenge|ngithengise|ke\s+rekile|ke\s+rekisitse)\b[\s,:-]*",
        "",
        t,
    )
    for part in split_phrases(t):
        qty = parse_quantity(part)
        match = best_alias_match(part, alias2prod)
        if match:
            items.append({"name": match["name"].title(), "resolvedId": match["id"], "quantity": qty})
        else:
            guess = guess_product_from_phrase(part)
            items.append({"name": guess.title(), "quantity": qty})
    return items
|
| 100 |
+
|
| 101 |
+
def interpret_message(text: str) -> Dict:
    """Top-level entry point: classify the message and extract its line items.

    Returns {"intent": "buy"|"sell"|"unknown", "items": [...]}.
    """
    # Intent is detected first so it is computed even if the alias load is slow.
    result = {"intent": detect_intent(text)}
    alias_map, _ = load_alias_map()
    result["items"] = extract_items_from_text(text, alias_map)
    return result
|