yoursdvniel committed on
Commit
2fe23a7
·
verified ·
1 Parent(s): 44feed3

Create parser.py

Browse files
Files changed (1) hide show
  1. parser.py +105 -0
parser.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Dict, Optional, Tuple
3
+ from slugify import slugify
4
+ from .config import get_db
5
+
6
# -------------------------
# Helpers
# -------------------------
def normalize_term(s: str) -> str:
    """Canonicalize a term: lowercase, trim, and collapse runs of whitespace.

    Accepts None (treated as the empty string) so callers can pass optional
    fields straight through.
    """
    lowered = (s or "").strip().lower()
    return re.sub(r"\s+", " ", lowered)
+
12
def product_id_from_name(name: str) -> str:
    """Derive a stable, URL-safe document id from a product display name."""
    normalized = normalize_term(name)
    return slugify(normalized)
+
15
def load_alias_map() -> Tuple[Dict[str, Dict], Dict[str, Dict]]:
    """
    Reads all docs in 'products' and returns:
      alias2prod: { "amatamatisi": {"id": "tomatoes", "name": "Tomatoes"}, ... }
      prod_by_id: { "tomatoes": {"id": "tomatoes", "name": "Tomatoes"} }

    Every product's canonical name is also registered as one of its aliases,
    and all alias keys are normalized via normalize_term.
    """
    alias2prod: Dict[str, Dict] = {}
    prod_by_id: Dict[str, Dict] = {}
    db = get_db()
    for snap in db.collection("products").stream():
        pid = snap.id
        doc = snap.to_dict() or {}
        display_name = doc.get("name") or pid
        prod_by_id[pid] = {"id": pid, "name": display_name}
        # The display name itself counts as an alias alongside any listed ones.
        for alias in {display_name, *doc.get("aliases", [])}:
            alias2prod[normalize_term(alias)] = {"id": pid, "name": display_name}
    return alias2prod, prod_by_id
+
33
NUM_WORDS_EN = {
    "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
}

def parse_quantity(text: str) -> Optional[int]:
    """Extract an integer quantity from free text, or None if absent.

    Recognizes digits (with an optional isiZulu "ezi-" prefix, commas
    stripped) and the English number words one..ten.
    """
    t = re.sub(r"\s+", " ", (text or "").strip().lower())
    # Digits first: "ezi-3", "12", "1,000" all resolve here.
    digit_match = re.search(r"(?:ezi-)?(\d{1,6})\b", t.replace(",", ""))
    if digit_match is not None:
        return int(digit_match.group(1))
    # Fall back to spelled-out English numbers.
    for word, value in NUM_WORDS_EN.items():
        if re.search(rf"\b{word}\b", t):
            return value
    return None
+
47
def guess_product_from_phrase(phrase: str) -> str:
    """Heuristically pull a likely product name out of a free-text phrase.

    Prefers the tail of an "... of X" construction ("loaves of bread" ->
    "bread"); otherwise falls back to the last one or two alphabetic words.
    """
    t = re.sub(r"\s+", " ", (phrase or "").strip().lower())
    of_tail = re.search(r"\bof\s+([a-zA-Z][a-zA-Z ]+)$", t)
    if of_tail is not None:
        return of_tail.group(1).strip()
    words = [w for w in re.split(r"[^a-zA-Z]+", t) if w]
    if not words:
        # No alphabetic content at all; return the normalized phrase as-is.
        return t
    return words[-1] if len(words) < 2 else " ".join(words[-2:])
57
+
58
# Verb patterns covering English, isiZulu, and Sesotho buy/sell phrasing.
BUY_PAT = re.compile(
    r"\b(buy|bought|purchase|purchased|ngithenge|ke\s+rekile)\b", re.IGNORECASE
)
SELL_PAT = re.compile(
    r"\b(sell|sold|ngithengise|ke\s+rekisitse)\b", re.IGNORECASE
)

def detect_intent(text: str) -> str:
    """Classify a raw message as a 'buy', 'sell', or 'unknown' action."""
    if BUY_PAT.search(text) is not None:
        return "buy"
    if SELL_PAT.search(text) is not None:
        return "sell"
    return "unknown"
69
+
70
def split_phrases(raw: str) -> List[str]:
    """Split a message into candidate item phrases.

    Separators: commas, English 'and', isiZulu 'no', Sesotho 'le'/'na'.
    Empty fragments are dropped.
    """
    normalized = re.sub(r"\s+", " ", (raw or "").strip().lower())
    pieces = re.split(r"\s*,\s*|\s+and\s+|\s+no\s+|\s+le\s+|\s+na\s+", normalized)
    return [piece for piece in pieces if piece]
74
+
75
def best_alias_match(part: str, alias2prod: Dict[str, Dict]) -> Optional[Dict]:
    """Find the longest alias that appears as a whole word/phrase in *part*.

    Returns {"alias", "id", "name"} for the best hit, or None if no alias
    from the map occurs in the phrase.
    """
    normalized = re.sub(r"\s+", " ", (part or "").strip().lower())
    haystack = " " + normalized + " "
    best: Optional[Dict] = None
    for alias, meta in alias2prod.items():
        if not re.search(rf"\b{re.escape(alias)}\b", haystack):
            continue
        # Prefer the longest matching alias ("sour milk" beats "milk").
        if best is None or len(alias) > len(best["alias"]):
            best = {"alias": alias, "id": meta["id"], "name": meta["name"]}
    return best
83
+
84
def extract_items_from_text(text: str, alias2prod: Dict[str, Dict]) -> List[Dict]:
    """
    Parse a message into item entries.

    Returns [{ name, resolvedId (optional), quantity (optional) }], where
    resolvedId is present only when an alias from alias2prod matched.

    Fix: the leading-verb strip previously removed only BUY verbs, so sell
    messages (which detect_intent/SELL_PAT support) kept the verb inside the
    first phrase and polluted the guessed item name. Sell verbs are now
    stripped as well.
    """
    items: List[Dict] = []
    t = normalize_term(text)
    # Drop a leading transaction verb ("i bought", "sold", "ngithengise", ...)
    # so it is not mistaken for part of the first item phrase.
    t = re.sub(
        r"^\b(i\s+)?(bought|buy|purchased|sell|sold|ngithenge|ngithengise|ke\s+rekile|ke\s+rekisitse)\b[\s,:-]*",
        "",
        t,
    )
    for part in split_phrases(t):
        qty = parse_quantity(part)
        match = best_alias_match(part, alias2prod)
        if match:
            # Known product: carry the catalog id alongside the display name.
            items.append({"name": match["name"].title(), "resolvedId": match["id"], "quantity": qty})
        else:
            # Unknown product: fall back to a heuristic name guess.
            guess = guess_product_from_phrase(part)
            items.append({"name": guess.title(), "quantity": qty})
    return items
100
+
101
def interpret_message(text: str) -> Dict:
    """Top-level entry point: classify intent and extract items from a message.

    Loads the alias map fresh from the product catalog on every call.
    """
    alias2prod, _ = load_alias_map()
    return {
        "intent": detect_intent(text),
        "items": extract_items_from_text(text, alias2prod),
    }