Youmnaaaa commited on
Commit
e6dbeb3
·
verified ·
1 Parent(s): 8f0a3f2

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +11 -0
  2. README.md +42 -5
  3. app.py +654 -0
  4. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim Python base keeps the image small.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies first so Docker layer caching survives code-only edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# HuggingFace Spaces routes traffic to port 7860.
ENV PORT=7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,10 +1,47 @@
1
  ---
2
- title: GP.chatbot
3
- emoji: 📚
4
- colorFrom: purple
5
- colorTo: gray
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Beni Suef Chatbot
3
+ emoji: 🗺️
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
  ---
9
 
10
+ # 🗺️ Beni Suef Chatbot API
11
+
12
+ شاتبوت عربي للبحث عن الأماكن في مدينة بني سويف.
13
+
14
+ ## الـ Endpoints
15
+
16
+ | Method | URL | الوصف |
17
+ |--------|-----|-------|
18
+ | GET | `/` | تأكيد إن الـ API شغال |
19
+ | GET | `/health` | حالة الموديلز |
20
+ | POST | `/chat` | إرسال رسالة |
21
+ | DELETE | `/session/{id}` | مسح محادثة |
22
+
23
+ ## مثال على الاستخدام
24
+
25
+ ```bash
26
+ curl -X POST "https://youmnaaaa-gp-chatbot.hf.space/chat" \
27
+ -H "Content-Type: application/json" \
28
+ -d '{"message": "عايز مطعم قريب", "session_id": "user_1"}'
29
+ ```
30
+
31
+ **Response:**
32
+ ```json
33
+ {
34
+ "reply": "🍽️ لقيتلك ...",
35
+ "intent": "nearest_restaurant",
36
+ "confidence": 0.97,
37
+ "entities": {"location": "..."},
38
+ "session_id": "user_1",
39
+ "best_place": { ... }
40
+ }
41
+ ```
42
+
43
+ ## الموديلز المستخدمة
44
+
45
+ - **Intent Model:** AraBERT fine-tuned لتصنيف النوايا
46
+ - **Entity Model:** BERT-based NER للـ entities العربية
47
+ - **Semantic Model:** Multilingual Sentence Transformers للبحث الدلالي
app.py ADDED
@@ -0,0 +1,654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1iPAjeI3M04kA13lYenlROS96tUeCYakB
8
+ """
9
+
10
+ import os, re, json, math, random, pickle, joblib
11
+ import numpy as np
12
+ import pandas as pd
13
+ import torch
14
+
15
+ from datetime import datetime
16
+ from zoneinfo import ZoneInfo
17
+ from contextlib import asynccontextmanager
18
+
19
+ from fastapi import FastAPI, HTTPException
20
+ from fastapi.middleware.cors import CORSMiddleware
21
+ from pydantic import BaseModel
22
+ from typing import Optional
23
+
24
+ from sentence_transformers import SentenceTransformer, util
25
+ from transformers import (
26
+ AutoTokenizer,
27
+ AutoModelForSequenceClassification,
28
+ AutoModelForTokenClassification,
29
+ pipeline,
30
+ )
31
+ from huggingface_hub import snapshot_download
32
+
33
"""Paths"""

# Resolve the app directory; __file__ is undefined when the code is pasted
# into a notebook/REPL (this file was exported from Colab), hence the fallback.
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

# HuggingFace Model Repos
INTENT_REPO = "Youmnaaaa/intent-arabert-ff"
ENTITY_REPO = "Youmnaaaa/entity-hybrid-ff"
SEMANTIC_REPO = "Youmnaaaa/semantic-search-ff"

# Places spreadsheet shipped inside the Space next to this file.
PLACES_FILE = os.path.join(BASE_DIR, "beni_suef_100_places_v5ff.xlsx")

# Model/data globals — all populated once by lifespan() at startup.
intent_tokenizer = intent_model = label_encoder = id2intent = None
ner_pipeline = label2id = id2label = None
semantic_model = corpus_df = corpus_embeddings = places_df = None
# In-memory per-user conversation state, keyed by session_id (not persistent).
SESSIONS: dict = {}
52
+
53
def clean_text(text):
    """Normalize text for matching: lowercase, strip tatweel, unify common
    Arabic letter variants, turn punctuation into spaces, collapse whitespace."""
    s = str(text).strip().lower()
    s = re.sub(r"ـ+", "", s)  # drop tatweel (kashida) elongation marks
    variants = (("[إأآا]", "ا"), ("ى", "ي"), ("ة", "ه"), ("ؤ", "و"), ("ئ", "ي"))
    for pattern, repl in variants:
        s = re.sub(pattern, repl, s)
    s = re.sub(r"[^\w\s]", " ", s)  # punctuation -> space
    s = re.sub(r"\s+", " ", s)
    return s.strip()
60
+
61
+
62
def norm(text):
    """Normalize free text for matching/parsing.

    Lowercases, strips tatweel, maps Arabic time words to short markers
    (ص/م) and range words (إلى/حتى, long dashes) to "-", then unifies common
    Arabic letter variants and collapses whitespace.

    The phrase replacements run BEFORE character normalization: the keys
    "إلى"/"الى"/"حتى" contain letters (إ, ى) that character normalization
    rewrites, so running them afterwards (as the previous version did) made
    those replacements dead code and broke time-range parsing such as
    "9 ص الى 5 م".
    """
    text = str(text).strip().lower()
    text = re.sub(r"ـ+", "", text)  # drop tatweel elongation marks
    # Phrase-level replacements first, while original spellings are intact.
    for old, new in [("صباحًا","ص"),("صباحا","ص"),("مساءً","م"),("مساءا","م"),
                     ("ليلًا","م"),("ليلا","م"),("إلى","-"),("الى","-"),("حتى","-"),
                     ("–","-"),("—","-")]:
        text = text.replace(old, new)
    # Character-level normalization of common Arabic variants.
    for old, new in [("[إأآا]","ا"),("ى","ي"),("ة","ه"),("ؤ","و"),("ئ","ي")]:
        text = re.sub(old, new, text)
    return re.sub(r"\s+", " ", text).strip()
72
+
73
# INTENT MAPS
# Intents that trigger a place search vs. canned-reply (static) intents.
SEARCH_INTENTS = {"nearest_restaurant","nearest_pharmacy","nearest_cafe",
                  "nearest_supermarket","housing_search","recommend_place",
                  "open_now","place_details"}
STATIC_INTENTS = {"greeting","thanks","goodbye","confirm","deny"}

# Default place category implied by each search intent.
INTENT_TO_CATEGORY = {
    "nearest_restaurant":"restaurant","nearest_pharmacy":"pharmacy",
    "nearest_cafe":"cafe","nearest_supermarket":"supermarket",
    "housing_search":"housing",
}
# Intent -> RESPONSE_TEMPLATES key (see get_template_key()).
INTENT_TEMPLATE_MAP = {
    "nearest_restaurant":"find_restaurant","nearest_pharmacy":"find_pharmacy",
    "nearest_cafe":"find_cafe","nearest_supermarket":"find_supermarket",
    "housing_search":"find_housing","recommend_place":"find_restaurant",
    "open_now":"find_restaurant","place_details":"find_restaurant",
    "greeting":"greeting","thanks":"thanks","goodbye":"goodbye",
    "confirm":"clarification","deny":"clarification","fallback":"fallback",
}
# NER entity-group name -> canonical slot name used by the search filters.
ENTITY_FIELD_MAP = {
    "location":"location","place_type":"category","cuisine_or_item":"sub_category",
    "food_type":"sub_category","price":"price","price_range":"price",
    "category":"category","sub_category":"sub_category","facility_type":"category",
    "housing_type":"category","status":"status","time":"time",
}
# Hard keyword triggers that bypass the intent classifier (confidence 1.0).
KEYWORD_OVERRIDE = {
    "goodbye": ["مع السلامة","مع السلامه","باي","وداعا","bye","goodbye","تصبح على خير",
                "في امان الله","الله يسلمك","سلامتك"],
    "greeting":["السلام عليكم","وعليكم السلام","اهلا","أهلا","هلا","هلو","مرحبا","مرحباً",
                "صباح الخير","مساء الخير","هاي","hi","hello","صباح","مساء"],
    "thanks": ["شكرا","شكراً","تسلم","يسلمو","ممنون","مشكور","thanks","thank","الف شكر"],
}
# Keyword cues used to infer a category directly from the query text.
CATEGORY_KEYWORDS = {
    "restaurant":["مطعم","اكل","وجبات","مشويات","كباب","شاورما","كريب","برجر","سمك","فرايد"],
    "pharmacy": ["صيدليه","صيدلية","دوا","ادويه","دواء"],
    "cafe": ["كافيه","كوفي","قهوه","قهوة","كافيتيريا"],
    "supermarket":["سوبرماركت","ماركت","بقاله","هايبر"],
    "housing": ["شقه","شقة","ايجار","إيجار","فندق","هوستل","سكن"],
}
# Follow-up question asked when a search intent yields no results.
CLARIFICATION_Q = {
    "nearest_restaurant":"أي نوع أكل؟ مشويات، شاورما، كريب، برجر؟",
    "nearest_pharmacy":"في أي منطقة بتدور على صيدلية؟",
    "nearest_cafe":"في أي منطقة بتدور على كافيه؟",
    "nearest_supermarket":"في أي منطقة بتدور على ماركت؟",
    "housing_search":"بتدور على إيه — شقة إيجار، فندق؟ وفين؟",
}
# Topics the bot refuses (weather, sports, politics, banking, homework, ...).
OUT_OF_SCOPE_KW = ["الجو","طقس","درجه","كوره","كرة","أهلي","زمالك","مباريات",
                   "سياسه","سياسة","أخبار","رصيد","بنك","تحويل","امتحان","مدرسه",
                   "جامعه","وظيفه","برمجه","كود","python","java","رياضيات","ترجمه","translate"]
# Follow-up cues: "another one", detail questions, pronoun references.
NEXT_WORDS = ["تاني","غيره","غيرها","بديل","حاجة تانية","مش عاجبني","فيه تاني","عايز غيره"]
DETAIL_WORDS = ["بيفتح","بتفتح","مواعيده","مواعيدها","امتى","امتي","عنوانه","عنوانها",
                "تليفونه","تليفونها","رقمه","رقمها","تقييمه","تقييمها","سعره","سعرها"]
REF_WORDS = ["هو","هي","ده","دي","المكان ده"]
# Known Beni Suef district names used to spot location-only follow-ups.
_LOC_CUES = ["الحي","بني سويف","الاباصيري","الكورنيش","مقبل","الزراعيين",
             "صلاح سالم","شرق النيل","سيتي سنتر","عرابي","الروضه"]
128
+
129
+ # HELPER FUNCTIONS
130
def apply_keyword_override(text):
    """Return a hard-coded intent when the text contains one of its trigger
    keywords; multi-word triggers match as substrings, single-word triggers
    must match a whole token. Returns None when nothing matches."""
    normalized = norm(text)
    tokens = set(normalized.split())
    for intent, keywords in KEYWORD_OVERRIDE.items():
        # Longest keywords first so more specific phrases win.
        for keyword in sorted(keywords, key=len, reverse=True):
            kw = norm(keyword)
            if " " in kw:
                if kw in normalized:
                    return intent
            elif kw in tokens:
                return intent
    return None
137
+
138
def get_template_key(intent, category=None):
    """Pick the response-template key: a known category takes precedence
    over the intent mapping; unknown intents fall back to "fallback"."""
    category_templates = {
        "restaurant": "find_restaurant",
        "pharmacy": "find_pharmacy",
        "cafe": "find_cafe",
        "supermarket": "find_supermarket",
        "housing": "find_housing",
    }
    if category and category in category_templates:
        return category_templates[category]
    return INTENT_TEMPLATE_MAP.get(intent, "fallback")
145
+
146
def infer_category(query):
    """Guess a place category from keywords in the query; None when no hit."""
    normalized = norm(query)
    return next(
        (cat for cat, words in CATEGORY_KEYWORDS.items()
         if any(norm(word) in normalized for word in words)),
        None,
    )
151
+
152
def is_out_of_scope(text):
    """True when the message mentions a topic the bot does not cover."""
    normalized = norm(text)
    for keyword in OUT_OF_SCOPE_KW:
        if norm(keyword) in normalized:
            return True
    return False
155
+
156
def detect_ref_type(text):
    """Classify a follow-up message.

    Returns "next" (user wants a different result), "detail" (asks about
    the last place), "reference" (pronoun pointing at the last place), or
    "new" (a fresh request).
    """
    normalized = norm(text)
    tokens = set(normalized.split())
    if any(norm(word) in normalized for word in NEXT_WORDS):
        return "next"
    if any(norm(word) in normalized for word in DETAIL_WORDS):
        return "detail"
    for word in REF_WORDS:
        ref = norm(word)
        # Phrases match as substrings; single words must be whole tokens.
        matched = ref in normalized if " " in ref else ref in tokens
        if matched:
            return "reference"
    return "new"
164
+
165
def _loc_continuation(text):
    """Heuristic: does this short message look like just a location
    (continuing the previous search) rather than a new request?"""
    normalized = norm(text)
    tokens = normalized.split()
    if len(tokens) <= 4 and any(norm(cue) in normalized for cue in _LOC_CUES):
        return True
    # Messages starting with "في" ("in ...") also read as a location.
    return bool(tokens) and tokens[0] == "في"
169
+
170
def normalize_rating(r):
    """Normalize a rating onto a 0-5 scale, rounded to one decimal.

    Values above 5 are assumed to be on a 10-point scale and are halved.
    Non-numeric, NaN, or non-positive input yields 0.0.
    """
    try:
        value = float(r)
    except (TypeError, ValueError):  # was a bare except — don't mask real bugs
        return 0.0
    if value > 5:
        return round(value / 2, 1)
    return round(value, 1) if value > 0 else 0.0
175
+
176
+ # TIME UTILS
177
+
178
def get_cairo_now():
    """Current wall-clock time in Cairo (timezone-aware datetime)."""
    cairo_tz = ZoneInfo("Africa/Cairo")
    return datetime.now(cairo_tz)
180
+
181
def parse_time(token):
    """Parse an Arabic clock token like "9", "9:30", "9 ص" into "HH:MM"
    (24-hour), or return None when it does not look like a valid time.

    Suffix "ص" means AM; "م"/"ظهر" mean PM/noon. norm() has already mapped
    long AM/PM words (صباحًا/مساءً/…) to these short markers.
    """
    token = norm(token).replace(" ", "")
    m = re.match(r"^(\d{1,2})(?::(\d{1,2}))?(ص|م|ظهر)?$", token)
    if not m: return None
    h = int(m.group(1)); mn = int(m.group(2)) if m.group(2) else 0; suf = m.group(3)
    if not (0 <= mn <= 59): return None
    if suf == "ص":
        # AM: "12 ص" is midnight; otherwise only 1-11 are valid.
        if h == 12: h = 0
        elif not (1 <= h <= 11): return None
    elif suf in ("م","ظهر"):
        # PM/noon: shift 1-11 to 13-23; 12 stays as noon.
        if h != 12 and 1 <= h <= 11: h += 12
    else:
        # No suffix: treat as 24-hour clock ("24" wraps to midnight).
        if h == 24: h = 0
        elif not (0 <= h <= 23): return None
    return f"{h:02d}:{mn:02d}"
196
+
197
def check_open_now(opening_hours_str):
    """Decide whether a place is open right now (Cairo time).

    Returns 1 (open), 0 (closed), or None when the opening-hours string is
    missing or cannot be parsed into a "start - end" range.
    """
    if not opening_hours_str or str(opening_hours_str).strip() in ("","nan","none"): return None
    text = norm(str(opening_hours_str))
    # Around-the-clock markers short-circuit to "open".
    if any(k in text for k in ["24","always","طول اليوم","24/7"]): return 1
    sep = re.search(r"(.+?)\s*-\s*(.+)", text)
    if not sep: return None
    t1 = parse_time(sep.group(1).strip()); t2 = parse_time(sep.group(2).strip())
    if not t1 or not t2: return None
    # Snapshot the clock ONCE: the previous version called get_cairo_now()
    # separately for hour and minute, which could straddle an hour boundary
    # and pair the old hour with the new minute.
    now = get_cairo_now()
    now_t = f"{now.hour:02d}:{now.minute:02d}"
    # Zero-padded "HH:MM" strings compare correctly as plain strings.
    if t1 <= t2: return 1 if t1 <= now_t <= t2 else 0
    # Overnight range, e.g. 20:00 - 02:00.
    return 1 if (now_t >= t1 or now_t <= t2) else 0
208
+
209
+ # SEARCH + FILTER + RANK
210
def semantic_candidates(query, top_k=20):
    """Return the top_k corpus rows most similar to the query.

    Encodes the cleaned query with the global sentence-transformer and
    ranks rows of the global corpus by cosine similarity; the returned
    frame carries an extra "semantic_score" column.
    """
    q_emb = semantic_model.encode(clean_text(query), convert_to_tensor=True)
    scores = util.cos_sim(q_emb, corpus_embeddings)[0]
    # Never ask torch.topk for more rows than the corpus has.
    top_k = min(top_k, len(corpus_df))
    top_r = torch.topk(scores, k=top_k)
    res = corpus_df.iloc[top_r.indices.cpu().numpy()].copy()
    res["semantic_score"] = top_r.values.cpu().numpy()
    # Keep only the columns that actually exist in the corpus frame.
    keep = [c for c in ["place_id","doc_id","name","category","sub_category","location",
                        "address","price_range","opening_hours","description","semantic_score"]
            if c in res.columns]
    return res[keep].reset_index(drop=True)
221
+
222
def merge_places(df):
    """Left-join extra place metadata (coords, rating, phone, cleaned text
    columns) from the global places table onto df via place_id."""
    wanted = ["lat","lon","rating","phone","social_media","status",
              "category_clean","sub_category_clean","location_clean",
              "address_clean","price_range_clean","search_text_clean"]
    available = [col for col in wanted if col in places_df.columns]
    subset = places_df[["place_id"] + available].copy()
    return df.merge(subset, on="place_id", how="left")
229
+
230
def apply_filters(df, query, category=None, sub_category=None, location=None,
                  price_range=None, open_now_only=False, min_rating=None):
    """Filter candidate places and attach scoring columns.

    Each provided facet (category/sub_category/location/price_range) is
    matched as a cleaned, regex-escaped substring against the corresponding
    *_clean column. Adds open_now / rating_* / open_score columns that
    rank() consumes.
    """
    f = df.copy()
    if category: f = f[f["category_clean"].astype(str).str.contains(re.escape(clean_text(category)), na=False)]
    if sub_category: f = f[f["sub_category_clean"].astype(str).str.contains(re.escape(clean_text(sub_category)), na=False)]
    if location: f = f[f["location_clean"].astype(str).str.contains(re.escape(clean_text(location)), na=False)]
    if price_range: f = f[f["price_range_clean"].astype(str).str.contains(re.escape(clean_text(price_range)), na=False)]
    f["open_now"] = f["opening_hours"].apply(check_open_now)
    # The fallback must be index-aligned: the previous `pd.Series()` default
    # produced an all-NaN column (and an empty-Series dtype warning) whenever
    # the "rating" column was missing.
    f["rating_num"] = pd.to_numeric(f.get("rating", pd.Series(0.0, index=f.index)), errors="coerce").fillna(0)
    f["rating_norm"] = f["rating_num"].apply(normalize_rating)
    f["rating_score"] = f["rating_norm"] / 5.0
    # Unknown opening hours (None) get a neutral half score.
    f["open_score"] = f["open_now"].apply(lambda x: 1.0 if x==1 else (0.5 if x is None else 0.0))
    if open_now_only: f = f[f["open_now"] == 1]
    if min_rating: f = f[f["rating_norm"] >= min_rating]
    return f
245
+
246
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two lat/lon points
    (haversine formula, mean Earth radius 6371 km)."""
    earth_radius_km = 6371
    deg = math.pi / 180  # degrees -> radians factor
    dlat_half = (lat2 - lat1) * deg / 2
    dlon_half = (lon2 - lon1) * deg / 2
    a = (math.sin(dlat_half) ** 2
         + math.cos(lat1 * deg) * math.cos(lat2 * deg) * math.sin(dlon_half) ** 2)
    return 2 * earth_radius_km * math.asin(math.sqrt(a))
250
+
251
def rank(df, query, user_lat=None, user_lon=None):
    """Score and sort candidate places.

    final_score is a weighted blend of semantic similarity, rating,
    open-now status, proximity (only when user coordinates are given), and
    an exact name match between the query and the place name.
    """
    df = df.copy()
    if user_lat and user_lon and "lat" in df.columns:
        # 999 km is the sentinel for "distance unknown / bad coordinates".
        def dist(row):
            try: return haversine(float(user_lat), float(user_lon), float(row["lat"]), float(row["lon"]))
            except: return 999
        df["distance_km"] = df.apply(dist, axis=1)
        # Normalize against the farthest real distance (sentinels excluded).
        mx = df["distance_km"].replace(999, np.nan).max() or 1
        df["distance_score"] = 1 - (df["distance_km"] / (mx + 1))
    else:
        df["distance_km"] = 999; df["distance_score"] = 0.0
    q_clean = clean_text(query)
    # 1.0 when the query mentions the place name (or vice versa).
    df["name_match_score"] = df["name"].apply(
        lambda n: 1.0 if clean_text(str(n)) in q_clean or q_clean in clean_text(str(n)) else 0.0)
    w = dict(semantic=0.40, rating=0.25, open=0.15, distance=0.10, name=0.10)
    # Missing score columns default to index-aligned zeros.
    df["final_score"] = (
        w["semantic"]*df.get("semantic_score", pd.Series(0,index=df.index)).fillna(0) +
        w["rating"] *df.get("rating_score", pd.Series(0,index=df.index)).fillna(0) +
        w["open"] *df.get("open_score", pd.Series(0,index=df.index)).fillna(0) +
        w["distance"]*df["distance_score"] + w["name"]*df["name_match_score"]
    )
    return df.sort_values("final_score", ascending=False).reset_index(drop=True)
273
+
274
def search_places(query, top_k_final=5, category=None, sub_category=None,
                  location=None, price_range=None, open_now_only=False,
                  min_rating=None, user_lat=None, user_lon=None):
    """Semantic search + filtering + ranking with progressive relaxation.

    Filters are tried strictest-first; each later attempt drops more
    constraints (sub_category, then price/open-now, then location) so the
    user gets *something* rather than an empty answer. Returns a ranked
    DataFrame of at most top_k_final rows (empty frame when even the most
    relaxed attempt matches nothing).
    """
    cands = semantic_candidates(query, top_k=20)
    merged = merge_places(cands)
    for attempt in [
        # 1) everything the user asked for
        dict(category=category, sub_category=sub_category, location=location,
             price_range=price_range, open_now_only=open_now_only, min_rating=min_rating),
        # 2) drop sub_category
        dict(category=category, sub_category=None, location=location,
             price_range=price_range, open_now_only=open_now_only, min_rating=min_rating),
        # 3) also drop price and open-now
        dict(category=category, sub_category=None, location=location,
             price_range=None, open_now_only=False, min_rating=min_rating),
        # 4) category only
        dict(category=category, sub_category=None, location=None,
             price_range=None, open_now_only=False, min_rating=None),
    ]:
        filtered = apply_filters(merged, query, **attempt)
        if not filtered.empty: break
    if filtered.empty: return pd.DataFrame()
    ranked = rank(filtered, query, user_lat, user_lon)
    # Keep only columns that survived the merge/rank pipeline.
    keep = [c for c in ["place_id","name","category","sub_category","location","address",
                        "price_range","rating","rating_norm","opening_hours","description",
                        "phone","lat","lon","semantic_score","final_score","open_now"]
            if c in ranked.columns]
    return ranked[keep].head(top_k_final).reset_index(drop=True)
298
+
299
+ # RESPONSE TEMPLATES + FORMATTERS
300
# Reply templates keyed by get_template_key(); each "find_*" entry is a list
# of format strings filled by build_response() with {name}/{location}/
# {price_info}/{rating_info}/{hours_info}/{address_info}/{description_short}.
# A random variant is chosen per reply.
RESPONSE_TEMPLATES = {
    "find_restaurant":[
        "🍽️ لقيتلك {name} في {location}. {price_info}{rating_info}{hours_info}",
        "أنصحك بـ {name} — هتلاقيه في {location}. {price_info}{rating_info}{hours_info}",
        "في {location} فيه {name}. {description_short}{price_info}{hours_info}",
    ],
    "find_pharmacy":[
        "💊 {name} في {location}.{hours_info}{rating_info}",
        "أقرب صيدلية ليك: {name} — {address_info}{hours_info}",
    ],
    "find_cafe":[
        "☕ {name} في {location}. {price_info}{rating_info}{hours_info}",
        "جرب {name} — في {location}. {description_short}{hours_info}",
    ],
    "find_supermarket":[
        "🛒 {name} في {location}.{hours_info}{rating_info}",
        "أقرب ماركت: {name} — {address_info}{hours_info}",
    ],
    "find_housing":[
        "🏠 {name} في {location}. {price_info}{description_short}",
        "فيه {name} في {location}. {price_info}{rating_info}",
    ],
    "greeting": ["أهلاً! 😊 أنا بساعدك تلاقي أي مكان في بني سويف. عايز إيه؟",
                 "وعليكم السلام! قولي محتاج إيه — مطعم، صيدلية، كافيه؟",
                 "هلا بيك! محتاج إيه في بني سويف؟ 😊"],
    "thanks": ["العفو! 😊 في حاجة تانية أساعدك فيها؟","أي خدمة! 😊","بكل سرور! 😊"],
    "goodbye": ["مع السلامة! 👋","سلامتك! أي وقت محتاج مساعدة أنا هنا.","باي! ربنا يوفقك 😊"],
    "clarification":["😊 قصدك إيه بالظبط؟","ممكن توضح أكتر؟","تمام! بتدور على إيه بالظبط؟"],
    "no_result": ["😔 مش لاقي حاجة مناسبة. جرب تغير المنطقة أو تسأل بطريقة تانية.",
                  "معلش، مفيش نتايج. ممكن تحدد المنطقة أو النوع أكتر؟"],
    "fallback": ["آسف، مش فاهم قصدك. 😊 قولي محتاج إيه — مطعم، صيدلية، كافيه؟",
                 "ممكن تسألني عن أي مكان في بني سويف وأنا هساعدك! 😊"],
}
333
+
334
def fmt_price(x):
    """Render a price-range value as a short Arabic sentence ("" if unset)."""
    raw = str(x).strip().lower()
    if not raw or raw in ("", "nan", "none"):
        return ""
    labels = {"cheap":"الأسعار رخيصة","رخيص":"الأسعار رخيصة","اقتصادي":"الأسعار اقتصادية",
              "medium":"الأسعار متوسطة","متوسط":"الأسعار متوسطة",
              "expensive":"الأسعار غالية","غالي":"الأسعار غالية"}
    for needle, sentence in labels.items():
        if needle in raw:
            return sentence + ". "
    return f"السعر: {x}. "
343
+
344
def fmt_rating(x):
    """Render a rating as "تقييمه <r> <stars>. ", or "" when absent/zero."""
    try:
        r = normalize_rating(float(x))
    except (TypeError, ValueError):  # was a bare except — don't mask real bugs
        return ""
    if r <= 0:
        return ""
    stars = min(round(r), 5)  # never render more than five stars
    return f"تقييمه {r} {'⭐'*stars}. "
349
+
350
def fmt_hours(x):
    """Render opening hours as an Arabic snippet ("" when unknown)."""
    hours = str(x).strip()
    if not hours or hours in ("", "nan", "none"):
        return ""
    lowered = hours.lower()
    if any(marker in lowered for marker in ["24", "always", "طول اليوم"]):
        return "مفتوح 24 ساعة. "
    return f"بيفتح: {hours}. "
355
+
356
def fmt_addr(address, location):
    """Prefer the street address, fall back to the district, else ""."""
    addr = str(address).strip()
    loc = str(location).strip()
    if addr and addr not in ("", "nan", "none"):
        return f"عنوانه: {addr}. "
    if loc and loc not in ("", "nan", "none"):
        return f"في {loc}. "
    return ""
361
+
362
def fmt_desc(x, max_words=12):
    """Trim a description to at most max_words words ("" when unset).

    Long text is cut and suffixed with "..."; short text gets a trailing
    space so it concatenates cleanly into a sentence.
    """
    desc = str(x).strip()
    if not desc or desc in ("", "nan", "none"):
        return ""
    words = desc.split()
    if len(words) > max_words:
        return " ".join(words[:max_words]) + "..."
    return desc + " "
367
+
368
def build_response(place, intent, category=None):
    """Fill a random response template with the place's details.

    Falls back to a "no result" message when place is falsy, and appends a
    green/red open-status line when open_now is known (1 or 0).
    """
    if not place: return random.choice(RESPONSE_TEMPLATES["no_result"])
    tk = get_template_key(intent, category)
    reply = random.choice(RESPONSE_TEMPLATES[tk]).format(
        name = str(place.get("name","")).strip(),
        # Default district when the record has no location.
        location = str(place.get("location","")).strip() or "بني سويف",
        price_info = fmt_price(place.get("price_range","")),
        rating_info = fmt_rating(place.get("rating_norm", place.get("rating", 0))),
        hours_info = fmt_hours(place.get("opening_hours","")),
        address_info = fmt_addr(place.get("address",""), place.get("location","")),
        description_short= fmt_desc(place.get("description","")),
    )
    on = place.get("open_now")
    if on == 1: reply += "\n🟢 مفتوح دلوقتي."
    elif on == 0: reply += "\n🔴 مغلق دلوقتي."
    return reply
384
+
385
def handle_detail(text, place):
    """Answer a follow-up question (hours/address/price/rating/phone) about
    the place the user last asked about; generic summary when no keyword
    category matches."""
    if not place: return "مش فاكر إحنا اتكلمنا عن مكان. ممكن تسألني من الأول؟"
    t = norm(text); name = str(place.get("name","")).strip()
    if any(w in t for w in ["امتي","امتى","مواعيد","يفتح","تفتح","يقفل"]):
        st = "🟢 مفتوح" if place.get("open_now")==1 else "🔴 مغلق"
        return f"⏰ {name} — {fmt_hours(place.get('opening_hours',''))}\n{st} دلوقتي."
    if any(w in t for w in ["عنوان","فين","وصول","اوصل"]):
        # Fixed: was "\\n", which printed a literal backslash-n in the reply
        # instead of a line break (every other branch uses a real newline).
        return f"📍 {name} في {place.get('location','')}.\nالعنوان: {place.get('address','')}"
    if any(w in t for w in ["سعر","بكام","تكلف","غالي","رخيص"]):
        return f"💰 {name} — {fmt_price(place.get('price_range',''))}"
    if any(w in t for w in ["تقييم","نجوم"]):
        return f"⭐ {name} — {fmt_rating(place.get('rating_norm', place.get('rating',0)))}"
    if any(w in t for w in ["رقم","تليفون"]):
        phone = str(place.get("phone","")).strip()
        return f"📞 {name} — {phone}" if phone else f"معنديش رقم {name}."
    return f"📋 {name}:\n{fmt_desc(place.get('description',''), 20)}\n{fmt_hours(place.get('opening_hours',''))}{fmt_rating(place.get('rating_norm',0))}"
401
+
402
+ # PREDICT FUNCTIONS
403
+
404
def predict_intent(text, threshold=0.5):
    """Classify the message intent.

    Keyword overrides (greeting/thanks/goodbye) win outright with
    confidence 1.0; otherwise the AraBERT classifier runs and anything
    below `threshold` confidence is demoted to "fallback".
    """
    override = apply_keyword_override(text)
    if override: return {"intent": override, "confidence": 1.0}
    inputs = intent_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():  # inference only — no gradients needed
        outputs = intent_model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        pid = torch.argmax(probs, dim=1).item()
        conf = probs[0][pid].item()
    return {"intent": id2intent[pid] if conf >= threshold else "fallback", "confidence": round(conf, 4)}
414
+
415
def extract_entities(text, min_score=0.40):
    """Run the NER pipeline over text and return {field: cleaned value}.

    Entity groups are mapped through ENTITY_FIELD_MAP; when a field appears
    more than once the longest cleaned value wins. Spans shorter than two
    characters or scoring below min_score are dropped.
    """
    entities = {}
    for item in ner_pipeline([text])[0]:
        group = item["entity_group"].lower().strip()
        word = item["word"].strip().replace("##", "")  # strip WordPiece markers
        word = re.sub(r"\s+", " ", word).strip()
        if len(word) < 2 or float(item["score"]) < min_score:
            continue
        field = ENTITY_FIELD_MAP.get(group, group)
        cleaned = clean_text(word)
        previous = entities.get(field)
        if previous is None or len(cleaned) > len(clean_text(previous)):
            entities[field] = cleaned
    return entities
428
+
429
+ # SESSION
430
class Session:
    """Per-user conversation state: turn history, last intent/place/results,
    and sticky context slots (location/category/...) carried across turns."""

    def __init__(self, sid="default"):
        self.sid = sid
        self.history = []
        self.last_intent = None
        self.last_entities = {}
        self.last_place = None
        self.last_results = []
        self.result_pointer = 0
        self.context_slots = {}
        self.turns = 0

    def add(self, user, bot, intent, entities, place, results):
        """Record one exchange and update the rolling context."""
        turn_record = {"turn": self.turns, "user": user, "bot": bot,
                       "intent": intent, "entities": entities}
        self.history.append(turn_record)
        # Non-terminal intents become the "last" intent; search intents
        # additionally remember their entities for follow-ups.
        if intent and intent not in ("fallback", "no_result", "out_of_scope"):
            self.last_intent = intent
            if intent in SEARCH_INTENTS:
                self.last_entities = entities
        if place is not None:
            self.last_place = place
        if results:
            self.last_results = results
            self.result_pointer = 0
        self._slots(entities)
        self.turns += 1

    def _slots(self, ents):
        """Persist non-empty slot values into the sticky context."""
        for slot in ["location", "category", "sub_category", "price"]:
            value = ents.get(slot)
            if value and str(value).strip():
                self.context_slots[slot] = str(value).strip()

    def merge(self, new_ents):
        """Overlay fresh entities on the sticky context and return the merge
        (also persists the fresh slot values)."""
        combined = dict(self.context_slots)
        for key, value in new_ents.items():
            if value and str(value).strip():
                combined[key] = str(value).strip()
        self._slots(new_ents)
        return combined
460
+
461
+ # MAIN CHAT
462
+
463
def chat(text: str, session: Session, user_lat=None, user_lon=None):
    """Main dialogue step: route one user message through the pipeline.

    Order of checks: empty input -> out-of-scope topics -> follow-up
    references (detail / "another one") -> intent classification -> static
    replies -> fallback recovery via location continuation -> entity
    extraction + place search. Every path records the turn on the session
    and returns the same result dict shape.
    """
    result = dict(reply="", intent="", confidence=0.0, entities={}, best_place=None, all_results=[])

    # Empty or whitespace-only message.
    if not text or not text.strip():
        result.update(reply="الرجاء إدخال سؤال 😊", intent="fallback")
        session.add("", result["reply"], "fallback", {}, None, [])
        return result

    # Refuse topics outside place search.
    if is_out_of_scope(text):
        reply = "أنا متخصص في إيجاد الأماكن في بني سويف فقط. 😊\nممكن أساعدك تلاقي مطعم، صيدلية، كافيه، ماركت، أو سكن."
        result.update(reply=reply, intent="out_of_scope")
        session.add(text, reply, "out_of_scope", {}, None, [])
        return result

    # Follow-up about the previously returned place.
    ref = detect_ref_type(text)
    if ref == "detail" and session.last_place:
        reply = handle_detail(text, session.last_place)
        result.update(reply=reply, intent=session.last_intent or "detail", best_place=session.last_place)
        session.add(text, reply, result["intent"], {}, session.last_place, [])
        return result

    # "Another one" — advance the pointer over the cached result list.
    if ref == "next" and session.last_results:
        ptr = session.result_pointer + 1
        if ptr < len(session.last_results):
            session.result_pointer = ptr; nxt = session.last_results[ptr]; session.last_place = nxt
            reply = build_response(nxt, session.last_intent, category=nxt.get("category"))
            result.update(reply=reply, intent=session.last_intent, best_place=nxt)
        else:
            result.update(reply="😔 مفيش نتايج تانية. عايز أدور من الأول؟", intent="no_result")
        session.add(text, result["reply"], result["intent"], {}, result["best_place"], [])
        return result

    ir = predict_intent(text); intent = ir["intent"]; conf = ir["confidence"]
    result["intent"] = intent; result["confidence"] = conf

    # Canned replies for greeting/thanks/goodbye/confirm/deny.
    if intent in STATIC_INTENTS:
        result["reply"] = random.choice(RESPONSE_TEMPLATES[get_template_key(intent)])
        session.add(text, result["reply"], intent, {}, None, [])
        return result

    # Low-confidence fallback: a short location-looking message continues
    # the previous search intent instead of failing.
    if intent == "fallback":
        if session.last_intent in SEARCH_INTENTS and _loc_continuation(text):
            intent = session.last_intent; result["intent"] = intent
        else:
            result["reply"] = random.choice(RESPONSE_TEMPLATES["fallback"])
            session.add(text, result["reply"], "fallback", {}, None, [])
            return result

    # Any remaining non-search intent gets its template reply.
    if intent not in SEARCH_INTENTS:
        result["reply"] = random.choice(RESPONSE_TEMPLATES.get(get_template_key(intent), RESPONSE_TEMPLATES["fallback"]))
        session.add(text, result["reply"], intent, {}, None, [])
        return result

    # Search path: extract entities, merge with sticky session slots.
    ents = extract_entities(text); result["entities"] = ents
    merged = session.merge(ents)

    category = merged.get("category") or INTENT_TO_CATEGORY.get(intent) or infer_category(text)
    sub_cat = merged.get("sub_category")
    location = merged.get("location")
    price_range = merged.get("price")
    open_only = ("open_now" in intent or "place_details" in intent)

    df = search_places(text, top_k_final=5, category=category, sub_category=sub_cat,
                       location=location, price_range=price_range, open_now_only=open_only,
                       user_lat=user_lat, user_lon=user_lon)

    # No hits even after relaxation — ask a clarifying question if we have one.
    if df.empty:
        cl = CLARIFICATION_Q.get(intent, "")
        reply = random.choice(RESPONSE_TEMPLATES["no_result"]) + (f"\n\n💬 {cl}" if cl else "")
        result.update(reply=reply, intent="no_result")
        session.add(text, reply, "no_result", ents, None, [])
        return result

    all_res = df.to_dict(orient="records"); best = all_res[0]
    reply = build_response(best, intent, category=category)
    if len(all_res) > 1: reply += f"\n\n💬 فيه {len(all_res)} نتيجة — قولي 'تاني' لو عايز غيره."

    result.update(reply=reply, best_place=best, all_results=all_res)
    session.add(text, reply, intent, ents, best, all_res)
    return result
543
+
544
+ # STARTUP
545
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: download and load all models once at startup,
    build the cleaned places table, then yield for the app's lifetime."""
    global intent_tokenizer, intent_model, label_encoder, id2intent
    global ner_pipeline, label2id, id2label
    global semantic_model, corpus_df, corpus_embeddings, places_df

    print("⏳ Downloading models from HuggingFace …")

    # Pull the three model repos from the HuggingFace Model Hub.
    intent_local = snapshot_download(INTENT_REPO)
    entity_local = snapshot_download(ENTITY_REPO)
    semantic_local = snapshot_download(SEMANTIC_REPO)

    print("⏳ Loading Intent model …")
    intent_tokenizer = AutoTokenizer.from_pretrained(intent_local)
    intent_model = AutoModelForSequenceClassification.from_pretrained(intent_local)
    label_encoder = joblib.load(os.path.join(intent_local, "label_encoder.pkl"))
    id2intent = {i: lbl for i, lbl in enumerate(label_encoder.classes_)}
    intent_model.eval()  # inference mode

    print("⏳ Loading Entity model …")
    with open(os.path.join(entity_local, "label2id.json"), encoding="utf-8") as f: label2id = json.load(f)
    with open(os.path.join(entity_local, "id2label.json"), encoding="utf-8") as f: id2label = json.load(f)
    etok = AutoTokenizer.from_pretrained(entity_local)
    emod = AutoModelForTokenClassification.from_pretrained(entity_local)
    ner_pipeline = pipeline("token-classification", model=emod, tokenizer=etok, aggregation_strategy="first")

    print("⏳ Loading Semantic model …")
    semantic_model = SentenceTransformer(os.path.join(semantic_local, "model"))
    # NOTE(review): pickle.load is unsafe on untrusted data; acceptable here
    # only because the repo is the author's own.
    with open(os.path.join(semantic_local, "semantic_data.pkl"), "rb") as f:
        sd = pickle.load(f)
    corpus_df = sd["corpus_df"]; corpus_embeddings = sd["corpus_embeddings"]

    # Places spreadsheet bundled with the Space.
    places_df = pd.read_excel(PLACES_FILE)
    # Guarantee every expected column exists before cleaning.
    for col in ["place_id","name","category","sub_category","location","address",
                "price_range","rating","opening_hours","description","lat","lon"]:
        if col not in places_df.columns: places_df[col] = ""
    places_df = places_df.fillna("")
    # Precompute normalized columns used by substring filters.
    places_df["category_clean"] = places_df["category"].apply(clean_text)
    places_df["sub_category_clean"] = places_df["sub_category"].apply(clean_text)
    places_df["location_clean"] = places_df["location"].apply(clean_text)
    places_df["address_clean"] = places_df["address"].apply(clean_text)
    places_df["price_range_clean"] = places_df["price_range"].apply(clean_text)
    places_df["description_clean"] = places_df["description"].apply(clean_text)
    # One concatenated haystack column for broad text search.
    places_df["search_text_clean"] = (
        places_df["name"].astype(str)+" "+places_df["category"].astype(str)+" "+
        places_df["sub_category"].astype(str)+" "+places_df["location"].astype(str)+" "+
        places_df["description"].astype(str)
    ).apply(clean_text)

    print("✅ All models loaded!")
    yield
    print("Shutting down.")
598
+
599
+ # FASTAPI
600
app = FastAPI(title="Beni Suef Chatbot API", version="1.0.0", lifespan=lifespan)
# NOTE(review): CORS is wide open (any origin/method/header) — fine for a
# public demo Space, tighten for production deployments.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
602
+
603
+
604
class ChatRequest(BaseModel):
    """Body of POST /chat."""
    message: str  # the user's message text
    session_id: str = "default"  # reuse the same id to keep conversation context
    user_lat: Optional[float] = None  # optional GPS latitude for distance ranking
    user_lon: Optional[float] = None  # optional GPS longitude for distance ranking
609
+
610
class ChatResponse(BaseModel):
    """Body returned by POST /chat."""
    reply: str  # the bot's reply text
    intent: str  # resolved intent label
    confidence: float  # classifier confidence (1.0 for keyword overrides)
    entities: dict  # extracted entity slots
    session_id: str  # echoes the request's session id
    best_place: Optional[dict] = None  # top-ranked place, JSON-safe subset
617
+
618
+
619
@app.get("/")
def root():
    """Liveness probe: confirms the API process is up."""
    payload = {"status": "ok", "message": "Beni Suef Chatbot is running 🚀"}
    return payload
622
+
623
@app.get("/health")
def health():
    """Readiness probe: reports whether all models finished loading.

    Uses explicit `is not None` checks rather than truthiness: model/
    pipeline objects can define __len__ (torch containers do), making a
    loaded-but-empty object falsy and the old `all([...])` misleading.
    """
    loaded = all(obj is not None
                 for obj in (intent_model, ner_pipeline, semantic_model, places_df))
    return {"status": "healthy", "models_loaded": loaded}
627
+
628
@app.post("/chat", response_model=ChatResponse)
def chat_endpoint(req: ChatRequest):
    """Handle one chat turn, routing through chat() with the caller's session."""
    # Lazily create per-session state keyed by session_id.
    if req.session_id not in SESSIONS:
        SESSIONS[req.session_id] = Session(req.session_id)
    session = SESSIONS[req.session_id]
    try:
        result = chat(req.message, session, req.user_lat, req.user_lon)
    except Exception as e:
        # Surface any pipeline failure as a 500 with the error text.
        raise HTTPException(status_code=500, detail=str(e))

    best = result.get("best_place")
    if best:
        # Make the record JSON-safe: numpy scalars -> python floats,
        # NaN -> None, and keep only the whitelisted fields.
        best = {k: (float(v) if isinstance(v, (np.floating, np.integer)) else
                    (None if (isinstance(v, float) and np.isnan(v)) else v))
                for k, v in best.items()
                if k in ["place_id","name","category","sub_category","location","address",
                         "price_range","rating","opening_hours","description","phone",
                         "lat","lon","open_now","final_score"]}

    return ChatResponse(reply=result["reply"], intent=result["intent"],
                        confidence=result["confidence"], entities=result["entities"],
                        session_id=req.session_id, best_place=best)
650
+
651
@app.delete("/session/{session_id}")
def reset_session(session_id: str):
    """Forget a conversation; succeeds even if the session never existed."""
    SESSIONS.pop(session_id, None)
    return {"status": "reset", "session_id": session_id}
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn[standard]==0.30.6
3
+ pydantic==2.8.2
4
+ transformers==4.44.2
5
+ sentence-transformers==3.0.1
6
+ torch==2.4.1
7
+ numpy==1.26.4
8
+ pandas==2.2.2
9
+ openpyxl==3.1.5
10
+ scikit-learn==1.5.2
11
+ joblib==1.4.2
12
+ huggingface_hub==0.24.6