Youmnaaaa committed on
Commit
354d127
·
verified ·
1 Parent(s): 47c7a92

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -692
app.py DELETED
@@ -1,692 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """app.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1iPAjeI3M04kA13lYenlROS96tUeCYakB
8
- """
9
-
10
- import os, re, json, math, random, pickle, joblib
11
- import numpy as np
12
- import pandas as pd
13
- import torch
14
-
15
- from datetime import datetime
16
- from zoneinfo import ZoneInfo
17
- from contextlib import asynccontextmanager
18
-
19
- from fastapi import FastAPI, HTTPException
20
- from fastapi.middleware.cors import CORSMiddleware
21
- from pydantic import BaseModel
22
- from typing import Optional
23
-
24
- from sentence_transformers import SentenceTransformer, util
25
- from transformers import (
26
- AutoTokenizer,
27
- AutoModelForSequenceClassification,
28
- AutoModelForTokenClassification,
29
- pipeline,
30
- )
31
- from huggingface_hub import snapshot_download
32
-
33
- """Paths"""
34
-
35
# Directory containing this file; falls back to the current working directory
# when __file__ is not defined.
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

# HuggingFace Model Repos
INTENT_REPO = "Youmnaaaa/intent-arabert-ff"
ENTITY_REPO = "Youmnaaaa/entity-hybrid-ff"
SEMANTIC_REPO = "Youmnaaaa/semantic-search-ff"

# Places spreadsheet, located inside the Space next to this file
PLACES_FILE = os.path.join(BASE_DIR, "beni_suef_100_places_v5ff.xlsx")

# Model and data handles. All start as None and are assigned by lifespan().
intent_tokenizer = intent_model = label_encoder = id2intent = None
ner_pipeline = label2id = id2label = None
semantic_model = corpus_df = corpus_embeddings = places_df = None
# In-memory Session objects keyed by session id (see chat_endpoint).
SESSIONS: dict = {}
52
-
53
def clean_text(text):
    """Return a lower-cased, punctuation-free, orthographically folded string.

    The value is stringified, trimmed and lower-cased; tatweel is dropped,
    alef/yeh/teh-marbuta/hamza-carrier variants are folded to a single form,
    every non-word, non-space character becomes a space, and whitespace runs
    collapse to one space.
    """
    # Single translation pass: None deletes the tatweel, the rest are 1:1 folds.
    fold_table = str.maketrans({
        "ـ": None,
        "إ": "ا", "أ": "ا", "آ": "ا",
        "ى": "ي",
        "ة": "ه",
        "ؤ": "و",
        "ئ": "ي",
    })
    folded = str(text).strip().lower().translate(fold_table)
    without_punct = re.sub(r"[^\w\s]", " ", folded)
    return re.sub(r"\s+", " ", without_punct).strip()
60
-
61
-
62
def norm(text):
    """Normalize text for keyword and opening-hours matching.

    Like clean_text, the value is stringified, trimmed, lower-cased, stripped
    of tatweel and folded (alef/yeh/teh-marbuta/hamza variants). Unlike
    clean_text, punctuation is kept. In addition:

    * the stand-alone words meaning "to"/"until" become the range separator "-";
    * morning/evening/night words become the short suffixes "ص" / "م";
    * en/em dashes become "-".

    Returns the normalized string with whitespace collapsed.
    """
    text = str(text).strip().lower()
    text = re.sub(r"ـ+", "", text)
    for old, new in [("[إأآا]", "ا"), ("ى", "ي"), ("ة", "ه"), ("ؤ", "و"), ("ئ", "ي")]:
        text = re.sub(old, new, text)
    # The folding above has already turned "إلى"/"الى"/"حتى" into "الي"/"حتي",
    # so the separator words must be matched in their folded form. Match whole
    # words only, otherwise "اليوم" (as in "طول اليوم") would be clobbered.
    text = re.sub(r"(?<!\S)(?:الي|حتي)(?!\S)", "-", text)
    for old, new in [("صباحًا", "ص"), ("صباحا", "ص"), ("مساءً", "م"), ("مساءا", "م"),
                     ("ليلًا", "م"), ("ليلا", "م"), ("–", "-"), ("—", "-")]:
        text = text.replace(old, new)
    return re.sub(r"\s+", " ", text).strip()
72
-
73
# INTENT MAPS
# Intents that trigger a place search in chat().
SEARCH_INTENTS = {"nearest_restaurant","nearest_pharmacy","nearest_cafe",
                  "nearest_supermarket","housing_search","recommend_place",
                  "open_now","place_details"}
# Intents answered directly from a canned template, without searching.
STATIC_INTENTS = {"greeting","thanks","goodbye","confirm","deny"}

# Default place category implied by a search intent.
INTENT_TO_CATEGORY = {
    "nearest_restaurant":"restaurant","nearest_pharmacy":"pharmacy",
    "nearest_cafe":"cafe","nearest_supermarket":"supermarket",
    "housing_search":"housing",
}
# Intent -> key into RESPONSE_TEMPLATES (used by get_template_key).
INTENT_TEMPLATE_MAP = {
    "nearest_restaurant":"find_restaurant","nearest_pharmacy":"find_pharmacy",
    "nearest_cafe":"find_cafe","nearest_supermarket":"find_supermarket",
    "housing_search":"find_housing","recommend_place":"find_restaurant",
    "open_now":"find_restaurant","place_details":"find_restaurant",
    "greeting":"greeting","thanks":"thanks","goodbye":"goodbye",
    "confirm":"clarification","deny":"clarification","fallback":"fallback",
}
# NER entity-group name (lower-cased) -> slot name used by the session/search code.
ENTITY_FIELD_MAP = {
    "location":"location","place_type":"category","cuisine_or_item":"sub_category",
    "food_type":"sub_category","price":"price","price_range":"price",
    "category":"category","sub_category":"sub_category","facility_type":"category",
    "housing_type":"category","status":"status","time":"time",
}
# Keywords that force an intent before the classifier runs (apply_keyword_override).
KEYWORD_OVERRIDE = {
    "goodbye": ["مع السلامة","مع السلامه","باي","وداعا","bye","goodbye","تصبح على خير",
                "في امان الله","الله يسلمك","سلامتك"],
    "greeting":["السلام عليكم","وعليكم السلام","اهلا","أهلا","هلا","هلو","مرحبا","مرحباً",
                "صباح الخير","مساء الخير","هاي","hi","hello","صباح","مساء"],
    "thanks": ["شكرا","شكراً","تسلم","يسلمو","ممنون","مشكور","thanks","thank","الف شكر"],
}
# Substrings used by infer_category to guess a category from the raw query.
CATEGORY_KEYWORDS = {
    "restaurant":["مطعم","اكل","وجبات","مشويات","كباب","شاورما","كريب","برجر","سمك","فرايد"],
    "pharmacy": ["صيدليه","صيدلية","دوا","ادويه","دواء"],
    "cafe": ["كافيه","كوفي","قهوه","قهوة","كافيتيريا"],
    "supermarket":["سوبرماركت","ماركت","بقاله","هايبر"],
    "housing": ["شقه","شقة","ايجار","إيجار","فندق","هوستل","سكن"],
}

# Arabic category word -> canonical English category (used by normalize_category).
_CAT_MAP = {
    "مطعم":"restaurant","مطاعم":"restaurant","طعام":"restaurant","اكل":"restaurant",
    "صيدليه":"pharmacy","صيدلية":"pharmacy","صيدله":"pharmacy","دواء":"pharmacy","دوا":"pharmacy",
    "كافيه":"cafe","كافية":"cafe","كوفي":"cafe","قهوه":"cafe","قهوة":"cafe","كافيتيريا":"cafe",
    "سوبرماركت":"supermarket","ماركت":"supermarket","بقاله":"supermarket","بقالة":"supermarket","هايبر":"supermarket",
    "شقه":"housing","شقة":"housing","ايجار":"housing","إيجار":"housing",
    "فندق":"housing","سكن":"housing","هوستل":"housing",
}
121
-
122
def normalize_category(cat):
    """Map a category value to one of the canonical English category names.

    Falsy input is returned unchanged. A value that is already canonical is
    returned stripped. Otherwise the value is looked up in _CAT_MAP, first
    exactly and then by substring containment in either direction. When
    nothing matches, the stripped string is returned as-is.
    """
    if not cat:
        return cat
    cat_s = str(cat).strip()
    # A whitespace-only value strips to "", and "" is a substring of every
    # key, so without this guard it would be reported as the first category.
    if not cat_s:
        return cat_s
    if cat_s in ("restaurant", "pharmacy", "cafe", "supermarket", "housing"):
        return cat_s
    if cat_s in _CAT_MAP:
        return _CAT_MAP[cat_s]
    for ar, en in _CAT_MAP.items():
        if ar in cat_s or cat_s in ar:
            return en
    return cat_s
133
# Follow-up question appended to the "no result" reply, per search intent (see chat()).
CLARIFICATION_Q = {
    "nearest_restaurant":"أي نوع أكل؟ مشويات، شاورما، كريب، برجر؟",
    "nearest_pharmacy":"في أي منطقة بتدور على صيدلية؟",
    "nearest_cafe":"في أي منطقة بتدور على كافيه؟",
    "nearest_supermarket":"في أي منطقة بتدور على ماركت؟",
    "housing_search":"بتدور على إيه — شقة إيجار، فندق؟ وفين؟",
}
# Substrings that mark a message as out of scope (is_out_of_scope).
OUT_OF_SCOPE_KW = ["الجو","طقس","درجه","كوره","كرة","أهلي","زمالك","مباريات",
                   "سياسه","سياسة","أخبار","رصيد","بنك","تحويل","امتحان","مدرسه",
                   "جامعه","وظيفه","برمجه","كود","python","java","رياضيات","ترجمه","translate"]
# Cue words used by detect_ref_type: "next result", "detail question", "pronoun reference".
NEXT_WORDS = ["تاني","غيره","غيرها","بديل","حاجة تانية","مش عاجبني","فيه تاني","عايز غيره"]
DETAIL_WORDS = ["بيفتح","بتفتح","مواعيده","مواعيدها","امتى","امتي","عنوانه","عنوانها",
                "تليفونه","تليفونها","رقمه","رقمها","تقييمه","تقييمها","سعره","سعرها"]
REF_WORDS = ["هو","هي","ده","دي","المكان ده"]
# Location cues used by _loc_continuation to recognise a short location-only follow-up.
_LOC_CUES = ["الحي","بني سويف","الاباصيري","الكورنيش","مقبل","الزراعيين",
             "صلاح سالم","شرق النيل","سيتي سنتر","عرابي","الروضه"]
149
-
150
- # HELPER FUNCTIONS
151
def apply_keyword_override(text):
    """Return the intent forced by KEYWORD_OVERRIDE, or None when no keyword hits.

    Multi-word keywords match as substrings of the normalized text; single-word
    keywords must equal one of its whitespace-separated tokens. Intents are
    tried in the dictionary's order and the first one with a hit wins.
    """
    normalized = norm(text)
    tokens = set(normalized.split())

    def hits(keyword):
        needle = norm(keyword)
        return (needle in normalized) if " " in needle else (needle in tokens)

    for intent, keywords in KEYWORD_OVERRIDE.items():
        if any(hits(keyword) for keyword in sorted(keywords, key=len, reverse=True)):
            return intent
    return None
158
-
159
def get_template_key(intent, category=None):
    """Pick the RESPONSE_TEMPLATES key for a reply.

    A recognised category takes precedence and yields "find_<category>";
    otherwise the key comes from INTENT_TEMPLATE_MAP, defaulting to "fallback".
    """
    if category:
        by_category = {
            name: f"find_{name}"
            for name in ("restaurant", "pharmacy", "cafe", "supermarket", "housing")
        }
        key = by_category.get(category)
        if key:
            return key
    return INTENT_TEMPLATE_MAP.get(intent, "fallback")
166
-
167
def infer_category(query):
    """Guess a category from keywords in the query; None when nothing matches.

    Categories are tried in CATEGORY_KEYWORDS order; a category matches when
    any of its normalized keywords is a substring of the normalized query.
    """
    normalized = norm(query)
    matches = (
        category
        for category, keywords in CATEGORY_KEYWORDS.items()
        if any(norm(keyword) in normalized for keyword in keywords)
    )
    return next(matches, None)
172
-
173
def is_out_of_scope(text):
    """Return True when the normalized text contains any OUT_OF_SCOPE_KW entry."""
    normalized = norm(text)
    for keyword in OUT_OF_SCOPE_KW:
        if norm(keyword) in normalized:
            return True
    return False
176
-
177
def detect_ref_type(text):
    """Classify how a message relates to the previous turn.

    Returns "next" (asks for another result), "detail" (asks about the last
    place), "reference" (refers to it by pronoun) or "new". The checks run in
    that order. NEXT_WORDS/DETAIL_WORDS match as substrings; REF_WORDS match
    as whole tokens unless the cue itself contains a space.
    """
    normalized = norm(text)
    tokens = set(normalized.split())

    for label, vocabulary in (("next", NEXT_WORDS), ("detail", DETAIL_WORDS)):
        for word in vocabulary:
            if norm(word) in normalized:
                return label

    for word in REF_WORDS:
        cue = norm(word)
        found = (cue in normalized) if " " in cue else (cue in tokens)
        if found:
            return "reference"
    return "new"
185
-
186
def _loc_continuation(text):
    """Tell whether a message looks like a bare location follow-up.

    True when the message has at most four tokens and contains one of the
    _LOC_CUES, or when its first token is "في".
    """
    normalized = norm(text)
    tokens = normalized.split()
    if len(tokens) <= 4:
        for cue in _LOC_CUES:
            if norm(cue) in normalized:
                return True
    return len(tokens) > 0 and tokens[0] == "في"
190
-
191
def normalize_rating(r):
    """Convert a rating to a one-decimal value on a 0-5 scale.

    Values above 5 are halved, values in (0, 5] are kept, and anything else
    (non-positive, NaN, or not convertible to float) yields 0.0.
    """
    try:
        r = float(r)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; a bare except would also
        # hide KeyboardInterrupt/SystemExit.
        return 0.0
    if r > 5:
        return round(r / 2, 1)
    if r > 0:
        return round(r, 1)
    return 0.0
196
-
197
- # TIME UTILS
198
-
199
def get_cairo_now():
    """Return the current timezone-aware datetime in the Africa/Cairo zone."""
    cairo = ZoneInfo("Africa/Cairo")
    return datetime.now(tz=cairo)
201
-
202
def parse_time(token):
    """Parse a clock token such as "9", "9:30", "9ص" or "11 م" into "HH:MM".

    The token is normalized with norm() and stripped of spaces first. The
    optional suffix is "ص" (AM) or "م"/"ظهر" (PM). Returns None when the token
    does not match the pattern or the hour/minute is out of range.
    """
    token = norm(token).replace(" ", "")
    m = re.match(r"^(\d{1,2})(?::(\d{1,2}))?(ص|م|ظهر)?$", token)
    if not m:
        return None
    hour = int(m.group(1))
    minute = int(m.group(2)) if m.group(2) else 0
    suffix = m.group(3)
    if not (0 <= minute <= 59):
        return None
    if suffix == "ص":
        if hour == 12:
            hour = 0  # 12 AM is midnight
        elif not (1 <= hour <= 11):
            return None
    elif suffix in ("م", "ظهر"):
        if 1 <= hour <= 11:
            hour += 12
        elif not (12 <= hour <= 23):
            # Previously unchecked: inputs like "25م" produced the invalid "25:00".
            # Hours 12-23 with a PM suffix are still accepted as already 24-hour.
            return None
    else:
        if hour == 24:
            hour = 0
        elif not (0 <= hour <= 23):
            return None
    return f"{hour:02d}:{minute:02d}"
217
-
218
def check_open_now(opening_hours_str):
    """Decide from a free-text opening-hours string whether a place is open now.

    Returns 1 (open), 0 (closed) or None (unknown/unparseable). "Now" is the
    current Africa/Cairo time from get_cairo_now().

    A parseable "<start> - <end>" range is evaluated first. Only when no range
    can be parsed do the round-the-clock keywords apply, so a closing time
    such as "24" no longer makes the place look permanently open.
    """
    if not opening_hours_str or str(opening_hours_str).strip().lower() in ("", "nan", "none"):
        return None
    text = norm(str(opening_hours_str))

    sep = re.search(r"(.+?)\s*-\s*(.+)", text)
    if sep:
        t1 = parse_time(sep.group(1).strip())
        t2 = parse_time(sep.group(2).strip())
        if t1 and t2:
            if t1 == t2:
                # Identical opening and closing time (e.g. "00:00 - 24:00")
                # is treated as open around the clock.
                return 1
            now = get_cairo_now()  # read the clock once so hour/minute agree
            now_t = f"{now.hour:02d}:{now.minute:02d}"
            if t1 < t2:
                return 1 if t1 <= now_t <= t2 else 0
            # Range wraps past midnight.
            return 1 if (now_t >= t1 or now_t <= t2) else 0

    if any(k in text for k in ["24", "always", "طول اليوم", "24/7"]):
        return 1
    return None
229
-
230
- # SEARCH + FILTER + RANK
231
def semantic_candidates(query, top_k=20):
    """Return the top_k corpus rows most similar to the query.

    The cleaned query is encoded with semantic_model and compared by cosine
    similarity to corpus_embeddings. The result is a copy of the matching
    corpus_df rows, limited to the known display columns that exist, with a
    "semantic_score" column and a fresh 0..n-1 index.
    """
    query_vec = semantic_model.encode(clean_text(query), convert_to_tensor=True)
    similarities = util.cos_sim(query_vec, corpus_embeddings)[0]
    best = torch.topk(similarities, k=min(top_k, len(corpus_df)))

    hits = corpus_df.iloc[best.indices.cpu().numpy()].copy()
    hits["semantic_score"] = best.values.cpu().numpy()

    wanted = ("place_id", "doc_id", "name", "category", "sub_category", "location",
              "address", "price_range", "opening_hours", "description", "semantic_score")
    present = [column for column in wanted if column in hits.columns]
    return hits[present].reset_index(drop=True)
242
-
243
def merge_places(df):
    """Left-join candidate rows with the extra columns held in places_df.

    Rows are joined on "place_id"; only the optional columns that actually
    exist in places_df are brought in.
    """
    optional = ("lat", "lon", "rating", "phone", "social_media", "status",
                "category_clean", "sub_category_clean", "location_clean",
                "address_clean", "price_range_clean", "search_text_clean")
    columns = ["place_id"]
    columns.extend(name for name in optional if name in places_df.columns)
    lookup = places_df[columns].copy()
    return df.merge(lookup, on="place_id", how="left")
250
-
251
def apply_filters(df, query, category=None, sub_category=None, location=None,
                  price_range=None, open_now_only=False, min_rating=None):
    """Filter candidate places and attach the scoring columns used by rank().

    Each given text filter keeps the rows whose corresponding "*_clean" column
    contains the cleaned filter value. Added columns: open_now (1/0/missing),
    rating_num, rating_norm (0-5), rating_score (0-1) and open_score (1.0 open,
    0.5 unknown, 0.0 closed). open_now_only and min_rating are applied last.

    `query` is accepted for interface compatibility and is not used here.
    Returns a new DataFrame; the input is not modified.
    """
    f = df.copy()
    text_filters = (
        ("category_clean", category),
        ("sub_category_clean", sub_category),
        ("location_clean", location),
        ("price_range_clean", price_range),
    )
    for column, wanted in text_filters:
        if wanted:
            pattern = re.escape(clean_text(wanted))
            f = f[f[column].astype(str).str.contains(pattern, na=False)]
    f = f.copy()  # detach from the filtered view before adding columns

    f["open_now"] = f["opening_hours"].apply(check_open_now)
    if "rating" in f.columns:
        ratings = pd.to_numeric(f["rating"], errors="coerce")
    else:
        # Index-aligned default so every row gets a real 0 rather than NaN.
        ratings = pd.Series(0.0, index=f.index)
    f["rating_num"] = ratings.fillna(0)
    f["rating_norm"] = f["rating_num"].apply(normalize_rating)
    f["rating_score"] = f["rating_norm"] / 5.0
    # pandas stores a None result as NaN once the column also holds numbers,
    # so "unknown" has to be detected with isna, not `is None`.
    f["open_score"] = f["open_now"].apply(
        lambda x: 0.5 if pd.isna(x) else (1.0 if x == 1 else 0.0))
    if open_now_only:
        f = f[f["open_now"] == 1]
    if min_rating:
        f = f[f["rating_norm"] >= min_rating]
    return f
266
-
267
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Coordinates are in decimal degrees; the Earth radius used is 6371 km.
    """
    R = 6371
    p = math.pi / 180
    a = (math.sin((lat2 - lat1) * p / 2) ** 2
         + math.cos(lat1 * p) * math.cos(lat2 * p) * math.sin((lon2 - lon1) * p / 2) ** 2)
    # Rounding can push `a` marginally outside [0, 1] for identical or
    # antipodal points, which would make sqrt/asin raise a domain error.
    # Comparisons (rather than min/max) leave NaN untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    return 2 * R * math.asin(math.sqrt(a))
271
-
272
def rank(df, query, user_lat=None, user_lon=None):
    """Score and sort candidate places, best first.

    final_score = 0.40*semantic + 0.25*rating + 0.15*open + 0.10*distance
                  + 0.10*name-match.

    Adds distance_km (999 when unknown), distance_score, name_match_score and
    final_score. Missing semantic/rating/open score columns count as 0.
    Returns a new DataFrame sorted by final_score with a fresh index.
    """
    df = df.copy()
    # Compare with None: 0.0 is a valid latitude/longitude.
    have_user_pos = user_lat is not None and user_lon is not None
    if have_user_pos and "lat" in df.columns and "lon" in df.columns:
        def dist(row):
            try:
                return haversine(float(user_lat), float(user_lon),
                                 float(row["lat"]), float(row["lon"]))
            except (TypeError, ValueError):
                return np.nan  # blank or non-numeric coordinates

        dist_km = pd.Series([dist(row) for _, row in df.iterrows()],
                            index=df.index, dtype="float64")
        farthest = dist_km.max()  # ignores NaN; NaN when no distance is known
        if pd.isna(farthest):
            df["distance_score"] = 0.0
        else:
            # Unknown distances score 0 rather than the large negative value
            # that the 999 sentinel would otherwise produce.
            df["distance_score"] = (1 - dist_km / (farthest + 1)).fillna(0.0)
        df["distance_km"] = dist_km.fillna(999)
    else:
        df["distance_km"] = 999
        df["distance_score"] = 0.0

    q_clean = clean_text(query)

    def name_match(name):
        n_clean = clean_text(str(name))
        # Both sides must be non-empty: "" is a substring of everything.
        if n_clean and q_clean and (n_clean in q_clean or q_clean in n_clean):
            return 1.0
        return 0.0

    df["name_match_score"] = df["name"].apply(name_match)
    w = dict(semantic=0.40, rating=0.25, open=0.15, distance=0.10, name=0.10)
    zeros = pd.Series(0, index=df.index)
    df["final_score"] = (
        w["semantic"] * df.get("semantic_score", zeros).fillna(0)
        + w["rating"] * df.get("rating_score", zeros).fillna(0)
        + w["open"] * df.get("open_score", zeros).fillna(0)
        + w["distance"] * df["distance_score"]
        + w["name"] * df["name_match_score"]
    )
    return df.sort_values("final_score", ascending=False).reset_index(drop=True)
294
-
295
def search_places(query, top_k_final=5, category=None, sub_category=None,
                  location=None, price_range=None, open_now_only=False,
                  min_rating=None, user_lat=None, user_lon=None):
    """Run the full search pipeline: candidates -> merge -> filter -> rank.

    Filters are relaxed step by step until something survives: first drop
    sub_category, then also price and the open-now requirement, finally keep
    only the category. Returns up to top_k_final ranked rows, or an empty
    DataFrame when every attempt is empty.
    """
    candidates = merge_places(semantic_candidates(query, top_k=20))

    strict = dict(category=category, sub_category=sub_category, location=location,
                  price_range=price_range, open_now_only=open_now_only,
                  min_rating=min_rating)
    relaxations = [
        strict,
        {**strict, "sub_category": None},
        {**strict, "sub_category": None, "price_range": None, "open_now_only": False},
        dict(category=category, sub_category=None, location=None,
             price_range=None, open_now_only=False, min_rating=None),
    ]

    for options in relaxations:
        filtered = apply_filters(candidates, query, **options)
        if not filtered.empty:
            break
    else:
        # No attempt produced a row.
        return pd.DataFrame()

    ranked = rank(filtered, query, user_lat, user_lon)
    output_columns = ("place_id", "name", "category", "sub_category", "location", "address",
                      "price_range", "rating", "rating_norm", "opening_hours", "description",
                      "phone", "lat", "lon", "semantic_score", "final_score", "open_now")
    present = [column for column in output_columns if column in ranked.columns]
    return ranked[present].head(top_k_final).reset_index(drop=True)
319
-
320
- # RESPONSE TEMPLATES + FORMATTERS
321
# Reply templates keyed by template key (see get_template_key). One entry is
# picked at random; the "find_*" entries are filled via str.format in
# build_response with name, location, price_info, rating_info, hours_info,
# address_info and description_short.
RESPONSE_TEMPLATES = {
    "find_restaurant":[
        "🍽️ لقيتلك {name} في {location}. {price_info}{rating_info}{hours_info}",
        "أنصحك بـ {name} — هتلاقيه في {location}. {price_info}{rating_info}{hours_info}",
        "في {location} فيه {name}. {description_short}{price_info}{hours_info}",
    ],
    "find_pharmacy":[
        "💊 {name} في {location}.{hours_info}{rating_info}",
        "أقرب صيدلية ليك: {name} — {address_info}{hours_info}",
    ],
    "find_cafe":[
        "☕ {name} في {location}. {price_info}{rating_info}{hours_info}",
        "جرب {name} — في {location}. {description_short}{hours_info}",
    ],
    "find_supermarket":[
        "🛒 {name} في {location}.{hours_info}{rating_info}",
        "أقرب ماركت: {name} — {address_info}{hours_info}",
    ],
    "find_housing":[
        "🏠 {name} في {location}. {price_info}{description_short}",
        "فيه {name} في {location}. {price_info}{rating_info}",
    ],
    "greeting": ["أهلاً! 😊 أنا بساعدك تلاقي أي مكان في بني سويف. عايز إيه؟",
                 "وعليكم السلام! قولي محتاج إيه — مطعم، صيدلية، كافيه؟",
                 "هلا بيك! محتاج إيه في بني سويف؟ 😊"],
    "thanks": ["العفو! 😊 في حاجة تانية أساعدك فيها؟","أي خدمة! 😊","بكل سرور! 😊"],
    "goodbye": ["مع السلامة! 👋","سلامتك! أي وقت محتاج مساعدة أنا هنا.","باي! ربنا يوفقك 😊"],
    "clarification":["😊 قصدك إيه بالظبط؟","ممكن توضح أكتر؟","تمام! بتدور على إيه بالظبط؟"],
    "no_result": ["😔 مش لاقي حاجة مناسبة. جرب تغير المنطقة أو تسأل بطريقة تانية.",
                  "معلش، مفيش نتايج. ممكن تحدد المنطقة أو النوع أكتر؟"],
    "fallback": ["آسف، مش فاهم قصدك. 😊 قولي محتاج إيه — مطعم، صيدلية، كافيه؟",
                 "ممكن تسألني عن أي مكان في بني سويف وأنا هساعدك! 😊"],
}
354
-
355
def fmt_price(x):
    """Render a price range as a short Arabic sentence fragment.

    Returns "" for blank/"nan"/"none" values, a canned phrase when the
    lower-cased value contains a known price word (first match in the listed
    order wins), and otherwise a generic fragment quoting the original value.
    """
    lowered = str(x).strip().lower()
    if lowered in ("", "nan", "none"):
        return ""
    phrases = (
        ("cheap", "الأسعار رخيصة"),
        ("رخيص", "الأسعار رخيصة"),
        ("اقتصادي", "الأسعار اقتصادية"),
        ("medium", "الأسعار متوسطة"),
        ("متوسط", "الأسعار متوسطة"),
        ("expensive", "الأسعار غالية"),
        ("غالي", "الأسعار غالية"),
    )
    matched = next((phrase for word, phrase in phrases if word in lowered), None)
    if matched is not None:
        return matched + ". "
    return f"السعر: {x}. "
364
-
365
def fmt_rating(x):
    """Render a rating as an Arabic fragment with star emojis.

    The value is normalized with normalize_rating; "" is returned when it is
    not numeric or normalizes to 0. At most five stars are shown.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        # Narrowed from a bare except: only conversion failures mean "no rating".
        return ""
    r = normalize_rating(value)
    stars = min(round(r), 5)
    return f"تقييمه {r} {'⭐'*stars}. " if r > 0 else ""
370
-
371
def fmt_hours(x):
    """Render opening hours as an Arabic sentence fragment.

    Returns "" for blank or placeholder values ("nan"/"none" in any case),
    a fixed "open 24 hours" phrase when the text contains a round-the-clock
    keyword, and otherwise a fragment quoting the hours text.
    """
    h = str(x).strip()
    # Compare case-insensitively: str(None) is "None" and some sources emit
    # "NaN", both of which used to be shown to the user verbatim.
    if h.lower() in ("", "nan", "none"):
        return ""
    if any(k in h.lower() for k in ["24", "always", "طول اليوم"]):
        return "مفتوح 24 ساعة. "
    return f"بيفتح: {h}. "
376
-
377
def fmt_addr(address, location):
    """Render where a place is: the address when present, else the location.

    Blank or placeholder values ("nan"/"none" in any case) are skipped;
    returns "" when neither value is usable.
    """
    placeholders = ("", "nan", "none")
    a = str(address).strip()
    l = str(location).strip()
    # Case-insensitive so that str(None) == "None" is not printed as an address.
    if a.lower() not in placeholders:
        return f"عنوانه: {a}. "
    if l.lower() not in placeholders:
        return f"في {l}. "
    return ""
382
-
383
def fmt_desc(x, max_words=12):
    """Return a description shortened to at most max_words words.

    Blank or placeholder values ("nan"/"none" in any case) give "". A
    truncated description ends with "..."; an untruncated one is returned
    with a trailing space.
    """
    d = str(x).strip()
    # Case-insensitive so that str(None) == "None" is not shown as a description.
    if d.lower() in ("", "nan", "none"):
        return ""
    words = d.split()
    if len(words) > max_words:
        return " ".join(words[:max_words]) + "..."
    return d + " "
388
-
389
def build_response(place, intent, category=None):
    """Build the reply text for one place.

    With no place, a random "no_result" template is returned. Otherwise a
    random template for the intent/category is filled from the place's fields,
    and an open/closed status line is appended when open_now is 1 or 0.
    """
    if not place:
        return random.choice(RESPONSE_TEMPLATES["no_result"])

    template = random.choice(RESPONSE_TEMPLATES[get_template_key(intent, category)])
    fields = {
        "name": str(place.get("name", "")).strip(),
        "location": str(place.get("location", "")).strip() or "بني سويف",
        "price_info": fmt_price(place.get("price_range", "")),
        "rating_info": fmt_rating(place.get("rating_norm", place.get("rating", 0))),
        "hours_info": fmt_hours(place.get("opening_hours", "")),
        "address_info": fmt_addr(place.get("address", ""), place.get("location", "")),
        "description_short": fmt_desc(place.get("description", "")),
    }
    reply = template.format(**fields)

    status = place.get("open_now")
    if status == 1:
        reply += "\n🟢 مفتوح دلوقتي."
    elif status == 0:
        reply += "\n🔴 مغلق دلوقتي."
    return reply
405
-
406
def handle_detail(text, place):
    """Answer a follow-up question about the previously returned place.

    The normalized text is checked, in order, for hours, address, price,
    rating and phone cue words; the first matching topic is answered. With no
    matching topic a short summary is returned. With no place, a fixed
    "I don't remember a place" message is returned.
    """
    if not place:
        return "مش فاكر إحنا اتكلمنا عن مكان. ممكن تسألني من الأول؟"
    t = norm(text)
    name = str(place.get("name", "")).strip()

    if any(w in t for w in ["امتي", "امتى", "مواعيد", "يفتح", "تفتح", "يقفل"]):
        reply = f"⏰ {name} — {fmt_hours(place.get('opening_hours',''))}"
        on = place.get("open_now")
        # Only state open/closed when it is actually known; an unknown status
        # used to be reported as closed.
        if on == 1:
            reply += "\n🟢 مفتوح دلوقتي."
        elif on == 0:
            reply += "\n🔴 مغلق دلوقتي."
        return reply
    if any(w in t for w in ["عنوان", "فين", "وصول", "اوصل"]):
        # Real newline: the previous "\\n" put a literal backslash-n in the reply.
        return f"📍 {name} في {place.get('location','')}.\nالعنوان: {place.get('address','')}"
    if any(w in t for w in ["سعر", "بكام", "تكلف", "غالي", "رخيص"]):
        return f"💰 {name} — {fmt_price(place.get('price_range',''))}"
    if any(w in t for w in ["تقييم", "نجوم"]):
        return f"⭐ {name} — {fmt_rating(place.get('rating_norm', place.get('rating',0)))}"
    if any(w in t for w in ["رقم", "تليفون"]):
        phone = str(place.get("phone", "")).strip()
        # Treat stringified missing values as "no phone number".
        if phone.lower() in ("", "nan", "none"):
            return f"معنديش رقم {name}."
        return f"📞 {name} — {phone}"
    return f"📋 {name}:\n{fmt_desc(place.get('description',''), 20)}\n{fmt_hours(place.get('opening_hours',''))}{fmt_rating(place.get('rating_norm',0))}"
422
-
423
- # PREDICT FUNCTIONS
424
-
425
def predict_intent(text, threshold=0.5):
    """Predict the intent of a message.

    A keyword override, when present, is returned with confidence 1.0.
    Otherwise the intent classifier is run; its top label is returned when
    its softmax probability reaches `threshold`, else "fallback". The
    confidence is rounded to four decimals.
    """
    forced = apply_keyword_override(text)
    if forced:
        return {"intent": forced, "confidence": 1.0}

    encoded = intent_tokenizer(text, return_tensors="pt", truncation=True,
                               padding=True, max_length=128)
    with torch.no_grad():
        logits = intent_model(**encoded).logits
        probabilities = torch.softmax(logits, dim=1)

    best_id = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0][best_id].item()
    label = id2intent[best_id] if confidence >= threshold else "fallback"
    return {"intent": label, "confidence": round(confidence, 4)}
435
-
436
def extract_entities(text, min_score=0.40):
    """Extract entity slots from a message with the NER pipeline.

    Spans shorter than two characters or scoring below `min_score` are
    dropped. Entity groups are renamed through ENTITY_FIELD_MAP and values are
    cleaned with clean_text. When several spans map to the same slot, the
    longest cleaned value wins. Returns a dict of slot -> cleaned value.
    """
    found = {}
    for span in ner_pipeline([text])[0]:
        group = span["entity_group"].lower().strip()
        # Drop word-piece markers, then collapse whitespace.
        surface = re.sub(r"##", "", span["word"].strip()).strip()
        surface = re.sub(r"\s+", " ", surface).strip()
        if len(surface) < 2 or float(span["score"]) < min_score:
            continue

        slot = ENTITY_FIELD_MAP.get(group, group)
        cleaned = clean_text(surface)
        if slot in found and len(cleaned) <= len(clean_text(found[slot])):
            continue
        found[slot] = cleaned
    return found
449
-
450
- # SESSION
451
class Session:
    """Conversation state for one session id.

    Tracks the turn history, the last meaningful intent, the last search
    entities/place/results (with a pointer into the results), and the slot
    values carried across turns.
    """

    def __init__(self, sid="default"):
        self.sid = sid
        self.history = []
        self.last_intent = None
        self.last_entities = {}
        self.last_place = None
        self.last_results = []
        self.result_pointer = 0
        self.context_slots = {}
        self.turns = 0

    def add(self, user, bot, intent, entities, place, results):
        """Record one turn and update the remembered state from it."""
        self.history.append(dict(turn=self.turns, user=user, bot=bot,
                                 intent=intent, entities=entities))
        # Failure intents never replace the last meaningful intent.
        if intent and intent not in ("fallback", "no_result", "out_of_scope"):
            self.last_intent = intent
            if intent in SEARCH_INTENTS:
                self.last_entities = entities
        if place is not None:
            self.last_place = place
        if results:
            # A new result list restarts paging from the first entry.
            self.last_results = results
            self.result_pointer = 0
        self._slots(entities)
        self.turns += 1

    def _slots(self, ents):
        """Copy the non-blank tracked slots from `ents` into context_slots."""
        for slot in ("location", "category", "sub_category", "price"):
            value = ents.get(slot)
            if value and str(value).strip():
                self.context_slots[slot] = str(value).strip()

    def merge(self, new_ents):
        """Return context slots overlaid with the non-blank new entities.

        The tracked slots among `new_ents` are also stored in context_slots.
        """
        fresh = {key: str(value).strip()
                 for key, value in new_ents.items()
                 if value and str(value).strip()}
        combined = {**self.context_slots, **fresh}
        self._slots(new_ents)
        return combined
481
-
482
- # MAIN CHAT
483
-
484
def chat(text: str, session: Session, user_lat=None, user_lon=None):
    """Handle one user message and return the reply payload.

    Returns a dict with keys reply, intent, confidence, entities, best_place
    and all_results. Every path records the turn on `session` before returning.

    Order of handling: empty input, out-of-scope keywords, detail question
    about the last place, request for the next result, static intents,
    fallback (optionally continued as a location follow-up), non-search
    intents, and finally the place search.
    """
    result = dict(reply="", intent="", confidence=0.0, entities={}, best_place=None, all_results=[])

    # Empty or whitespace-only message.
    if not text or not text.strip():
        result.update(reply="الرجاء إدخال سؤال 😊", intent="fallback")
        session.add("", result["reply"], "fallback", {}, None, [])
        return result

    # Topics the bot does not cover.
    if is_out_of_scope(text):
        reply = "أنا متخصص في إيجاد الأماكن في بني سويف فقط. 😊\nممكن أساعدك تلاقي مطعم، صيدلية، كافيه، ماركت، أو سكن."
        result.update(reply=reply, intent="out_of_scope")
        session.add(text, reply, "out_of_scope", {}, None, [])
        return result

    # Follow-ups that refer to the previous answer are handled before intent prediction.
    ref = detect_ref_type(text)
    if ref == "detail" and session.last_place:
        reply = handle_detail(text, session.last_place)
        result.update(reply=reply, intent=session.last_intent or "detail", best_place=session.last_place)
        session.add(text, reply, result["intent"], {}, session.last_place, [])
        return result

    if ref == "next" and session.last_results:
        ptr = session.result_pointer + 1
        if ptr < len(session.last_results):
            session.result_pointer = ptr; nxt = session.last_results[ptr]; session.last_place = nxt
            reply = build_response(nxt, session.last_intent, category=nxt.get("category"))
            result.update(reply=reply, intent=session.last_intent, best_place=nxt)
        else:
            result.update(reply="😔 مفيش نتايج تانية. عايز أدور من الأول؟", intent="no_result")
        # An empty results list is passed so the stored list and pointer are kept.
        session.add(text, result["reply"], result["intent"], {}, result["best_place"], [])
        return result

    ir = predict_intent(text); intent = ir["intent"]; conf = ir["confidence"]
    result["intent"] = intent; result["confidence"] = conf

    if intent in STATIC_INTENTS:
        result["reply"] = random.choice(RESPONSE_TEMPLATES[get_template_key(intent)])
        session.add(text, result["reply"], intent, {}, None, [])
        return result

    if intent == "fallback":
        # A short location-like message after a search continues that search.
        if session.last_intent in SEARCH_INTENTS and _loc_continuation(text):
            intent = session.last_intent; result["intent"] = intent
        else:
            result["reply"] = random.choice(RESPONSE_TEMPLATES["fallback"])
            session.add(text, result["reply"], "fallback", {}, None, [])
            return result

    if intent not in SEARCH_INTENTS:
        result["reply"] = random.choice(RESPONSE_TEMPLATES.get(get_template_key(intent), RESPONSE_TEMPLATES["fallback"]))
        session.add(text, result["reply"], intent, {}, None, [])
        return result

    # Search path: combine this turn's entities with the slots remembered so far.
    ents = extract_entities(text); result["entities"] = ents
    merged = session.merge(ents)

    # NOTE(review): a category remembered from an earlier turn takes precedence
    # over the category implied by the current intent — confirm this is intended.
    category = normalize_category(merged.get("category") or INTENT_TO_CATEGORY.get(intent) or infer_category(text))
    sub_cat = merged.get("sub_category")
    location = merged.get("location")
    price_range = merged.get("price")
    open_only = ("open_now" in intent or "place_details" in intent)

    df = search_places(text, top_k_final=5, category=category, sub_category=sub_cat,
                       location=location, price_range=price_range, open_now_only=open_only,
                       user_lat=user_lat, user_lon=user_lon)

    if df.empty:
        cl = CLARIFICATION_Q.get(intent, "")
        reply = random.choice(RESPONSE_TEMPLATES["no_result"]) + (f"\n\n💬 {cl}" if cl else "")
        result.update(reply=reply, intent="no_result")
        session.add(text, reply, "no_result", ents, None, [])
        return result

    all_res = df.to_dict(orient="records"); best = all_res[0]
    reply = build_response(best, intent, category=category)
    if len(all_res) > 1: reply += f"\n\n💬 فيه {len(all_res)} نتيجة — قولي 'تاني' لو عايز غيره."

    result.update(reply=reply, best_place=best, all_results=all_res)
    session.add(text, reply, intent, ents, best, all_res)
    return result
564
-
565
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load every model and data file at startup and publish them as globals.

    Downloads the three model repositories, loads the intent classifier with
    its label encoder, the NER pipeline with its label maps, the sentence
    encoder with its pickled corpus, and the places spreadsheet (adding the
    "*_clean" columns used for filtering). Control is yielded to the
    application once loading finishes.
    """
    global intent_tokenizer, intent_model, label_encoder, id2intent
    global ner_pipeline, label2id, id2label
    global semantic_model, corpus_df, corpus_embeddings, places_df

    print("⏳ Downloading models from HuggingFace …")

    # Download the models from the HuggingFace Model Hub
    intent_local = snapshot_download(INTENT_REPO)
    entity_local = snapshot_download(ENTITY_REPO)
    # The semantic repo is fetched too, but the loaders below address it by
    # repo id — presumably so they hit the local cache; the path is not needed.
    snapshot_download(SEMANTIC_REPO)

    print("⏳ Loading Intent model …")
    intent_tokenizer = AutoTokenizer.from_pretrained(intent_local)
    intent_model = AutoModelForSequenceClassification.from_pretrained(intent_local)
    label_encoder = joblib.load(os.path.join(intent_local, "label_encoder.pkl"))
    id2intent = {i: lbl for i, lbl in enumerate(label_encoder.classes_)}
    intent_model.eval()

    print("⏳ Loading Entity model …")
    with open(os.path.join(entity_local, "label2id.json"), encoding="utf-8") as f:
        label2id = json.load(f)
    with open(os.path.join(entity_local, "id2label.json"), encoding="utf-8") as f:
        id2label = json.load(f)
    etok = AutoTokenizer.from_pretrained(entity_local)
    emod = AutoModelForTokenClassification.from_pretrained(entity_local)
    ner_pipeline = pipeline("token-classification", model=emod, tokenizer=etok,
                            aggregation_strategy="first")

    print("⏳ Loading Semantic model …")
    # Use the shared constant instead of repeating the repo id literal.
    semantic_model = SentenceTransformer(SEMANTIC_REPO)
    from huggingface_hub import hf_hub_download
    pkl_path = hf_hub_download(
        repo_id=SEMANTIC_REPO,
        filename="semantic_data.pkl"
    )
    # SECURITY NOTE: pickle.load (and joblib.load above) execute arbitrary code
    # contained in the file. This is only acceptable while the source
    # repositories are fully trusted.
    with open(pkl_path, "rb") as f:
        sd = pickle.load(f)
    corpus_df = sd["corpus_df"]
    corpus_embeddings = sd["corpus_embeddings"]

    places_df = pd.read_excel(PLACES_FILE)
    # Guarantee the columns the rest of the code indexes directly.
    for col in ["place_id","name","category","sub_category","location","address",
                "price_range","rating","opening_hours","description","lat","lon"]:
        if col not in places_df.columns:
            places_df[col] = ""
    places_df = places_df.fillna("")
    places_df["category_clean"] = places_df["category"].apply(clean_text)
    places_df["sub_category_clean"] = places_df["sub_category"].apply(clean_text)
    places_df["location_clean"] = places_df["location"].apply(clean_text)
    places_df["address_clean"] = places_df["address"].apply(clean_text)
    places_df["price_range_clean"] = places_df["price_range"].apply(clean_text)
    places_df["description_clean"] = places_df["description"].apply(clean_text)
    places_df["search_text_clean"] = (
        places_df["name"].astype(str)+" "+places_df["category"].astype(str)+" "+
        places_df["sub_category"].astype(str)+" "+places_df["location"].astype(str)+" "+
        places_df["description"].astype(str)
    ).apply(clean_text)

    print("✅ All models loaded!")
    yield
    print("Shutting down.")
624
-
625
- # FASTAPI
626
# Application object; `lifespan` is registered so model loading runs at startup.
app = FastAPI(title="Beni Suef Chatbot API", version="1.0.0", lifespan=lifespan)
# NOTE(review): CORS allows every origin, method and header — confirm this is intended.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
628
-
629
-
630
class ChatRequest(BaseModel):
    """Request body of POST /chat."""
    # The user's message text.
    message: str
    # Key into SESSIONS; all requests without one share the "default" session.
    session_id: str = "default"
    # Optional user coordinates, forwarded to chat() for distance ranking.
    user_lat: Optional[float] = None
    user_lon: Optional[float] = None
635
-
636
class ChatResponse(BaseModel):
    """Response body of POST /chat."""
    # Reply text shown to the user.
    reply: str
    # Intent reported by chat() for this turn.
    intent: str
    # Confidence reported by chat() for that intent.
    confidence: float
    # Entities extracted from the message.
    entities: dict
    # Echo of the session id the request used.
    session_id: str
    # Top-ranked place, reduced to the public fields; None when there is none.
    best_place: Optional[dict] = None
    # All returned places, each reduced to the same public fields.
    place_cards: list = []
644
-
645
-
646
@app.get("/")
def root():
    """Liveness endpoint: report that the service is up."""
    payload = dict(status="ok", message="Beni Suef Chatbot is running 🚀")
    return payload
649
-
650
@app.get("/health")
def health():
    """Health endpoint: report whether the models and places data are loaded."""
    components = (intent_model, ner_pipeline, semantic_model, places_df is not None)
    ready = all(components)
    return {"status": "healthy", "models_loaded": ready}
654
-
655
# Place fields exposed to API clients.
_PLACE_CARD_FIELDS = ("place_id", "name", "category", "sub_category", "location", "address",
                      "price_range", "rating", "opening_hours", "description", "phone",
                      "lat", "lon", "open_now", "final_score")


def _json_safe(value):
    """Convert a NumPy scalar to a Python value and map NaN to None."""
    if isinstance(value, np.generic):
        value = value.item()  # keeps integers as int instead of coercing to float
    # Checked after unwrapping so NumPy NaN values are caught as well.
    if isinstance(value, float) and math.isnan(value):
        return None
    return value


def _place_card(place):
    """Reduce a place record to its public, JSON-serializable fields."""
    return {key: _json_safe(value)
            for key, value in place.items()
            if key in _PLACE_CARD_FIELDS}


@app.post("/chat", response_model=ChatResponse)
def chat_endpoint(req: ChatRequest):
    """Process one chat message within the caller's session.

    Creates the session on first use, runs chat(), and returns the reply with
    the best place and all result cards reduced to their public fields.

    Raises:
        HTTPException: status 500 with the error text when chat() fails.
    """
    if req.session_id not in SESSIONS:
        SESSIONS[req.session_id] = Session(req.session_id)
    session = SESSIONS[req.session_id]
    try:
        result = chat(req.message, session, req.user_lat, req.user_lon)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    best = result.get("best_place")
    if best:
        best = _place_card(best)

    all_cards = [_place_card(p) for p in result.get("all_results", [])]

    return ChatResponse(reply=result["reply"], intent=result["intent"],
                        confidence=result["confidence"], entities=result["entities"],
                        session_id=req.session_id, best_place=best,
                        place_cards=all_cards)
689
@app.delete("/session/{session_id}")
def reset_session(session_id: str):
    """Forget a session's state; unknown ids are ignored."""
    if session_id in SESSIONS:
        del SESSIONS[session_id]
    return dict(status="reset", session_id=session_id)