Spaces:
Running
Running
Upload 4 files
Browse files- Dockerfile +11 -0
- app.py +642 -0
- beni_suef_100_places_v5ff.xlsx +0 -0
- requirements.txt +11 -0
Dockerfile
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Slim Python base keeps the image small for the Space.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies in their own layer so code edits don't bust the pip cache.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code (app.py, saved models, places spreadsheet).
COPY . .

# 7860 is the conventional Hugging Face Spaces port; CMD pins the same value.
ENV PORT=7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
app.py
ADDED
|
@@ -0,0 +1,642 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""app.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colab.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1iPAjeI3M04kA13lYenlROS96tUeCYakB
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os, re, json, math, random, pickle, joblib
|
| 11 |
+
import numpy as np
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import torch
|
| 14 |
+
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from zoneinfo import ZoneInfo
|
| 17 |
+
from contextlib import asynccontextmanager
|
| 18 |
+
|
| 19 |
+
from fastapi import FastAPI, HTTPException
|
| 20 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 21 |
+
from pydantic import BaseModel
|
| 22 |
+
from typing import Optional
|
| 23 |
+
|
| 24 |
+
from sentence_transformers import SentenceTransformer, util
|
| 25 |
+
from transformers import (
|
| 26 |
+
AutoTokenizer,
|
| 27 |
+
AutoModelForSequenceClassification,
|
| 28 |
+
AutoModelForTokenClassification,
|
| 29 |
+
pipeline,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
"""Paths"""
|
| 33 |
+
|
| 34 |
+
# Resolve the directory holding the model folders and data file; fall back to
# the CWD when __file__ is absent (e.g. interactive / notebook execution).
try:
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    BASE_DIR = os.getcwd()

INTENT_REPO = os.getenv("INTENT_REPO", "YOUR_USERNAME/intent_arabert_saved_ff")
# BUG FIX: the startup code loads the intent model from INTENT_DIR, which was
# never defined anywhere (only INTENT_REPO existed) -> NameError at startup.
INTENT_DIR = INTENT_REPO
ENTITY_DIR = os.path.join(BASE_DIR, "entity_hybrid_saved_ff")
SEMANTIC_DIR = os.path.join(BASE_DIR, "semantic_search_saved_ff")
PLACES_FILE = os.path.join(BASE_DIR, "beni_suef_100_places_v5ff.xlsx")

# Model/data handles populated by the FastAPI lifespan hook at startup.
intent_tokenizer = intent_model = label_encoder = id2intent = None
ner_pipeline = label2id = id2label = None
semantic_model = corpus_df = corpus_embeddings = places_df = None
# In-memory per-user conversation state, keyed by session id.
SESSIONS: dict = {}
|
| 48 |
+
|
| 49 |
+
def clean_text(text):
    """Normalize text for matching: lowercase, unify Arabic letter variants,
    drop tatweel, turn punctuation into spaces, collapse whitespace."""
    s = str(text).strip().lower()
    s = re.sub(r"ـ+", "", s)  # remove tatweel (kashida) stretching
    letter_variants = (("[إأآا]", "ا"), ("ى", "ي"), ("ة", "ه"), ("ؤ", "و"), ("ئ", "ي"))
    for pattern, replacement in letter_variants:
        s = re.sub(pattern, replacement, s)
    s = re.sub(r"[^\w\s]", " ", s)  # punctuation -> space
    s = re.sub(r"\s+", " ", s)
    return s.strip()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def norm(text):
    """Normalize text like clean_text (without punctuation stripping) and map
    time words: صباحًا->ص, مساءً->م, and range connectors (إلى/الى/حتى, dashes) -> '-'."""
    s = str(text).strip().lower()
    s = re.sub(r"ـ+", "", s)
    for old, new in [("[إأآا]","ا"),("ى","ي"),("ة","ه"),("ؤ","و"),("ئ","ي")]:
        s = re.sub(old, new, s)
    # BUG FIX: the letter normalization above has already rewritten ى->ي and إ->ا,
    # so the original replace keys "إلى"/"الى"/"حتى" could never match afterwards
    # and time ranges like "9 الى 5" were left unparsed. Match the normalized
    # forms as whole words (word-bounded, so e.g. "اليوم" is untouched).
    s = re.sub(r"\b(الي|حتي)\b", "-", s)
    for old, new in [("صباحًا","ص"),("صباحا","ص"),("مساءً","م"),("مساءا","م"),
                     ("ليلًا","م"),("ليلا","م"),("إلى","-"),("الى","-"),("حتى","-"),
                     ("–","-"),("—","-")]:
        s = s.replace(old, new)
    return re.sub(r"\s+", " ", s).strip()
|
| 68 |
+
|
| 69 |
+
# INTENT MAPS
|
| 70 |
+
# Intents that trigger a place search; everything else gets a canned reply.
SEARCH_INTENTS = {"nearest_restaurant","nearest_pharmacy","nearest_cafe",
                  "nearest_supermarket","housing_search","recommend_place",
                  "open_now","place_details"}
# Intents answered purely from RESPONSE_TEMPLATES, no search.
STATIC_INTENTS = {"greeting","thanks","goodbye","confirm","deny"}

# Default place category implied by each category-specific search intent.
INTENT_TO_CATEGORY = {
    "nearest_restaurant":"restaurant","nearest_pharmacy":"pharmacy",
    "nearest_cafe":"cafe","nearest_supermarket":"supermarket",
    "housing_search":"housing",
}
# Intent -> RESPONSE_TEMPLATES key; generic search intents reuse restaurant wording.
INTENT_TEMPLATE_MAP = {
    "nearest_restaurant":"find_restaurant","nearest_pharmacy":"find_pharmacy",
    "nearest_cafe":"find_cafe","nearest_supermarket":"find_supermarket",
    "housing_search":"find_housing","recommend_place":"find_restaurant",
    "open_now":"find_restaurant","place_details":"find_restaurant",
    "greeting":"greeting","thanks":"thanks","goodbye":"goodbye",
    "confirm":"clarification","deny":"clarification","fallback":"fallback",
}
# Raw NER entity-group names -> canonical slot names used for filtering.
ENTITY_FIELD_MAP = {
    "location":"location","place_type":"category","cuisine_or_item":"sub_category",
    "food_type":"sub_category","price":"price","price_range":"price",
    "category":"category","sub_category":"sub_category","facility_type":"category",
    "housing_type":"category","status":"status","time":"time",
}
# Hard-coded trigger phrases that bypass the intent classifier entirely.
KEYWORD_OVERRIDE = {
    "goodbye": ["مع السلامة","مع السلامه","باي","وداعا","bye","goodbye","تصبح على خير",
                "في امان الله","الله يسلمك","سلامتك"],
    "greeting":["السلام عليكم","وعليكم السلام","اهلا","أهلا","هلا","هلو","مرحبا","مرحباً",
                "صباح الخير","مساء الخير","هاي","hi","hello","صباح","مساء"],
    "thanks": ["شكرا","شكراً","تسلم","يسلمو","ممنون","مشكور","thanks","thank","الف شكر"],
}
# Keyword cues used to infer a place category directly from the query text.
CATEGORY_KEYWORDS = {
    "restaurant":["مطعم","اكل","وجبات","مشويات","كباب","شاورما","كريب","برجر","سمك","فرايد"],
    "pharmacy": ["صيدليه","صيدلية","دوا","ادويه","دواء"],
    "cafe": ["كافيه","كوفي","قهوه","قهوة","كافيتيريا"],
    "supermarket":["سوبرماركت","ماركت","بقاله","هايبر"],
    "housing": ["شقه","شقة","ايجار","إيجار","فندق","هوستل","سكن"],
}
|
| 108 |
+
# Follow-up question appended when a search intent yields no results, keyed by intent.
CLARIFICATION_Q = {
    "nearest_restaurant":"أي نوع أكل؟ مشويات، شاورما، كريب، برجر؟",
    "nearest_pharmacy":"في أي منطقة بتدور على صيدلية؟",
    "nearest_cafe":"في أي منطقة بتدور على كافيه؟",
    "nearest_supermarket":"في أي منطقة بتدور على ماركت؟",
    # BUG FIX: "إيجار" was mojibake (U+FFFD replacement characters) in the source.
    "housing_search":"بتدور على إيه — شقة إيجار، فندق؟ وفين؟",
}
|
| 115 |
+
# Topics the bot refuses: weather, sports, politics, news, banking, school, coding, ...
OUT_OF_SCOPE_KW = ["الجو","طقس","درجه","كوره","كرة","أهلي","زمالك","مباريات",
                   "سياسه","سياسة","أخبار","رصيد","بنك","تحويل","امتحان","مدرسه",
                   "جامعه","وظيفه","برمجه","كود","python","java","رياضيات","ترجمه","translate"]
# Follow-up cues: user wants another result ...
NEXT_WORDS = ["تاني","غيره","غيرها","بديل","حاجة تانية","مش عاجبني","فيه تاني","عايز غيره"]
# ... or details (hours/address/phone/rating/price) about the last place ...
DETAIL_WORDS = ["بيفتح","بتفتح","مواعيده","مواعيدها","امتى","امتي","عنوانه","عنوانها",
                "تليفونه","تليفونها","رقمه","رقمها","تقييمه","تقييمها","سعره","سعرها"]
# ... or refers back to it anaphorically ("it", "this place").
REF_WORDS = ["هو","هي","ده","دي","المكان ده"]
# District names used to spot a bare-location continuation of the previous search.
_LOC_CUES = ["الحي","بني سويف","الاباصيري","الكورنيش","مقبل","الزراعيين",
             "صلاح سالم","شرق النيل","سيتي سنتر","عرابي","الروضه"]
|
| 124 |
+
|
| 125 |
+
# HELPER FUNCTIONS
|
| 126 |
+
|
| 127 |
+
def apply_keyword_override(text):
    """Return the intent whose trigger keyword appears in the text, or None.

    Multi-word triggers match as substrings; single words must match a whole token.
    """
    normalized = norm(text)
    tokens = set(normalized.split())
    for intent, keywords in KEYWORD_OVERRIDE.items():
        # longest keywords first so multi-word phrases win over their sub-words
        for keyword in sorted(keywords, key=len, reverse=True):
            key_norm = norm(keyword)
            phrase_hit = " " in key_norm and key_norm in normalized
            if phrase_hit or key_norm in tokens:
                return intent
    return None
|
| 134 |
+
|
| 135 |
+
def get_template_key(intent, category=None):
    """Map (intent, optional category) to a RESPONSE_TEMPLATES key; a known
    category takes precedence over the intent mapping."""
    by_category = {
        "restaurant": "find_restaurant",
        "pharmacy": "find_pharmacy",
        "cafe": "find_cafe",
        "supermarket": "find_supermarket",
        "housing": "find_housing",
    }
    if category and category in by_category:
        return by_category[category]
    return INTENT_TEMPLATE_MAP.get(intent, "fallback")
|
| 142 |
+
|
| 143 |
+
def infer_category(query):
    """Guess a place category from keyword hits in the query; None when nothing matches."""
    normalized = norm(query)
    for category, keywords in CATEGORY_KEYWORDS.items():
        if any(norm(word) in normalized for word in keywords):
            return category
    return None
|
| 148 |
+
|
| 149 |
+
def is_out_of_scope(text):
    """True when the utterance mentions a topic the bot does not cover (weather, sports, ...)."""
    normalized = norm(text)
    return any(norm(keyword) in normalized for keyword in OUT_OF_SCOPE_KW)
|
| 152 |
+
|
| 153 |
+
def detect_ref_type(text):
    """Classify a follow-up: 'next' (another result), 'detail' (about the last
    place), 'reference' (anaphora), or 'new' (fresh query)."""
    normalized = norm(text)
    tokens = set(normalized.split())
    if any(norm(word) in normalized for word in NEXT_WORDS):
        return "next"
    if any(norm(word) in normalized for word in DETAIL_WORDS):
        return "detail"
    for ref in REF_WORDS:
        ref_norm = norm(ref)
        phrase_hit = " " in ref_norm and ref_norm in normalized
        if phrase_hit or ref_norm in tokens:
            return "reference"
    return "new"
|
| 161 |
+
|
| 162 |
+
def _loc_continuation(text):
    """Heuristic: a short reply naming a known district, or one starting with
    'في' (in), continues the previous search rather than starting a new one."""
    normalized = norm(text)
    tokens = normalized.split()
    if len(tokens) <= 4 and any(norm(cue) in normalized for cue in _LOC_CUES):
        return True
    return bool(tokens) and tokens[0] == "في"
|
| 166 |
+
|
| 167 |
+
def normalize_rating(r):
    """Coerce a rating onto a 0-5 scale, rounded to one decimal.

    Values above 5 are assumed to be on a 10-point scale and halved;
    non-positive or unparseable values return 0.0.
    """
    try:
        value = float(r)
    except (TypeError, ValueError):  # was a bare except: hide only conversion failures
        return 0.0
    if value > 5:
        return round(value / 2, 1)
    return round(value, 1) if value > 0 else 0.0
|
| 172 |
+
|
| 173 |
+
# TIME UTILS
|
| 174 |
+
|
| 175 |
+
def get_cairo_now():
    """Return the current timezone-aware datetime in Africa/Cairo."""
    return datetime.now(ZoneInfo("Africa/Cairo"))
|
| 177 |
+
|
| 178 |
+
def parse_time(token):
    """Parse a clock token like '9', '9:30', '9ص', '12م' into 24h 'HH:MM', or None.

    Relies on norm() having already collapsed صباحًا/مساءً variants to the
    single-letter suffixes ص (AM) / م (PM); 'ظهر' (noon) is treated as PM.
    """
    token = norm(token).replace(" ", "")
    m = re.match(r"^(\d{1,2})(?::(\d{1,2}))?(ص|م|ظهر)?$", token)
    if not m: return None
    h = int(m.group(1)); mn = int(m.group(2)) if m.group(2) else 0; suf = m.group(3)
    if not (0 <= mn <= 59): return None
    if suf == "ص":
        # AM: 12ص is midnight; otherwise only 1-11 are valid
        if h == 12: h = 0
        elif not (1 <= h <= 11): return None
    elif suf in ("م","ظهر"):
        # PM/noon: shift 1-11 forward; 12م stays 12
        if h != 12 and 1 <= h <= 11: h += 12
    else:
        # no suffix: treat as a 24h clock, with 24 wrapping to 00
        if h == 24: h = 0
        elif not (0 <= h <= 23): return None
    return f"{h:02d}:{mn:02d}"
|
| 193 |
+
|
| 194 |
+
def check_open_now(opening_hours_str):
    """Return 1 if the place is open right now (Cairo time), 0 if closed,
    or None when the opening-hours string is missing or unparseable."""
    if not opening_hours_str or str(opening_hours_str).strip() in ("","nan","none"): return None
    text = norm(str(opening_hours_str))
    if any(k in text for k in ["24","always","طول اليوم","24/7"]): return 1
    sep = re.search(r"(.+?)\s*-\s*(.+)", text)
    if not sep: return None
    t1 = parse_time(sep.group(1).strip()); t2 = parse_time(sep.group(2).strip())
    if not t1 or not t2: return None
    # BUG FIX: take hour and minute from a single clock sample. The original
    # called get_cairo_now() twice, so a tick across an hour boundary could mix
    # e.g. hour 10 from the first call with minute 00 from the second.
    now = get_cairo_now()
    now_t = f"{now.hour:02d}:{now.minute:02d}"
    if t1 <= t2: return 1 if t1 <= now_t <= t2 else 0
    # overnight range, e.g. 22:00-03:00
    return 1 if (now_t >= t1 or now_t <= t2) else 0
|
| 205 |
+
|
| 206 |
+
# SEARCH + FILTER + RANK
|
| 207 |
+
def semantic_candidates(query, top_k=20):
    """Embed the query and return the top-k corpus rows by cosine similarity,
    with a 'semantic_score' column attached."""
    query_emb = semantic_model.encode(clean_text(query), convert_to_tensor=True)
    sims = util.cos_sim(query_emb, corpus_embeddings)[0]
    k = min(top_k, len(corpus_df))
    best = torch.topk(sims, k=k)
    hits = corpus_df.iloc[best.indices.cpu().numpy()].copy()
    hits["semantic_score"] = best.values.cpu().numpy()
    wanted = ["place_id","doc_id","name","category","sub_category","location",
              "address","price_range","opening_hours","description","semantic_score"]
    cols = [c for c in wanted if c in hits.columns]
    return hits[cols].reset_index(drop=True)
|
| 218 |
+
|
| 219 |
+
def merge_places(df):
    """Left-join candidate rows with extra metadata columns from the master
    places_df, keyed on place_id; only columns actually present are joined."""
    optional_cols = ["lat","lon","rating","phone","social_media","status",
                     "category_clean","sub_category_clean","location_clean",
                     "address_clean","price_range_clean","search_text_clean"]
    present = [c for c in optional_cols if c in places_df.columns]
    lookup = places_df[["place_id"] + present].copy()
    return df.merge(lookup, on="place_id", how="left")
|
| 226 |
+
|
| 227 |
+
def apply_filters(df, query, category=None, sub_category=None, location=None,
                  price_range=None, open_now_only=False, min_rating=None):
    """Filter candidates by substring match on the pre-cleaned metadata columns,
    derive open/rating scores, then optionally keep only open or well-rated rows.

    Scores are computed before the open/min_rating filters so callers that skip
    those filters still get open_score/rating_score columns for ranking.
    """
    f = df.copy()
    # substring matching on the *_clean columns; regex metacharacters escaped
    if category: f = f[f["category_clean"].astype(str).str.contains(re.escape(clean_text(category)), na=False)]
    if sub_category: f = f[f["sub_category_clean"].astype(str).str.contains(re.escape(clean_text(sub_category)), na=False)]
    if location: f = f[f["location_clean"].astype(str).str.contains(re.escape(clean_text(location)), na=False)]
    if price_range: f = f[f["price_range_clean"].astype(str).str.contains(re.escape(clean_text(price_range)), na=False)]
    f["open_now"] = f["opening_hours"].apply(check_open_now)  # 1 / 0 / None (unknown)
    f["rating_num"] = pd.to_numeric(f.get("rating", pd.Series()), errors="coerce").fillna(0)
    f["rating_norm"] = f["rating_num"].apply(normalize_rating)
    f["rating_score"] = f["rating_norm"] / 5.0
    # unknown hours get a neutral 0.5 so they aren't buried below known-closed places
    f["open_score"] = f["open_now"].apply(lambda x: 1.0 if x==1 else (0.5 if x is None else 0.0))
    if open_now_only: f = f[f["open_now"] == 1]
    if min_rating: f = f[f["rating_norm"] >= min_rating]
    return f
|
| 242 |
+
|
| 243 |
+
def haversine(lat1, lon1, lat2, lon2):
    """Great-circle distance in km between two (lat, lon) points given in
    degrees, using a mean Earth radius of 6371 km."""
    radius_km = 6371
    deg2rad = math.pi / 180
    half_dlat = (lat2 - lat1) * deg2rad / 2
    half_dlon = (lon2 - lon1) * deg2rad / 2
    a = (math.sin(half_dlat) ** 2
         + math.cos(lat1 * deg2rad) * math.cos(lat2 * deg2rad) * math.sin(half_dlon) ** 2)
    return 2 * radius_km * math.asin(math.sqrt(a))
|
| 247 |
+
|
| 248 |
+
def rank(df, query, user_lat=None, user_lon=None):
    """Score candidates by weighted semantic/rating/open/distance/name-match
    signals and return them sorted by final_score, descending."""
    df = df.copy()
    # BUG FIX: use `is not None` — a coordinate of exactly 0 is valid, and plain
    # truthiness silently disabled distance ranking for it.
    if user_lat is not None and user_lon is not None and "lat" in df.columns:
        def dist(row):
            try:
                return haversine(float(user_lat), float(user_lon), float(row["lat"]), float(row["lon"]))
            except (TypeError, ValueError):  # unknown/blank coordinates -> "very far" sentinel
                return 999
        df["distance_km"] = df.apply(dist, axis=1)
        # BUG FIX: the original `max() or 1` fails when every distance is the 999
        # sentinel: max of an all-NaN series is NaN, and NaN is truthy, which
        # poisoned every distance_score (and hence final_score) with NaN.
        mx = df["distance_km"].replace(999, np.nan).max()
        if pd.isna(mx) or mx == 0:
            mx = 1
        df["distance_score"] = 1 - (df["distance_km"] / (mx + 1))
    else:
        df["distance_km"] = 999
        df["distance_score"] = 0.0
    q_clean = clean_text(query)
    # 1.0 when the query mentions the place name (or vice versa), else 0.0
    df["name_match_score"] = df["name"].apply(
        lambda n: 1.0 if clean_text(str(n)) in q_clean or q_clean in clean_text(str(n)) else 0.0)
    w = dict(semantic=0.40, rating=0.25, open=0.15, distance=0.10, name=0.10)
    df["final_score"] = (
        w["semantic"]*df.get("semantic_score", pd.Series(0, index=df.index)).fillna(0) +
        w["rating"] *df.get("rating_score", pd.Series(0, index=df.index)).fillna(0) +
        w["open"]   *df.get("open_score", pd.Series(0, index=df.index)).fillna(0) +
        w["distance"]*df["distance_score"] + w["name"]*df["name_match_score"]
    )
    return df.sort_values("final_score", ascending=False).reset_index(drop=True)
|
| 270 |
+
|
| 271 |
+
def search_places(query, top_k_final=5, category=None, sub_category=None,
                  location=None, price_range=None, open_now_only=False,
                  min_rating=None, user_lat=None, user_lon=None):
    """Semantic search + metadata filtering + ranking.

    Filters are relaxed in stages (drop sub_category, then price/open-now, then
    location and min_rating) until at least one candidate survives. Returns the
    ranked top rows, or an empty DataFrame when even the loosest pass is empty.
    """
    cands = semantic_candidates(query, top_k=20)
    merged = merge_places(cands)
    # progressively weaker filter attempts, strictest first
    for attempt in [
        dict(category=category, sub_category=sub_category, location=location,
             price_range=price_range, open_now_only=open_now_only, min_rating=min_rating),
        dict(category=category, sub_category=None, location=location,
             price_range=price_range, open_now_only=open_now_only, min_rating=min_rating),
        dict(category=category, sub_category=None, location=location,
             price_range=None, open_now_only=False, min_rating=min_rating),
        dict(category=category, sub_category=None, location=None,
             price_range=None, open_now_only=False, min_rating=None),
    ]:
        filtered = apply_filters(merged, query, **attempt)
        if not filtered.empty: break
    if filtered.empty: return pd.DataFrame()
    ranked = rank(filtered, query, user_lat, user_lon)
    keep = [c for c in ["place_id","name","category","sub_category","location","address",
                        "price_range","rating","rating_norm","opening_hours","description",
                        "phone","lat","lon","semantic_score","final_score","open_now"]
            if c in ranked.columns]
    return ranked[keep].head(top_k_final).reset_index(drop=True)
|
| 295 |
+
|
| 296 |
+
# RESPONSE TEMPLATES + FORMATTERS
|
| 297 |
+
# Reply templates per template key (see get_template_key). One entry is picked
# at random and .format()-ed with: name, location, price_info, rating_info,
# hours_info, address_info, description_short (the *_info fields are already
# formatted sentences or empty strings).
RESPONSE_TEMPLATES = {
    "find_restaurant":[
        "🍽️ لقيتلك {name} في {location}. {price_info}{rating_info}{hours_info}",
        "أنصحك بـ {name} — هتلاقيه في {location}. {price_info}{rating_info}{hours_info}",
        "في {location} فيه {name}. {description_short}{price_info}{hours_info}",
    ],
    "find_pharmacy":[
        "💊 {name} في {location}.{hours_info}{rating_info}",
        "أقرب صيدلية ليك: {name} — {address_info}{hours_info}",
    ],
    "find_cafe":[
        "☕ {name} في {location}. {price_info}{rating_info}{hours_info}",
        "جرب {name} — في {location}. {description_short}{hours_info}",
    ],
    "find_supermarket":[
        "🛒 {name} في {location}.{hours_info}{rating_info}",
        "أقرب ماركت: {name} — {address_info}{hours_info}",
    ],
    "find_housing":[
        "🏠 {name} في {location}. {price_info}{description_short}",
        "فيه {name} في {location}. {price_info}{rating_info}",
    ],
    # static / control replies (no placeholders)
    "greeting": ["أهلاً! 😊 أنا بساعدك تلاقي أي مكان في بني سويف. عايز إيه؟",
                 "وعليكم السلام! قولي محتاج إيه — مطعم، صيدلية، كافيه؟",
                 "هلا بيك! محتاج إيه في بني سويف؟ 😊"],
    "thanks": ["العفو! 😊 في حاجة تانية أساعدك فيها؟","أي خدمة! 😊","بكل سرور! 😊"],
    "goodbye": ["مع السلامة! 👋","سلامتك! أي وقت محتاج مساعدة أنا هنا.","باي! ربنا يوفقك 😊"],
    "clarification":["😊 قصدك إيه بالظبط؟","ممكن توضح أكتر؟","تمام! بتدور على إيه بالظبط؟"],
    "no_result": ["😔 مش لاقي حاجة مناسبة. جرب تغير المنطقة أو تسأل بطريقة تانية.",
                  "معلش، مفيش نتايج. ممكن تحدد المنطقة أو النوع أكتر؟"],
    "fallback": ["آسف، مش فاهم قصدك. 😊 قولي محتاج إيه — مطعم، صيدلية، كافيه؟",
                 "ممكن تسألني عن أي مكان في بني سويف وأنا هساعدك! 😊"],
}
|
| 330 |
+
|
| 331 |
+
def fmt_price(x):
    """Render a price value as a short Arabic phrase ending in '. ', or ''
    when the value is missing/NaN."""
    raw = str(x).strip().lower()
    if not raw or raw in ("", "nan", "none"):
        return ""
    phrases = {"cheap": "الأسعار رخيصة", "رخيص": "الأسعار رخيصة", "اقتصادي": "الأسعار اقتصادية",
               "medium": "الأسعار متوسطة", "متوسط": "الأسعار متوسطة",
               "expensive": "الأسعار غالية", "غالي": "الأسعار غالية"}
    for marker, phrase in phrases.items():
        if marker in raw:
            return phrase + ". "
    return f"السعر: {x}. "
|
| 340 |
+
|
| 341 |
+
def fmt_rating(x):
    """Render a rating as 'تقييمه N ⭐...' (trailing space) with one star glyph
    per rounded point, or '' for missing/zero/unparseable values."""
    try:
        r = normalize_rating(float(x))
    except (TypeError, ValueError):  # was a bare except: hide only conversion failures
        return ""
    stars = min(round(r), 5)
    return f"تقييمه {r} {'⭐'*stars}. " if r > 0 else ""
|
| 346 |
+
|
| 347 |
+
def fmt_hours(x):
    """Render opening hours as an Arabic phrase ending in '. '; 24h venues get
    a dedicated wording, missing values return ''."""
    hours = str(x).strip()
    if not hours or hours in ("", "nan", "none"):
        return ""
    lowered = hours.lower()
    if any(marker in lowered for marker in ("24", "always", "طول اليوم")):
        return "مفتوح 24 ساعة. "
    return f"بيفتح: {hours}. "
|
| 352 |
+
|
| 353 |
+
def fmt_addr(address, location):
    """Prefer the street address, fall back to the district/location, else ''."""
    addr = str(address).strip()
    if addr and addr not in ("", "nan", "none"):
        return f"عنوانه: {addr}. "
    district = str(location).strip()
    if district and district not in ("", "nan", "none"):
        return f"في {district}. "
    return ""
|
| 358 |
+
|
| 359 |
+
def fmt_desc(x, max_words=12):
    """Return the description truncated to max_words (with '...'), or the full
    text plus a trailing space; '' when missing."""
    desc = str(x).strip()
    if not desc or desc in ("", "nan", "none"):
        return ""
    words = desc.split()
    if len(words) > max_words:
        return " ".join(words[:max_words]) + "..."
    return desc + " "
|
| 364 |
+
|
| 365 |
+
def build_response(place, intent, category=None):
    """Fill a random response template with the place's details and append an
    open/closed status line when it is known."""
    if not place:
        return random.choice(RESPONSE_TEMPLATES["no_result"])
    template_key = get_template_key(intent, category)
    template = random.choice(RESPONSE_TEMPLATES[template_key])
    reply = template.format(
        name=str(place.get("name", "")).strip(),
        location=str(place.get("location", "")).strip() or "بني سويف",
        price_info=fmt_price(place.get("price_range", "")),
        rating_info=fmt_rating(place.get("rating_norm", place.get("rating", 0))),
        hours_info=fmt_hours(place.get("opening_hours", "")),
        address_info=fmt_addr(place.get("address", ""), place.get("location", "")),
        description_short=fmt_desc(place.get("description", "")),
    )
    open_flag = place.get("open_now")
    if open_flag == 1:
        reply += "\n🟢 مفتوح دلوقتي."
    elif open_flag == 0:
        reply += "\n🔴 مغلق دلوقتي."
    return reply
|
| 381 |
+
|
| 382 |
+
def handle_detail(text, place):
    """Answer a follow-up question (hours / address / price / rating / phone)
    about the last-discussed place; a generic summary when no cue matches."""
    if not place: return "مش فاكر إحنا اتكلمنا عن مكان. ممكن تسألني من الأول؟"
    t = norm(text); name = str(place.get("name","")).strip()
    if any(w in t for w in ["امتي","امتى","مواعيد","يفتح","تفتح","يقفل"]):
        st = "🟢 مفتوح" if place.get("open_now")==1 else "🔴 مغلق"
        return f"⏰ {name} — {fmt_hours(place.get('opening_hours',''))}\n{st} دلوقتي."
    if any(w in t for w in ["عنوان","فين","وصول","اوصل"]):
        # BUG FIX: was "\\n" — a literal backslash-n shown to the user instead of
        # the real newline every other branch uses.
        return f"📍 {name} في {place.get('location','')}.\nالعنوان: {place.get('address','')}"
    if any(w in t for w in ["سعر","بكام","تكلف","غالي","رخيص"]):
        return f"💰 {name} — {fmt_price(place.get('price_range',''))}"
    if any(w in t for w in ["تقييم","نجوم"]):
        return f"⭐ {name} — {fmt_rating(place.get('rating_norm', place.get('rating',0)))}"
    if any(w in t for w in ["رقم","تليفون"]):
        phone = str(place.get("phone","")).strip()
        return f"📞 {name} — {phone}" if phone else f"معنديش رقم {name}."
    return f"📋 {name}:\n{fmt_desc(place.get('description',''), 20)}\n{fmt_hours(place.get('opening_hours',''))}{fmt_rating(place.get('rating_norm',0))}"
|
| 398 |
+
|
| 399 |
+
# PREDICT FUNCTIONS
|
| 400 |
+
|
| 401 |
+
def predict_intent(text, threshold=0.5):
    """Classify the utterance's intent. Keyword overrides win outright; model
    predictions below `threshold` confidence degrade to 'fallback'."""
    override = apply_keyword_override(text)
    if override:
        return {"intent": override, "confidence": 1.0}
    encoded = intent_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        logits = intent_model(**encoded).logits
    probs = torch.softmax(logits, dim=1)
    best_id = torch.argmax(probs, dim=1).item()
    confidence = probs[0][best_id].item()
    intent = id2intent[best_id] if confidence >= threshold else "fallback"
    return {"intent": intent, "confidence": round(confidence, 4)}
|
| 411 |
+
|
| 412 |
+
def extract_entities(text, min_score=0.40):
    """Run NER over the text and map recognized spans onto canonical slot names.

    Spans shorter than 2 chars or below `min_score` are dropped; when the same
    slot is hit twice, the longer cleaned value wins.
    """
    predictions = ner_pipeline([text])[0]
    entities = {}
    for pred in predictions:
        group = pred["entity_group"].lower().strip()
        value = re.sub(r"##", "", pred["word"].strip()).strip()  # drop WordPiece markers
        value = re.sub(r"\s+", " ", value).strip()
        if len(value) < 2 or float(pred["score"]) < min_score:
            continue
        slot = ENTITY_FIELD_MAP.get(group, group)
        cleaned = clean_text(value)
        if slot not in entities or len(cleaned) > len(clean_text(entities[slot])):
            entities[slot] = cleaned
    return entities
|
| 425 |
+
|
| 426 |
+
# SESSION
|
| 427 |
+
class Session:
    """Per-user conversation state: turn history, last intent/entities/place,
    the last result list (for 'next' paging) and sticky context slots."""

    def __init__(self, sid="default"):
        self.sid = sid
        self.history = []
        self.last_intent = None
        self.last_entities = {}
        self.last_place = None
        self.last_results = []
        self.result_pointer = 0
        self.context_slots = {}
        self.turns = 0

    def add(self, user, bot, intent, entities, place, results):
        """Record one exchange and update the carried-over context."""
        self.history.append({"turn": self.turns, "user": user, "bot": bot,
                             "intent": intent, "entities": entities})
        # error-ish intents don't overwrite the last real intent
        if intent and intent not in ("fallback", "no_result", "out_of_scope"):
            self.last_intent = intent
        if intent in SEARCH_INTENTS:
            self.last_entities = entities
        if place is not None:
            self.last_place = place
        if results:
            self.last_results = results
            self.result_pointer = 0
        self._slots(entities)
        self.turns += 1

    def _slots(self, ents):
        """Persist non-empty slot values into the sticky context."""
        for slot in ["location", "category", "sub_category", "price"]:
            value = ents.get(slot)
            if value and str(value).strip():
                self.context_slots[slot] = str(value).strip()

    def merge(self, new_ents):
        """Return sticky slots overlaid with the new non-empty entities,
        and fold the new entities into the sticky context."""
        merged = dict(self.context_slots)
        for key, value in new_ents.items():
            if value and str(value).strip():
                merged[key] = str(value).strip()
        self._slots(new_ents)
        return merged
|
| 457 |
+
|
| 458 |
+
# MAIN CHAT
|
| 459 |
+
|
| 460 |
+
def chat(text: str, session: Session, user_lat=None, user_lon=None):
    """Main dialogue turn: route the utterance through guards (empty input,
    out-of-scope), follow-up handling ('detail'/'next'), intent prediction,
    entity extraction and place search, updating the session at every exit.

    Returns a dict with reply, intent, confidence, entities, best_place and
    all_results. user_lat/user_lon, when given, enable distance ranking.
    """
    result = dict(reply="", intent="", confidence=0.0, entities={}, best_place=None, all_results=[])

    # guard: empty/blank input
    if not text or not text.strip():
        result.update(reply="الرجاء إدخال سؤال 😊", intent="fallback")
        session.add("", result["reply"], "fallback", {}, None, [])
        return result

    # guard: topics outside the bot's domain
    if is_out_of_scope(text):
        reply = "أنا متخصص في إيجاد الأماكن في بني سويف فقط. 😊\nممكن أساعدك تلاقي مطعم، صيدلية، كافيه، ماركت، أو سكن."
        result.update(reply=reply, intent="out_of_scope")
        session.add(text, reply, "out_of_scope", {}, None, [])
        return result

    # follow-up: details about the place discussed last turn
    ref = detect_ref_type(text)
    if ref == "detail" and session.last_place:
        reply = handle_detail(text, session.last_place)
        result.update(reply=reply, intent=session.last_intent or "detail", best_place=session.last_place)
        session.add(text, reply, result["intent"], {}, session.last_place, [])
        return result

    # follow-up: page to the next cached search result
    if ref == "next" and session.last_results:
        ptr = session.result_pointer + 1
        if ptr < len(session.last_results):
            session.result_pointer = ptr; nxt = session.last_results[ptr]; session.last_place = nxt
            reply = build_response(nxt, session.last_intent, category=nxt.get("category"))
            result.update(reply=reply, intent=session.last_intent, best_place=nxt)
        else:
            result.update(reply="😔 مفيش نتايج تانية. عايز أدور من الأول؟", intent="no_result")
        session.add(text, result["reply"], result["intent"], {}, result["best_place"], [])
        return result

    ir = predict_intent(text); intent = ir["intent"]; conf = ir["confidence"]
    result["intent"] = intent; result["confidence"] = conf

    # static intents answer straight from the templates
    if intent in STATIC_INTENTS:
        result["reply"] = random.choice(RESPONSE_TEMPLATES[get_template_key(intent)])
        session.add(text, result["reply"], intent, {}, None, [])
        return result

    # a bare location after a search ("في الكورنيش") continues the previous intent
    if intent == "fallback":
        if session.last_intent in SEARCH_INTENTS and _loc_continuation(text):
            intent = session.last_intent; result["intent"] = intent
        else:
            result["reply"] = random.choice(RESPONSE_TEMPLATES["fallback"])
            session.add(text, result["reply"], "fallback", {}, None, [])
            return result

    # any remaining non-search intent gets its template (or fallback wording)
    if intent not in SEARCH_INTENTS:
        result["reply"] = random.choice(RESPONSE_TEMPLATES.get(get_template_key(intent), RESPONSE_TEMPLATES["fallback"]))
        session.add(text, result["reply"], intent, {}, None, [])
        return result

    # search path: extract slots, merge with sticky session context
    ents = extract_entities(text); result["entities"] = ents
    merged = session.merge(ents)

    category = merged.get("category") or INTENT_TO_CATEGORY.get(intent) or infer_category(text)
    sub_cat = merged.get("sub_category")
    location = merged.get("location")
    price_range = merged.get("price")
    # "open now" style intents restrict results to currently-open places
    open_only = ("open_now" in intent or "place_details" in intent)

    df = search_places(text, top_k_final=5, category=category, sub_category=sub_cat,
                       location=location, price_range=price_range, open_now_only=open_only,
                       user_lat=user_lat, user_lon=user_lon)

    if df.empty:
        cl = CLARIFICATION_Q.get(intent, "")
        reply = random.choice(RESPONSE_TEMPLATES["no_result"]) + (f"\n\n💬 {cl}" if cl else "")
        result.update(reply=reply, intent="no_result")
        session.add(text, reply, "no_result", ents, None, [])
        return result

    # answer with the top hit; cache the rest for 'تاني' (next) paging
    all_res = df.to_dict(orient="records"); best = all_res[0]
    reply = build_response(best, intent, category=category)
    if len(all_res) > 1: reply += f"\n\n💬 فيه {len(all_res)} نتيجة — قولي 'تاني' لو عايز غيره."

    result.update(reply=reply, best_place=best, all_results=all_res)
    session.add(text, reply, intent, ents, best, all_res)
    return result
|
| 540 |
+
|
| 541 |
+
# STARTUP
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load every ML artifact and the places dataset once at startup.

    Populates the module-level globals used by the request handlers:
    intent classifier, NER pipeline, semantic search model/embeddings,
    and the places DataFrame with its precomputed ``*_clean`` columns.
    Yields control to FastAPI for the app's lifetime; nothing is
    explicitly released on shutdown.
    """
    global intent_tokenizer, intent_model, label_encoder, id2intent
    global ner_pipeline, label2id, id2label
    global semantic_model, corpus_df, corpus_embeddings, places_df

    print("⏳ Loading models …")
    # Intent classifier: transformer + the fitted sklearn label encoder
    # saved alongside it; id2intent maps class index -> intent label.
    intent_tokenizer = AutoTokenizer.from_pretrained(INTENT_DIR)
    intent_model = AutoModelForSequenceClassification.from_pretrained(INTENT_DIR)
    label_encoder = joblib.load(os.path.join(INTENT_DIR, "label_encoder.pkl"))
    id2intent = {i: lbl for i, lbl in enumerate(label_encoder.classes_)}
    intent_model.eval()  # inference mode (disables dropout, etc.)

    # NER: label maps + token-classification pipeline.
    # local_files_only=True: never hit the HF hub for these artifacts.
    with open(os.path.join(ENTITY_DIR, "label2id.json"), encoding="utf-8") as f: label2id = json.load(f)
    with open(os.path.join(ENTITY_DIR, "id2label.json"), encoding="utf-8") as f: id2label = json.load(f)
    etok = AutoTokenizer.from_pretrained(ENTITY_DIR, local_files_only=True)
    emod = AutoModelForTokenClassification.from_pretrained(ENTITY_DIR, local_files_only=True)
    ner_pipeline = pipeline("token-classification", model=emod, tokenizer=etok, aggregation_strategy="first")

    # Semantic search: sentence encoder + precomputed corpus embeddings
    # pickled together at training time.
    semantic_model = SentenceTransformer(os.path.join(SEMANTIC_DIR, "model"))
    with open(os.path.join(SEMANTIC_DIR, "semantic_data.pkl"), "rb") as f:
        sd = pickle.load(f)
    corpus_df = sd["corpus_df"]; corpus_embeddings = sd["corpus_embeddings"]

    # Places dataset: guarantee the expected schema (missing columns are
    # created empty), then precompute normalized text columns so
    # search-time matching does not re-clean strings per request.
    places_df = pd.read_excel(PLACES_FILE)
    for col in ["place_id","name","category","sub_category","location","address",
                "price_range","rating","opening_hours","description","lat","lon"]:
        if col not in places_df.columns: places_df[col] = ""
    places_df = places_df.fillna("")
    places_df["category_clean"] = places_df["category"].apply(clean_text)
    places_df["sub_category_clean"] = places_df["sub_category"].apply(clean_text)
    places_df["location_clean"] = places_df["location"].apply(clean_text)
    places_df["address_clean"] = places_df["address"].apply(clean_text)
    places_df["price_range_clean"] = places_df["price_range"].apply(clean_text)
    places_df["description_clean"] = places_df["description"].apply(clean_text)
    # Concatenated free-text field used for keyword/semantic search.
    places_df["search_text_clean"] = (
        places_df["name"].astype(str)+" "+places_df["category"].astype(str)+" "+
        places_df["sub_category"].astype(str)+" "+places_df["location"].astype(str)+" "+
        places_df["description"].astype(str)
    ).apply(clean_text)

    print("✅ All models loaded!")
    yield
    print("Shutting down.")
|
| 586 |
+
|
| 587 |
+
# FASTAPI
app = FastAPI(title="Beni Suef Chatbot API", version="1.0.0", lifespan=lifespan)
# NOTE(review): wildcard CORS (any origin/method/header) — fine for a public
# demo Space, but should be tightened before production use.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
|
| 590 |
+
|
| 591 |
+
|
| 592 |
+
class ChatRequest(BaseModel):
    """Incoming payload for POST /chat."""
    message: str                      # user utterance
    session_id: str = "default"       # conversation key; reused across turns
    user_lat: Optional[float] = None  # optional user GPS, used for distance ranking
    user_lon: Optional[float] = None
|
| 597 |
+
|
| 598 |
+
class ChatResponse(BaseModel):
    """Response payload for POST /chat."""
    reply: str                         # rendered natural-language answer
    intent: str                        # predicted intent label
    confidence: float                  # intent classifier confidence
    entities: dict                     # extracted entities for this turn
    session_id: str                    # echoed back from the request
    best_place: Optional[dict] = None  # JSON-safe top search result, if any
|
| 605 |
+
|
| 606 |
+
|
| 607 |
+
@app.get("/")
def root():
    """Liveness check: confirm the service is up and reachable."""
    payload = {
        "status": "ok",
        "message": "Beni Suef Chatbot is running 🚀",
    }
    return payload
|
| 610 |
+
|
| 611 |
+
@app.get("/health")
def health():
    """Report whether all startup artifacts were loaded successfully."""
    models_ready = (
        bool(intent_model)
        and bool(ner_pipeline)
        and bool(semantic_model)
        and places_df is not None
    )
    return {"status": "healthy", "models_loaded": models_ready}
|
| 615 |
+
|
| 616 |
+
def _json_safe(v):
    """Coerce a single place-record value into a JSON-safe builtin.

    Fixes vs. the original inline comprehension:
    - np.integer is now converted with int() (it was cast to float,
      turning integer ids/counts into floats in the JSON response);
    - a NaN carried in a np.floating is now mapped to None (previously
      only plain-float NaN was caught, so numpy NaN leaked through as
      float('nan'), which is not valid JSON).
    """
    if isinstance(v, np.integer):
        return int(v)
    if isinstance(v, np.floating):
        v = float(v)
    if isinstance(v, float) and np.isnan(v):
        return None
    return v


@app.post("/chat", response_model=ChatResponse)
def chat_endpoint(req: ChatRequest):
    """Main chat endpoint.

    Creates (or reuses) the per-conversation Session, runs the NLU
    pipeline via chat(), sanitizes the best-place record into JSON-safe
    builtin types, and returns the structured ChatResponse.

    Raises:
        HTTPException(500): any failure inside the pipeline, with the
        original error message as the detail.
    """
    if req.session_id not in SESSIONS:
        SESSIONS[req.session_id] = Session(req.session_id)
    session = SESSIONS[req.session_id]
    try:
        result = chat(req.message, session, req.user_lat, req.user_lon)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    best = result.get("best_place")
    if best:
        # Whitelist the fields exposed to clients and normalize values.
        allowed = ["place_id","name","category","sub_category","location","address",
                   "price_range","rating","opening_hours","description","phone",
                   "lat","lon","open_now","final_score"]
        best = {k: _json_safe(v) for k, v in best.items() if k in allowed}

    return ChatResponse(reply=result["reply"], intent=result["intent"],
                        confidence=result["confidence"], entities=result["entities"],
                        session_id=req.session_id, best_place=best)
|
| 638 |
+
|
| 639 |
+
@app.delete("/session/{session_id}")
def reset_session(session_id: str):
    """Drop a conversation's stored state (no-op for unknown ids)."""
    if session_id in SESSIONS:
        del SESSIONS[session_id]
    return {"status": "reset", "session_id": session_id}
|
beni_suef_100_places_v5ff.xlsx
ADDED
|
Binary file (34.4 kB). View file
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.115.0
|
| 2 |
+
uvicorn[standard]==0.30.6
|
| 3 |
+
pydantic==2.8.2
|
| 4 |
+
transformers==4.44.2
|
| 5 |
+
sentence-transformers==3.0.1
|
| 6 |
+
torch==2.4.1
|
| 7 |
+
numpy==1.26.4
|
| 8 |
+
pandas==2.2.2
|
| 9 |
+
openpyxl==3.1.5
|
| 10 |
+
scikit-learn==1.5.2
|
| 11 |
+
joblib==1.4.2
|