Update app.py
Browse files
app.py
CHANGED
|
@@ -149,18 +149,174 @@ STAFF_DIRECTORY = [
|
|
| 149 |
def _normalize_name_query(text: str):
|
| 150 |
return [t for t in re.sub(r"[^a-z0-9 ]+", " ", (text or "").lower()).split() if t]
|
| 151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
def _match_staff_name(question: str):
|
| 153 |
tokens = _normalize_name_query(question)
|
| 154 |
-
if not tokens or len(tokens) >
|
| 155 |
return None
|
| 156 |
ql = (question or "").strip().lower()
|
| 157 |
-
blocked = ["who is", "who handles", "who can help", "contact", "librarian", "library"
|
| 158 |
if any(b in ql for b in blocked):
|
| 159 |
return None
|
| 160 |
-
for staff in
|
| 161 |
staff_tokens = set()
|
| 162 |
-
for tok in staff
|
| 163 |
staff_tokens.update(_normalize_name_query(tok))
|
|
|
|
| 164 |
if all(tok in staff_tokens for tok in tokens):
|
| 165 |
return staff
|
| 166 |
return None
|
|
@@ -194,6 +350,7 @@ GROUNDED_LIBRARY_MAP = {
|
|
| 194 |
# ===== GLOBALS =====
# Module-level handles; both start unset (None). Where they are assigned is
# not visible in this part of the file.
vectorstore = None
http_client = None
|
|
|
|
| 197 |
|
| 198 |
|
| 199 |
# ===== ANALYTICS DB =====
|
|
@@ -290,8 +447,12 @@ def log_query(question, tool, model, response_time, result_count=0, error=None):
|
|
| 290 |
|
| 291 |
# ===== RAG SETUP =====
|
| 292 |
def load_documents():
|
|
|
|
| 293 |
docs = []
|
| 294 |
files = glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))
|
|
|
|
|
|
|
|
|
|
| 295 |
for filepath in files:
|
| 296 |
try:
|
| 297 |
with open(filepath, "r", encoding="utf-8") as f:
|
|
@@ -1201,12 +1362,14 @@ async def agent_query(req: AgentRequest):
|
|
| 1201 |
if staff_match:
|
| 1202 |
answer = _staff_name_answer(staff_match, question)
|
| 1203 |
elapsed = time.time() - start
|
|
|
|
|
|
|
| 1204 |
return {
|
| 1205 |
"answer": answer,
|
| 1206 |
"intent": "library_info",
|
| 1207 |
"tools_used": ["staff_name_match"],
|
| 1208 |
"search_results": [],
|
| 1209 |
-
"sources": [],
|
| 1210 |
"model_used": req.model,
|
| 1211 |
"response_time": round(elapsed, 2),
|
| 1212 |
"corrected_query": question,
|
|
@@ -1214,7 +1377,7 @@ async def agent_query(req: AgentRequest):
|
|
| 1214 |
"database_query": question,
|
| 1215 |
"original_question": question,
|
| 1216 |
"is_follow_up": False,
|
| 1217 |
-
"source_mode": "staff_directory",
|
| 1218 |
}
|
| 1219 |
|
| 1220 |
# ---- Follow-up to the greeting menu ----
|
|
|
|
| 149 |
def _normalize_name_query(text: str):
|
| 150 |
return [t for t in re.sub(r"[^a-z0-9 ]+", " ", (text or "").lower()).split() if t]
|
| 151 |
|
| 152 |
+
def _dedupe_keep_order(items):
|
| 153 |
+
seen = set()
|
| 154 |
+
out = []
|
| 155 |
+
for item in items:
|
| 156 |
+
if item and item not in seen:
|
| 157 |
+
seen.add(item)
|
| 158 |
+
out.append(item)
|
| 159 |
+
return out
|
| 160 |
+
|
| 161 |
+
def _title_case_name(name: str) -> str:
|
| 162 |
+
return re.sub(r"\s+", " ", (name or "").strip()).title()
|
| 163 |
+
|
| 164 |
+
def _build_staff_tokens(full_name: str):
    """Build the list of lookup variants for one staff member's name.

    The name is tokenised with ``_normalize_name_query`` twice over: once
    as-is and once with the honorifics dr/mr/mrs/ms/prof removed.  For
    each of those two word lists the result collects, in this order,
    every single word, the whole list joined by spaces, and every
    contiguous run of 2, 3 and 4 words joined by spaces.

    Returns:
        The collected variants with duplicates removed by
        ``_dedupe_keep_order`` (first occurrence kept).
    """
    titles = {"dr", "mr", "mrs", "ms", "prof"}
    all_words = _normalize_name_query(full_name)
    name_words = [word for word in all_words if word not in titles]

    collected = []
    for words in (all_words, name_words):
        if not words:
            continue
        collected += words
        collected.append(" ".join(words))
        for size in (2, 3, 4):
            # When the list is shorter than `size` the range is empty, so
            # no window is produced.
            window_count = len(words) - size + 1
            collected += [
                " ".join(words[start:start + size])
                for start in range(window_count)
            ]
    return _dedupe_keep_order(collected)
|
| 180 |
+
|
| 181 |
+
def _is_staff_name_line(line: str) -> bool:
    """Tell whether a stripped directory line is treated as a staff-name heading."""
    return bool(
        line
        and line == line.upper()
        and not line.startswith(("===", "SOURCE:", "TITLE:"))
        and any(ch.isalpha() for ch in line)
        and len(line.split()) <= 10
    )


def _parse_staff_directory_text(text: str):
    """Parse a plain-text staff directory into a list of staff entry dicts.

    A line counts as a name heading when it is non-empty, equal to its own
    upper-cased form, contains at least one letter, has at most ten
    whitespace-separated words and does not start with ``===``,
    ``SOURCE:`` or ``TITLE:``.  Every line after a heading, up to the next
    heading or the end of the text, belongs to that person.  Lines before
    the first heading are ignored.

    Within a person's lines, the prefixes ``Title:``, ``Email:``,
    ``Phone:`` / ``Work Phone:``, ``Mobile:``, ``Location:``,
    ``Best for:`` and ``Schedule appointment:`` fill the matching field
    (a later line overwrites an earlier one).  Blank lines and lines
    starting with ``===`` are skipped; any other line is kept verbatim as
    an extra detail.

    Args:
        text: Full contents of a directory file; may be empty or ``None``.

    Returns:
        One dict per heading with the keys ``full_name``, ``role``,
        ``details``, ``tokens``, ``source_title`` and ``source``.  ``role``
        falls back to "Library staff member" and ``details`` is the
        " | "-joined, de-duplicated list of labelled fields and extras.
    """
    staff_entries = []
    if not text:
        return staff_entries

    # Leading prefix of a detail line -> the field it fills.  No prefix is
    # a prefix of another, so the lookup order does not matter.
    field_prefixes = {
        "Title:": "role",
        "Email:": "email",
        "Phone:": "phone",
        "Work Phone:": "phone",
        "Mobile:": "mobile",
        "Location:": "location",
        "Best for:": "best_for",
        "Schedule appointment:": "schedule",
    }

    lines = [line.strip() for line in text.splitlines()]
    total = len(lines)
    i = 0
    while i < total:
        name_line = lines[i]
        i += 1
        if not _is_staff_name_line(name_line):
            continue

        # Everything up to the next heading belongs to this person.
        block_start = i
        while i < total and not _is_staff_name_line(lines[i]):
            i += 1
        block = lines[block_start:i]

        fields = dict.fromkeys(field_prefixes.values(), "")
        extra_bits = []
        for raw in block:
            if not raw or raw.startswith("==="):
                continue
            for prefix, key in field_prefixes.items():
                if raw.startswith(prefix):
                    fields[key] = raw.split(":", 1)[1].strip()
                    break
            else:
                # Unrecognised lines (LinkedIn, ORCID, free text ...) are
                # carried through unchanged.
                extra_bits.append(raw)

        # Fixed presentation order; "Work Phone:" lines are shown as "Phone".
        labelled = (
            ("Best for", fields["best_for"]),
            ("Email", fields["email"]),
            ("Phone", fields["phone"]),
            ("Mobile", fields["mobile"]),
            ("Location", fields["location"]),
            ("Schedule appointment", fields["schedule"]),
        )
        details_parts = [f"{label}: {value}" for label, value in labelled if value]
        details_parts.extend(extra_bits)

        full_name = _title_case_name(name_line)
        staff_entries.append({
            "full_name": full_name,
            "role": fields["role"] or "Library staff member",
            "details": " | ".join(_dedupe_keep_order(details_parts)),
            "tokens": _build_staff_tokens(full_name),
            "source_title": "Khalifa University Library Staff Directory and Contacts",
            "source": "https://library.ku.ac.ae/librarystaff",
        })

    return staff_entries
|
| 279 |
+
|
| 280 |
+
def _load_staff_directory_from_kb():
    """Collect staff entries from the knowledge-base text files.

    Only ``*.txt`` files in ``KNOWLEDGE_DIR`` whose file name contains
    "staff" or "contact" are read, and only those whose content mentions
    "staff directory" or "library staff" (case-insensitive) are parsed
    with ``_parse_staff_directory_text``.

    Loading is best-effort: a file that cannot be read or parsed is
    reported on stdout and skipped, and the remaining files are still
    processed.

    Returns:
        Parsed entries, de-duplicated by lower-cased ``full_name`` with
        the first occurrence kept; an empty list when nothing matched.
    """
    entries = []
    # glob returns paths in arbitrary order; sorting makes the winner of
    # the de-duplication below independent of the filesystem.
    for filepath in sorted(glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))):
        name = os.path.basename(filepath).lower()
        if "staff" not in name and "contact" not in name:
            continue
        # Guard each file separately so one bad file does not discard the
        # files that come after it.
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
            lowered = content.lower()
            if "staff directory" not in lowered and "library staff" not in lowered:
                continue
            entries.extend(_parse_staff_directory_text(content))
        except Exception as e:
            print(f"Staff KB parse error: {e}")

    # First entry seen for each name wins; entries without a name are dropped.
    unique = {}
    for entry in entries:
        key = entry.get("full_name", "").lower()
        if key:
            unique.setdefault(key, entry)
    return list(unique.values())
|
| 303 |
+
|
| 304 |
+
def _staff_lookup_candidates():
    """Return the staff list to search for name matches.

    The knowledge-base list ``kb_staff_directory`` is used when it is
    non-empty; otherwise the built-in ``STAFF_DIRECTORY`` is returned.
    """
    if kb_staff_directory:
        return kb_staff_directory
    return STAFF_DIRECTORY
|
| 306 |
+
|
| 307 |
def _match_staff_name(question: str):
    """Return the staff entry whose name the question consists of, if any.

    The question is tokenised with ``_normalize_name_query``.  Nothing is
    matched when it has no tokens, has more than five tokens, or contains
    one of the blocked phrases ("who is", "contact", "library" ...).
    Otherwise the first candidate from ``_staff_lookup_candidates()``
    whose name words include every question token is returned.

    Args:
        question: The raw user question; ``None`` is tolerated.

    Returns:
        The matching staff dict, or ``None`` when there is no match.
    """
    tokens = _normalize_name_query(question)
    if not tokens or len(tokens) > 5:
        return None

    ql = (question or "").strip().lower()
    blocked = ("who is", "who handles", "who can help", "contact", "librarian", "library")
    if any(b in ql for b in blocked):
        return None

    for staff in _staff_lookup_candidates():
        # Question tokens are single words, so only the individual words
        # of each stored variant can ever match; multi-word phrases are
        # covered by requiring every token to be present.
        staff_tokens = {
            word
            for variant in staff.get("tokens", [])
            for word in _normalize_name_query(variant)
        }
        if staff_tokens.issuperset(tokens):
            return staff
    return None
|
|
|
|
| 350 |
# ===== GLOBALS =====
# Module-level handles; both start unset (None). Where they are assigned is
# not visible in this part of the file.
vectorstore = None
http_client = None
# Staff entries parsed from the knowledge base. Rebound by load_documents();
# while it is empty, _staff_lookup_candidates() falls back to STAFF_DIRECTORY.
kb_staff_directory = []
|
| 354 |
|
| 355 |
|
| 356 |
# ===== ANALYTICS DB =====
|
|
|
|
| 447 |
|
| 448 |
# ===== RAG SETUP =====
|
| 449 |
def load_documents():
|
| 450 |
+
global kb_staff_directory
|
| 451 |
docs = []
|
| 452 |
files = glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))
|
| 453 |
+
kb_staff_directory = _load_staff_directory_from_kb()
|
| 454 |
+
if kb_staff_directory:
|
| 455 |
+
print(f"Loaded {len(kb_staff_directory)} staff entries from KB")
|
| 456 |
for filepath in files:
|
| 457 |
try:
|
| 458 |
with open(filepath, "r", encoding="utf-8") as f:
|
|
|
|
| 1362 |
if staff_match:
|
| 1363 |
answer = _staff_name_answer(staff_match, question)
|
| 1364 |
elapsed = time.time() - start
|
| 1365 |
+
source_title = staff_match.get("source_title", "")
|
| 1366 |
+
source_url = staff_match.get("source", "")
|
| 1367 |
return {
|
| 1368 |
"answer": answer,
|
| 1369 |
"intent": "library_info",
|
| 1370 |
"tools_used": ["staff_name_match"],
|
| 1371 |
"search_results": [],
|
| 1372 |
+
"sources": ([{"title": source_title, "source": source_url}] if source_title or source_url else []),
|
| 1373 |
"model_used": req.model,
|
| 1374 |
"response_time": round(elapsed, 2),
|
| 1375 |
"corrected_query": question,
|
|
|
|
| 1377 |
"database_query": question,
|
| 1378 |
"original_question": question,
|
| 1379 |
"is_follow_up": False,
|
| 1380 |
+
"source_mode": "staff_kb" if kb_staff_directory else "staff_directory",
|
| 1381 |
}
|
| 1382 |
|
| 1383 |
# ---- Follow-up to the greeting menu ----
|