nikeshn committed on
Commit
567fa26
·
verified ·
1 Parent(s): 46b58d7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -6
app.py CHANGED
@@ -149,18 +149,174 @@ STAFF_DIRECTORY = [
149
  def _normalize_name_query(text: str):
150
  return [t for t in re.sub(r"[^a-z0-9 ]+", " ", (text or "").lower()).split() if t]
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def _match_staff_name(question: str):
153
  tokens = _normalize_name_query(question)
154
- if not tokens or len(tokens) > 3:
155
  return None
156
  ql = (question or "").strip().lower()
157
- blocked = ["who is", "who handles", "who can help", "contact", "librarian", "library", "systems", "medical"]
158
  if any(b in ql for b in blocked):
159
  return None
160
- for staff in STAFF_DIRECTORY:
161
  staff_tokens = set()
162
- for tok in staff["tokens"]:
163
  staff_tokens.update(_normalize_name_query(tok))
 
164
  if all(tok in staff_tokens for tok in tokens):
165
  return staff
166
  return None
@@ -194,6 +350,7 @@ GROUNDED_LIBRARY_MAP = {
194
  # ===== GLOBALS =====
195
  vectorstore = None
196
  http_client = None
 
197
 
198
 
199
  # ===== ANALYTICS DB =====
@@ -290,8 +447,12 @@ def log_query(question, tool, model, response_time, result_count=0, error=None):
290
 
291
  # ===== RAG SETUP =====
292
  def load_documents():
 
293
  docs = []
294
  files = glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))
 
 
 
295
  for filepath in files:
296
  try:
297
  with open(filepath, "r", encoding="utf-8") as f:
@@ -1201,12 +1362,14 @@ async def agent_query(req: AgentRequest):
1201
  if staff_match:
1202
  answer = _staff_name_answer(staff_match, question)
1203
  elapsed = time.time() - start
 
 
1204
  return {
1205
  "answer": answer,
1206
  "intent": "library_info",
1207
  "tools_used": ["staff_name_match"],
1208
  "search_results": [],
1209
- "sources": [],
1210
  "model_used": req.model,
1211
  "response_time": round(elapsed, 2),
1212
  "corrected_query": question,
@@ -1214,7 +1377,7 @@ async def agent_query(req: AgentRequest):
1214
  "database_query": question,
1215
  "original_question": question,
1216
  "is_follow_up": False,
1217
- "source_mode": "staff_directory",
1218
  }
1219
 
1220
  # ---- Follow-up to the greeting menu ----
 
149
  def _normalize_name_query(text: str):
150
  return [t for t in re.sub(r"[^a-z0-9 ]+", " ", (text or "").lower()).split() if t]
151
 
152
+ def _dedupe_keep_order(items):
153
+ seen = set()
154
+ out = []
155
+ for item in items:
156
+ if item and item not in seen:
157
+ seen.add(item)
158
+ out.append(item)
159
+ return out
160
+
161
+ def _title_case_name(name: str) -> str:
162
+ return re.sub(r"\s+", " ", (name or "").strip()).title()
163
+
164
def _build_staff_tokens(full_name: str):
    """Build the list of lookup strings stored for a staff member's name.

    The name is normalized twice: once as written and once with the
    honorifics ``dr``/``mr``/``mrs``/``ms``/``prof`` removed. For each
    non-empty word list the result contains every single word, the whole
    list joined by spaces, and every contiguous run of 2, 3 and 4 words.
    Duplicates are removed while keeping first-seen order.
    """
    honorifics = {"dr", "mr", "mrs", "ms", "prof"}
    with_titles = _normalize_name_query(full_name)
    without_titles = [word for word in with_titles if word not in honorifics]

    variants = []
    for words in (with_titles, without_titles):
        if not words:
            continue
        variants += words
        variants.append(" ".join(words))
        for size in (2, 3, 4):
            # range() is empty when the list is shorter than the window.
            window_count = len(words) - size + 1
            variants += [
                " ".join(words[start:start + size])
                for start in range(window_count)
            ]
    return _dedupe_keep_order(variants)
180
+
181
def _is_staff_name_line(line: str) -> bool:
    """Return True when a stripped *line* looks like a staff name heading.

    A heading is a non-empty, fully upper-case line of at most ten words that
    contains at least one letter and is neither a ``===`` rule nor a
    ``SOURCE:`` / ``TITLE:`` metadata line.
    """
    return bool(
        line
        and line == line.upper()
        and not line.startswith(("===", "SOURCE:", "TITLE:"))
        and any(ch.isalpha() for ch in line)
        and len(line.split()) <= 10
    )


def _parse_staff_directory_text(text: str):
    """Parse a plain-text staff directory into a list of staff entry dicts.

    The text is scanned for name headings (see ``_is_staff_name_line``). The
    lines between one heading and the next form that person's block, in which
    lines starting with ``Title:``, ``Email:``, ``Phone:``/``Work Phone:``,
    ``Mobile:``, ``Location:``, ``Best for:`` and ``Schedule appointment:``
    fill the matching field (a repeated label overwrites the earlier value).
    Blank lines and ``===`` rules are ignored; any other line is kept
    verbatim as an extra detail.

    A heading whose block yields no field value and no extra detail is
    skipped, so an all-caps section heading with nothing under it does not
    become a staff member with the default role.

    Args:
        text: Raw contents of a staff directory file. May be empty or None.

    Returns:
        A list of dicts with the keys ``full_name``, ``role``, ``details``,
        ``tokens``, ``source_title`` and ``source``, in document order.
    """
    staff_entries = []
    if not text:
        return staff_entries

    # Line label -> field it fills. No label is a prefix of another, so the
    # order here does not affect which one matches.
    field_prefixes = (
        ("Title:", "role"),
        ("Email:", "email"),
        ("Phone:", "phone"),
        ("Work Phone:", "phone"),
        ("Mobile:", "mobile"),
        ("Location:", "location"),
        ("Best for:", "best_for"),
        ("Schedule appointment:", "schedule"),
    )
    # Order in which populated fields appear in the "details" string.
    detail_labels = (
        ("best_for", "Best for"),
        ("email", "Email"),
        ("phone", "Phone"),
        ("mobile", "Mobile"),
        ("location", "Location"),
        ("schedule", "Schedule appointment"),
    )

    lines = [line.strip() for line in text.splitlines()]
    i = 0
    while i < len(lines):
        name_line = lines[i]
        i += 1
        if not _is_staff_name_line(name_line):
            continue

        # Collect everything up to (not including) the next name heading.
        block = []
        while i < len(lines) and not _is_staff_name_line(lines[i]):
            block.append(lines[i])
            i += 1

        fields = {}
        extra_bits = []
        for raw in block:
            if not raw or raw.startswith("==="):
                continue
            for prefix, key in field_prefixes:
                if raw.startswith(prefix):
                    fields[key] = raw.split(":", 1)[1].strip()
                    break
            else:
                extra_bits.append(raw)

        # A heading with nothing usable beneath it is not a staff record.
        if not extra_bits and not any(fields.values()):
            continue

        details_parts = [
            f"{label}: {fields[key]}"
            for key, label in detail_labels
            if fields.get(key)
        ]
        details_parts.extend(extra_bits)

        full_name = _title_case_name(name_line)
        staff_entries.append({
            "full_name": full_name,
            "role": fields.get("role") or "Library staff member",
            "details": " | ".join(_dedupe_keep_order(details_parts)),
            "tokens": _build_staff_tokens(full_name),
            "source_title": "Khalifa University Library Staff Directory and Contacts",
            "source": "https://library.ku.ac.ae/librarystaff",
        })

    return staff_entries
279
+
280
def _load_staff_directory_from_kb():
    """Load staff entries from the staff/contact ``.txt`` files in KNOWLEDGE_DIR.

    Only files whose name contains ``staff`` or ``contact`` and whose content
    mentions ``staff directory`` or ``library staff`` (case-insensitive) are
    parsed. Files are visited in sorted path order so that, when the same
    name appears in several files, the entry that is kept does not depend on
    the file system's listing order.

    A file that cannot be read or parsed is reported with ``print`` and
    skipped; the remaining files are still processed.

    Returns:
        The parsed staff entry dicts, de-duplicated by lower-cased
        ``full_name`` with the first occurrence kept.
    """
    entries = []
    for filepath in sorted(glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))):
        filename = os.path.basename(filepath).lower()
        if "staff" not in filename and "contact" not in filename:
            continue
        # Best-effort per file: one bad file must not drop all the others.
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
            lowered = content.lower()
            if "staff directory" not in lowered and "library staff" not in lowered:
                continue
            entries.extend(_parse_staff_directory_text(content))
        except Exception as e:
            print(f"Staff KB parse error: {e}")

    deduped = []
    seen = set()
    for entry in entries:
        key = entry.get("full_name", "").lower()
        if key and key not in seen:
            seen.add(key)
            deduped.append(entry)
    return deduped
303
+
304
def _staff_lookup_candidates():
    """Return the staff list to search for name matches.

    Uses ``kb_staff_directory`` when it is non-empty and falls back to
    ``STAFF_DIRECTORY`` otherwise.
    """
    if kb_staff_directory:
        return kb_staff_directory
    return STAFF_DIRECTORY
306
+
307
  def _match_staff_name(question: str):
308
  tokens = _normalize_name_query(question)
309
+ if not tokens or len(tokens) > 5:
310
  return None
311
  ql = (question or "").strip().lower()
312
+ blocked = ["who is", "who handles", "who can help", "contact", "librarian", "library"]
313
  if any(b in ql for b in blocked):
314
  return None
315
+ for staff in _staff_lookup_candidates():
316
  staff_tokens = set()
317
+ for tok in staff.get("tokens", []):
318
  staff_tokens.update(_normalize_name_query(tok))
319
+ staff_tokens.add(" ".join(_normalize_name_query(tok)))
320
  if all(tok in staff_tokens for tok in tokens):
321
  return staff
322
  return None
 
350
  # ===== GLOBALS =====
351
  vectorstore = None
352
  http_client = None
353
+ kb_staff_directory = []
354
 
355
 
356
  # ===== ANALYTICS DB =====
 
447
 
448
  # ===== RAG SETUP =====
449
  def load_documents():
450
+ global kb_staff_directory
451
  docs = []
452
  files = glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))
453
+ kb_staff_directory = _load_staff_directory_from_kb()
454
+ if kb_staff_directory:
455
+ print(f"Loaded {len(kb_staff_directory)} staff entries from KB")
456
  for filepath in files:
457
  try:
458
  with open(filepath, "r", encoding="utf-8") as f:
 
1362
  if staff_match:
1363
  answer = _staff_name_answer(staff_match, question)
1364
  elapsed = time.time() - start
1365
+ source_title = staff_match.get("source_title", "")
1366
+ source_url = staff_match.get("source", "")
1367
  return {
1368
  "answer": answer,
1369
  "intent": "library_info",
1370
  "tools_used": ["staff_name_match"],
1371
  "search_results": [],
1372
+ "sources": ([{"title": source_title, "source": source_url}] if source_title or source_url else []),
1373
  "model_used": req.model,
1374
  "response_time": round(elapsed, 2),
1375
  "corrected_query": question,
 
1377
  "database_query": question,
1378
  "original_question": question,
1379
  "is_follow_up": False,
1380
+ "source_mode": "staff_kb" if kb_staff_directory else "staff_directory",
1381
  }
1382
 
1383
  # ---- Follow-up to the greeting menu ----