Spaces:

nikeshn
/

kulibrary

Running

App Files Files Community

nikeshn committed 17 days ago

Commit

567fa26

verified ·

1 Parent(s): 46b58d7

Update app.py

Browse files

Files changed (1) hide show

app.py +169 -6

app.py CHANGED Viewed

@@ -149,18 +149,174 @@ STAFF_DIRECTORY = [
 def _normalize_name_query(text: str):
     return [t for t in re.sub(r"[^a-z0-9 ]+", " ", (text or "").lower()).split() if t]
 def _match_staff_name(question: str):
     tokens = _normalize_name_query(question)
-    if not tokens or len(tokens) > 3:
         return None
     ql = (question or "").strip().lower()
-    blocked = ["who is", "who handles", "who can help", "contact", "librarian", "library", "systems", "medical"]
     if any(b in ql for b in blocked):
         return None
-    for staff in STAFF_DIRECTORY:
         staff_tokens = set()
-        for tok in staff["tokens"]:
             staff_tokens.update(_normalize_name_query(tok))
         if all(tok in staff_tokens for tok in tokens):
             return staff
     return None
@@ -194,6 +350,7 @@ GROUNDED_LIBRARY_MAP = {
 # ===== GLOBALS =====
 vectorstore = None
 http_client = None
 # ===== ANALYTICS DB =====
@@ -290,8 +447,12 @@ def log_query(question, tool, model, response_time, result_count=0, error=None):
 # ===== RAG SETUP =====
 def load_documents():
     docs = []
     files = glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))
     for filepath in files:
         try:
             with open(filepath, "r", encoding="utf-8") as f:
@@ -1201,12 +1362,14 @@ async def agent_query(req: AgentRequest):
     if staff_match:
         answer = _staff_name_answer(staff_match, question)
         elapsed = time.time() - start
         return {
             "answer": answer,
             "intent": "library_info",
             "tools_used": ["staff_name_match"],
             "search_results": [],
-            "sources": [],
             "model_used": req.model,
             "response_time": round(elapsed, 2),
             "corrected_query": question,
@@ -1214,7 +1377,7 @@ async def agent_query(req: AgentRequest):
             "database_query": question,
             "original_question": question,
             "is_follow_up": False,
-            "source_mode": "staff_directory",
         }
     # ---- Follow-up to the greeting menu ----

 def _normalize_name_query(text: str):
     return [t for t in re.sub(r"[^a-z0-9 ]+", " ", (text or "").lower()).split() if t]
+def _dedupe_keep_order(items):
+    seen = set()
+    out = []
+    for item in items:
+        if item and item not in seen:
+            seen.add(item)
+            out.append(item)
+    return out
+def _title_case_name(name: str) -> str:
+    return re.sub(r"\s+", " ", (name or "").strip()).title()
def _build_staff_tokens(full_name: str):
    """Build the list of lookup strings under which a staff member can be found.

    The name is tokenised twice: once as written and once with the honorifics
    "dr", "mr", "mrs", "ms" and "prof" removed. For each token list the result
    contains, in this order: every single token, the full space-joined name,
    and every run of 2, 3 and 4 consecutive tokens. Duplicates are removed
    while keeping the first occurrence.

    Args:
        full_name: Staff member's display name.

    Returns:
        Ordered list of unique lookup strings.
    """
    honorifics = {"dr", "mr", "mrs", "ms", "prof"}
    with_titles = _normalize_name_query(full_name)
    without_titles = [word for word in with_titles if word not in honorifics]

    candidates = []
    for words in (with_titles, without_titles):
        if not words:
            continue
        candidates += words
        candidates.append(" ".join(words))
        for width in (2, 3, 4):
            # range() is empty when the name has fewer than `width` words.
            window_count = len(words) - width + 1
            for start in range(window_count):
                candidates.append(" ".join(words[start:start + width]))
    return _dedupe_keep_order(candidates)
def _is_staff_name_line(line: str) -> bool:
    """Return True when the stripped *line* looks like a staff-name heading.

    A heading is a non-empty, all-uppercase line of at most ten words that
    contains at least one letter and is not a "===", "SOURCE:" or "TITLE:" line.
    """
    return bool(
        line
        and line == line.upper()
        and not line.startswith(("===", "SOURCE:", "TITLE:"))
        and any(ch.isalpha() for ch in line)
        and len(line.split()) <= 10
    )


def _parse_staff_directory_text(text: str):
    """Parse a plain-text staff directory into a list of staff entries.

    The text is scanned for all-uppercase name headings (see
    ``_is_staff_name_line``). Every line between one heading and the next
    belongs to that person. Lines starting with a known label ("Title:",
    "Email:", "Phone:", "Work Phone:", "Mobile:", "Location:", "Best for:",
    "Schedule appointment:") fill the matching field, the last occurrence
    winning; any other non-empty line that does not start with "===" is kept
    verbatim as an extra detail.

    Args:
        text: Full contents of a directory file. ``None`` or empty is allowed.

    Returns:
        A list of dicts with the keys "full_name", "role", "details",
        "tokens", "source_title" and "source". "role" falls back to
        "Library staff member"; "details" is a " | "-joined summary.
    """
    staff_entries = []
    if not text:
        return staff_entries

    # (line prefix, field key). "Phone:" and "Work Phone:" share one field.
    field_prefixes = (
        ("Title:", "role"),
        ("Email:", "email"),
        ("Phone:", "phone"),
        ("Work Phone:", "phone"),
        ("Mobile:", "mobile"),
        ("Location:", "location"),
        ("Best for:", "best_for"),
        ("Schedule appointment:", "schedule"),
    )
    # (field key, label) in the order the details summary lists them.
    detail_labels = (
        ("best_for", "Best for"),
        ("email", "Email"),
        ("phone", "Phone"),
        ("mobile", "Mobile"),
        ("location", "Location"),
        ("schedule", "Schedule appointment"),
    )

    lines = [line.strip() for line in text.splitlines()]
    total = len(lines)
    i = 0
    while i < total:
        if not _is_staff_name_line(lines[i]):
            i += 1
            continue

        name_line = lines[i]
        i += 1
        # Everything up to (not including) the next heading is this person's block.
        block_start = i
        while i < total and not _is_staff_name_line(lines[i]):
            i += 1
        block = lines[block_start:i]

        fields = {}
        extra_bits = []
        for raw in block:
            if not raw or raw.startswith("==="):
                continue
            for prefix, key in field_prefixes:
                if raw.startswith(prefix):
                    fields[key] = raw.split(":", 1)[1].strip()
                    break
            else:
                # Unlabelled lines (LinkedIn/ORCID links, notes, ...) are kept as-is.
                extra_bits.append(raw)

        details_parts = [
            f"{label}: {fields[key]}"
            for key, label in detail_labels
            if fields.get(key)
        ]
        details_parts.extend(extra_bits)

        # NOTE(review): an all-caps section heading with no fields beneath it
        # still produces an entry here — confirm the KB files contain none.
        full_name = _title_case_name(name_line)
        staff_entries.append({
            "full_name": full_name,
            "role": fields.get("role") or "Library staff member",
            "details": " | ".join(_dedupe_keep_order(details_parts)),
            "tokens": _build_staff_tokens(full_name),
            "source_title": "Khalifa University Library Staff Directory and Contacts",
            "source": "https://library.ku.ac.ae/librarystaff",
        })
    return staff_entries
def _load_staff_directory_from_kb():
    """Collect staff entries from the staff/contact text files in KNOWLEDGE_DIR.

    Only ``*.txt`` files whose name contains "staff" or "contact" and whose
    content mentions "staff directory" or "library staff" (case-insensitive)
    are parsed. A file that cannot be read or parsed is reported with
    ``print`` and skipped; the remaining files are still processed.

    Returns:
        A list of staff entry dicts, de-duplicated case-insensitively on
        "full_name" with the first occurrence kept. Empty when nothing
        could be loaded.
    """
    entries = []
    # glob order depends on the filesystem; sorting makes the first-wins
    # de-duplication below deterministic across machines.
    for filepath in sorted(glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))):
        name = os.path.basename(filepath).lower()
        if "staff" not in name and "contact" not in name:
            continue
        # Best-effort per file: one bad file must not discard the others.
        try:
            with open(filepath, "r", encoding="utf-8") as f:
                content = f.read()
            lowered = content.lower()
            if "staff directory" not in lowered and "library staff" not in lowered:
                continue
            entries.extend(_parse_staff_directory_text(content))
        except Exception as e:
            print(f"Staff KB parse error: {e}")

    deduped = {}
    for entry in entries:
        key = entry.get("full_name", "").lower()
        if key and key not in deduped:
            deduped[key] = entry
    return list(deduped.values())
def _staff_lookup_candidates():
    """Return the staff list to search.

    The knowledge-base list ``kb_staff_directory`` is used when it is
    non-empty; otherwise the module-level ``STAFF_DIRECTORY`` is returned.
    """
    if kb_staff_directory:
        return kb_staff_directory
    return STAFF_DIRECTORY
 def _match_staff_name(question: str):
     tokens = _normalize_name_query(question)
+    if not tokens or len(tokens) > 5:
         return None
     ql = (question or "").strip().lower()
+    blocked = ["who is", "who handles", "who can help", "contact", "librarian", "library"]
     if any(b in ql for b in blocked):
         return None
+    for staff in _staff_lookup_candidates():
         staff_tokens = set()
+        for tok in staff.get("tokens", []):
             staff_tokens.update(_normalize_name_query(tok))
+            staff_tokens.add(" ".join(_normalize_name_query(tok)))
         if all(tok in staff_tokens for tok in tokens):
             return staff
     return None
 # ===== GLOBALS =====
 vectorstore = None
 http_client = None
+kb_staff_directory = []
 # ===== ANALYTICS DB =====
 # ===== RAG SETUP =====
 def load_documents():
+    global kb_staff_directory
     docs = []
     files = glob.glob(os.path.join(KNOWLEDGE_DIR, "*.txt"))
+    kb_staff_directory = _load_staff_directory_from_kb()
+    if kb_staff_directory:
+        print(f"Loaded {len(kb_staff_directory)} staff entries from KB")
     for filepath in files:
         try:
             with open(filepath, "r", encoding="utf-8") as f:
     if staff_match:
         answer = _staff_name_answer(staff_match, question)
         elapsed = time.time() - start
+        source_title = staff_match.get("source_title", "")
+        source_url = staff_match.get("source", "")
         return {
             "answer": answer,
             "intent": "library_info",
             "tools_used": ["staff_name_match"],
             "search_results": [],
+            "sources": ([{"title": source_title, "source": source_url}] if source_title or source_url else []),
             "model_used": req.model,
             "response_time": round(elapsed, 2),
             "corrected_query": question,
             "database_query": question,
             "original_question": question,
             "is_follow_up": False,
+            "source_mode": "staff_kb" if kb_staff_directory else "staff_directory",
         }
     # ---- Follow-up to the greeting menu ----