nikeshn committed on
Commit
1e10395
·
verified ·
1 Parent(s): fde9594

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -60
app.py CHANGED
@@ -1407,37 +1407,50 @@ def _make_boolean(text: str) -> str:
1407
  def _clean_database_keywords(boolean_query: str) -> str:
1408
  return re.sub(r'\s+', ' ', re.sub(r'\b(AND|OR|NOT)\b|[()"]', ' ', boolean_query, flags=re.IGNORECASE)).strip()
1409
 
 
 
 
 
 
 
 
1410
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
 
 
1411
  prompt = f"""You are a search expert for Khalifa University Library.
1412
 
1413
- The user typed a query that may be messy, fragmented, or use chat phrasing.
1414
- Create THREE forms:
1415
 
1416
- 1. corrected: spell-fixed version of the core topic (remove chat fragments like "and for", "also about")
1417
- 2. natural: a MEANINGFUL research phrase for AI tools (Consensus, Perplexity, Semantic Scholar, LeapSpace).
1418
- - Must be a proper research question or phrase β€” NOT raw keywords, NOT Boolean
1419
- - Should add helpful context: "physics" β†’ "recent advances in physics research"
1420
- - If input is fragmented ("and for physics", "also about AI") extract topic and expand it
1421
- - Aim for 5-10 words that a researcher would actually type into an AI tool
1422
- 3. boolean: PRIMO/PubMed Boolean with AND/OR/parentheses for traditional database search
1423
 
1424
- Examples:
1425
- Input: "and for physics"
1426
- β†’ corrected:"physics", natural:"recent advances and developments in physics research", boolean:("physics" OR "physical sciences")
 
 
 
1427
 
1428
- Input: "impuct glubal waming"
1429
- β†’ corrected:"impact global warming", natural:"impact of global warming on environment and climate", boolean:("global warming" OR "climate change") AND (impact OR effect)
1430
 
1431
- Input: "machne lerning helthcare"
1432
- β†’ corrected:"machine learning healthcare", natural:"machine learning applications in healthcare and medicine", boolean:("machine learning" OR "deep learning" OR AI) AND (healthcare OR clinical OR medical)
 
 
 
1433
 
1434
- Input: "renewable energy 2023"
1435
- β†’ corrected:"renewable energy 2023", natural:"renewable energy sources and sustainability research", boolean:("renewable energy" OR "clean energy" OR "solar energy"), year_from:"2023"
 
 
1436
 
1437
  Return ONLY valid JSON:
1438
- {{"corrected":"spell-fixed core topic","natural":"meaningful 5-10 word research phrase","boolean":"(A OR B) AND (C OR D)","year_from":"","year_to":"","peer_reviewed":false,"open_access":false}}
1439
 
1440
- Query: "{query}"""
1441
  try:
1442
  llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=300)
1443
  response = llm.invoke(prompt)
@@ -1460,11 +1473,12 @@ Query: "{query}"""
1460
  "open_access": bool(result.get("open_access", False)),
1461
  }
1462
  except Exception:
1463
- boolean = _make_boolean(query)
1464
- corrected = query.strip() or query
 
1465
  return {
1466
  "corrected": corrected,
1467
- "natural": corrected,
1468
  "boolean": boolean,
1469
  "database_query": _clean_database_keywords(boolean),
1470
  "year_from": "",
@@ -1514,17 +1528,32 @@ async def _interpret_semantics(question: str, history=None) -> dict:
1514
  if _looks_nonlibrary_ku_question(q):
1515
  return {"intent_hint": "general", "canonical_terms": canonical_terms, "grounding_keys": grounding_keys, "social": False}
1516
 
 
 
 
1517
  # Staff / role semantics
1518
- if re.search(r"\b(systems? librarian|system librarian|website|digital services|library systems?|technology help)\b", ql):
1519
  add("systems_help", "Walter Brian Hall", "Systems Librarian", "website", "technology")
1520
  if re.search(r"\b(database access|e-?resources?|remote access|off campus|off-campus|login issue|access problem|vendor issue)\b", ql):
1521
- add("database_access", "Rani Anand", "E-Resources", "database access")
 
 
 
1522
  if re.search(r"\b(orcid|open access|apc|article processing charge|research impact|bibliometric|bibliometrics|scival|scopus metrics?)\b", ql):
1523
- add("orcid_oa", "Walter Brian Hall", "ORCID", "Open Access", "APC", "openaccess@ku.ac.ae")
 
 
 
1524
  if re.search(r"\b(research support|research impact|bibliometrics|scival|khazna|scholarly communication|libguides)\b", ql):
1525
- add("research_help", "Nikesh Narayanan", "research support", "bibliometrics", "Khazna")
 
 
 
1526
  if re.search(r"\b(medical librarian|pubmed help|embase|cinahl|cochrane|uptodate|systematic review|clinical databases?)\b", ql):
1527
- add("medical_help", "Jason Fetty", "Medical Librarian", "PubMed", "systematic review")
 
 
 
1528
  if re.search(r"\b(acquisitions?|collection development|suggest a book|request a title|new title request|purchase request|book request)\b", ql):
1529
  add("acquisitions", "Alia Al-Harrasi", "Meera Alnaqbi", "Acquisitions", "collection development")
1530
  if re.search(r"\b(catalogu(?:e|ing)|cataloging|metadata|cataloguer)\b", ql):
@@ -1680,23 +1709,14 @@ def _ku_general_redirect_answer() -> str:
1680
  @app.post("/correct")
1681
  async def correct_query(req: CorrectRequest):
1682
  """
1683
- Spell-correct a search query and build both natural and Boolean forms
1684
- for traditional databases such as PRIMO and PubMed.
1685
- Pre-cleans the query to strip chat prefixes and connectors before processing.
 
 
1686
  """
1687
- # Strip chat prefixes, connectors, and follow-up fragments
1688
- # so "and for physics" β†’ "physics" before LLM processing
1689
  raw = req.query.strip()
1690
- cleaned = re.sub(
1691
- r'^(find articles and books on|find articles on|find books on|'
1692
- r'search for|look for|i want|i need|give me|show me|get me|'
1693
- r'and\s+(also\s+)?(for|about|on|in|related to|regarding)|'
1694
- r'also\s+(for|about|on|in)|what about|how about|or\s+about|'
1695
- r'tell me about|more\s+(about|on))\s+',
1696
- '', raw, flags=re.IGNORECASE
1697
- ).strip()
1698
- # Use cleaned query if it's non-empty, otherwise fall back to raw
1699
- query_to_use = cleaned if cleaned else raw
1700
  plan = await _build_search_plan(query_to_use, req.year)
1701
  return {
1702
  "corrected": plan["corrected"],
@@ -2392,10 +2412,15 @@ async def agent_query(req: AgentRequest):
2392
  if intent in ("search_academic", "search_medical"):
2393
  import asyncio as _asyncio
2394
  search_plan = await _build_search_plan(question)
 
2395
  natural_query = search_plan["natural"]
2396
  database_query = search_plan["database_query"] or search_plan["corrected"]
 
 
 
 
2397
 
2398
- tasks = [tool_search_primo(database_query, limit=5)]
2399
  if intent == "search_medical":
2400
  tasks.append(tool_search_pubmed(database_query, limit=3))
2401
  else:
@@ -2409,6 +2434,13 @@ async def agent_query(req: AgentRequest):
2409
  combined.extend(r["results"])
2410
  tools_used.append(r.get("source", "unknown"))
2411
 
 
 
 
 
 
 
 
2412
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2413
  tools_used.append("get_library_info")
2414
  tools_used = list(dict.fromkeys(tools_used))
@@ -2417,29 +2449,53 @@ async def agent_query(req: AgentRequest):
2417
  if rag.get("answer"):
2418
  context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2419
  if combined:
2420
- top = combined[:3]
2421
- res_text = "\n".join(
2422
- f"- {r.get('title','')} by {r.get('creator','')} ({r.get('date','')})"
2423
- for r in top
2424
- )
2425
- context_parts.append(f"Search Results:\n{res_text}")
 
 
 
 
 
 
 
 
2426
  context_parts.append(f"Natural query for AI tools: {natural_query}")
2427
  context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2428
 
2429
  behavior = get_behavior_instructions()
2430
- synthesis_prompt = (
2431
- f"{behavior}\n\n"
2432
- "You are the KU Library AI Assistant. Be concise (3-5 sentences).\n"
2433
- "Briefly describe the search direction and mention 1-2 top results if present.\n\n"
2434
- f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2435
- f"Question: {question}\nAnswer:"
2436
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2437
  try:
2438
  if use_claude:
2439
  from langchain_anthropic import ChatAnthropic
2440
- synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=600)
2441
  else:
2442
- synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=600)
2443
  answer = synth_llm.invoke(synthesis_prompt).content.strip()
2444
  except Exception as ex:
2445
  answer = rag.get("answer", f"Error generating answer: {ex}")
@@ -2448,8 +2504,9 @@ async def agent_query(req: AgentRequest):
2448
  return _make_agent_response(
2449
  answer=answer, intent=intent, tools_used=tools_used,
2450
  search_results=combined[:8], sources=rag.get("sources", []),
2451
- model=req.model, elapsed=elapsed, question=question,
2452
  natural_query=natural_query, database_query=database_query,
 
2453
  )
2454
 
2455
  # ── General / general_recent β€” web search or plain LLM ───────────
 
1407
  def _clean_database_keywords(boolean_query: str) -> str:
1408
  return re.sub(r'\s+', ' ', re.sub(r'\b(AND|OR|NOT)\b|[()"]', ' ', boolean_query, flags=re.IGNORECASE)).strip()
1409
 
1410
+ def _light_strip_retrieval_boilerplate(text: str) -> str:
1411
+ cleaned = re.sub(r'^\s*(please\s+)?(?:can you|could you|would you)\s+', '', (text or '').strip(), flags=re.IGNORECASE)
1412
+ cleaned = re.sub(r'^\s*(please\s+)?help me\s+', '', cleaned, flags=re.IGNORECASE)
1413
+ cleaned = re.sub(r'^\s*please\s+', '', cleaned, flags=re.IGNORECASE)
1414
+ cleaned = re.sub(r'\s+(please|thanks|thank you|asap)$', '', cleaned, flags=re.IGNORECASE)
1415
+ return re.sub(r'\s+', ' ', cleaned).strip()
1416
+
1417
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
1418
+ raw_query = (query or "").strip()
1419
+ light_query = _light_strip_retrieval_boilerplate(raw_query) or raw_query
1420
  prompt = f"""You are a search expert for Khalifa University Library.
1421
 
1422
+ The user typed a research request that may contain spelling issues or conversational phrasing.
1423
+ Create THREE forms while preserving the user's full intent.
1424
 
1425
+ 1. corrected: lightly edit the FULL request for spelling, grammar, and clarity.
1426
+ Preserve every substantive concept, constraint, and task.
1427
+ Do NOT collapse it into a short topic.
 
 
 
 
1428
 
1429
+ 2. natural: write a natural-language research query for AI tools such as Consensus,
1430
+ Perplexity, Semantic Scholar, Scopus AI, PRIMO AI, and LeapSpace.
1431
+ Keep the full context and constraints, but you may remove only retrieval boilerplate
1432
+ such as "find an article", "show me", "get me", or "can you find".
1433
+ Keep constraints like one article vs many, peer reviewed, last five years,
1434
+ summarize, methodology, findings, strengths, limitations, and contribution.
1435
 
1436
+ 3. boolean: write a PRIMO/PubMed Boolean query with AND/OR/parentheses that preserves
1437
+ the main topical concepts and important search constraints.
1438
 
1439
+ Examples:
1440
+ Input: "Find one peer-reviewed article from the last five years on climate change and biodiversity. Summarize the methodology and findings."
1441
+ β†’ corrected:"Find one peer-reviewed article from the last five years on climate change and biodiversity. Summarize the methodology and findings."
1442
+ β†’ natural:"Peer-reviewed article from the last five years on climate change and biodiversity. Summarize the methodology and findings."
1443
+ β†’ boolean:("climate change" OR "global warming") AND (biodiversity OR ecosystems)
1444
 
1445
+ Input: "impuct glubal waming on biodiversty"
1446
+ β†’ corrected:"impact of global warming on biodiversity"
1447
+ β†’ natural:"impact of global warming on biodiversity"
1448
+ β†’ boolean:("global warming" OR "climate change") AND biodiversity
1449
 
1450
  Return ONLY valid JSON:
1451
+ {{"corrected":"full polished request","natural":"full natural-language research query","boolean":"(A OR B) AND (C OR D)","year_from":"","year_to":"","peer_reviewed":false,"open_access":false}}
1452
 
1453
+ Query: "{light_query}"""
1454
  try:
1455
  llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=300)
1456
  response = llm.invoke(prompt)
 
1473
  "open_access": bool(result.get("open_access", False)),
1474
  }
1475
  except Exception:
1476
+ boolean = _make_boolean(raw_query)
1477
+ corrected = raw_query.strip() or raw_query
1478
+ natural = _light_strip_retrieval_boilerplate(raw_query) or corrected
1479
  return {
1480
  "corrected": corrected,
1481
+ "natural": natural,
1482
  "boolean": boolean,
1483
  "database_query": _clean_database_keywords(boolean),
1484
  "year_from": "",
 
1528
  if _looks_nonlibrary_ku_question(q):
1529
  return {"intent_hint": "general", "canonical_terms": canonical_terms, "grounding_keys": grounding_keys, "social": False}
1530
 
1531
+ contact_or_support = bool(re.search(r"\b(who handles|who can help|who should i contact|contact for|email for|phone for|librarian for|help with access|access problem|login issue|remote access problem|vendor issue|technical issue|support)\b", ql))
1532
+ resource_or_search_task = bool(re.search(r"\b(best|which|recommend|suggest|compare|difference|find|search|articles?|papers?|books?|literature|study|studies|review|summariz(?:e|ing)|summaris(?:e|ing)|evaluate|critique|one article|single article|latest|recent|last \d+ years?)\b", ql))
1533
+
1534
  # Staff / role semantics
1535
+ if re.search(r"\b(systems? librarian|system librarian|website|digital services|library systems?|technology help)\b", ql) and contact_or_support:
1536
  add("systems_help", "Walter Brian Hall", "Systems Librarian", "website", "technology")
1537
  if re.search(r"\b(database access|e-?resources?|remote access|off campus|off-campus|login issue|access problem|vendor issue)\b", ql):
1538
+ if contact_or_support or re.search(r"\b(access problem|login issue|vendor issue|remote access problem|off campus access)\b", ql):
1539
+ add("database_access", "Rani Anand", "E-Resources", "database access")
1540
+ else:
1541
+ add("database_access", "databases", "e-resources", "remote access")
1542
  if re.search(r"\b(orcid|open access|apc|article processing charge|research impact|bibliometric|bibliometrics|scival|scopus metrics?)\b", ql):
1543
+ if contact_or_support and not resource_or_search_task:
1544
+ add("orcid_oa", "Walter Brian Hall", "ORCID", "Open Access", "APC", "openaccess@ku.ac.ae")
1545
+ else:
1546
+ add("orcid_oa", "ORCID", "Open Access", "APC", "research impact")
1547
  if re.search(r"\b(research support|research impact|bibliometrics|scival|khazna|scholarly communication|libguides)\b", ql):
1548
+ if contact_or_support and not resource_or_search_task:
1549
+ add("research_help", "Nikesh Narayanan", "research support", "bibliometrics", "Khazna")
1550
+ else:
1551
+ add("research_help", "research support", "bibliometrics", "Khazna")
1552
  if re.search(r"\b(medical librarian|pubmed help|embase|cinahl|cochrane|uptodate|systematic review|clinical databases?)\b", ql):
1553
+ if contact_or_support and not resource_or_search_task:
1554
+ add("medical_help", "Jason Fetty", "Medical Librarian", "PubMed", "systematic review")
1555
+ else:
1556
+ add("medical_help", "PubMed", "Embase", "CINAHL", "Cochrane", "UpToDate")
1557
  if re.search(r"\b(acquisitions?|collection development|suggest a book|request a title|new title request|purchase request|book request)\b", ql):
1558
  add("acquisitions", "Alia Al-Harrasi", "Meera Alnaqbi", "Acquisitions", "collection development")
1559
  if re.search(r"\b(catalogu(?:e|ing)|cataloging|metadata|cataloguer)\b", ql):
 
1709
  @app.post("/correct")
1710
  async def correct_query(req: CorrectRequest):
1711
  """
1712
+ Build three query forms from the user's full request:
1713
+ - corrected: polished full request
1714
+ - natural: full natural-language AI-tool query
1715
+ - boolean: database-ready Boolean query
1716
+ Only light cleanup is applied before planning.
1717
  """
 
 
1718
  raw = req.query.strip()
1719
+ query_to_use = _light_strip_retrieval_boilerplate(raw) or raw
 
 
 
 
 
 
 
 
 
1720
  plan = await _build_search_plan(query_to_use, req.year)
1721
  return {
1722
  "corrected": plan["corrected"],
 
2412
  if intent in ("search_academic", "search_medical"):
2413
  import asyncio as _asyncio
2414
  search_plan = await _build_search_plan(question)
2415
+ corrected_query = search_plan["corrected"]
2416
  natural_query = search_plan["natural"]
2417
  database_query = search_plan["database_query"] or search_plan["corrected"]
2418
+ year_from = search_plan.get("year_from") or None
2419
+ year_to = search_plan.get("year_to") or None
2420
+ peer_reviewed = bool(search_plan.get("peer_reviewed"))
2421
+ open_access = bool(search_plan.get("open_access"))
2422
 
2423
+ tasks = [tool_search_primo(database_query, limit=5, peer_reviewed=peer_reviewed, open_access=open_access, year_from=year_from, year_to=year_to)]
2424
  if intent == "search_medical":
2425
  tasks.append(tool_search_pubmed(database_query, limit=3))
2426
  else:
 
2434
  combined.extend(r["results"])
2435
  tools_used.append(r.get("source", "unknown"))
2436
 
2437
+ wants_single_article = bool(re.search(r"\b(one|single)\s+(peer.?reviewed\s+)?(article|paper|study)\b", question, re.IGNORECASE))
2438
+ wants_structured_summary = bool(re.search(r"\b(summariz(?:e|ing)|summaris(?:e|ing)|main research question|methodology|methods?|key findings|strengths?|limitations?|critical(?:ly)? evaluate|critique|contribution)\b", question, re.IGNORECASE))
2439
+ if wants_single_article or wants_structured_summary:
2440
+ preferred = [r for r in combined if r.get("_source") in ("Semantic Scholar", "PubMed")]
2441
+ remainder = [r for r in combined if r.get("_source") not in ("Semantic Scholar", "PubMed")]
2442
+ combined = preferred + remainder
2443
+
2444
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2445
  tools_used.append("get_library_info")
2446
  tools_used = list(dict.fromkeys(tools_used))
 
2449
  if rag.get("answer"):
2450
  context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2451
  if combined:
2452
+ top = combined[:5]
2453
+ res_lines = []
2454
+ for idx, r in enumerate(top, 1):
2455
+ res_lines.append(
2456
+ f"{idx}. Title: {r.get('title','')}\n"
2457
+ f" Authors: {r.get('creator','')}\n"
2458
+ f" Year: {r.get('date','')}\n"
2459
+ f" Source: {r.get('source','')}\n"
2460
+ f" Type: {r.get('type','')}\n"
2461
+ f" DOI: {r.get('doi','')}\n"
2462
+ f" Link: {r.get('link','')}\n"
2463
+ f" Abstract/Description: {r.get('description','')}"
2464
+ )
2465
+ context_parts.append("Candidate Search Results:\n" + "\n\n".join(res_lines))
2466
  context_parts.append(f"Natural query for AI tools: {natural_query}")
2467
  context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2468
 
2469
  behavior = get_behavior_instructions()
2470
+ if wants_single_article or wants_structured_summary:
2471
+ synthesis_prompt = (
2472
+ f"{behavior}\n\n"
2473
+ "You are the KU Library AI Assistant.\n"
2474
+ "The user wants a direct answer in the chat, not just search directions.\n"
2475
+ "Choose the single best-matching article from the candidate results, following the user's constraints as closely as possible.\n"
2476
+ "Prefer a result with an abstract/description when available.\n"
2477
+ "If exact compliance is uncertain, say so briefly.\n"
2478
+ "If you rely only on metadata/abstract rather than full text, say that explicitly.\n\n"
2479
+ "Format your answer with these headings when relevant:\n"
2480
+ "Recommended article\nWhy it fits\nMain research question\nMethodology\nKey findings\nStrengths of the evidence\nLimitations of the evidence\nContribution to current understanding\n\n"
2481
+ f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2482
+ f"Question: {question}\nAnswer:"
2483
+ )
2484
+ else:
2485
+ synthesis_prompt = (
2486
+ f"{behavior}\n\n"
2487
+ "You are the KU Library AI Assistant. Be concise but helpful (4-7 sentences).\n"
2488
+ "Answer the user's search request directly, mention the search direction, and mention 1-3 strong results when present.\n"
2489
+ "Keep the answer in the chat rather than redirecting the user elsewhere.\n\n"
2490
+ f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2491
+ f"Question: {question}\nAnswer:"
2492
+ )
2493
  try:
2494
  if use_claude:
2495
  from langchain_anthropic import ChatAnthropic
2496
+ synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=900)
2497
  else:
2498
+ synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=900)
2499
  answer = synth_llm.invoke(synthesis_prompt).content.strip()
2500
  except Exception as ex:
2501
  answer = rag.get("answer", f"Error generating answer: {ex}")
 
2504
  return _make_agent_response(
2505
  answer=answer, intent=intent, tools_used=tools_used,
2506
  search_results=combined[:8], sources=rag.get("sources", []),
2507
+ model=req.model, elapsed=elapsed, question=corrected_query,
2508
  natural_query=natural_query, database_query=database_query,
2509
+ original_question=question,
2510
  )
2511
 
2512
  # ── General / general_recent β€” web search or plain LLM ───────────