nikeshn committed on
Commit
1e10395
·
verified ·
1 Parent(s): fde9594

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -60
app.py CHANGED
@@ -1407,37 +1407,50 @@ def _make_boolean(text: str) -> str:
1407
  def _clean_database_keywords(boolean_query: str) -> str:
1408
  return re.sub(r'\s+', ' ', re.sub(r'\b(AND|OR|NOT)\b|[()"]', ' ', boolean_query, flags=re.IGNORECASE)).strip()
1409
 
 
 
 
 
 
 
 
1410
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
 
 
1411
  prompt = f"""You are a search expert for Khalifa University Library.
1412
 
1413
- The user typed a query that may be messy, fragmented, or use chat phrasing.
1414
- Create THREE forms:
1415
 
1416
- 1. corrected: spell-fixed version of the core topic (remove chat fragments like "and for", "also about")
1417
- 2. natural: a MEANINGFUL research phrase for AI tools (Consensus, Perplexity, Semantic Scholar, LeapSpace).
1418
- - Must be a proper research question or phrase β€” NOT raw keywords, NOT Boolean
1419
- - Should add helpful context: "physics" β†’ "recent advances in physics research"
1420
- - If input is fragmented ("and for physics", "also about AI") extract topic and expand it
1421
- - Aim for 5-10 words that a researcher would actually type into an AI tool
1422
- 3. boolean: PRIMO/PubMed Boolean with AND/OR/parentheses for traditional database search
1423
 
1424
- Examples:
1425
- Input: "and for physics"
1426
- β†’ corrected:"physics", natural:"recent advances and developments in physics research", boolean:("physics" OR "physical sciences")
 
 
 
1427
 
1428
- Input: "impuct glubal waming"
1429
- β†’ corrected:"impact global warming", natural:"impact of global warming on environment and climate", boolean:("global warming" OR "climate change") AND (impact OR effect)
1430
 
1431
- Input: "machne lerning helthcare"
1432
- β†’ corrected:"machine learning healthcare", natural:"machine learning applications in healthcare and medicine", boolean:("machine learning" OR "deep learning" OR AI) AND (healthcare OR clinical OR medical)
 
 
 
1433
 
1434
- Input: "renewable energy 2023"
1435
- β†’ corrected:"renewable energy 2023", natural:"renewable energy sources and sustainability research", boolean:("renewable energy" OR "clean energy" OR "solar energy"), year_from:"2023"
 
 
1436
 
1437
  Return ONLY valid JSON:
1438
- {{"corrected":"spell-fixed core topic","natural":"meaningful 5-10 word research phrase","boolean":"(A OR B) AND (C OR D)","year_from":"","year_to":"","peer_reviewed":false,"open_access":false}}
1439
 
1440
- Query: "{query}"""
1441
  try:
1442
  llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=300)
1443
  response = llm.invoke(prompt)
@@ -1460,11 +1473,12 @@ Query: "{query}"""
1460
  "open_access": bool(result.get("open_access", False)),
1461
  }
1462
  except Exception:
1463
- boolean = _make_boolean(query)
1464
- corrected = query.strip() or query
 
1465
  return {
1466
  "corrected": corrected,
1467
- "natural": corrected,
1468
  "boolean": boolean,
1469
  "database_query": _clean_database_keywords(boolean),
1470
  "year_from": "",
@@ -1514,17 +1528,32 @@ async def _interpret_semantics(question: str, history=None) -> dict:
1514
  if _looks_nonlibrary_ku_question(q):
1515
  return {"intent_hint": "general", "canonical_terms": canonical_terms, "grounding_keys": grounding_keys, "social": False}
1516
 
 
 
 
1517
  # Staff / role semantics
1518
- if re.search(r"\b(systems? librarian|system librarian|website|digital services|library systems?|technology help)\b", ql):
1519
  add("systems_help", "Walter Brian Hall", "Systems Librarian", "website", "technology")
1520
  if re.search(r"\b(database access|e-?resources?|remote access|off campus|off-campus|login issue|access problem|vendor issue)\b", ql):
1521
- add("database_access", "Rani Anand", "E-Resources", "database access")
 
 
 
1522
  if re.search(r"\b(orcid|open access|apc|article processing charge|research impact|bibliometric|bibliometrics|scival|scopus metrics?)\b", ql):
1523
- add("orcid_oa", "Walter Brian Hall", "ORCID", "Open Access", "APC", "openaccess@ku.ac.ae")
 
 
 
1524
  if re.search(r"\b(research support|research impact|bibliometrics|scival|khazna|scholarly communication|libguides)\b", ql):
1525
- add("research_help", "Nikesh Narayanan", "research support", "bibliometrics", "Khazna")
 
 
 
1526
  if re.search(r"\b(medical librarian|pubmed help|embase|cinahl|cochrane|uptodate|systematic review|clinical databases?)\b", ql):
1527
- add("medical_help", "Jason Fetty", "Medical Librarian", "PubMed", "systematic review")
 
 
 
1528
  if re.search(r"\b(acquisitions?|collection development|suggest a book|request a title|new title request|purchase request|book request)\b", ql):
1529
  add("acquisitions", "Alia Al-Harrasi", "Meera Alnaqbi", "Acquisitions", "collection development")
1530
  if re.search(r"\b(catalogu(?:e|ing)|cataloging|metadata|cataloguer)\b", ql):
@@ -1680,23 +1709,14 @@ def _ku_general_redirect_answer() -> str:
1680
  @app.post("/correct")
1681
  async def correct_query(req: CorrectRequest):
1682
  """
1683
- Spell-correct a search query and build both natural and Boolean forms
1684
- for traditional databases such as PRIMO and PubMed.
1685
- Pre-cleans the query to strip chat prefixes and connectors before processing.
 
 
1686
  """
1687
- # Strip chat prefixes, connectors, and follow-up fragments
1688
- # so "and for physics" β†’ "physics" before LLM processing
1689
  raw = req.query.strip()
1690
- cleaned = re.sub(
1691
- r'^(find articles and books on|find articles on|find books on|'
1692
- r'search for|look for|i want|i need|give me|show me|get me|'
1693
- r'and\s+(also\s+)?(for|about|on|in|related to|regarding)|'
1694
- r'also\s+(for|about|on|in)|what about|how about|or\s+about|'
1695
- r'tell me about|more\s+(about|on))\s+',
1696
- '', raw, flags=re.IGNORECASE
1697
- ).strip()
1698
- # Use cleaned query if it's non-empty, otherwise fall back to raw
1699
- query_to_use = cleaned if cleaned else raw
1700
  plan = await _build_search_plan(query_to_use, req.year)
1701
  return {
1702
  "corrected": plan["corrected"],
@@ -2392,10 +2412,15 @@ async def agent_query(req: AgentRequest):
2392
  if intent in ("search_academic", "search_medical"):
2393
  import asyncio as _asyncio
2394
  search_plan = await _build_search_plan(question)
 
2395
  natural_query = search_plan["natural"]
2396
  database_query = search_plan["database_query"] or search_plan["corrected"]
 
 
 
 
2397
 
2398
- tasks = [tool_search_primo(database_query, limit=5)]
2399
  if intent == "search_medical":
2400
  tasks.append(tool_search_pubmed(database_query, limit=3))
2401
  else:
@@ -2409,6 +2434,13 @@ async def agent_query(req: AgentRequest):
2409
  combined.extend(r["results"])
2410
  tools_used.append(r.get("source", "unknown"))
2411
 
 
 
 
 
 
 
 
2412
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2413
  tools_used.append("get_library_info")
2414
  tools_used = list(dict.fromkeys(tools_used))
@@ -2417,29 +2449,53 @@ async def agent_query(req: AgentRequest):
2417
  if rag.get("answer"):
2418
  context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2419
  if combined:
2420
- top = combined[:3]
2421
- res_text = "\n".join(
2422
- f"- {r.get('title','')} by {r.get('creator','')} ({r.get('date','')})"
2423
- for r in top
2424
- )
2425
- context_parts.append(f"Search Results:\n{res_text}")
 
 
 
 
 
 
 
 
2426
  context_parts.append(f"Natural query for AI tools: {natural_query}")
2427
  context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2428
 
2429
  behavior = get_behavior_instructions()
2430
- synthesis_prompt = (
2431
- f"{behavior}\n\n"
2432
- "You are the KU Library AI Assistant. Be concise (3-5 sentences).\n"
2433
- "Briefly describe the search direction and mention 1-2 top results if present.\n\n"
2434
- f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2435
- f"Question: {question}\nAnswer:"
2436
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2437
  try:
2438
  if use_claude:
2439
  from langchain_anthropic import ChatAnthropic
2440
- synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=600)
2441
  else:
2442
- synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=600)
2443
  answer = synth_llm.invoke(synthesis_prompt).content.strip()
2444
  except Exception as ex:
2445
  answer = rag.get("answer", f"Error generating answer: {ex}")
@@ -2448,8 +2504,9 @@ async def agent_query(req: AgentRequest):
2448
  return _make_agent_response(
2449
  answer=answer, intent=intent, tools_used=tools_used,
2450
  search_results=combined[:8], sources=rag.get("sources", []),
2451
- model=req.model, elapsed=elapsed, question=question,
2452
  natural_query=natural_query, database_query=database_query,
 
2453
  )
2454
 
2455
  # ── General / general_recent β€” web search or plain LLM ───────────
 
1407
  def _clean_database_keywords(boolean_query: str) -> str:
1408
  return re.sub(r'\s+', ' ', re.sub(r'\b(AND|OR|NOT)\b|[()"]', ' ', boolean_query, flags=re.IGNORECASE)).strip()
1409
 
1410
+ def _light_strip_retrieval_boilerplate(text: str) -> str:
1411
+ cleaned = re.sub(r'^\s*(please\s+)?(?:can you|could you|would you)\s+', '', (text or '').strip(), flags=re.IGNORECASE)
1412
+ cleaned = re.sub(r'^\s*(please\s+)?help me\s+', '', cleaned, flags=re.IGNORECASE)
1413
+ cleaned = re.sub(r'^\s*please\s+', '', cleaned, flags=re.IGNORECASE)
1414
+ cleaned = re.sub(r'\s+(please|thanks|thank you|asap)$', '', cleaned, flags=re.IGNORECASE)
1415
+ return re.sub(r'\s+', ' ', cleaned).strip()
1416
+
1417
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
1418
+ raw_query = (query or "").strip()
1419
+ light_query = _light_strip_retrieval_boilerplate(raw_query) or raw_query
1420
  prompt = f"""You are a search expert for Khalifa University Library.
1421
 
1422
+ The user typed a research request that may contain spelling issues or conversational phrasing.
1423
+ Create THREE forms while preserving the user's full intent.
1424
 
1425
+ 1. corrected: lightly edit the FULL request for spelling, grammar, and clarity.
1426
+ Preserve every substantive concept, constraint, and task.
1427
+ Do NOT collapse it into a short topic.
 
 
 
 
1428
 
1429
+ 2. natural: write a natural-language research query for AI tools such as Consensus,
1430
+ Perplexity, Semantic Scholar, Scopus AI, PRIMO AI, and LeapSpace.
1431
+ Keep the full context and constraints, but you may remove only retrieval boilerplate
1432
+ such as "find an article", "show me", "get me", or "can you find".
1433
+ Keep constraints like one article vs many, peer reviewed, last five years,
1434
+ summarize, methodology, findings, strengths, limitations, and contribution.
1435
 
1436
+ 3. boolean: write a PRIMO/PubMed Boolean query with AND/OR/parentheses that preserves
1437
+ the main topical concepts and important search constraints.
1438
 
1439
+ Examples:
1440
+ Input: "Find one peer-reviewed article from the last five years on climate change and biodiversity. Summarize the methodology and findings."
1441
+ β†’ corrected:"Find one peer-reviewed article from the last five years on climate change and biodiversity. Summarize the methodology and findings."
1442
+ β†’ natural:"Peer-reviewed article from the last five years on climate change and biodiversity. Summarize the methodology and findings."
1443
+ β†’ boolean:("climate change" OR "global warming") AND (biodiversity OR ecosystems)
1444
 
1445
+ Input: "impuct glubal waming on biodiversty"
1446
+ β†’ corrected:"impact of global warming on biodiversity"
1447
+ β†’ natural:"impact of global warming on biodiversity"
1448
+ β†’ boolean:("global warming" OR "climate change") AND biodiversity
1449
 
1450
  Return ONLY valid JSON:
1451
+ {{"corrected":"full polished request","natural":"full natural-language research query","boolean":"(A OR B) AND (C OR D)","year_from":"","year_to":"","peer_reviewed":false,"open_access":false}}
1452
 
1453
+ Query: "{light_query}"""
1454
  try:
1455
  llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, max_tokens=300)
1456
  response = llm.invoke(prompt)
 
1473
  "open_access": bool(result.get("open_access", False)),
1474
  }
1475
  except Exception:
1476
+ boolean = _make_boolean(raw_query)
1477
+ corrected = raw_query.strip() or raw_query
1478
+ natural = _light_strip_retrieval_boilerplate(raw_query) or corrected
1479
  return {
1480
  "corrected": corrected,
1481
+ "natural": natural,
1482
  "boolean": boolean,
1483
  "database_query": _clean_database_keywords(boolean),
1484
  "year_from": "",
 
1528
  if _looks_nonlibrary_ku_question(q):
1529
  return {"intent_hint": "general", "canonical_terms": canonical_terms, "grounding_keys": grounding_keys, "social": False}
1530
 
1531
+ contact_or_support = bool(re.search(r"\b(who handles|who can help|who should i contact|contact for|email for|phone for|librarian for|help with access|access problem|login issue|remote access problem|vendor issue|technical issue|support)\b", ql))
1532
+ resource_or_search_task = bool(re.search(r"\b(best|which|recommend|suggest|compare|difference|find|search|articles?|papers?|books?|literature|study|studies|review|summariz(?:e|ing)|summaris(?:e|ing)|evaluate|critique|one article|single article|latest|recent|last \d+ years?)\b", ql))
1533
+
1534
  # Staff / role semantics
1535
+ if re.search(r"\b(systems? librarian|system librarian|website|digital services|library systems?|technology help)\b", ql) and contact_or_support:
1536
  add("systems_help", "Walter Brian Hall", "Systems Librarian", "website", "technology")
1537
  if re.search(r"\b(database access|e-?resources?|remote access|off campus|off-campus|login issue|access problem|vendor issue)\b", ql):
1538
+ if contact_or_support or re.search(r"\b(access problem|login issue|vendor issue|remote access problem|off campus access)\b", ql):
1539
+ add("database_access", "Rani Anand", "E-Resources", "database access")
1540
+ else:
1541
+ add("database_access", "databases", "e-resources", "remote access")
1542
  if re.search(r"\b(orcid|open access|apc|article processing charge|research impact|bibliometric|bibliometrics|scival|scopus metrics?)\b", ql):
1543
+ if contact_or_support and not resource_or_search_task:
1544
+ add("orcid_oa", "Walter Brian Hall", "ORCID", "Open Access", "APC", "openaccess@ku.ac.ae")
1545
+ else:
1546
+ add("orcid_oa", "ORCID", "Open Access", "APC", "research impact")
1547
  if re.search(r"\b(research support|research impact|bibliometrics|scival|khazna|scholarly communication|libguides)\b", ql):
1548
+ if contact_or_support and not resource_or_search_task:
1549
+ add("research_help", "Nikesh Narayanan", "research support", "bibliometrics", "Khazna")
1550
+ else:
1551
+ add("research_help", "research support", "bibliometrics", "Khazna")
1552
  if re.search(r"\b(medical librarian|pubmed help|embase|cinahl|cochrane|uptodate|systematic review|clinical databases?)\b", ql):
1553
+ if contact_or_support and not resource_or_search_task:
1554
+ add("medical_help", "Jason Fetty", "Medical Librarian", "PubMed", "systematic review")
1555
+ else:
1556
+ add("medical_help", "PubMed", "Embase", "CINAHL", "Cochrane", "UpToDate")
1557
  if re.search(r"\b(acquisitions?|collection development|suggest a book|request a title|new title request|purchase request|book request)\b", ql):
1558
  add("acquisitions", "Alia Al-Harrasi", "Meera Alnaqbi", "Acquisitions", "collection development")
1559
  if re.search(r"\b(catalogu(?:e|ing)|cataloging|metadata|cataloguer)\b", ql):
 
1709
  @app.post("/correct")
1710
  async def correct_query(req: CorrectRequest):
1711
  """
1712
+ Build three query forms from the user's full request:
1713
+ - corrected: polished full request
1714
+ - natural: full natural-language AI-tool query
1715
+ - boolean: database-ready Boolean query
1716
+ Only light cleanup is applied before planning.
1717
  """
 
 
1718
  raw = req.query.strip()
1719
+ query_to_use = _light_strip_retrieval_boilerplate(raw) or raw
 
 
 
 
 
 
 
 
 
1720
  plan = await _build_search_plan(query_to_use, req.year)
1721
  return {
1722
  "corrected": plan["corrected"],
 
2412
  if intent in ("search_academic", "search_medical"):
2413
  import asyncio as _asyncio
2414
  search_plan = await _build_search_plan(question)
2415
+ corrected_query = search_plan["corrected"]
2416
  natural_query = search_plan["natural"]
2417
  database_query = search_plan["database_query"] or search_plan["corrected"]
2418
+ year_from = search_plan.get("year_from") or None
2419
+ year_to = search_plan.get("year_to") or None
2420
+ peer_reviewed = bool(search_plan.get("peer_reviewed"))
2421
+ open_access = bool(search_plan.get("open_access"))
2422
 
2423
+ tasks = [tool_search_primo(database_query, limit=5, peer_reviewed=peer_reviewed, open_access=open_access, year_from=year_from, year_to=year_to)]
2424
  if intent == "search_medical":
2425
  tasks.append(tool_search_pubmed(database_query, limit=3))
2426
  else:
 
2434
  combined.extend(r["results"])
2435
  tools_used.append(r.get("source", "unknown"))
2436
 
2437
+ wants_single_article = bool(re.search(r"\b(one|single)\s+(peer.?reviewed\s+)?(article|paper|study)\b", question, re.IGNORECASE))
2438
+ wants_structured_summary = bool(re.search(r"\b(summariz(?:e|ing)|summaris(?:e|ing)|main research question|methodology|methods?|key findings|strengths?|limitations?|critical(?:ly)? evaluate|critique|contribution)\b", question, re.IGNORECASE))
2439
+ if wants_single_article or wants_structured_summary:
2440
+ preferred = [r for r in combined if r.get("_source") in ("Semantic Scholar", "PubMed")]
2441
+ remainder = [r for r in combined if r.get("_source") not in ("Semantic Scholar", "PubMed")]
2442
+ combined = preferred + remainder
2443
+
2444
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2445
  tools_used.append("get_library_info")
2446
  tools_used = list(dict.fromkeys(tools_used))
 
2449
  if rag.get("answer"):
2450
  context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2451
  if combined:
2452
+ top = combined[:5]
2453
+ res_lines = []
2454
+ for idx, r in enumerate(top, 1):
2455
+ res_lines.append(
2456
+ f"{idx}. Title: {r.get('title','')}\n"
2457
+ f" Authors: {r.get('creator','')}\n"
2458
+ f" Year: {r.get('date','')}\n"
2459
+ f" Source: {r.get('source','')}\n"
2460
+ f" Type: {r.get('type','')}\n"
2461
+ f" DOI: {r.get('doi','')}\n"
2462
+ f" Link: {r.get('link','')}\n"
2463
+ f" Abstract/Description: {r.get('description','')}"
2464
+ )
2465
+ context_parts.append("Candidate Search Results:\n" + "\n\n".join(res_lines))
2466
  context_parts.append(f"Natural query for AI tools: {natural_query}")
2467
  context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2468
 
2469
  behavior = get_behavior_instructions()
2470
+ if wants_single_article or wants_structured_summary:
2471
+ synthesis_prompt = (
2472
+ f"{behavior}\n\n"
2473
+ "You are the KU Library AI Assistant.\n"
2474
+ "The user wants a direct answer in the chat, not just search directions.\n"
2475
+ "Choose the single best-matching article from the candidate results, following the user's constraints as closely as possible.\n"
2476
+ "Prefer a result with an abstract/description when available.\n"
2477
+ "If exact compliance is uncertain, say so briefly.\n"
2478
+ "If you rely only on metadata/abstract rather than full text, say that explicitly.\n\n"
2479
+ "Format your answer with these headings when relevant:\n"
2480
+ "Recommended article\nWhy it fits\nMain research question\nMethodology\nKey findings\nStrengths of the evidence\nLimitations of the evidence\nContribution to current understanding\n\n"
2481
+ f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2482
+ f"Question: {question}\nAnswer:"
2483
+ )
2484
+ else:
2485
+ synthesis_prompt = (
2486
+ f"{behavior}\n\n"
2487
+ "You are the KU Library AI Assistant. Be concise but helpful (4-7 sentences).\n"
2488
+ "Answer the user's search request directly, mention the search direction, and mention 1-3 strong results when present.\n"
2489
+ "Keep the answer in the chat rather than redirecting the user elsewhere.\n\n"
2490
+ f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2491
+ f"Question: {question}\nAnswer:"
2492
+ )
2493
  try:
2494
  if use_claude:
2495
  from langchain_anthropic import ChatAnthropic
2496
+ synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=900)
2497
  else:
2498
+ synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=900)
2499
  answer = synth_llm.invoke(synthesis_prompt).content.strip()
2500
  except Exception as ex:
2501
  answer = rag.get("answer", f"Error generating answer: {ex}")
 
2504
  return _make_agent_response(
2505
  answer=answer, intent=intent, tools_used=tools_used,
2506
  search_results=combined[:8], sources=rag.get("sources", []),
2507
+ model=req.model, elapsed=elapsed, question=corrected_query,
2508
  natural_query=natural_query, database_query=database_query,
2509
+ original_question=question,
2510
  )
2511
 
2512
  # ── General / general_recent β€” web search or plain LLM ───────────