nikeshn committed on
Commit
18d3583
·
verified ·
1 Parent(s): 1e10395

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -55
app.py CHANGED
@@ -987,17 +987,27 @@ async def tool_search_primo(query, limit=5, peer_reviewed=False, open_access=Fal
987
  total = data.get("info", {}).get("total", 0)
988
  results = []
989
  for doc in data.get("docs", []):
990
- d = doc.get("pnx", {}).get("display", {})
991
- a = doc.get("pnx", {}).get("addata", {})
992
- s = doc.get("pnx", {}).get("search", {})
 
 
 
 
 
 
993
  results.append({
994
- "title": (d.get("title") or ["Untitled"])[0],
 
995
  "creator": "; ".join(d.get("creator") or d.get("contributor") or []) or "Unknown",
996
  "date": (s.get("creationdate") or a.get("risdate") or a.get("date") or [""])[0],
997
  "type": (d.get("type") or [""])[0],
998
  "source": (d.get("source") or a.get("jtitle") or [""])[0],
999
  "description": ((d.get("description") or [""])[0] or "")[:400],
1000
  "doi": (a.get("doi") or [None])[0],
 
 
 
1001
  })
1002
  return {"total": total, "results": results, "source": "PRIMO"}
1003
  except Exception:
@@ -1414,6 +1424,157 @@ def _light_strip_retrieval_boilerplate(text: str) -> str:
1414
  cleaned = re.sub(r'\s+(please|thanks|thank you|asap)$', '', cleaned, flags=re.IGNORECASE)
1415
  return re.sub(r'\s+', ' ', cleaned).strip()
1416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1417
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
1418
  raw_query = (query or "").strip()
1419
  light_query = _light_strip_retrieval_boilerplate(raw_query) or raw_query
@@ -2431,80 +2592,77 @@ async def agent_query(req: AgentRequest):
2431
  tools_used = []
2432
  for r in raw_results:
2433
  if isinstance(r, dict) and r.get("results"):
2434
- combined.extend(r["results"])
2435
  tools_used.append(r.get("source", "unknown"))
 
 
2436
 
2437
- wants_single_article = bool(re.search(r"\b(one|single)\s+(peer.?reviewed\s+)?(article|paper|study)\b", question, re.IGNORECASE))
2438
- wants_structured_summary = bool(re.search(r"\b(summariz(?:e|ing)|summaris(?:e|ing)|main research question|methodology|methods?|key findings|strengths?|limitations?|critical(?:ly)? evaluate|critique|contribution)\b", question, re.IGNORECASE))
2439
  if wants_single_article or wants_structured_summary:
2440
- preferred = [r for r in combined if r.get("_source") in ("Semantic Scholar", "PubMed")]
2441
- remainder = [r for r in combined if r.get("_source") not in ("Semantic Scholar", "PubMed")]
2442
- combined = preferred + remainder
2443
 
2444
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2445
  tools_used.append("get_library_info")
2446
  tools_used = list(dict.fromkeys(tools_used))
2447
 
2448
- context_parts = []
2449
- if rag.get("answer"):
2450
- context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2451
- if combined:
2452
- top = combined[:5]
2453
- res_lines = []
2454
- for idx, r in enumerate(top, 1):
2455
- res_lines.append(
2456
- f"{idx}. Title: {r.get('title','')}\n"
2457
- f" Authors: {r.get('creator','')}\n"
2458
- f" Year: {r.get('date','')}\n"
2459
- f" Source: {r.get('source','')}\n"
2460
- f" Type: {r.get('type','')}\n"
2461
- f" DOI: {r.get('doi','')}\n"
2462
- f" Link: {r.get('link','')}\n"
2463
- f" Abstract/Description: {r.get('description','')}"
2464
- )
2465
- context_parts.append("Candidate Search Results:\n" + "\n\n".join(res_lines))
2466
- context_parts.append(f"Natural query for AI tools: {natural_query}")
2467
- context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2468
-
2469
  behavior = get_behavior_instructions()
2470
  if wants_single_article or wants_structured_summary:
2471
- synthesis_prompt = (
2472
- f"{behavior}\n\n"
2473
- "You are the KU Library AI Assistant.\n"
2474
- "The user wants a direct answer in the chat, not just search directions.\n"
2475
- "Choose the single best-matching article from the candidate results, following the user's constraints as closely as possible.\n"
2476
- "Prefer a result with an abstract/description when available.\n"
2477
- "If exact compliance is uncertain, say so briefly.\n"
2478
- "If you rely only on metadata/abstract rather than full text, say that explicitly.\n\n"
2479
- "Format your answer with these headings when relevant:\n"
2480
- "Recommended article\nWhy it fits\nMain research question\nMethodology\nKey findings\nStrengths of the evidence\nLimitations of the evidence\nContribution to current understanding\n\n"
2481
- f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2482
- f"Question: {question}\nAnswer:"
2483
- )
2484
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2485
  synthesis_prompt = (
2486
  f"{behavior}\n\n"
2487
  "You are the KU Library AI Assistant. Be concise but helpful (4-7 sentences).\n"
2488
- "Answer the user's search request directly, mention the search direction, and mention 1-3 strong results when present.\n"
 
2489
  "Keep the answer in the chat rather than redirecting the user elsewhere.\n\n"
2490
  f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2491
  f"Question: {question}\nAnswer:"
2492
  )
2493
- try:
2494
- if use_claude:
2495
- from langchain_anthropic import ChatAnthropic
2496
- synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=900)
2497
- else:
2498
- synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=900)
2499
- answer = synth_llm.invoke(synthesis_prompt).content.strip()
2500
- except Exception as ex:
2501
- answer = rag.get("answer", f"Error generating answer: {ex}")
2502
 
2503
  elapsed = time.time() - start
2504
  return _make_agent_response(
2505
  answer=answer, intent=intent, tools_used=tools_used,
2506
  search_results=combined[:8], sources=rag.get("sources", []),
2507
- model=req.model, elapsed=elapsed, question=corrected_query,
2508
  natural_query=natural_query, database_query=database_query,
2509
  original_question=question,
2510
  )
 
987
  total = data.get("info", {}).get("total", 0)
988
  results = []
989
  for doc in data.get("docs", []):
990
+ pnx = doc.get("pnx", {})
991
+ d = pnx.get("display", {})
992
+ a = pnx.get("addata", {})
993
+ s = pnx.get("search", {})
994
+ c = pnx.get("control", {})
995
+ l = pnx.get("links", {})
996
+ record_id = (c.get("recordid") or [None])[0]
997
+ title = (d.get("title") or ["Untitled"])[0]
998
+ primo_url = _build_primo_search_link(title, record_id)
999
  results.append({
1000
+ "record_id": record_id,
1001
+ "title": title,
1002
  "creator": "; ".join(d.get("creator") or d.get("contributor") or []) or "Unknown",
1003
  "date": (s.get("creationdate") or a.get("risdate") or a.get("date") or [""])[0],
1004
  "type": (d.get("type") or [""])[0],
1005
  "source": (d.get("source") or a.get("jtitle") or [""])[0],
1006
  "description": ((d.get("description") or [""])[0] or "")[:400],
1007
  "doi": (a.get("doi") or [None])[0],
1008
+ "primo_url": primo_url,
1009
+ "link": ((l.get("openurl") or l.get("linktorsrc") or [None])[0]) or primo_url,
1010
+ "open_access": (d.get("oa") or [""])[0] == "free_for_read",
1011
  })
1012
  return {"total": total, "results": results, "source": "PRIMO"}
1013
  except Exception:
 
1424
  cleaned = re.sub(r'\s+(please|thanks|thank you|asap)$', '', cleaned, flags=re.IGNORECASE)
1425
  return re.sub(r'\s+', ' ', cleaned).strip()
1426
 
1427
+
1428
+ def _build_primo_search_link(title: str, record_id: str | None = None) -> str | None:
1429
+ base = 'https://khalifa.primo.exlibrisgroup.com/discovery'
1430
+ if record_id:
1431
+ return f"{base}/fulldisplay?docid={quote(str(record_id))}&vid=971KUOSTAR_INST:KU"
1432
+ clean_title = (title or '').strip()
1433
+ if not clean_title:
1434
+ return None
1435
+ return f"{base}/search?query=any,contains,{quote(clean_title)}&tab=Everything&search_scope=MyInst_and_CI&vid=971KUOSTAR_INST:KU&lang=en"
1436
+
1437
+
1438
+ def _parse_year_value(value) -> int | None:
1439
+ m = re.search(r'(19|20)\d{2}', str(value or ''))
1440
+ return int(m.group(0)) if m else None
1441
+
1442
+
1443
+ def _is_article_like(result: dict) -> bool:
1444
+ type_text = str(result.get('type') or '').lower()
1445
+ source_text = str(result.get('source') or '').lower()
1446
+ if any(k in type_text for k in ['article', 'journal', 'review', 'paper', 'study']):
1447
+ return True
1448
+ return bool(source_text)
1449
+
1450
+
1451
def _normalize_result_links(result: dict) -> dict:
    """Return a shallow copy of *result* with a PRIMO URL filled in.

    Builds a permalink (for PRIMO records with a record id) or a
    title-search link, stores it under 'primo_url', and for PRIMO
    records with no direct 'link' reuses it as the link as well.
    The input dict is never mutated.
    """
    out = dict(result or {})
    is_primo = out.get('_source') == 'PRIMO'
    rid = out.get('record_id') or out.get('id')
    url = out.get('primo_url')
    if not url:
        # Only trust the record id for deep links on actual PRIMO records.
        url = _build_primo_search_link(out.get('title') or '', rid if is_primo else None)
    if url:
        out['primo_url'] = url
    if is_primo and not out.get('link'):
        out['link'] = url
    return out
1461
+
1462
+
1463
def _choose_verified_article(results: list[dict], year_from=None, year_to=None) -> dict | None:
    """Pick the single most trustworthy article-like result, or None.

    Filters out results outside the optional year window, non-articles,
    and anything lacking both a title and some pointer (PRIMO URL, link,
    or DOI). Remaining candidates are scored (PRIMO origin, abstract,
    DOI, link, parseable year) and the highest-scoring one is returned —
    earliest wins ties — but only if its score reaches the confidence
    threshold of 5.
    """
    def _as_year(bound):
        # Year bounds may arrive as ints or strings; reject anything else.
        text = str(bound or '')
        return int(text) if text.isdigit() else None

    lo = _as_year(year_from)
    hi = _as_year(year_to)
    scored = []
    for entry in (results or []):
        rec = _normalize_result_links(entry)
        year = _parse_year_value(rec.get('date'))
        # Results with an unparseable date are kept (year filters skip them).
        if lo and year and year < lo:
            continue
        if hi and year and year > hi:
            continue
        if not _is_article_like(rec):
            continue
        has_pointer = rec.get('primo_url') or rec.get('link') or rec.get('doi')
        if not (rec.get('title') and has_pointer):
            continue
        points = (
            (4 if rec.get('_source') == 'PRIMO' else 0)
            + (3 if rec.get('description') else 0)
            + (1 if rec.get('doi') else 0)
            + (1 if rec.get('link') else 0)
            + (1 if year else 0)
        )
        scored.append((points, rec))
    if not scored:
        return None
    # max() returns the first maximal pair, matching a stable descending sort.
    best_points, best = max(scored, key=lambda pair: pair[0])
    return best if best_points >= 5 else None
1494
+
1495
+
1496
async def _summarize_verified_article(question: str, article: dict, behavior: str, use_claude: bool) -> str:
    """Produce a structured, metadata-grounded summary of one chosen article.

    Builds a fixed-heading plain-text answer (Recommended article, Authors,
    Source, links, research question, methodology, findings, strengths,
    limitations, contribution) from the retrieved metadata only. When an
    abstract/description is present, an LLM fills in the sections under a
    strict "use only what is given" prompt; otherwise — or on any LLM
    failure — a deterministic fallback built purely from metadata is
    returned.

    Args:
        question: The user's original request, passed verbatim into the prompt.
        article: A normalized search result dict (title/creator/date/source/
            doi/primo_url/link/description keys are read).
        behavior: Behavior instructions prepended to the LLM prompt.
        use_claude: True to use Anthropic's Claude; False for OpenAI.

    Returns:
        Plain-text answer with the fixed section headings.
    """
    title = article.get('title', 'Untitled')
    creators = article.get('creator', 'Unknown')
    year = article.get('date', '')
    source = article.get('source', '')
    doi = article.get('doi') or 'Not available'
    # Rebuild a PRIMO link from the title if the record carries none.
    primo_url = article.get('primo_url') or _build_primo_search_link(title) or 'Not available'
    # Prefer the record's own link; fall back to a DOI resolver URL.
    direct_url = article.get('link') or ('https://doi.org/' + article['doi'] if article.get('doi') else 'Not available')
    abstract = (article.get('description') or '').strip()

    # Deterministic metadata-only answer, used when there is no abstract
    # to summarize or when the LLM call fails/returns nothing.
    fallback = [
        'Recommended article',
        f'{title} ({year})' if year else title,
        '',
        'Authors',
        creators,
        '',
        'Source',
        source or 'Not clearly stated in the retrieved metadata.',
        '',
        'Open in PRIMO',
        primo_url,
    ]
    if direct_url and direct_url != 'Not available':
        fallback += ['', 'Direct link', direct_url]
    fallback += [
        '',
        'Verification note',
        'This answer is grounded only in the retrieved metadata and abstract/description. It does not assume access to the full text unless a direct full-text link is shown.',
        '',
        'Main research question',
        abstract or 'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Methodology',
        'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Key findings',
        abstract or 'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Strengths of the evidence',
        'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Limitations of the evidence',
        'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Contribution to current understanding',
        'Not clearly stated in the retrieved abstract/metadata.',
    ]

    # No abstract means there is nothing for the LLM to ground a summary
    # in — return the metadata-only fallback rather than inviting invention.
    if not abstract:
        return '\n'.join(fallback)

    # Anti-hallucination prompt: the model may only restate what is below,
    # and must emit the exact sentinel line for sections it cannot fill.
    prompt = (
        f"{behavior}\n\n"
        'You are LibBee, the Khalifa University Library AI Assistant. '
        'Use ONLY the retrieved metadata and abstract/description below. '
        'Do NOT invent details, and do NOT mention any title or link that is not given below. '
        'If a section is not explicit in the abstract/metadata, write exactly: Not clearly stated in the retrieved abstract/metadata.\n\n'
        'Return plain text with exactly these headings in this order:\n'
        'Recommended article\nAuthors\nSource\nOpen in PRIMO\nDirect link\nVerification note\nMain research question\nMethodology\nKey findings\nStrengths of the evidence\nLimitations of the evidence\nContribution to current understanding\n\n'
        f"User request: {question}\n\n"
        f"Retrieved title: {title}\n"
        f"Retrieved authors: {creators}\n"
        f"Retrieved year: {year}\n"
        f"Retrieved source: {source}\n"
        f"Retrieved DOI: {doi}\n"
        f"Retrieved PRIMO link: {primo_url}\n"
        f"Retrieved direct link: {direct_url}\n"
        f"Retrieved abstract/description: {abstract}\n"
    )

    try:
        if use_claude:
            from langchain_anthropic import ChatAnthropic
            llm = ChatAnthropic(model='claude-haiku-4-5-20251001', temperature=0, max_tokens=700)
        else:
            llm = ChatOpenAI(model='gpt-4o-mini', temperature=0, max_tokens=700)
        reply = llm.invoke(prompt).content.strip()
        # An empty LLM reply still degrades to the metadata fallback.
        return reply or '\n'.join(fallback)
    except Exception:
        # Best-effort by design: any LLM/transport failure yields the fallback.
        return '\n'.join(fallback)
1577
+
1578
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
1579
  raw_query = (query or "").strip()
1580
  light_query = _light_strip_retrieval_boilerplate(raw_query) or raw_query
 
2592
  tools_used = []
2593
  for r in raw_results:
2594
  if isinstance(r, dict) and r.get("results"):
 
2595
  tools_used.append(r.get("source", "unknown"))
2596
+ for item in r["results"]:
2597
+ combined.append(_normalize_result_links(item))
2598
 
2599
+ wants_single_article = bool(re.search(r"(one|single)\s+(peer.?reviewed\s+)?(article|paper|study)", question, re.IGNORECASE))
2600
+ wants_structured_summary = bool(re.search(r"(summariz(?:e|ing)|summaris(?:e|ing)|main research question|methodology|methods?|key findings|strengths?|limitations?|critical(?:ly)? evaluate|critique|contribution)", question, re.IGNORECASE))
2601
  if wants_single_article or wants_structured_summary:
2602
+ preferred = [r for r in combined if r.get("_source") == "PRIMO"]
2603
+ secondary = [r for r in combined if r.get("_source") != "PRIMO"]
2604
+ combined = preferred + secondary
2605
 
2606
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2607
  tools_used.append("get_library_info")
2608
  tools_used = list(dict.fromkeys(tools_used))
2609
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2610
  behavior = get_behavior_instructions()
2611
  if wants_single_article or wants_structured_summary:
2612
+ candidate = _choose_verified_article(combined, year_from=year_from, year_to=year_to)
2613
+ if candidate:
2614
+ answer = await _summarize_verified_article(question, candidate, behavior, use_claude)
2615
+ else:
2616
+ answer = (
2617
+ "I couldn’t confidently retrieve one exact article matching your full request from the current results. "
2618
+ "Please look into the available AI tools below to refine or expand the search."
2619
+ )
 
 
 
 
 
2620
  else:
2621
+ context_parts = []
2622
+ if rag.get("answer"):
2623
+ context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2624
+ if combined:
2625
+ top = combined[:5]
2626
+ res_lines = []
2627
+ for idx, r in enumerate(top, 1):
2628
+ res_lines.append(
2629
+ f"{idx}. Title: {r.get('title','')}\n"
2630
+ f" Authors: {r.get('creator','')}\n"
2631
+ f" Year: {r.get('date','')}\n"
2632
+ f" Source: {r.get('source','')}\n"
2633
+ f" Type: {r.get('type','')}\n"
2634
+ f" DOI: {r.get('doi','')}\n"
2635
+ f" PRIMO link: {r.get('primo_url','')}\n"
2636
+ f" Direct link: {r.get('link','')}\n"
2637
+ f" Abstract/Description: {r.get('description','')}"
2638
+ )
2639
+ context_parts.append("Candidate Search Results:\n" + "\n\n".join(res_lines))
2640
+ context_parts.append(f"Natural query for AI tools: {natural_query}")
2641
+ context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2642
  synthesis_prompt = (
2643
  f"{behavior}\n\n"
2644
  "You are the KU Library AI Assistant. Be concise but helpful (4-7 sentences).\n"
2645
+ "Answer the user's search request directly, mention the search direction, and mention 1-3 strong retrieved results when present.\n"
2646
+ "Do not invent titles, links, or findings beyond the retrieved metadata/abstracts.\n"
2647
  "Keep the answer in the chat rather than redirecting the user elsewhere.\n\n"
2648
  f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2649
  f"Question: {question}\nAnswer:"
2650
  )
2651
+ try:
2652
+ if use_claude:
2653
+ from langchain_anthropic import ChatAnthropic
2654
+ synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=900)
2655
+ else:
2656
+ synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=900)
2657
+ answer = synth_llm.invoke(synthesis_prompt).content.strip()
2658
+ except Exception as ex:
2659
+ answer = rag.get("answer", f"Error generating answer: {ex}")
2660
 
2661
  elapsed = time.time() - start
2662
  return _make_agent_response(
2663
  answer=answer, intent=intent, tools_used=tools_used,
2664
  search_results=combined[:8], sources=rag.get("sources", []),
2665
+ model=req.model, elapsed=elapsed, question=question,
2666
  natural_query=natural_query, database_query=database_query,
2667
  original_question=question,
2668
  )