nikeshn committed on
Commit
18d3583
·
verified ·
1 Parent(s): 1e10395

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -55
app.py CHANGED
@@ -987,17 +987,27 @@ async def tool_search_primo(query, limit=5, peer_reviewed=False, open_access=Fal
987
  total = data.get("info", {}).get("total", 0)
988
  results = []
989
  for doc in data.get("docs", []):
990
- d = doc.get("pnx", {}).get("display", {})
991
- a = doc.get("pnx", {}).get("addata", {})
992
- s = doc.get("pnx", {}).get("search", {})
 
 
 
 
 
 
993
  results.append({
994
- "title": (d.get("title") or ["Untitled"])[0],
 
995
  "creator": "; ".join(d.get("creator") or d.get("contributor") or []) or "Unknown",
996
  "date": (s.get("creationdate") or a.get("risdate") or a.get("date") or [""])[0],
997
  "type": (d.get("type") or [""])[0],
998
  "source": (d.get("source") or a.get("jtitle") or [""])[0],
999
  "description": ((d.get("description") or [""])[0] or "")[:400],
1000
  "doi": (a.get("doi") or [None])[0],
 
 
 
1001
  })
1002
  return {"total": total, "results": results, "source": "PRIMO"}
1003
  except Exception:
@@ -1414,6 +1424,157 @@ def _light_strip_retrieval_boilerplate(text: str) -> str:
1414
  cleaned = re.sub(r'\s+(please|thanks|thank you|asap)$', '', cleaned, flags=re.IGNORECASE)
1415
  return re.sub(r'\s+', ' ', cleaned).strip()
1416
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1417
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
1418
  raw_query = (query or "").strip()
1419
  light_query = _light_strip_retrieval_boilerplate(raw_query) or raw_query
@@ -2431,80 +2592,77 @@ async def agent_query(req: AgentRequest):
2431
  tools_used = []
2432
  for r in raw_results:
2433
  if isinstance(r, dict) and r.get("results"):
2434
- combined.extend(r["results"])
2435
  tools_used.append(r.get("source", "unknown"))
 
 
2436
 
2437
- wants_single_article = bool(re.search(r"\b(one|single)\s+(peer.?reviewed\s+)?(article|paper|study)\b", question, re.IGNORECASE))
2438
- wants_structured_summary = bool(re.search(r"\b(summariz(?:e|ing)|summaris(?:e|ing)|main research question|methodology|methods?|key findings|strengths?|limitations?|critical(?:ly)? evaluate|critique|contribution)\b", question, re.IGNORECASE))
2439
  if wants_single_article or wants_structured_summary:
2440
- preferred = [r for r in combined if r.get("_source") in ("Semantic Scholar", "PubMed")]
2441
- remainder = [r for r in combined if r.get("_source") not in ("Semantic Scholar", "PubMed")]
2442
- combined = preferred + remainder
2443
 
2444
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2445
  tools_used.append("get_library_info")
2446
  tools_used = list(dict.fromkeys(tools_used))
2447
 
2448
- context_parts = []
2449
- if rag.get("answer"):
2450
- context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2451
- if combined:
2452
- top = combined[:5]
2453
- res_lines = []
2454
- for idx, r in enumerate(top, 1):
2455
- res_lines.append(
2456
- f"{idx}. Title: {r.get('title','')}\n"
2457
- f" Authors: {r.get('creator','')}\n"
2458
- f" Year: {r.get('date','')}\n"
2459
- f" Source: {r.get('source','')}\n"
2460
- f" Type: {r.get('type','')}\n"
2461
- f" DOI: {r.get('doi','')}\n"
2462
- f" Link: {r.get('link','')}\n"
2463
- f" Abstract/Description: {r.get('description','')}"
2464
- )
2465
- context_parts.append("Candidate Search Results:\n" + "\n\n".join(res_lines))
2466
- context_parts.append(f"Natural query for AI tools: {natural_query}")
2467
- context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2468
-
2469
  behavior = get_behavior_instructions()
2470
  if wants_single_article or wants_structured_summary:
2471
- synthesis_prompt = (
2472
- f"{behavior}\n\n"
2473
- "You are the KU Library AI Assistant.\n"
2474
- "The user wants a direct answer in the chat, not just search directions.\n"
2475
- "Choose the single best-matching article from the candidate results, following the user's constraints as closely as possible.\n"
2476
- "Prefer a result with an abstract/description when available.\n"
2477
- "If exact compliance is uncertain, say so briefly.\n"
2478
- "If you rely only on metadata/abstract rather than full text, say that explicitly.\n\n"
2479
- "Format your answer with these headings when relevant:\n"
2480
- "Recommended article\nWhy it fits\nMain research question\nMethodology\nKey findings\nStrengths of the evidence\nLimitations of the evidence\nContribution to current understanding\n\n"
2481
- f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2482
- f"Question: {question}\nAnswer:"
2483
- )
2484
  else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2485
  synthesis_prompt = (
2486
  f"{behavior}\n\n"
2487
  "You are the KU Library AI Assistant. Be concise but helpful (4-7 sentences).\n"
2488
- "Answer the user's search request directly, mention the search direction, and mention 1-3 strong results when present.\n"
 
2489
  "Keep the answer in the chat rather than redirecting the user elsewhere.\n\n"
2490
  f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2491
  f"Question: {question}\nAnswer:"
2492
  )
2493
- try:
2494
- if use_claude:
2495
- from langchain_anthropic import ChatAnthropic
2496
- synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=900)
2497
- else:
2498
- synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=900)
2499
- answer = synth_llm.invoke(synthesis_prompt).content.strip()
2500
- except Exception as ex:
2501
- answer = rag.get("answer", f"Error generating answer: {ex}")
2502
 
2503
  elapsed = time.time() - start
2504
  return _make_agent_response(
2505
  answer=answer, intent=intent, tools_used=tools_used,
2506
  search_results=combined[:8], sources=rag.get("sources", []),
2507
- model=req.model, elapsed=elapsed, question=corrected_query,
2508
  natural_query=natural_query, database_query=database_query,
2509
  original_question=question,
2510
  )
 
987
  total = data.get("info", {}).get("total", 0)
988
  results = []
989
  for doc in data.get("docs", []):
990
+ pnx = doc.get("pnx", {})
991
+ d = pnx.get("display", {})
992
+ a = pnx.get("addata", {})
993
+ s = pnx.get("search", {})
994
+ c = pnx.get("control", {})
995
+ l = pnx.get("links", {})
996
+ record_id = (c.get("recordid") or [None])[0]
997
+ title = (d.get("title") or ["Untitled"])[0]
998
+ primo_url = _build_primo_search_link(title, record_id)
999
  results.append({
1000
+ "record_id": record_id,
1001
+ "title": title,
1002
  "creator": "; ".join(d.get("creator") or d.get("contributor") or []) or "Unknown",
1003
  "date": (s.get("creationdate") or a.get("risdate") or a.get("date") or [""])[0],
1004
  "type": (d.get("type") or [""])[0],
1005
  "source": (d.get("source") or a.get("jtitle") or [""])[0],
1006
  "description": ((d.get("description") or [""])[0] or "")[:400],
1007
  "doi": (a.get("doi") or [None])[0],
1008
+ "primo_url": primo_url,
1009
+ "link": ((l.get("openurl") or l.get("linktorsrc") or [None])[0]) or primo_url,
1010
+ "open_access": (d.get("oa") or [""])[0] == "free_for_read",
1011
  })
1012
  return {"total": total, "results": results, "source": "PRIMO"}
1013
  except Exception:
 
1424
  cleaned = re.sub(r'\s+(please|thanks|thank you|asap)$', '', cleaned, flags=re.IGNORECASE)
1425
  return re.sub(r'\s+', ' ', cleaned).strip()
1426
 
1427
+
1428
+ def _build_primo_search_link(title: str, record_id: str | None = None) -> str | None:
1429
+ base = 'https://khalifa.primo.exlibrisgroup.com/discovery'
1430
+ if record_id:
1431
+ return f"{base}/fulldisplay?docid={quote(str(record_id))}&vid=971KUOSTAR_INST:KU"
1432
+ clean_title = (title or '').strip()
1433
+ if not clean_title:
1434
+ return None
1435
+ return f"{base}/search?query=any,contains,{quote(clean_title)}&tab=Everything&search_scope=MyInst_and_CI&vid=971KUOSTAR_INST:KU&lang=en"
1436
+
1437
+
1438
+ def _parse_year_value(value) -> int | None:
1439
+ m = re.search(r'(19|20)\d{2}', str(value or ''))
1440
+ return int(m.group(0)) if m else None
1441
+
1442
+
1443
+ def _is_article_like(result: dict) -> bool:
1444
+ type_text = str(result.get('type') or '').lower()
1445
+ source_text = str(result.get('source') or '').lower()
1446
+ if any(k in type_text for k in ['article', 'journal', 'review', 'paper', 'study']):
1447
+ return True
1448
+ return bool(source_text)
1449
+
1450
+
1451
def _normalize_result_links(result: dict) -> dict:
    """Return a shallow copy of *result* with a PRIMO URL filled in.

    Builds a permalink (for PRIMO records with a record id) or a
    title-search link, stores it under 'primo_url', and for PRIMO
    records with no direct 'link' reuses it as the link as well.
    The input dict is never mutated.
    """
    out = dict(result or {})
    is_primo = out.get('_source') == 'PRIMO'
    rid = out.get('record_id') or out.get('id')
    url = out.get('primo_url')
    if not url:
        # Only trust the record id for deep links on actual PRIMO records.
        url = _build_primo_search_link(out.get('title') or '', rid if is_primo else None)
    if url:
        out['primo_url'] = url
    if is_primo and not out.get('link'):
        out['link'] = url
    return out
1461
+
1462
+
1463
def _choose_verified_article(results: list[dict], year_from=None, year_to=None) -> dict | None:
    """Pick the single most trustworthy article-like result, or None.

    Filters out results outside the optional year window, non-articles,
    and anything lacking both a title and some pointer (PRIMO URL, link,
    or DOI). Remaining candidates are scored (PRIMO origin, abstract,
    DOI, link, parseable year) and the highest-scoring one is returned —
    earliest wins ties — but only if its score reaches the confidence
    threshold of 5.
    """
    def _as_year(bound):
        # Year bounds may arrive as ints or strings; reject anything else.
        text = str(bound or '')
        return int(text) if text.isdigit() else None

    lo = _as_year(year_from)
    hi = _as_year(year_to)
    scored = []
    for entry in (results or []):
        rec = _normalize_result_links(entry)
        year = _parse_year_value(rec.get('date'))
        # Results with an unparseable date are kept (year filters skip them).
        if lo and year and year < lo:
            continue
        if hi and year and year > hi:
            continue
        if not _is_article_like(rec):
            continue
        has_pointer = rec.get('primo_url') or rec.get('link') or rec.get('doi')
        if not (rec.get('title') and has_pointer):
            continue
        points = (
            (4 if rec.get('_source') == 'PRIMO' else 0)
            + (3 if rec.get('description') else 0)
            + (1 if rec.get('doi') else 0)
            + (1 if rec.get('link') else 0)
            + (1 if year else 0)
        )
        scored.append((points, rec))
    if not scored:
        return None
    # max() returns the first maximal pair, matching a stable descending sort.
    best_points, best = max(scored, key=lambda pair: pair[0])
    return best if best_points >= 5 else None
1494
+
1495
+
1496
async def _summarize_verified_article(question: str, article: dict, behavior: str, use_claude: bool) -> str:
    """Produce a structured, metadata-grounded summary of one chosen article.

    Builds a fixed-heading plain-text answer (Recommended article, Authors,
    Source, links, research question, methodology, findings, strengths,
    limitations, contribution) from the retrieved metadata only. When an
    abstract/description is present, an LLM fills in the sections under a
    strict "use only what is given" prompt; otherwise — or on any LLM
    failure — a deterministic fallback built purely from metadata is
    returned.

    Args:
        question: The user's original request, passed verbatim into the prompt.
        article: A normalized search result dict (title/creator/date/source/
            doi/primo_url/link/description keys are read).
        behavior: Behavior instructions prepended to the LLM prompt.
        use_claude: True to use Anthropic's Claude; False for OpenAI.

    Returns:
        Plain-text answer with the fixed section headings.
    """
    title = article.get('title', 'Untitled')
    creators = article.get('creator', 'Unknown')
    year = article.get('date', '')
    source = article.get('source', '')
    doi = article.get('doi') or 'Not available'
    # Rebuild a PRIMO link from the title if the record carries none.
    primo_url = article.get('primo_url') or _build_primo_search_link(title) or 'Not available'
    # Prefer the record's own link; fall back to a DOI resolver URL.
    direct_url = article.get('link') or ('https://doi.org/' + article['doi'] if article.get('doi') else 'Not available')
    abstract = (article.get('description') or '').strip()

    # Deterministic metadata-only answer, used when there is no abstract
    # to summarize or when the LLM call fails/returns nothing.
    fallback = [
        'Recommended article',
        f'{title} ({year})' if year else title,
        '',
        'Authors',
        creators,
        '',
        'Source',
        source or 'Not clearly stated in the retrieved metadata.',
        '',
        'Open in PRIMO',
        primo_url,
    ]
    if direct_url and direct_url != 'Not available':
        fallback += ['', 'Direct link', direct_url]
    fallback += [
        '',
        'Verification note',
        'This answer is grounded only in the retrieved metadata and abstract/description. It does not assume access to the full text unless a direct full-text link is shown.',
        '',
        'Main research question',
        abstract or 'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Methodology',
        'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Key findings',
        abstract or 'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Strengths of the evidence',
        'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Limitations of the evidence',
        'Not clearly stated in the retrieved abstract/metadata.',
        '',
        'Contribution to current understanding',
        'Not clearly stated in the retrieved abstract/metadata.',
    ]

    # No abstract means there is nothing for the LLM to ground a summary
    # in — return the metadata-only fallback rather than inviting invention.
    if not abstract:
        return '\n'.join(fallback)

    # Anti-hallucination prompt: the model may only restate what is below,
    # and must emit the exact sentinel line for sections it cannot fill.
    prompt = (
        f"{behavior}\n\n"
        'You are LibBee, the Khalifa University Library AI Assistant. '
        'Use ONLY the retrieved metadata and abstract/description below. '
        'Do NOT invent details, and do NOT mention any title or link that is not given below. '
        'If a section is not explicit in the abstract/metadata, write exactly: Not clearly stated in the retrieved abstract/metadata.\n\n'
        'Return plain text with exactly these headings in this order:\n'
        'Recommended article\nAuthors\nSource\nOpen in PRIMO\nDirect link\nVerification note\nMain research question\nMethodology\nKey findings\nStrengths of the evidence\nLimitations of the evidence\nContribution to current understanding\n\n'
        f"User request: {question}\n\n"
        f"Retrieved title: {title}\n"
        f"Retrieved authors: {creators}\n"
        f"Retrieved year: {year}\n"
        f"Retrieved source: {source}\n"
        f"Retrieved DOI: {doi}\n"
        f"Retrieved PRIMO link: {primo_url}\n"
        f"Retrieved direct link: {direct_url}\n"
        f"Retrieved abstract/description: {abstract}\n"
    )

    try:
        if use_claude:
            from langchain_anthropic import ChatAnthropic
            llm = ChatAnthropic(model='claude-haiku-4-5-20251001', temperature=0, max_tokens=700)
        else:
            llm = ChatOpenAI(model='gpt-4o-mini', temperature=0, max_tokens=700)
        reply = llm.invoke(prompt).content.strip()
        # An empty LLM reply still degrades to the metadata fallback.
        return reply or '\n'.join(fallback)
    except Exception:
        # Best-effort by design: any LLM/transport failure yields the fallback.
        return '\n'.join(fallback)
1577
+
1578
  async def _build_search_plan(query: str, year: int = 2026) -> dict:
1579
  raw_query = (query or "").strip()
1580
  light_query = _light_strip_retrieval_boilerplate(raw_query) or raw_query
 
2592
  tools_used = []
2593
  for r in raw_results:
2594
  if isinstance(r, dict) and r.get("results"):
 
2595
  tools_used.append(r.get("source", "unknown"))
2596
+ for item in r["results"]:
2597
+ combined.append(_normalize_result_links(item))
2598
 
2599
+ wants_single_article = bool(re.search(r"(one|single)\s+(peer.?reviewed\s+)?(article|paper|study)", question, re.IGNORECASE))
2600
+ wants_structured_summary = bool(re.search(r"(summariz(?:e|ing)|summaris(?:e|ing)|main research question|methodology|methods?|key findings|strengths?|limitations?|critical(?:ly)? evaluate|critique|contribution)", question, re.IGNORECASE))
2601
  if wants_single_article or wants_structured_summary:
2602
+ preferred = [r for r in combined if r.get("_source") == "PRIMO"]
2603
+ secondary = [r for r in combined if r.get("_source") != "PRIMO"]
2604
+ combined = preferred + secondary
2605
 
2606
  rag = await tool_library_info(question, history[-3:] if history else None, model=req.model)
2607
  tools_used.append("get_library_info")
2608
  tools_used = list(dict.fromkeys(tools_used))
2609
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2610
  behavior = get_behavior_instructions()
2611
  if wants_single_article or wants_structured_summary:
2612
+ candidate = _choose_verified_article(combined, year_from=year_from, year_to=year_to)
2613
+ if candidate:
2614
+ answer = await _summarize_verified_article(question, candidate, behavior, use_claude)
2615
+ else:
2616
+ answer = (
2617
+ "I couldn’t confidently retrieve one exact article matching your full request from the current results. "
2618
+ "Please look into the available AI tools below to refine or expand the search."
2619
+ )
 
 
 
 
 
2620
  else:
2621
+ context_parts = []
2622
+ if rag.get("answer"):
2623
+ context_parts.append(f"Library Knowledge Base:\n{rag['answer']}")
2624
+ if combined:
2625
+ top = combined[:5]
2626
+ res_lines = []
2627
+ for idx, r in enumerate(top, 1):
2628
+ res_lines.append(
2629
+ f"{idx}. Title: {r.get('title','')}\n"
2630
+ f" Authors: {r.get('creator','')}\n"
2631
+ f" Year: {r.get('date','')}\n"
2632
+ f" Source: {r.get('source','')}\n"
2633
+ f" Type: {r.get('type','')}\n"
2634
+ f" DOI: {r.get('doi','')}\n"
2635
+ f" PRIMO link: {r.get('primo_url','')}\n"
2636
+ f" Direct link: {r.get('link','')}\n"
2637
+ f" Abstract/Description: {r.get('description','')}"
2638
+ )
2639
+ context_parts.append("Candidate Search Results:\n" + "\n\n".join(res_lines))
2640
+ context_parts.append(f"Natural query for AI tools: {natural_query}")
2641
+ context_parts.append(f"Database query for PRIMO/PubMed: {database_query}")
2642
  synthesis_prompt = (
2643
  f"{behavior}\n\n"
2644
  "You are the KU Library AI Assistant. Be concise but helpful (4-7 sentences).\n"
2645
+ "Answer the user's search request directly, mention the search direction, and mention 1-3 strong retrieved results when present.\n"
2646
+ "Do not invent titles, links, or findings beyond the retrieved metadata/abstracts.\n"
2647
  "Keep the answer in the chat rather than redirecting the user elsewhere.\n\n"
2648
  f"Context:\n{chr(10).join(context_parts) if context_parts else 'No additional context.'}\n\n"
2649
  f"Question: {question}\nAnswer:"
2650
  )
2651
+ try:
2652
+ if use_claude:
2653
+ from langchain_anthropic import ChatAnthropic
2654
+ synth_llm = ChatAnthropic(model="claude-haiku-4-5-20251001", temperature=0.2, max_tokens=900)
2655
+ else:
2656
+ synth_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.2, max_tokens=900)
2657
+ answer = synth_llm.invoke(synthesis_prompt).content.strip()
2658
+ except Exception as ex:
2659
+ answer = rag.get("answer", f"Error generating answer: {ex}")
2660
 
2661
  elapsed = time.time() - start
2662
  return _make_agent_response(
2663
  answer=answer, intent=intent, tools_used=tools_used,
2664
  search_results=combined[:8], sources=rag.get("sources", []),
2665
+ model=req.model, elapsed=elapsed, question=question,
2666
  natural_query=natural_query, database_query=database_query,
2667
  original_question=question,
2668
  )