# PaperScout / core.py
# Uploaded by vaishnaveswar — "Update core.py" (commit 3efff14, verified)
# core.py
from __future__ import annotations
import os
import re
import math
import uuid
import itertools
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlsplit, urlunsplit
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
try:
# optional drop-in providing .text()
from ddgs import DDGS # type: ignore
except ImportError:
# provides DDGS().text with region/safesearch/timelimit/max_results options
from duckduckgo_search import DDGS # type: ignore
# Initialize the shared LLM client (Gemini via the LangChain integration).
# Note: GOOGLE_API_KEY must be set in the environment for this to work.
# Example: export GOOGLE_API_KEY="your-key"
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",  # lightweight Gemini variant
    temperature=0,                  # deterministic output for reproducible tables
    max_output_tokens=None,         # no explicit cap on response length
    timeout=60,                     # seconds per request
    max_retries=3,                  # retry transient API failures
)
# DuckDuckGo site: filter restricting hits to well-known academic venues.
# NOTE: fixed the arXiv domain — the correct domain is arxiv.org, not
# arxiv.cc (the old term could never match any real arXiv page).
ACADEMIC_SITES_FILTER = (
    "site:neurips.cc OR site:arxiv.org OR site:icml.cc OR site:iclr.cc OR "
    "site:aaai.org OR site:ijcai.org OR site:thecvf.com OR site:kdd.org OR "
    "site:sigcomm.org OR site:usenix.org OR site:ieeexplore.ieee.org"
)
def parse_year_from_text(text: str) -> Optional[int]:
    """Extract the first four-digit publication year (1900-2099) from text.

    Returns None when text is falsy or contains no such year.
    """
    # BUG FIX: the old pattern r"\b(19|20)\d{2}\b" captured only the century
    # group, so re.findall yielded "19"/"20" and the function returned 19 or
    # 20 instead of the full year. A non-capturing group fixes this.
    match = re.search(r"\b(?:19|20)\d{2}\b", text or "")
    return int(match.group()) if match else None
def _normalize_url(u: str) -> str:
if not u:
return ""
try:
parts = urlsplit(u.strip())
# drop query/fragment to normalize
return urlunsplit(
(parts.scheme.lower(), parts.netloc.lower(), parts.path.rstrip("/"), "", "")
)
except Exception:
return u.strip().rstrip("/").lower()
def _safe_ddgs_text_call(
ddgs: DDGS,
query: str,
region: str,
safesearch: str,
timelimit: Optional[str],
max_results: Optional[int],
backend: Optional[str] = None,
retries: int = 2,
) -> List[Dict[str, Any]]:
"""
Call DDGS().text with graceful handling of different library signatures and backend fallbacks.
Tries a sequence of backends when no results are returned.
"""
# Preferred backend order: lite -> html -> api -> auto (some versions)
candidate_backends = []
if backend:
candidate_backends.append(backend)
candidate_backends.extend(
[b for b in ["lite", "html", "api", "auto"] if b != backend]
)
for b in candidate_backends:
for _ in range(max(1, retries)):
try:
# Newer versions: returns list; older: generator
res = ddgs.text(
query,
region=region,
safesearch=safesearch,
timelimit=timelimit,
backend=b,
max_results=max_results,
)
if res is None:
results = []
elif isinstance(res, list):
results = res
else:
# generator fallback
results = list(res)
except TypeError:
# Older signature without backend/max_results
try:
res = ddgs.text(
query,
region=region,
safesearch=safesearch,
timelimit=timelimit,
)
results = list(res) if res is not None else []
if max_results:
results = results[:max_results]
except Exception:
results = []
except Exception:
results = []
if results:
return results
return []
def _build_query_prompt() -> ChatPromptTemplate:
    """Build the prompt that asks the LLM for 2-3 short academic search queries."""
    # Joined line-by-line so the exact prompt text is easy to diff/review.
    template = "\n".join(
        [
            "Act as a query planner for academic literature search.",
            "Given a topic, produce 2–3 distinct, short keyword-based queries optimized for academic sources.",
            "Requirements:",
            "- Be concise (each query < 12 words).",
            "- Avoid punctuation except site: filters or boolean OR if needed.",
            "- Prefer neutral, general keywords and important synonyms.",
            "- Return ONLY the queries, one per line, no numbering or extra text.",
            "Topic:",
            "{topic}",
        ]
    )
    return ChatPromptTemplate.from_template(template)
def generate_search_queries(topic: str, k: int = 3) -> List[str]:
    """Ask the LLM for concise web-search queries for *topic*.

    Guarantees at least two queries — padding with simple heuristic
    expansions when the model under-delivers — and caps the result
    at max(2, k) entries.
    """
    cleaned_topic = (topic or "").strip()
    msgs = _build_query_prompt().format_messages(topic=cleaned_topic)
    try:
        raw = (llm.invoke(msgs).content or "").strip()
    except Exception:
        raw = ""
    # Keep non-blank lines, dropping case-insensitive duplicates in order.
    seen: set = set()
    queries: List[str] = []
    for line in raw.splitlines():
        candidate = line.strip()
        if candidate and candidate.lower() not in seen:
            queries.append(candidate)
            seen.add(candidate.lower())
    # Heuristic padding when the LLM produced fewer than two queries.
    if len(queries) < 2:
        expansions = (
            cleaned_topic,
            f"{cleaned_topic} method comparison",
            f"{cleaned_topic} benchmarks",
            f"{cleaned_topic} survey review",
        )
        for extra in expansions:
            if extra and extra.lower() not in seen:
                queries.append(extra)
                seen.add(extra.lower())
                if len(queries) >= max(2, k):
                    break
    return queries[: max(2, k)]
# Multi-query literature fetch with backend fallbacks and URL deduplication.
def fetch_literature_results_multi(
    topic: str,
    region: str = "wt-wt",  # prefer wt-wt for robustness
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Fetch academic results via DuckDuckGo across multiple LLM-generated queries
    with backend/region fallbacks and deduplication.

    Returns up to max_results dicts with title/body/link/source/query_used.

    ROBUSTNESS FIX: error handling is now per-query, so a failure on one
    query no longer discards results already gathered by earlier queries;
    only a failure to construct the DDGS client with nothing collected
    yields [].
    """
    queries = generate_search_queries(topic, k=3)
    # Spread the result budget across the queries, at least 3 hits each.
    per_query = max(3, math.ceil(max_results / max(1, len(queries))))
    results: List[Dict[str, Any]] = []
    try:
        with DDGS() as ddgs:
            for q in queries:
                q_aug = f"{q} {ACADEMIC_SITES_FILTER}"
                try:
                    rows = _safe_ddgs_text_call(
                        ddgs,
                        q_aug,
                        region=region,
                        safesearch=safesearch,
                        timelimit=timelimit,
                        max_results=per_query,
                        backend=backend,
                        retries=2,
                    )
                except Exception:
                    # Best effort: skip this query, keep earlier results.
                    continue
                for r in rows or []:
                    results.append(
                        {
                            "title": r.get("title", "") or "",
                            "body": r.get("body", "") or "",
                            "link": r.get("href", "") or "",
                            "source": r.get("source", "web") or "web",
                            "query_used": q,
                        }
                    )
    except Exception:
        # Client construction/teardown failed; return whatever was gathered.
        if not results:
            return []
    # Deduplicate by normalized URL, preserving first-seen order.
    deduped: List[Dict[str, Any]] = []
    seen_links = set()
    for row in results:
        norm = _normalize_url(row.get("link", ""))
        if norm and norm not in seen_links:
            deduped.append(row)
            seen_links.add(norm)
    return deduped[:max_results]
def _build_table_prompt() -> ChatPromptTemplate:
    """Prompt for the web-backed literature-review Markdown table.

    Instructs the model to emit ONLY a table, sorted newest year first.
    """
    # Joined line-by-line so the exact prompt text is easy to diff/review.
    template = "\n".join(
        [
            "You are a meticulous academic research analyst specializing in synthesizing scholarly publications.",
            "You will examine the provided list of paper titles and abstracts in detail.",
            "Your objective is to produce a high-quality, chronologically sorted (latest → oldest) literature review table in Markdown format.",
            "For each paper, you must:",
            "- Accurately determine the Year (from metadata, title, or context; estimate if unclear).",
            "- Identify and list the Title in full.",
            "- Extract or infer Authors from the text; if not stated, write 'N/A'.",
            "- Summarize Key Contribution / Findings in 1–2 precise, academically phrased sentences.",
            "- Record Citation Count if mentioned; if not, write 'N/A'.",
            "- Provide the Source Link if present; if absent, write 'N/A'.",
            "Additional requirements:",
            "- If publication venue (journal/conference) is mentioned, briefly note it in parentheses after the year.",
            "- Use neutral, scholarly tone and avoid unnecessary adjectives.",
            "- Ensure all summaries focus on the core novel contribution, methodology highlights, and notable results.",
            "- Maintain uniform formatting for all rows and ensure alignment of columns in Markdown.",
            "- Double-check chronological order: newest year first, oldest last.",
            "Topic: {topic}",
            "Papers:",
            "{compiled_text}",
            "Now output ONLY the Markdown table. Do not include commentary before or after the table.",
        ]
    )
    return ChatPromptTemplate.from_template(template)
def _build_chat_prompt() -> ChatPromptTemplate:
    """Prompt for plain conversational answers (explicitly NOT table-formatted)."""
    # Joined line-by-line so the exact prompt text is easy to diff/review.
    template = "\n".join(
        [
            "You are a helpful academic research assistant with expertise in computer science, machine learning, and related fields.",
            "Provide clear, accurate, and informative responses to academic questions. Use a friendly but professional tone.",
            "Guidelines:",
            "- Be concise but thorough",
            "- Explain concepts clearly",
            "- Use examples when helpful",
            "- Break down complex topics",
            "- Cite established facts when appropriate",
            "- Respond in natural conversational style (NOT in table format)",
            "User Message:",
            "{message}",
            "Your Response:",
        ]
    )
    return ChatPromptTemplate.from_template(template)
def literature_review_table(
    topic: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> str:
    """
    Generate a literature review as a Markdown TABLE using multi-query web results.

    Falls back to an info/error mini-table when no sources are found or the
    LLM call fails, and to a bare-bones table when the LLM output is not
    table-shaped.
    """
    articles = fetch_literature_results_multi(
        topic=topic,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )
    if not articles:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            "| Info | No academic sources found for this topic; try refining the query or checking the connection. |\n"
        )
    # Flatten the hits into a plain-text digest for the LLM.
    compiled_text = "".join(
        f"Title: {art.get('title', '')}\n"
        f"Abstract: {art.get('body', '')}\n"
        f"Source: {art.get('source', '')}\n"
        f"Link: {art.get('link', '')}\n\n"
        for art in articles
    )
    msgs = _build_table_prompt().format_messages(topic=topic, compiled_text=compiled_text)
    try:
        response = llm.invoke(msgs).content
    except Exception as e:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            f"| Error | Error generating literature table: {str(e)} |\n"
        )
    # Sanity check: accept the LLM answer only if it looks like a table.
    if isinstance(response, str) and "|" in response:
        return response
    # Otherwise construct a minimal table from the top hits ourselves.
    header = "| Year | Title | Authors | Key Contribution / Findings | Citations | Source |\n"
    sep = "|------|-------|---------|-----------------------------|-----------|--------|\n"
    lines = [header, sep]
    for art in articles[: min(10, len(articles))]:
        title = art.get("title") or "Untitled"
        year = parse_year_from_text(art.get("body", "")) or "N/A"
        link = art.get("link") or ""
        lines.append(f"| {year} | {title} | N/A | N/A | N/A | {link} |\n")
    return "".join(lines)
def chat_response(message: str) -> str:
    """Return a plain conversational LLM reply (no table, no web search)."""
    msgs = _build_chat_prompt().format_messages(message=message)
    try:
        reply = llm.invoke(msgs).content
    except Exception as e:
        # Surface the failure to the user rather than raising.
        return f"I apologize, but an error occurred: {str(e)}\nPlease try again or rephrase the question."
    if isinstance(reply, str):
        return reply
    return (
        "I apologize, but I couldn't generate a proper response. Please try again."
    )
def answer_as_table(
    message: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
    force_web: bool = False,
) -> str:
    """
    Route a user message.

    force_web=True  -> web-backed Markdown literature table.
    force_web=False -> plain conversational reply (no web access).
    Blank/empty messages yield "".
    """
    cleaned = (message or "").strip()
    if not cleaned:
        return ""
    if not force_web:
        # Plain chat (no web)
        return chat_response(cleaned)
    return literature_review_table(
        cleaned,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )