# PaperScout / core.py
# Uploaded by vaishnaveswar — "Update core.py" (commit 3efff14, verified)
# core.py
from __future__ import annotations
import os
import re
import math
import uuid
import itertools
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlsplit, urlunsplit
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
try:
# optional drop-in providing .text()
from ddgs import DDGS # type: ignore
except ImportError:
# provides DDGS().text with region/safesearch/timelimit/max_results options
from duckduckgo_search import DDGS # type: ignore
# Initialize the shared LLM client (Gemini via the LangChain integration).
# Note: GOOGLE_API_KEY must be set in the environment for this to work.
# Example: export GOOGLE_API_KEY="your-key"
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",  # lightweight Gemini variant
    temperature=0,                  # deterministic output for reproducible tables
    max_output_tokens=None,         # no explicit cap on response length
    timeout=60,                     # seconds per request
    max_retries=3,                  # retry transient API failures
)
# DuckDuckGo site: filter restricting hits to well-known academic venues.
# NOTE: fixed the arXiv domain — the correct domain is arxiv.org, not
# arxiv.cc (the old term could never match any real arXiv page).
ACADEMIC_SITES_FILTER = (
    "site:neurips.cc OR site:arxiv.org OR site:icml.cc OR site:iclr.cc OR "
    "site:aaai.org OR site:ijcai.org OR site:thecvf.com OR site:kdd.org OR "
    "site:sigcomm.org OR site:usenix.org OR site:ieeexplore.ieee.org"
)
def parse_year_from_text(text: str) -> Optional[int]:
    """Extract the first four-digit publication year (1900-2099) from text.

    Returns None when text is falsy or contains no such year.
    """
    # BUG FIX: the old pattern r"\b(19|20)\d{2}\b" captured only the century
    # group, so re.findall yielded "19"/"20" and the function returned 19 or
    # 20 instead of the full year. A non-capturing group fixes this.
    match = re.search(r"\b(?:19|20)\d{2}\b", text or "")
    return int(match.group()) if match else None
def _normalize_url(u: str) -> str:
if not u:
return ""
try:
parts = urlsplit(u.strip())
# drop query/fragment to normalize
return urlunsplit(
(parts.scheme.lower(), parts.netloc.lower(), parts.path.rstrip("/"), "", "")
)
except Exception:
return u.strip().rstrip("/").lower()
def _safe_ddgs_text_call(
ddgs: DDGS,
query: str,
region: str,
safesearch: str,
timelimit: Optional[str],
max_results: Optional[int],
backend: Optional[str] = None,
retries: int = 2,
) -> List[Dict[str, Any]]:
"""
Call DDGS().text with graceful handling of different library signatures and backend fallbacks.
Tries a sequence of backends when no results are returned.
"""
# Preferred backend order: lite -> html -> api -> auto (some versions)
candidate_backends = []
if backend:
candidate_backends.append(backend)
candidate_backends.extend(
[b for b in ["lite", "html", "api", "auto"] if b != backend]
)
for b in candidate_backends:
for _ in range(max(1, retries)):
try:
# Newer versions: returns list; older: generator
res = ddgs.text(
query,
region=region,
safesearch=safesearch,
timelimit=timelimit,
backend=b,
max_results=max_results,
)
if res is None:
results = []
elif isinstance(res, list):
results = res
else:
# generator fallback
results = list(res)
except TypeError:
# Older signature without backend/max_results
try:
res = ddgs.text(
query,
region=region,
safesearch=safesearch,
timelimit=timelimit,
)
results = list(res) if res is not None else []
if max_results:
results = results[:max_results]
except Exception:
results = []
except Exception:
results = []
if results:
return results
return []
def _build_query_prompt() -> ChatPromptTemplate:
    """Build the prompt that asks the LLM for 2-3 short academic search queries."""
    # Joined line-by-line so the exact prompt text is easy to diff/review.
    template = "\n".join(
        [
            "Act as a query planner for academic literature search.",
            "Given a topic, produce 2–3 distinct, short keyword-based queries optimized for academic sources.",
            "Requirements:",
            "- Be concise (each query < 12 words).",
            "- Avoid punctuation except site: filters or boolean OR if needed.",
            "- Prefer neutral, general keywords and important synonyms.",
            "- Return ONLY the queries, one per line, no numbering or extra text.",
            "Topic:",
            "{topic}",
        ]
    )
    return ChatPromptTemplate.from_template(template)
def generate_search_queries(topic: str, k: int = 3) -> List[str]:
    """Ask the LLM for concise web-search queries for *topic*.

    Guarantees at least two queries — padding with simple heuristic
    expansions when the model under-delivers — and caps the result
    at max(2, k) entries.
    """
    cleaned_topic = (topic or "").strip()
    msgs = _build_query_prompt().format_messages(topic=cleaned_topic)
    try:
        raw = (llm.invoke(msgs).content or "").strip()
    except Exception:
        raw = ""
    # Keep non-blank lines, dropping case-insensitive duplicates in order.
    seen: set = set()
    queries: List[str] = []
    for line in raw.splitlines():
        candidate = line.strip()
        if candidate and candidate.lower() not in seen:
            queries.append(candidate)
            seen.add(candidate.lower())
    # Heuristic padding when the LLM produced fewer than two queries.
    if len(queries) < 2:
        expansions = (
            cleaned_topic,
            f"{cleaned_topic} method comparison",
            f"{cleaned_topic} benchmarks",
            f"{cleaned_topic} survey review",
        )
        for extra in expansions:
            if extra and extra.lower() not in seen:
                queries.append(extra)
                seen.add(extra.lower())
                if len(queries) >= max(2, k):
                    break
    return queries[: max(2, k)]
# Multi-query literature fetch with backend fallbacks and URL deduplication.
def fetch_literature_results_multi(
    topic: str,
    region: str = "wt-wt",  # prefer wt-wt for robustness
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Fetch academic results via DuckDuckGo across multiple LLM-generated queries
    with backend/region fallbacks and deduplication.

    Returns up to max_results dicts with title/body/link/source/query_used.

    ROBUSTNESS FIX: error handling is now per-query, so a failure on one
    query no longer discards results already gathered by earlier queries;
    only a failure to construct the DDGS client with nothing collected
    yields [].
    """
    queries = generate_search_queries(topic, k=3)
    # Spread the result budget across the queries, at least 3 hits each.
    per_query = max(3, math.ceil(max_results / max(1, len(queries))))
    results: List[Dict[str, Any]] = []
    try:
        with DDGS() as ddgs:
            for q in queries:
                q_aug = f"{q} {ACADEMIC_SITES_FILTER}"
                try:
                    rows = _safe_ddgs_text_call(
                        ddgs,
                        q_aug,
                        region=region,
                        safesearch=safesearch,
                        timelimit=timelimit,
                        max_results=per_query,
                        backend=backend,
                        retries=2,
                    )
                except Exception:
                    # Best effort: skip this query, keep earlier results.
                    continue
                for r in rows or []:
                    results.append(
                        {
                            "title": r.get("title", "") or "",
                            "body": r.get("body", "") or "",
                            "link": r.get("href", "") or "",
                            "source": r.get("source", "web") or "web",
                            "query_used": q,
                        }
                    )
    except Exception:
        # Client construction/teardown failed; return whatever was gathered.
        if not results:
            return []
    # Deduplicate by normalized URL, preserving first-seen order.
    deduped: List[Dict[str, Any]] = []
    seen_links = set()
    for row in results:
        norm = _normalize_url(row.get("link", ""))
        if norm and norm not in seen_links:
            deduped.append(row)
            seen_links.add(norm)
    return deduped[:max_results]
def _build_table_prompt() -> ChatPromptTemplate:
    """Prompt for the web-backed literature-review Markdown table.

    Instructs the model to emit ONLY a table, sorted newest year first.
    """
    # Joined line-by-line so the exact prompt text is easy to diff/review.
    template = "\n".join(
        [
            "You are a meticulous academic research analyst specializing in synthesizing scholarly publications.",
            "You will examine the provided list of paper titles and abstracts in detail.",
            "Your objective is to produce a high-quality, chronologically sorted (latest → oldest) literature review table in Markdown format.",
            "For each paper, you must:",
            "- Accurately determine the Year (from metadata, title, or context; estimate if unclear).",
            "- Identify and list the Title in full.",
            "- Extract or infer Authors from the text; if not stated, write 'N/A'.",
            "- Summarize Key Contribution / Findings in 1–2 precise, academically phrased sentences.",
            "- Record Citation Count if mentioned; if not, write 'N/A'.",
            "- Provide the Source Link if present; if absent, write 'N/A'.",
            "Additional requirements:",
            "- If publication venue (journal/conference) is mentioned, briefly note it in parentheses after the year.",
            "- Use neutral, scholarly tone and avoid unnecessary adjectives.",
            "- Ensure all summaries focus on the core novel contribution, methodology highlights, and notable results.",
            "- Maintain uniform formatting for all rows and ensure alignment of columns in Markdown.",
            "- Double-check chronological order: newest year first, oldest last.",
            "Topic: {topic}",
            "Papers:",
            "{compiled_text}",
            "Now output ONLY the Markdown table. Do not include commentary before or after the table.",
        ]
    )
    return ChatPromptTemplate.from_template(template)
def _build_chat_prompt() -> ChatPromptTemplate:
    """Prompt for plain conversational answers (explicitly NOT table-formatted)."""
    # Joined line-by-line so the exact prompt text is easy to diff/review.
    template = "\n".join(
        [
            "You are a helpful academic research assistant with expertise in computer science, machine learning, and related fields.",
            "Provide clear, accurate, and informative responses to academic questions. Use a friendly but professional tone.",
            "Guidelines:",
            "- Be concise but thorough",
            "- Explain concepts clearly",
            "- Use examples when helpful",
            "- Break down complex topics",
            "- Cite established facts when appropriate",
            "- Respond in natural conversational style (NOT in table format)",
            "User Message:",
            "{message}",
            "Your Response:",
        ]
    )
    return ChatPromptTemplate.from_template(template)
def literature_review_table(
    topic: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> str:
    """
    Generate a literature review as a Markdown TABLE using multi-query web results.

    Falls back to an info/error mini-table when no sources are found or the
    LLM call fails, and to a bare-bones table when the LLM output is not
    table-shaped.
    """
    articles = fetch_literature_results_multi(
        topic=topic,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )
    if not articles:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            "| Info | No academic sources found for this topic; try refining the query or checking the connection. |\n"
        )
    # Flatten the hits into a plain-text digest for the LLM.
    compiled_text = "".join(
        f"Title: {art.get('title', '')}\n"
        f"Abstract: {art.get('body', '')}\n"
        f"Source: {art.get('source', '')}\n"
        f"Link: {art.get('link', '')}\n\n"
        for art in articles
    )
    msgs = _build_table_prompt().format_messages(topic=topic, compiled_text=compiled_text)
    try:
        response = llm.invoke(msgs).content
    except Exception as e:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            f"| Error | Error generating literature table: {str(e)} |\n"
        )
    # Sanity check: accept the LLM answer only if it looks like a table.
    if isinstance(response, str) and "|" in response:
        return response
    # Otherwise construct a minimal table from the top hits ourselves.
    header = "| Year | Title | Authors | Key Contribution / Findings | Citations | Source |\n"
    sep = "|------|-------|---------|-----------------------------|-----------|--------|\n"
    lines = [header, sep]
    for art in articles[: min(10, len(articles))]:
        title = art.get("title") or "Untitled"
        year = parse_year_from_text(art.get("body", "")) or "N/A"
        link = art.get("link") or ""
        lines.append(f"| {year} | {title} | N/A | N/A | N/A | {link} |\n")
    return "".join(lines)
def chat_response(message: str) -> str:
    """Return a plain conversational LLM reply (no table, no web search)."""
    msgs = _build_chat_prompt().format_messages(message=message)
    try:
        reply = llm.invoke(msgs).content
    except Exception as e:
        # Surface the failure to the user rather than raising.
        return f"I apologize, but an error occurred: {str(e)}\nPlease try again or rephrase the question."
    if isinstance(reply, str):
        return reply
    return (
        "I apologize, but I couldn't generate a proper response. Please try again."
    )
def answer_as_table(
    message: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
    force_web: bool = False,
) -> str:
    """
    Route a user message.

    force_web=True  -> web-backed Markdown literature table.
    force_web=False -> plain conversational reply (no web access).
    Blank/empty messages yield "".
    """
    cleaned = (message or "").strip()
    if not cleaned:
        return ""
    if not force_web:
        # Plain chat (no web)
        return chat_response(cleaned)
    return literature_review_table(
        cleaned,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )