File size: 13,401 Bytes
af59dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3efff14
af59dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# core.py
from __future__ import annotations

import os
import re
import math
import uuid
import itertools
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlsplit, urlunsplit

from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

try:
    # optional drop-in providing .text()
    from ddgs import DDGS  # type: ignore
except ImportError:
    # provides DDGS().text with region/safesearch/timelimit/max_results options
    from duckduckgo_search import DDGS  # type: ignore


# Initialize LLM (Gemini via LangChain integration)
# Note: GOOGLE_API_KEY must be set in the environment for this to work.
# Example: export GOOGLE_API_KEY="your-key"
# temperature=0 keeps query planning and table generation deterministic;
# timeout/max_retries guard against transient API failures.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    max_output_tokens=None,  # no explicit cap; use the model's default limit
    timeout=60,
    max_retries=3,
)


# DuckDuckGo site: filter restricting searches to academic venues.
# NOTE: arXiv lives at arxiv.org (arxiv.cc is an unrelated domain); the
# conference domains neurips.cc / icml.cc / iclr.cc are correct as-is.
ACADEMIC_SITES_FILTER = (
    "site:neurips.cc OR site:arxiv.org OR site:icml.cc OR site:iclr.cc OR "
    "site:aaai.org OR site:ijcai.org OR site:thecvf.com OR site:kdd.org OR "
    "site:sigcomm.org OR site:usenix.org OR site:ieeexplore.ieee.org"
)


def parse_year_from_text(text: str) -> Optional[int]:
    """Extract the first four-digit publication year (1900-2099) from *text*.

    Returns None when *text* is falsy or contains no year-like token.
    """
    # The previous pattern used a capturing group (19|20), so re.findall
    # returned only "19"/"20" and int() produced 19 or 20 instead of the
    # full year. (?:...) keeps the whole match.
    match = re.search(r"\b(?:19|20)\d{2}\b", text or "")
    return int(match.group(0)) if match else None


def _normalize_url(u: str) -> str:
    if not u:
        return ""
    try:
        parts = urlsplit(u.strip())
        # drop query/fragment to normalize
        return urlunsplit(
            (parts.scheme.lower(), parts.netloc.lower(), parts.path.rstrip("/"), "", "")
        )
    except Exception:
        return u.strip().rstrip("/").lower()


def _safe_ddgs_text_call(
    ddgs: DDGS,
    query: str,
    region: str,
    safesearch: str,
    timelimit: Optional[str],
    max_results: Optional[int],
    backend: Optional[str] = None,
    retries: int = 2,
) -> List[Dict[str, Any]]:
    """
    Call DDGS().text with graceful handling of different library signatures and backend fallbacks.

    Each candidate backend is attempted up to ``retries`` times; the first
    non-empty result list is returned immediately. All exceptions are caught,
    so this helper never raises — total failure yields [].

    Args:
        ddgs: An open DDGS client instance.
        query: Query string (may embed site:/OR operators).
        region: DDG region code, e.g. "wt-wt".
        safesearch: Safesearch level string (e.g. "moderate").
        timelimit: Optional recency filter passed through to the library, or None.
        max_results: Cap on returned rows; None leaves it to the library.
        backend: Preferred backend tried first, if given.
        retries: Attempts per backend (floored at 1).

    Returns:
        List of result dicts from the first successful call, else [].
    """
    # Preferred backend order: lite -> html -> api -> auto (some versions)
    candidate_backends = []
    if backend:
        candidate_backends.append(backend)
    candidate_backends.extend(
        [b for b in ["lite", "html", "api", "auto"] if b != backend]
    )

    for b in candidate_backends:
        for _ in range(max(1, retries)):
            try:
                # Newer versions: returns list; older: generator
                res = ddgs.text(
                    query,
                    region=region,
                    safesearch=safesearch,
                    timelimit=timelimit,
                    backend=b,
                    max_results=max_results,
                )
                if res is None:
                    results = []
                elif isinstance(res, list):
                    results = res
                else:
                    # generator fallback
                    results = list(res)
            except TypeError:
                # Older signature without backend/max_results
                try:
                    res = ddgs.text(
                        query,
                        region=region,
                        safesearch=safesearch,
                        timelimit=timelimit,
                    )
                    results = list(res) if res is not None else []
                    # Emulate max_results client-side for the old signature.
                    if max_results:
                        results = results[:max_results]
                except Exception:
                    results = []
            except Exception:
                # Deliberate best-effort: swallow and advance to the next
                # retry/backend rather than propagate search-layer errors.
                results = []

            if results:
                return results
    return []


def _build_query_prompt() -> ChatPromptTemplate:
    """Build the planner prompt that asks for 2–3 short academic search queries."""
    template = """
Act as a query planner for academic literature search.
Given a topic, produce 2–3 distinct, short keyword-based queries optimized for academic sources.
Requirements:
- Be concise (each query < 12 words).
- Avoid punctuation except site: filters or boolean OR if needed.
- Prefer neutral, general keywords and important synonyms.
- Return ONLY the queries, one per line, no numbering or extra text.

Topic:
{topic}
""".strip()
    return ChatPromptTemplate.from_template(template)


def generate_search_queries(topic: str, k: int = 3) -> List[str]:
    """
    Ask the LLM for concise web-search queries about *topic*.

    Guarantees at least two queries via heuristic fallbacks when the LLM
    returns too few (or fails), dedupes case-insensitively while preserving
    order, and returns at most max(2, k) queries.
    """
    base_topic = (topic or "").strip()
    try:
        raw = (llm.invoke(_build_query_prompt().format_messages(topic=base_topic)).content or "").strip()
    except Exception:
        raw = ""

    limit = max(2, k)
    seen_lower: set = set()
    queries: List[str] = []

    # Each non-empty output line is one candidate query.
    for line in raw.splitlines():
        candidate = line.strip()
        if candidate and candidate.lower() not in seen_lower:
            queries.append(candidate)
            seen_lower.add(candidate.lower())

    # Heuristic expansions when the LLM gave fewer than two usable queries.
    if len(queries) < 2:
        for fallback in (
            base_topic,
            f"{base_topic} method comparison",
            f"{base_topic} benchmarks",
            f"{base_topic} survey review",
        ):
            if fallback and fallback.lower() not in seen_lower:
                queries.append(fallback)
                seen_lower.add(fallback.lower())
            if len(queries) >= limit:
                break

    return queries[:limit]


def fetch_literature_results_multi(
    topic: str,
    region: str = "wt-wt",  # wt-wt tends to be the most robust region
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Run several LLM-generated DuckDuckGo searches for *topic*, restricted to
    academic sites, and return up to *max_results* URL-deduplicated rows.

    Best-effort: any search-layer failure yields [] rather than raising.
    """
    queries = generate_search_queries(topic, k=3)
    per_query = max(3, math.ceil(max_results / max(1, len(queries))))

    raw_rows: List[Dict[str, Any]] = []
    try:
        with DDGS() as ddgs:
            for query in queries:
                hits = _safe_ddgs_text_call(
                    ddgs,
                    f"{query} {ACADEMIC_SITES_FILTER}",
                    region=region,
                    safesearch=safesearch,
                    timelimit=timelimit,
                    max_results=per_query,
                    backend=backend,
                    retries=2,
                )
                raw_rows.extend(
                    {
                        "title": hit.get("title", "") or "",
                        "body": hit.get("body", "") or "",
                        "link": hit.get("href", "") or "",
                        "source": hit.get("source", "web") or "web",
                        "query_used": query,
                    }
                    for hit in hits or []
                )
    except Exception:
        # Mirror the original contract: any failure returns an empty list.
        return []

    # Deduplicate by normalized URL, keeping first occurrence.
    unique_rows: List[Dict[str, Any]] = []
    seen_urls: set = set()
    for row in raw_rows:
        key = _normalize_url(row.get("link", ""))
        if key and key not in seen_urls:
            seen_urls.add(key)
            unique_rows.append(row)

    return unique_rows[:max_results]


def _build_table_prompt() -> ChatPromptTemplate:
    """Build the prompt that turns fetched papers into a chronologically
    sorted (latest → oldest) Markdown literature-review table."""
    template = """
You are a meticulous academic research analyst specializing in synthesizing scholarly publications.
You will examine the provided list of paper titles and abstracts in detail.

Your objective is to produce a high-quality, chronologically sorted (latest → oldest) literature review table in Markdown format.

For each paper, you must:
- Accurately determine the Year (from metadata, title, or context; estimate if unclear).
- Identify and list the Title in full.
- Extract or infer Authors from the text; if not stated, write 'N/A'.
- Summarize Key Contribution / Findings in 1–2 precise, academically phrased sentences.
- Record Citation Count if mentioned; if not, write 'N/A'.
- Provide the Source Link if present; if absent, write 'N/A'.

Additional requirements:
- If publication venue (journal/conference) is mentioned, briefly note it in parentheses after the year.
- Use neutral, scholarly tone and avoid unnecessary adjectives.
- Ensure all summaries focus on the core novel contribution, methodology highlights, and notable results.
- Maintain uniform formatting for all rows and ensure alignment of columns in Markdown.
- Double-check chronological order: newest year first, oldest last.

Topic: {topic}

Papers:
{compiled_text}

Now output ONLY the Markdown table. Do not include commentary before or after the table.
""".strip()
    return ChatPromptTemplate.from_template(template)


def _build_chat_prompt() -> ChatPromptTemplate:
    """Build the plain conversational prompt (no table formatting, no web)."""
    template = """
You are a helpful academic research assistant with expertise in computer science, machine learning, and related fields.
Provide clear, accurate, and informative responses to academic questions. Use a friendly but professional tone.

Guidelines:
- Be concise but thorough
- Explain concepts clearly
- Use examples when helpful
- Break down complex topics
- Cite established facts when appropriate
- Respond in natural conversational style (NOT in table format)

User Message:
{message}

Your Response:
""".strip()
    return ChatPromptTemplate.from_template(template)


def literature_review_table(
    topic: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> str:
    """
    Generate a literature review as a Markdown TABLE using multi-query web results.

    Args:
        topic: Research topic to survey.
        region: DDG region code forwarded to the search layer.
        max_results: Maximum deduplicated articles to feed the LLM.
        safesearch: DDG safesearch level.
        timelimit: Optional DDG recency filter.
        backend: Optional DDG backend preference.

    Returns:
        A Markdown table string. When no sources are found or the LLM call
        fails, a small two-column Info/Error table is returned instead, so
        the output is always renderable Markdown.
    """
    articles = fetch_literature_results_multi(
        topic=topic,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )

    if not articles:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            "| Info | No academic sources found for this topic; try refining the query or checking the connection. |\n"
        )

    # Compile search results into one context block for the LLM.
    # str.join avoids the quadratic cost of repeated string concatenation.
    compiled_text = "".join(
        f"Title: {art.get('title', '')}\n"
        f"Abstract: {art.get('body', '')}\n"
        f"Source: {art.get('source', '')}\n"
        f"Link: {art.get('link', '')}\n\n"
        for art in articles
    )

    prompt = _build_table_prompt()
    msgs = prompt.format_messages(topic=topic, compiled_text=compiled_text)

    try:
        response = llm.invoke(msgs).content
    except Exception as e:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            f"| Error | Error generating literature table: {str(e)} |\n"
        )

    # Sanity: ensure the LLM output looks like a Markdown table.
    if not isinstance(response, str) or "|" not in response:
        # Minimal fallback: construct a bare table from the top hits.
        header = "| Year | Title | Authors | Key Contribution / Findings | Citations | Source |\n"
        sep = "|------|-------|---------|-----------------------------|-----------|--------|\n"
        rows = []
        for art in articles[:10]:  # slicing already clamps at len(articles)
            title = art.get("title") or "Untitled"
            year = parse_year_from_text(art.get("body", "")) or "N/A"
            link = art.get("link") or ""
            rows.append(f"| {year} | {title} | N/A | N/A | N/A | {link} |\n")
        response = header + sep + "".join(rows)

    return response


def chat_response(message: str) -> str:
    """Answer *message* conversationally via the LLM (no table, no web)."""
    messages = _build_chat_prompt().format_messages(message=message)

    try:
        reply = llm.invoke(messages).content
    except Exception as e:
        return f"I apologize, but an error occurred: {str(e)}\nPlease try again or rephrase the question."

    # Guard against non-string payloads from the LLM client.
    if isinstance(reply, str):
        return reply
    return (
        "I apologize, but I couldn't generate a proper response. Please try again."
    )


def answer_as_table(
    message: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
    force_web: bool = False,
) -> str:
    """
    Route *message* to the appropriate response mode.

    - force_web=True:  web-backed Markdown TABLE via literature_review_table.
    - force_web=False: plain conversational reply (no web access).
    Empty or whitespace-only messages return "".
    """
    text = (message or "").strip()
    if not text:
        return ""

    if not force_web:
        # Plain chat (no web)
        return chat_response(text)

    return literature_review_table(
        text,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )