# core.py
from __future__ import annotations

import itertools
import logging
import math
import os
import re
import uuid
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlsplit, urlunsplit

from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

try:
    # optional drop-in providing .text()
    from ddgs import DDGS  # type: ignore
except ImportError:
    # provides DDGS().text with region/safesearch/timelimit/max_results options
    from duckduckgo_search import DDGS  # type: ignore

logger = logging.getLogger(__name__)

# Initialize LLM (Gemini via LangChain integration)
# Note: GOOGLE_API_KEY must be set in the environment for this to work.
# Example: export GOOGLE_API_KEY="your-key"
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    max_output_tokens=None,
    timeout=60,
    max_retries=3,
)

# Appended to every generated query to restrict hits to academic venues.
ACADEMIC_SITES_FILTER = (
    "site:neurips.cc OR site:arxiv.org OR site:icml.cc OR site:iclr.cc OR "
    "site:aaai.org OR site:ijcai.org OR site:thecvf.com OR site:kdd.org OR "
    "site:sigcomm.org OR site:usenix.org OR site:ieeexplore.ieee.org"
)

# Backends tried, in order, after the caller's preferred backend (if any).
_FALLBACK_BACKENDS = ("lite", "html", "api", "auto")

# Non-capturing group: with a capturing group, findall/search would yield only
# the century prefix ("19"/"20") instead of the full four-digit year.
_YEAR_RE = re.compile(r"\b(?:19|20)\d{2}\b")

# Leading bullet or "1." / "1)" numbering that an LLM may add to each line.
_LIST_MARKER_RE = re.compile(r"^\s*(?:[-*•]+|\d+[.)])\s+")

_QUERY_PROMPT_TEMPLATE = """
Act as a query planner for academic literature search.
Given a topic, produce 2–3 distinct, short keyword-based queries optimized for academic sources.

Requirements:
- Be concise (each query < 12 words).
- Avoid punctuation except site: filters or boolean OR if needed.
- Prefer neutral, general keywords and important synonyms.
- Return ONLY the queries, one per line, no numbering or extra text.

Topic: {topic}
"""

_TABLE_PROMPT_TEMPLATE = """
You are a meticulous academic research analyst specializing in synthesizing scholarly publications.
You will examine the provided list of paper titles and abstracts in detail.
Your objective is to produce a high-quality, chronologically sorted (latest → oldest) literature review table in Markdown format.

For each paper, you must:
- Accurately determine the Year (from metadata, title, or context; estimate if unclear).
- Identify and list the Title in full.
- Extract or infer Authors from the text; if not stated, write 'N/A'.
- Summarize Key Contribution / Findings in 1–2 precise, academically phrased sentences.
- Record Citation Count if mentioned; if not, write 'N/A'.
- Provide the Source Link if present; if absent, write 'N/A'.

Additional requirements:
- If publication venue (journal/conference) is mentioned, briefly note it in parentheses after the year.
- Use neutral, scholarly tone and avoid unnecessary adjectives.
- Ensure all summaries focus on the core novel contribution, methodology highlights, and notable results.
- Maintain uniform formatting for all rows and ensure alignment of columns in Markdown.
- Double-check chronological order: newest year first, oldest last.

Topic: {topic}

Papers:
{compiled_text}

Now output ONLY the Markdown table. Do not include commentary before or after the table.
"""

_CHAT_PROMPT_TEMPLATE = """
You are a helpful academic research assistant with expertise in computer science, machine learning, and related fields.
Provide clear, accurate, and informative responses to academic questions. Use a friendly but professional tone.

Guidelines:
- Be concise but thorough
- Explain concepts clearly
- Use examples when helpful
- Break down complex topics
- Cite established facts when appropriate
- Respond in natural conversational style (NOT in table format)

User Message: {message}

Your Response:
"""


def parse_year_from_text(text: str) -> Optional[int]:
    """Return the first four-digit year (1900-2099) found in *text*.

    Returns ``None`` when *text* is empty/``None`` or contains no such year.
    """
    match = _YEAR_RE.search(text or "")
    return int(match.group(0)) if match else None


def _normalize_url(u: str) -> str:
    """Return a canonical form of *u* for duplicate detection.

    Scheme and host are lower-cased, a trailing slash on the path is removed,
    and the query string and fragment are dropped. Empty input yields ``""``.
    """
    if not u:
        return ""
    try:
        parts = urlsplit(u.strip())
        # drop query/fragment to normalize
        return urlunsplit(
            (parts.scheme.lower(), parts.netloc.lower(), parts.path.rstrip("/"), "", "")
        )
    except ValueError:
        # urlsplit rejects some malformed inputs; fall back to a plain
        # textual normalization.
        return u.strip().rstrip("/").lower()


def _md_cell(value: Any) -> str:
    """Render *value* so it cannot break out of a single Markdown table cell."""
    # Collapse newlines/runs of whitespace, then escape the column separator.
    return " ".join(str(value).split()).replace("|", "\\|")


def _materialize_results(res: Any) -> List[Dict[str, Any]]:
    """Turn a ``DDGS.text`` return value (``None``, list, or iterator) into a list."""
    if res is None:
        return []
    return res if isinstance(res, list) else list(res)


def _safe_ddgs_text_call(
    ddgs: DDGS,
    query: str,
    region: str,
    safesearch: str,
    timelimit: Optional[str],
    max_results: Optional[int],
    backend: Optional[str] = None,
    retries: int = 2,
) -> List[Dict[str, Any]]:
    """Call ``ddgs.text`` tolerating differing library signatures and backends.

    The preferred *backend* (if given) is tried first, followed by
    ``lite``, ``html``, ``api`` and ``auto``. Each backend is attempted
    ``max(1, retries)`` times. A ``TypeError`` from the call is taken to mean
    the library does not accept ``backend``/``max_results``, and the call is
    retried without them (truncating to *max_results* locally). Other
    exceptions are logged and treated as "no results".

    Returns the first non-empty result list, or ``[]`` if every attempt
    comes back empty.
    """
    candidate_backends: List[str] = [backend] if backend else []
    candidate_backends.extend(b for b in _FALLBACK_BACKENDS if b != backend)
    attempts = max(1, retries)

    for candidate in candidate_backends:
        for _ in range(attempts):
            try:
                # Newer versions: returns list; older: generator
                results = _materialize_results(
                    ddgs.text(
                        query,
                        region=region,
                        safesearch=safesearch,
                        timelimit=timelimit,
                        backend=candidate,
                        max_results=max_results,
                    )
                )
            except TypeError:
                # Older signature without backend/max_results
                try:
                    results = _materialize_results(
                        ddgs.text(
                            query,
                            region=region,
                            safesearch=safesearch,
                            timelimit=timelimit,
                        )
                    )
                    if max_results:
                        results = results[:max_results]
                except Exception:
                    logger.debug(
                        "DDGS legacy text() call failed for %r", query, exc_info=True
                    )
                    results = []
            except Exception:
                logger.debug(
                    "DDGS text() failed for %r (backend=%s)",
                    query,
                    candidate,
                    exc_info=True,
                )
                results = []

            if results:
                return results

    return []


def _build_query_prompt() -> ChatPromptTemplate:
    """Build the prompt that asks for 2–3 short keyword queries for a topic."""
    return ChatPromptTemplate.from_template(_QUERY_PROMPT_TEMPLATE.strip())


def generate_search_queries(topic: str, k: int = 3) -> List[str]:
    """Use the LLM to propose concise web-search queries for *topic*.

    The LLM output is split into lines, stripped of list markers/numbering,
    and de-duplicated case-insensitively while preserving order. If fewer
    than two queries remain (including when the LLM call fails), heuristic
    fallbacks derived from *topic* are appended.

    Returns at most ``max(2, k)`` queries.
    """
    prompt = _build_query_prompt()
    base = (topic or "").strip()
    msgs = prompt.format_messages(topic=base)

    try:
        content = llm.invoke(msgs).content
    except Exception:
        # Best effort: fall through to the heuristic fallbacks below.
        logger.warning("Query generation via LLM failed", exc_info=True)
        content = ""
    out = content.strip() if isinstance(content, str) else ""

    limit = max(2, k)
    seen = set()
    deduped: List[str] = []

    def _add(candidate: str) -> None:
        key = candidate.lower()
        if candidate and key not in seen:
            deduped.append(candidate)
            seen.add(key)

    for line in out.splitlines():
        # The prompt forbids numbering, but models do not always comply.
        _add(_LIST_MARKER_RE.sub("", line).strip())

    # Ensure at least 2 queries; fallback heuristics
    if len(deduped) < 2:
        fallbacks = (
            base,
            f"{base} method comparison",
            f"{base} benchmarks",
            f"{base} survey review",
        )
        for fb in fallbacks:
            _add(fb.strip())
            if len(deduped) >= limit:
                break

    return deduped[:limit]


def fetch_literature_results_multi(
    topic: str,
    region: str = "wt-wt",  # prefer wt-wt for robustness
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Fetch academic search hits for *topic* across several generated queries.

    Each LLM-generated query is suffixed with ``ACADEMIC_SITES_FILTER`` and
    sent through ``_safe_ddgs_text_call``. Hits are converted to dicts with
    the keys ``title``, ``body``, ``link``, ``source`` and ``query_used``,
    de-duplicated by normalized URL (hits without a link are dropped), and
    truncated to *max_results*.

    If the search session fails part-way, the hits gathered so far are still
    returned. A non-positive *max_results* yields ``[]``.
    """
    if max_results <= 0:
        return []

    queries = generate_search_queries(topic, k=3)
    # Spread the budget over the queries, but ask for at least 3 per query.
    per_query = max(3, math.ceil(max_results / max(1, len(queries))))

    results: List[Dict[str, Any]] = []
    try:
        with DDGS() as ddgs:
            for q in queries:
                rows = _safe_ddgs_text_call(
                    ddgs,
                    f"{q} {ACADEMIC_SITES_FILTER}",
                    region=region,
                    safesearch=safesearch,
                    timelimit=timelimit,
                    max_results=per_query,
                    backend=backend,
                    retries=2,
                )
                results.extend(
                    {
                        "title": r.get("title", "") or "",
                        "body": r.get("body", "") or "",
                        "link": r.get("href", "") or "",
                        "source": r.get("source", "web") or "web",
                        "query_used": q,
                    }
                    for r in rows or []
                )
    except Exception:
        # Keep whatever was collected before the failure instead of
        # discarding it.
        logger.warning("Literature search aborted early", exc_info=True)

    # Deduplicate by normalized URL
    deduped: List[Dict[str, Any]] = []
    seen_links = set()
    for row in results:
        norm = _normalize_url(row.get("link", ""))
        if norm and norm not in seen_links:
            deduped.append(row)
            seen_links.add(norm)

    return deduped[:max_results]


def _build_table_prompt() -> ChatPromptTemplate:
    """Build the prompt that turns compiled search hits into a Markdown table.

    The template expects ``topic`` and ``compiled_text`` and asks for rows
    sorted by year (latest → oldest).
    """
    return ChatPromptTemplate.from_template(_TABLE_PROMPT_TEMPLATE.strip())


def _build_chat_prompt() -> ChatPromptTemplate:
    """Build the prompt for normal chat responses (no web formatting)."""
    return ChatPromptTemplate.from_template(_CHAT_PROMPT_TEMPLATE.strip())


def literature_review_table(
    topic: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> str:
    """Generate a literature review as a Markdown table from web results.

    Returns, in order of precedence:
    - a two-column "Info" table when no search hits are found;
    - a two-column "Error" table when the LLM call raises;
    - the LLM's response when it is a string containing ``|``;
    - otherwise a minimal table built directly from the first 10 hits.
    """
    articles = fetch_literature_results_multi(
        topic=topic,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )

    if not articles:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            "| Info | No academic sources found for this topic; try refining the query or checking the connection. |\n"
        )

    # Compile search results for the LLM
    compiled_text = "".join(
        f"Title: {art.get('title', '')}\n"
        f"Abstract: {art.get('body', '')}\n"
        f"Source: {art.get('source', '')}\n"
        f"Link: {art.get('link', '')}\n\n"
        for art in articles
    )

    prompt = _build_table_prompt()
    msgs = prompt.format_messages(topic=topic, compiled_text=compiled_text)

    try:
        response = llm.invoke(msgs).content
    except Exception as e:
        # Escape the message so pipes/newlines in it cannot break the table.
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            f"| Error | Error generating literature table: {_md_cell(e)} |\n"
        )

    # Sanity: ensure it looks like a Markdown table
    if isinstance(response, str) and "|" in response:
        return response

    # Minimal fallback: construct a table from top hits
    header = "| Year | Title | Authors | Key Contribution / Findings | Citations | Source |\n"
    sep = "|------|-------|---------|-----------------------------|-----------|--------|\n"
    rows = []
    for art in articles[:10]:
        title = art.get("title") or "Untitled"
        year = (
            parse_year_from_text(art.get("body", ""))
            or parse_year_from_text(art.get("title", ""))
            or "N/A"
        )
        link = art.get("link") or ""
        rows.append(
            f"| {year} | {_md_cell(title)} | N/A | N/A | N/A | {_md_cell(link)} |\n"
        )
    return header + sep + "".join(rows)


def chat_response(message: str) -> str:
    """Generate a normal conversational response (no table, no web).

    Returns an apology string instead of raising when the LLM call fails or
    returns non-string content.
    """
    prompt = _build_chat_prompt()
    msgs = prompt.format_messages(message=message)

    try:
        response = llm.invoke(msgs).content
    except Exception as e:
        return f"I apologize, but an error occurred: {str(e)}\nPlease try again or rephrase the question."

    if not isinstance(response, str):
        return (
            "I apologize, but I couldn't generate a proper response. Please try again."
        )
    return response


def answer_as_table(
    message: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
    force_web: bool = False,
) -> str:
    """Route *message* to the web-backed table or to plain chat.

    - Blank/``None`` message: returns ``""``.
    - ``force_web`` is True: returns a Markdown table (web search).
    - ``force_web`` is False: returns plain chat text (no web); the search
      parameters are ignored.
    """
    message = (message or "").strip()
    if not message:
        return ""

    if not force_web:
        # Plain chat (no web)
        return chat_response(message)

    return literature_review_table(
        message,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )