Spaces:
Sleeping
Sleeping
| # core.py | |
| from __future__ import annotations | |
| import os | |
| import re | |
| import math | |
| import uuid | |
| import itertools | |
| from typing import Any, Dict, List, Optional, Tuple | |
| from urllib.parse import urlsplit, urlunsplit | |
| from langchain_core.prompts import ChatPromptTemplate | |
| from langchain_google_genai import ChatGoogleGenerativeAI | |
| try: | |
| # optional drop-in providing .text() | |
| from ddgs import DDGS # type: ignore | |
| except ImportError: | |
| # provides DDGS().text with region/safesearch/timelimit/max_results options | |
| from duckduckgo_search import DDGS # type: ignore | |
# Initialize LLM (Gemini via LangChain integration)
# Note: GOOGLE_API_KEY must be set in the environment for this to work.
# Example: export GOOGLE_API_KEY="your-key"
# Shared module-level client used by generate_search_queries,
# literature_review_table, and chat_response below.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,            # deterministic output (stable tables/queries)
    max_output_tokens=None,   # no explicit cap on response length
    timeout=60,               # seconds per request
    max_retries=3,            # retry transient API failures
)
# DuckDuckGo "site:" filter restricting hits to major academic venues.
# Fix: arXiv is hosted at arxiv.org — the previous "site:arxiv.cc" pointed at
# a non-existent domain and silently excluded every arXiv result.
ACADEMIC_SITES_FILTER = (
    "site:neurips.cc OR site:arxiv.org OR site:icml.cc OR site:iclr.cc OR "
    "site:aaai.org OR site:ijcai.org OR site:thecvf.com OR site:kdd.org OR "
    "site:sigcomm.org OR site:usenix.org OR site:ieeexplore.ieee.org"
)
def parse_year_from_text(text: str) -> Optional[int]:
    """Extract the first plausible publication year (1900–2099) from text.

    Args:
        text: Arbitrary text (may be None or empty).

    Returns:
        The 4-digit year as an int, or None if no year is found.
    """
    # BUG FIX: re.findall with a capturing group returns only the group text,
    # so the old code yielded int("19") or int("20") instead of the full year.
    # A non-capturing group makes the match cover the whole 4-digit year.
    match = re.search(r"\b(?:19|20)\d{2}\b", text or "")
    return int(match.group(0)) if match else None
def _normalize_url(u: str) -> str:
    """Canonicalize a URL for deduplication.

    Lowercases scheme and host, strips the query string, fragment, and any
    trailing slash on the path. Falls back to a crude lowercase/strip when
    the URL cannot be parsed.
    """
    if not u:
        return ""
    candidate = u.strip()
    try:
        scheme, netloc, path, _query, _fragment = urlsplit(candidate)
    except Exception:
        # Unparseable input: best-effort normalization.
        return candidate.rstrip("/").lower()
    # Query and fragment are dropped on purpose so tracking params don't
    # make the same page look like two different results.
    return urlunsplit((scheme.lower(), netloc.lower(), path.rstrip("/"), "", ""))
def _safe_ddgs_text_call(
    ddgs: DDGS,
    query: str,
    region: str,
    safesearch: str,
    timelimit: Optional[str],
    max_results: Optional[int],
    backend: Optional[str] = None,
    retries: int = 2,
) -> List[Dict[str, Any]]:
    """Invoke ``ddgs.text`` defensively across library versions and backends.

    Backends are tried in preference order (the requested one first, then
    lite -> html -> api -> auto), each attempted up to ``retries`` times;
    the first non-empty result list wins.  Older library signatures that
    lack the backend/max_results keywords are handled via a TypeError
    fallback, and any other failure simply yields an empty attempt.
    """
    ordered_backends: List[str] = ([backend] if backend else []) + [
        b for b in ("lite", "html", "api", "auto") if b != backend
    ]
    attempts = max(1, retries)

    for chosen in ordered_backends:
        for _ in range(attempts):
            try:
                raw = ddgs.text(
                    query,
                    region=region,
                    safesearch=safesearch,
                    timelimit=timelimit,
                    backend=chosen,
                    max_results=max_results,
                )
                if raw is None:
                    hits: List[Dict[str, Any]] = []
                elif isinstance(raw, list):
                    hits = raw
                else:
                    # Older versions return a generator.
                    hits = list(raw)
            except TypeError:
                # Legacy signature: no backend/max_results keywords.
                try:
                    raw = ddgs.text(
                        query,
                        region=region,
                        safesearch=safesearch,
                        timelimit=timelimit,
                    )
                    hits = list(raw) if raw is not None else []
                    if max_results:
                        hits = hits[:max_results]
                except Exception:
                    hits = []
            except Exception:
                hits = []
            if hits:
                return hits
    return []
def _build_query_prompt() -> ChatPromptTemplate:
    """Build the prompt template asking the LLM for 2–3 short search queries."""
    template = """
Act as a query planner for academic literature search.
Given a topic, produce 2–3 distinct, short keyword-based queries optimized for academic sources.
Requirements:
- Be concise (each query < 12 words).
- Avoid punctuation except site: filters or boolean OR if needed.
- Prefer neutral, general keywords and important synonyms.
- Return ONLY the queries, one per line, no numbering or extra text.
Topic:
{topic}
""".strip()
    return ChatPromptTemplate.from_template(template)
def generate_search_queries(topic: str, k: int = 3) -> List[str]:
    """Ask the LLM for concise web-search queries for *topic*.

    Guarantees at least two queries via heuristic fallbacks, deduplicates
    case-insensitively while preserving order, and returns at most
    ``max(2, k)`` entries.
    """
    base = (topic or "").strip()
    messages = _build_query_prompt().format_messages(topic=base)
    try:
        raw = (llm.invoke(messages).content or "").strip()
    except Exception:
        # LLM unavailable -> rely entirely on the fallbacks below.
        raw = ""

    picked: List[str] = []
    seen_lower = set()
    for line in raw.splitlines():
        candidate = line.strip()
        if candidate and candidate.lower() not in seen_lower:
            picked.append(candidate)
            seen_lower.add(candidate.lower())

    if len(picked) < 2:
        # Heuristic expansions of the raw topic.
        for extra in (
            base,
            f"{base} method comparison",
            f"{base} benchmarks",
            f"{base} survey review",
        ):
            if extra and extra.lower() not in seen_lower:
                picked.append(extra)
                seen_lower.add(extra.lower())
                if len(picked) >= max(2, k):
                    break

    return picked[: max(2, k)]
def fetch_literature_results_multi(
    topic: str,
    region: str = "wt-wt",  # wt-wt is the most robust default region
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Fetch academic results via DuckDuckGo across LLM-generated queries.

    Each query is augmented with the academic site filter; per-query quota
    is the overall budget split evenly (minimum 3).  Results are merged and
    deduplicated by normalized URL; any transport failure yields [].
    """
    queries = generate_search_queries(topic, k=3)
    quota = max(3, math.ceil(max_results / max(1, len(queries))))

    collected: List[Dict[str, Any]] = []
    try:
        with DDGS() as ddgs:
            for query in queries:
                rows = _safe_ddgs_text_call(
                    ddgs,
                    f"{query} {ACADEMIC_SITES_FILTER}",
                    region=region,
                    safesearch=safesearch,
                    timelimit=timelimit,
                    max_results=quota,
                    backend=backend,
                    retries=2,
                )
                for hit in rows or []:
                    collected.append(
                        {
                            "title": hit.get("title", "") or "",
                            "body": hit.get("body", "") or "",
                            "link": hit.get("href", "") or "",
                            "source": hit.get("source", "web") or "web",
                            "query_used": query,
                        }
                    )
    except Exception:
        # Treat any search-library failure as "no results".
        return []

    # Keep only the first occurrence of each normalized URL (rows without
    # a usable link are dropped entirely).
    unique: List[Dict[str, Any]] = []
    seen_links = set()
    for row in collected:
        key = _normalize_url(row.get("link", ""))
        if key and key not in seen_links:
            seen_links.add(key)
            unique.append(row)
    return unique[:max_results]
def _build_table_prompt() -> ChatPromptTemplate:
    """Build the prompt template that renders web hits as a Markdown table.

    The table is requested in reverse-chronological order (latest first);
    used only when web search is enabled.
    """
    template = """
You are a meticulous academic research analyst specializing in synthesizing scholarly publications.
You will examine the provided list of paper titles and abstracts in detail.
Your objective is to produce a high-quality, chronologically sorted (latest → oldest) literature review table in Markdown format.
For each paper, you must:
- Accurately determine the Year (from metadata, title, or context; estimate if unclear).
- Identify and list the Title in full.
- Extract or infer Authors from the text; if not stated, write 'N/A'.
- Summarize Key Contribution / Findings in 1–2 precise, academically phrased sentences.
- Record Citation Count if mentioned; if not, write 'N/A'.
- Provide the Source Link if present; if absent, write 'N/A'.
Additional requirements:
- If publication venue (journal/conference) is mentioned, briefly note it in parentheses after the year.
- Use neutral, scholarly tone and avoid unnecessary adjectives.
- Ensure all summaries focus on the core novel contribution, methodology highlights, and notable results.
- Maintain uniform formatting for all rows and ensure alignment of columns in Markdown.
- Double-check chronological order: newest year first, oldest last.
Topic: {topic}
Papers:
{compiled_text}
Now output ONLY the Markdown table. Do not include commentary before or after the table.
""".strip()
    return ChatPromptTemplate.from_template(template)
def _build_chat_prompt() -> ChatPromptTemplate:
    """Build the prompt template for plain conversational replies (no web, no table)."""
    template = """
You are a helpful academic research assistant with expertise in computer science, machine learning, and related fields.
Provide clear, accurate, and informative responses to academic questions. Use a friendly but professional tone.
Guidelines:
- Be concise but thorough
- Explain concepts clearly
- Use examples when helpful
- Break down complex topics
- Cite established facts when appropriate
- Respond in natural conversational style (NOT in table format)
User Message:
{message}
Your Response:
""".strip()
    return ChatPromptTemplate.from_template(template)
def literature_review_table(
    topic: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> str:
    """Generate a Markdown literature-review table from multi-query web results.

    Always returns a Markdown table string: an "Info" row when no sources
    are found, an "Error" row when the LLM call fails, and a minimal
    hand-built table when the LLM reply does not look like a table.
    """
    articles = fetch_literature_results_multi(
        topic=topic,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )
    if not articles:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            "| Info | No academic sources found for this topic; try refining the query or checking the connection. |\n"
        )

    # Flatten the hits into a plain-text digest for the LLM.
    compiled_text = "".join(
        f"Title: {a.get('title', '')}\n"
        f"Abstract: {a.get('body', '')}\n"
        f"Source: {a.get('source', '')}\n"
        f"Link: {a.get('link', '')}\n\n"
        for a in articles
    )

    msgs = _build_table_prompt().format_messages(topic=topic, compiled_text=compiled_text)
    try:
        response = llm.invoke(msgs).content
    except Exception as e:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            f"| Error | Error generating literature table: {str(e)} |\n"
        )

    if isinstance(response, str) and "|" in response:
        return response

    # Fallback: assemble a bare table from the top hits ourselves.
    header = "| Year | Title | Authors | Key Contribution / Findings | Citations | Source |\n"
    sep = "|------|-------|---------|-----------------------------|-----------|--------|\n"
    body_rows = []
    for a in articles[: min(10, len(articles))]:
        title = a.get("title") or "Untitled"
        year = parse_year_from_text(a.get("body", "")) or "N/A"
        link = a.get("link") or ""
        body_rows.append(f"| {year} | {title} | N/A | N/A | N/A | {link} |\n")
    return header + sep + "".join(body_rows)
def chat_response(message: str) -> str:
    """Produce a plain conversational reply via the LLM (no web, no table).

    Returns an apology string if the LLM call raises or yields a
    non-string payload.
    """
    msgs = _build_chat_prompt().format_messages(message=message)
    try:
        reply = llm.invoke(msgs).content
    except Exception as e:
        return f"I apologize, but an error occurred: {str(e)}\nPlease try again or rephrase the question."
    if isinstance(reply, str):
        return reply
    return "I apologize, but I couldn't generate a proper response. Please try again."
def answer_as_table(
    message: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
    force_web: bool = False,
) -> str:
    """Route a user message to either a web-backed table or a plain chat reply.

    force_web=True  -> Markdown table built from web search results.
    force_web=False -> conversational answer with no web access.
    Empty/whitespace-only input returns "".
    """
    cleaned = (message or "").strip()
    if not cleaned:
        return ""
    if not force_web:
        # Plain chat path (no web).
        return chat_response(cleaned)
    return literature_review_table(
        cleaned,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )