File size: 13,401 Bytes
af59dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3efff14
af59dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# core.py
from __future__ import annotations

import os
import re
import math
import uuid
import itertools
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlsplit, urlunsplit

from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

try:
    # optional drop-in providing .text()
    from ddgs import DDGS  # type: ignore
except ImportError:
    # provides DDGS().text with region/safesearch/timelimit/max_results options
    from duckduckgo_search import DDGS  # type: ignore


# Initialize LLM (Gemini via LangChain integration)
# Note: GOOGLE_API_KEY must be set in the environment for this to work.
# Example: export GOOGLE_API_KEY="your-key"
# temperature=0 keeps query planning and table generation deterministic;
# timeout/max_retries guard against transient API failures.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    temperature=0,
    max_output_tokens=None,  # no explicit cap; use the model's default limit
    timeout=60,
    max_retries=3,
)


# DuckDuckGo site: filter restricting searches to academic venues.
# NOTE: arXiv lives at arxiv.org (arxiv.cc is an unrelated domain); the
# conference domains neurips.cc / icml.cc / iclr.cc are correct as-is.
ACADEMIC_SITES_FILTER = (
    "site:neurips.cc OR site:arxiv.org OR site:icml.cc OR site:iclr.cc OR "
    "site:aaai.org OR site:ijcai.org OR site:thecvf.com OR site:kdd.org OR "
    "site:sigcomm.org OR site:usenix.org OR site:ieeexplore.ieee.org"
)


def parse_year_from_text(text: str) -> Optional[int]:
    """Extract the first four-digit publication year (1900-2099) from *text*.

    Returns None when *text* is falsy or contains no year-like token.
    """
    # The previous pattern used a capturing group (19|20), so re.findall
    # returned only "19"/"20" and int() produced 19 or 20 instead of the
    # full year. (?:...) keeps the whole match.
    match = re.search(r"\b(?:19|20)\d{2}\b", text or "")
    return int(match.group(0)) if match else None


def _normalize_url(u: str) -> str:
    if not u:
        return ""
    try:
        parts = urlsplit(u.strip())
        # drop query/fragment to normalize
        return urlunsplit(
            (parts.scheme.lower(), parts.netloc.lower(), parts.path.rstrip("/"), "", "")
        )
    except Exception:
        return u.strip().rstrip("/").lower()


def _safe_ddgs_text_call(
    ddgs: DDGS,
    query: str,
    region: str,
    safesearch: str,
    timelimit: Optional[str],
    max_results: Optional[int],
    backend: Optional[str] = None,
    retries: int = 2,
) -> List[Dict[str, Any]]:
    """
    Call DDGS().text with graceful handling of different library signatures and backend fallbacks.

    Each candidate backend is attempted up to ``retries`` times; the first
    non-empty result list is returned immediately. All exceptions are caught,
    so this helper never raises — total failure yields [].

    Args:
        ddgs: An open DDGS client instance.
        query: Query string (may embed site:/OR operators).
        region: DDG region code, e.g. "wt-wt".
        safesearch: Safesearch level string (e.g. "moderate").
        timelimit: Optional recency filter passed through to the library, or None.
        max_results: Cap on returned rows; None leaves it to the library.
        backend: Preferred backend tried first, if given.
        retries: Attempts per backend (floored at 1).

    Returns:
        List of result dicts from the first successful call, else [].
    """
    # Preferred backend order: lite -> html -> api -> auto (some versions)
    candidate_backends = []
    if backend:
        candidate_backends.append(backend)
    candidate_backends.extend(
        [b for b in ["lite", "html", "api", "auto"] if b != backend]
    )

    for b in candidate_backends:
        for _ in range(max(1, retries)):
            try:
                # Newer versions: returns list; older: generator
                res = ddgs.text(
                    query,
                    region=region,
                    safesearch=safesearch,
                    timelimit=timelimit,
                    backend=b,
                    max_results=max_results,
                )
                if res is None:
                    results = []
                elif isinstance(res, list):
                    results = res
                else:
                    # generator fallback
                    results = list(res)
            except TypeError:
                # Older signature without backend/max_results
                try:
                    res = ddgs.text(
                        query,
                        region=region,
                        safesearch=safesearch,
                        timelimit=timelimit,
                    )
                    results = list(res) if res is not None else []
                    # Emulate max_results client-side for the old signature.
                    if max_results:
                        results = results[:max_results]
                except Exception:
                    results = []
            except Exception:
                # Deliberate best-effort: swallow and advance to the next
                # retry/backend rather than propagate search-layer errors.
                results = []

            if results:
                return results
    return []


def _build_query_prompt() -> ChatPromptTemplate:
    """Build the planner prompt that asks for 2–3 short academic search queries."""
    template = """
Act as a query planner for academic literature search.
Given a topic, produce 2–3 distinct, short keyword-based queries optimized for academic sources.
Requirements:
- Be concise (each query < 12 words).
- Avoid punctuation except site: filters or boolean OR if needed.
- Prefer neutral, general keywords and important synonyms.
- Return ONLY the queries, one per line, no numbering or extra text.

Topic:
{topic}
""".strip()
    return ChatPromptTemplate.from_template(template)


def generate_search_queries(topic: str, k: int = 3) -> List[str]:
    """
    Ask the LLM for concise web-search queries about *topic*.

    Guarantees at least two queries via heuristic fallbacks when the LLM
    returns too few (or fails), dedupes case-insensitively while preserving
    order, and returns at most max(2, k) queries.
    """
    base_topic = (topic or "").strip()
    try:
        raw = (llm.invoke(_build_query_prompt().format_messages(topic=base_topic)).content or "").strip()
    except Exception:
        raw = ""

    limit = max(2, k)
    seen_lower: set = set()
    queries: List[str] = []

    # Each non-empty output line is one candidate query.
    for line in raw.splitlines():
        candidate = line.strip()
        if candidate and candidate.lower() not in seen_lower:
            queries.append(candidate)
            seen_lower.add(candidate.lower())

    # Heuristic expansions when the LLM gave fewer than two usable queries.
    if len(queries) < 2:
        for fallback in (
            base_topic,
            f"{base_topic} method comparison",
            f"{base_topic} benchmarks",
            f"{base_topic} survey review",
        ):
            if fallback and fallback.lower() not in seen_lower:
                queries.append(fallback)
                seen_lower.add(fallback.lower())
            if len(queries) >= limit:
                break

    return queries[:limit]


def fetch_literature_results_multi(
    topic: str,
    region: str = "wt-wt",  # wt-wt tends to be the most robust region
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """
    Run several LLM-generated DuckDuckGo searches for *topic*, restricted to
    academic sites, and return up to *max_results* URL-deduplicated rows.

    Best-effort: any search-layer failure yields [] rather than raising.
    """
    queries = generate_search_queries(topic, k=3)
    per_query = max(3, math.ceil(max_results / max(1, len(queries))))

    raw_rows: List[Dict[str, Any]] = []
    try:
        with DDGS() as ddgs:
            for query in queries:
                hits = _safe_ddgs_text_call(
                    ddgs,
                    f"{query} {ACADEMIC_SITES_FILTER}",
                    region=region,
                    safesearch=safesearch,
                    timelimit=timelimit,
                    max_results=per_query,
                    backend=backend,
                    retries=2,
                )
                raw_rows.extend(
                    {
                        "title": hit.get("title", "") or "",
                        "body": hit.get("body", "") or "",
                        "link": hit.get("href", "") or "",
                        "source": hit.get("source", "web") or "web",
                        "query_used": query,
                    }
                    for hit in hits or []
                )
    except Exception:
        # Mirror the original contract: any failure returns an empty list.
        return []

    # Deduplicate by normalized URL, keeping first occurrence.
    unique_rows: List[Dict[str, Any]] = []
    seen_urls: set = set()
    for row in raw_rows:
        key = _normalize_url(row.get("link", ""))
        if key and key not in seen_urls:
            seen_urls.add(key)
            unique_rows.append(row)

    return unique_rows[:max_results]


def _build_table_prompt() -> ChatPromptTemplate:
    """Build the prompt that turns fetched papers into a chronologically
    sorted (latest → oldest) Markdown literature-review table."""
    template = """
You are a meticulous academic research analyst specializing in synthesizing scholarly publications.
You will examine the provided list of paper titles and abstracts in detail.

Your objective is to produce a high-quality, chronologically sorted (latest → oldest) literature review table in Markdown format.

For each paper, you must:
- Accurately determine the Year (from metadata, title, or context; estimate if unclear).
- Identify and list the Title in full.
- Extract or infer Authors from the text; if not stated, write 'N/A'.
- Summarize Key Contribution / Findings in 1–2 precise, academically phrased sentences.
- Record Citation Count if mentioned; if not, write 'N/A'.
- Provide the Source Link if present; if absent, write 'N/A'.

Additional requirements:
- If publication venue (journal/conference) is mentioned, briefly note it in parentheses after the year.
- Use neutral, scholarly tone and avoid unnecessary adjectives.
- Ensure all summaries focus on the core novel contribution, methodology highlights, and notable results.
- Maintain uniform formatting for all rows and ensure alignment of columns in Markdown.
- Double-check chronological order: newest year first, oldest last.

Topic: {topic}

Papers:
{compiled_text}

Now output ONLY the Markdown table. Do not include commentary before or after the table.
""".strip()
    return ChatPromptTemplate.from_template(template)


def _build_chat_prompt() -> ChatPromptTemplate:
    """Build the plain conversational prompt (no table formatting, no web)."""
    template = """
You are a helpful academic research assistant with expertise in computer science, machine learning, and related fields.
Provide clear, accurate, and informative responses to academic questions. Use a friendly but professional tone.

Guidelines:
- Be concise but thorough
- Explain concepts clearly
- Use examples when helpful
- Break down complex topics
- Cite established facts when appropriate
- Respond in natural conversational style (NOT in table format)

User Message:
{message}

Your Response:
""".strip()
    return ChatPromptTemplate.from_template(template)


def literature_review_table(
    topic: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
) -> str:
    """
    Generate a literature review as a Markdown TABLE using multi-query web results.

    Args:
        topic: Research topic to survey.
        region: DDG region code forwarded to the search layer.
        max_results: Maximum deduplicated articles to feed the LLM.
        safesearch: DDG safesearch level.
        timelimit: Optional DDG recency filter.
        backend: Optional DDG backend preference.

    Returns:
        A Markdown table string. When no sources are found or the LLM call
        fails, a small two-column Info/Error table is returned instead, so
        the output is always renderable Markdown.
    """
    articles = fetch_literature_results_multi(
        topic=topic,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )

    if not articles:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            "| Info | No academic sources found for this topic; try refining the query or checking the connection. |\n"
        )

    # Compile search results into one context block for the LLM.
    # str.join avoids the quadratic cost of repeated string concatenation.
    compiled_text = "".join(
        f"Title: {art.get('title', '')}\n"
        f"Abstract: {art.get('body', '')}\n"
        f"Source: {art.get('source', '')}\n"
        f"Link: {art.get('link', '')}\n\n"
        for art in articles
    )

    prompt = _build_table_prompt()
    msgs = prompt.format_messages(topic=topic, compiled_text=compiled_text)

    try:
        response = llm.invoke(msgs).content
    except Exception as e:
        return (
            "| Intent | Reply |\n"
            "|--------|-------|\n"
            f"| Error | Error generating literature table: {str(e)} |\n"
        )

    # Sanity: ensure the LLM output looks like a Markdown table.
    if not isinstance(response, str) or "|" not in response:
        # Minimal fallback: construct a bare table from the top hits.
        header = "| Year | Title | Authors | Key Contribution / Findings | Citations | Source |\n"
        sep = "|------|-------|---------|-----------------------------|-----------|--------|\n"
        rows = []
        for art in articles[:10]:  # slicing already clamps at len(articles)
            title = art.get("title") or "Untitled"
            year = parse_year_from_text(art.get("body", "")) or "N/A"
            link = art.get("link") or ""
            rows.append(f"| {year} | {title} | N/A | N/A | N/A | {link} |\n")
        response = header + sep + "".join(rows)

    return response


def chat_response(message: str) -> str:
    """Answer *message* conversationally via the LLM (no table, no web)."""
    messages = _build_chat_prompt().format_messages(message=message)

    try:
        reply = llm.invoke(messages).content
    except Exception as e:
        return f"I apologize, but an error occurred: {str(e)}\nPlease try again or rephrase the question."

    # Guard against non-string payloads from the LLM client.
    if isinstance(reply, str):
        return reply
    return (
        "I apologize, but I couldn't generate a proper response. Please try again."
    )


def answer_as_table(
    message: str,
    region: str = "us-en",
    max_results: int = 20,
    safesearch: str = "moderate",
    timelimit: Optional[str] = None,
    backend: Optional[str] = None,
    force_web: bool = False,
) -> str:
    """
    Route *message* to the appropriate response mode.

    - force_web=True:  web-backed Markdown TABLE via literature_review_table.
    - force_web=False: plain conversational reply (no web access).
    Empty or whitespace-only messages return "".
    """
    text = (message or "").strip()
    if not text:
        return ""

    if not force_web:
        # Plain chat (no web)
        return chat_response(text)

    return literature_review_table(
        text,
        region=region,
        max_results=max_results,
        safesearch=safesearch,
        timelimit=timelimit,
        backend=backend,
    )