"""
web.py — Real-time web search utilities.

Search cascade:
1. Tavily (fast, structured results, needs API key)
2. DuckDuckGo (free, no key needed) — fallback when Tavily fails or returns nothing
"""
| from __future__ import annotations |
|
|
| import asyncio |
| import logging |
| import re |
| from typing import Any |
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
| |
|
|
| |
| _REALTIME_WORDS = re.compile( |
| r"""(?ix) |
| \b( |
| latest | newest | recent | currently | right now | as of today | |
| today | tonight | this morning | this evening | this week | this month | this year | |
| yesterday | last night | last week | last month | |
| breaking | live | ongoing | just (happened|announced|released|launched|dropped) | |
| in \s+ the \s+ news | trending | viral | |
| current | now | update | updates | |
| score | result | results | winner | winners | standings | |
| stock | crypto | bitcoin | ethereum | price | prices | rate | rates | exchange \s+ rate | |
| weather | forecast | temperature | rain | storm | hurricane | earthquake | |
| election | vote | votes | poll | polls | |
| match | game | tournament | championship | cup | league | ipl | nba | nfl | premier \s+ league | |
| war | conflict | attack | protest | summit | deal | treaty | |
| launch | released | announced | unveiled | dropped | update | |
| who \s+ won | who \s+ is \s+ winning | what \s+ happened | what \s+ is \s+ happening | |
| how \s+ much \s+ is | what \s+ is \s+ the \s+ (current|latest|price|value) | |
| did \s+ \w+ \s+ (win|lose|beat|happen|say|announce|release) | |
| when \s+ did | when \s+ is \s+ (the \s+)?(next|upcoming) | |
| 2024 | 2025 | 2026 |
| )\b |
| """, |
| ) |
|
|
| |
| _REALTIME_PHRASES = [ |
| "current president", |
| "prime minister", |
| "today's", |
| "this week's", |
| "live score", |
| "box office", |
| "number one", |
| "top charts", |
| "inflation rate", |
| "gdp growth", |
| "unemployment rate", |
| "oil price", |
| "gold price", |
| "market cap", |
| "nifty 50", |
| "sensex", |
| "dow jones", |
| "s&p 500", |
| "nasdaq", |
| ] |
|
|
| |
| _SKIP_RE = re.compile( |
| r"""(?ix) |
| \b( |
| what \s+ is \s+ (machine \s+ learning|artificial \s+ intelligence|python|java| |
| gravity|photosynthesis|democracy|capitalism|blockchain| |
| deep \s+ learning|neural \s+ network) | |
| how \s+ does \s+ \w+ \s+ work | |
| explain \s+ (me \s+)? (the \s+)? \w+ | |
| write \s+ (a|an|me) \s+ (poem|story|essay|function|code|program|script|letter) | |
| help \s+ me \s+ (with|understand|write|debug|fix|code) | |
| what \s+ are \s+ the \s+ (best \s+ practices|benefits|advantages|disadvantages) | |
| history \s+ of | |
| definition \s+ of | |
| difference \s+ between |
| )\b |
| """, |
| ) |
|
|
|
|
def should_web_search(user_message: str) -> bool:
    """
    Report whether a user message looks like it needs a real-time web search.

    Checks run in this order and the first one that applies decides:

    1. Empty or whitespace-only message (or ``None``) -> False.
    2. Message matches ``_SKIP_RE`` -> False.
    3. Message matches ``_REALTIME_WORDS`` -> True.
    4. Lower-cased message contains any entry of ``_REALTIME_PHRASES`` -> True.
    5. Message starts with "who is/are/was/were ..." -> True.
    6. Message starts with "what's / what is / what are [the] current|latest|today" -> True.

    Anything else -> False.
    """
    text = (user_message or "").strip()

    # Nothing to search for, or an explicitly excluded kind of request.
    if not text or _SKIP_RE.search(text):
        return False

    # Keyword regex first, then plain substring phrases.
    if _REALTIME_WORDS.search(text):
        return True

    lowered = text.lower()
    for phrase in _REALTIME_PHRASES:
        if phrase in lowered:
            return True

    # Leading-question heuristics, anchored at the start of the message.
    return bool(
        re.match(r"(?i)^who\s+(is|are|was|were)\s+", text)
        or re.match(r"(?i)^what('?s|\s+is|\s+are)\s+(the\s+)?(current|latest|today)", text)
    )
|
|
|
|
| |
|
|
async def _tavily_search(api_key: str, query: str, max_results: int) -> list[dict[str, Any]]:
    """
    Run a Tavily search in a worker thread and return its result entries.

    The ``tavily`` package is imported lazily inside the worker, so it is only
    needed when this function is actually called.  Returns whatever is stored
    under the ``"results"`` key of the client's response, or an empty list
    when that key is missing or falsy.  Exceptions from the import or the
    client propagate to the caller.
    """

    def _blocking_call() -> dict[str, Any]:
        from tavily import TavilyClient

        client = TavilyClient(api_key=api_key)
        return client.search(query=query, max_results=max_results)

    # The client call is synchronous; keep it off the event loop.
    payload = await asyncio.to_thread(_blocking_call)
    hits = payload.get("results")
    return hits if hits else []
|
|
|
|
| |
|
|
async def _ddg_search(query: str, max_results: int) -> list[dict[str, Any]]:
    """
    Run a DuckDuckGo text search in a worker thread and return the hits as a list.

    The ``duckduckgo_search`` package is imported lazily inside the worker.
    Exceptions from the import or the search propagate to the caller.
    """

    def _blocking_call() -> list[dict[str, Any]]:
        from duckduckgo_search import DDGS

        with DDGS() as session:
            # Materialise the hits while the session is still open.
            hits = list(session.text(query, max_results=max_results))
            return hits

    # The search call is synchronous; keep it off the event loop.
    return await asyncio.to_thread(_blocking_call)
|
|
|
|
| |
|
|
| def _format_results(results: list[dict[str, Any]], source: str) -> str: |
| if not results: |
| return "" |
| lines: list[str] = [f"π Web search results ({source}):"] |
| for i, r in enumerate(results, start=1): |
| title = (r.get("title") or "").strip() |
| url = (r.get("url") or r.get("href") or "").strip() |
| snippet = (r.get("content") or r.get("body") or r.get("snippet") or "").strip() |
| if not snippet: |
| snippet = "No snippet available." |
| lines.append(f"\n{i}. **{title}**\n {snippet}\n Source: {url}") |
| return "\n".join(lines) |
|
|
|
|
| |
|
|
async def fetch_web_context(
    query: str,
    *,
    tavily_api_key: str | None = None,
    max_results: int = 5,
) -> str | None:
    """
    Fetch web search context for ``query``, preferring Tavily over DuckDuckGo.

    Tavily is attempted only when ``tavily_api_key`` is truthy.  If it raises
    or returns no results, DuckDuckGo is tried next.  Any exception raised
    while searching or formatting is logged as a warning and treated as a
    failure of that provider rather than propagated.

    Returns the formatted result block from the first provider that yields
    results, or ``None`` when neither does.
    """
    # Stage 1: Tavily, only when a key was supplied.
    if tavily_api_key:
        try:
            tavily_hits = await _tavily_search(tavily_api_key, query, max_results)
            if not tavily_hits:
                logger.info("web_search tavily returned 0 results for %r, trying DuckDuckGo", query)
            else:
                logger.info("web_search source=tavily query=%r results=%d", query, len(tavily_hits))
                return _format_results(tavily_hits, "Tavily")
        except Exception as err:
            logger.warning("web_search tavily_failed=%r, falling back to DuckDuckGo", err)

    # Stage 2: DuckDuckGo fallback (also the only stage when no key is given).
    try:
        ddg_hits = await _ddg_search(query, max_results)
        if not ddg_hits:
            return None
        logger.info("web_search source=duckduckgo query=%r results=%d", query, len(ddg_hits))
        return _format_results(ddg_hits, "DuckDuckGo")
    except Exception as err:
        logger.warning("web_search duckduckgo_failed=%r", err)

    return None
|
|
|
|
| |
async def tavily_search_to_context(
    api_key: str,
    query: str,
    *,
    max_results: int = 5,
) -> str:
    """
    Deprecated: use fetch_web_context instead.

    Thin wrapper that forwards to ``fetch_web_context`` with ``api_key`` as the
    Tavily key and always returns a string: the fetched context, or the
    message "No relevant web results found." when no context came back.
    """
    context = await fetch_web_context(
        query,
        tavily_api_key=api_key,
        max_results=max_results,
    )
    if context:
        return context
    return "No relevant web results found."
|
|