# Source: Hugging Face Space "Nymbo/Tools" (status: Running)
# File size: 21,499 Bytes
# Revision: 588592f

from __future__ import annotations

from typing import Annotated, List, Literal
from datetime import datetime

import gradio as gr
from ddgs import DDGS

from app import _log_call_end, _log_call_start, _search_rate_limiter, _truncate_for_log
from ._docstrings import autodoc
from ._searxng_client import SearXNGClient, TimeRange
from ._query_optimizer import get_optimizer


# Single source of truth for the LLM-facing tool description. It is handed to
# the @autodoc decorator on Web_Search and to gr.Interface(api_description=...).
TOOL_SUMMARY = (
    "Run a web search across text, news, images, videos, or books. "
    "Supports multiple backends (DuckDuckGo, SearXNG) with optional AI query optimization. "
    "Readable results include pagination hints and next_offset when more results are available."
)


# Safe-search level forwarded as `safesearch=` to the DDGS text/news/images/videos calls.
_SAFESEARCH_LEVEL = "off"

# Backend values offered by the "Backend" radio in build_interface().
# Note: "annasarchive" (the only books backend) is not listed here; books
# searches always resolve to it regardless of the selected backend.
BACKEND_CHOICES = [
    "auto",
    "duckduckgo",
    "searxng",
    "bing",
    "brave",
    "yahoo",
    "wikipedia",
]

# Backends that may be explicitly selected for each search type.
# _resolve_backend() falls back to the FIRST entry of the list when the
# requested backend is not allowed for the type, so order matters.
_ALLOWED_BACKENDS = {
    "text": ["duckduckgo", "searxng", "bing", "brave", "yahoo", "wikipedia"],
    "news": ["duckduckgo", "searxng", "bing", "yahoo"],
    "images": ["duckduckgo", "searxng"],
    "videos": ["duckduckgo"],
    "books": ["annasarchive"],
}

# Fallback order per search type, joined into a comma-separated string by
# _resolve_backend() when the backend choice is "auto". Wikipedia is
# deliberately left out of the text order.
_AUTO_ORDER = {
    "text": ["searxng", "duckduckgo", "bing", "brave", "yahoo"],
    "news": ["searxng", "duckduckgo", "bing", "yahoo"],
    "images": ["searxng", "duckduckgo"],
    "videos": ["duckduckgo"],
    "books": ["annasarchive"],
}

# Date filter values offered in the UI; _resolve_timelimit() maps them to
# DDGS timelimit codes ("any" means no time restriction).
DATE_FILTER_CHOICES = ["any", "day", "week", "month", "year"]


def _resolve_backend(search_type: str, backend_choice: str) -> str:
    """Pick the backend string to use for a search type and user selection.

    Rules, in order:
      * An unknown ``search_type`` is treated as ``"text"``.
      * ``"auto"`` yields the type's fallback order as a ``", "``-joined string.
      * Books always use ``"annasarchive"``.
      * A choice allowed for the type is returned unchanged; anything else
        falls back to the first allowed backend for that type.
    """
    kind = search_type if search_type in _ALLOWED_BACKENDS else "text"

    if backend_choice == "auto":
        return ", ".join(_AUTO_ORDER[kind])
    if kind == "books":
        return "annasarchive"

    candidates = _ALLOWED_BACKENDS[kind]
    # Unsupported selections degrade to the type's primary backend.
    return backend_choice if backend_choice in candidates else candidates[0]


def _resolve_timelimit(date_filter: str, search_type: str) -> str | None:
    """Map UI date filter to DDGS timelimit code per endpoint.



    Returns one of: None, 'd', 'w', 'm', 'y'. For news/videos (which support d/w/m),

    selecting 'year' will coerce to 'm' to stay within supported range.

    """
    normalized = (date_filter or "any").strip().lower()
    if normalized in ("any", "none", ""):
        return None
    mapping = {
        "day": "d",
        "week": "w",
        "month": "m",
        "year": "y",
    }
    code = mapping.get(normalized)
    if not code:
        return None
    if search_type in ("news", "videos") and code == "y":
        return "m"
    return code


def _extract_date_from_snippet(snippet: str) -> str:
    if not snippet:
        return ""
    import re

    date_patterns = [
        r"\b(\d{4}[-/]\d{1,2}[-/]\d{1,2})\b",
        r"\b([A-Za-z]{3,9}\s+\d{1,2},?\s+\d{4})\b",
        r"\b(\d{1,2}\s+[A-Za-z]{3,9}\s+\d{4})\b",
        r"\b(\d+\s+(?:day|week|month|year)s?\s+ago)\b",
        r"(?:Published|Updated|Posted):\s*([^,\n]+?)(?:[,\n]|$)",
    ]
    for pattern in date_patterns:
        matches = re.findall(pattern, snippet, re.IGNORECASE)
        if matches:
            return matches[0].strip()
    return ""


def _format_search_result(result: dict, search_type: str, index: int) -> List[str]:
    lines: List[str] = []
    if search_type == "text":
        title = result.get("title", "").strip()
        url = result.get("href", "").strip()
        snippet = result.get("body", "").strip()
        date = _extract_date_from_snippet(snippet)
        lines.append(f"{index}. {title}")
        lines.append(f"   URL: {url}")
        if snippet:
            lines.append(f"   Summary: {snippet}")
        if date:
            lines.append(f"   Date: {date}")
    elif search_type == "news":
        title = result.get("title", "").strip()
        url = result.get("url", "").strip()
        body = result.get("body", "").strip()
        date = result.get("date", "").strip()
        source = result.get("source", "").strip()
        lines.append(f"{index}. {title}")
        lines.append(f"   URL: {url}")
        if source:
            lines.append(f"   Source: {source}")
        if date:
            lines.append(f"   Date: {date}")
        if body:
            lines.append(f"   Summary: {body}")
    elif search_type == "images":
        title = result.get("title", "").strip()
        image_url = result.get("image", "").strip()
        source_url = result.get("url", "").strip()
        source = result.get("source", "").strip()
        width = result.get("width", "")
        height = result.get("height", "")
        lines.append(f"{index}. {title}")
        lines.append(f"   Image: {image_url}")
        lines.append(f"   Source: {source_url}")
        if source:
            lines.append(f"   Publisher: {source}")
        if width and height:
            lines.append(f"   Dimensions: {width}x{height}")
    elif search_type == "videos":
        title = result.get("title", "").strip()
        description = result.get("description", "").strip()
        duration = result.get("duration", "").strip()
        published = result.get("published", "").strip()
        uploader = result.get("uploader", "").strip()
        embed_url = result.get("embed_url", "").strip()
        lines.append(f"{index}. {title}")
        if embed_url:
            lines.append(f"   Video: {embed_url}")
        if uploader:
            lines.append(f"   Uploader: {uploader}")
        if duration:
            lines.append(f"   Duration: {duration}")
        if published:
            lines.append(f"   Published: {published}")
        if description:
            lines.append(f"   Description: {description}")
    elif search_type == "books":
        title = result.get("title", "").strip()
        url = result.get("url", "").strip()
        body = result.get("body", "").strip()
        lines.append(f"{index}. {title}")
        lines.append(f"   URL: {url}")
        if body:
            lines.append(f"   Description: {body}")
    return lines


# Web_Search: run a paginated search and return a plain-text report.
#
# Flow: validate input -> optionally optimize the query -> pick a backend
# (SearXNG and/or DDGS) -> fetch enough results to cover the requested window
# -> slice the window -> format it together with pagination hints.
#
# NOTE: the function intentionally has no docstring of its own; the
# LLM-facing description is supplied through @autodoc(summary=TOOL_SUMMARY)
# and the Annotated parameter descriptions below.
@autodoc(
    summary=TOOL_SUMMARY,
)
def Web_Search(
    query: Annotated[str, "The search query (supports operators like site:, quotes, OR)."],
    max_results: Annotated[int, "Number of results to return (1–20)."] = 5,
    page: Annotated[int, "Page number for pagination (1-based, each page contains max_results items)."] = 1,
    offset: Annotated[int, "Result offset to start from (overrides page if > 0, for precise continuation)."] = 0,
    search_type: Annotated[str, "Type of search: 'text' (web pages), 'news', 'images', 'videos', or 'books'."] = "text",
    backend: Annotated[str, "Search backend: 'duckduckgo', 'searxng', or 'auto' (SearXNG first, then DDG)."] = "auto",
    date_filter: Annotated[str, "Time filter: any, day, week, month, year."] = "any",
    optimize_query: Annotated[bool, "Use AI to optimize the query for better results (adds ~2s latency)."] = False,
) -> str:
    _log_call_start(
        "Web_Search",
        query=query,
        max_results=max_results,
        page=page,
        search_type=search_type,
        offset=offset,
        backend=backend,
        date_filter=date_filter,
        optimize_query=optimize_query,
    )

    # Reject blank input before anything else so the (slow, optional)
    # optimizer is never invoked on an empty query.
    if not query or not query.strip():
        result = "No search query provided. Please enter a search term."
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result

    # Query optimization (optional, best effort): any failure or a blank
    # rewrite leaves the caller's original query in place.
    if optimize_query:
        try:
            optimizer = get_optimizer()
            optimized_query, _metadata = optimizer.optimize_for_search_engine(query)
        except Exception as exc:
            print(f"[Web_Search] Query optimization failed: {exc}", flush=True)
        else:
            if isinstance(optimized_query, str) and optimized_query.strip():
                query = optimized_query

    # Clamp numeric inputs and normalise the search type.
    max_results = max(1, min(20, max_results))
    page = max(1, page)
    offset = max(0, offset)
    if search_type not in ("text", "news", "images", "videos", "books"):
        search_type = "text"

    # An explicit offset wins over the page number.
    if offset > 0:
        actual_offset = offset
        calculated_page = (offset // max_results) + 1
    else:
        actual_offset = (page - 1) * max_results
        calculated_page = page
    # Backends have no offset parameter here, so fetch everything up to the
    # end of the requested window and slice locally.
    total_needed = actual_offset + max_results

    used_fallback = False
    original_search_type = search_type
    user_backend_choice = (backend or "auto").strip().lower()

    # Map date_filter to SearXNG TimeRange (same normalisation as _resolve_timelimit).
    normalized_filter = (date_filter or "").strip().lower()
    searxng_time_range = {
        "day": TimeRange.DAY,
        "week": TimeRange.WEEK,
        "month": TimeRange.MONTH,
        "year": TimeRange.YEAR,
    }.get(normalized_filter)

    # Search types the SearXNG adapter below can actually serve.
    searxng_types = ("text", "news", "images")

    def _perform_searxng_search(stype: str) -> list[dict]:
        """Query SearXNG and adapt its results to the DDGS-style dict shape.

        Best effort: any error is logged and reported as "no results" so the
        caller can fall back to another backend.
        """
        try:
            _search_rate_limiter.acquire()
            with SearXNGClient() as client:
                if stype == "text":
                    results = client.text(query, max_results=total_needed, time_range=searxng_time_range)
                    return [
                        {
                            "title": r.title,
                            "href": r.url,
                            "body": r.content,
                            "engine": r.engine,
                        }
                        for r in results
                    ]
                elif stype == "news":
                    results = client.news(query, max_results=total_needed, time_range=searxng_time_range)
                    return [
                        {
                            "title": r.title,
                            "url": r.url,
                            "body": r.content,
                            "date": r.published_date or "",
                            "source": r.engine or "",
                        }
                        for r in results
                    ]
                elif stype == "images":
                    results = client.images(query, max_results=total_needed)
                    return [
                        {
                            "title": r.title,
                            "image": r.img_src,
                            "url": r.url,
                            "source": r.source or r.engine or "",
                            "thumbnail": r.thumbnail_src,
                        }
                        for r in results
                    ]
                return []
        except Exception as exc:
            print(f"[Web_Search] SearXNG error: {exc}", flush=True)
            return []

    def _perform_ddgs_search(stype: str) -> list[dict]:
        """Query DDGS for ``stype``; raise ``Exception`` with a friendly message on failure."""
        # "auto" and "searxng" are not DDGS engines: once we get here SearXNG
        # has been tried (or cannot serve this type), so use the type's
        # DuckDuckGo-side default. Resolving per ``stype`` keeps the
        # news -> text fallback on a backend valid for text.
        ddgs_choice = "duckduckgo" if user_backend_choice in ("auto", "searxng") else user_backend_choice
        search_kwargs = {
            # Over-fetch slightly so "more results available" can be detected.
            "max_results": total_needed + 10,
            "backend": _resolve_backend(stype, ddgs_choice),
        }
        is_books = stype not in ("text", "news", "images", "videos")
        if not is_books:
            # The books endpoint is called without safesearch/timelimit.
            search_kwargs["safesearch"] = _SAFESEARCH_LEVEL
            search_kwargs["timelimit"] = _resolve_timelimit(date_filter, stype)

        try:
            _search_rate_limiter.acquire()
            with DDGS() as ddgs:
                # Look the method up lazily so only the needed endpoint is touched.
                search_fn = getattr(ddgs, "books" if is_books else stype)
                raw_gen = search_fn(query, **search_kwargs)
                try:
                    return list(raw_gen)
                except Exception as inner_exc:
                    lowered_inner = str(inner_exc).lower()
                    if "no results" in lowered_inner or "not found" in lowered_inner:
                        return []
                    raise
        except Exception as exc:
            error_msg = f"Search failed: {str(exc)[:200]}"
            lowered = str(exc).lower()
            if "blocked" in lowered or "rate" in lowered:
                error_msg = "Search temporarily blocked due to rate limiting. Please try again in a few minutes."
            elif "timeout" in lowered:
                error_msg = "Search timed out. Please try again with a simpler query."
            elif "network" in lowered or "connection" in lowered:
                error_msg = "Network connection error. Please check your internet connection and try again."
            elif "no results" in lowered or "not found" in lowered:
                return []
            raise Exception(error_msg) from exc

    def _perform_search(stype: str) -> list[dict]:
        """Dispatch to SearXNG and/or DDGS according to the user's backend choice."""
        searxng_capable = stype in searxng_types

        # Explicit SearXNG: honour it when it can serve this type; otherwise
        # fall through to the type's default DDGS backend instead of burning
        # a rate-limit slot on a call that can only return nothing.
        if user_backend_choice == "searxng" and searxng_capable:
            return _perform_searxng_search(stype)

        # Auto: SearXNG first, then DDG fallback.
        if user_backend_choice == "auto" and searxng_capable:
            searxng_results = _perform_searxng_search(stype)
            if searxng_results:
                return searxng_results
            print("[Web_Search] SearXNG returned no results, falling back to DuckDuckGo", flush=True)

        return _perform_ddgs_search(stype)

    try:
        raw = _perform_search(search_type)
    except Exception as exc:
        result = f"Error: {exc}"
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result

    # Smart fallback: an empty news search is retried as a general web search.
    tried_text_fallback = False
    if not raw and search_type == "news":
        tried_text_fallback = True
        try:
            raw = _perform_search("text")
        except Exception as exc:
            # Best effort only: report "no results" below rather than failing.
            print(f"[Web_Search] Text fallback for news search failed: {exc}", flush=True)
        if raw:
            used_fallback = True
            search_type = "text"

    if not raw:
        # Mention the fallback whenever it was attempted (it evidently found nothing).
        fallback_note = " (also tried 'text' search as fallback)" if tried_text_fallback else ""
        result = f"No {original_search_type} results found for query: {query}{fallback_note}"
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result

    paginated_results = raw[actual_offset: actual_offset + max_results]
    if not paginated_results:
        if actual_offset >= len(raw):
            result = f"Offset {actual_offset} exceeds available results ({len(raw)} total). Try offset=0 to start from beginning."
        else:
            result = f"No {original_search_type} results found on page {calculated_page} for query: {query}. Try page 1 or reduce page number."
        _log_call_end("Web_Search", _truncate_for_log(result))
        return result

    total_available = len(raw)
    start_num = actual_offset + 1
    end_num = actual_offset + len(paginated_results)
    next_offset = actual_offset + len(paginated_results)
    search_label = original_search_type.title()
    if used_fallback:
        search_label += " → Text (Smart Fallback)"

    # Timestamp the report in the server's local timezone.
    now_dt = datetime.now().astimezone()
    date_str = now_dt.strftime("%A, %B %d, %Y %I:%M %p %Z").strip()
    if not date_str:
        date_str = now_dt.isoformat()

    pagination_info = f"Page {calculated_page}"
    if offset > 0:
        pagination_info = f"Offset {actual_offset} (≈ {pagination_info})"
    lines = [f"Current Date: {date_str}", f"{search_label} search results for: {query}"]
    if used_fallback:
        lines.append("📍 Note: News search returned no results, automatically searched general web content instead")
    lines.append(f"{pagination_info} (results {start_num}-{end_num} of ~{total_available}+ available)\n")
    # ``search_type`` is "text" here if the fallback kicked in, so results are
    # formatted with the schema of the search that actually produced them.
    for position, item in enumerate(paginated_results, start_num):
        lines.extend(_format_search_result(item, search_type, position))
        lines.append("")
    if total_available > end_num:
        lines.append("💡 More results available:")
        lines.append(f"   • Next page: page={calculated_page + 1}")
        lines.append(f"   • Next offset: offset={next_offset}")
        lines.append(f"   • Use offset={next_offset} to continue exactly from result {next_offset + 1}")
    result = "\n".join(lines)
    search_info = f"type={original_search_type}"
    if used_fallback:
        search_info += "→text"
    _log_call_end("Web_Search", f"{search_info} page={calculated_page} offset={actual_offset} results={len(paginated_results)} chars={len(result)}")
    return result


def build_interface() -> gr.Interface:
    """Build the Gradio interface that exposes ``Web_Search``.

    The input components are listed in the same order as the parameters of
    ``Web_Search`` (query, max results, page, offset, search type, backend,
    date filter, optimize query); the single output is a read-only textbox.
    """
    query_box = gr.Textbox(label="Query", placeholder="topic OR site:example.com", max_lines=1, info="The search query")
    max_results_slider = gr.Slider(
        minimum=1, maximum=20, value=5, step=1, label="Max results", info="Number of results to return (1–20)"
    )
    page_slider = gr.Slider(
        minimum=1, maximum=10, value=1, step=1, label="Page", info="Page number for pagination (ignored if offset > 0)"
    )
    offset_slider = gr.Slider(
        minimum=0,
        maximum=1000,
        value=0,
        step=1,
        label="Offset",
        info="Result offset to start from (overrides page if > 0, use next_offset from previous search)",
    )
    search_type_radio = gr.Radio(
        label="Search Type",
        choices=["text", "news", "images", "videos", "books"],
        value="text",
        info="Type of content to search for",
    )
    backend_radio = gr.Radio(
        label="Backend",
        choices=BACKEND_CHOICES,
        value="auto",
        info="Search backend: auto (SearXNG → DDG), searxng, or duckduckgo",
    )
    date_filter_radio = gr.Radio(
        label="Date filter",
        choices=DATE_FILTER_CHOICES,
        value="any",
        info="Limit results to: day, week, month, or year",
    )
    optimize_checkbox = gr.Checkbox(
        label="Optimize Query",
        value=False,
        info="Use AI to optimize the query for better results (adds ~2s latency)",
    )
    results_box = gr.Textbox(label="Search Results", interactive=False, lines=20, max_lines=20)

    page_description = (
        "<div style=\"text-align:center\">Multi-backend web search (SearXNG + DuckDuckGo) with optional AI query optimization. "
        "Supports text, news, images, videos, and books. Auto backend tries SearXNG first, then DDG fallback.</div>"
    )

    return gr.Interface(
        fn=Web_Search,
        inputs=[
            query_box,
            max_results_slider,
            page_slider,
            offset_slider,
            search_type_radio,
            backend_radio,
            date_filter_radio,
            optimize_checkbox,
        ],
        outputs=results_box,
        title="Web Search",
        description=page_description,
        api_description=TOOL_SUMMARY,
        flagging_mode="never",
        submit_btn="Search",
    )


# Public API of this module: the search tool itself and its Gradio UI factory.
__all__ = ["Web_Search", "build_interface"]