# C:\Users\marco\agente_gemini\HASHIRU_6_1\tools\web.py
from __future__ import annotations

import html
import shlex
from typing import List, Optional

import requests
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS

try:
    import trafilatura  # content extraction / cleanup
except Exception:
    trafilatura = None  # fall back to BeautifulSoup text extraction


def _parse_search_args(args: str) -> dict:
    """
    /search term [--max N] [--news] [--safesearch off|moderate|strict] [--site domain]
    """
    tokens = shlex.split(args, posix=False)
    out = {
        "term": None,
        "max": 10,
        "news": False,
        "safesearch": "off",
        "site": None,
    }
    i = 0
    # term = first non-flag token (quoting the term works too)
    while i < len(tokens):
        t = tokens[i]
        if not t.startswith("--") and out["term"] is None:
            out["term"] = t
            i += 1
            break
        i += 1

    while i < len(tokens):
        t = tokens[i]
        if t == "--max" and i + 1 < len(tokens):
            try:
                out["max"] = int(tokens[i + 1])
            except ValueError:  # ignore non-numeric values instead of a bare except
                pass
            i += 2
        elif t == "--news":
            out["news"] = True
            i += 1
        elif t == "--safesearch" and i + 1 < len(tokens):
            level = tokens[i + 1].lower()
            if level in {"off", "moderate", "strict"}:
                out["safesearch"] = level
            i += 2
        elif t == "--site" and i + 1 < len(tokens):
            out["site"] = tokens[i + 1]
            i += 2
        else:
            i += 1
    return out
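
# Illustrative check of the parser above (a sketch; the expected dict follows
# from the defaults in _parse_search_args, and note that with posix=False
# shlex keeps the surrounding quotes on the term):
#
#   _parse_search_args('"rtx 4060" --max 5 --news --site github.com')
#   -> {"term": '"rtx 4060"', "max": 5, "news": True,
#       "safesearch": "off", "site": "github.com"}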


async def handle_search(args: str, block: str) -> str:
    """
    /search term [--max N] [--news] [--safesearch off|moderate|strict] [--site domain]

    Examples:
      /search rtx 4060 --max 5
      /search "python venv windows" --safesearch moderate
      /search chainlit --site github.com
      /search OpenAI --news --max 3
    """
    cfg = _parse_search_args(args)
    term = cfg["term"]
    if not term:
        return "Uso: /search <termo> [--max N] [--news] [--safesearch off|moderate|strict] [--site dominio]"

    # apply the site filter through the query string itself
    if cfg["site"]:
        term = f"site:{cfg['site']} {term}"

    results: List[str] = []
    try:
        with DDGS() as ddgs:
            if cfg["news"]:
                src = ddgs.news(term, max_results=cfg["max"], safesearch=cfg["safesearch"])
            else:
                src = ddgs.text(term, max_results=cfg["max"], safesearch=cfg["safesearch"])

            for r in src or []:
                title = (r.get("title") or "").strip()
                href = r.get("href") or r.get("url") or ""
                body = (r.get("body") or r.get("snippet") or "").strip()
                if title:
                    title = title[:120]
                if body:
                    body = body[:200]
                item = []
                if title:
                    item.append(f"**{title}**")
                if body:
                    item.append(body)
                if href:
                    item.append(href)
                if item:
                    results.append("\n".join(item))

    except Exception as e:
        return f"💥 Erro na busca: {e}"

    if not results:
        return f"❌ Sem resultados para `{term}`."
    return f"🔎 Resultados para `{html.escape(term)}`:\n\n" + "\n\n".join(results)


def _parse_scrape_args(args: str) -> dict:
    """
    /scrape URL [--max N] [--headers] [--links] [--raw] [--timeout S]
    """
    tokens = shlex.split(args, posix=False)
    out = {"url": None, "max": 2000, "headers": False, "links": False, "raw": False, "timeout": 30}
    i = 0
    while i < len(tokens):
        t = tokens[i]
        if not t.startswith("--") and out["url"] is None:
            out["url"] = t
            i += 1
            break
        i += 1
    while i < len(tokens):
        t = tokens[i]
        if t == "--max" and i + 1 < len(tokens):
            try:
                out["max"] = int(tokens[i + 1])
            except ValueError:
                pass
            i += 2
        elif t == "--headers":
            out["headers"] = True
            i += 1
        elif t == "--links":
            out["links"] = True
            i += 1
        elif t == "--raw":
            out["raw"] = True
            i += 1
        elif t == "--timeout" and i + 1 < len(tokens):
            try:
                out["timeout"] = int(tokens[i + 1])
            except ValueError:
                pass
            i += 2
        else:
            i += 1
    return out


def _clean_with_trafilatura(html_text: str, url: Optional[str]) -> Optional[str]:
    if trafilatura is None:
        return None
    try:
        return trafilatura.extract(
            html_text,
            include_links=False,
            include_comments=False,
            include_tables=False,
            url=url,
            favor_precision=True,
            include_formatting=False,
        )
    except Exception:
        return None


async def handle_scrape(args: str, block: str) -> str:
    """
    /scrape URL [--max N] [--headers] [--links] [--raw] [--timeout S]

    Examples:
      /scrape https://example.com
      /scrape https://news.ycombinator.com/ --max 1200 --links
      /scrape https://httpbin.org/html --headers --raw
    """
    cfg = _parse_scrape_args(args)
    url = cfg["url"]
    if not url:
        return "Uso: /scrape <URL> [--max N] [--headers] [--links] [--raw] [--timeout S]"

    try:
        # Some servers reject requests' default User-Agent; the generic
        # browser-style string below is an assumption, adjust as needed.
        headers = {"User-Agent": "Mozilla/5.0 (compatible; HASHIRU/6.1)"}
        r = requests.get(url, headers=headers, timeout=cfg["timeout"])
        r.raise_for_status()
    except requests.exceptions.Timeout:
        return f"⏳ Timeout após {cfg['timeout']}s em `{url}`"
    except requests.exceptions.HTTPError as e:
        code = getattr(e.response, "status_code", "N/A")
        return f"❌ HTTP {code}: {e}"
    except Exception as e:
        return f"💥 Erro requisitando `{url}`: {e}"

    # Grab the title with BeautifulSoup
    soup = BeautifulSoup(r.text, "html.parser")
    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else "(no title)"

    # Content
    if cfg["raw"]:
        text = soup.get_text(" ", strip=True)
    else:
        extracted = _clean_with_trafilatura(r.text, url)
        text = extracted if (extracted and extracted.strip()) else soup.get_text(" ", strip=True)

    preview = text[: cfg["max"]]
    extra = "" if len(text) <= cfg["max"] else "\n...(truncated)"

    lines: List[str] = [f"🕷️ `{title}`", f"HTTP {r.status_code} | {len(r.content)} bytes (HTML)"]
    if cfg["headers"]:
        lines.append("**Headers de resposta:**")
        for k, v in r.headers.items():
            lines.append(f"- {k}: {v}")
    if cfg["links"]:
        links = []
        for a in soup.find_all("a"):
            href = a.get("href")
            if href and href.strip():
                links.append(href.strip())
            if len(links) >= 20:
                break
        if links:
            lines.append("**Links (até 20):**")
            for link in links:
                lines.append(f"- {link}")

    lines.append("**Texto (preview):**")
    lines.append(f"```\n{preview}\n```{extra}")
    return "\n".join(lines)