Spaces:

akra35567
/

akira

Sleeping

App Files Files Community

akira / modules /web_search.py

akra35567

Update modules/web_search.py

509bdcc verified 2 months ago

raw

history blame contribute delete

15.3 kB

	# modules/web_search.py — AKIRA V19 (Dezembro 2025)
	"""
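	Web search module for model APIs that have no native web access:
	- Angola news headlines (web scraping of Angolan news sites)
	- General search (DuckDuckGo Instant Answer API, with a Wikipedia summary as a second source)
	- Weather lookup (wttr.in)
	- In-memory cache with a 15-minute TTL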
	"""
	import re
	import time
	from typing import List, Dict, Any, Optional
	from urllib.parse import quote, urljoin

	import requests
	from bs4 import BeautifulSoup
	from loguru import logger

	# === CONFIGURATION ===
	CACHE_TTL = 900  # default cache lifetime in seconds (15 minutes)


	class SimpleCache:
	    """Minimal in-memory key/value cache whose entries expire after a TTL.

	    Entries are stored together with the moment they were written. Expired
	    entries are dropped lazily, i.e. only when they are looked up again.
	    """

	    def __init__(self, ttl: int = CACHE_TTL):
	        """Create an empty cache.

	        Args:
	            ttl: Lifetime of an entry in seconds.
	        """
	        self.ttl = ttl
	        # Maps key -> (value, monotonic timestamp taken when it was stored).
	        self._data: Dict[str, Any] = {}

	    def get(self, key: str) -> Any:
	        """Return the cached value for *key*, or ``None`` if absent or expired.

	        An expired entry is removed from the cache as a side effect.
	        """
	        entry = self._data.get(key)
	        if entry is None:
	            return None

	        value, stored_at = entry
	        # A monotonic clock is used so that wall-clock adjustments (NTP, DST,
	        # manual changes) can neither expire entries early nor keep them alive.
	        if time.monotonic() - stored_at < self.ttl:
	            return value

	        del self._data[key]
	        return None

	    def set(self, key: str, value: Any) -> None:
	        """Store *value* under *key*, restarting its TTL."""
	        self._data[key] = (value, time.monotonic())


	class WebSearch:
	    """Web lookup helper that produces text snippets for the assistant prompt.

	    Three kinds of lookup are offered, each returning a formatted string:

	    - general search (DuckDuckGo Instant Answer, then a Wikipedia summary),
	    - recent Angola news headlines (scraped from three news sites),
	    - current weather for a city (wttr.in).

	    Successful answers are kept in ``self.cache``; all HTTP traffic goes
	    through the shared ``self.session``.
	    """

	    # Request timeouts in seconds.
	    _TIMEOUT = 8
	    _TIMEOUT_WIKIPEDIA = 5

	    # Headlines taken from each news source.
	    _MAX_POR_FONTE = 3

	    # Intent keywords. They are anchored at the start of a word so inflected
	    # forms still match ("buscar", "pesquisando") while unrelated words that
	    # merely contain a keyword do not ("cuidados", "visite").
	    _RE_BUSCA_DIRETA = re.compile(
	        r"\b(?:busca|pesquisa|procura|web|internet|google|wikipedia|site|"
	        r"informações|dados|saber|conhecer|descobrir|encontrar|localizar|achar)"
	    )
	    _RE_NOTICIAS = re.compile(r"\b(?:notícias|noticias|novidades|aconteceu|news)")
	    _RE_ANGOLA = re.compile(r"\bangola")  # also covers "angolano"/"angolana"
	    # "sol" must be a whole word, otherwise "resolver" or "solução" would be
	    # classified as weather questions.
	    _RE_CLIMA = re.compile(
	        r"\b(?:clima|tempo|temperatura|chuva|ensolarad)|\bsol\b"
	    )
	    _RE_FATOS = re.compile(
	        r"\b(?:quem é|o que é|onde fica|quando foi|como funciona|definição|"
	        r"significa|história|explicação|qual é|quais são|quanto é|quantos são)"
	    )
	    _RE_PERGUNTA = re.compile(
	        r"\b(?:quem|o que|onde|quando|como|por que|qual|quanto|porquê|porque)"
	    )

	    def __init__(self):
	        """Set up the result cache, the HTTP session and the news sources."""
	        self.cache = SimpleCache(ttl=CACHE_TTL)
	        self.session = requests.Session()
	        self.session.headers.update({
	            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
	            "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7"
	        })

	        # Angola news sources, in the order: Angop, Novo Jornal, Jornal de Angola.
	        self.fontes_angola = [
	            "https://www.angop.ao/ultimas",
	            "https://www.novojornal.co.ao/",
	            "https://www.jornaldeangola.ao/"
	        ]

	    # ========================================================================
	    # GENERAL SEARCH (MULTIPLE FREE SOURCES)
	    # ========================================================================

	    def buscar_geral(self, query: str, max_resultados: int = 3) -> str:
	        """Search the web for *query* and format the findings for the AI prompt.

	        DuckDuckGo is asked first; Wikipedia is consulted only while fewer than
	        ``max_resultados`` snippets were collected. This method does not raise
	        for network or parsing problems: it returns the fallback text instead.

	        Args:
	            query: Search term.
	            max_resultados: Maximum number of snippets in the answer.

	        Returns:
	            A formatted block of snippets, or the fallback message when no
	            source produced anything (the fallback is not cached).
	        """
	        termo = query.strip()
	        if not termo:
	            # Nothing to search for: skip the network round trips entirely.
	            return self._fallback_busca_geral(query)

	        # The limit is part of the key so an answer cached for a small limit is
	        # never served to a caller asking for more results.
	        cache_key = f"busca_geral_{termo.lower()}_{max_resultados}"
	        cached = self.cache.get(cache_key)
	        if cached:
	            return cached

	        try:
	            resultados = self._consultar_duckduckgo(termo, max_resultados)
	            if len(resultados) < max_resultados:
	                resultados.extend(self._consultar_wikipedia(termo))

	            if not resultados:
	                return self._fallback_busca_geral(query)

	            # Formatted for the AI prompt, not for direct display to the user.
	            resposta = (
	                f"INFORMAÇÕES SOBRE '{termo.upper()}':\n\n"
	                + "\n\n".join(resultados[:max_resultados])
	            )
	            self.cache.set(cache_key, resposta)
	            return resposta

	        except Exception as e:
	            logger.warning(f"Busca geral falhou: {e}")
	            return self._fallback_busca_geral(query)

	    def _consultar_duckduckgo(self, query: str, max_resultados: int) -> List[str]:
	        """Return snippets from the DuckDuckGo Instant Answer API (best effort)."""
	        resultados: List[str] = []
	        try:
	            params = {
	                "q": query,
	                "format": "json",
	                "no_html": "1",
	                "skip_disambig": "1"
	            }
	            resp = self.session.get(
	                "https://api.duckduckgo.com/", params=params, timeout=self._TIMEOUT
	            )
	            if resp.status_code != 200:
	                return resultados

	            data = resp.json()

	            # Abstract = main summary of the topic.
	            if data.get("Abstract"):
	                resultados.append(f"RESUMO: {data['Abstract'][:300]}")

	            for topic in data.get("RelatedTopics", [])[:max_resultados]:
	                if isinstance(topic, dict) and "Text" in topic:
	                    resultados.append(f"INFO: {topic['Text'][:200]}")
	                elif isinstance(topic, str):
	                    resultados.append(f"INFO: {topic[:200]}")
	        except Exception as e:
	            # Best effort: keep whatever was collected before the failure.
	            logger.debug(f"DuckDuckGo falhou: {e}")
	        return resultados

	    def _consultar_wikipedia(self, query: str) -> List[str]:
	        """Return the Wikipedia page summary for *query* as a 0/1-item list."""
	        try:
	            # Percent-encode the title so "/", "?" or "#" typed by the user
	            # cannot change the path or query string of the request.
	            titulo = quote(query.replace(" ", "_"), safe="")
	            wiki_resp = self.session.get(
	                "https://en.wikipedia.org/api/rest_v1/page/summary/" + titulo,
	                timeout=self._TIMEOUT_WIKIPEDIA,
	            )
	            if wiki_resp.status_code == 200:
	                wiki_data = wiki_resp.json()
	                if wiki_data.get("extract"):
	                    return [f"Wikipedia: {wiki_data['extract'][:250]}"]
	        except Exception as e:
	            logger.debug(f"Wikipedia falhou: {e}")
	        return []

	    def _fallback_busca_geral(self, query: str) -> str:
	        """Return the message used when no source produced any result."""
	        return f"INFORMAÇÕES GERAIS SOBRE '{query}': Não foi possível obter dados específicos da web no momento. Use conhecimento geral para responder."

	    # ========================================================================
	    # ANGOLA NEWS (WEB SCRAPING)
	    # ========================================================================

	    def pesquisar_noticias_angola(self, limite: int = 5) -> str:
	        """Return the most recent Angola headlines scraped from the news sites.

	        Headlines are de-duplicated case-insensitively by title, and titles of
	        20 characters or fewer are discarded.

	        Args:
	            limite: Maximum number of headlines in the answer.

	        Returns:
	            A numbered list of headlines (with links when available), or a
	            "no news" message. Both outcomes are cached.
	        """
	        # The limit is part of the key so a short cached list is never served
	        # to a caller asking for more headlines.
	        cache_key = f"noticias_angola_{limite}"
	        cached = self.cache.get(cache_key)
	        if cached:
	            return cached

	        todas_noticias: List[Dict] = []
	        fontes = (
	            self._buscar_angop,
	            self._buscar_novojornal,
	            self._buscar_jornaldeangola,
	        )
	        for buscar in fontes:
	            # Each source is guarded separately so one failure does not
	            # prevent the remaining sources from being queried.
	            try:
	                todas_noticias.extend(buscar())
	            except Exception as e:
	                logger.error(f"Erro no scraping de notícias: {e}")

	        # Remove duplicates and apply the limit.
	        vistos = set()
	        unicas = []
	        for n in todas_noticias:
	            if len(unicas) >= limite:
	                break
	            titulo_lower = n["titulo"].lower()
	            if len(titulo_lower) > 20 and titulo_lower not in vistos:
	                vistos.add(titulo_lower)
	                unicas.append(n)

	        if not unicas:
	            fallback = "Sem notícias recentes de Angola disponíveis no momento."
	            self.cache.set(cache_key, fallback)
	            return fallback

	        partes = ["📰 NOTÍCIAS RECENTES DE ANGOLA:\n\n"]
	        for i, n in enumerate(unicas, 1):
	            partes.append(f"[{i}] {n['titulo']}\n")
	            if n.get('link'):
	                partes.append(f" 🔗 {n['link']}\n")
	            partes.append("\n")
	        texto = "".join(partes).strip()

	        self.cache.set(cache_key, texto)
	        return texto

	    def _raspar_fonte(
	        self,
	        url: str,
	        fonte: str,
	        seletor_itens: str,
	        seletor_titulo: Optional[str] = None,
	        seletor_link: Optional[str] = None,
	    ) -> List[Dict]:
	        """Scrape up to ``_MAX_POR_FONTE`` headlines from one news page.

	        Args:
	            url: Page to download; also the base for resolving relative links.
	            fonte: Human-readable source name stored in each result.
	            seletor_itens: CSS selector for the headline elements.
	            seletor_titulo: CSS selector, relative to an item, of the element
	                holding the title. ``None`` means the item itself.
	            seletor_link: CSS selector, relative to an item, of the anchor
	                holding the link. ``None`` means the item itself.

	        Returns:
	            A list of ``{"titulo", "link", "fonte"}`` dicts; empty on any
	            failure (which is logged as a warning).
	        """
	        try:
	            r = self.session.get(url, timeout=self._TIMEOUT)
	            if r.status_code != 200:
	                return []

	            soup = BeautifulSoup(r.text, 'html.parser')
	            noticias = []
	            for item in soup.select(seletor_itens)[:self._MAX_POR_FONTE]:
	                titulo = item.select_one(seletor_titulo) if seletor_titulo else item
	                link = item.select_one(seletor_link) if seletor_link else item
	                if titulo is None or link is None:
	                    continue

	                href = str(link.get('href', '') or '')
	                noticias.append({
	                    "titulo": self._limpar_texto(titulo.get_text()),
	                    # Resolve relative links against the page; keep an empty
	                    # href empty so no bogus link is shown.
	                    "link": urljoin(url, href) if href else "",
	                    "fonte": fonte
	                })
	            return noticias

	        except Exception as e:
	            logger.warning(f"{fonte} scraping falhou: {e}")
	            return []

	    def _buscar_angop(self) -> List[Dict]:
	        """Scrape the latest headlines from Angop."""
	        return self._raspar_fonte(
	            self.fontes_angola[0],
	            "Angop",
	            '.ultimas-noticias .item',
	            seletor_titulo='h3 a',
	            seletor_link='a',
	        )

	    def _buscar_novojornal(self) -> List[Dict]:
	        """Scrape the latest headlines from Novo Jornal."""
	        return self._raspar_fonte(
	            self.fontes_angola[1], "Novo Jornal", '.noticia-lista .titulo a'
	        )

	    def _buscar_jornaldeangola(self) -> List[Dict]:
	        """Scrape the latest headlines from Jornal de Angola."""
	        return self._raspar_fonte(
	            self.fontes_angola[2], "Jornal de Angola", '.ultimas .titulo a'
	        )

	    # ========================================================================
	    # WEATHER
	    # ========================================================================

	    def buscar_clima(self, cidade: str = "Luanda") -> str:
	        """Return the current weather for *cidade* using the wttr.in service.

	        Args:
	            cidade: City name; a blank value falls back to "Luanda".

	        Returns:
	            Temperature, condition and humidity as formatted text, or an
	            apology message when the lookup fails (failures are not cached).
	        """
	        cidade = cidade.strip() or "Luanda"

	        cache_key = f"clima_{cidade.lower()}"
	        cached = self.cache.get(cache_key)
	        if cached:
	            return cached

	        try:
	            # The city is percent-encoded so characters such as "/", "?" or "#"
	            # cannot alter the request path or its query string.
	            url = f"https://wttr.in/{quote(cidade, safe='')}"
	            resp = self.session.get(url, params={"format": "j1"}, timeout=self._TIMEOUT)

	            if resp.status_code != 200:
	                return f"Não consegui obter informações do clima em {cidade}."

	            data = resp.json()

	            current = data['current_condition'][0]
	            temp = current['temp_C']
	            # Prefer the Portuguese description when the response carries one.
	            if 'lang_pt' in current:
	                desc = current['lang_pt'][0]['value']
	            else:
	                desc = current['weatherDesc'][0]['value']
	            humidity = current['humidity']

	            resposta = f"🌤️ CLIMA EM {cidade.upper()}:\n\n"
	            resposta += f"Temperatura: {temp}°C\n"
	            resposta += f"Condição: {desc}\n"
	            resposta += f"Umidade: {humidity}%"

	            self.cache.set(cache_key, resposta)
	            return resposta

	        except Exception as e:
	            logger.warning(f"Busca de clima falhou: {e}")
	            return f"Não consegui obter informações do clima em {cidade} no momento."

	    # ========================================================================
	    # UTILITIES
	    # ========================================================================

	    def _limpar_texto(self, texto: str) -> str:
	        """Collapse whitespace runs to single spaces and cap at 200 characters."""
	        if not texto:
	            return ""
	        return re.sub(r'\s+', ' ', texto).strip()[:200]

	    # ========================================================================
	    # SEARCH-INTENT DETECTION
	    # ========================================================================

	    @staticmethod
	    def detectar_intencao_busca(mensagem: str) -> Optional[str]:
	        """Classify whether *mensagem* calls for a web lookup.

	        Keywords are matched case-insensitively at the start of a word.
	        Precedence:

	        1. An explicit search word ("pesquisa", "google", ...) always yields a
	           lookup: weather if a weather word is present, Angola news if both
	           a news word and "angola" are present, general search otherwise.
	        2. A news word together with "angola" -> news.
	        3. A weather word -> weather.
	        4. A factual phrase ("quem é", "o que é", ...) -> general search.
	        5. A question mark, more than two words and an interrogative word
	           -> general search.

	        Returns:
	            "noticias" | "clima" | "busca_geral" | None
	        """
	        msg_lower = mensagem.lower()

	        quer_clima = WebSearch._RE_CLIMA.search(msg_lower) is not None
	        quer_noticias_angola = (
	            WebSearch._RE_NOTICIAS.search(msg_lower) is not None
	            and WebSearch._RE_ANGOLA.search(msg_lower) is not None
	        )

	        # Explicit search words have the highest priority.
	        if WebSearch._RE_BUSCA_DIRETA.search(msg_lower):
	            if quer_clima:
	                return "clima"
	            if quer_noticias_angola:
	                return "noticias"
	            return "busca_geral"

	        if quer_noticias_angola:
	            return "noticias"

	        if quer_clima:
	            return "clima"

	        if WebSearch._RE_FATOS.search(msg_lower):
	            return "busca_geral"

	        # A question only triggers a search when it has at least three words
	        # and contains an interrogative word.
	        if (
	            "?" in mensagem
	            and len(mensagem.split()) > 2
	            and WebSearch._RE_PERGUNTA.search(msg_lower)
	        ):
	            return "busca_geral"

	        return None


	# === GLOBAL INSTANCE (SINGLETON) ===
	_web_search_instance = None

	def get_web_search() -> WebSearch:
	    """Return the shared WebSearch object, creating it on the first call."""
	    global _web_search_instance
	    instancia = _web_search_instance
	    if instancia is None:
	        # First use: build the instance and remember it for later calls.
	        instancia = WebSearch()
	        _web_search_instance = instancia
	    return instancia