akra35567 committed on
Commit
51e76e2
·
verified ·
1 Parent(s): df16e05

Create web_search.py

Browse files
Files changed (1) hide show
  1. web_search.py +110 -0
web_search.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # web_search.py — V27 — SERPER API + ANGOLA SCRAPING
2
+ import time
3
+ import re
4
+ import requests
5
+ from typing import List, Dict
6
+ from loguru import logger
7
+ from bs4 import BeautifulSoup
8
+ import os
9
+ import config
10
+
11
class SimpleCache:
    """Tiny in-memory TTL cache mapping keys to (value, stored_at) pairs."""

    def __init__(self, ttl: int = 900):
        # ttl: seconds an entry stays valid (default 15 minutes).
        self.ttl = ttl
        self._data = {}

    def get(self, key):
        """Return the cached value for *key*, or None if absent or expired.

        Expired entries are evicted on access so the cache does not grow
        without bound in a long-running process (the original kept them
        forever). Eviction is invisible to callers: they saw None either way.
        """
        entry = self._data.get(key)
        if entry is None:
            return None
        value, stored_at = entry
        if time.time() - stored_at < self.ttl:
            return value
        del self._data[key]  # stale — free the memory
        return None

    def set(self, key, value):
        """Store *value* under *key*, timestamped with the current time."""
        self._data[key] = (value, time.time())
21
+
22
class WebSearch:
    """Web lookup helper: scrapes Angolan news sites and queries the Serper API.

    Results are cached (SimpleCache, 15-minute default TTL) so repeated
    questions do not hammer the upstream sources.
    """

    def __init__(self):
        self.cache = SimpleCache()
        self.session = requests.Session()
        # Browser-like UA: some news sites reject default HTTP clients.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        })
        self.fontes_angola = [
            "https://www.angop.ao/ultimas",
            "https://www.novojornal.co.ao/",
            "https://www.jornaldeangola.ao/",
            "https://www.verangola.net/va/noticias"
        ]

    def _limpar(self, texto: str) -> str:
        """Collapse internal whitespace and cap the text at 200 characters."""
        return re.sub(r'\s+', ' ', texto).strip()[:200]

    def _scraping_angola(self) -> str:
        """Scrape up to 5 recent headlines from the Angolan news sources.

        Best-effort: a source that fails or answers non-200 is skipped.
        The formatted result (or the "no news" message) is cached.
        """
        key = "noticias_angola"
        cached = self.cache.get(key)
        if cached:
            return cached

        noticias = []
        for url in self.fontes_angola:
            try:
                r = self.session.get(url, timeout=8)
                if r.status_code != 200:
                    continue
                soup = BeautifulSoup(r.text, 'html.parser')
                for item in soup.select('.titulo a, h3 a, .noticia-item a')[:3]:
                    titulo = self._limpar(item.get_text())
                    # len > 20 filters out nav links and short anchors.
                    if titulo and len(titulo) > 20:
                        noticias.append(f"• {titulo}")
            except Exception as e:
                # Was a bare `except:` — that also swallowed KeyboardInterrupt
                # and SystemExit. Keep best-effort behavior, but log the source
                # failure instead of hiding it.
                logger.warning(f"Erro ao raspar {url}: {e}")
                continue

        if not noticias:
            result = "Sem notícias recentes de Angola."
        else:
            result = "NOTÍCIAS DE ANGOLA:\n" + "\n".join(noticias[:5])

        self.cache.set(key, result)
        return result

    def _busca_geral(self, query: str) -> str:
        """Answer a general-knowledge query via the Serper (Google) API.

        Returns a bullet list of up to 5 organic results, a configuration
        hint when SERPER_API_KEY is missing, or an error message. Successful
        and empty results are cached; HTTP-error responses are not.
        """
        key = f"geral_{query.lower()}"
        cached = self.cache.get(key)
        if cached:
            return cached

        if not config.SERPER_API_KEY:
            return "Busca geral não configurada. Configure SERPER_API_KEY no HF Space Secrets."

        try:
            # SERPER API REAL
            url = "https://google.serper.dev/search"
            payload = {"q": query}
            headers = {"X-API-KEY": config.SERPER_API_KEY}
            # Use the shared session (was module-level requests.post) so the
            # custom User-Agent and connection pooling apply here as well.
            r = self.session.post(url, json=payload, headers=headers, timeout=10)

            if r.status_code != 200:
                return "Erro na API de busca geral."

            data = r.json()
            results = []
            # Renamed loop var: the original `result` shadowed the variable
            # that holds the final answer string below.
            for item in data.get('organic', [])[:5]:
                title = item.get('title', '')[:100]
                snippet = item.get('snippet', '')[:150]
                if title:
                    results.append(f"• {title}: {snippet}")

            if not results:
                result = "Nada encontrado na busca geral."
            else:
                result = "INFORMAÇÕES:\n" + "\n".join(results)

        except Exception as e:
            logger.error(f"Erro Serper: {e}")
            result = "Erro na busca geral."

        self.cache.set(key, result)
        return result

    def pesquisar(self, mensagem: str) -> str:
        """Akira decide sozinha se precisa pesquisar (sem palavras-chave no prompt)

        Routes Angola-related messages to the news scraper, question-like
        messages to the general search, and returns "" otherwise.
        """
        texto = mensagem.lower()  # hoisted: was recomputed for each branch
        # Angola sempre
        if any(w in texto for w in ["angola", "luanda", "notícia", "jornal", "governo", "presidente"]):
            return self._scraping_angola()
        # Conhecimento geral
        if any(w in texto for w in ["quem é", "o que é", "quando", "onde", "como", "por que", "quanto", "qual"]):
            return self._busca_geral(mensagem)
        return ""