# Ingestion script: scrapes the configured URLs, embeds the page texts with a
# multilingual sentence-transformer, and stores a FAISS index plus the raw texts.
import os
import pickle

import faiss
import numpy as np
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
# Embedding model: multilingual MiniLM, suitable for Portuguese-language content.
EMBED_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# URLs to be indexed
URLS = [
    "https://labnoticias.jor.br/",
    # you can manually add other specific article URLs here
]
# Browser-like header to work around the server's 406 (Not Acceptable) block
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}
def fetch_text(url, timeout=30):
    """Download *url* and return the visible text of its main content.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server before giving up.
            (requests has NO default timeout — without this the call can
            hang indefinitely.)

    Returns:
        The text of ``div.post-content`` if present, otherwise the whole
        ``<body>``, with whitespace collapsed.

    Raises:
        requests.HTTPError: On a non-2xx response.
        ValueError: If the response has no parseable HTML body.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Prefer the article container; fall back to the whole page body.
    container = soup.select_one("div.post-content") or soup.body
    if container is None:
        # Non-HTML responses (e.g. JSON, images) have no <body> at all.
        raise ValueError(f"No parseable HTML content at {url}")
    return container.get_text(separator=" ", strip=True)
def main():
    """Scrape all URLS, embed the texts, and persist a FAISS index + texts.

    Writes:
        data/index.faiss: serialized FAISS index (via ``faiss.write_index``).
        data/texts.pkl: pickled list of the raw page texts, aligned with
            the index rows.

    Raises:
        RuntimeError: If no URL yielded any text, or embeddings have an
            unexpected shape.
    """
    os.makedirs("data", exist_ok=True)
    texts = []
    for url in URLS:
        try:
            texts.append(fetch_text(url))
            print(f"✔️ Coletado: {url}")
        except Exception as e:
            # Best-effort scraping: log and keep going with the other URLs.
            print(f"❌ Erro em {url}: {e}")
    if not texts:
        raise RuntimeError("Nenhum texto foi coletado. Verifique as URLs ou o seletor CSS.")
    # Generate one embedding vector per collected page.
    model = SentenceTransformer(EMBED_MODEL)
    embeddings = model.encode(texts, show_progress_bar=True)
    # Sanity-check the shape before indexing.
    if embeddings.ndim != 2:
        raise RuntimeError(f"Formato inesperado de embeddings: {embeddings.shape}")
    # FAISS requires contiguous float32 input.
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    # Build an exact L2-distance index.
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)
    # BUGFIX: a FAISS index is a SWIG-wrapped C++ object and is NOT picklable
    # (pickle.dump raises TypeError). Use FAISS's own serialization instead.
    faiss.write_index(index, "data/index.faiss")
    with open("data/texts.pkl", "wb") as f:
        pickle.dump(texts, f)
    print("✅ Ingestão e indexação concluídas.")
# Run the ingestion pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()