import re

import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from tqdm import tqdm

# Last-resort pattern for a visible "Opublikowano [w dniu]: <day> <month> <year>"
# line in the page text; the capture group is the date itself.
_PUBLISHED_PATTERN = re.compile(
    r"Opublikowano(?: w dniu)?[:\s]+([0-9]{1,2}\s+\w+\s+\d{4})",
    re.IGNORECASE,
)


def _meta_contents(soup, property_name):
    """Return the non-empty ``content`` values of every matching ``<meta>`` tag."""
    return [
        tag["content"]
        for tag in soup.find_all("meta", property=property_name)
        if tag.get("content")
    ]


def _find_published_time(soup):
    """Return the publication date string found in *soup*, or ``None``.

    Sources are tried in order: the ``article:published_time`` meta tag, the
    first ``<time>`` element (its ``datetime`` attribute, then its text), and
    finally an "Opublikowano ..." phrase anywhere in the page text.
    """
    meta_tag = soup.find("meta", property="article:published_time")
    if meta_tag and meta_tag.get("content"):
        return meta_tag["content"]

    time_tag = soup.find("time")
    if time_tag:
        stamp = time_tag.get("datetime") or time_tag.get_text(strip=True)
        if stamp:
            return stamp

    found = _PUBLISHED_PATTERN.search(soup.get_text(separator="\n", strip=True))
    return found.group(1) if found else None


def process_documents(docs: list[Document]) -> list[Document]:
    """Convert raw HTML documents into plain-text documents with metadata.

    For each input document the HTML in ``page_content`` is parsed; the text
    of the first ``<article>`` element (or of the whole page when there is
    none) becomes the new content. The original metadata is copied and, when
    available, extended with ``title``, ``published_time``, ``categories``
    and ``keywords`` (the latter two as comma-separated strings).

    Args:
        docs: Documents whose ``page_content`` holds HTML markup.

    Returns:
        A new list with one processed document per input document, in the
        same order. The input documents are not modified.
    """
    results = []
    for raw in docs:
        soup = BeautifulSoup(raw.page_content, "lxml")

        # Main content: prefer the <article> element, else the whole page.
        root = soup.find("article") or soup
        body_text = root.get_text(separator="\n", strip=True)

        meta = raw.metadata.copy()

        # Title from the <title> tag.
        title_tag = soup.title
        if title_tag:
            title = title_tag.get_text(strip=True)
            if title:
                meta["title"] = title

        # Publication date.
        published = _find_published_time(soup)
        if published is not None:
            meta["published_time"] = published

        # Categories and keywords are flattened to strings.
        sections = _meta_contents(soup, "article:section")
        if sections:
            meta["categories"] = ", ".join(sections)

        tags = _meta_contents(soup, "article:tag")
        if tags:
            meta["keywords"] = ", ".join(tags)

        results.append(Document(page_content=body_text, metadata=meta))
    return results
def _fetch_sitemap_locs(session, sitemap_url, timeout):
    """Download one sitemap and return the text of each ``<loc>`` element.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = session.get(sitemap_url, timeout=timeout)
    response.raise_for_status()
    sitemap_soup = BeautifulSoup(response.text, "xml")
    return [loc.text for loc in sitemap_soup.find_all("loc")]


def initialize_database(
    persist_directory: str = "./szuflada",
    clear_existing: bool = True,
    request_timeout: float = 30.0,
) -> "Chroma | None":
    """Build the ``szuflada`` Chroma collection from mojaszuflada.pl.

    The site's ``wp-sitemap.xml`` is read, every sub-sitemap it lists is
    expanded one level deep, and each resulting page is downloaded. The pages
    are passed through ``process_documents``, split into overlapping chunks
    and added to the collection in batches. Progress and a summary of missing
    metadata are printed to stdout.

    Args:
        persist_directory: Directory in which Chroma persists the collection.
        clear_existing: When true, delete the existing collection first.
            A failure to delete is reported and otherwise ignored.
        request_timeout: Timeout in seconds applied to every HTTP request, so
            a stalled server cannot block the crawl indefinitely.

    Returns:
        The populated ``Chroma`` store, or ``None`` when no page could be
        downloaded (including when the root sitemap is unreachable).
    """
    embedder = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
    baza = Chroma(
        collection_name="szuflada",
        embedding_function=embedder,
        persist_directory=persist_directory,
    )

    if clear_existing:
        print("Czyszczenie istniejącej kolekcji w bazie danych...")
        try:
            baza.delete_collection()
            print("Kolekcja została wyczyszczona.")
            # The handle is unusable after deletion; open a fresh collection.
            baza = Chroma(
                collection_name="szuflada",
                embedding_function=embedder,
                persist_directory=persist_directory,
            )
        except Exception as e:
            # Deliberately best-effort: keep going with the existing handle.
            print(f"Nie można było wyczyścić kolekcji (może nie istniała): {e}")

    print("Pobieranie i parsowanie mapy strony...")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    sitemap_url = "https://mojaszuflada.pl/wp-sitemap.xml"
    docs = []

    # One session reuses the connection for all requests to the same host.
    with requests.Session() as session:
        session.headers.update(headers)
        try:
            urls = _fetch_sitemap_locs(session, sitemap_url, request_timeout)
        except requests.RequestException as e:
            print(f"Krytyczny błąd: Nie udało się pobrać głównej mapy strony: {e}")
        else:
            # Entries ending in ".xml" are nested sitemaps; the rest are pages.
            sitemap_urls = [url for url in urls if url.endswith(".xml")]
            page_urls = [url for url in urls if not url.endswith(".xml")]

            for sub_sitemap_url in tqdm(sitemap_urls, desc="Parsowanie pod-map"):
                try:
                    page_urls.extend(
                        _fetch_sitemap_locs(session, sub_sitemap_url, request_timeout)
                    )
                except requests.RequestException as e:
                    print(f"Pominięto pod-mapę {sub_sitemap_url}: {e}")

            print(f"Znaleziono {len(page_urls)} adresów URL do przetworzenia.")

            for url in tqdm(page_urls, desc="Pobieranie stron"):
                try:
                    response = session.get(url, timeout=request_timeout)
                    response.raise_for_status()
                except requests.RequestException as e:
                    print(f"Pominięto stronę {url}: {e}")
                else:
                    docs.append(
                        Document(
                            page_content=response.text,
                            metadata={"source": url, "loc": url},
                        )
                    )

    if not docs:
        print("Nie załadowano żadnych dokumentów.")
        return None

    processed_docs = process_documents(docs)
    print(f"\nPrzetworzono {len(processed_docs)} dokumentów.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(processed_docs)

    # Metadata validation: count chunks lacking each expected key.
    required_meta_keys = ["source", "title", "published_time"]
    missing_counts = {
        key: sum(1 for chunk in chunks if not (chunk.metadata or {}).get(key))
        for key in required_meta_keys
    }
    print(f"Liczba chunków: {len(chunks)}")
    print("Braki metadanych:", missing_counts)

    # Add the chunks to the store in fixed-size batches.
    batch_size = 1000
    for start in range(0, len(chunks), batch_size):
        baza.add_documents(documents=chunks[start:start + batch_size])

    print("Baza danych została zainicjalizowana pomyślnie.")
    return baza