import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
# Current import path for the splitter (formerly langchain.text_splitter).
from langchain_text_splitters import RecursiveCharacterTextSplitter
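
# Build a Chroma vector store from mojaszuflada.pl: crawl the WordPress
# sitemap, strip each page down to its article text and metadata, split the
# text into overlapping chunks, and embed them with OpenAI embeddings.
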
def process_documents(docs: list[Document]) -> list[Document]:
    """
    Processes a list of documents, extracting content and metadata from the HTML.
    """
    processed_docs = []
    for doc in docs:
        soup = BeautifulSoup(doc.page_content, "lxml")

        # Extract the main content; fall back to the full page text
        # if there is no <article> element.
        article = soup.find("article")
        if article:
            content = article.get_text(separator="\n", strip=True)
        else:
            content = soup.get_text(separator="\n", strip=True)

        # Extract metadata.
        metadata = doc.metadata.copy()

        # Title from the <title> tag.
        if soup.title:
            title_text = soup.title.get_text(strip=True)
            if title_text:
                metadata["title"] = title_text

        # Publication date: prefer the article:published_time meta tag,
        # then a <time> element, then the Polish "Opublikowano ..."
        # ("Published ...") phrase in the page text.
        pub_date_tag = soup.find("meta", property="article:published_time")
        if pub_date_tag and pub_date_tag.get("content"):
            metadata["published_time"] = pub_date_tag["content"]
        else:
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                metadata["published_time"] = time_tag.get("datetime")
            elif time_tag and time_tag.get_text(strip=True):
                metadata["published_time"] = time_tag.get_text(strip=True)
            else:
                text = soup.get_text(separator="\n", strip=True)
                m = re.search(r"Opublikowano(?: w dniu)?[:\s]+([0-9]{1,2}\s+\w+\s+\d{4})", text, re.IGNORECASE)
                if m:
                    metadata["published_time"] = m.group(1)

        # Categories.
        categories = [
            tag["content"]
            for tag in soup.find_all("meta", property="article:section")
            if tag.get("content")
        ]
        if categories:
            metadata["categories"] = ", ".join(categories)

        # Keywords.
        keywords = [
            tag["content"]
            for tag in soup.find_all("meta", property="article:tag")
            if tag.get("content")
        ]
        if keywords:
            metadata["keywords"] = ", ".join(keywords)

        processed_docs.append(Document(page_content=content, metadata=metadata))
    return processed_docs
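
# Illustrative example (hypothetical markup, not from the original script)
# of what process_documents extracts:
#
#   sample = Document(
#       page_content="<html><head><title>Hello</title></head>"
#                    "<body><article>Tekst wpisu</article></body></html>",
#       metadata={"source": "https://example.com"},
#   )
#   out = process_documents([sample])[0]
#   out.page_content        # "Tekst wpisu"
#   out.metadata["title"]   # "Hello"
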
def initialize_database(persist_directory="./szuflada", clear_existing=True):
    """
    Initializes a Chroma database with data from mojaszuflada.pl.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
    baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)

    if clear_existing:
        print("Clearing the existing collection in the database...")
        try:
            baza.delete_collection()
            print("Collection cleared.")
            # delete_collection invalidates the handle, so re-create the store.
            baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)
        except Exception as e:
            print(f"Could not clear the collection (it may not have existed): {e}")
| print("Pobieranie i parsowanie mapy strony...") | |
| headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'} | |
| sitemap_url = "https://mojaszuflada.pl/wp-sitemap.xml" | |
| docs = [] | |
| try: | |
| response = requests.get(sitemap_url, headers=headers) | |
| response.raise_for_status() | |
| sitemap_xml = response.text | |
| sitemap_soup = BeautifulSoup(sitemap_xml, "xml") | |
| urls = [loc.text for loc in sitemap_soup.find_all("loc")] | |
| sitemap_urls = [url for url in urls if url.endswith(".xml")] | |
| page_urls = [url for url in urls if not url.endswith(".xml")] | |
| for sub_sitemap_url in tqdm(sitemap_urls, desc="Parsowanie pod-map"): | |
| try: | |
| response = requests.get(sub_sitemap_url, headers=headers) | |
| response.raise_for_status() | |
| sub_sitemap_xml = response.text | |
| sub_sitemap_soup = BeautifulSoup(sub_sitemap_xml, "xml") | |
| page_urls.extend([loc.text for loc in sub_sitemap_soup.find_all("loc")]) | |
| except requests.RequestException as e: | |
| print(f"Pomini臋to pod-map臋 {sub_sitemap_url}: {e}") | |
| print(f"Znaleziono {len(page_urls)} adres贸w URL do przetworzenia.") | |
| for url in tqdm(page_urls, desc="Pobieranie stron"): | |
| try: | |
| response = requests.get(url, headers=headers) | |
| response.raise_for_status() | |
| doc = Document( | |
| page_content=response.text, | |
| metadata={"source": url, "loc": url} | |
| ) | |
| docs.append(doc) | |
| except requests.RequestException as e: | |
| print(f"Pomini臋to stron臋 {url}: {e}") | |
| except requests.RequestException as e: | |
| print(f"Krytyczny b艂膮d: Nie uda艂o si臋 pobra膰 g艂贸wnej mapy strony: {e}") | |
    if not docs:
        print("No documents were loaded.")
        return None

    processed_docs = process_documents(docs)
    print(f"\nProcessed {len(processed_docs)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(processed_docs)

    # Metadata validation: count chunks missing the key fields.
    required_meta_keys = ["source", "title", "published_time"]
    missing_counts = {k: 0 for k in required_meta_keys}
    for chunk in chunks:
        md = chunk.metadata or {}
        for k in required_meta_keys:
            if not md.get(k):
                missing_counts[k] += 1
    print(f"Number of chunks: {len(chunks)}")
    print("Missing metadata:", missing_counts)
    # Add the chunks to the database in batches.
    batch_size = 1000
    for i in range(0, len(chunks), batch_size):
        baza.add_documents(documents=chunks[i:i + batch_size])

    print("Database initialized successfully.")
    return baza
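

# Minimal usage sketch (assumption: OPENAI_API_KEY is set in the environment,
# as OpenAIEmbeddings requires; the query below is illustrative, not from the
# original script).
if __name__ == "__main__":
    db = initialize_database(persist_directory="./szuflada", clear_existing=True)
    if db is not None:
        # Smoke test: fetch the three most similar chunks for a sample query.
        for doc in db.similarity_search("moja szuflada", k=3):
            print(doc.metadata.get("title"), "->", doc.metadata.get("source"))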