# Szuflada / database_setup.py
from bs4 import BeautifulSoup
import re
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import requests
from tqdm import tqdm


def process_documents(docs: list[Document]) -> list[Document]:
"""
    Processes a list of documents, extracting the content and metadata from the HTML.
"""
processed_docs = []
for doc in docs:
soup = BeautifulSoup(doc.page_content, "lxml")
        # Extract the main content
article = soup.find("article")
if article:
content = article.get_text(separator="\n", strip=True)
else:
content = soup.get_text(separator="\n", strip=True)
        # Extract the metadata
metadata = doc.metadata.copy()
        # Title from the <title> tag
if soup.title:
title_text = soup.title.get_text(strip=True)
if title_text:
metadata["title"] = title_text
        # Publication date
pub_date_tag = soup.find("meta", property="article:published_time")
if pub_date_tag and pub_date_tag.get("content"):
metadata["published_time"] = pub_date_tag["content"]
else:
time_tag = soup.find("time")
if time_tag and time_tag.get("datetime"):
metadata["published_time"] = time_tag.get("datetime")
elif time_tag and time_tag.get_text(strip=True):
metadata["published_time"] = time_tag.get_text(strip=True)
            else:
                # Fallback: scan the visible page text for a Polish "Opublikowano"
                # ("Published on") date string, e.g. "Opublikowano 5 maja 2024"
                text = soup.get_text(separator="\n", strip=True)
                m = re.search(r"Opublikowano(?: w dniu)?[:\s]+([0-9]{1,2}\s+\w+\s+\d{4})", text, re.IGNORECASE)
if m:
metadata["published_time"] = m.group(1)
        # Categories
categories = [
tag["content"]
for tag in soup.find_all("meta", property="article:section")
if tag.get("content")
]
if categories:
metadata["categories"] = ", ".join(categories)
        # Keywords
keywords = [
tag["content"]
for tag in soup.find_all("meta", property="article:tag")
if tag.get("content")
]
if keywords:
metadata["keywords"] = ", ".join(keywords)
processed_docs.append(Document(page_content=content, metadata=metadata))
return processed_docs
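

# A minimal, self-contained demo sketch (not part of the original module) showing
# what process_documents extracts from a typical WordPress page. The HTML below is
# hypothetical example markup, not content from mojaszuflada.pl.
def _demo_process_documents() -> None:
    html = (
        "<html><head><title>Sample post</title>"
        '<meta property="article:published_time" content="2024-05-01T10:00:00+00:00">'
        '<meta property="article:section" content="Blog">'
        "</head><body><article>Post body text.</article></body></html>"
    )
    doc = Document(page_content=html, metadata={"source": "https://example.com/post"})
    processed = process_documents([doc])[0]
    print(processed.page_content)                # "Post body text."
    print(processed.metadata["title"])           # "Sample post"
    print(processed.metadata["published_time"])  # "2024-05-01T10:00:00+00:00"
    print(processed.metadata["categories"])      # "Blog"
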
def initialize_database(persist_directory="./szuflada", clear_existing=True):
"""
    Initializes the Chroma database with content from mojaszuflada.pl.
"""
embedder = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)
if clear_existing:
print("Czyszczenie istniej膮cej kolekcji w bazie danych...")
try:
baza.delete_collection()
print("Kolekcja zosta艂a wyczyszczona.")
baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)
except Exception as e:
print(f"Nie mo偶na by艂o wyczy艣ci膰 kolekcji (mo偶e nie istnia艂a): {e}")
print("Pobieranie i parsowanie mapy strony...")
    # Spoof a desktop-browser User-Agent so the site does not block automated requests
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
sitemap_url = "https://mojaszuflada.pl/wp-sitemap.xml"
docs = []
try:
        response = requests.get(sitemap_url, headers=headers, timeout=30)
response.raise_for_status()
sitemap_xml = response.text
sitemap_soup = BeautifulSoup(sitemap_xml, "xml")
urls = [loc.text for loc in sitemap_soup.find_all("loc")]
        # wp-sitemap.xml is a sitemap index: <loc> entries ending in .xml are
        # sub-sitemaps; the remaining entries are regular page URLs
        sitemap_urls = [url for url in urls if url.endswith(".xml")]
        page_urls = [url for url in urls if not url.endswith(".xml")]
        for sub_sitemap_url in tqdm(sitemap_urls, desc="Parsing sub-sitemaps"):
try:
                response = requests.get(sub_sitemap_url, headers=headers, timeout=30)
response.raise_for_status()
sub_sitemap_xml = response.text
sub_sitemap_soup = BeautifulSoup(sub_sitemap_xml, "xml")
page_urls.extend([loc.text for loc in sub_sitemap_soup.find_all("loc")])
except requests.RequestException as e:
print(f"Pomini臋to pod-map臋 {sub_sitemap_url}: {e}")
print(f"Znaleziono {len(page_urls)} adres贸w URL do przetworzenia.")
for url in tqdm(page_urls, desc="Pobieranie stron"):
try:
                response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
doc = Document(
page_content=response.text,
metadata={"source": url, "loc": url}
)
docs.append(doc)
except requests.RequestException as e:
print(f"Pomini臋to stron臋 {url}: {e}")
except requests.RequestException as e:
print(f"Krytyczny b艂膮d: Nie uda艂o si臋 pobra膰 g艂贸wnej mapy strony: {e}")
if not docs:
print("Nie za艂adowano 偶adnych dokument贸w.")
return None
processed_docs = process_documents(docs)
print(f"\nPrzetworzono {len(processed_docs)} dokument贸w.")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(processed_docs)
    # Metadata validation: count chunks that are missing required keys
required_meta_keys = ["source", "title", "published_time"]
missing_counts = {k: 0 for k in required_meta_keys}
for chunk in chunks:
md = chunk.metadata or {}
for k in required_meta_keys:
if not md.get(k):
missing_counts[k] += 1
print(f"Liczba chunk贸w: {len(chunks)}")
print("Braki metadanych:", missing_counts)
    # Add the chunks to the database in batches; a single oversized insert can exceed Chroma's batch limit
batch_size = 1000
for i in range(0, len(chunks), batch_size):
baza.add_documents(documents=chunks[i:i + batch_size])
print("Baza danych zosta艂a zainicjalizowana pomy艣lnie.")
    return baza
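

# A minimal entry-point sketch (an assumption, not present in the original file):
# OpenAIEmbeddings requires the OPENAI_API_KEY environment variable to be set.
if __name__ == "__main__":
    initialize_database()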