from bs4 import BeautifulSoup
import re
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import requests
from tqdm import tqdm


def process_documents(docs: list[Document]) -> list[Document]:
    """
    Processes a list of documents, extracting content and metadata from the HTML.
    """
    processed_docs = []
    for doc in docs:
        soup = BeautifulSoup(doc.page_content, "lxml")

        # Extract the main content (prefer the <article> element when present)
        article = soup.find("article")
        if article:
            content = article.get_text(separator="\n", strip=True)
        else:
            content = soup.get_text(separator="\n", strip=True)

        # Extract metadata
        metadata = doc.metadata.copy()

        # Title from the <title> tag
        if soup.title:
            title_text = soup.title.get_text(strip=True)
            if title_text:
                metadata["title"] = title_text

        # Publication date: meta tag first, then <time>, then a Polish
        # "Opublikowano ..." phrase in the page text as a last resort
        pub_date_tag = soup.find("meta", property="article:published_time")
        if pub_date_tag and pub_date_tag.get("content"):
            metadata["published_time"] = pub_date_tag["content"]
        else:
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                metadata["published_time"] = time_tag.get("datetime")
            elif time_tag and time_tag.get_text(strip=True):
                metadata["published_time"] = time_tag.get_text(strip=True)
            else:
                text = soup.get_text(separator="\n", strip=True)
                m = re.search(r"Opublikowano(?: w dniu)?[:\s]+([0-9]{1,2}\s+\w+\s+\d{4})", text, re.IGNORECASE)
                if m:
                    metadata["published_time"] = m.group(1)

        # Categories
        categories = [
            tag["content"]
            for tag in soup.find_all("meta", property="article:section")
            if tag.get("content")
        ]
        if categories:
            metadata["categories"] = ", ".join(categories)

        # Keywords
        keywords = [
            tag["content"]
            for tag in soup.find_all("meta", property="article:tag")
            if tag.get("content")
        ]
        if keywords:
            metadata["keywords"] = ", ".join(keywords)

        processed_docs.append(Document(page_content=content, metadata=metadata))
    return processed_docs
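
# A minimal sketch of what process_documents produces, using a hypothetical HTML
# snippet (not taken from the real site): the <article> text becomes page_content,
# the <title> text lands in metadata["title"], and the value of
# <meta property="article:published_time"> lands in metadata["published_time"].
#
#   sample = Document(
#       page_content=(
#           '<html><head><title>Sample post</title>'
#           '<meta property="article:published_time" content="2024-01-05"/></head>'
#           '<body><article>Post body</article></body></html>'
#       ),
#       metadata={"source": "https://mojaszuflada.pl/sample"},
#   )
#   [out] = process_documents([sample])
#   # out.page_content == "Post body"
#   # out.metadata == {"source": "https://mojaszuflada.pl/sample",
#   #                  "title": "Sample post", "published_time": "2024-01-05"}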


def initialize_database(persist_directory="./szuflada", clear_existing=True):
    """
    Initializes a Chroma database with content from mojaszuflada.pl.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
    baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)

    if clear_existing:
        print("Clearing the existing collection in the database...")
        try:
            baza.delete_collection()
            print("The collection has been cleared.")
            baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)
        except Exception as e:
            print(f"Could not clear the collection (it may not have existed): {e}")

    print("Downloading and parsing the sitemap...")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    sitemap_url = "https://mojaszuflada.pl/wp-sitemap.xml"
    docs = []
    try:
        response = requests.get(sitemap_url, headers=headers, timeout=30)
        response.raise_for_status()
        sitemap_xml = response.text
        sitemap_soup = BeautifulSoup(sitemap_xml, "xml")
        urls = [loc.text for loc in sitemap_soup.find_all("loc")]

        # The WordPress index sitemap links to sub-sitemaps (*.xml);
        # anything else is treated as a page URL.
        sitemap_urls = [url for url in urls if url.endswith(".xml")]
        page_urls = [url for url in urls if not url.endswith(".xml")]

        for sub_sitemap_url in tqdm(sitemap_urls, desc="Parsing sub-sitemaps"):
            try:
                response = requests.get(sub_sitemap_url, headers=headers, timeout=30)
                response.raise_for_status()
                sub_sitemap_xml = response.text
                sub_sitemap_soup = BeautifulSoup(sub_sitemap_xml, "xml")
                page_urls.extend([loc.text for loc in sub_sitemap_soup.find_all("loc")])
            except requests.RequestException as e:
                print(f"Skipped sub-sitemap {sub_sitemap_url}: {e}")

        print(f"Found {len(page_urls)} URLs to process.")
        for url in tqdm(page_urls, desc="Downloading pages"):
            try:
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                doc = Document(
                    page_content=response.text,
                    metadata={"source": url, "loc": url}
                )
                docs.append(doc)
            except requests.RequestException as e:
                print(f"Skipped page {url}: {e}")
    except requests.RequestException as e:
        print(f"Critical error: failed to download the main sitemap: {e}")

    if not docs:
        print("No documents were loaded.")
        return None

    processed_docs = process_documents(docs)
    print(f"\nProcessed {len(processed_docs)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(processed_docs)

    # Metadata validation: count chunks missing any of the required keys
    required_meta_keys = ["source", "title", "published_time"]
    missing_counts = {k: 0 for k in required_meta_keys}
    for chunk in chunks:
        md = chunk.metadata or {}
        for k in required_meta_keys:
            if not md.get(k):
                missing_counts[k] += 1
    print(f"Number of chunks: {len(chunks)}")
    print("Missing metadata:", missing_counts)

    # Add the chunks to the database in batches
    batch_size = 1000
    for i in range(0, len(chunks), batch_size):
        baza.add_documents(documents=chunks[i:i + batch_size])

    print("The database has been initialized successfully.")
    return baza
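

# Minimal usage sketch (an addition, not part of the original module): it assumes
# OPENAI_API_KEY is set in the environment and that the lxml parser is installed.
if __name__ == "__main__":
    db = initialize_database(persist_directory="./szuflada", clear_existing=True)
    if db is not None:
        # Quick smoke test: similarity search against the freshly built collection
        # (the query string below is just a placeholder).
        for hit in db.similarity_search("przykładowe zapytanie", k=3):
            print(hit.metadata.get("title"), "->", hit.metadata.get("source"))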