from bs4 import BeautifulSoup
import re
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
import requests
from tqdm import tqdm
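# Packages assumed by the imports above: beautifulsoup4, lxml, langchain-chroma,
# langchain-openai, langchain, requests, tqdm. OpenAIEmbeddings also expects
# the OPENAI_API_KEY environment variable to be set.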

def process_documents(docs: list[Document]) -> list[Document]:
    """
    Process a list of raw HTML documents, extracting the main content and metadata.
    """
    processed_docs = []
    for doc in docs:
        soup = BeautifulSoup(doc.page_content, "lxml")

        # Extract the main content (prefer the <article> element when present)
        article = soup.find("article")
        if article:
            content = article.get_text(separator="\n", strip=True)
        else:
            content = soup.get_text(separator="\n", strip=True)

        # Extract metadata
        metadata = doc.metadata.copy()
        
        # Title from the <title> tag
        if soup.title:
            title_text = soup.title.get_text(strip=True)
            if title_text:
                metadata["title"] = title_text

        # Publication date (meta tag, then <time> element, then text fallback)
        pub_date_tag = soup.find("meta", property="article:published_time")
        if pub_date_tag and pub_date_tag.get("content"):
            metadata["published_time"] = pub_date_tag["content"]
        else:
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                metadata["published_time"] = time_tag.get("datetime")
            elif time_tag and time_tag.get_text(strip=True):
                metadata["published_time"] = time_tag.get_text(strip=True)
            else:
                # Fall back to a dated "Opublikowano ..." (Polish for "Published ...") phrase in the page text
                text = soup.get_text(separator="\n", strip=True)
                m = re.search(r"Opublikowano(?: w dniu)?[:\s]+([0-9]{1,2}\s+\w+\s+\d{4})", text, re.IGNORECASE)
                if m:
                    metadata["published_time"] = m.group(1)

        # Categories from article:section meta tags
        categories = [
            tag["content"]
            for tag in soup.find_all("meta", property="article:section")
            if tag.get("content")
        ]
        if categories:
            metadata["categories"] = ", ".join(categories)

        # Keywords from article:tag meta tags
        keywords = [
            tag["content"]
            for tag in soup.find_all("meta", property="article:tag")
            if tag.get("content")
        ]
        if keywords:
            metadata["keywords"] = ", ".join(keywords)

        processed_docs.append(Document(page_content=content, metadata=metadata))
    return processed_docs
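
# Illustrative behaviour of process_documents on a tiny synthetic page
# (the HTML snippet is hypothetical, not content from mojaszuflada.pl):
#
#     html = "<html><head><title>Post</title></head><body><article>Hello</article></body></html>"
#     [d] = process_documents([Document(page_content=html, metadata={"source": "x"})])
#     d.page_content  == "Hello"
#     d.metadata      == {"source": "x", "title": "Post"}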

def initialize_database(persist_directory="./szuflada", clear_existing=True):
    """
    Initialize a Chroma database with content crawled from mojaszuflada.pl.
    """
    embedder = OpenAIEmbeddings(model="text-embedding-3-small", show_progress_bar=True)
    baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)

    if clear_existing:
        print("Clearing the existing collection in the database...")
        try:
            baza.delete_collection()
            print("Collection cleared.")
            baza = Chroma(collection_name="szuflada", embedding_function=embedder, persist_directory=persist_directory)
        except Exception as e:
            print(f"Could not clear the collection (it may not have existed): {e}")

    print("Downloading and parsing the sitemap...")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    sitemap_url = "https://mojaszuflada.pl/wp-sitemap.xml"
    docs = []

    try:
        # Use an explicit timeout so a stalled request cannot hang the crawl
        response = requests.get(sitemap_url, headers=headers, timeout=30)
        response.raise_for_status()
        sitemap_xml = response.text

        sitemap_soup = BeautifulSoup(sitemap_xml, "xml")
        urls = [loc.text for loc in sitemap_soup.find_all("loc")]

        sitemap_urls = [url for url in urls if url.endswith(".xml")]
        page_urls = [url for url in urls if not url.endswith(".xml")]

        for sub_sitemap_url in tqdm(sitemap_urls, desc="Parsing sub-sitemaps"):
            try:
                response = requests.get(sub_sitemap_url, headers=headers, timeout=30)
                response.raise_for_status()
                sub_sitemap_xml = response.text
                sub_sitemap_soup = BeautifulSoup(sub_sitemap_xml, "xml")
                page_urls.extend([loc.text for loc in sub_sitemap_soup.find_all("loc")])
            except requests.RequestException as e:
                print(f"Skipped sub-sitemap {sub_sitemap_url}: {e}")

        print(f"Found {len(page_urls)} URLs to process.")

        for url in tqdm(page_urls, desc="Downloading pages"):
            try:
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                doc = Document(
                    page_content=response.text,
                    metadata={"source": url, "loc": url}
                )
                docs.append(doc)
            except requests.RequestException as e:
                print(f"Skipped page {url}: {e}")

    except requests.RequestException as e:
        print(f"Critical error: failed to download the main sitemap: {e}")

    if not docs:
        print("No documents were loaded.")
        return None

    processed_docs = process_documents(docs)
    print(f"\nProcessed {len(processed_docs)} documents.")

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(processed_docs)

    # Metadata validation: count chunks missing any required key
    required_meta_keys = ["source", "title", "published_time"]
    missing_counts = {k: 0 for k in required_meta_keys}
    for chunk in chunks:
        md = chunk.metadata or {}
        for k in required_meta_keys:
            if not md.get(k):
                missing_counts[k] += 1

    print(f"Number of chunks: {len(chunks)}")
    print("Missing metadata counts:", missing_counts)

    # Add chunks to the database in batches
    batch_size = 1000
    for i in range(0, len(chunks), batch_size):
        baza.add_documents(documents=chunks[i:i + batch_size])

    print("Database initialized successfully.")
    return baza
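
# Minimal usage sketch. Assumes OPENAI_API_KEY is set in the environment;
# the query string is illustrative only.
if __name__ == "__main__":
    db = initialize_database()
    if db is not None:
        # Sanity check: print metadata for the chunks most similar to a sample query
        for doc in db.similarity_search("sample query", k=3):
            print(doc.metadata.get("title"), "-", doc.metadata.get("source"))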