File size: 3,865 Bytes
6548bf5
4da3e87
 
 
 
 
 
 
6548bf5
 
4da3e87
6548bf5
 
4da3e87
6548bf5
4da3e87
6548bf5
4da3e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6548bf5
4da3e87
 
 
 
6548bf5
4da3e87
6548bf5
4da3e87
 
 
6548bf5
4da3e87
 
6548bf5
4da3e87
6548bf5
4da3e87
 
6548bf5
4da3e87
 
6548bf5
4da3e87
 
 
 
 
 
6548bf5
 
 
4da3e87
 
 
 
 
 
 
 
 
6548bf5
4da3e87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6548bf5
4da3e87
 
 
 
 
 
6548bf5
4da3e87
6548bf5
4da3e87
 
6548bf5
4da3e87
 
 
 
6548bf5
4da3e87
 
 
6548bf5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
"""
BƯỚC 1: LOAD DOCUMENTS
-----------------------
Debug-full version

- Lädt Prüfungsordnung (PDF) seitenweise.
- Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
  und zerlegt es in einzelne Absätze (Document pro <p>).
"""

from huggingface_hub import hf_hub_download, list_repo_files
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from bs4 import BeautifulSoup

DATASET = "Nguyen5/docs"
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
HTML_FILE = "Hochschulgesetz_NRW.html"  # konsistent mit hg_nrw.py

def _load_hg_paragraph_documents(html_path: str):
    """
    Parse the generated Hochschulgesetz HTML file and build one LangChain
    Document per <p> element.

    Each resulting Document carries:
      - page_content: the paragraph text
      - metadata:
          source       = "Hochschulgesetz NRW (HTML)"
          filename     = HTML_FILE
          paragraph_id = the <p> id attribute (e.g. 'hg_abs_12'), if present
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        markup = handle.read()

    parsed = BeautifulSoup(markup, "html.parser")

    documents = []
    for paragraph in parsed.find_all("p"):
        content = paragraph.get_text(" ", strip=True)
        if not content:
            # Skip empty / whitespace-only paragraphs.
            continue

        meta = {
            "source": "Hochschulgesetz NRW (HTML)",
            "filename": HTML_FILE,
        }
        paragraph_id = paragraph.get("id")
        if paragraph_id:
            meta["paragraph_id"] = paragraph_id

        documents.append(Document(page_content=content, metadata=meta))

    print(f"Loaded {len(documents)} paragraph Documents from HG-HTML.\n")
    return documents

def load_documents():
    """
    Download and load both source documents from the HuggingFace dataset.

    Returns a list of LangChain Documents: one per PDF page of the
    Prüfungsordnung, followed by one per <p> paragraph of the
    Hochschulgesetz NRW HTML. Returns [] when the PDF cannot be fetched
    or parsed; returns only the PDF documents when just the HTML step
    fails (best-effort).
    """
    print("=== START: load_documents() ===\n")

    # Sanity check: show which files the dataset actually contains.
    print(">>> Checking dataset file list from HuggingFace...")
    available = list_repo_files(DATASET, repo_type="dataset")
    print("Files in dataset:", available, "\n")

    collected = []

    # --- PDF (Prüfungsordnung) ---------------------------------------
    print(">>> Step 1: Download PDF from HuggingFace...")
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as err:
        # The PDF is the primary document; without it there is nothing useful.
        print("ERROR downloading PDF:", err)
        return []

    print(">>> Step 1.1: Loading PDF pages...")
    try:
        pages = PyPDFLoader(pdf_path).load()
        print(f"Loaded {len(pages)} PDF pages.\n")
    except Exception as err:
        print("ERROR loading PDF:", err)
        return []

    # Tag every page so downstream retrieval can attribute its origin.
    for page in pages:
        page.metadata["source"] = "Prüfungsordnung (PDF)"
        page.metadata["filename"] = PDF_FILE
    collected.extend(pages)

    # --- HTML (Hochschulgesetz NRW) ----------------------------------
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as err:
        # PDF part succeeded, so hand back what we already have.
        print("ERROR downloading HTML:", err)
        return collected

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        paragraph_docs = _load_hg_paragraph_documents(html_path)
    except Exception as err:
        print("ERROR loading / parsing HTML:", err)
        return collected

    collected.extend(paragraph_docs)

    print("=== DONE: load_documents() ===\n")
    return collected

if __name__ == "__main__":
    print("\n=== Running load_documents.py directly ===\n")
    docs = load_documents()
    print(f"\n>>> TOTAL documents loaded: {len(docs)}")

    if len(docs):
        print("\nExample metadata from 1st document:")
        print(docs[0].metadata)