|
|
""" |
|
|
BƯỚC 1: LOAD DOCUMENTS |
|
|
----------------------- |
|
|
Debug-full version |
|
|
|
|
|
- Lädt Prüfungsordnung (PDF) seitenweise. |
|
|
- Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML, |
|
|
und zerlegt es in einzelne Absätze (Document pro <p>). |
|
|
""" |
|
|
|
|
|
from huggingface_hub import hf_hub_download, list_repo_files |
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
from langchain_core.documents import Document |
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
DATASET = "Nguyen5/docs" |
|
|
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf" |
|
|
HTML_FILE = "Hochschulgesetz_NRW.html" |
|
|
|
|
|
def _load_hg_paragraph_documents(html_path: str):
    """
    Parse the generated Hochschulgesetz HTML file and build one
    LangChain Document per <p> element.

    Each Document carries:
    - page_content: the paragraph text
    - metadata:
        source       = "Hochschulgesetz NRW (HTML)"
        filename     = HTML_FILE
        paragraph_id = the <p> id attribute (e.g. 'hg_abs_12'), if present
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        markup = handle.read()

    parsed = BeautifulSoup(markup, "html.parser")
    documents = []

    for paragraph in parsed.find_all("p"):
        content = paragraph.get_text(" ", strip=True)
        # Skip paragraphs that render to nothing (layout/empty tags).
        if not content:
            continue

        meta = {
            "source": "Hochschulgesetz NRW (HTML)",
            "filename": HTML_FILE,
        }
        paragraph_id = paragraph.get("id")
        if paragraph_id:
            meta["paragraph_id"] = paragraph_id

        documents.append(Document(page_content=content, metadata=meta))

    print(f"Loaded {len(documents)} paragraph Documents from HG-HTML.\n")
    return documents
|
|
|
|
|
def load_documents():
    """
    Load all source documents for the pipeline.

    Returns a list of LangChain Documents consisting of:
    - one Document per page of the Prüfungsordnung PDF, and
    - one Document per <p> paragraph of the Hochschulgesetz HTML.

    Every step prints debug output. A failure in one source is reported
    and yields no documents for that source, but no longer prevents the
    other (independent) source from being loaded.
    """
    print("=== START: load_documents() ===\n")

    print(">>> Checking dataset file list from HuggingFace...")
    files = list_repo_files(DATASET, repo_type="dataset")
    print("Files in dataset:", files, "\n")

    docs = []
    docs.extend(_load_pdf_documents())
    docs.extend(_load_html_documents())

    print("=== DONE: load_documents() ===\n")
    return docs


def _load_pdf_documents():
    """Download and load the Prüfungsordnung PDF; return [] on any error."""
    print(">>> Step 1: Download PDF from HuggingFace...")
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as e:
        print("ERROR downloading PDF:", e)
        return []

    print(">>> Step 1.1: Loading PDF pages...")
    try:
        pdf_docs = PyPDFLoader(pdf_path).load()
        print(f"Loaded {len(pdf_docs)} PDF pages.\n")
    except Exception as e:
        print("ERROR loading PDF:", e)
        return []

    # Tag every page so retrieval hits can be attributed to this source.
    for d in pdf_docs:
        d.metadata["source"] = "Prüfungsordnung (PDF)"
        d.metadata["filename"] = PDF_FILE

    return pdf_docs


def _load_html_documents():
    """Download and parse the Hochschulgesetz HTML; return [] on any error."""
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as e:
        print("ERROR downloading HTML:", e)
        return []

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        return _load_hg_paragraph_documents(html_path)
    except Exception as e:
        print("ERROR loading / parsing HTML:", e)
        return []
|
|
|
|
|
if __name__ == "__main__": |
|
|
print("\n=== Running load_documents.py directly ===\n") |
|
|
docs = load_documents() |
|
|
print(f"\n>>> TOTAL documents loaded: {len(docs)}") |
|
|
|
|
|
if len(docs): |
|
|
print("\nExample metadata from 1st document:") |
|
|
print(docs[0].metadata) |
|
|
|
|
|
|