""" BƯỚC 1: LOAD DOCUMENTS ----------------------- Debug-full version - Lädt Prüfungsordnung (PDF) seitenweise. - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML, und zerlegt es in einzelne Absätze (Document pro
). """ from huggingface_hub import hf_hub_download, list_repo_files from langchain_community.document_loaders import PyPDFLoader from langchain_core.documents import Document from bs4 import BeautifulSoup DATASET = "Nguyen5/docs" PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf" HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py def _load_hg_paragraph_documents(html_path: str): """ Liest das generierte Hochschulgesetz-HTML ein und erzeugt pro
-Element einen LangChain-Document mit: - page_content = Text des Absatzes - metadata: source = "Hochschulgesetz NRW (HTML)" filename = HTML_FILE paragraph_id = id-Attribut (z.B. 'hg_abs_12'), falls vorhanden """ with open(html_path, "r", encoding="utf-8") as f: html = f.read() soup = BeautifulSoup(html, "html.parser") docs = [] for p in soup.find_all("p"): text = p.get_text(" ", strip=True) if not text: continue pid = p.get("id") metadata = { "source": "Hochschulgesetz NRW (HTML)", "filename": HTML_FILE, } if pid: metadata["paragraph_id"] = pid docs.append(Document(page_content=text, metadata=metadata)) print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n") return docs def load_documents(): print("=== START: load_documents() ===\n") # ------------------------- # Check files in dataset # ------------------------- print(">>> Checking dataset file list from HuggingFace...") files = list_repo_files(DATASET, repo_type="dataset") print("Files in dataset:", files, "\n") docs = [] # ------------------------- # Load PDF # ------------------------- print(">>> Step 1: Download PDF from HuggingFace...") try: pdf_path = hf_hub_download( repo_id=DATASET, filename=PDF_FILE, repo_type="dataset", ) print(f"Downloaded PDF to local cache:\n{pdf_path}\n") except Exception as e: print("ERROR downloading PDF:", e) return [] print(">>> Step 1.1: Loading PDF pages...") try: pdf_docs = PyPDFLoader(pdf_path).load() print(f"Loaded {len(pdf_docs)} PDF pages.\n") except Exception as e: print("ERROR loading PDF:", e) return [] for d in pdf_docs: d.metadata["source"] = "Prüfungsordnung (PDF)" d.metadata["filename"] = PDF_FILE docs.extend(pdf_docs) # ------------------------- # Load HTML (Hochschulgesetz NRW) # ------------------------- print(">>> Step 2: Download HTML from HuggingFace...") try: html_path = hf_hub_download( repo_id=DATASET, filename=HTML_FILE, repo_type="dataset", ) print(f"Downloaded HTML to local cache:\n{html_path}\n") except Exception as e: print("ERROR downloading HTML:", e) return docs print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...") try: html_docs = _load_hg_paragraph_documents(html_path) except Exception as e: print("ERROR loading / parsing HTML:", e) return docs docs.extend(html_docs) print("=== DONE: load_documents() ===\n") return docs if __name__ == "__main__": print("\n=== Running load_documents.py directly ===\n") docs = load_documents() print(f"\n>>> TOTAL documents loaded: {len(docs)}") if len(docs): print("\nExample metadata from 1st document:") print(docs[0].metadata)