# chatbot1 / load_documents.py
# (Hugging Face page residue from the copied source: "Nguyen5's picture",
#  commit 4da3e87 — kept as a comment so the file remains valid Python.)
"""
BƯỚC 1: LOAD DOCUMENTS
-----------------------
Debug-full version
- Lädt Prüfungsordnung (PDF) seitenweise.
- Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
und zerlegt es in einzelne Absätze (Document pro <p>).
"""
from huggingface_hub import hf_hub_download, list_repo_files
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from bs4 import BeautifulSoup
DATASET = "Nguyen5/docs"
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py
def _load_hg_paragraph_documents(html_path: str):
    """Read the generated Hochschulgesetz HTML and build one LangChain
    Document per non-empty <p> element.

    Each Document carries:
      - page_content: the paragraph text (whitespace-normalized)
      - metadata:
          source       = "Hochschulgesetz NRW (HTML)"
          filename     = HTML_FILE
          paragraph_id = the <p> element's id attribute (e.g. 'hg_abs_12'),
                         included only when present
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        markup = handle.read()

    parsed = BeautifulSoup(markup, "html.parser")

    documents = []
    for paragraph in parsed.find_all("p"):
        content = paragraph.get_text(" ", strip=True)
        if not content:
            # Skip empty/whitespace-only paragraphs.
            continue
        meta = {
            "source": "Hochschulgesetz NRW (HTML)",
            "filename": HTML_FILE,
        }
        para_id = paragraph.get("id")
        if para_id:
            meta["paragraph_id"] = para_id
        documents.append(Document(page_content=content, metadata=meta))

    print(f"Loaded {len(documents)} paragraph Documents from HG-HTML.\n")
    return documents
def load_documents():
    """Download and load all source documents from the HuggingFace dataset.

    Returns a list of LangChain Documents:
      - one Document per PDF page of the Prüfungsordnung, and
      - one Document per <p> paragraph of the Hochschulgesetz NRW HTML.

    Loading is best-effort per source: an error in one source is printed
    and skipped, and whatever has been gathered so far is still returned.
    (Previously a PDF failure returned [] and silently discarded the
    still-loadable HTML source, while an HTML failure returned the PDF
    docs — the two branches are now consistent.)
    """
    print("=== START: load_documents() ===\n")

    # -------------------------
    # Check files in dataset (debug aid only — a listing failure is
    # non-fatal, the downloads below may still succeed)
    # -------------------------
    print(">>> Checking dataset file list from HuggingFace...")
    try:
        files = list_repo_files(DATASET, repo_type="dataset")
        print("Files in dataset:", files, "\n")
    except Exception as e:
        print("ERROR listing dataset files:", e)

    docs = []

    # -------------------------
    # Load PDF (Prüfungsordnung)
    # -------------------------
    print(">>> Step 1: Download PDF from HuggingFace...")
    pdf_path = None
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as e:
        print("ERROR downloading PDF:", e)

    if pdf_path is not None:
        print(">>> Step 1.1: Loading PDF pages...")
        try:
            pdf_docs = PyPDFLoader(pdf_path).load()
            print(f"Loaded {len(pdf_docs)} PDF pages.\n")
            # Tag every page so retrieval results can cite their origin.
            for d in pdf_docs:
                d.metadata["source"] = "Prüfungsordnung (PDF)"
                d.metadata["filename"] = PDF_FILE
            docs.extend(pdf_docs)
        except Exception as e:
            print("ERROR loading PDF:", e)

    # -------------------------
    # Load HTML (Hochschulgesetz NRW)
    # -------------------------
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as e:
        print("ERROR downloading HTML:", e)
        return docs

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        html_docs = _load_hg_paragraph_documents(html_path)
    except Exception as e:
        print("ERROR loading / parsing HTML:", e)
        return docs

    docs.extend(html_docs)

    print("=== DONE: load_documents() ===\n")
    return docs
if __name__ == "__main__":
print("\n=== Running load_documents.py directly ===\n")
docs = load_documents()
print(f"\n>>> TOTAL documents loaded: {len(docs)}")
if len(docs):
print("\nExample metadata from 1st document:")
print(docs[0].metadata)