"""
BƯỚC 1: LOAD DOCUMENTS
-----------------------
Debug-full version

- Lädt Prüfungsordnung (PDF) seitenweise.
- Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
  und zerlegt es in einzelne Absätze (Document pro <p>).
"""

from huggingface_hub import hf_hub_download, list_repo_files
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from bs4 import BeautifulSoup

# Hugging Face repository id; it is always accessed with repo_type="dataset".
DATASET = "Nguyen5/docs"
# File name of the Prüfungsordnung (examination regulations) PDF in the dataset.
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
# File name of the Hochschulgesetz NRW HTML in the dataset.
HTML_FILE = "Hochschulgesetz_NRW.html"  # consistent with hg_nrw.py

def _load_hg_paragraph_documents(html_path: str):
    """
    Read the generated Hochschulgesetz HTML file and build one LangChain
    Document per non-empty <p> element.

    Each Document carries:
      - page_content = text of the paragraph (text fragments stripped and
                       joined with single spaces)
      - metadata:
          source       = "Hochschulgesetz NRW (HTML)"
          filename     = HTML_FILE
          paragraph_id = the element's id attribute (e.g. 'hg_abs_12'),
                         only set when the attribute is present and non-empty

    Prints the number of Documents created and returns them as a list in
    document order.
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        markup = handle.read()

    parsed = BeautifulSoup(markup, "html.parser")

    # Metadata shared by every paragraph; copied per Document so that the
    # individual metadata dicts stay independent of each other.
    base_metadata = {
        "source": "Hochschulgesetz NRW (HTML)",
        "filename": HTML_FILE,
    }

    paragraph_docs = []
    for tag in parsed.find_all("p"):
        content = tag.get_text(" ", strip=True)
        if content:
            meta = dict(base_metadata)
            element_id = tag.get("id")
            if element_id:
                meta["paragraph_id"] = element_id
            paragraph_docs.append(Document(page_content=content, metadata=meta))

    print(f"Loaded {len(paragraph_docs)} paragraph Documents from HG-HTML.\n")
    return paragraph_docs

def load_documents():
    """
    Download both source documents from the Hugging Face dataset and return
    them as one list of LangChain Documents.

    Steps:
      0. List the files in the dataset (diagnostic output only).
      1. Download the Prüfungsordnung PDF and load it with PyPDFLoader; every
         resulting Document gets ``source`` and ``filename`` metadata.
      2. Download the Hochschulgesetz NRW HTML and split it into one
         Document per paragraph.

    Returns:
        list: the PDF Documents followed by the HTML paragraph Documents.
        If downloading or loading the PDF fails, an empty list is returned
        and the HTML step is skipped. If downloading or parsing the HTML
        fails, only the PDF Documents are returned.

    Errors are reported via ``print`` and never raised to the caller.
    """
    print("=== START: load_documents() ===\n")

    # -------------------------
    # Check files in dataset
    # -------------------------
    print(">>> Checking dataset file list from HuggingFace...")
    # The listing is purely diagnostic, so a failure here must not abort the
    # actual loading; report it and carry on like the other steps do.
    try:
        files = list_repo_files(DATASET, repo_type="dataset")
        print("Files in dataset:", files, "\n")
    except Exception as e:
        print("ERROR listing dataset files:", e)

    docs = []

    # -------------------------
    # Load PDF
    # -------------------------
    print(">>> Step 1: Download PDF from HuggingFace...")
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as e:
        print("ERROR downloading PDF:", e)
        return []

    print(">>> Step 1.1: Loading PDF pages...")
    try:
        pdf_docs = PyPDFLoader(pdf_path).load()
        print(f"Loaded {len(pdf_docs)} PDF pages.\n")
    except Exception as e:
        print("ERROR loading PDF:", e)
        return []

    # Overwrite the loader's own "source" with a human-readable label and
    # record the dataset file name alongside it.
    for d in pdf_docs:
        d.metadata["source"] = "Prüfungsordnung (PDF)"
        d.metadata["filename"] = PDF_FILE

    docs.extend(pdf_docs)

    # -------------------------
    # Load HTML (Hochschulgesetz NRW)
    # -------------------------
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as e:
        print("ERROR downloading HTML:", e)
        # Best effort: still hand back the PDF Documents loaded so far.
        return docs

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        html_docs = _load_hg_paragraph_documents(html_path)
    except Exception as e:
        print("ERROR loading / parsing HTML:", e)
        # Best effort: still hand back the PDF Documents loaded so far.
        return docs

    docs.extend(html_docs)

    print("=== DONE: load_documents() ===\n")
    return docs

if __name__ == "__main__":
    # Manual smoke run: load everything and show a short summary.
    print("\n=== Running load_documents.py directly ===\n")
    docs = load_documents()
    total = len(docs)
    print(f"\n>>> TOTAL documents loaded: {total}")

    # Only show sample metadata when at least one document was loaded.
    if docs:
        print("\nExample metadata from 1st document:")
        first_doc = docs[0]
        print(first_doc.metadata)