chatbot2 / load_documents.py
Nguyen5's picture
commit
ed084d7
# load_documents.py – Supabase + PDF + Paragraph-Viewer
import os
import requests
import tempfile
from supabase import create_client
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
# ===== Environment / Supabase client =====
# Both settings are read from the process environment.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_ANON_KEY = os.environ.get("SUPABASE_ANON_KEY")

# Stop at import time when either setting is unset or empty.
if not (SUPABASE_URL and SUPABASE_ANON_KEY):
    raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY")

supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)

# ===== PDF (examination regulations) in storage =====
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"

# ===== Paragraph viewer (hg_clean.html) in the "hg_viewer" bucket =====
HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
def load_hg_nrw():
    """Load the Hochschulgesetz NRW paragraphs from the ``hg_nrw`` table.

    All rows are selected, ordered by ``order_index``, and each row becomes
    one ``Document`` whose text is the row's title followed by its content.

    Returns:
        A list of ``Document`` objects with ``source``, ``paragraph`` and
        ``url`` metadata; ``url`` points at the row's anchor in the HTML
        paragraph viewer.
    """
    print(">>> Lade Hochschulgesetz NRW (§) aus Tabelle hg_nrw …")

    query = supabase.table("hg_nrw").select("*").order("order_index")
    records = query.execute().data

    def to_document(record):
        # Convert one table row into a Document.
        anchor = record["abs_id"]    # e.g. para_1
        heading = record["title"]    # e.g. § 1 (Fn 44) Geltungsbereich
        body = record["content"]     # full text including footnotes
        return Document(
            page_content=f"{heading}\n{body}",
            metadata={
                "source": "Hochschulgesetz NRW",
                "paragraph": heading,
                # The HTML viewer holds <div id="para_1">…</div>, so the
                # row id is used as the URL fragment.
                "url": f"{HG_HTML_URL}#{anchor}",
            },
        )

    docs = [to_document(record) for record in records]

    print(f"✔ {len(docs)} Paragraphen geladen.\n")
    return docs
def load_pdf():
    """Download the examination-regulations PDF and load it page by page.

    The file at ``PDF_URL`` is fetched over HTTP, written to a temporary
    file that is handed to ``PyPDFLoader`` by path, and parsed into page
    documents. The temporary file is removed again before returning.

    Returns:
        The list returned by ``PyPDFLoader(...).load()``; every entry gets
        ``source``, ``page`` (0-based position in that list) and ``pdf_url``
        written into its metadata.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        requests.Timeout: If the server does not respond within the timeout.
    """
    print(">>> Lade Prüfungsordnung PDF …")

    # Without a timeout a stalled connection would block the loader forever.
    resp = requests.get(PDF_URL, timeout=30)
    resp.raise_for_status()

    # delete=False keeps the file on disk after it is closed so the loader
    # can reopen it by name; that also means we must remove it ourselves.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp:
            tmp.write(resp.content)
        pages = PyPDFLoader(tmp.name).load()
    finally:
        # Clean up on success and on failure so repeated runs do not pile up
        # PDF copies in the temp directory.
        os.unlink(tmp.name)

    for i, p in enumerate(pages):
        p.metadata["source"] = "Prüfungsordnung (PDF)"
        p.metadata["page"] = i
        p.metadata["pdf_url"] = PDF_URL

    print(f"✔ {len(pages)} PDF-Seiten geladen.\n")
    return pages
def load_documents():
    """Load every document source and return them as one list.

    The Hochschulgesetz paragraphs come first, followed by the PDF pages.

    Returns:
        A single list holding the results of ``load_hg_nrw()`` and then
        ``load_pdf()``.
    """
    # Unpacking evaluates left to right, so the table is read before the PDF.
    combined = [*load_hg_nrw(), *load_pdf()]
    print(f"✔ DOCUMENTS LOADED: {len(combined)}\n")
    return combined
if __name__ == "__main__":
    # Manual smoke run: load everything and show the first document.
    loaded = load_documents()
    first_doc = loaded[0]
    print("Example doc:", first_doc)