# load_documents.py – Supabase + PDF + Paragraph-Viewer
"""Load the documents used downstream as LangChain ``Document`` objects.

Two sources are combined:

* the rows of the Supabase table ``hg_nrw`` (one document per row), and
* the pages of a PDF stored in a public Supabase storage bucket.

Importing this module reads ``SUPABASE_URL`` and ``SUPABASE_ANON_KEY`` from
the environment and raises ``RuntimeError`` if either one is missing.
"""
import contextlib
import os
import tempfile
from typing import List

import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from supabase import create_client

# ===== ENV =====
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_ANON_KEY = os.getenv("SUPABASE_ANON_KEY")

if not SUPABASE_URL or not SUPABASE_ANON_KEY:
    raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY")

supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)

# ===== PDF (examination regulations) in Storage =====
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"

# ===== Paragraph viewer (hg_clean.html) in the bucket "hg_viewer" =====
HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"

# Upper bound (seconds) for the PDF download so a stalled connection cannot
# block the loader forever; requests has no timeout by default.
HTTP_TIMEOUT_SECONDS = 30


def load_hg_nrw() -> List[Document]:
    """Return one ``Document`` per row of the ``hg_nrw`` table.

    Rows are fetched ordered by ``order_index``. Each document's text is the
    row's ``title`` and ``content`` joined by a newline; its metadata carries
    a fixed ``source`` label, the title as ``paragraph`` and a ``url`` that
    points at the HTML viewer with the row's ``abs_id`` as fragment.

    Raises:
        KeyError: If a row lacks ``abs_id``, ``title`` or ``content``.
    """
    print(">>> Lade Hochschulgesetz NRW (§) aus Tabelle hg_nrw …")

    response = (
        supabase.table("hg_nrw")
        .select("*")
        .order("order_index")
        .execute()
    )
    # Treat a missing payload like an empty table instead of failing on iteration.
    rows = response.data or []

    docs = []
    for r in rows:
        abs_id = r["abs_id"]    # e.g. para_1
        title = r["title"]      # e.g. § 1 (Fn 44) Geltungsbereich
        content = r["content"]  # full text incl. footnotes

        # HTML viewer: the fragment is the row's abs_id.
        viewer_url = f"{HG_HTML_URL}#{abs_id}"

        docs.append(
            Document(
                page_content=f"{title}\n{content}",
                metadata={
                    "source": "Hochschulgesetz NRW",
                    "paragraph": title,
                    "url": viewer_url,
                },
            )
        )

    print(f"✔ {len(docs)} Paragraphen geladen.\n")
    return docs


def load_pdf() -> List[Document]:
    """Download the PDF at ``PDF_URL`` and return its pages as documents.

    The PDF is written to a temporary file, parsed with ``PyPDFLoader`` and
    the temporary file is removed again afterwards. Every page's metadata is
    given a fixed ``source`` label, its 0-based index as ``page`` and the
    download URL as ``pdf_url``.

    Raises:
        requests.RequestException: If the download fails, times out or
            returns an HTTP error status.
    """
    print(">>> Lade Prüfungsordnung PDF …")

    resp = requests.get(PDF_URL, timeout=HTTP_TIMEOUT_SECONDS)
    resp.raise_for_status()

    # delete=False so the loader can reopen the file by name after this
    # handle is closed; the file is removed explicitly below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(resp.content)
        path = tmp.name

    try:
        # load() returns a fully built list, so the file is no longer needed
        # once it has returned.
        pages = PyPDFLoader(path).load()
    finally:
        # Best-effort cleanup: failing to remove the temp file must not mask
        # the load result or the original exception.
        with contextlib.suppress(OSError):
            os.remove(path)

    for i, p in enumerate(pages):
        p.metadata["source"] = "Prüfungsordnung (PDF)"
        p.metadata["page"] = i
        p.metadata["pdf_url"] = PDF_URL

    print(f"✔ {len(pages)} PDF-Seiten geladen.\n")
    return pages


def load_documents() -> List[Document]:
    """Return the ``hg_nrw`` documents followed by the PDF page documents."""
    docs = []
    docs.extend(load_hg_nrw())
    docs.extend(load_pdf())
    print(f"✔ DOCUMENTS LOADED: {len(docs)}\n")
    return docs


if __name__ == "__main__":
    d = load_documents()
    # Guard the sample print so an empty result does not raise IndexError.
    if d:
        print("Example doc:", d[0])