""" LOAD_DOCUMENTS – SINGLE SOURCE OF TRUTH Nhiệm vụ: 1) Lade Prüfungsordnung PDF direkt aus Supabase-Storage. 2) Lade Hochschulgesetz NRW aus Supabase-Tabelle hg_nrw. 3) Cung cấp metadata đầy đủ để các file khác KHÔNG PHẢI tính lại URL. """ import os import tempfile from dotenv import load_dotenv from langchain_community.document_loaders import PyPDFLoader from langchain_core.documents import Document from supabase import create_client load_dotenv() import urllib.parse # ===== Supabase config ===== SUPABASE_URL = os.getenv("SUPABASE_URL") SUPABASE_SERVICE_ROLE = os.getenv("SUPABASE_SERVICE_ROLE") supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE) # ===== Storage Config ===== #import urllib.parse PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf" PDF_BUCKET = "File PDF" ENC_BUCKET = urllib.parse.quote(PDF_BUCKET) # "File%20PDF" #PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{PDF_BUCKET}/{PDF_FILE}" PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{ENC_BUCKET}/{PDF_FILE}" # ===== Viewer URL ===== HG_VIEWER_BUCKET = "hg_viewer" HG_VIEWER_FILE = "hg_clean.html" HG_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/{HG_VIEWER_BUCKET}/{HG_VIEWER_FILE}" # ============================================================ # 1) PDF aus Supabase laden # ============================================================ def load_pdf_from_supabase() -> list[Document]: print("📥 Lade Prüfungsordnung PDF aus Supabase...") response = supabase.storage.from_(PDF_BUCKET).download(PDF_FILE) if response is None: raise ValueError("❌ Konnte PDF nicht laden!") # Temporäre Datei with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp: tmp.write(response) temp_pdf_path = tmp.name pages = PyPDFLoader(temp_pdf_path).load() for i, p in enumerate(pages): p.metadata = { "type": "pdf", "source": "Prüfungsordnung", "page": i, "pdf_url": f"{PDF_URL}#page={i}", "filename": PDF_FILE, } print(f"✔ {len(pages)} PDF-Seiten geladen.") return pages # 
# ============================================================
# 2) Load HG from table
# ============================================================
def load_hg_from_supabase() -> list[Document]:
    """Read the ``hg_nrw`` table and wrap every row in a ``Document``.

    Rows are requested in ascending ``order_index`` order. Each row's
    ``content`` becomes the page content; ``abs_id`` and ``title`` are copied
    into the metadata together with a viewer URL whose fragment is the
    row's ``abs_id``.

    Returns:
        One ``Document`` per returned row; an empty list when the query
        yields no data.
    """
    print("📥 Lade Hochschulgesetz NRW aus Tabelle hg_nrw...")

    query = supabase.table("hg_nrw").select("*").order("order_index", desc=False)
    result = query.execute()

    def as_document(entry) -> Document:
        # Read the three columns up front, in this order.
        paragraph_id, heading, text = entry["abs_id"], entry["title"], entry["content"]
        return Document(
            page_content=text,
            metadata={
                "type": "hg",
                "source": "Hochschulgesetz NRW",
                "abs_id": paragraph_id,
                "title": heading,
                "viewer_url": f"{HG_VIEWER_URL}#{paragraph_id}",
            },
        )

    # ``result.data`` may be falsy; treat that as "no rows".
    hg_documents = [as_document(entry) for entry in (result.data or [])]

    print(f"✔ {len(hg_documents)} HG-Absätze geladen.")
    return hg_documents


# ============================================================
# 3) LOAD EVERYTHING
# ============================================================
def load_all_documents():
    """Return the PDF page documents followed by the HG paragraph documents."""
    # Operands are evaluated left to right: the PDF is loaded before the table.
    return load_pdf_from_supabase() + load_hg_from_supabase()


if __name__ == "__main__":
    # Manual smoke test: load everything and show one sample metadata dict.
    loaded = load_all_documents()
    print("📚 Gesamt:", len(loaded))
    print("🔎 Beispiel metadata:", loaded[0].metadata)