|
|
""" |
|
|
LOAD_DOCUMENTS – SINGLE SOURCE OF TRUTH |
|
|
|
|
|
Nhiệm vụ: |
|
|
1) Lade Prüfungsordnung PDF direkt aus Supabase-Storage. |
|
|
2) Lade Hochschulgesetz NRW aus Supabase-Tabelle hg_nrw. |
|
|
3) Cung cấp metadata đầy đủ để các file khác KHÔNG PHẢI tính lại URL. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import tempfile |
|
|
from dotenv import load_dotenv |
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
from langchain_core.documents import Document |
|
|
from supabase import create_client |
|
|
|
|
|
load_dotenv() |
|
|
|
|
|
import urllib.parse |
|
|
|
|
|
|
|
|
# --- Supabase connection ----------------------------------------------------
# Both settings are read from the process environment; either may be None
# when the variable is not set.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_ROLE = os.environ.get("SUPABASE_SERVICE_ROLE")

# Shared client used by the loader functions in this module.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)


def _public_object_url(bucket_segment: str, object_name: str) -> str:
    """Return the public Storage URL of an object.

    Both path segments are inserted verbatim, so the caller is responsible
    for percent-encoding them where necessary.
    """
    return f"{SUPABASE_URL}/storage/v1/object/public/{bucket_segment}/{object_name}"


# --- Examination regulations (PDF) -------------------------------------------
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
PDF_BUCKET = "File PDF"
# The bucket name contains a space, so it is percent-encoded for use in URLs.
ENC_BUCKET = urllib.parse.quote(PDF_BUCKET)
PDF_URL = _public_object_url(ENC_BUCKET, PDF_FILE)

# --- Hochschulgesetz HTML viewer ---------------------------------------------
HG_VIEWER_BUCKET = "hg_viewer"
HG_VIEWER_FILE = "hg_clean.html"
HG_VIEWER_URL = _public_object_url(HG_VIEWER_BUCKET, HG_VIEWER_FILE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_pdf_from_supabase() -> list[Document]:
    """Download the examination-regulations PDF from Supabase Storage and load its pages.

    The downloaded bytes are written to a temporary file because the PDF
    loader is given a file path; the temporary file is removed again once
    loading has finished (successfully or not).

    Returns:
        One ``Document`` per PDF page. The metadata of every page is replaced
        by ``type``, ``source``, ``page`` (0-based index), ``pdf_url`` (public
        URL with a 1-based ``#page=`` fragment) and ``filename``.

    Raises:
        ValueError: If the download yields no data.
    """
    print("📥 Lade Prüfungsordnung PDF aus Supabase...")

    response = supabase.storage.from_(PDF_BUCKET).download(PDF_FILE)
    # An empty payload is just as unusable as a missing one.
    if not response:
        raise ValueError("❌ Konnte PDF nicht laden!")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    temp_pdf_path = tmp.name
    try:
        # Close the handle before parsing so the loader can reopen the path
        # on every platform.
        with tmp:
            tmp.write(response)
        pages = PyPDFLoader(temp_pdf_path).load()
    finally:
        # delete=False means nobody else removes the file; clean it up here.
        try:
            os.remove(temp_pdf_path)
        except OSError:
            # Best-effort cleanup: a leftover temp file must not mask the
            # result or the original error.
            pass

    for i, p in enumerate(pages):
        p.metadata = {
            "type": "pdf",
            "source": "Prüfungsordnung",
            "page": i,
            # The "#page=" open parameter counts pages from 1, while the
            # enumerate index is 0-based.
            "pdf_url": f"{PDF_URL}#page={i + 1}",
            "filename": PDF_FILE,
        }

    print(f"✔ {len(pages)} PDF-Seiten geladen.")
    return pages
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_hg_from_supabase() -> list[Document]:
    """Read the Hochschulgesetz NRW paragraphs from the ``hg_nrw`` table.

    All columns are selected and the rows are requested in ascending
    ``order_index`` order. Every row is expected to provide ``abs_id``,
    ``title`` and ``content``.

    Returns:
        One ``Document`` per row, with ``content`` as the page content and
        metadata ``type``, ``source``, ``abs_id``, ``title`` and
        ``viewer_url`` (the viewer URL with ``#<abs_id>`` appended).
    """
    print("📥 Lade Hochschulgesetz NRW aus Tabelle hg_nrw...")

    query = supabase.table("hg_nrw").select("*").order("order_index", desc=False)
    # A missing/empty result is treated as "no rows".
    records = query.execute().data or []

    def _to_document(record) -> Document:
        """Convert one table row into a Document."""
        paragraph_id = record["abs_id"]
        heading = record["title"]
        body = record["content"]
        return Document(
            page_content=body,
            metadata={
                "type": "hg",
                "source": "Hochschulgesetz NRW",
                "abs_id": paragraph_id,
                "title": heading,
                # The fragment addresses the paragraph inside the HTML viewer.
                "viewer_url": f"{HG_VIEWER_URL}#{paragraph_id}",
            },
        )

    documents = [_to_document(record) for record in records]

    print(f"✔ {len(documents)} HG-Absätze geladen.")
    return documents
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_all_documents():
    """Load both sources and return them as one list.

    The PDF pages come first, followed by the Hochschulgesetz paragraphs;
    the PDF is also loaded first.
    """
    return [*load_pdf_from_supabase(), *load_hg_from_supabase()]
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: load everything and print a short summary.
    docs = load_all_documents()
    print("📚 Gesamt:", len(docs))
    # Only show a sample when something was loaded; indexing an empty list
    # would raise IndexError.
    if docs:
        print("🔎 Beispiel metadata:", docs[0].metadata)
|
|
|