|
|
|
|
|
|
|
|
import os |
|
|
import requests |
|
|
import tempfile |
|
|
from supabase import create_client |
|
|
from langchain_core.documents import Document |
|
|
from langchain_community.document_loaders import PyPDFLoader |
|
|
|
|
|
|
|
|
# --- Supabase connection ----------------------------------------------------
# Both settings are read from the environment; the module refuses to import
# when either one is unset or empty.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_ANON_KEY = os.environ.get("SUPABASE_ANON_KEY")

if not (SUPABASE_URL and SUPABASE_ANON_KEY):
    raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY")

# Shared client used by the table loader below.
supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)

# --- Public storage locations -----------------------------------------------
# File name of the PDF and its public URL in the "File PDF" bucket
# (the space in the bucket name is percent-encoded as %20).
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"

# Public URL of the HTML viewer page; loaders append "#<abs_id>" to it.
HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
|
|
|
|
|
|
|
|
def load_hg_nrw():
    """Load every row of the ``hg_nrw`` table as a ``Document``.

    Rows are fetched through the module-level ``supabase`` client, ordered
    by ``order_index``.  Each row must provide ``abs_id``, ``title`` and
    ``content``.

    Returns:
        A list with one ``Document`` per row.  The page content is the title
        followed by the content on a new line; the metadata holds a fixed
        ``source`` label, the title as ``paragraph``, and a ``url`` built
        from ``HG_HTML_URL`` with the row's ``abs_id`` as fragment.
    """
    print(">>> Lade Hochschulgesetz NRW (§) aus Tabelle hg_nrw …")

    query = supabase.table("hg_nrw").select("*").order("order_index")
    response = query.execute()

    def _as_document(row):
        # Read the three columns in a fixed order: abs_id, title, content.
        anchor, heading, body = row["abs_id"], row["title"], row["content"]
        return Document(
            page_content=f"{heading}\n{body}",
            metadata={
                "source": "Hochschulgesetz NRW",
                "paragraph": heading,
                # Fragment link into the HTML viewer page.
                "url": f"{HG_HTML_URL}#{anchor}",
            },
        )

    paragraphs = [_as_document(row) for row in response.data]

    print(f"✔ {len(paragraphs)} Paragraphen geladen.\n")
    return paragraphs
|
|
|
|
|
|
|
|
def load_pdf(timeout=30.0):
    """Download the PDF at ``PDF_URL`` and load it page by page.

    The response body is written to a temporary ``.pdf`` file because
    ``PyPDFLoader`` is handed a filesystem path.  The temporary file is
    removed again once loading has finished or failed, so repeated calls do
    not pile up files in the temp directory.

    Args:
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled server would block this call forever.

    Returns:
        The list returned by ``PyPDFLoader(...).load()``.  On every page the
        metadata keys ``source``, ``page`` (0-based position in that list)
        and ``pdf_url`` are overwritten.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    print(">>> Lade Prüfungsordnung PDF …")

    resp = requests.get(PDF_URL, timeout=timeout)
    resp.raise_for_status()

    # delete=False lets the loader reopen the file by name after it has been
    # closed; cleanup is therefore done explicitly in the ``finally`` below.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp:
            tmp.write(resp.content)
        # The file is closed here, before the loader opens it by path.
        pages = PyPDFLoader(tmp.name).load()
    finally:
        os.remove(tmp.name)

    for i, p in enumerate(pages):
        p.metadata["source"] = "Prüfungsordnung (PDF)"
        p.metadata["page"] = i
        p.metadata["pdf_url"] = PDF_URL

    print(f"✔ {len(pages)} PDF-Seiten geladen.\n")
    return pages
|
|
|
|
|
|
|
|
def load_documents():
    """Load all sources and return them as one combined list.

    Calls ``load_hg_nrw()`` first and ``load_pdf()`` second; the returned
    list keeps that order.

    Returns:
        A new list containing the items of both loaders.
    """
    combined = [*load_hg_nrw(), *load_pdf()]
    print(f"✔ DOCUMENTS LOADED: {len(combined)}\n")
    return combined
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: load everything and show the first document.
    d = load_documents()
    if not d:
        # Indexing an empty list would only produce a bare IndexError;
        # exit with an explicit message (and a non-zero status) instead.
        raise SystemExit("No documents were loaded.")
    print("Example doc:", d[0])
|
|
|