chatbot / load_documents.py
Nguyen5's picture
commit
99100eb
"""
LOAD_DOCUMENTS – SINGLE SOURCE OF TRUTH
Responsibilities:
1) Load the examination regulations (Prüfungsordnung) PDF directly from Supabase Storage.
2) Load the NRW Higher Education Act (Hochschulgesetz NRW) from the Supabase table hg_nrw.
3) Provide complete metadata so that other files do NOT have to recompute URLs.
"""
import os
import tempfile
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from supabase import create_client
load_dotenv()
import urllib.parse
# ===== Supabase config =====
# Connection settings come from the environment variables SUPABASE_URL and
# SUPABASE_SERVICE_ROLE; the client is created once at import time and shared
# by the loader functions in this module.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE = os.getenv("SUPABASE_SERVICE_ROLE")
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# ===== Storage config =====
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
PDF_BUCKET = "File PDF"
# The bucket name contains a space, so the public URL needs the
# percent-encoded form; the storage client call uses the raw PDF_BUCKET.
ENC_BUCKET = urllib.parse.quote(PDF_BUCKET)  # "File%20PDF"
# The file name is percent-encoded as well so the URL stays valid if the file
# is ever renamed to something containing spaces or other reserved characters
# (for the current name the encoding is a no-op).
PDF_URL = (
    f"{SUPABASE_URL}/storage/v1/object/public/"
    f"{ENC_BUCKET}/{urllib.parse.quote(PDF_FILE)}"
)

# ===== Viewer URL =====
# Public URL of the HTML viewer page; paragraph anchors are appended to it
# when the HG documents are built.
HG_VIEWER_BUCKET = "hg_viewer"
HG_VIEWER_FILE = "hg_clean.html"
HG_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/{HG_VIEWER_BUCKET}/{HG_VIEWER_FILE}"
# ============================================================
# 1) PDF aus Supabase laden
# ============================================================
def load_pdf_from_supabase() -> list[Document]:
    """Download the Prüfungsordnung PDF from Supabase Storage and split it into pages.

    The file ``PDF_FILE`` is downloaded from bucket ``PDF_BUCKET``, written to
    a temporary file (``PyPDFLoader`` is given a filesystem path) and parsed
    into one ``Document`` per page. Each page's metadata is replaced by a
    fixed set of keys: ``type``, ``source``, ``page``, ``pdf_url`` and
    ``filename``.

    Returns:
        The list of page documents produced by ``PyPDFLoader``.

    Raises:
        ValueError: If the download returns no data (``None`` or empty bytes).
    """
    print("📥 Lade Prüfungsordnung PDF aus Supabase...")
    response = supabase.storage.from_(PDF_BUCKET).download(PDF_FILE)
    # Treat an empty payload like a failed download instead of handing an
    # empty file to the PDF parser.
    if not response:
        raise ValueError("❌ Konnte PDF nicht laden!")

    # delete=False keeps the file on disk after the handle is closed so the
    # loader can reopen it by path; it is removed explicitly in ``finally``
    # so repeated calls do not accumulate temporary files.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp:
            tmp.write(response)
        pages = PyPDFLoader(tmp.name).load()
    finally:
        os.remove(tmp.name)

    for i, p in enumerate(pages):
        p.metadata = {
            "type": "pdf",
            "source": "Prüfungsordnung",
            # Kept 0-based, exactly as before.
            "page": i,
            # The "#page=" URL fragment counts pages from 1, so the 0-based
            # index is shifted by one to land on the matching page.
            "pdf_url": f"{PDF_URL}#page={i + 1}",
            "filename": PDF_FILE,
        }
    print(f"✔ {len(pages)} PDF-Seiten geladen.")
    return pages
# ============================================================
# 2) HG aus Tabelle laden
# ============================================================
def load_hg_from_supabase() -> list[Document]:
    """Read all rows of table ``hg_nrw`` and wrap each one in a ``Document``.

    Rows are requested in ascending ``order_index`` order. Each row's
    ``content`` becomes the page content; ``abs_id`` and ``title`` are copied
    into the metadata together with a viewer link built from
    ``HG_VIEWER_URL`` and the row's ``abs_id`` as URL fragment.

    Returns:
        One ``Document`` per table row; an empty list if the query returns
        no data.
    """
    print("📥 Lade Hochschulgesetz NRW aus Tabelle hg_nrw...")

    def to_document(record):
        # Columns are read in the order abs_id, title, content.
        paragraph_id, heading, text = (
            record["abs_id"],
            record["title"],
            record["content"],
        )
        link = f"{HG_VIEWER_URL}#{paragraph_id}"
        return Document(
            page_content=text,
            metadata={
                "type": "hg",
                "source": "Hochschulgesetz NRW",
                "abs_id": paragraph_id,
                "title": heading,
                "viewer_url": link,
            },
        )

    query = supabase.table("hg_nrw").select("*").order("order_index", desc=False)
    result = query.execute()
    records = result.data or []
    paragraphs = [to_document(record) for record in records]
    print(f"✔ {len(paragraphs)} HG-Absätze geladen.")
    return paragraphs
# ============================================================
# 3) ALLES LADEN
# ============================================================
def load_all_documents():
    """Load both sources and return them as one list.

    The PDF pages are loaded first and come first in the result, followed by
    the HG paragraphs.
    """
    return [*load_pdf_from_supabase(), *load_hg_from_supabase()]
if __name__ == "__main__":
    # Manual smoke run: load everything, then print the total count and the
    # metadata of the first document.
    all_docs = load_all_documents()
    total = len(all_docs)
    print("📚 Gesamt:", total)
    first_metadata = all_docs[0].metadata
    print("🔎 Beispiel metadata:", first_metadata)