File size: 3,628 Bytes
9fd5591
9370c0a
cae6054
9370c0a
 
 
 
9fd5591
 
9370c0a
 
 
de84956
9fd5591
9370c0a
de84956
9370c0a
 
a6e73c8
 
9370c0a
 
 
 
 
 
 
a6e73c8
 
dac7bf7
305c683
9370c0a
305c683
cae6054
f57ff46
 
 
9370c0a
 
d9fbc4f
 
 
 
9370c0a
 
 
 
 
 
 
 
 
 
 
cae6054
9370c0a
 
 
 
cae6054
9370c0a
 
 
 
 
 
 
99100eb
9370c0a
cae6054
 
9370c0a
 
cae6054
 
9370c0a
 
 
9fd5591
9370c0a
 
cae6054
9370c0a
 
 
 
 
 
 
cae6054
 
9370c0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de84956
cae6054
9370c0a
 
 
cae6054
9370c0a
 
 
cae6054
9370c0a
 
 
 
de84956
cae6054
9370c0a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
LOAD_DOCUMENTS – SINGLE SOURCE OF TRUTH

Nhiệm vụ:
1) Lade Prüfungsordnung PDF direkt aus Supabase-Storage.
2) Lade Hochschulgesetz NRW aus Supabase-Tabelle hg_nrw.
3) Cung cấp metadata đầy đủ để các file khác KHÔNG PHẢI tính lại URL.
"""

import os
import tempfile
from dotenv import load_dotenv
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from supabase import create_client

load_dotenv()

import urllib.parse

# ===== Supabase config =====
# Both values are read from the environment (load_dotenv() has already run);
# os.getenv returns None when a variable is not set.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE = os.getenv("SUPABASE_SERVICE_ROLE")

# Module-level client, created at import time and used by the loaders in this file.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

# ===== Storage Config =====

# Object name of the Prüfungsordnung PDF inside PDF_BUCKET.
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"

# The bucket name contains a space, so a percent-encoded copy is kept for URLs;
# the raw name is what gets passed to the storage client.
PDF_BUCKET = "File PDF"
ENC_BUCKET = urllib.parse.quote(PDF_BUCKET)   # "File%20PDF"

# URL of the PDF, built on the storage "object/public" path with the encoded bucket name.
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{ENC_BUCKET}/{PDF_FILE}"


# ===== Viewer URL =====
# URL of the HTML viewer file for the Hochschulgesetz; a "#<abs_id>" fragment
# is appended to it per paragraph when the HG documents are built.
HG_VIEWER_BUCKET = "hg_viewer"
HG_VIEWER_FILE = "hg_clean.html"
HG_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/{HG_VIEWER_BUCKET}/{HG_VIEWER_FILE}"


# ============================================================
# 1) PDF aus Supabase laden
# ============================================================

def load_pdf_from_supabase() -> list[Document]:
    """Download the Prüfungsordnung PDF from Supabase Storage and load its pages.

    The downloaded bytes are written to a temporary file because PyPDFLoader
    is given a file path. The temporary file is removed again once loading
    has finished (or failed).

    Returns:
        One Document per PDF page. Each page's metadata is replaced by a dict
        with the keys ``type``, ``source``, ``page`` (0-based page index),
        ``pdf_url`` (PDF_URL plus a 1-based ``#page=`` fragment) and
        ``filename``.

    Raises:
        ValueError: If the download returns no data (None or empty bytes).
    """
    print("📥 Lade Prüfungsordnung PDF aus Supabase...")

    response = supabase.storage.from_(PDF_BUCKET).download(PDF_FILE)
    # Treat an empty payload like a failed download instead of handing an
    # empty file to the PDF parser.
    if not response:
        raise ValueError("❌ Konnte PDF nicht laden!")

    # delete=False: the file must survive closing the handle so that the
    # loader can reopen it by path; it is removed explicitly below.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    try:
        with tmp:
            tmp.write(response)
        pages = PyPDFLoader(tmp.name).load()
    finally:
        try:
            os.remove(tmp.name)
        except OSError:
            # Best-effort cleanup: a failed delete must not mask the result
            # (or the original error) of loading the PDF.
            pass

    for index, page in enumerate(pages):
        page.metadata = {
            "type": "pdf",
            "source": "Prüfungsordnung",
            "page": index,
            # The "#page=" open parameter counts pages from 1, while the
            # "page" metadata above stays 0-based.
            "pdf_url": f"{PDF_URL}#page={index + 1}",
            "filename": PDF_FILE,
        }

    print(f"✔ {len(pages)} PDF-Seiten geladen.")
    return pages


# ============================================================
# 2) HG aus Tabelle laden
# ============================================================

def load_hg_from_supabase() -> list[Document]:
    """Read all rows of the ``hg_nrw`` table and wrap each one in a Document.

    Rows are requested in ascending ``order_index`` order. For every row the
    ``content`` column becomes the page content, and the metadata carries
    ``type``, ``source``, ``abs_id``, ``title`` and a ``viewer_url`` made of
    HG_VIEWER_URL plus ``#<abs_id>``.

    Returns:
        The list of Documents; empty when the query returns no data.
    """
    print("📥 Lade Hochschulgesetz NRW aus Tabelle hg_nrw...")

    def _row_to_document(record) -> Document:
        # Read the columns up front, in a fixed order.
        paragraph_id = record["abs_id"]
        heading = record["title"]
        text = record["content"]
        return Document(
            page_content=text,
            metadata={
                "type": "hg",
                "source": "Hochschulgesetz NRW",
                "abs_id": paragraph_id,
                "title": heading,
                "viewer_url": f"{HG_VIEWER_URL}#{paragraph_id}",
            },
        )

    query = supabase.table("hg_nrw").select("*").order("order_index", desc=False)
    records = query.execute().data or []
    paragraphs = [_row_to_document(record) for record in records]

    print(f"✔ {len(paragraphs)} HG-Absätze geladen.")
    return paragraphs


# ============================================================
# 3) ALLES LADEN
# ============================================================

def load_all_documents():
    """Load both sources and return one list: PDF pages first, then HG paragraphs."""
    combined = list(load_pdf_from_supabase())
    combined.extend(load_hg_from_supabase())
    return combined


if __name__ == "__main__":
    docs = load_all_documents()
    print("📚 Gesamt:", len(docs))
    print("🔎 Beispiel metadata:", docs[0].metadata)