commit
Browse files- app.py +87 -50
- ingest.py +50 -47
- rag_pipeline.py +158 -110
- supabase_client.py +10 -6
app.py
CHANGED
|
@@ -2,12 +2,16 @@
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import base64
|
|
|
|
| 5 |
import gradio as gr
|
| 6 |
from openai import OpenAI
|
| 7 |
|
| 8 |
from supabase_client import load_file_bytes
|
| 9 |
-
from rag_pipeline import rag_answer
|
| 10 |
|
|
|
|
|
|
|
|
|
|
| 11 |
client = OpenAI()
|
| 12 |
|
| 13 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
|
@@ -18,32 +22,46 @@ HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=1000000000000000065
|
|
| 18 |
|
| 19 |
|
| 20 |
# -------------------------------------------------------------------
|
| 21 |
-
# PDF
|
| 22 |
# -------------------------------------------------------------------
|
| 23 |
-
def encode_pdf_src():
|
| 24 |
pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
|
| 25 |
b64 = base64.b64encode(pdf_bytes).decode("utf-8")
|
| 26 |
return f"data:application/pdf;base64,{b64}"
|
| 27 |
|
| 28 |
|
| 29 |
# -------------------------------------------------------------------
|
| 30 |
-
#
|
| 31 |
# -------------------------------------------------------------------
|
| 32 |
-
FILLER = [
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
if not t:
|
| 37 |
return ""
|
| 38 |
t = t.lower().strip()
|
| 39 |
for f in FILLER:
|
| 40 |
-
t = re.sub(rf"\b{f}\b", "", t)
|
| 41 |
t = re.sub(r"[^a-zA-ZäöüÄÖÜß0-9,.? ]+", " ", t)
|
| 42 |
t = re.sub(r"\s+", " ", t).strip()
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
-
def transcribe(audio_path):
|
| 47 |
if audio_path is None:
|
| 48 |
return ""
|
| 49 |
with open(audio_path, "rb") as f:
|
|
@@ -53,53 +71,69 @@ def transcribe(audio_path):
|
|
| 53 |
language="de",
|
| 54 |
temperature=0.0,
|
| 55 |
)
|
| 56 |
-
|
|
|
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
# -------------------------------------------------------------------
|
| 60 |
-
# CHAT
|
| 61 |
-
# User CHỌN mode: "text" hoặc "audio"
|
| 62 |
# -------------------------------------------------------------------
|
| 63 |
def chat_fn(mode, text, audio, history):
|
| 64 |
history = history or []
|
| 65 |
|
| 66 |
-
#
|
| 67 |
if mode == "text":
|
| 68 |
if not (text or "").strip():
|
| 69 |
return history, "Bitte Text eingeben.", None
|
| 70 |
question = text.strip()
|
| 71 |
-
|
| 72 |
-
# --- MODE: SPRACHE ---
|
| 73 |
-
if mode == "audio":
|
| 74 |
if audio is None:
|
| 75 |
return history, "Bitte ins Mikrofon sprechen.", None
|
| 76 |
-
|
| 77 |
question = transcribe(audio)
|
| 78 |
if not question:
|
| 79 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
-
#
|
| 82 |
answer, docs = rag_answer(question, history)
|
| 83 |
|
| 84 |
-
#
|
| 85 |
-
|
| 86 |
-
for i, d in enumerate(docs):
|
| 87 |
-
meta = d["metadata"]
|
| 88 |
-
src = meta.get("source")
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
else:
|
| 95 |
url = HG_URL
|
| 96 |
-
title = f"Quelle {i+1}
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
quellen.append(f" > {snip}")
|
| 101 |
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
new_history = history + [
|
| 105 |
{"role": "user", "content": question},
|
|
@@ -110,22 +144,26 @@ def chat_fn(mode, text, audio, history):
|
|
| 110 |
|
| 111 |
|
| 112 |
# -------------------------------------------------------------------
|
| 113 |
-
#
|
| 114 |
# -------------------------------------------------------------------
|
| 115 |
with gr.Blocks() as demo:
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
Wähle eine Eingabemethode: Text oder Sprache.
|
| 120 |
-
""")
|
| 121 |
|
| 122 |
-
|
|
|
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
with gr.Column(scale=3):
|
| 128 |
|
|
|
|
|
|
|
|
|
|
| 129 |
chatbot = gr.Chatbot(label="Chatverlauf")
|
| 130 |
|
| 131 |
mode_select = gr.Radio(
|
|
@@ -136,16 +174,15 @@ with gr.Blocks() as demo:
|
|
| 136 |
)
|
| 137 |
|
| 138 |
text_input = gr.Textbox(label="Text eingeben")
|
| 139 |
-
audio_input = gr.Audio(
|
|
|
|
|
|
|
| 140 |
|
| 141 |
send_btn = gr.Button("Senden")
|
| 142 |
answer_preview = gr.Markdown("")
|
| 143 |
|
| 144 |
-
#
|
| 145 |
-
# RIGHT SIDE: VIEWER
|
| 146 |
-
# ======================
|
| 147 |
with gr.Column(scale=2):
|
| 148 |
-
|
| 149 |
gr.Markdown("### 📄 Prüfungsordnung (PDF)")
|
| 150 |
gr.HTML(
|
| 151 |
f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
|
|
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import base64
|
| 5 |
+
|
| 6 |
import gradio as gr
|
| 7 |
from openai import OpenAI
|
| 8 |
|
| 9 |
from supabase_client import load_file_bytes
|
| 10 |
+
from rag_pipeline import rag_answer # agent_answer alias
|
| 11 |
|
| 12 |
+
# -------------------------------------------------------------------
|
| 13 |
+
# OpenAI client for Whisper (speech-to-text)
|
| 14 |
+
# -------------------------------------------------------------------
|
| 15 |
client = OpenAI()
|
| 16 |
|
| 17 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
# -------------------------------------------------------------------
|
| 25 |
+
# PDF Viewer (Base64 iframe)
|
| 26 |
# -------------------------------------------------------------------
|
| 27 |
+
def encode_pdf_src() -> str:
    """Return the examination-regulations PDF as a Base64 ``data:`` URI.

    The file ``pruefungsordnung.pdf`` is fetched through ``load_file_bytes``
    from the bucket named by ``BUCKET`` and embedded in the returned string,
    which can be used directly as the ``src`` of an ``<iframe>``.

    Returns
    -------
    str
        ``data:application/pdf;base64,<payload>``.
    """
    raw_pdf = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    payload = str(base64.b64encode(raw_pdf), "utf-8")
    return "data:application/pdf;base64," + payload
|
| 31 |
|
| 32 |
|
| 33 |
# -------------------------------------------------------------------
|
| 34 |
+
# Speech-to-Text (Whisper) + cleaning
|
| 35 |
# -------------------------------------------------------------------
|
| 36 |
+
# Spoken filler words (German) that are removed from transcripts.
FILLER = [
    "äh",
    "ähm",
    "uh",
    "hmm",
    "mmh",
    "ah",
    "oh",
    "also",
    "sozusagen",
    "halt",
    "irgendwie",
]

# One alternation compiled once instead of one regex per filler per call.
# Every filler is escaped and wrapped in word boundaries, so only whole words
# match (e.g. "äh" does not match inside "ähnlich").
_FILLER_RE = re.compile(r"\b(?:" + "|".join(map(re.escape, FILLER)) + r")\b")


def clean_transcript(t: str) -> str:
    """Normalise a raw speech-to-text transcript.

    Steps: lower-case, drop filler words, replace every character outside
    letters/digits/``,.?``/space with a space, tidy up punctuation that the
    filler removal left dangling, collapse whitespace and capitalise the
    first character.

    Parameters
    ----------
    t : str
        Raw transcript; ``None`` or an empty string is accepted.

    Returns
    -------
    str
        The cleaned transcript, or ``""`` when nothing usable remains.
    """
    if not t:
        return ""
    t = t.lower().strip()
    t = _FILLER_RE.sub("", t)
    t = re.sub(r"[^a-zA-ZäöüÄÖÜß0-9,.? ]+", " ", t)
    # Removing "ähm" from "ist, ähm, die" leaves "ist, , die": pull punctuation
    # back onto the preceding word and drop commas that are immediately
    # followed by another punctuation mark.
    t = re.sub(r"\s+([,.?])", r"\1", t)
    t = re.sub(r",+(?=[,.?])", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    # A filler at the very start ("äh, was ...") leaves a leading separator;
    # strip it so the capitalisation below hits a letter, not a comma.
    t = t.lstrip(",. ")
    # Slicing keeps this safe for empty and single-character strings.
    return t[:1].upper() + t[1:]
|
| 62 |
|
| 63 |
|
| 64 |
+
def transcribe(audio_path: str) -> str:
|
| 65 |
if audio_path is None:
|
| 66 |
return ""
|
| 67 |
with open(audio_path, "rb") as f:
|
|
|
|
| 71 |
language="de",
|
| 72 |
temperature=0.0,
|
| 73 |
)
|
| 74 |
+
raw = (result.text or "").strip()
|
| 75 |
+
cleaned = clean_transcript(raw)
|
| 76 |
+
return cleaned if len(cleaned) >= 3 else ""
|
| 77 |
|
| 78 |
|
| 79 |
# -------------------------------------------------------------------
|
| 80 |
+
# Main chat handler – calls the agent (rag_answer)
|
|
|
|
| 81 |
# -------------------------------------------------------------------
|
| 82 |
def chat_fn(mode, text, audio, history):
|
| 83 |
history = history or []
|
| 84 |
|
| 85 |
+
# 1) Chọn câu hỏi theo mode
|
| 86 |
if mode == "text":
|
| 87 |
if not (text or "").strip():
|
| 88 |
return history, "Bitte Text eingeben.", None
|
| 89 |
question = text.strip()
|
| 90 |
+
else: # mode == "audio"
|
|
|
|
|
|
|
| 91 |
if audio is None:
|
| 92 |
return history, "Bitte ins Mikrofon sprechen.", None
|
|
|
|
| 93 |
question = transcribe(audio)
|
| 94 |
if not question:
|
| 95 |
+
return (
|
| 96 |
+
history,
|
| 97 |
+
"Spracherkennung fehlgeschlagen. Bitte erneut versuchen.",
|
| 98 |
+
None,
|
| 99 |
+
)
|
| 100 |
|
| 101 |
+
# 2) Gọi Agent (RAG + Tools)
|
| 102 |
answer, docs = rag_answer(question, history)
|
| 103 |
|
| 104 |
+
# 3) Xây block Quellen (UI-friendly)
|
| 105 |
+
quellen_md_lines = ["", "### 📚 Verwendete Quellen"]
|
|
|
|
|
|
|
|
|
|
| 106 |
|
| 107 |
+
for i, d in enumerate(docs):
|
| 108 |
+
meta = d.get("metadata", {}) or {}
|
| 109 |
+
src = meta.get("source", "?")
|
| 110 |
+
page = meta.get("page", None)
|
| 111 |
+
|
| 112 |
+
# Prüfungsordnung – nhảy đúng Seite
|
| 113 |
+
if isinstance(src, str) and src.startswith("Prüfungsordnung"):
|
| 114 |
+
page_num = page if isinstance(page, int) else None
|
| 115 |
+
if page_num:
|
| 116 |
+
url = f"{PDF_URL}#page={page_num}"
|
| 117 |
+
title = f"Quelle {i+1}: Prüfungsordnung (Seite {page_num})"
|
| 118 |
+
else:
|
| 119 |
+
url = PDF_URL
|
| 120 |
+
title = f"Quelle {i+1}: Prüfungsordnung"
|
| 121 |
+
# Hochschulgesetz – link trang chính thức
|
| 122 |
else:
|
| 123 |
url = HG_URL
|
| 124 |
+
title = f"Quelle {i+1}: Hochschulgesetz NRW"
|
| 125 |
|
| 126 |
+
snippet = (d.get("content") or "").strip().replace("\n", " ")
|
| 127 |
+
snippet = snippet[:200] + ("…" if len(snippet) > 200 else "")
|
|
|
|
| 128 |
|
| 129 |
+
quellen_md_lines.append(
|
| 130 |
+
f"- [{title}]({url})\n"
|
| 131 |
+
f" - **Ausschnitt:** „{snippet}“"
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
quellen_md = "\n".join(quellen_md_lines)
|
| 135 |
+
|
| 136 |
+
bot_msg = answer + "\n\n" + quellen_md
|
| 137 |
|
| 138 |
new_history = history + [
|
| 139 |
{"role": "user", "content": question},
|
|
|
|
| 144 |
|
| 145 |
|
| 146 |
# -------------------------------------------------------------------
|
| 147 |
+
# Giao diện Gradio – UI thân thiện
|
| 148 |
# -------------------------------------------------------------------
|
| 149 |
with gr.Blocks() as demo:
|
| 150 |
+
gr.Markdown(
|
| 151 |
+
"""
|
| 152 |
+
# ⚖️ Prüfungsrechts-Assistent (NRW)
|
| 153 |
|
| 154 |
+
Willkommen!
|
| 155 |
+
Ich beantworte Ihre Fragen auf Basis der **offiziellen Dokumente**:
|
|
|
|
|
|
|
| 156 |
|
| 157 |
+
- 📘 *Prüfungsordnung Ihrer Hochschule*
|
| 158 |
+
- 📗 *Hochschulgesetz NRW (recht.nrw.de)*
|
| 159 |
|
| 160 |
+
Wählen Sie unten: **Text** oder **Sprache**.
|
| 161 |
+
"""
|
| 162 |
+
)
|
|
|
|
| 163 |
|
| 164 |
+
with gr.Row():
|
| 165 |
+
# LEFT: Chat
|
| 166 |
+
with gr.Column(scale=3):
|
| 167 |
chatbot = gr.Chatbot(label="Chatverlauf")
|
| 168 |
|
| 169 |
mode_select = gr.Radio(
|
|
|
|
| 174 |
)
|
| 175 |
|
| 176 |
text_input = gr.Textbox(label="Text eingeben")
|
| 177 |
+
audio_input = gr.Audio(
|
| 178 |
+
type="filepath", label="Spracheingabe (Mikrofon)"
|
| 179 |
+
)
|
| 180 |
|
| 181 |
send_btn = gr.Button("Senden")
|
| 182 |
answer_preview = gr.Markdown("")
|
| 183 |
|
| 184 |
+
# RIGHT: Viewer
|
|
|
|
|
|
|
| 185 |
with gr.Column(scale=2):
|
|
|
|
| 186 |
gr.Markdown("### 📄 Prüfungsordnung (PDF)")
|
| 187 |
gr.HTML(
|
| 188 |
f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
|
ingest.py
CHANGED
|
@@ -6,9 +6,11 @@ from bs4 import BeautifulSoup
|
|
| 6 |
from pypdf import PdfReader
|
| 7 |
|
| 8 |
from supabase_client import supabase, load_file_bytes
|
|
|
|
| 9 |
from langchain_openai import OpenAIEmbeddings
|
| 10 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 11 |
from langchain_core.documents import Document
|
|
|
|
| 12 |
|
| 13 |
# -------------------------------------------------------------------
|
| 14 |
# ENV + URLs
|
|
@@ -16,12 +18,12 @@ from langchain_core.documents import Document
|
|
| 16 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
| 17 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 18 |
|
| 19 |
-
# Public URLs trong Supabase Storage (chỉ dùng để tham chiếu / Quelle)
|
| 20 |
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
|
| 21 |
HG_STORAGE_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
# -------------------------------------------------------------------
|
|
@@ -29,13 +31,8 @@ OFFICIAL_HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=1000000000
|
|
| 29 |
# -------------------------------------------------------------------
|
| 30 |
def load_pdf_docs():
|
| 31 |
"""
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
- Trích text từng trang
|
| 35 |
-
- Mỗi trang là 1 Document với metadata:
|
| 36 |
-
- source: "Prüfungsordnung (PDF)"
|
| 37 |
-
- page: SỐ TRANG 1-based (Seite 1, 2, 3, ...)
|
| 38 |
-
- pdf_url: URL public của PDF trong Supabase (không #page)
|
| 39 |
"""
|
| 40 |
pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
|
| 41 |
reader = PdfReader(BytesIO(pdf_bytes))
|
|
@@ -43,8 +40,6 @@ def load_pdf_docs():
|
|
| 43 |
docs = []
|
| 44 |
for i, page in enumerate(reader.pages):
|
| 45 |
text = page.extract_text() or ""
|
| 46 |
-
|
| 47 |
-
# Lưu page 1-based để sau dùng trực tiếp trong UI
|
| 48 |
page_num = i + 1
|
| 49 |
|
| 50 |
docs.append(
|
|
@@ -52,8 +47,8 @@ def load_pdf_docs():
|
|
| 52 |
page_content=text,
|
| 53 |
metadata={
|
| 54 |
"source": "Prüfungsordnung (PDF)",
|
| 55 |
-
"page": page_num,
|
| 56 |
-
"pdf_url": PDF_URL,
|
| 57 |
},
|
| 58 |
)
|
| 59 |
)
|
|
@@ -61,15 +56,12 @@ def load_pdf_docs():
|
|
| 61 |
|
| 62 |
|
| 63 |
# -------------------------------------------------------------------
|
| 64 |
-
# Loader HTML Hochschulgesetz
|
| 65 |
# -------------------------------------------------------------------
|
| 66 |
def load_html_docs():
|
| 67 |
"""
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
- Ta load bản HTML từ Supabase Storage (trước đó đã crawl/lưu).
|
| 71 |
-
- get_text(separator="\\n") để giữ cấu trúc tương đối.
|
| 72 |
-
- Việc chunk sẽ do TextSplitter xử lý.
|
| 73 |
"""
|
| 74 |
html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
|
| 75 |
html = html_bytes.decode("utf-8", errors="ignore")
|
|
@@ -82,7 +74,6 @@ def load_html_docs():
|
|
| 82 |
page_content=text,
|
| 83 |
metadata={
|
| 84 |
"source": "Hochschulgesetz NRW",
|
| 85 |
-
# anchor_id sẽ được gán sau khi chunk
|
| 86 |
"official_url": OFFICIAL_HG_URL,
|
| 87 |
},
|
| 88 |
)
|
|
@@ -90,14 +81,9 @@ def load_html_docs():
|
|
| 90 |
|
| 91 |
|
| 92 |
# -------------------------------------------------------------------
|
| 93 |
-
#
|
| 94 |
# -------------------------------------------------------------------
|
| 95 |
def chunk_docs(docs):
|
| 96 |
-
"""
|
| 97 |
-
Dùng RecursiveCharacterTextSplitter để chia nhỏ nội dung.
|
| 98 |
-
- chunk_size: 900
|
| 99 |
-
- chunk_overlap: 100
|
| 100 |
-
"""
|
| 101 |
splitter = RecursiveCharacterTextSplitter(
|
| 102 |
chunk_size=900,
|
| 103 |
chunk_overlap=100,
|
|
@@ -106,20 +92,41 @@ def chunk_docs(docs):
|
|
| 106 |
|
| 107 |
|
| 108 |
# -------------------------------------------------------------------
|
| 109 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
# -------------------------------------------------------------------
|
| 111 |
def ingest():
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
pdf_docs = load_pdf_docs()
|
| 114 |
hg_docs = load_html_docs()
|
| 115 |
|
| 116 |
-
#
|
| 117 |
chunks = chunk_docs(pdf_docs + hg_docs)
|
| 118 |
|
| 119 |
-
#
|
| 120 |
po_idx = 1
|
| 121 |
hg_idx = 1
|
| 122 |
-
|
| 123 |
for d in chunks:
|
| 124 |
src = d.metadata.get("source")
|
| 125 |
|
|
@@ -129,26 +136,22 @@ def ingest():
|
|
| 129 |
else:
|
| 130 |
d.metadata["anchor_id"] = f"hg_{hg_idx}"
|
| 131 |
hg_idx += 1
|
| 132 |
-
|
| 133 |
-
# Thêm URL cho HG nếu muốn dùng sau
|
| 134 |
-
if src == "Hochschulgesetz NRW":
|
| 135 |
d.metadata["url"] = OFFICIAL_HG_URL
|
| 136 |
|
| 137 |
-
#
|
| 138 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
).execute()
|
| 150 |
|
| 151 |
-
print("
|
| 152 |
|
| 153 |
|
| 154 |
if __name__ == "__main__":
|
|
|
|
| 6 |
from pypdf import PdfReader
|
| 7 |
|
| 8 |
from supabase_client import supabase, load_file_bytes
|
| 9 |
+
|
| 10 |
from langchain_openai import OpenAIEmbeddings
|
| 11 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 12 |
from langchain_core.documents import Document
|
| 13 |
+
from langchain_community.vectorstores import SupabaseVectorStore
|
| 14 |
|
| 15 |
# -------------------------------------------------------------------
|
| 16 |
# ENV + URLs
|
|
|
|
| 18 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
| 19 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 20 |
|
|
|
|
| 21 |
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
|
| 22 |
HG_STORAGE_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
|
| 23 |
|
| 24 |
+
OFFICIAL_HG_URL = (
|
| 25 |
+
"https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 26 |
+
)
|
| 27 |
|
| 28 |
|
| 29 |
# -------------------------------------------------------------------
|
|
|
|
| 31 |
# -------------------------------------------------------------------
|
| 32 |
def load_pdf_docs():
|
| 33 |
"""
|
| 34 |
+
Đọc Prüfungsordnung.pdf từ Supabase Storage và tạo 1 Document cho mỗi
|
| 35 |
+
trang (page 1-based).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
"""
|
| 37 |
pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
|
| 38 |
reader = PdfReader(BytesIO(pdf_bytes))
|
|
|
|
| 40 |
docs = []
|
| 41 |
for i, page in enumerate(reader.pages):
|
| 42 |
text = page.extract_text() or ""
|
|
|
|
|
|
|
| 43 |
page_num = i + 1
|
| 44 |
|
| 45 |
docs.append(
|
|
|
|
| 47 |
page_content=text,
|
| 48 |
metadata={
|
| 49 |
"source": "Prüfungsordnung (PDF)",
|
| 50 |
+
"page": page_num,
|
| 51 |
+
"pdf_url": PDF_URL,
|
| 52 |
},
|
| 53 |
)
|
| 54 |
)
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
# -------------------------------------------------------------------
|
| 59 |
+
# Loader HTML Hochschulgesetz
|
| 60 |
# -------------------------------------------------------------------
|
| 61 |
def load_html_docs():
|
| 62 |
"""
|
| 63 |
+
Đọc hochschulgesetz.html từ Supabase Storage, parse bằng BeautifulSoup,
|
| 64 |
+
lấy toàn bộ text thành 1 Document lớn (chunk sau).
|
|
|
|
|
|
|
|
|
|
| 65 |
"""
|
| 66 |
html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
|
| 67 |
html = html_bytes.decode("utf-8", errors="ignore")
|
|
|
|
| 74 |
page_content=text,
|
| 75 |
metadata={
|
| 76 |
"source": "Hochschulgesetz NRW",
|
|
|
|
| 77 |
"official_url": OFFICIAL_HG_URL,
|
| 78 |
},
|
| 79 |
)
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
# -------------------------------------------------------------------
|
| 84 |
+
# Chunking – RecursiveCharacterTextSplitter
|
| 85 |
# -------------------------------------------------------------------
|
| 86 |
def chunk_docs(docs):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
splitter = RecursiveCharacterTextSplitter(
|
| 88 |
chunk_size=900,
|
| 89 |
chunk_overlap=100,
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
# -------------------------------------------------------------------
|
| 95 |
+
# Xoá dữ liệu cũ trong bảng documents
|
| 96 |
+
# -------------------------------------------------------------------
|
| 97 |
+
def delete_old_data():
    """Delete every row of the ``documents`` table.

    Per the original author's note the ``id`` column is a UUID, so the
    delete filter is ``id >= <all-zero UUID>``; this avoids the
    ``invalid input syntax for type uuid`` error.
    """
    nil_uuid = "00000000-0000-0000-0000-000000000000"

    print("🔄 Lösche alte Daten aus Tabelle 'documents' ...")
    wipe_request = supabase.table("documents").delete().gte("id", nil_uuid)
    wipe_request.execute()
    print("✔ Alte Daten in 'documents' gelöscht.")
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# -------------------------------------------------------------------
|
| 112 |
+
# Ingest chính
|
| 113 |
# -------------------------------------------------------------------
|
| 114 |
def ingest():
|
| 115 |
+
print("🚀 Starte Ingest (PDF + Hochschulgesetz) ...")
|
| 116 |
+
|
| 117 |
+
# 1) Xoá data cũ
|
| 118 |
+
delete_old_data()
|
| 119 |
+
|
| 120 |
+
# 2) Load nguồn
|
| 121 |
pdf_docs = load_pdf_docs()
|
| 122 |
hg_docs = load_html_docs()
|
| 123 |
|
| 124 |
+
# 3) Chunk
|
| 125 |
chunks = chunk_docs(pdf_docs + hg_docs)
|
| 126 |
|
| 127 |
+
# 4) Gắn anchor_id & URL meta
|
| 128 |
po_idx = 1
|
| 129 |
hg_idx = 1
|
|
|
|
| 130 |
for d in chunks:
|
| 131 |
src = d.metadata.get("source")
|
| 132 |
|
|
|
|
| 136 |
else:
|
| 137 |
d.metadata["anchor_id"] = f"hg_{hg_idx}"
|
| 138 |
hg_idx += 1
|
|
|
|
|
|
|
|
|
|
| 139 |
d.metadata["url"] = OFFICIAL_HG_URL
|
| 140 |
|
| 141 |
+
# 5) Embeddings + SupabaseVectorStore
|
| 142 |
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 143 |
|
| 144 |
+
print("🔍 Erstelle Embeddings und speichere in SupabaseVectorStore ...")
|
| 145 |
+
SupabaseVectorStore.from_documents(
|
| 146 |
+
chunks,
|
| 147 |
+
embeddings,
|
| 148 |
+
client=supabase,
|
| 149 |
+
table_name="documents",
|
| 150 |
+
query_name="match_documents",
|
| 151 |
+
chunk_size=500, # batch size khi insert
|
| 152 |
+
)
|
|
|
|
| 153 |
|
| 154 |
+
print("🎉 Ingest fertig – 'documents' ist frisch aufgebaut.")
|
| 155 |
|
| 156 |
|
| 157 |
if __name__ == "__main__":
|
rag_pipeline.py
CHANGED
|
@@ -1,70 +1,112 @@
|
|
| 1 |
# rag_pipeline.py
|
| 2 |
-
from typing import List, Dict
|
| 3 |
from datetime import date
|
| 4 |
|
| 5 |
-
from openai import OpenAI
|
| 6 |
from supabase_client import supabase
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# -------------------------------------------------------------------
|
| 10 |
-
#
|
| 11 |
# -------------------------------------------------------------------
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# -------------------------------------------------------------------
|
| 16 |
-
#
|
| 17 |
# -------------------------------------------------------------------
|
| 18 |
SYSTEM_PROMPT = """
|
| 19 |
-
Du bist ein hochpräziser
|
| 20 |
-
Du
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
# -------------------------------------------------------------------
|
| 39 |
-
#
|
| 40 |
# -------------------------------------------------------------------
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
# -------------------------------------------------------------------
|
| 61 |
-
#
|
| 62 |
# -------------------------------------------------------------------
|
| 63 |
def save_message(role: str, content: str) -> None:
|
| 64 |
-
"""
|
| 65 |
-
Speichert eine Chatnachricht (role, content) zusammen mit dem heutigen Datum
|
| 66 |
-
in der Tabelle `chat_history`.
|
| 67 |
-
"""
|
| 68 |
supabase.table("chat_history").insert(
|
| 69 |
{
|
| 70 |
"session_date": date.today().isoformat(),
|
|
@@ -75,80 +117,86 @@ def save_message(role: str, content: str) -> None:
|
|
| 75 |
|
| 76 |
|
| 77 |
# -------------------------------------------------------------------
|
| 78 |
-
#
|
| 79 |
# -------------------------------------------------------------------
|
| 80 |
-
def
|
| 81 |
"""
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
1. Hole relevante Dokumente aus Supabase (Vektorsuche).
|
| 85 |
-
2. Baue einen kompakten Kontext-String mit Metadaten + Ausschnitten.
|
| 86 |
-
3. Erzeuge eine Chat-Completion mit SYSTEM_PROMPT + Nutzerfrage + Kontext.
|
| 87 |
-
4. Speichere User- und Assistant-Nachricht in chat_history.
|
| 88 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
# 1) Relevante Dokumente
|
| 91 |
-
docs = get_relevant_docs(query)
|
| 92 |
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
context += f"[Quelle {i+1}] {src} {page_info}\n{short}\n\n"
|
| 111 |
-
|
| 112 |
-
# Optional: kurzen bisherigen Verlauf (für mehr Kontext), nur letzte 6 Einträge
|
| 113 |
-
history_text = ""
|
| 114 |
-
if isinstance(history, list):
|
| 115 |
-
for h in history[-6:]:
|
| 116 |
-
if isinstance(h, dict):
|
| 117 |
-
r = h.get("role")
|
| 118 |
-
c = h.get("content")
|
| 119 |
-
if r in ("user", "assistant") and c:
|
| 120 |
-
history_text += f"{r}: {c}\n"
|
| 121 |
-
|
| 122 |
-
# 3) Messages für OpenAI
|
| 123 |
-
user_prompt = f"""
|
| 124 |
-
Bisheriger Chatverlauf (kurz):
|
| 125 |
-
|
| 126 |
-
{history_text}
|
| 127 |
-
|
| 128 |
-
Aktuelle Frage des Nutzers:
|
| 129 |
-
{query}
|
| 130 |
-
|
| 131 |
-
Relevante Dokumentauszüge:
|
| 132 |
-
{context}
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
-
|
| 138 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 139 |
-
{"role": "user", "content": user_prompt},
|
| 140 |
-
]
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
| 147 |
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
#
|
| 151 |
save_message("user", query)
|
| 152 |
save_message("assistant", answer)
|
| 153 |
|
| 154 |
-
return answer,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# rag_pipeline.py
|
| 2 |
+
from typing import Any, List, Dict
|
| 3 |
from datetime import date
|
| 4 |
|
|
|
|
| 5 |
from supabase_client import supabase
|
| 6 |
+
|
| 7 |
+
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
| 8 |
+
from langchain_community.vectorstores import SupabaseVectorStore
|
| 9 |
+
from langchain.tools.retriever import create_retriever_tool
|
| 10 |
+
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
| 11 |
+
from langchain_core.messages import HumanMessage, AIMessage
|
| 12 |
+
from langchain.agents import create_openai_tools_agent, AgentExecutor
|
| 13 |
|
| 14 |
# -------------------------------------------------------------------
|
| 15 |
+
# LLM, Embeddings, VectorStore, Retriever
|
| 16 |
# -------------------------------------------------------------------
|
| 17 |
+
# Embedding model used to vectorise incoming queries.
_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Vector store backed by the Supabase table "documents"; similarity search
# goes through the "match_documents" query.
_vector_store = SupabaseVectorStore(
    client=supabase,
    embedding=_embeddings,
    table_name="documents",
    query_name="match_documents",
)

# Retriever returning the 4 best-matching chunks per query.
_retriever = _vector_store.as_retriever(search_kwargs=dict(k=4))

# Chat model driving the agent; temperature 0.0.
_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
|
| 32 |
|
| 33 |
# -------------------------------------------------------------------
|
| 34 |
+
# Prompt engineering – legal guardrails
|
| 35 |
# -------------------------------------------------------------------
|
| 36 |
SYSTEM_PROMPT = """
|
| 37 |
+
Du bist ein hochpräziser juristischer Assistent für Prüfungsrecht in NRW.
|
| 38 |
+
Du arbeitest ausschließlich auf Grundlage der folgenden Dokumente:
|
| 39 |
+
|
| 40 |
+
1. Prüfungsordnung (PDF)
|
| 41 |
+
2. Hochschulgesetz NRW (offizielle Fassung auf recht.nrw.de)
|
| 42 |
+
|
| 43 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 44 |
+
REGELN FÜR DEINE ANTWORT
|
| 45 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 46 |
+
|
| 47 |
+
1) Nutze AUSSCHLIESSLICH die Dokumentauszüge, die du über das Tool
|
| 48 |
+
'suche_pruefungsrecht_dokumente' erhältst.
|
| 49 |
+
- Wenn eine Information NICHT im Kontext steht, antworte:
|
| 50 |
+
„Dazu liegen im bereitgestellten Dokumentenkontext keine Informationen vor.“
|
| 51 |
+
|
| 52 |
+
2) Spekuliere nicht, erfinde nichts, nutze keine externen Quellen.
|
| 53 |
+
|
| 54 |
+
3) Antworte strukturiert:
|
| 55 |
+
(a) kurze Einordnung,
|
| 56 |
+
(b) Kernaussage / Rechtsgrundlage,
|
| 57 |
+
(c) wichtige Bedingungen oder Ausnahmen,
|
| 58 |
+
(d) praktische Konsequenz für Studierende.
|
| 59 |
+
|
| 60 |
+
4) Du fügst selbst KEINE Quellenlinks hinzu.
|
| 61 |
+
- Die UI zeigt die Quellen separat an.
|
| 62 |
+
- Du kannst aber sinngemäß auf „die Prüfungsordnung“ oder „das Hochschulgesetz“
|
| 63 |
+
verweisen.
|
| 64 |
+
|
| 65 |
+
5) Wenn mehrere Dokumentstellen relevant sind, vergleiche sie kurz.
|
| 66 |
|
| 67 |
+
6) Wenn die Frage unklar ist, bitte freundlich um Präzisierung.
|
| 68 |
+
|
| 69 |
+
7) Schreibe so, dass Studierende ohne Jurastudium dich verstehen.
|
| 70 |
+
"""
|
| 71 |
|
| 72 |
# -------------------------------------------------------------------
|
| 73 |
+
# Retriever Tool cho Agent
|
| 74 |
# -------------------------------------------------------------------
|
| 75 |
+
# Retrieval tool the agent calls to search both legal documents.
retriever_tool = create_retriever_tool(
    _retriever,
    name="suche_pruefungsrecht_dokumente",
    description=(
        "Suche in der Prüfungsordnung (PDF) und im Hochschulgesetz NRW "
        "nach relevanten Gesetzesstellen zum Prüfungsrecht. "
        "Nutze dieses Tool IMMER, bevor du eine Antwort gibst."
    ),
)

tools = [retriever_tool]

# Agent prompt: system rules, prior conversation, the current question and
# the agent's scratchpad.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT),
        MessagesPlaceholder("chat_history"),
        (
            "user",
            "Aktuelle Frage:\n{input}\n\n"
            "Nutze das Tool, um relevante Dokumentstellen zu finden, "
            "und beantworte die Frage ausschließlich anhand dieses Kontextes.",
        ),
        # Required by create_openai_tools_agent: it raises ValueError when the
        # prompt has no "agent_scratchpad" variable. This slot carries the
        # tool calls and tool results between agent steps.
        MessagesPlaceholder("agent_scratchpad"),
    ]
)

# Build the agent and the executor that runs its tool-calling loop.
agent = create_openai_tools_agent(_llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
|
| 104 |
|
| 105 |
|
| 106 |
# -------------------------------------------------------------------
|
| 107 |
+
# Log chat messages to Supabase (chat_history table)
|
| 108 |
# -------------------------------------------------------------------
|
| 109 |
def save_message(role: str, content: str) -> None:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
supabase.table("chat_history").insert(
|
| 111 |
{
|
| 112 |
"session_date": date.today().isoformat(),
|
|
|
|
| 117 |
|
| 118 |
|
| 119 |
# -------------------------------------------------------------------
|
| 120 |
+
# Convert history của Gradio → chat_history cho Agent
|
| 121 |
# -------------------------------------------------------------------
|
| 122 |
+
def _convert_history(history: Any):
    """Convert Gradio chat history into LangChain message objects.

    Parameters
    ----------
    history : Any
        Expected to be a list of ``{"role": ..., "content": ...}`` dicts.
        Anything that is not a list yields an empty result.

    Returns
    -------
    list
        ``HumanMessage`` / ``AIMessage`` objects for the last 8 history
        entries, suitable for the ``chat_history`` placeholder. Entries that
        are not dicts, have empty content or carry another role are skipped.
    """
    converted: List[Any] = []
    if isinstance(history, list):
        # Only the 8 most recent entries are passed on.
        for entry in history[-8:]:
            if not isinstance(entry, dict):
                continue
            speaker = entry.get("role")
            text = entry.get("content")
            if not text:
                continue
            if speaker == "user":
                converted.append(HumanMessage(content=text))
            elif speaker == "assistant":
                converted.append(AIMessage(content=text))
    return converted
|
| 143 |
|
|
|
|
|
|
|
| 144 |
|
| 145 |
+
# -------------------------------------------------------------------
|
| 146 |
+
# Hàm chính: Agent-Antwort + Dokumente für Quellen
|
| 147 |
+
# -------------------------------------------------------------------
|
| 148 |
+
def agent_answer(query: str, history: Any):
    """Answer ``query`` with the tools agent and collect documents for the UI.

    The agent may call the retriever tool itself while answering. In
    addition, the retriever is queried once directly with ``query`` so the
    UI can render a sources section.

    Parameters
    ----------
    query : str
        The user's question.
    history : Any
        Gradio chat history; converted with ``_convert_history``.

    Returns
    -------
    answer : str
        The agent's answer. It is prefixed with a warning note when the
        agent states that the context holds no information.
    docs_info : list[dict]
        One dict per retrieved document with the keys ``content``,
        ``metadata`` and ``score`` (``score`` is a 0.0 placeholder).
    """
    chat_history_msgs = _convert_history(history)

    result = agent_executor.invoke(
        {
            "input": query,
            "chat_history": chat_history_msgs,
        }
    )
    answer: str = result["output"]

    # Safety note when the agent itself admits the context is insufficient.
    if "keine informationen vor" in answer.lower():
        answer = (
            "⚠️ **Hinweis:** Die Frage kann anhand des bereitgestellten "
            "Dokumentenkontextes nur eingeschränkt beantwortet werden.\n\n"
            + answer
        )

    # Fetch documents separately for the UI (same retriever as the agent).
    # invoke() replaces the deprecated get_relevant_documents() call.
    retrieved_docs = _retriever.invoke(query)
    docs_info: List[Dict[str, Any]] = [
        {
            "content": doc.page_content,
            "metadata": doc.metadata or {},
            "score": 0.0,  # not used here; field kept for callers
        }
        for doc in retrieved_docs
    ]

    # Log both sides of the exchange.
    save_message("user", query)
    save_message("assistant", answer)

    return answer, docs_info
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
# Alias kompatibel mit Version cũ
|
| 200 |
+
def rag_answer(query: str, history: Any):
    """Backward-compatible alias for ``agent_answer``.

    Kept so that app.py can keep importing ``rag_answer`` as before;
    arguments and return value are passed through unchanged.
    """
    response = agent_answer(query, history)
    return response
|
supabase_client.py
CHANGED
|
@@ -2,9 +2,13 @@
|
|
| 2 |
import os
|
| 3 |
from supabase import create_client
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 9 |
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 10 |
|
|
@@ -13,18 +17,18 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
|
|
| 13 |
|
| 14 |
def load_file_bytes(bucket: str, filename: str) -> bytes:
|
| 15 |
"""
|
| 16 |
-
Tải file từ Supabase Storage
|
| 17 |
|
| 18 |
Parameters
|
| 19 |
----------
|
| 20 |
bucket : str
|
| 21 |
Tên bucket trong Supabase Storage.
|
| 22 |
filename : str
|
| 23 |
-
|
| 24 |
|
| 25 |
Returns
|
| 26 |
-------
|
| 27 |
bytes
|
| 28 |
-
Nội dung file
|
| 29 |
"""
|
| 30 |
return supabase.storage.from_(bucket).download(filename)
|
|
|
|
| 2 |
import os
|
| 3 |
from supabase import create_client
|
| 4 |
|
| 5 |
+
"""
|
| 6 |
+
Supabase-Client (Service-Role) – dùng chung cho:
|
| 7 |
+
- ingest.py (đọc Storage + ghi embeddings vào bảng documents)
|
| 8 |
+
- rag_pipeline.py (tạo SupabaseVectorStore cho Agent)
|
| 9 |
+
- app.py (PDF-Viewer)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 13 |
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 14 |
|
|
|
|
| 17 |
|
| 18 |
def load_file_bytes(bucket: str, filename: str) -> bytes:
    """
    Download a file (PDF, HTML, ...) from Supabase Storage as bytes.

    Parameters
    ----------
    bucket : str
        Name of the bucket in Supabase Storage.
    filename : str
        Name / path of the file inside the bucket.

    Returns
    -------
    bytes
        The file's content.
    """
    storage_bucket = supabase.storage.from_(bucket)
    return storage_bucket.download(filename)
|