commit
Browse files- app.py +64 -153
- ingest.py +27 -91
- rag_pipeline.py +58 -130
- supabase_client.py +11 -20
app.py
CHANGED
|
@@ -2,16 +2,16 @@
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import base64
|
|
|
|
|
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
from openai import OpenAI
|
| 8 |
|
| 9 |
from supabase_client import load_file_bytes
|
| 10 |
-
from rag_pipeline import rag_answer
|
|
|
|
| 11 |
|
| 12 |
-
# -------------------------------------------------------------------
|
| 13 |
-
# OpenAI client cho Whisper (Speech-to-Text)
|
| 14 |
-
# -------------------------------------------------------------------
|
| 15 |
client = OpenAI()
|
| 16 |
|
| 17 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
|
@@ -21,183 +21,94 @@ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pd
|
|
| 21 |
HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 22 |
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
"ähm",
|
| 39 |
-
"uh",
|
| 40 |
-
"hmm",
|
| 41 |
-
"mmh",
|
| 42 |
-
"ah",
|
| 43 |
-
"oh",
|
| 44 |
-
"also",
|
| 45 |
-
"sozusagen",
|
| 46 |
-
"halt",
|
| 47 |
-
"irgendwie",
|
| 48 |
-
]
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def clean_transcript(t: str) -> str:
|
| 52 |
-
if not t:
|
| 53 |
-
return ""
|
| 54 |
-
t = t.lower().strip()
|
| 55 |
-
for f in FILLER:
|
| 56 |
-
t = re.sub(rf"\b{re.escape(f)}\b", "", t)
|
| 57 |
-
t = re.sub(r"[^a-zA-ZäöüÄÖÜß0-9,.? ]+", " ", t)
|
| 58 |
-
t = re.sub(r"\s+", " ", t).strip()
|
| 59 |
-
if len(t) > 1:
|
| 60 |
-
t = t[0].upper() + t[1:]
|
| 61 |
-
return t
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
def transcribe(audio_path: str) -> str:
|
| 65 |
-
if audio_path is None:
|
| 66 |
return ""
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
# -------------------------------------------------------------------
|
| 80 |
-
# Hàm CHAT chính – gọi Agent (rag_answer)
|
| 81 |
-
# -------------------------------------------------------------------
|
| 82 |
def chat_fn(mode, text, audio, history):
|
| 83 |
history = history or []
|
| 84 |
|
| 85 |
-
# 1) Chọn câu hỏi theo mode
|
| 86 |
if mode == "text":
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
else: # mode == "audio"
|
| 91 |
-
if audio is None:
|
| 92 |
-
return history, "Bitte ins Mikrofon sprechen.", None
|
| 93 |
-
question = transcribe(audio)
|
| 94 |
-
if not question:
|
| 95 |
-
return (
|
| 96 |
-
history,
|
| 97 |
-
"Spracherkennung fehlgeschlagen. Bitte erneut versuchen.",
|
| 98 |
-
None,
|
| 99 |
-
)
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
|
| 104 |
-
|
| 105 |
-
quellen_md_lines = ["", "### 📚 Verwendete Quellen"]
|
| 106 |
|
|
|
|
| 107 |
for i, d in enumerate(docs):
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
if isinstance(src, str) and src.startswith("Prüfungsordnung"):
|
| 114 |
-
page_num = page if isinstance(page, int) else None
|
| 115 |
-
if page_num:
|
| 116 |
-
url = f"{PDF_URL}#page={page_num}"
|
| 117 |
-
title = f"Quelle {i+1}: Prüfungsordnung (Seite {page_num})"
|
| 118 |
-
else:
|
| 119 |
-
url = PDF_URL
|
| 120 |
-
title = f"Quelle {i+1}: Prüfungsordnung"
|
| 121 |
-
# Hochschulgesetz – link trang chính thức
|
| 122 |
else:
|
| 123 |
url = HG_URL
|
| 124 |
-
title = f"Quelle {i+1}: Hochschulgesetz NRW"
|
| 125 |
|
| 126 |
-
snippet =
|
| 127 |
-
|
| 128 |
|
| 129 |
-
|
| 130 |
-
f"- [{title}]({url})\n"
|
| 131 |
-
f" - **Ausschnitt:** „{snippet}“"
|
| 132 |
-
)
|
| 133 |
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
-
bot_msg = answer + "\n\n" + quellen_md
|
| 137 |
|
| 138 |
-
new_history = history + [
|
| 139 |
-
{"role": "user", "content": question},
|
| 140 |
-
{"role": "assistant", "content": bot_msg},
|
| 141 |
-
]
|
| 142 |
-
|
| 143 |
-
return new_history, bot_msg, gr.update(value=None)
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
# -------------------------------------------------------------------
|
| 147 |
-
# Giao diện Gradio – UI thân thiện
|
| 148 |
-
# -------------------------------------------------------------------
|
| 149 |
with gr.Blocks() as demo:
|
| 150 |
-
gr.Markdown(
|
| 151 |
-
"""
|
| 152 |
-
# ⚖️ Prüfungsrechts-Assistent (NRW)
|
| 153 |
-
|
| 154 |
-
Willkommen!
|
| 155 |
-
Ich beantworte Ihre Fragen auf Basis der **offiziellen Dokumente**:
|
| 156 |
-
|
| 157 |
-
- 📘 *Prüfungsordnung Ihrer Hochschule*
|
| 158 |
-
- 📗 *Hochschulgesetz NRW (recht.nrw.de)*
|
| 159 |
-
|
| 160 |
-
Wählen Sie unten: **Text** oder **Sprache**.
|
| 161 |
-
"""
|
| 162 |
-
)
|
| 163 |
|
| 164 |
with gr.Row():
|
| 165 |
-
# LEFT: Chat
|
| 166 |
with gr.Column(scale=3):
|
| 167 |
-
chatbot = gr.Chatbot(
|
| 168 |
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
value="text",
|
| 172 |
-
label="Eingabemodus",
|
| 173 |
-
info="Wähle zwischen Text oder Sprache",
|
| 174 |
-
)
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
type="
|
|
|
|
|
|
|
| 179 |
)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
|
| 184 |
-
# RIGHT: Viewer
|
| 185 |
with gr.Column(scale=2):
|
| 186 |
gr.Markdown("### 📄 Prüfungsordnung (PDF)")
|
| 187 |
-
gr.HTML(
|
| 188 |
-
f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
|
| 189 |
-
)
|
| 190 |
|
| 191 |
-
gr.Markdown("### 📘 Hochschulgesetz NRW
|
| 192 |
-
gr.HTML(
|
| 193 |
-
f"<iframe src='{HG_URL}' width='100%' height='250' style='border:none;'></iframe>"
|
| 194 |
-
)
|
| 195 |
|
| 196 |
-
|
| 197 |
-
chat_fn,
|
| 198 |
-
inputs=[mode_select, text_input, audio_input, chatbot],
|
| 199 |
-
outputs=[chatbot, answer_preview, audio_input],
|
| 200 |
-
)
|
| 201 |
|
| 202 |
if __name__ == "__main__":
|
| 203 |
-
demo.queue().launch(
|
|
|
|
| 2 |
import os
|
| 3 |
import re
|
| 4 |
import base64
|
| 5 |
+
import io
|
| 6 |
+
import soundfile as sf
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
from openai import OpenAI
|
| 10 |
|
| 11 |
from supabase_client import load_file_bytes
|
| 12 |
+
from rag_pipeline import rag_answer
|
| 13 |
+
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
client = OpenAI()
|
| 16 |
|
| 17 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
|
|
|
| 21 |
HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 22 |
|
| 23 |
|
| 24 |
+
def encode_pdf_src():
    """Build a base64 data URI for the Prüfungsordnung PDF from Supabase Storage.

    Embedding the bytes inline lets the Gradio iframe render the PDF without
    another network round-trip.
    """
    raw = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    encoded = base64.b64encode(raw).decode()
    return "data:application/pdf;base64," + encoded
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# Whisper cleanup
|
| 30 |
+
def clean_text(t):
    """Tidy a Whisper transcript: lowercase, drop odd characters, capitalize."""
    lowered = t.lower()
    # Keep word characters, German umlauts/ß and basic punctuation; everything
    # else collapses into a single space.
    normalized = re.sub(r"[^\wäöüß ,.?-]+", " ", lowered)
    return normalized.strip().capitalize()
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def transcribe(audio):
    """Transcribe a Gradio microphone recording with OpenAI Whisper.

    Parameters
    ----------
    audio : tuple | None
        Gradio ``type="numpy"`` audio, i.e. ``(samples, sample_rate)``, or
        ``None`` when nothing was recorded.

    Returns
    -------
    str
        Cleaned German transcript, or ``""`` when there is no audio.
    """
    if audio is None:
        return ""
    audio_data, sr = audio

    # Serialize the samples to an in-memory WAV file for the Whisper API.
    buf = io.BytesIO()
    sf.write(buf, audio_data, sr, format="WAV")
    buf.seek(0)
    # BUG FIX: transcriptions.create() has no `filename` parameter; the SDK
    # infers the upload name/format from the file object's `name` attribute.
    buf.name = "audio.wav"

    result = client.audio.transcriptions.create(
        model="whisper-1", file=buf, language="de"
    )
    return clean_text(result.text or "")
|
| 48 |
+
|
| 49 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
def chat_fn(mode, text, audio, history):
    """Main chat handler: pick the question by input mode, run RAG, format sources.

    Parameters
    ----------
    mode : str
        ``"text"`` or ``"audio"`` (from the Radio widget).
    text : str | None
        Typed question (may be None when the textbox is untouched).
    audio : tuple | None
        Microphone recording as ``(samples, sample_rate)``.
    history : list[dict] | None
        Chat history in OpenAI message-dict form.

    Returns
    -------
    tuple
        ``(new_history, bot_markdown, audio_reset)`` for the Gradio outputs.
    """
    history = history or []

    # 1) Determine the question; guard against None from an empty textbox.
    if mode == "text":
        q = (text or "").strip()
    else:
        q = transcribe(audio)

    if not q:
        return history, "Keine gültige Eingabe erkannt.", None

    answer, docs = rag_answer(q, history)

    # 2) Render the source list shown under the answer.
    quellen = ["", "### 📚 Verwendete Quellen"]
    for d in docs:
        src = d["source"]
        pg = d["page"]

        # `source` can be missing/None in chunk metadata; only the
        # Prüfungsordnung gets a page-anchored PDF link.
        if isinstance(src, str) and src.startswith("Prüfungsordnung"):
            url = f"{PDF_URL}#page={pg}"
        else:
            url = HG_URL

        snippet = d["snippet"][:200]
        quellen.append(f"- **{src}** (Seite {pg}) → [{url}]({url}) \n „{snippet}…”")

    bot = answer + "\n\n" + "\n".join(quellen)

    return history + [
        {"role": "user", "content": q},
        {"role": "assistant", "content": bot},
    ], bot, gr.update(value=None)
|
| 82 |
|
|
|
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
# ---------------- Gradio UI ----------------
with gr.Blocks() as demo:
    gr.Markdown("# ⚖️ Prüfungsrechts-Assistent NRW")

    with gr.Row():
        # Left column: chat interface and the two input modes.
        with gr.Column(scale=3):
            chatbot = gr.Chatbot()

            mode = gr.Radio(["text", "audio"], value="text", label="Eingabemodus")
            text = gr.Textbox(label="Text eingeben")

            # type="numpy" hands chat_fn/transcribe a (samples, rate) tuple.
            audio = gr.Audio(
                sources=["microphone"],
                type="numpy",
                format="wav",
                label="Spracheingabe (Mikrofon)",
            )

            send = gr.Button("Senden")
            preview = gr.Markdown()

        # Right column: embedded viewers for the two legal sources.
        with gr.Column(scale=2):
            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
            # encode_pdf_src() inlines the PDF as a base64 data URI.
            gr.HTML(f"<iframe src='{encode_pdf_src()}' width='100%' height='260'></iframe>")

            gr.Markdown("### 📘 Hochschulgesetz NRW")
            gr.HTML(f"<iframe src='{HG_URL}' width='100%' height='260'></iframe>")

    # chat_fn returns (history, preview_markdown, audio_reset); the third
    # output clears the microphone widget after each turn.
    send.click(chat_fn, [mode, text, audio, chatbot], [chatbot, preview, audio])

if __name__ == "__main__":
    demo.queue().launch()
|
ingest.py
CHANGED
|
@@ -1,53 +1,37 @@
|
|
| 1 |
# ingest.py
|
| 2 |
import os
|
| 3 |
from io import BytesIO
|
| 4 |
-
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
from pypdf import PdfReader
|
| 7 |
|
| 8 |
from supabase_client import supabase, load_file_bytes
|
| 9 |
-
|
| 10 |
from langchain_openai import OpenAIEmbeddings
|
| 11 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 12 |
from langchain_core.documents import Document
|
| 13 |
-
from langchain_community.vectorstores import SupabaseVectorStore
|
| 14 |
|
| 15 |
-
# -------------------------------------------------------------------
|
| 16 |
-
# ENV + URLs
|
| 17 |
-
# -------------------------------------------------------------------
|
| 18 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
| 19 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 20 |
|
| 21 |
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
|
| 22 |
-
HG_STORAGE_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
|
| 23 |
-
|
| 24 |
OFFICIAL_HG_URL = (
|
| 25 |
"https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 26 |
)
|
| 27 |
|
| 28 |
|
| 29 |
-
#
|
| 30 |
-
# Loader PDF Prüfungsordnung
|
| 31 |
-
# -------------------------------------------------------------------
|
| 32 |
def load_pdf_docs():
|
| 33 |
-
"""
|
| 34 |
-
Đọc Prüfungsordnung.pdf từ Supabase Storage và tạo 1 Document cho mỗi
|
| 35 |
-
trang (page 1-based).
|
| 36 |
-
"""
|
| 37 |
pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
|
| 38 |
reader = PdfReader(BytesIO(pdf_bytes))
|
| 39 |
|
| 40 |
docs = []
|
| 41 |
-
for i,
|
| 42 |
-
text =
|
| 43 |
-
page_num = i + 1
|
| 44 |
-
|
| 45 |
docs.append(
|
| 46 |
Document(
|
| 47 |
page_content=text,
|
| 48 |
metadata={
|
| 49 |
"source": "Prüfungsordnung (PDF)",
|
| 50 |
-
"page":
|
| 51 |
"pdf_url": PDF_URL,
|
| 52 |
},
|
| 53 |
)
|
|
@@ -55,103 +39,55 @@ def load_pdf_docs():
|
|
| 55 |
return docs
|
| 56 |
|
| 57 |
|
| 58 |
-
# -------------------------------------------------------------------
|
| 59 |
-
# Loader HTML Hochschulgesetz
|
| 60 |
-
# -------------------------------------------------------------------
|
| 61 |
def load_html_docs():
|
| 62 |
-
"""
|
| 63 |
-
Đọc hochschulgesetz.html từ Supabase Storage, parse bằng BeautifulSoup,
|
| 64 |
-
lấy toàn bộ text thành 1 Document lớn (chunk sau).
|
| 65 |
-
"""
|
| 66 |
html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
soup = BeautifulSoup(html, "html.parser")
|
| 70 |
-
text = soup.get_text(separator="\n")
|
| 71 |
|
| 72 |
return [
|
| 73 |
Document(
|
| 74 |
-
page_content=
|
| 75 |
-
metadata={
|
| 76 |
-
"source": "Hochschulgesetz NRW",
|
| 77 |
-
"official_url": OFFICIAL_HG_URL,
|
| 78 |
-
},
|
| 79 |
)
|
| 80 |
]
|
| 81 |
|
| 82 |
|
| 83 |
-
# -------------------------------------------------------------------
|
| 84 |
-
# Chunking – RecursiveCharacterTextSplitter
|
| 85 |
-
# -------------------------------------------------------------------
|
| 86 |
def chunk_docs(docs):
|
| 87 |
-
splitter = RecursiveCharacterTextSplitter(
|
| 88 |
-
chunk_size=900,
|
| 89 |
-
chunk_overlap=100,
|
| 90 |
-
)
|
| 91 |
return splitter.split_documents(docs)
|
| 92 |
|
| 93 |
|
| 94 |
-
#
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
def delete_old_data():
|
| 98 |
-
"""
|
| 99 |
-
Xoá toàn bộ rows trong bảng 'documents'.
|
| 100 |
-
|
| 101 |
-
Cột id là UUID, nên dùng điều kiện >= với UUID nhỏ nhất để tránh lỗi
|
| 102 |
-
'invalid input syntax for type uuid'.
|
| 103 |
-
"""
|
| 104 |
-
print("🔄 Lösche alte Daten aus Tabelle 'documents' ...")
|
| 105 |
supabase.table("documents").delete().gte(
|
| 106 |
"id", "00000000-0000-0000-0000-000000000000"
|
| 107 |
).execute()
|
| 108 |
-
print("✔ Alte Daten in 'documents' gelöscht.")
|
| 109 |
|
| 110 |
|
| 111 |
-
#
|
| 112 |
-
# Ingest chính
|
| 113 |
-
# -------------------------------------------------------------------
|
| 114 |
def ingest():
|
| 115 |
-
|
| 116 |
|
| 117 |
-
# 1) Xoá data cũ
|
| 118 |
-
delete_old_data()
|
| 119 |
-
|
| 120 |
-
# 2) Load nguồn
|
| 121 |
pdf_docs = load_pdf_docs()
|
| 122 |
-
|
| 123 |
|
| 124 |
-
|
| 125 |
-
chunks = chunk_docs(pdf_docs + hg_docs)
|
| 126 |
|
| 127 |
-
|
| 128 |
-
po_idx = 1
|
| 129 |
-
hg_idx = 1
|
| 130 |
-
for d in chunks:
|
| 131 |
-
src = d.metadata.get("source")
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
else:
|
| 137 |
-
d.metadata["anchor_id"] = f"hg_{hg_idx}"
|
| 138 |
-
hg_idx += 1
|
| 139 |
-
d.metadata["url"] = OFFICIAL_HG_URL
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
|
| 144 |
-
print("
|
| 145 |
-
SupabaseVectorStore.from_documents(
|
| 146 |
-
chunks,
|
| 147 |
-
embeddings,
|
| 148 |
-
client=supabase,
|
| 149 |
-
table_name="documents",
|
| 150 |
-
query_name="match_documents",
|
| 151 |
-
chunk_size=500, # batch size khi insert
|
| 152 |
-
)
|
| 153 |
-
|
| 154 |
-
print("🎉 Ingest fertig – 'documents' ist frisch aufgebaut.")
|
| 155 |
|
| 156 |
|
| 157 |
if __name__ == "__main__":
|
|
|
|
| 1 |
# ingest.py
|
| 2 |
import os
|
| 3 |
from io import BytesIO
|
|
|
|
| 4 |
from bs4 import BeautifulSoup
|
| 5 |
from pypdf import PdfReader
|
| 6 |
|
| 7 |
from supabase_client import supabase, load_file_bytes
|
|
|
|
| 8 |
from langchain_openai import OpenAIEmbeddings
|
| 9 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 10 |
from langchain_core.documents import Document
|
|
|
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
BUCKET = os.environ["SUPABASE_BUCKET"]
|
| 13 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 14 |
|
| 15 |
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
|
|
|
|
|
|
|
| 16 |
OFFICIAL_HG_URL = (
|
| 17 |
"https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 18 |
)
|
| 19 |
|
| 20 |
|
| 21 |
+
# ---------------- Loaders ----------------
|
|
|
|
|
|
|
| 22 |
def load_pdf_docs():
    """Read pruefungsordnung.pdf from Supabase Storage, one Document per page.

    Page numbers in the metadata are 1-based so they match PDF viewer anchors.
    """
    raw = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    reader = PdfReader(BytesIO(raw))

    return [
        Document(
            page_content=page.extract_text() or "",
            metadata={
                "source": "Prüfungsordnung (PDF)",
                "page": page_no,
                "pdf_url": PDF_URL,
            },
        )
        for page_no, page in enumerate(reader.pages, start=1)
    ]
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
| 42 |
def load_html_docs():
    """Read hochschulgesetz.html from Supabase Storage as one large Document.

    The HTML is decoded leniently and flattened to plain text; chunking
    happens later in chunk_docs().
    """
    html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
    markup = html_bytes.decode("utf-8", "ignore")
    text = BeautifulSoup(markup, "html.parser").get_text("\n")

    doc = Document(
        page_content=text,
        metadata={"source": "Hochschulgesetz NRW", "url": OFFICIAL_HG_URL},
    )
    return [doc]
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
|
|
|
| 54 |
def chunk_docs(docs):
    """Split Documents into ~900-character chunks with 80 characters overlap."""
    return RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=80,
    ).split_documents(docs)
|
| 57 |
|
| 58 |
|
| 59 |
+
# ---------------- Delete old data ----------------
|
| 60 |
+
def delete_old_documents():
    """Remove every row from the 'documents' table.

    The id column is a UUID, so filtering with gte against the all-zero UUID
    matches all rows without triggering 'invalid input syntax for type uuid'.
    """
    print("🗑️ Lösche alte Daten…")
    table = supabase.table("documents")
    table.delete().gte("id", "00000000-0000-0000-0000-000000000000").execute()
|
|
|
|
| 65 |
|
| 66 |
|
| 67 |
+
# ---------------- Ingest ----------------
|
|
|
|
|
|
|
| 68 |
def ingest():
    """Rebuild the 'documents' table: wipe, load sources, chunk, embed, insert."""
    delete_old_documents()

    pdf_docs = load_pdf_docs()
    html_docs = load_html_docs()

    chunks = chunk_docs(pdf_docs + html_docs)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    print("📥 Speichere neue Dokumente…")
    # PERF: one batched embeddings request and one bulk insert instead of an
    # API round-trip per chunk (embed_query + insert for every chunk).
    vectors = embeddings.embed_documents([d.page_content for d in chunks])

    rows = [
        {
            "content": d.page_content,
            "metadata": d.metadata,
            "embedding": vec,
        }
        for d, vec in zip(chunks, vectors)
    ]
    if rows:
        supabase.table("documents").insert(rows).execute()

    print("✅ Ingest abgeschlossen!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
if __name__ == "__main__":
|
rag_pipeline.py
CHANGED
|
@@ -1,110 +1,72 @@
|
|
| 1 |
# rag_pipeline.py
|
| 2 |
-
from typing import Any
|
| 3 |
from datetime import date
|
| 4 |
|
| 5 |
-
from supabase_client import supabase
|
| 6 |
|
| 7 |
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
| 8 |
-
from langchain_community.vectorstores import SupabaseVectorStore
|
| 9 |
from langchain_core.messages import (
|
|
|
|
| 10 |
HumanMessage,
|
| 11 |
AIMessage,
|
| 12 |
-
SystemMessage,
|
| 13 |
)
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
# ================================================================
|
| 18 |
-
_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 19 |
|
| 20 |
-
_vector_store = SupabaseVectorStore(
|
| 21 |
-
embedding=_embeddings,
|
| 22 |
-
client=supabase,
|
| 23 |
-
table_name="documents",
|
| 24 |
-
query_name="match_documents",
|
| 25 |
-
)
|
| 26 |
|
| 27 |
-
_retriever = _vector_store.as_retriever(search_kwargs={"k": 4})
|
| 28 |
-
|
| 29 |
-
_llm = ChatOpenAI(
|
| 30 |
-
model="gpt-4o-mini",
|
| 31 |
-
temperature=0.0
|
| 32 |
-
)
|
| 33 |
-
|
| 34 |
-
# ================================================================
|
| 35 |
-
# SYSTEM PROMPT (LEGAL GUARDRAILS)
|
| 36 |
-
# ================================================================
|
| 37 |
SYSTEM_PROMPT = """
|
| 38 |
Du bist ein hochpräziser juristischer Assistent für Prüfungsrecht in NRW.
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
'suche_pruefungsrecht_dokumente' geliefert werden.
|
| 43 |
-
2) Keine Spekulation – wenn im Kontext nicht vorhanden, antworte:
|
| 44 |
-
„Dazu liegen im bereitgestellten Dokumentenkontext keine Informationen vor.“
|
| 45 |
-
3) Antworte strukturiert:
|
| 46 |
-
(a) Einordnung
|
| 47 |
-
(b) Rechtsgrundlage (sinngemäß)
|
| 48 |
-
(c) Bedingungen / Ausnahmen
|
| 49 |
-
(d) Konsequenz für Studierende
|
| 50 |
-
4) Keine eigenen Quellenlinks – nur Sachverhalt erklären.
|
| 51 |
"""
|
| 52 |
|
| 53 |
-
|
| 54 |
-
#
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
"""
|
| 61 |
-
docs = _retriever.invoke(query)
|
| 62 |
-
|
| 63 |
-
out_docs = []
|
| 64 |
for i, d in enumerate(docs):
|
| 65 |
-
meta = d
|
| 66 |
-
snippet = d.
|
| 67 |
-
snippet = snippet[:500]
|
| 68 |
|
| 69 |
-
|
| 70 |
{
|
| 71 |
"index": i + 1,
|
| 72 |
-
"source": meta.get("source"
|
| 73 |
"page": meta.get("page"),
|
| 74 |
"snippet": snippet,
|
|
|
|
| 75 |
"metadata": meta,
|
| 76 |
-
"content": d.page_content,
|
| 77 |
}
|
| 78 |
)
|
| 79 |
|
| 80 |
-
return {"results":
|
| 81 |
|
| 82 |
|
| 83 |
-
# OpenAI tools definition
|
| 84 |
TOOLS = [
|
| 85 |
{
|
| 86 |
"type": "function",
|
| 87 |
"function": {
|
| 88 |
"name": "suche_pruefungsrecht_dokumente",
|
| 89 |
-
"description": "Sucht relevante Stellen
|
| 90 |
"parameters": {
|
| 91 |
"type": "object",
|
| 92 |
-
"properties": {
|
| 93 |
-
"query": {"type": "string"}
|
| 94 |
-
},
|
| 95 |
"required": ["query"],
|
| 96 |
},
|
| 97 |
},
|
| 98 |
}
|
| 99 |
]
|
| 100 |
|
| 101 |
-
|
| 102 |
-
llm_with_tools = _llm.bind_tools(TOOLS)
|
| 103 |
|
| 104 |
-
|
| 105 |
-
# HISTORY
|
| 106 |
-
|
| 107 |
-
def save_message(role: str, content: str) -> None:
|
| 108 |
supabase.table("chat_history").insert(
|
| 109 |
{
|
| 110 |
"session_date": date.today().isoformat(),
|
|
@@ -113,91 +75,57 @@ def save_message(role: str, content: str) -> None:
|
|
| 113 |
}
|
| 114 |
).execute()
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
# ================================================================
|
| 119 |
-
def _convert_history(history):
|
| 120 |
msgs = []
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
content = h.get("content")
|
| 127 |
-
if not content:
|
| 128 |
-
continue
|
| 129 |
-
|
| 130 |
-
if role == "user":
|
| 131 |
-
msgs.append(HumanMessage(content=content))
|
| 132 |
-
elif role == "assistant":
|
| 133 |
-
msgs.append(AIMessage(content=content))
|
| 134 |
return msgs
|
| 135 |
|
| 136 |
|
| 137 |
-
#
|
| 138 |
-
# MAIN — AGENT ANSWER
|
| 139 |
-
# ================================================================
|
| 140 |
def agent_answer(query: str, history: Any):
|
| 141 |
-
"""
|
| 142 |
-
1. Gửi prompt + query vào model.
|
| 143 |
-
2. Nếu model đòi gọi tool → thực thi tool → lấy kết quả → gửi lại vào LLM.
|
| 144 |
-
3. Trích nguồn để UI hiển thị trong phần Quellen.
|
| 145 |
-
"""
|
| 146 |
-
|
| 147 |
-
chat_history_msgs = _convert_history(history)
|
| 148 |
-
|
| 149 |
-
# -------- 1) Gửi câu hỏi lần đầu ----------
|
| 150 |
messages = [
|
| 151 |
SystemMessage(content=SYSTEM_PROMPT),
|
| 152 |
-
*
|
| 153 |
HumanMessage(content=query),
|
| 154 |
]
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
)
|
| 172 |
|
| 173 |
-
|
| 174 |
-
answer =
|
| 175 |
-
|
| 176 |
-
# CREATE docs_info FOR UI
|
| 177 |
-
docs_info = tool_result["results"]
|
| 178 |
-
|
| 179 |
else:
|
| 180 |
answer = "Tool nicht unterstützt."
|
| 181 |
-
|
| 182 |
-
|
| 183 |
else:
|
| 184 |
-
answer =
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
# Safety Hinweis
|
| 188 |
-
if "keine informationen" in answer.lower():
|
| 189 |
-
answer = (
|
| 190 |
-
"⚠️ **Hinweis:** Die Frage kann anhand des bereitgestellten Dokumentenkontextes "
|
| 191 |
-
"nur eingeschränkt beantwortet werden.\n\n"
|
| 192 |
-
+ answer
|
| 193 |
-
)
|
| 194 |
|
| 195 |
save_message("user", query)
|
| 196 |
save_message("assistant", answer)
|
| 197 |
|
| 198 |
-
return answer,
|
| 199 |
|
| 200 |
|
| 201 |
-
# Alias để app.py dùng như cũ
|
| 202 |
def rag_answer(query: str, history: Any):
|
| 203 |
return agent_answer(query, history)
|
|
|
|
| 1 |
# rag_pipeline.py
|
| 2 |
+
from typing import Any
|
| 3 |
from datetime import date
|
| 4 |
|
| 5 |
+
from supabase_client import supabase, match_documents
|
| 6 |
|
| 7 |
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
|
|
|
| 8 |
from langchain_core.messages import (
|
| 9 |
+
SystemMessage,
|
| 10 |
HumanMessage,
|
| 11 |
AIMessage,
|
|
|
|
| 12 |
)
|
| 13 |
|
| 14 |
+
emb = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 15 |
+
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
|
|
|
|
|
|
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
SYSTEM_PROMPT = """
|
| 19 |
Du bist ein hochpräziser juristischer Assistent für Prüfungsrecht in NRW.
|
| 20 |
+
- Nutze AUSSCHLIESSLICH Dokumente, die über das Tool geliefert werden.
|
| 21 |
+
- Keine Spekulation.
|
| 22 |
+
- Antwort strukturiert + verständlich.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
"""
|
| 24 |
|
| 25 |
+
|
| 26 |
+
# ---------------- TOOL: Suche Dokumente ----------------
|
| 27 |
+
def tool_suche_dokumente(query: str):
    """Vector-search the 'documents' table for the query.

    Returns {"results": [...]} where each entry carries source/page metadata,
    a short snippet for the UI, and the full content for the LLM context.
    """
    query_vec = emb.embed_query(query)
    rows = match_documents(query_vec, k=4)

    results = []
    for idx, row in enumerate(rows, start=1):
        meta = row["metadata"] or {}
        flat = row["content"].replace("\n", " ")

        results.append(
            {
                "index": idx,
                "source": meta.get("source"),
                "page": meta.get("page"),
                "snippet": flat[:400],
                "content": row["content"],
                "metadata": meta,
            }
        )

    return {"results": results}
|
| 48 |
|
| 49 |
|
|
|
|
| 50 |
TOOLS = [
|
| 51 |
{
|
| 52 |
"type": "function",
|
| 53 |
"function": {
|
| 54 |
"name": "suche_pruefungsrecht_dokumente",
|
| 55 |
+
"description": "Sucht relevante Stellen im Prüfungsrecht.",
|
| 56 |
"parameters": {
|
| 57 |
"type": "object",
|
| 58 |
+
"properties": {"query": {"type": "string"}},
|
|
|
|
|
|
|
| 59 |
"required": ["query"],
|
| 60 |
},
|
| 61 |
},
|
| 62 |
}
|
| 63 |
]
|
| 64 |
|
| 65 |
+
llm_tools = llm.bind_tools(TOOLS)
|
|
|
|
| 66 |
|
| 67 |
+
|
| 68 |
+
# ---------------- HISTORY LOG ----------------
|
| 69 |
+
def save_message(role: str, content: str):
|
|
|
|
| 70 |
supabase.table("chat_history").insert(
|
| 71 |
{
|
| 72 |
"session_date": date.today().isoformat(),
|
|
|
|
| 75 |
}
|
| 76 |
).execute()
|
| 77 |
|
| 78 |
+
|
| 79 |
+
def convert_history(hist):
    """Map the last six Gradio chat dicts onto LangChain message objects."""
    msgs = []
    for entry in hist[-6:]:
        # Anything that is not a user turn is treated as an assistant turn.
        cls = HumanMessage if entry["role"] == "user" else AIMessage
        msgs.append(cls(content=entry["content"]))
    return msgs
|
| 87 |
|
| 88 |
|
| 89 |
+
# ---------------- AGENT ANSWER ----------------
|
|
|
|
|
|
|
| 90 |
def agent_answer(query: str, history: Any):
    """Run one agentic RAG turn.

    Asks the tool-enabled LLM; if it requests the document-search tool, the
    tool is executed and its output is fed back for a final answer.

    Returns
    -------
    tuple
        ``(answer_text, docs)`` where docs feeds the UI's source list.
    """
    messages = [
        SystemMessage(content=SYSTEM_PROMPT),
        *convert_history(history),
        HumanMessage(content=query),
    ]

    first = llm_tools.invoke(messages)

    if first.tool_calls:
        call = first.tool_calls[0]
        if call["name"] == "suche_pruefungsrecht_dokumente":
            tool_res = tool_suche_dokumente(call["args"]["query"])

            # BUG FIX: an assistant message carrying tool_calls must be
            # followed by a ToolMessage referencing the call id — appending a
            # plain AIMessage is rejected by the OpenAI chat API.
            from langchain_core.messages import ToolMessage

            messages.append(first)
            messages.append(
                ToolMessage(content=str(tool_res), tool_call_id=call["id"])
            )

            final = llm.invoke(messages)
            answer = final.content
            docs = tool_res["results"]
        else:
            answer = "Tool nicht unterstützt."
            docs = []
    else:
        # No tool requested: the first response already is the answer.
        answer = first.content
        docs = []

    # Persist both sides of the turn for the session log.
    save_message("user", query)
    save_message("assistant", answer)

    return answer, docs
|
| 128 |
|
| 129 |
|
|
|
|
| 130 |
def rag_answer(query: str, history: Any):
    """Public entry point used by app.py; delegates to agent_answer()."""
    return agent_answer(query, history)
|
supabase_client.py
CHANGED
|
@@ -2,13 +2,6 @@
|
|
| 2 |
import os
|
| 3 |
from supabase import create_client
|
| 4 |
|
| 5 |
-
"""
|
| 6 |
-
Supabase-Client (Service-Role) – dùng chung cho:
|
| 7 |
-
- ingest.py (đọc Storage + ghi embeddings vào bảng documents)
|
| 8 |
-
- rag_pipeline.py (tạo SupabaseVectorStore cho Agent)
|
| 9 |
-
- app.py (PDF-Viewer)
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 13 |
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 14 |
|
|
@@ -16,19 +9,17 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
|
|
| 16 |
|
| 17 |
|
| 18 |
def load_file_bytes(bucket: str, filename: str) -> bytes:
|
| 19 |
-
|
| 20 |
-
Tải file từ Supabase Storage (PDF, HTML, …) và trả về bytes.
|
| 21 |
|
| 22 |
-
Parameters
|
| 23 |
-
----------
|
| 24 |
-
bucket : str
|
| 25 |
-
Tên bucket trong Supabase Storage.
|
| 26 |
-
filename : str
|
| 27 |
-
Tên / đường dẫn file trong bucket.
|
| 28 |
|
| 29 |
-
|
| 30 |
-
-------
|
| 31 |
-
bytes
|
| 32 |
-
Nội dung file.
|
| 33 |
"""
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
from supabase import create_client
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 6 |
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 7 |
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
def load_file_bytes(bucket: str, filename: str) -> bytes:
    """Download a file (PDF, HTML, …) from Supabase Storage as raw bytes."""
    storage_bucket = supabase.storage.from_(bucket)
    return storage_bucket.download(filename)
|
|
|
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
def match_documents(embedding: list, k: int = 4):
    """Call the ``match_documents`` RPC in Supabase directly.

    Parameters
    ----------
    embedding : list
        Query embedding vector.
    k : int
        Maximum number of rows to return.

    Returns
    -------
    list
        Rows shaped like ``{content, metadata, embedding?}``; empty list when
        nothing matched.
    """
    params = {"query_embedding": embedding, "match_count": k}
    resp = supabase.rpc("match_documents", params).execute()
    return resp.data or []
|