commit
Browse files- app.py +163 -76
- build_hg_viewer.py +313 -0
- embeddings.py +24 -0
- ingest.py +0 -94
- llm.py +26 -0
- load_documents.py +119 -0
- rag_pipeline.py +108 -114
- requirements.txt +27 -5
- retriever.py +47 -0
- speech_io.py +157 -0
- split_documents.py +28 -0
- supabase_client.py +0 -25
- upload_weblink_to_supabase.py +76 -0
- vectorstore.py +56 -0
app.py
CHANGED
|
@@ -1,114 +1,201 @@
|
|
| 1 |
-
# app.py
|
| 2 |
-
import os
|
| 3 |
-
import re
|
| 4 |
-
import base64
|
| 5 |
-
import io
|
| 6 |
-
import soundfile as sf
|
| 7 |
|
| 8 |
import gradio as gr
|
| 9 |
-
from openai import OpenAI
|
| 10 |
|
| 11 |
-
from
|
| 12 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
return f"data:application/pdf;base64,{base64.b64encode(b).decode()}"
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
t = re.sub(r"[^\wäöüß ,.?-]+", " ", t)
|
| 33 |
-
return t.strip().capitalize()
|
| 34 |
|
|
|
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
)
|
| 47 |
-
return clean_text(result.text or "")
|
| 48 |
|
|
|
|
|
|
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
|
| 54 |
-
q = text.strip()
|
| 55 |
-
else:
|
| 56 |
-
q = transcribe(audio)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
for i, d in enumerate(docs):
|
| 65 |
-
src = d["source"]
|
| 66 |
-
pg = d["page"]
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
|
|
|
| 75 |
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
-
with gr.Blocks() as demo:
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
with gr.Row():
|
| 88 |
-
with gr.Column(scale=3):
|
| 89 |
-
chatbot = gr.Chatbot()
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
label="Spracheingabe (Mikrofon)",
|
| 99 |
)
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
gr.
|
|
|
|
| 107 |
|
| 108 |
-
gr.
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
if __name__ == "__main__":
|
| 114 |
-
demo.queue().launch()
|
|
|
|
| 1 |
+
# app.py – Prüfungsrechts-Chatbot mit OpenAI (Supabase RAG)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import gradio as gr
|
|
|
|
| 4 |
|
| 5 |
+
from load_documents import load_documents, PDF_URL, HG_HTML_URL
|
| 6 |
+
from split_documents import split_documents
|
| 7 |
+
from vectorstore import build_vectorstore
|
| 8 |
+
from retriever import get_retriever
|
| 9 |
+
from llm import load_llm
|
| 10 |
+
from rag_pipeline import answer
|
| 11 |
+
from speech_io import transcribe_audio, synthesize_speech
|
| 12 |
|
| 13 |
+
# =====================================================
|
| 14 |
+
# INITIALISIERUNG (beim Start der Space einmalig)
|
| 15 |
+
# =====================================================
|
| 16 |
|
| 17 |
+
print("🔹 Lade Dokumente aus Supabase …")
|
| 18 |
+
_docs = load_documents()
|
| 19 |
|
| 20 |
+
print("🔹 Splitte Dokumente …")
|
| 21 |
+
_chunks = split_documents(_docs)
|
| 22 |
|
| 23 |
+
print("🔹 Baue VectorStore …")
|
| 24 |
+
_vs = build_vectorstore(_chunks)
|
| 25 |
|
| 26 |
+
print("🔹 Erzeuge Retriever …")
|
| 27 |
+
_retriever = get_retriever(_vs)
|
| 28 |
|
| 29 |
+
print("🔹 Lade OpenAI LLM …")
|
| 30 |
+
_llm = load_llm()
|
|
|
|
| 31 |
|
| 32 |
+
# =====================================================
|
| 33 |
+
# Quellen formatieren – Markdown im Chat
|
| 34 |
+
# =====================================================
|
| 35 |
|
| 36 |
+
def format_sources_markdown(sources):
    """Render the retrieved source references as a Markdown bullet list.

    Each entry in *sources* is a dict with keys "id", "source", "page",
    "url" and "snippet".  Returns "" when there is nothing to show;
    otherwise a block starting with a blank line and a "Quellen" header.
    """
    if not sources:
        return ""

    rendered = ["", "### 📚 Quellen (verwendete Dokumentstellen):"]

    for entry in sources:
        # Build the human-readable label, appending the page only if known.
        label = f"Quelle {entry['id']} – {entry['source']}"
        if entry["page"]:
            label = f"{label}, Seite {entry['page']}"

        # Link the label when a viewer URL is available.
        link = entry["url"]
        rendered.append(f"- [{label}]({link})" if link else f"- {label}")

        quote = entry["snippet"]
        if quote:
            rendered.append(f"  > {quote}")

    return "\n".join(rendered)
|
| 64 |
|
| 65 |
+
# =====================================================
|
| 66 |
+
# TEXT CHATBOT
|
| 67 |
+
# =====================================================
|
| 68 |
+
|
| 69 |
+
def chatbot_text(user_message, history):
    """Handle a typed question: run the RAG pipeline and extend the chat.

    Appends the user turn and the assistant answer (with a sources block)
    to *history*.  Returns the new history and "" to clear the textbox.
    """
    if not user_message:
        return history, ""

    reply, sources = answer(
        question=user_message,
        retriever=_retriever,
        chat_model=_llm,
    )
    bot_msg = reply + "\n\n" + format_sources_markdown(sources)

    new_history = history + [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": bot_msg},
    ]
    return new_history, ""
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
+
# =====================================================
|
| 90 |
+
# VOICE CHATBOT
|
| 91 |
+
# =====================================================
|
| 92 |
|
| 93 |
+
def chatbot_voice(audio_path, history):
    """Handle a spoken question: transcribe, answer via RAG, speak the reply.

    Returns (updated history, synthesized audio for the answer, "" to clear
    the textbox).  When nothing could be transcribed, history is unchanged
    and no audio is produced.
    """
    question = transcribe_audio(audio_path)
    if not question:
        return history, None, ""

    # Show the transcribed question as the user's turn.
    history = history + [{"role": "user", "content": question}]

    reply, sources = answer(
        question=question,
        retriever=_retriever,
        chat_model=_llm,
    )
    bot_msg = reply + "\n\n" + format_sources_markdown(sources)

    history = history + [{"role": "assistant", "content": bot_msg}]

    return history, synthesize_speech(bot_msg), ""
|
| 114 |
|
| 115 |
+
# =====================================================
|
| 116 |
+
# Wieder-Vorlesen der letzten Antwort
|
| 117 |
+
# =====================================================
|
| 118 |
|
| 119 |
+
def read_last_answer(history):
    """Synthesize speech for the most recent assistant message.

    Returns the audio, or None when the history holds no assistant turn.
    """
    if not history:
        return None

    last = next(
        (m for m in reversed(history) if m["role"] == "assistant"),
        None,
    )
    return synthesize_speech(last["content"]) if last else None
|
| 127 |
|
| 128 |
+
# =====================================================
|
| 129 |
+
# UI (Gradio)
|
| 130 |
+
# =====================================================
|
| 131 |
|
| 132 |
+
with gr.Blocks(title="Prüfungsrechts-Chatbot (Supabase + OpenAI)") as demo:
|
| 133 |
+
|
| 134 |
+
gr.Markdown("# 🧑⚖️ Prüfungsrechts-Chatbot (Supabase RAG + OpenAI)")
|
| 135 |
+
gr.Markdown(
|
| 136 |
+
"Fragen zum Prüfungsrecht (Prüfungsordnung + Hochschulgesetz NRW). "
|
| 137 |
+
"Antworten mit Quellenangabe und Direktlinks."
|
| 138 |
+
)
|
| 139 |
|
| 140 |
with gr.Row():
|
|
|
|
|
|
|
| 141 |
|
| 142 |
+
# ---------- LINKER BEREICH: CHAT ----------
|
| 143 |
+
with gr.Column(scale=2):
|
| 144 |
+
|
| 145 |
+
chatbot = gr.Chatbot(
|
| 146 |
+
type="messages",
|
| 147 |
+
label="Chat",
|
| 148 |
+
height=550,
|
| 149 |
+
)
|
| 150 |
|
| 151 |
+
msg = gr.Textbox(
|
| 152 |
+
label="Frage eingeben",
|
| 153 |
+
placeholder="Stelle deine Frage zum Prüfungsrecht …",
|
| 154 |
+
autofocus=True,
|
|
|
|
| 155 |
)
|
| 156 |
+
msg.submit(chatbot_text, [msg, chatbot], [chatbot, msg])
|
| 157 |
|
| 158 |
+
send_btn = gr.Button("Senden (Text)")
|
| 159 |
+
send_btn.click(chatbot_text, [msg, chatbot], [chatbot, msg])
|
| 160 |
|
| 161 |
+
gr.Markdown("### 🎙️ Spracheingabe")
|
| 162 |
+
|
| 163 |
+
voice_in = gr.Audio(sources=["microphone"], type="filepath")
|
| 164 |
+
voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
|
| 165 |
|
| 166 |
+
send_voice_btn = gr.Button("Sprechen & Senden")
|
| 167 |
+
send_voice_btn.click(
|
| 168 |
+
chatbot_voice,
|
| 169 |
+
[voice_in, chatbot],
|
| 170 |
+
[chatbot, voice_out, msg],
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
read_btn = gr.Button("Antwort erneut vorlesen")
|
| 174 |
+
read_btn.click(read_last_answer, [chatbot], [voice_out])
|
| 175 |
+
|
| 176 |
+
clear_btn = gr.Button("Chat löschen")
|
| 177 |
+
clear_btn.click(lambda: [], None, chatbot)
|
| 178 |
+
|
| 179 |
+
# ---------- RECHTER BEREICH: VIEWER ----------
|
| 180 |
+
with gr.Column(scale=1):
|
| 181 |
|
| 182 |
+
gr.Markdown("### 📄 Prüfungsordnung (PDF)")
|
| 183 |
+
gr.HTML(
|
| 184 |
+
f"""
|
| 185 |
+
<iframe src="{PDF_URL}"
|
| 186 |
+
style="width:100%; height:330px; border:none;">
|
| 187 |
+
</iframe>
|
| 188 |
+
"""
|
| 189 |
+
)
|
| 190 |
+
|
| 191 |
+
gr.Markdown("### 📘 Hochschulgesetz NRW (Paragraph-Viewer)")
|
| 192 |
+
gr.HTML(
|
| 193 |
+
f"""
|
| 194 |
+
<iframe src="{HG_HTML_URL}"
|
| 195 |
+
style="width:100%; height:330px; border:none;">
|
| 196 |
+
</iframe>
|
| 197 |
+
"""
|
| 198 |
+
)
|
| 199 |
|
| 200 |
if __name__ == "__main__":
|
| 201 |
+
demo.queue().launch(ssr_mode=False, show_error=True)
|
build_hg_viewer.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# build_hg_viewer.py
import os
from supabase import create_client
from dotenv import load_dotenv

load_dotenv()

# Use .get() instead of os.environ[...]: direct indexing raises KeyError
# before the explicit check below ever runs, so the intended RuntimeError
# (with its clear message) would never be reached.
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_ROLE = os.environ.get("SUPABASE_SERVICE_ROLE")

if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE:
    raise RuntimeError("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE")

# Service-role client: needed for writing to the storage bucket.
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)

from upload_weblink_to_supabase import extract_paragraphs
|
| 17 |
+
|
| 18 |
+
# ======== HTML TEMPLATE ========
|
| 19 |
+
VIEW_TEMPLATE = """
|
| 20 |
+
<!DOCTYPE html>
|
| 21 |
+
<html lang="de">
|
| 22 |
+
<head>
|
| 23 |
+
<meta charset="UTF-8">
|
| 24 |
+
<title>Hochschulgesetz NRW – Paragraph Viewer</title>
|
| 25 |
+
|
| 26 |
+
<style>
|
| 27 |
+
body {
|
| 28 |
+
font-family: Arial, sans-serif;
|
| 29 |
+
margin: 0;
|
| 30 |
+
padding: 0;
|
| 31 |
+
display: flex;
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
/* ----------- SIDEBAR ------------- */
|
| 35 |
+
#sidebar {
|
| 36 |
+
width: 280px;
|
| 37 |
+
height: 100vh;
|
| 38 |
+
overflow-y: auto;
|
| 39 |
+
background: #f5f5f5;
|
| 40 |
+
border-right: 1px solid #ccc;
|
| 41 |
+
padding: 15px;
|
| 42 |
+
position: sticky;
|
| 43 |
+
top: 0;
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
#sidebar h2 {
|
| 47 |
+
margin-top: 0;
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
#searchBox {
|
| 51 |
+
width: 100%;
|
| 52 |
+
padding: 8px;
|
| 53 |
+
font-size: 15px;
|
| 54 |
+
margin-bottom: 10px;
|
| 55 |
+
border: 1px solid #aaa;
|
| 56 |
+
border-radius: 5px;
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
.sidebar-link {
|
| 60 |
+
display: block;
|
| 61 |
+
padding: 6px 8px;
|
| 62 |
+
margin-bottom: 4px;
|
| 63 |
+
text-decoration: none;
|
| 64 |
+
color: #003366;
|
| 65 |
+
border-radius: 4px;
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
.sidebar-link:hover {
|
| 69 |
+
background: #e0e7ff;
|
| 70 |
+
color: #001d4d;
|
| 71 |
+
}
|
| 72 |
+
|
| 73 |
+
/* ----------- CONTENT ------------- */
|
| 74 |
+
#content {
|
| 75 |
+
flex: 1;
|
| 76 |
+
padding: 25px;
|
| 77 |
+
max-width: 900px;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
/* Absatz block */
|
| 81 |
+
.para {
|
| 82 |
+
padding: 20px 0;
|
| 83 |
+
border-bottom: 1px solid #ddd;
|
| 84 |
+
}
|
| 85 |
+
|
| 86 |
+
.para h2 {
|
| 87 |
+
color: #003366;
|
| 88 |
+
margin-bottom: 10px;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
/* ----------- Fußnoten ------------- */
|
| 92 |
+
.fn-block {
|
| 93 |
+
background: #fafafa;
|
| 94 |
+
border-left: 4px solid #999;
|
| 95 |
+
padding: 12px;
|
| 96 |
+
margin-top: 10px;
|
| 97 |
+
margin-bottom: 25px;
|
| 98 |
+
}
|
| 99 |
+
|
| 100 |
+
.fn-toggle {
|
| 101 |
+
cursor: pointer;
|
| 102 |
+
font-weight: bold;
|
| 103 |
+
color: #003366;
|
| 104 |
+
margin-bottom: 5px;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
.fn-content {
|
| 108 |
+
display: none;
|
| 109 |
+
padding-left: 10px;
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
.fn-title {
|
| 113 |
+
font-weight: bold;
|
| 114 |
+
margin-bottom: 6px;
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
.fn-item {
|
| 118 |
+
margin-bottom: 8px;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
/* ----------- Highlight beim Öffnen ------------- */
|
| 122 |
+
.highlight {
|
| 123 |
+
animation: flash 2s ease-in-out;
|
| 124 |
+
background: #fff8c6 !important;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
@keyframes flash {
|
| 128 |
+
0% { background: #fff8c6; }
|
| 129 |
+
100% { background: transparent; }
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
/* Keyword highlight */
|
| 133 |
+
.keyword {
|
| 134 |
+
background: yellow;
|
| 135 |
+
padding: 2px 3px;
|
| 136 |
+
border-radius: 3px;
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
/* Back to top button */
|
| 140 |
+
#topBtn {
|
| 141 |
+
position: fixed;
|
| 142 |
+
bottom: 25px;
|
| 143 |
+
right: 25px;
|
| 144 |
+
background: #003366;
|
| 145 |
+
color: white;
|
| 146 |
+
border-radius: 8px;
|
| 147 |
+
padding: 10px 14px;
|
| 148 |
+
cursor: pointer;
|
| 149 |
+
font-size: 16px;
|
| 150 |
+
display: none;
|
| 151 |
+
}
|
| 152 |
+
</style>
|
| 153 |
+
|
| 154 |
+
</head>
|
| 155 |
+
<body>
|
| 156 |
+
|
| 157 |
+
<div id="sidebar">
|
| 158 |
+
<h2>Inhaltsverzeichnis</h2>
|
| 159 |
+
<input type="text" id="searchBox" placeholder="Suchen nach § …">
|
| 160 |
+
<!-- SIDEBAR_LINKS -->
|
| 161 |
+
</div>
|
| 162 |
+
|
| 163 |
+
<div id="content">
|
| 164 |
+
<h1>Hochschulgesetz NRW – Paragraph Viewer</h1>
|
| 165 |
+
<!-- PARAGRAPH_CONTENT -->
|
| 166 |
+
</div>
|
| 167 |
+
|
| 168 |
+
<div id="topBtn" onclick="scrollToTop()">⬆️ Top</div>
|
| 169 |
+
|
| 170 |
+
<script>
|
| 171 |
+
// ------ TỰ ĐỘNG HIGHLIGHT Absatz khi có #anchor HIGHLIGHT ABSATZ & SCROLL ------
|
| 172 |
+
window.onload = function() {
|
| 173 |
+
const anchor = window.location.hash.substring(1);
|
| 174 |
+
const params = new URLSearchParams(window.location.search);
|
| 175 |
+
const keywords = params.get("k");
|
| 176 |
+
|
| 177 |
+
if (anchor) {
|
| 178 |
+
const el = document.getElementById(anchor);
|
| 179 |
+
if (el) {
|
| 180 |
+
el.classList.add("highlight");
|
| 181 |
+
el.scrollIntoView({ behavior: "smooth", block: "center" });
|
| 182 |
+
}
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
/* KEYWORD HIGHLIGHT */
|
| 186 |
+
if (keywords) {
|
| 187 |
+
const words = keywords.split("%20");
|
| 188 |
+
highlightKeywords(words);
|
| 189 |
+
}
|
| 190 |
+
};
|
| 191 |
+
|
| 192 |
+
/* --- KEYWORD HIGHLIGHT FUNCTION --- */
|
| 193 |
+
function highlightKeywords(words) {
|
| 194 |
+
const container = document.getElementById("content");
|
| 195 |
+
let html = container.innerHTML;
|
| 196 |
+
|
| 197 |
+
words.forEach(word => {
|
| 198 |
+
if (word.length < 2) return;
|
| 199 |
+
const regex = new RegExp(`(${decodeURIComponent(word)})`, "gi");
|
| 200 |
+
html = html.replace(regex, `<span class="keyword">$1</span>`);
|
| 201 |
+
});
|
| 202 |
+
|
| 203 |
+
container.innerHTML = html;
|
| 204 |
+
}
|
| 205 |
+
|
| 206 |
+
/* --- SEARCH IN SIDEBAR --- */
|
| 207 |
+
document.getElementById("searchBox").addEventListener("input", function() {
|
| 208 |
+
const q = this.value.toLowerCase();
|
| 209 |
+
document.querySelectorAll(".sidebar-link").forEach(link => {
|
| 210 |
+
const txt = link.innerText.toLowerCase();
|
| 211 |
+
link.style.display = txt.includes(q) ? "block" : "none";
|
| 212 |
+
});
|
| 213 |
+
});
|
| 214 |
+
|
| 215 |
+
/* --- COLLAPSIBLE FUSSNOTEN --- */
|
| 216 |
+
document.addEventListener("click", function(e) {
|
| 217 |
+
if (e.target.classList.contains("fn-toggle")) {
|
| 218 |
+
const content = e.target.nextElementSibling;
|
| 219 |
+
content.style.display = content.style.display === "block" ? "none" : "block";
|
| 220 |
+
}
|
| 221 |
+
});
|
| 222 |
+
|
| 223 |
+
/* --- BACK TO TOP BUTTON --- */
|
| 224 |
+
window.onscroll = function() {
|
| 225 |
+
document.getElementById("topBtn").style.display =
|
| 226 |
+
window.scrollY > 300 ? "block" : "none";
|
| 227 |
+
};
|
| 228 |
+
|
| 229 |
+
function scrollToTop() {
|
| 230 |
+
window.scrollTo({ top: 0, behavior: 'smooth' });
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
</script>
|
| 234 |
+
|
| 235 |
+
</body>
|
| 236 |
+
</html>
|
| 237 |
+
"""
|
| 238 |
+
|
| 239 |
+
# -------------------------------------------------------------------
|
| 240 |
+
# 2. BUILD VIEWER
|
| 241 |
+
# -------------------------------------------------------------------
|
| 242 |
+
|
| 243 |
+
def build_html():
    """Assemble the static paragraph viewer page for the Hochschulgesetz.

    Pulls every paragraph via extract_paragraphs() and injects sidebar
    links and paragraph blocks into VIEW_TEMPLATE.  Returns the full HTML
    document as a string.
    """
    print(">>> Lade Paragraphs aus Supabase...")
    paras = extract_paragraphs()

    sidebar_parts = []
    content_parts = []

    for para in paras:
        pid = para["abs_id"]
        title = para["title"]
        body = para["content"]

        # Table-of-contents entry pointing at the paragraph anchor.
        sidebar_parts.append(
            f'<a class="sidebar-link" href="#{pid}">{title}</a>\n'
        )

        # Split off the footnotes: everything from the first line that
        # starts with "Fn " (e.g. "Fn 1", "Fn 2", ...) onwards.
        main_text = []
        fn_text = []
        in_fn = False
        for line in body.split("\n"):
            if line.startswith("Fn "):
                in_fn = True
            if in_fn:
                fn_text.append(line)
            else:
                main_text.append(line)

        footnotes_html = ""
        if fn_text:
            fn_items = "".join(
                f'<div class="fn-item">{fn}</div>' for fn in fn_text
            )
            footnotes_html = (
                '<div class="fn-block">'
                '<div class="fn-title">Fußnoten:</div>'
                f"{fn_items}"
                "</div>"
            )

        content_parts.append(
            f"""
    <div class="para" id="{pid}">
      <h2>{title}</h2>
      <div>{'<br>'.join(main_text)}</div>
      {footnotes_html}
    </div>
    """
        )

    html = VIEW_TEMPLATE.replace("<!-- SIDEBAR_LINKS -->", "".join(sidebar_parts))
    html = html.replace("<!-- PARAGRAPH_CONTENT -->", "".join(content_parts))

    return html
|
| 293 |
+
|
| 294 |
+
# -------------------------------------------------------------------
|
| 295 |
+
# 3. UPLOAD TO SUPABASE STORAGE
|
| 296 |
+
# -------------------------------------------------------------------
|
| 297 |
+
|
| 298 |
+
def upload_html():
    """Build the viewer page and push it to the Supabase storage bucket.

    Writes hg_clean.html into the public ``hg_viewer`` bucket so the app
    can embed it in an iframe.
    """
    payload = build_html().encode("utf-8")

    # NOTE(review): .update() assumes the object already exists; together
    # with "x-upsert": "true" an upload/upsert call might be the safer
    # choice on a fresh bucket — verify against the Supabase client docs.
    options = {
        "content-type": "text/html",
        "x-upsert": "true"
    }
    supabase.storage.from_("hg_viewer").update(
        "hg_clean.html",
        payload,
        options,
    )

    print("✔ hg_clean.html uploaded!")
|
| 311 |
+
|
| 312 |
+
if __name__ == "__main__":
|
| 313 |
+
upload_html()
|
embeddings.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# embeddings.py – OpenAI Version (text-embedding-3-small)
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from langchain_openai import OpenAIEmbeddings
|
| 5 |
+
|
| 6 |
+
EMBED_MODEL = "text-embedding-3-small"
|
| 7 |
+
|
| 8 |
+
def get_embeddings():
    """Create the OpenAI embeddings client (model: EMBED_MODEL).

    Reads OPENAI_API_KEY from the environment.

    Raises:
        RuntimeError: when the API key is missing (e.g. Space secret unset).
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError(
            "OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
        )

    print(f">>> Lade OpenAI Embedding Model: {EMBED_MODEL}")
    return OpenAIEmbeddings(model=EMBED_MODEL, api_key=key)
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
|
| 23 |
+
e = get_embeddings()
|
| 24 |
+
print(e.embed_query("Test"))
|
ingest.py
DELETED
|
@@ -1,94 +0,0 @@
|
|
| 1 |
-
# ingest.py
|
| 2 |
-
import os
|
| 3 |
-
from io import BytesIO
|
| 4 |
-
from bs4 import BeautifulSoup
|
| 5 |
-
from pypdf import PdfReader
|
| 6 |
-
|
| 7 |
-
from supabase_client import supabase, load_file_bytes
|
| 8 |
-
from langchain_openai import OpenAIEmbeddings
|
| 9 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 10 |
-
from langchain_core.documents import Document
|
| 11 |
-
|
| 12 |
-
BUCKET = os.environ["SUPABASE_BUCKET"]
|
| 13 |
-
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 14 |
-
|
| 15 |
-
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
|
| 16 |
-
OFFICIAL_HG_URL = (
|
| 17 |
-
"https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 18 |
-
)
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
# ---------------- Loaders ----------------
|
| 22 |
-
def load_pdf_docs():
|
| 23 |
-
pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
|
| 24 |
-
reader = PdfReader(BytesIO(pdf_bytes))
|
| 25 |
-
|
| 26 |
-
docs = []
|
| 27 |
-
for i, p in enumerate(reader.pages):
|
| 28 |
-
text = p.extract_text() or ""
|
| 29 |
-
docs.append(
|
| 30 |
-
Document(
|
| 31 |
-
page_content=text,
|
| 32 |
-
metadata={
|
| 33 |
-
"source": "Prüfungsordnung (PDF)",
|
| 34 |
-
"page": i + 1,
|
| 35 |
-
"pdf_url": PDF_URL,
|
| 36 |
-
},
|
| 37 |
-
)
|
| 38 |
-
)
|
| 39 |
-
return docs
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
def load_html_docs():
|
| 43 |
-
html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
|
| 44 |
-
soup = BeautifulSoup(html_bytes.decode("utf-8", "ignore"), "html.parser")
|
| 45 |
-
|
| 46 |
-
return [
|
| 47 |
-
Document(
|
| 48 |
-
page_content=soup.get_text("\n"),
|
| 49 |
-
metadata={"source": "Hochschulgesetz NRW", "url": OFFICIAL_HG_URL},
|
| 50 |
-
)
|
| 51 |
-
]
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
def chunk_docs(docs):
|
| 55 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=80)
|
| 56 |
-
return splitter.split_documents(docs)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
# ---------------- Delete old data ----------------
|
| 60 |
-
def delete_old_documents():
|
| 61 |
-
print("🗑️ Lösche alte Daten…")
|
| 62 |
-
supabase.table("documents").delete().gte(
|
| 63 |
-
"id", "00000000-0000-0000-0000-000000000000"
|
| 64 |
-
).execute()
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
# ---------------- Ingest ----------------
|
| 68 |
-
def ingest():
|
| 69 |
-
delete_old_documents()
|
| 70 |
-
|
| 71 |
-
pdf_docs = load_pdf_docs()
|
| 72 |
-
html_docs = load_html_docs()
|
| 73 |
-
|
| 74 |
-
chunks = chunk_docs(pdf_docs + html_docs)
|
| 75 |
-
|
| 76 |
-
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 77 |
-
|
| 78 |
-
print("📥 Speichere neue Dokumente…")
|
| 79 |
-
for d in chunks:
|
| 80 |
-
emb = embeddings.embed_query(d.page_content)
|
| 81 |
-
|
| 82 |
-
supabase.table("documents").insert(
|
| 83 |
-
{
|
| 84 |
-
"content": d.page_content,
|
| 85 |
-
"metadata": d.metadata,
|
| 86 |
-
"embedding": emb,
|
| 87 |
-
}
|
| 88 |
-
).execute()
|
| 89 |
-
|
| 90 |
-
print("✅ Ingest abgeschlossen!")
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
if __name__ == "__main__":
|
| 94 |
-
ingest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
llm.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# llm.py – OpenAI Chatmodell für RAG
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from langchain_openai import ChatOpenAI
|
| 5 |
+
|
| 6 |
+
CHAT_MODEL = "gpt-4o-mini" # günstig & stark
|
| 7 |
+
|
| 8 |
+
def load_llm():
    """Create the OpenAI chat model used by the RAG pipeline.

    Uses CHAT_MODEL with temperature 0.0 (deterministic output, fewer
    hallucinations).  Reads OPENAI_API_KEY from the environment.

    Raises:
        RuntimeError: when the API key is missing (e.g. Space secret unset).
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError(
            "OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
        )

    print(f">>> Lade OpenAI Chatmodell: {CHAT_MODEL}")

    return ChatOpenAI(
        model=CHAT_MODEL,
        temperature=0.0,  # deterministic, low-hallucination answers
        api_key=key,
    )
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
llm = load_llm()
|
| 26 |
+
print(llm.invoke("Sag einen Satz zum Prüfungsrecht.").content)
|
load_documents.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
import requests
|
| 4 |
+
import tempfile
|
| 5 |
+
from supabase import create_client
|
| 6 |
+
from langchain_core.documents import Document
|
| 7 |
+
from langchain_community.document_loaders import PyPDFLoader
|
| 8 |
+
|
| 9 |
+
# ---------------------------------------------------------
|
| 10 |
+
# ENV Variablen aus HuggingFace Space
|
| 11 |
+
# ---------------------------------------------------------
|
| 12 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
| 13 |
+
SUPABASE_ANON_KEY = os.getenv("SUPABASE_ANON_KEY")
|
| 14 |
+
|
| 15 |
+
if not SUPABASE_URL or not SUPABASE_ANON_KEY:
|
| 16 |
+
raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY in environment.")
|
| 17 |
+
|
| 18 |
+
supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
|
| 19 |
+
|
| 20 |
+
# ---------------------------------------------------------
|
| 21 |
+
# Prüfungsordnung PDF – liegt in Supabase Storage (public)
|
| 22 |
+
# ---------------------------------------------------------
|
| 23 |
+
PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
|
| 24 |
+
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"
|
| 25 |
+
|
| 26 |
+
# ---------------------------------------------------------
|
| 27 |
+
# Statischer Paragraph-Viewer in HuggingFace Space
|
| 28 |
+
# -> hg_clean.html liegt als Datei im Repo!
|
| 29 |
+
# -> in der App: iframe src="file=hg_clean.html"
|
| 30 |
+
# -> für Links: "file=hg_clean.html#para_123"
|
| 31 |
+
# ---------------------------------------------------------
|
| 32 |
+
# HG_HTML_URL = "file=hg_clean.html" # WICHTIG: nicht absolut, Space kümmert sich
|
| 33 |
+
#HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/resolve/main/hg_clean.html"
|
| 34 |
+
#HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/raw/main/hg_clean.html"
|
| 35 |
+
|
| 36 |
+
HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
|
| 37 |
+
|
| 38 |
+
def load_hg_nrw():
    """Fetch all paragraphs of the Hochschulgesetz NRW from the Supabase
    table ``hg_nrw`` and wrap each one as a LangChain Document.

    Each row provides:
      - title  : e.g. "§ 64 (Fn 35) Prüfungsordnungen"
      - content: full text including footnotes
      - abs_id : anchor id such as "para_64" (matches the ids used in
                 hg_clean.html, so metadata["url"] deep-links the viewer)
    """
    print(">>> Lade Hochschulgesetz NRW (§) aus Supabase…")

    response = (
        supabase.table("hg_nrw")
        .select("*")
        .order("order_index")
        .execute()
    )
    rows = response.data or []

    print(f" - {len(rows)} Paragraphen geladen.")

    documents = []
    for row in rows:
        anchor = row["abs_id"]
        heading = row.get("title", "")
        text = row.get("content", "")

        documents.append(
            Document(
                page_content=f"{heading}\n{text}",
                metadata={
                    "source": "Hochschulgesetz NRW",
                    "paragraph": heading,
                    "abs_id": anchor,
                    # Anchor into the static viewer (same element ids).
                    "url": f"{HG_HTML_URL}#{anchor}",
                },
            )
        )

    return documents
|
| 79 |
+
|
| 80 |
+
def load_pdf():
    """Download the Prüfungsordnung PDF from Supabase Storage and load its
    pages as LangChain documents.

    The PDF is written to a temporary file because PyPDFLoader needs a
    filesystem path; the file is removed again after loading (the original
    leaked it).  Chunking happens later in split_documents.py.

    Returns:
        list: one Document per page, with metadata
              source / page (0-based) / pdf_url.

    Raises:
        requests.HTTPError: when the download fails.
    """
    print(">>> Lade Prüfungsordnung PDF …")

    # Timeout so a stalled download cannot hang app startup forever.
    resp = requests.get(PDF_URL, timeout=60)
    resp.raise_for_status()

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(resp.content)
        path = tmp.name

    try:
        pages = PyPDFLoader(path).load()
    finally:
        # delete=False above means we must clean up ourselves.
        os.remove(path)

    for i, p in enumerate(pages):
        p.metadata["source"] = "Prüfungsordnung (PDF)"
        p.metadata["page"] = i  # 0-based page index
        p.metadata["pdf_url"] = PDF_URL

    print(f" - {len(pages)} PDF-Seiten geladen.")
    return pages
|
| 103 |
+
|
| 104 |
+
def load_documents():
|
| 105 |
+
"""
|
| 106 |
+
Master-Funktion:
|
| 107 |
+
- Hochschulgesetz NRW (Supabase Tabelle hg_nrw)
|
| 108 |
+
- Prüfungsordnung (PDF)
|
| 109 |
+
"""
|
| 110 |
+
docs = []
|
| 111 |
+
docs.extend(load_hg_nrw())
|
| 112 |
+
docs.extend(load_pdf())
|
| 113 |
+
print(f"✔ DOCUMENTS LOADED: {len(docs)}")
|
| 114 |
+
return docs
|
| 115 |
+
|
| 116 |
+
if __name__ == "__main__":
|
| 117 |
+
docs = load_documents()
|
| 118 |
+
print(docs[0])
|
| 119 |
+
print("Total:", len(docs))
|
rag_pipeline.py
CHANGED
|
@@ -1,131 +1,125 @@
|
|
| 1 |
-
# rag_pipeline.py
|
| 2 |
-
from typing import Any
|
| 3 |
-
from datetime import date
|
| 4 |
|
| 5 |
-
from
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
from langchain_core.messages import (
|
| 9 |
-
SystemMessage,
|
| 10 |
-
HumanMessage,
|
| 11 |
-
AIMessage,
|
| 12 |
-
)
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
|
|
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
|
|
|
|
| 25 |
|
| 26 |
-
# ----------------
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
docs = match_documents(vec, k=4)
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
for i, d in enumerate(docs):
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
"source": meta.get("source"),
|
| 40 |
-
"page": meta.get("page"),
|
| 41 |
-
"snippet": snippet,
|
| 42 |
-
"content": d["content"],
|
| 43 |
-
"metadata": meta,
|
| 44 |
-
}
|
| 45 |
-
)
|
| 46 |
-
|
| 47 |
-
return {"results": results}
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
TOOLS = [
|
| 51 |
-
{
|
| 52 |
-
"type": "function",
|
| 53 |
-
"function": {
|
| 54 |
-
"name": "suche_pruefungsrecht_dokumente",
|
| 55 |
-
"description": "Sucht relevante Stellen im Prüfungsrecht.",
|
| 56 |
-
"parameters": {
|
| 57 |
-
"type": "object",
|
| 58 |
-
"properties": {"query": {"type": "string"}},
|
| 59 |
-
"required": ["query"],
|
| 60 |
-
},
|
| 61 |
-
},
|
| 62 |
-
}
|
| 63 |
-
]
|
| 64 |
-
|
| 65 |
-
llm_tools = llm.bind_tools(TOOLS)
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
# ---------------- HISTORY LOG ----------------
|
| 69 |
-
def save_message(role: str, content: str):
|
| 70 |
-
supabase.table("chat_history").insert(
|
| 71 |
-
{
|
| 72 |
-
"session_date": date.today().isoformat(),
|
| 73 |
-
"role": role,
|
| 74 |
-
"message": content,
|
| 75 |
-
}
|
| 76 |
-
).execute()
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def convert_history(hist):
|
| 80 |
-
msgs = []
|
| 81 |
-
for h in hist[-6:]:
|
| 82 |
-
if h["role"] == "user":
|
| 83 |
-
msgs.append(HumanMessage(content=h["content"]))
|
| 84 |
else:
|
| 85 |
-
|
| 86 |
-
return msgs
|
| 87 |
|
|
|
|
| 88 |
|
| 89 |
-
|
| 90 |
-
def agent_answer(query: str, history: Any):
|
| 91 |
-
messages = [
|
| 92 |
-
SystemMessage(content=SYSTEM_PROMPT),
|
| 93 |
-
*convert_history(history),
|
| 94 |
-
HumanMessage(content=query),
|
| 95 |
-
]
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
call = first.tool_calls[0]
|
| 101 |
-
if call["name"] == "suche_pruefungsrecht_dokumente":
|
| 102 |
-
tool_res = tool_suche_dokumente(call["args"]["query"])
|
| 103 |
-
|
| 104 |
-
messages.extend(
|
| 105 |
-
[
|
| 106 |
-
first,
|
| 107 |
-
AIMessage(
|
| 108 |
-
content=str(tool_res),
|
| 109 |
-
name="suche_pruefungsrecht_dokumente",
|
| 110 |
-
),
|
| 111 |
-
]
|
| 112 |
-
)
|
| 113 |
-
|
| 114 |
-
final = llm.invoke(messages)
|
| 115 |
-
answer = final.content
|
| 116 |
-
docs = tool_res["results"]
|
| 117 |
-
else:
|
| 118 |
-
answer = "Tool nicht unterstützt."
|
| 119 |
-
docs = []
|
| 120 |
-
else:
|
| 121 |
-
answer = first.content
|
| 122 |
-
docs = []
|
| 123 |
|
| 124 |
-
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
|
| 127 |
-
|
|
|
|
|
|
|
| 128 |
|
|
|
|
|
|
|
| 129 |
|
| 130 |
-
|
| 131 |
-
return agent_answer(query, history)
|
|
|
|
| 1 |
+
# rag_pipeline.py – OpenAI RAG mit Supabase-Dokumenten
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
from typing import List, Dict, Any, Tuple
|
| 4 |
+
from langchain_core.messages import SystemMessage, HumanMessage
|
| 5 |
|
| 6 |
+
MAX_CHARS = 900
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
# ------------------------------------------------------
|
| 9 |
+
# Quellen-Metadaten
|
| 10 |
+
# ------------------------------------------------------
|
| 11 |
|
| 12 |
+
def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
|
| 13 |
+
"""
|
| 14 |
+
Gibt für jeden verwendeten Chunk eine strukturierte Quelle zurück:
|
| 15 |
+
- id
|
| 16 |
+
- source (Dokument)
|
| 17 |
+
- page (bei PDF)
|
| 18 |
+
- url (PDF-Seite oder hg_clean.html#para_x)
|
| 19 |
+
- snippet (Kurzvorschau)
|
| 20 |
+
"""
|
| 21 |
+
srcs = []
|
| 22 |
|
| 23 |
+
for i, d in enumerate(docs):
|
| 24 |
+
meta = d.metadata
|
| 25 |
+
src = meta.get("source")
|
| 26 |
+
page = meta.get("page")
|
| 27 |
+
snippet = d.page_content[:300].replace("\n", " ")
|
| 28 |
+
|
| 29 |
+
if src == "Prüfungsordnung (PDF)":
|
| 30 |
+
pdf_url = meta.get("pdf_url")
|
| 31 |
+
if isinstance(page, int) and pdf_url:
|
| 32 |
+
url = f"{pdf_url}#page={page + 1}"
|
| 33 |
+
else:
|
| 34 |
+
url = pdf_url
|
| 35 |
+
|
| 36 |
+
elif src == "Hochschulgesetz NRW":
|
| 37 |
+
url = meta.get("url")
|
| 38 |
+
page = None
|
| 39 |
+
|
| 40 |
+
else:
|
| 41 |
+
url = None
|
| 42 |
+
|
| 43 |
+
srcs.append({
|
| 44 |
+
"id": i + 1,
|
| 45 |
+
"source": src,
|
| 46 |
+
"page": page + 1 if isinstance(page, int) else None,
|
| 47 |
+
"url": url,
|
| 48 |
+
"snippet": snippet,
|
| 49 |
+
})
|
| 50 |
|
| 51 |
+
return srcs
|
| 52 |
|
| 53 |
+
# ------------------------------------------------------
|
| 54 |
+
# Kontextformatierung
|
| 55 |
+
# ------------------------------------------------------
|
|
|
|
| 56 |
|
| 57 |
+
def format_context(docs):
|
| 58 |
+
if not docs:
|
| 59 |
+
return "(Kein relevanter Kontext gefunden.)"
|
| 60 |
+
|
| 61 |
+
out_lines = []
|
| 62 |
for i, d in enumerate(docs):
|
| 63 |
+
txt = d.page_content[:MAX_CHARS]
|
| 64 |
+
src = d.metadata.get("source")
|
| 65 |
+
page = d.metadata.get("page")
|
| 66 |
+
|
| 67 |
+
if src == "Prüfungsordnung (PDF)" and isinstance(page, int):
|
| 68 |
+
src_str = f"{src}, Seite {page + 1}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
else:
|
| 70 |
+
src_str = src
|
|
|
|
| 71 |
|
| 72 |
+
out_lines.append(f"[KONTEXT {i+1}] ({src_str})\n{txt}")
|
| 73 |
|
| 74 |
+
return "\n\n".join(out_lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
+
SYSTEM_PROMPT = """
|
| 77 |
+
Du bist ein juristisch präziser Chatbot für Prüfungsrecht.
|
| 78 |
+
Du nutzt ausschließlich:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
- die Prüfungsordnung (PDF) und
|
| 81 |
+
- das Hochschulgesetz NRW (Paragraphen aus der Datenbank / hg_clean.html)
|
| 82 |
+
|
| 83 |
+
Regeln:
|
| 84 |
+
|
| 85 |
+
1. Antworte nur auf Basis des gelieferten Kontextes.
|
| 86 |
+
2. Wenn der Kontext keine sichere Antwort erlaubt, sage das klar.
|
| 87 |
+
3. Antworte in gut verständlichem Deutsch, in ganzen Sätzen.
|
| 88 |
+
4. Nenne, soweit möglich:
|
| 89 |
+
- Paragraphen oder Überschriften,
|
| 90 |
+
- das Dokument (Prüfungsordnung / Hochschulgesetz NRW),
|
| 91 |
+
- Seitenzahl (bei der Prüfungsordnung).
|
| 92 |
+
"""
|
| 93 |
+
|
| 94 |
+
def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
|
| 95 |
+
# 1. Chunks holen
|
| 96 |
+
docs = retriever.invoke(question)
|
| 97 |
+
context_str = format_context(docs)
|
| 98 |
+
|
| 99 |
+
# 2. Prompt bauen
|
| 100 |
+
human = f"""
|
| 101 |
+
FRAGE:
|
| 102 |
+
{question}
|
| 103 |
+
|
| 104 |
+
NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
|
| 105 |
+
{context_str}
|
| 106 |
+
|
| 107 |
+
AUFGABE:
|
| 108 |
+
Erstelle eine juristisch korrekte Antwort ausschließlich auf Basis
|
| 109 |
+
des obigen Kontextes. Wenn der Kontext keine sichere Antwort zulässt,
|
| 110 |
+
sage das ausdrücklich und verzichte auf Spekulationen.
|
| 111 |
+
"""
|
| 112 |
+
|
| 113 |
+
msgs = [
|
| 114 |
+
SystemMessage(content=SYSTEM_PROMPT),
|
| 115 |
+
HumanMessage(content=human),
|
| 116 |
+
]
|
| 117 |
|
| 118 |
+
# 3. LLM aufrufen
|
| 119 |
+
result = chat_model.invoke(msgs)
|
| 120 |
+
answer_text = result.content.strip()
|
| 121 |
|
| 122 |
+
# 4. Quellenliste
|
| 123 |
+
sources = build_sources_metadata(docs)
|
| 124 |
|
| 125 |
+
return answer_text, sources
|
|
|
requirements.txt
CHANGED
|
@@ -1,10 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
langchain
|
| 2 |
langchain-community
|
|
|
|
| 3 |
langchain-openai
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
| 7 |
pypdf
|
|
|
|
| 8 |
beautifulsoup4
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# === UI ===
|
| 2 |
+
gradio
|
| 3 |
+
|
| 4 |
+
# === Supabase ===
|
| 5 |
+
supabase
|
| 6 |
+
postgrest
|
| 7 |
+
httpx
|
| 8 |
+
python-dotenv
|
| 9 |
+
|
| 10 |
+
# === LangChain Core ===
|
| 11 |
langchain
|
| 12 |
langchain-community
|
| 13 |
+
langchain-text-splitters
|
| 14 |
langchain-openai
|
| 15 |
+
|
| 16 |
+
# === VectorStore ===
|
| 17 |
+
faiss-cpu
|
| 18 |
+
|
| 19 |
+
# === PDF + HTTP + HTML ===
|
| 20 |
pypdf
|
| 21 |
+
requests
|
| 22 |
beautifulsoup4
|
| 23 |
+
|
| 24 |
+
# === Audio (STT/TTS local) ===
|
| 25 |
+
transformers
|
| 26 |
+
accelerate
|
| 27 |
+
soundfile
|
| 28 |
+
scipy
|
| 29 |
+
numpy
|
| 30 |
+
|
| 31 |
+
# OpenAI offizielle Bibliothek (kommt i.d.R. mit langchain-openai, zur Sicherheit explizit)
|
| 32 |
+
openai
|
retriever.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BƯỚC 5: RETRIEVER
|
| 3 |
+
-----------------
|
| 4 |
+
Tạo LangChain Retriever từ FAISS VectorStore.
|
| 5 |
+
|
| 6 |
+
Retriever sẽ dùng trong bước RAG sau này:
|
| 7 |
+
- retriever.get_relevant_documents(query)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from langchain_community.vectorstores import FAISS
|
| 11 |
+
|
| 12 |
+
# số chunk sẽ lấy cho mỗi câu hỏi
|
| 13 |
+
RETRIEVER_K = 4
|
| 14 |
+
|
| 15 |
+
def get_retriever(vectorstore: FAISS, k: int = RETRIEVER_K):
|
| 16 |
+
"""
|
| 17 |
+
Tạo retriever từ FAISS VectorStore.
|
| 18 |
+
"""
|
| 19 |
+
print(f">>> Creating retriever with k={k} ...")
|
| 20 |
+
retriever = vectorstore.as_retriever(search_kwargs={"k": k})
|
| 21 |
+
print(">>> Retriever ready.\n")
|
| 22 |
+
return retriever
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
# Test: load -> split -> FAISS -> retriever.get_relevant_documents()
|
| 26 |
+
from load_documents import load_documents
|
| 27 |
+
from split_documents import split_documents
|
| 28 |
+
from vectorstore import build_vectorstore
|
| 29 |
+
|
| 30 |
+
print("=== TEST: retriever.get_relevant_documents ===\n")
|
| 31 |
+
|
| 32 |
+
docs = load_documents()
|
| 33 |
+
chunks = split_documents(docs)
|
| 34 |
+
vs = build_vectorstore(chunks)
|
| 35 |
+
retriever = get_retriever(vs, k=4)
|
| 36 |
+
|
| 37 |
+
query = "Wie lange habe ich Zeit, eine Prüfungsleistung zu wiederholen?"
|
| 38 |
+
print("Test query:")
|
| 39 |
+
print(" ", query, "\n")
|
| 40 |
+
|
| 41 |
+
retrieved_docs = retriever.invoke(query)
|
| 42 |
+
|
| 43 |
+
print(f"Retriever returned {len(retrieved_docs)} documents.")
|
| 44 |
+
for i, d in enumerate(retrieved_docs, start=1):
|
| 45 |
+
print(f"\n=== DOC {i} ===")
|
| 46 |
+
print(d.page_content[:400], "...")
|
| 47 |
+
print("Metadata:", d.metadata)
|
speech_io.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
speech_io.py
|
| 3 |
+
|
| 4 |
+
Sprachbasierte Ein-/Ausgabe:
|
| 5 |
+
- Speech-to-Text (STT) mit Whisper (transformers.pipeline)
|
| 6 |
+
- Text-to-Speech (TTS) mit MMS-TTS Deutsch
|
| 7 |
+
|
| 8 |
+
Dieses File ist 100% stabil für HuggingFace Spaces.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from typing import Optional, Tuple
|
| 12 |
+
import numpy as np
|
| 13 |
+
import soundfile as sf
|
| 14 |
+
from scipy.signal import butter, filtfilt
|
| 15 |
+
from transformers import pipeline
|
| 16 |
+
|
| 17 |
+
# Modelle
|
| 18 |
+
ASR_MODEL_ID = "openai/whisper-small"
|
| 19 |
+
TTS_MODEL_ID = "facebook/mms-tts-deu"
|
| 20 |
+
|
| 21 |
+
_asr = None
|
| 22 |
+
_tts = None
|
| 23 |
+
|
| 24 |
+
# ========================================================
|
| 25 |
+
# STT PIPELINE
|
| 26 |
+
# ========================================================
|
| 27 |
+
|
| 28 |
+
def get_asr_pipeline():
|
| 29 |
+
global _asr
|
| 30 |
+
if _asr is None:
|
| 31 |
+
print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
|
| 32 |
+
_asr = pipeline(
|
| 33 |
+
task="automatic-speech-recognition",
|
| 34 |
+
model=ASR_MODEL_ID,
|
| 35 |
+
device="cpu",
|
| 36 |
+
return_timestamps=True, # wichtig
|
| 37 |
+
chunk_length_s=30 # auto-chunk für lange audio
|
| 38 |
+
)
|
| 39 |
+
return _asr
|
| 40 |
+
|
| 41 |
+
# ========================================================
|
| 42 |
+
# TTS PIPELINE
|
| 43 |
+
# ========================================================
|
| 44 |
+
|
| 45 |
+
def get_tts_pipeline():
|
| 46 |
+
global _tts
|
| 47 |
+
if _tts is None:
|
| 48 |
+
print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
|
| 49 |
+
_tts = pipeline(
|
| 50 |
+
task="text-to-speech",
|
| 51 |
+
model=TTS_MODEL_ID,
|
| 52 |
+
)
|
| 53 |
+
return _tts
|
| 54 |
+
|
| 55 |
+
# ========================================================
|
| 56 |
+
# AUDIO FILTER – Noise Reduction + Highpass
|
| 57 |
+
# ========================================================
|
| 58 |
+
|
| 59 |
+
def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
|
| 60 |
+
nyq = 0.5 * fs
|
| 61 |
+
norm_cutoff = cutoff / nyq
|
| 62 |
+
b, a = butter(order, norm_cutoff, btype="high")
|
| 63 |
+
return filtfilt(b, a, data)
|
| 64 |
+
|
| 65 |
+
def apply_fade(audio, sr, duration_ms=10):
|
| 66 |
+
fade_samples = int(sr * duration_ms / 1000)
|
| 67 |
+
|
| 68 |
+
if fade_samples * 2 >= len(audio):
|
| 69 |
+
return audio
|
| 70 |
+
|
| 71 |
+
fade_in_curve = np.linspace(0, 1, fade_samples)
|
| 72 |
+
audio[:fade_samples] *= fade_in_curve
|
| 73 |
+
|
| 74 |
+
fade_out_curve = np.linspace(1, 0, fade_samples)
|
| 75 |
+
audio[-fade_samples:] *= fade_out_curve
|
| 76 |
+
|
| 77 |
+
return audio
|
| 78 |
+
|
| 79 |
+
# ========================================================
|
| 80 |
+
# SPEECH-TO-TEXT (STT)
|
| 81 |
+
# ========================================================
|
| 82 |
+
|
| 83 |
+
def transcribe_audio(audio_path: str) -> str:
|
| 84 |
+
"""
|
| 85 |
+
audio_path: path zu WAV-Datei (von gr.Audio type="filepath")
|
| 86 |
+
"""
|
| 87 |
+
|
| 88 |
+
if audio_path is None:
|
| 89 |
+
return ""
|
| 90 |
+
|
| 91 |
+
# WAV einlesen (soundfile garantiert PCM korrekt)
|
| 92 |
+
data, sr = sf.read(audio_path)
|
| 93 |
+
|
| 94 |
+
# immer Mono
|
| 95 |
+
if len(data.shape) > 1:
|
| 96 |
+
data = data[:, 0]
|
| 97 |
+
|
| 98 |
+
# Whisper >30s vermeiden
|
| 99 |
+
MAX_SAMPLES = sr * 30
|
| 100 |
+
if len(data) > MAX_SAMPLES:
|
| 101 |
+
data = data[:MAX_SAMPLES]
|
| 102 |
+
|
| 103 |
+
asr = get_asr_pipeline()
|
| 104 |
+
|
| 105 |
+
print(">>> Transkribiere Audio...")
|
| 106 |
+
result = asr(
|
| 107 |
+
{"array": data, "sampling_rate": sr},
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
text = result.get("text", "").strip()
|
| 111 |
+
print("ASR:", text)
|
| 112 |
+
return text
|
| 113 |
+
|
| 114 |
+
# ========================================================
|
| 115 |
+
# TEXT-TO-SPEECH (TTS)
|
| 116 |
+
# ========================================================
|
| 117 |
+
|
| 118 |
+
def synthesize_speech(text: str):
|
| 119 |
+
if not text or not text.strip():
|
| 120 |
+
return None
|
| 121 |
+
|
| 122 |
+
tts = get_tts_pipeline()
|
| 123 |
+
out = tts(text)
|
| 124 |
+
|
| 125 |
+
# rohes Audio from MMS (float32 [-1, 1])
|
| 126 |
+
audio = np.array(out["audio"], dtype=np.float32)
|
| 127 |
+
sr = out.get("sampling_rate", 16000)
|
| 128 |
+
|
| 129 |
+
# ===== FIX sample_rate =====
|
| 130 |
+
if sr is None or sr <= 0 or sr > 65535:
|
| 131 |
+
sr = 16000
|
| 132 |
+
|
| 133 |
+
# ===== Mono erzwingen =====
|
| 134 |
+
if audio.ndim > 1:
|
| 135 |
+
audio = audio.squeeze()
|
| 136 |
+
if audio.ndim > 1:
|
| 137 |
+
audio = audio[:, 0]
|
| 138 |
+
|
| 139 |
+
# ===== Noise reduction =====
|
| 140 |
+
try:
|
| 141 |
+
audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
|
| 142 |
+
except:
|
| 143 |
+
pass
|
| 144 |
+
|
| 145 |
+
# ===== Normalize =====
|
| 146 |
+
max_val = np.max(np.abs(audio))
|
| 147 |
+
if max_val > 0:
|
| 148 |
+
audio = audio / max_val
|
| 149 |
+
|
| 150 |
+
# ===== Fade gegen pop =====
|
| 151 |
+
audio = apply_fade(audio, sr)
|
| 152 |
+
|
| 153 |
+
# ===== int16 =====
|
| 154 |
+
audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)
|
| 155 |
+
|
| 156 |
+
# Rückgabe: (sr, np.int16 array)
|
| 157 |
+
return (sr, audio_int16)
|
split_documents.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# split_documents.py – v2
|
| 2 |
+
|
| 3 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 4 |
+
|
| 5 |
+
CHUNK_SIZE = 1500
|
| 6 |
+
CHUNK_OVERLAP = 200
|
| 7 |
+
|
| 8 |
+
def split_documents(docs):
|
| 9 |
+
splitter = RecursiveCharacterTextSplitter(
|
| 10 |
+
chunk_size=CHUNK_SIZE,
|
| 11 |
+
chunk_overlap=CHUNK_OVERLAP,
|
| 12 |
+
separators=["\n\n", "\n", ". ", " ", ""],
|
| 13 |
+
)
|
| 14 |
+
chunks = splitter.split_documents(docs)
|
| 15 |
+
|
| 16 |
+
for c in chunks:
|
| 17 |
+
c.metadata["chunk_size"] = CHUNK_SIZE
|
| 18 |
+
c.metadata["chunk_overlap"] = CHUNK_OVERLAP
|
| 19 |
+
|
| 20 |
+
return chunks
|
| 21 |
+
|
| 22 |
+
if __name__ == "__main__":
|
| 23 |
+
from load_documents import load_documents
|
| 24 |
+
docs = load_documents()
|
| 25 |
+
chunks = split_documents(docs)
|
| 26 |
+
print("Docs:", len(docs), "Chunks:", len(chunks))
|
| 27 |
+
print(chunks[0].page_content[:300], chunks[0].metadata)
|
| 28 |
+
|
supabase_client.py
DELETED
|
@@ -1,25 +0,0 @@
|
|
| 1 |
-
# supabase_client.py
|
| 2 |
-
import os
|
| 3 |
-
from supabase import create_client
|
| 4 |
-
|
| 5 |
-
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 6 |
-
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 7 |
-
|
| 8 |
-
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def load_file_bytes(bucket: str, filename: str) -> bytes:
|
| 12 |
-
return supabase.storage.from_(bucket).download(filename)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def match_documents(embedding: list, k: int = 4):
|
| 16 |
-
"""
|
| 17 |
-
Gọi trực tiếp RPC match_documents trong Supabase.
|
| 18 |
-
Trả về list các rows: {content, metadata, embedding?}
|
| 19 |
-
"""
|
| 20 |
-
resp = supabase.rpc(
|
| 21 |
-
"match_documents",
|
| 22 |
-
{"query_embedding": embedding, "match_count": k}
|
| 23 |
-
).execute()
|
| 24 |
-
|
| 25 |
-
return resp.data or []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
upload_weblink_to_supabase.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
from supabase import create_client
|
| 5 |
+
from dotenv import load_dotenv
|
| 6 |
+
|
| 7 |
+
load_dotenv()
|
| 8 |
+
|
| 9 |
+
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 10 |
+
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 11 |
+
|
| 12 |
+
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
|
| 13 |
+
|
| 14 |
+
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 15 |
+
|
| 16 |
+
def extract_paragraphs():
|
| 17 |
+
print(">>> Lade Hochschulgesetz NRW …")
|
| 18 |
+
|
| 19 |
+
html = requests.get(LAW_URL, timeout=30).text
|
| 20 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 21 |
+
|
| 22 |
+
# Tất cả tiêu đề Paragraph xuất hiện trong <h2> hoặc <h3>
|
| 23 |
+
headers = soup.find_all(["h2", "h3"])
|
| 24 |
+
|
| 25 |
+
paragraphs = []
|
| 26 |
+
order = 1
|
| 27 |
+
|
| 28 |
+
for header in headers:
|
| 29 |
+
title = header.get_text(" ", strip=True)
|
| 30 |
+
|
| 31 |
+
if not title.startswith("§"):
|
| 32 |
+
continue # bỏ các h2/h3 không phải Paragraph
|
| 33 |
+
|
| 34 |
+
# Gom toàn bộ nội dung từ header đến trước h2/h3 tiếp theo
|
| 35 |
+
content_parts = []
|
| 36 |
+
sibling = header.find_next_sibling()
|
| 37 |
+
|
| 38 |
+
while sibling and sibling.name not in ["h2", "h3"]:
|
| 39 |
+
text = sibling.get_text(" ", strip=True)
|
| 40 |
+
if text:
|
| 41 |
+
content_parts.append(text)
|
| 42 |
+
sibling = sibling.find_next_sibling()
|
| 43 |
+
|
| 44 |
+
full_content = "\n".join(content_parts).strip()
|
| 45 |
+
|
| 46 |
+
para_id = f"para_{order}"
|
| 47 |
+
|
| 48 |
+
paragraphs.append({
|
| 49 |
+
"abs_id": para_id,
|
| 50 |
+
"title": title,
|
| 51 |
+
"content": full_content,
|
| 52 |
+
"order_index": order
|
| 53 |
+
})
|
| 54 |
+
|
| 55 |
+
order += 1
|
| 56 |
+
|
| 57 |
+
print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
|
| 58 |
+
return paragraphs
|
| 59 |
+
|
| 60 |
+
def upload_to_supabase():
|
| 61 |
+
paras = extract_paragraphs()
|
| 62 |
+
|
| 63 |
+
print(">>> Clear table hg_nrw …")
|
| 64 |
+
supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
|
| 65 |
+
|
| 66 |
+
print(">>> Upload begin …")
|
| 67 |
+
BATCH = 100
|
| 68 |
+
for i in range(0, len(paras), BATCH):
|
| 69 |
+
batch = paras[i:i+BATCH]
|
| 70 |
+
print(f" - Upload batch {i} – {i+len(batch)-1}")
|
| 71 |
+
supabase.table("hg_nrw").upsert(batch).execute()
|
| 72 |
+
|
| 73 |
+
print("✔ DONE uploading complete NRW law.")
|
| 74 |
+
|
| 75 |
+
if __name__ == "__main__":
|
| 76 |
+
upload_to_supabase()
|
vectorstore.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
BƯỚC 4: VECTORSTORE (FAISS in-memory)
|
| 3 |
+
-------------------------------------
|
| 4 |
+
Tạo FAISS index từ các CHUNK văn bản.
|
| 5 |
+
|
| 6 |
+
- Không ghi file .faiss nào, tất cả nằm trong RAM.
|
| 7 |
+
- Embeddings được lấy từ get_embeddings() (Bước 3).
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from langchain_community.vectorstores import FAISS
|
| 11 |
+
from embeddings import get_embeddings
|
| 12 |
+
|
| 13 |
+
def build_vectorstore(chunks):
|
| 14 |
+
"""
|
| 15 |
+
Nhận danh sách Document (đã split) và trả về FAISS VectorStore.
|
| 16 |
+
"""
|
| 17 |
+
print(">>> Initialising embedding model for FAISS index ...")
|
| 18 |
+
embeddings = get_embeddings()
|
| 19 |
+
|
| 20 |
+
print(f">>> Building FAISS index from {len(chunks)} chunks ...")
|
| 21 |
+
vs = FAISS.from_documents(chunks, embeddings)
|
| 22 |
+
print(">>> FAISS index built.\n")
|
| 23 |
+
return vs
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
|
| 26 |
+
# Test toàn pipeline: load -> split -> FAISS -> similarity_search
|
| 27 |
+
from load_documents import load_documents
|
| 28 |
+
from split_documents import split_documents
|
| 29 |
+
|
| 30 |
+
print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n")
|
| 31 |
+
|
| 32 |
+
# 1) Load tài liệu (PDF + HTML) từ HuggingFace
|
| 33 |
+
docs = load_documents()
|
| 34 |
+
|
| 35 |
+
# 2) Split thành chunks
|
| 36 |
+
from pprint import pprint
|
| 37 |
+
print(f"Loaded {len(docs)} raw documents.")
|
| 38 |
+
chunks = split_documents(docs)
|
| 39 |
+
print(f"Split into {len(chunks)} chunks.\n")
|
| 40 |
+
|
| 41 |
+
# 3) Xây FAISS vectorstore
|
| 42 |
+
vectorstore = build_vectorstore(chunks)
|
| 43 |
+
|
| 44 |
+
# 4) Test similarity_search
|
| 45 |
+
query = "Fristen für die Prüfungsanmeldung im Bachelorstudium"
|
| 46 |
+
print("Test query:")
|
| 47 |
+
print(" ", query, "\n")
|
| 48 |
+
|
| 49 |
+
results = vectorstore.similarity_search(query, k=3)
|
| 50 |
+
|
| 51 |
+
print("Top-3 ähnliche Chunks aus dem VectorStore:")
|
| 52 |
+
for i, doc in enumerate(results, start=1):
|
| 53 |
+
print(f"\n=== RESULT {i} ===")
|
| 54 |
+
print(doc.page_content[:400], "...")
|
| 55 |
+
print("Metadata:", doc.metadata)
|
| 56 |
+
|