commit
Browse files
- app.py +65 -127
- chat_history.py +0 -70
- embeddings.py +0 -24
- hg_nrw_supabase.py +0 -99
- ingest.py +73 -0
- llm.py +0 -27
- load_documents.py +0 -104
- rag_pipeline.py +62 -96
- requirements.txt +3 -15
- retriever.py +0 -48
- speech_io.py +0 -52
- split_documents.py +0 -28
- supabase_client.py +12 -0
- vectorstore.py +0 -55
- viewer.py +0 -76
app.py
CHANGED
|
@@ -1,131 +1,69 @@
|
|
| 1 |
# app.py
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
-
from typing import List, Tuple
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
from langchain_community.vectorstores import FAISS
|
| 9 |
-
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
|
| 10 |
-
|
| 11 |
-
from load_documents import load_documents
|
| 12 |
-
from speech_io import transcribe_audio, synthesize_speech
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
# ========== 1. Lade Dokumente ==========
|
| 16 |
-
print("🔹 Lade Dokumente aus Supabase …")
|
| 17 |
-
docs: List[Document] = load_documents()
|
| 18 |
-
print("✔ DOCUMENTS LOADED:", len(docs))
|
| 19 |
-
|
| 20 |
-
print("🔹 Splitte Dokumente …")
|
| 21 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
| 22 |
-
chunk_size=800,
|
| 23 |
-
chunk_overlap=200,
|
| 24 |
-
)
|
| 25 |
-
chunks = text_splitter.split_documents(docs)
|
| 26 |
-
print(f" - {len(chunks)} Chunks erzeugt.")
|
| 27 |
-
|
| 28 |
-
print("🔹 Erzeuge VectorStore …")
|
| 29 |
-
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
|
| 30 |
-
vectorstore = FAISS.from_documents(chunks, embeddings)
|
| 31 |
-
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
|
| 32 |
-
print(">>> Retriever ready.")
|
| 33 |
-
|
| 34 |
-
print("🔹 Lade OpenAI LLM …")
|
| 35 |
-
llm = ChatOpenAI(
|
| 36 |
-
model="gpt-4o-mini",
|
| 37 |
-
temperature=0.1,
|
| 38 |
-
)
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
# ========== 2. RAG ==========
|
| 42 |
-
def build_context(docs: List[Document]) -> str:
|
| 43 |
-
parts = []
|
| 44 |
-
for i, d in enumerate(docs, 1):
|
| 45 |
-
meta = d.metadata
|
| 46 |
-
src = meta.get("source")
|
| 47 |
-
page = meta.get("page")
|
| 48 |
-
abs_id = meta.get("abs_id")
|
| 49 |
-
|
| 50 |
-
label = f"[Quelle {i}] {src}"
|
| 51 |
-
if page:
|
| 52 |
-
label += f", Seite {page}"
|
| 53 |
-
if abs_id:
|
| 54 |
-
label += f", Abs. {abs_id}"
|
| 55 |
-
|
| 56 |
-
parts.append(f"{label}\n{d.page_content}")
|
| 57 |
-
return "\n\n".join(parts)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
def rag_answer(query: str, mode: str):
|
| 61 |
-
retrieved = retriever.invoke(query)
|
| 62 |
-
ctx = build_context(retrieved)
|
| 63 |
-
|
| 64 |
-
modes = {
|
| 65 |
-
"Kurz": "Antworte sehr kurz (max. 3 Sätze).",
|
| 66 |
-
"Standard": "Antworte ausführlich und verständlich.",
|
| 67 |
-
"Juristisch Präzise": "Formuliere juristisch präzise.",
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
messages = [
|
| 71 |
-
{
|
| 72 |
-
"role": "system",
|
| 73 |
-
"content": "Du bist ein Chatbot für Prüfungsrecht. Antworte nur auf Deutsch."
|
| 74 |
-
},
|
| 75 |
-
{
|
| 76 |
-
"role": "user",
|
| 77 |
-
"content": f"FRAGE:\n{query}\n\nKONTEXT:\n{ctx}\n\n{modes[mode]}"
|
| 78 |
-
}
|
| 79 |
-
]
|
| 80 |
-
|
| 81 |
-
resp = llm.invoke(messages)
|
| 82 |
-
return resp.content, retrieved
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
# ========== 3. Chatbot Funktionen (GRADIO 4.x – TUPLES) ==========
|
| 86 |
-
def chatbot_text(user_input: str, history: List[Tuple[str, str]], mode: str):
|
| 87 |
-
answer, _ = rag_answer(user_input, mode)
|
| 88 |
-
history = history + [(user_input, answer)]
|
| 89 |
-
return history, history
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
def chatbot_voice(audio_file: str, history: List[Tuple[str, str]], mode: str, language_hint: str):
|
| 93 |
-
user_text = transcribe_audio(audio_file, language_hint or None)
|
| 94 |
-
answer, _ = rag_answer(user_text, mode)
|
| 95 |
-
audio_out = synthesize_speech(answer)
|
| 96 |
-
|
| 97 |
-
history = history + [(user_text, answer)]
|
| 98 |
-
return history, audio_out, user_text, history
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
# ========== 4. UI ==========
|
| 102 |
-
with gr.Blocks(title="Prüfungsrechts-Chatbot") as demo:
|
| 103 |
-
|
| 104 |
-
with gr.Tab("💬 Text-Chat"):
|
| 105 |
-
mode = gr.Radio(["Kurz", "Standard", "Juristisch Präzise"], value="Standard")
|
| 106 |
-
chat = gr.Chatbot()
|
| 107 |
-
state = gr.State([])
|
| 108 |
-
inp = gr.Textbox(label="Frage eingeben")
|
| 109 |
-
send = gr.Button("Senden")
|
| 110 |
-
|
| 111 |
-
send.click(chatbot_text, [inp, state, mode], [chat, state])
|
| 112 |
-
|
| 113 |
-
with gr.Tab("🎙️ Sprach-Chat"):
|
| 114 |
-
mode_v = gr.Radio(["Kurz", "Standard", "Juristisch Präzise"], value="Standard")
|
| 115 |
-
chat_v = gr.Chatbot()
|
| 116 |
-
state_v = gr.State([])
|
| 117 |
-
|
| 118 |
-
mic = gr.Audio(sources=["microphone"], type="filepath")
|
| 119 |
-
lang = gr.Textbox(label="Sprache (optional: de/en/vi)")
|
| 120 |
-
out_audio = gr.Audio()
|
| 121 |
-
transcript = gr.Textbox(label="Transkript")
|
| 122 |
-
|
| 123 |
-
btn = gr.Button("Sprechen")
|
| 124 |
-
btn.click(
|
| 125 |
-
chatbot_voice,
|
| 126 |
-
[mic, state_v, mode_v, lang],
|
| 127 |
-
[chat_v, out_audio, transcript, state_v]
|
| 128 |
-
)
|
| 129 |
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# app.py
|
| 2 |
+
import gradio as gr
|
| 3 |
+
from openai import OpenAI
|
| 4 |
import os
|
|
|
|
| 5 |
|
| 6 |
+
from rag_pipeline import rag_answer
|
| 7 |
+
|
| 8 |
+
client = OpenAI()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
+
PDF_URL = os.environ["PDF_URL"]
|
| 11 |
+
HG_URL = os.environ["HG_URL"]
|
| 12 |
+
|
| 13 |
+
def transcribe(audio_path):
    """Transcribe a recorded audio file to text via the Whisper API.

    Returns an empty string when no recording was supplied, so callers can
    treat the microphone input as optional.
    """
    if audio_path is None:
        return ""

    with open(audio_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
        )
    return transcription.text
|
| 23 |
+
|
| 24 |
+
def chat_fn(text, audio, history):
    """Handle one chat turn combining typed text and/or microphone input.

    Args:
        text: Typed question (may be empty).
        audio: Filepath of a recorded audio clip, or None.
        history: Chatbot history as a list of (user, bot) tuples; Gradio may
            pass None on the first turn.

    Returns:
        Tuple of (updated history, HTML string listing the retrieved sources).
    """
    from html import escape  # function-scope: `html` would clash with local names below

    # Normalize up front: the original crashed with AttributeError on
    # `history.append` when Gradio supplied None.
    history = history or []

    # Microphone input → text
    spoken_text = transcribe(audio)

    if text and spoken_text:
        question = f"{text}\n(Gesprochen: {spoken_text})"
    elif spoken_text:
        question = spoken_text
    else:
        question = text or ""

    if not question:
        return history, "<p>Bitte Text oder Mikrofon benutzen.</p>"

    answer, docs = rag_answer(question, history)

    # Render the retrieved chunks as a clickable source list. Document text is
    # escaped so stray '<'/'&' in the PDFs/HTML cannot break (or inject into)
    # the rendered markup.
    items = []
    for d in docs:
        src = d.metadata.get("source", "?")
        page = d.metadata.get("page", "")
        url = PDF_URL if "Prüfungsordnung" in src else HG_URL
        label = escape(f"{src} {page}")
        snippet = escape(d.page_content[:200])
        items.append(
            f"<li><a target='_blank' href='{url}'>{label}</a><br>{snippet}...</li>"
        )
    sources_html = "<ol>" + "".join(items) + "</ol>"

    history.append((question, answer))
    return history, sources_html
|
| 51 |
+
|
| 52 |
+
# ========== UI ==========
with gr.Blocks() as demo:
    gr.Markdown("# ⚖️ Sprachbasierter Chatbot für Prüfungsrecht")

    with gr.Row():
        with gr.Column(scale=3):
            chat = gr.Chatbot()
            text = gr.Textbox(label="Text Eingabe")
            # Gradio 4.x renamed `source=` to `sources=[...]`; the old keyword
            # raises TypeError on the gradio version this repo installs.
            audio = gr.Audio(sources=["microphone"], type="filepath")
            send = gr.Button("Senden")

        with gr.Column(scale=2):
            # Embedded views of the two legal source documents.
            gr.HTML(f"<iframe src='{PDF_URL}' width='100%' height='250'></iframe>")
            gr.HTML(f"<iframe src='{HG_URL}' width='100%' height='250'></iframe>")
            sources = gr.HTML()

    send.click(chat_fn, inputs=[text, audio, chat], outputs=[chat, sources])

demo.launch()
|
chat_history.py
DELETED
|
@@ -1,70 +0,0 @@
|
|
| 1 |
-
# chat_history.py – Supabase chat history (messages-format für Gradio)
|
| 2 |
-
|
| 3 |
-
import uuid
|
| 4 |
-
import os
|
| 5 |
-
from supabase import create_client
|
| 6 |
-
|
| 7 |
-
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 8 |
-
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 9 |
-
|
| 10 |
-
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
# ------------------------------------------------------
|
| 14 |
-
# Session anlegen
|
| 15 |
-
# ------------------------------------------------------
|
| 16 |
-
|
| 17 |
-
def create_session() -> str:
|
| 18 |
-
session_id = str(uuid.uuid4())
|
| 19 |
-
|
| 20 |
-
supabase.table("chat_sessions").insert({
|
| 21 |
-
"id": session_id
|
| 22 |
-
}).execute()
|
| 23 |
-
|
| 24 |
-
return session_id
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
# ------------------------------------------------------
|
| 28 |
-
# Message speichern
|
| 29 |
-
# ------------------------------------------------------
|
| 30 |
-
|
| 31 |
-
def save_message(session_id: str, role: str, content: str):
|
| 32 |
-
if not session_id or session_id == "None":
|
| 33 |
-
print("⚠ WARN: invalid session_id, skip save_message")
|
| 34 |
-
return
|
| 35 |
-
|
| 36 |
-
supabase.table("chat_messages").insert({
|
| 37 |
-
"session_id": session_id,
|
| 38 |
-
"role": role,
|
| 39 |
-
"content": content,
|
| 40 |
-
}).execute()
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
# ------------------------------------------------------
|
| 44 |
-
# History laden – Format: [{role: ..., content: ...}, ...]
|
| 45 |
-
# ------------------------------------------------------
|
| 46 |
-
|
| 47 |
-
def load_history(session_id: str):
|
| 48 |
-
if not session_id or session_id == "None":
|
| 49 |
-
return []
|
| 50 |
-
|
| 51 |
-
res = (
|
| 52 |
-
supabase.table("chat_messages")
|
| 53 |
-
.select("*")
|
| 54 |
-
.eq("session_id", session_id)
|
| 55 |
-
.order("created_at")
|
| 56 |
-
.execute()
|
| 57 |
-
)
|
| 58 |
-
|
| 59 |
-
rows = res.data or []
|
| 60 |
-
|
| 61 |
-
history = []
|
| 62 |
-
for r in rows:
|
| 63 |
-
history.append(
|
| 64 |
-
{
|
| 65 |
-
"role": r["role"],
|
| 66 |
-
"content": r["content"],
|
| 67 |
-
}
|
| 68 |
-
)
|
| 69 |
-
|
| 70 |
-
return history
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
embeddings.py
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
# embeddings.py – OpenAI Version (text-embedding-3-small)
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
from langchain_openai import OpenAIEmbeddings
|
| 5 |
-
|
| 6 |
-
EMBED_MODEL = "text-embedding-3-small"
|
| 7 |
-
|
| 8 |
-
def get_embeddings():
|
| 9 |
-
api_key = os.environ.get("OPENAI_API_KEY")
|
| 10 |
-
if not api_key:
|
| 11 |
-
raise RuntimeError(
|
| 12 |
-
"OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
|
| 13 |
-
)
|
| 14 |
-
|
| 15 |
-
print(f">>> Lade OpenAI Embedding Model: {EMBED_MODEL}")
|
| 16 |
-
emb = OpenAIEmbeddings(
|
| 17 |
-
model=EMBED_MODEL,
|
| 18 |
-
api_key=api_key,
|
| 19 |
-
)
|
| 20 |
-
return emb
|
| 21 |
-
|
| 22 |
-
if __name__ == "__main__":
|
| 23 |
-
e = get_embeddings()
|
| 24 |
-
print(e.embed_query("Test"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
hg_nrw_supabase.py
DELETED
|
@@ -1,99 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
hg_nrw_supabase.py
|
| 3 |
-
|
| 4 |
-
Lädt das Hochschulgesetz NRW von recht.nrw.de,
|
| 5 |
-
extrahiert alle Paragraphen (§ …) und schreibt sie in
|
| 6 |
-
die Supabase-Tabelle public.hg_nrw.
|
| 7 |
-
|
| 8 |
-
Erwartete Spalten in hg_nrw:
|
| 9 |
-
- abs_id text (z.B. 'para_64')
|
| 10 |
-
- title text (z.B. '§ 64 Prüfungsordnungen')
|
| 11 |
-
- content text (Volltext)
|
| 12 |
-
- order_index int4 (Sortierreihenfolge)
|
| 13 |
-
- source_url text (immer die Original-URL von recht.nrw.de)
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import os
|
| 17 |
-
import requests
|
| 18 |
-
from bs4 import BeautifulSoup
|
| 19 |
-
from supabase import create_client
|
| 20 |
-
from dotenv import load_dotenv
|
| 21 |
-
|
| 22 |
-
load_dotenv()
|
| 23 |
-
|
| 24 |
-
SUPABASE_URL = os.environ["SUPABASE_URL"]
|
| 25 |
-
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
|
| 26 |
-
|
| 27 |
-
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
|
| 28 |
-
|
| 29 |
-
# Nur DIESE URL, keine Druckversion:
|
| 30 |
-
LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def extract_paragraphs():
|
| 34 |
-
print(">>> Lade Hochschulgesetz NRW von recht.nrw.de …")
|
| 35 |
-
|
| 36 |
-
html = requests.get(LAW_URL, timeout=30).text
|
| 37 |
-
soup = BeautifulSoup(html, "html.parser")
|
| 38 |
-
|
| 39 |
-
# Alle Überschriften <h2>/<h3>, viele davon sind §§
|
| 40 |
-
headers = soup.find_all(["h2", "h3"])
|
| 41 |
-
|
| 42 |
-
paragraphs = []
|
| 43 |
-
order = 1
|
| 44 |
-
|
| 45 |
-
for header in headers:
|
| 46 |
-
title = header.get_text(" ", strip=True)
|
| 47 |
-
|
| 48 |
-
# Nur Überschriften, die mit "§" anfangen
|
| 49 |
-
if not title.startswith("§"):
|
| 50 |
-
continue
|
| 51 |
-
|
| 52 |
-
# Inhalte ab dieser Überschrift bis vor die nächste h2/h3
|
| 53 |
-
content_parts = []
|
| 54 |
-
sibling = header.find_next_sibling()
|
| 55 |
-
|
| 56 |
-
while sibling and sibling.name not in ["h2", "h3"]:
|
| 57 |
-
text = sibling.get_text(" ", strip=True)
|
| 58 |
-
if text:
|
| 59 |
-
content_parts.append(text)
|
| 60 |
-
sibling = sibling.find_next_sibling()
|
| 61 |
-
|
| 62 |
-
full_content = "\n".join(content_parts).strip()
|
| 63 |
-
abs_id = f"para_{order}"
|
| 64 |
-
|
| 65 |
-
paragraphs.append(
|
| 66 |
-
{
|
| 67 |
-
"abs_id": abs_id,
|
| 68 |
-
"title": title,
|
| 69 |
-
"content": full_content,
|
| 70 |
-
"order_index": order,
|
| 71 |
-
# dùng trực tiếp web link gốc, không thêm anchor
|
| 72 |
-
"source_url": LAW_URL,
|
| 73 |
-
}
|
| 74 |
-
)
|
| 75 |
-
|
| 76 |
-
order += 1
|
| 77 |
-
|
| 78 |
-
print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
|
| 79 |
-
return paragraphs
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def upload_to_supabase():
|
| 83 |
-
paras = extract_paragraphs()
|
| 84 |
-
|
| 85 |
-
print(">>> Clear table hg_nrw …")
|
| 86 |
-
supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
|
| 87 |
-
|
| 88 |
-
print(">>> Upload begin …")
|
| 89 |
-
BATCH = 100
|
| 90 |
-
for i in range(0, len(paras), BATCH):
|
| 91 |
-
batch = paras[i : i + BATCH]
|
| 92 |
-
print(f" - Upload batch {i} – {i + len(batch) - 1}")
|
| 93 |
-
supabase.table("hg_nrw").upsert(batch).execute()
|
| 94 |
-
|
| 95 |
-
print("✔ DONE uploading complete NRW law.")
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
if __name__ == "__main__":
|
| 99 |
-
upload_to_supabase()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ingest.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ingest.py
|
| 2 |
+
import os
|
| 3 |
+
from io import BytesIO
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
from pypdf import PdfReader
|
| 6 |
+
|
| 7 |
+
from supabase_client import supabase, load_file_bytes
|
| 8 |
+
from langchain_openai import OpenAIEmbeddings
|
| 9 |
+
from langchain_community.vectorstores import SupabaseVectorStore
|
| 10 |
+
from langchain_core.documents import Document
|
| 11 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 12 |
+
|
| 13 |
+
BUCKET = os.environ["SUPABASE_BUCKET"]
|
| 14 |
+
|
| 15 |
+
def load_pdf_docs():
    """Load the Prüfungsordnung PDF from Supabase storage.

    Returns one Document per PDF page, tagged with its 1-based page number.
    """
    raw = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    reader = PdfReader(BytesIO(raw))

    return [
        Document(
            page_content=page.extract_text() or "",
            metadata={"source": "Prüfungsordnung", "page": page_no},
        )
        for page_no, page in enumerate(reader.pages, start=1)
    ]
|
| 29 |
+
|
| 30 |
+
def load_html_docs():
    """Load the Hochschulgesetz HTML from Supabase storage.

    Returns a single Document containing the page's plain text.
    """
    raw = load_file_bytes(BUCKET, "hochschulgesetz.html")
    markup = raw.decode("utf-8", errors="ignore")
    plain_text = BeautifulSoup(markup, "html.parser").get_text(separator="\n")

    doc = Document(
        page_content=plain_text,
        metadata={"source": "Hochschulgesetz NRW"},
    )
    return [doc]
|
| 42 |
+
|
| 43 |
+
def chunk_docs(docs):
    """Split documents into overlapping chunks suitable for embedding."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    return text_splitter.split_documents(docs)
|
| 49 |
+
|
| 50 |
+
def main():
    """Ingest both legal source documents into the Supabase vector store."""
    all_docs = load_pdf_docs() + load_html_docs()
    chunks = chunk_docs(all_docs)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # NOTE: `chunk_size` here is the embedding upload batch size,
    # not the text chunk size (that is set in chunk_docs).
    SupabaseVectorStore.from_documents(
        chunks,
        embeddings,
        client=supabase,
        table_name="documents",
        query_name="match_documents",
        chunk_size=200,
    )

    print("Ingest OK (no local files).")


if __name__ == "__main__":
    main()
|
llm.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
| 1 |
-
# llm.py – OpenAI Chatmodell für RAG
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
from langchain_openai import ChatOpenAI
|
| 5 |
-
|
| 6 |
-
CHAT_MODEL = "gpt-4o-mini" # günstig & stark
|
| 7 |
-
|
| 8 |
-
def load_llm():
|
| 9 |
-
api_key = os.environ.get("OPENAI_API_KEY")
|
| 10 |
-
if not api_key:
|
| 11 |
-
raise RuntimeError(
|
| 12 |
-
"OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
|
| 13 |
-
)
|
| 14 |
-
|
| 15 |
-
print(f">>> Lade OpenAI Chatmodell: {CHAT_MODEL}")
|
| 16 |
-
|
| 17 |
-
llm = ChatOpenAI(
|
| 18 |
-
model=CHAT_MODEL,
|
| 19 |
-
temperature=0.0, # deterministisch, wenig Halluzination
|
| 20 |
-
api_key=api_key,
|
| 21 |
-
)
|
| 22 |
-
return llm
|
| 23 |
-
|
| 24 |
-
if __name__ == "__main__":
|
| 25 |
-
llm = load_llm()
|
| 26 |
-
print(llm.invoke("Sag einen Satz zum Prüfungsrecht.").content)
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
load_documents.py
DELETED
|
@@ -1,104 +0,0 @@
|
|
| 1 |
-
# load_documents.py
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
from io import BytesIO
|
| 5 |
-
from typing import List
|
| 6 |
-
|
| 7 |
-
from dotenv import load_dotenv
|
| 8 |
-
from supabase import create_client, Client
|
| 9 |
-
from pypdf import PdfReader
|
| 10 |
-
from langchain_core.documents import Document
|
| 11 |
-
|
| 12 |
-
load_dotenv()
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
# ============== Supabase Init ==============
|
| 16 |
-
def get_supabase_client() -> Client:
|
| 17 |
-
url = os.getenv("SUPABASE_URL")
|
| 18 |
-
key = (
|
| 19 |
-
os.getenv("SUPABASE_SERVICE_ROLE_KEY")
|
| 20 |
-
or os.getenv("SUPABASE_SERVICE_ROLE")
|
| 21 |
-
or os.getenv("SUPABASE_KEY")
|
| 22 |
-
)
|
| 23 |
-
if not url or not key:
|
| 24 |
-
raise RuntimeError("Supabase ENV fehlen.")
|
| 25 |
-
|
| 26 |
-
return create_client(url, key)
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# ============== HG NRW Paragraphen ==============
|
| 30 |
-
def load_hg_paragraphs(supabase: Client) -> List[Document]:
|
| 31 |
-
print(">>> Lade Hochschulgesetz NRW (§) aus Supabase…")
|
| 32 |
-
|
| 33 |
-
table = os.getenv("HG_TABLE_NAME", "hg_nrw")
|
| 34 |
-
rows = supabase.table(table).select("*").order("order_index").execute().data or []
|
| 35 |
-
|
| 36 |
-
docs = []
|
| 37 |
-
for row in rows:
|
| 38 |
-
text = (row.get("title", "") + "\n\n" + row.get("content", "")).strip()
|
| 39 |
-
if not text:
|
| 40 |
-
continue
|
| 41 |
-
|
| 42 |
-
docs.append(Document(
|
| 43 |
-
page_content=text,
|
| 44 |
-
metadata={
|
| 45 |
-
"source": "Hochschulgesetz NRW",
|
| 46 |
-
"abs_id": row.get("abs_id"),
|
| 47 |
-
"order_index": row.get("order_index"),
|
| 48 |
-
"url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654",
|
| 49 |
-
"type": "law",
|
| 50 |
-
}
|
| 51 |
-
))
|
| 52 |
-
|
| 53 |
-
print(f" - {len(docs)} Paragraphen geladen.")
|
| 54 |
-
return docs
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
# ============== Prüfungsordnung PDF ==============
|
| 58 |
-
def load_pruefungsordnung_from_storage(supabase: Client) -> List[Document]:
|
| 59 |
-
bucket = os.getenv("PRUEF_BUCKET")
|
| 60 |
-
pdf_path = os.getenv("PRUEF_PDF_PATH")
|
| 61 |
-
|
| 62 |
-
if not bucket or not pdf_path:
|
| 63 |
-
print(">>> Keine Prüfungsordnung-PDF definiert.")
|
| 64 |
-
return []
|
| 65 |
-
|
| 66 |
-
print(">>> Lade Prüfungsordnung PDF …")
|
| 67 |
-
|
| 68 |
-
try:
|
| 69 |
-
data = supabase.storage.from_(bucket).download(pdf_path)
|
| 70 |
-
except Exception as e:
|
| 71 |
-
print(" Fehler beim PDF Download:", e)
|
| 72 |
-
return []
|
| 73 |
-
|
| 74 |
-
reader = PdfReader(BytesIO(data))
|
| 75 |
-
docs = []
|
| 76 |
-
|
| 77 |
-
for i, page in enumerate(reader.pages):
|
| 78 |
-
text = (page.extract_text() or "").strip()
|
| 79 |
-
if not text:
|
| 80 |
-
continue
|
| 81 |
-
|
| 82 |
-
docs.append(Document(
|
| 83 |
-
page_content=text,
|
| 84 |
-
metadata={
|
| 85 |
-
"source": "Prüfungsordnung (PDF)",
|
| 86 |
-
"page": i + 1,
|
| 87 |
-
"type": "pruefungsordnung",
|
| 88 |
-
}
|
| 89 |
-
))
|
| 90 |
-
|
| 91 |
-
print(f" - {len(docs)} PDF-Seiten geladen.")
|
| 92 |
-
return docs
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
# ============== Main Loader ==============
|
| 96 |
-
def load_documents() -> List[Document]:
|
| 97 |
-
supabase = get_supabase_client()
|
| 98 |
-
docs = []
|
| 99 |
-
|
| 100 |
-
docs += load_hg_paragraphs(supabase)
|
| 101 |
-
docs += load_pruefungsordnung_from_storage(supabase)
|
| 102 |
-
|
| 103 |
-
print(f"✔ DOCUMENTS LOADED: {len(docs)}")
|
| 104 |
-
return docs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rag_pipeline.py
CHANGED
|
@@ -1,100 +1,66 @@
|
|
| 1 |
-
# rag_pipeline.py
|
| 2 |
-
|
| 3 |
-
from
|
| 4 |
-
from
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
def
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
"id": i + 1,
|
| 34 |
-
"source": src,
|
| 35 |
-
"page": page + 1 if isinstance(page, int) else None,
|
| 36 |
-
"url": url,
|
| 37 |
-
"snippet": snippet,
|
| 38 |
-
})
|
| 39 |
-
|
| 40 |
-
return srcs
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
def format_context(docs):
|
| 44 |
-
if not docs:
|
| 45 |
-
return "(Kein relevanter Kontext gefunden.)"
|
| 46 |
-
|
| 47 |
-
out = []
|
| 48 |
for i, d in enumerate(docs):
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
""
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
|
| 76 |
-
docs = retriever.invoke(question)
|
| 77 |
-
context_str = format_context(docs)
|
| 78 |
-
|
| 79 |
-
human = f"""
|
| 80 |
-
FRAGE:
|
| 81 |
-
{question}
|
| 82 |
-
|
| 83 |
-
NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
|
| 84 |
-
{context_str}
|
| 85 |
-
|
| 86 |
-
AUFGABE:
|
| 87 |
-
Erstelle eine juristisch korrekte Antwort ausschließlich basierend
|
| 88 |
-
auf diesem Kontext. Falls der Kontext unzureichend ist, sage das klar.
|
| 89 |
-
"""
|
| 90 |
-
|
| 91 |
-
msgs = [
|
| 92 |
-
SystemMessage(content=SYSTEM_PROMPT),
|
| 93 |
-
HumanMessage(content=human),
|
| 94 |
]
|
| 95 |
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
-
return answer_text, sources
|
|
|
|
| 1 |
+
# rag_pipeline.py
|
| 2 |
+
import os
|
| 3 |
+
from datetime import date
|
| 4 |
+
from supabase_client import supabase
|
| 5 |
+
|
| 6 |
+
from langchain_community.vectorstores import SupabaseVectorStore
|
| 7 |
+
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
|
| 8 |
+
|
| 9 |
+
def get_vectorstore():
    """Return a SupabaseVectorStore bound to the shared Supabase client."""
    return SupabaseVectorStore(
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
        client=supabase,
        table_name="documents",
        query_name="match_documents",
    )
|
| 19 |
+
|
| 20 |
+
def save_message(role, message):
    """Persist one chat message to the Supabase `chat_history` table.

    Messages are grouped by calendar date via `session_date`.
    """
    row = {
        "session_date": date.today().isoformat(),
        "role": role,
        "message": message,
    }
    supabase.table("chat_history").insert(row).execute()
|
| 26 |
+
|
| 27 |
+
def rag_answer(question, history):
|
| 28 |
+
retriever = get_vectorstore().as_retriever(search_kwargs={"k": 4})
|
| 29 |
+
docs = retriever.get_relevant_documents(question)
|
| 30 |
+
|
| 31 |
+
# Build context
|
| 32 |
+
context = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
for i, d in enumerate(docs):
|
| 34 |
+
src = d.metadata.get("source", "?")
|
| 35 |
+
pg = d.metadata.get("page", "")
|
| 36 |
+
pg = f"(Seite {pg})" if pg else ""
|
| 37 |
+
context += f"[Quelle {i+1}] {src} {pg}\n{d.page_content}\n\n"
|
| 38 |
+
|
| 39 |
+
# Build history text
|
| 40 |
+
hist = ""
|
| 41 |
+
for u, b in history:
|
| 42 |
+
hist += f"User: {u}\nAssistant: {b}\n"
|
| 43 |
+
|
| 44 |
+
system_prompt = (
|
| 45 |
+
"Du bist ein Sprachbasierter Chatbot für Prüfungsrecht. "
|
| 46 |
+
"Nutze NUR die bereitgestellten Dokumente."
|
| 47 |
+
"Zitiere immer [Quelle X]."
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.1)
|
| 51 |
+
|
| 52 |
+
msg = [
|
| 53 |
+
("system", system_prompt),
|
| 54 |
+
("user",
|
| 55 |
+
f"Frage: {question}\n\n"
|
| 56 |
+
f"Vorheriger Chatverlauf:\n{hist}\n\n"
|
| 57 |
+
f"Dokumente:\n{context}"
|
| 58 |
+
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
]
|
| 60 |
|
| 61 |
+
answer = llm.invoke(msg).content
|
| 62 |
+
|
| 63 |
+
save_message("user", question)
|
| 64 |
+
save_message("assistant", answer)
|
| 65 |
|
| 66 |
+
return answer, docs
|
|
|
requirements.txt
CHANGED
|
@@ -1,21 +1,9 @@
|
|
| 1 |
-
# === UI ===
|
| 2 |
-
gradio
|
| 3 |
-
gradio_pdf
|
| 4 |
-
|
| 5 |
-
# === Core RAG + LangChain ===
|
| 6 |
langchain
|
| 7 |
langchain-community
|
| 8 |
-
langchain-text-splitters
|
| 9 |
langchain-openai
|
| 10 |
-
|
| 11 |
-
# === OpenAI SDK (LLM, Embeddings, Audio) ===
|
| 12 |
-
openai>=1.35.0
|
| 13 |
-
|
| 14 |
-
# === VectorStore ===
|
| 15 |
-
faiss-cpu
|
| 16 |
-
|
| 17 |
-
# === Supabase + Dokumente laden ===
|
| 18 |
supabase
|
|
|
|
| 19 |
pypdf
|
| 20 |
-
|
| 21 |
python-dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
langchain
|
| 2 |
langchain-community
|
|
|
|
| 3 |
langchain-openai
|
| 4 |
+
openai
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
supabase
|
| 6 |
+
gradio
|
| 7 |
pypdf
|
| 8 |
+
beautifulsoup4
|
| 9 |
python-dotenv
|
retriever.py
DELETED
|
@@ -1,48 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
BƯỚC 5: RETRIEVER
|
| 3 |
-
-----------------
|
| 4 |
-
Tạo LangChain Retriever từ FAISS VectorStore.
|
| 5 |
-
|
| 6 |
-
Retriever sẽ dùng trong bước RAG sau này:
|
| 7 |
-
- retriever.get_relevant_documents(query)
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
from langchain_community.vectorstores import FAISS
|
| 11 |
-
|
| 12 |
-
# số chunk sẽ lấy cho mỗi câu hỏi
|
| 13 |
-
RETRIEVER_K = 4
|
| 14 |
-
|
| 15 |
-
def get_retriever(vectorstore: FAISS, k: int = RETRIEVER_K):
|
| 16 |
-
"""
|
| 17 |
-
Tạo retriever từ FAISS VectorStore.
|
| 18 |
-
"""
|
| 19 |
-
print(f">>> Creating retriever with k={k} ...")
|
| 20 |
-
retriever = vectorstore.as_retriever(search_kwargs={"k": k})
|
| 21 |
-
print(">>> Retriever ready.\n")
|
| 22 |
-
return retriever
|
| 23 |
-
|
| 24 |
-
if __name__ == "__main__":
|
| 25 |
-
# Test: load -> split -> FAISS -> retriever.get_relevant_documents()
|
| 26 |
-
from load_documents import load_documents
|
| 27 |
-
from split_documents import split_documents
|
| 28 |
-
from vectorstore import build_vectorstore
|
| 29 |
-
|
| 30 |
-
print("=== TEST: retriever.get_relevant_documents ===\n")
|
| 31 |
-
|
| 32 |
-
docs = load_documents()
|
| 33 |
-
chunks = split_documents(docs)
|
| 34 |
-
vs = build_vectorstore(chunks)
|
| 35 |
-
retriever = get_retriever(vs, k=4)
|
| 36 |
-
|
| 37 |
-
query = "Wie lange habe ich Zeit, eine Prüfungsleistung zu wiederholen?"
|
| 38 |
-
print("Test query:")
|
| 39 |
-
print(" ", query, "\n")
|
| 40 |
-
|
| 41 |
-
retrieved_docs = retriever.invoke(query)
|
| 42 |
-
|
| 43 |
-
print(f"Retriever returned {len(retrieved_docs)} documents.")
|
| 44 |
-
for i, d in enumerate(retrieved_docs, start=1):
|
| 45 |
-
print(f"\n=== DOC {i} ===")
|
| 46 |
-
print(d.page_content[:400], "...")
|
| 47 |
-
print("Metadata:", d.metadata)
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
speech_io.py
DELETED
|
@@ -1,52 +0,0 @@
|
|
| 1 |
-
# speech_io.py
|
| 2 |
-
import os
|
| 3 |
-
from tempfile import NamedTemporaryFile
|
| 4 |
-
from typing import Optional
|
| 5 |
-
from openai import OpenAI
|
| 6 |
-
|
| 7 |
-
# Module-wide OpenAI client; the API key is read from the environment.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# ======================
# 1. Speech-to-Text (STT)
# ======================
def transcribe_audio(file_path: str, language: Optional[str] = None) -> str:
    """Transcribe an audio file via the OpenAI Audio Transcription API
    (model ``gpt-4o-mini-transcribe``) and return the plain text.
    """
    print(">>> Transkribiere Audio via OpenAI Audio API …")

    with open(file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="gpt-4o-mini-transcribe",
            file=audio_file,
            language=language,
        )
    return transcription.text
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
# ======================
# 2. Text-to-Speech (TTS)
# ======================
def synthesize_speech(text: str, voice: str = "alloy") -> str:
    """Convert *text* to speech with OpenAI TTS (gpt-4o-mini-tts).

    Writes the result to a temporary MP3 file and returns its path.
    ``delete=False`` keeps the file on disk so the caller (e.g. a UI
    audio component) can read it afterwards.
    """
    print(">>> Synthesizing speech via OpenAI TTS …")

    response = client.audio.speech.create(
        model="gpt-4o-mini-tts",
        voice=voice,
        input=text,
    )

    # HF Spaces + OpenAI SDK v2.x → raw bytes
    audio_bytes = response.read()

    # Fix: use a context manager so the handle is closed even if the
    # write raises (the original closed it manually, leaking on error).
    with NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        tmp.write(audio_bytes)

    return tmp.name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
split_documents.py
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
# split_documents.py – v2
|
| 2 |
-
|
| 3 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 4 |
-
|
| 5 |
-
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 200


def split_documents(docs):
    """Split raw documents into overlapping text chunks.

    Each chunk's metadata records the chunk size and overlap used, so
    the splitting configuration stays traceable downstream.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separators=["\n\n", "\n", ". ", " ", ""],
    ).split_documents(docs)

    for chunk in chunks:
        chunk.metadata["chunk_size"] = CHUNK_SIZE
        chunk.metadata["chunk_overlap"] = CHUNK_OVERLAP

    return chunks
|
| 21 |
-
|
| 22 |
-
if __name__ == "__main__":
    # Quick smoke test against the real document loader.
    from load_documents import load_documents

    raw_docs = load_documents()
    split_chunks = split_documents(raw_docs)
    print("Docs:", len(raw_docs), "Chunks:", len(split_chunks))
    print(split_chunks[0].page_content[:300], split_chunks[0].metadata)
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
supabase_client.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# supabase_client.py
|
| 2 |
+
import os
|
| 3 |
+
from supabase import create_client
|
| 4 |
+
|
| 5 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")

# Fail fast with a clear message if the Space secrets are missing
# (consistent with the env-check pattern used by viewer.py, instead of
# an opaque KeyError from os.environ[...]).
if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
    raise RuntimeError("Missing SUPABASE_URL / SUPABASE_SERVICE_KEY")

# Single shared client for the whole app (service key).
supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)


def load_file_bytes(bucket: str, filename: str) -> bytes:
    """Download a file from Supabase Storage WITHOUT writing it to local
    disk — the raw bytes are returned instead."""
    return supabase.storage.from_(bucket).download(filename)
|
vectorstore.py
DELETED
|
@@ -1,55 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
BƯỚC 4: VECTORSTORE (FAISS in-memory)
|
| 3 |
-
-------------------------------------
|
| 4 |
-
Tạo FAISS index từ các CHUNK văn bản.
|
| 5 |
-
|
| 6 |
-
- Không ghi file .faiss nào, tất cả nằm trong RAM.
|
| 7 |
-
- Embeddings được lấy từ get_embeddings() (Bước 3).
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
from langchain_community.vectorstores import FAISS
|
| 11 |
-
from embeddings import get_embeddings
|
| 12 |
-
|
| 13 |
-
def build_vectorstore(chunks):
    """Build an in-memory FAISS vector store from split Document chunks."""
    print(">>> Initialising embedding model for FAISS index ...")
    embedding_model = get_embeddings()

    print(f">>> Building FAISS index from {len(chunks)} chunks ...")
    index = FAISS.from_documents(chunks, embedding_model)
    print(">>> FAISS index built.\n")
    return index
|
| 24 |
-
|
| 25 |
-
if __name__ == "__main__":
    # Full-pipeline smoke test: load -> split -> FAISS -> similarity_search
    from load_documents import load_documents
    from split_documents import split_documents

    print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n")

    # 1) Load documents (PDF + HTML)
    docs = load_documents()
    print(f"Loaded {len(docs)} raw documents.")

    # 2) Split into chunks
    # Fix: dropped the unused mid-block `from pprint import pprint`.
    chunks = split_documents(docs)
    print(f"Split into {len(chunks)} chunks.\n")

    # 3) Build the FAISS vector store
    vectorstore = build_vectorstore(chunks)

    # 4) Exercise similarity_search
    query = "Fristen für die Prüfungsanmeldung im Bachelorstudium"
    print("Test query:")
    print("  ", query, "\n")

    results = vectorstore.similarity_search(query, k=3)

    print("Top-3 ähnliche Chunks aus dem VectorStore:")
    for i, doc in enumerate(results, start=1):
        print(f"\n=== RESULT {i} ===")
        print(doc.page_content[:400], "...")
        print("Metadata:", doc.metadata)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
viewer.py
DELETED
|
@@ -1,76 +0,0 @@
|
|
| 1 |
-
# viewer.py – Dynamischer HTML-Viewer für Hochschulgesetz NRW
|
| 2 |
-
|
| 3 |
-
import os
|
| 4 |
-
from supabase import create_client
|
| 5 |
-
|
| 6 |
-
# Public (anon) credentials — read-only access to the hg_nrw table.
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_ANON_KEY = os.getenv("SUPABASE_ANON_KEY")

# Fail fast at import time if the deployment is missing its secrets.
if not SUPABASE_URL or not SUPABASE_ANON_KEY:
    raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY")

# Module-level client used by generate_hg_viewer().
supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
def generate_hg_viewer() -> str:
    """Render the full Hochschulgesetz NRW as a single HTML page.

    Every paragraph is wrapped in a block with id="para_xx", so a URL
    fragment such as /hg_view#para_xx scrolls straight to it.

    NOTE(review): title/content are interpolated into the HTML without
    escaping — acceptable for trusted table data, but confirm hg_nrw
    rows never contain markup.
    """
    result = (
        supabase
        .table("hg_nrw")
        .select("*")
        .order("order_index")
        .execute()
    )
    rows = result.data or []

    html_parts = [
        """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Hochschulgesetz NRW</title>
<style>
body {
    font-family: -apple-system, BlinkMacSystemFont, sans-serif;
    padding: 20px;
    line-height: 1.6;
}
h1 { margin-bottom: 10px; }
h2 {
    margin-top: 30px;
    scroll-margin-top: 20px;
}
.para-block {
    margin-bottom: 20px;
    padding-bottom: 10px;
    border-bottom: 1px solid #eee;
}
.subtitle {
    color: #555;
    font-size: 14px;
}
</style>
</head>
<body>
<h1>Hochschulgesetz NRW</h1>
<p class="subtitle">Dynamisch geladen aus Supabase (Tabelle hg_nrw)</p>
"""
    ]

    for row in rows:
        html_parts.append(f"""
<div class="para-block" id="{row['abs_id']}">
    <h2>{row['title']}</h2>
    <p>{row['content']}</p>
</div>
""")

    html_parts.append("</body></html>")
    return "\n".join(html_parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|