Nguyen5 commited on
Commit
de84956
·
1 Parent(s): 172fa2e
Files changed (14) hide show
  1. app.py +163 -76
  2. build_hg_viewer.py +313 -0
  3. embeddings.py +24 -0
  4. ingest.py +0 -94
  5. llm.py +26 -0
  6. load_documents.py +119 -0
  7. rag_pipeline.py +108 -114
  8. requirements.txt +27 -5
  9. retriever.py +47 -0
  10. speech_io.py +157 -0
  11. split_documents.py +28 -0
  12. supabase_client.py +0 -25
  13. upload_weblink_to_supabase.py +76 -0
  14. vectorstore.py +56 -0
app.py CHANGED
@@ -1,114 +1,201 @@
1
- # app.py
2
- import os
3
- import re
4
- import base64
5
- import io
6
- import soundfile as sf
7
 
8
  import gradio as gr
9
- from openai import OpenAI
10
 
11
- from supabase_client import load_file_bytes
12
- from rag_pipeline import rag_answer
 
 
 
 
 
13
 
 
 
 
14
 
15
- client = OpenAI()
 
16
 
17
- BUCKET = os.environ["SUPABASE_BUCKET"]
18
- SUPABASE_URL = os.environ["SUPABASE_URL"]
19
 
20
- PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
21
- HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
22
 
 
 
23
 
24
- def encode_pdf_src():
25
- b = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
26
- return f"data:application/pdf;base64,{base64.b64encode(b).decode()}"
27
 
 
 
 
28
 
29
- # Whisper cleanup
30
- def clean_text(t):
31
- t = t.lower()
32
- t = re.sub(r"[^\wäöüß ,.?-]+", " ", t)
33
- return t.strip().capitalize()
34
 
 
35
 
36
- def transcribe(audio):
37
- if audio is None:
38
- return ""
39
- audio_data, sr = audio
40
- buf = io.BytesIO()
41
- sf.write(buf, audio_data, sr, format="WAV")
42
- buf.seek(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- result = client.audio.transcriptions.create(
45
- model="whisper-1", file=buf, filename="audio.wav", language="de"
 
 
 
 
 
 
 
 
 
 
46
  )
47
- return clean_text(result.text or "")
48
 
 
 
49
 
50
- def chat_fn(mode, text, audio, history):
51
- history = history or []
 
 
52
 
53
- if mode == "text":
54
- q = text.strip()
55
- else:
56
- q = transcribe(audio)
57
 
58
- if not q:
59
- return history, "Keine gültige Eingabe erkannt.", None
 
60
 
61
- answer, docs = rag_answer(q, history)
 
 
 
62
 
63
- quellen = ["", "### 📚 Verwendete Quellen"]
64
- for i, d in enumerate(docs):
65
- src = d["source"]
66
- pg = d["page"]
67
 
68
- if src.startswith("Prüfungsordnung"):
69
- url = f"{PDF_URL}#page={pg}"
70
- else:
71
- url = HG_URL
 
 
 
 
 
 
 
 
 
 
72
 
73
- snippet = d["snippet"][:200]
74
- quellen.append(f"- **{src}** (Seite {pg}) → [{url}]({url}) \n „{snippet}…”")
 
75
 
76
- bot = answer + "\n\n" + "\n".join(quellen)
 
 
77
 
78
- return history + [
79
- {"role": "user", "content": q},
80
- {"role": "assistant", "content": bot},
81
- ], bot, gr.update(value=None)
82
 
 
 
 
83
 
84
- with gr.Blocks() as demo:
85
- gr.Markdown("# ⚖️ Prüfungsrechts-Assistent NRW")
 
 
 
 
 
86
 
87
  with gr.Row():
88
- with gr.Column(scale=3):
89
- chatbot = gr.Chatbot()
90
 
91
- mode = gr.Radio(["text", "audio"], value="text", label="Eingabemodus")
92
- text = gr.Textbox(label="Text eingeben")
 
 
 
 
 
 
93
 
94
- audio = gr.Audio(
95
- sources=["microphone"],
96
- type="numpy",
97
- format="wav",
98
- label="Spracheingabe (Mikrofon)",
99
  )
 
100
 
101
- send = gr.Button("Senden")
102
- preview = gr.Markdown()
103
 
104
- with gr.Column(scale=2):
105
- gr.Markdown("### 📄 Prüfungsordnung (PDF)")
106
- gr.HTML(f"<iframe src='{encode_pdf_src()}' width='100%' height='260'></iframe>")
 
107
 
108
- gr.Markdown("### 📘 Hochschulgesetz NRW")
109
- gr.HTML(f"<iframe src='{HG_URL}' width='100%' height='260'></iframe>")
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- send.click(chat_fn, [mode, text, audio, chatbot], [chatbot, preview, audio])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  if __name__ == "__main__":
114
- demo.queue().launch()
 
1
+ # app.py – Prüfungsrechts-Chatbot mit OpenAI (Supabase RAG)
 
 
 
 
 
2
 
3
  import gradio as gr
 
4
 
5
+ from load_documents import load_documents, PDF_URL, HG_HTML_URL
6
+ from split_documents import split_documents
7
+ from vectorstore import build_vectorstore
8
+ from retriever import get_retriever
9
+ from llm import load_llm
10
+ from rag_pipeline import answer
11
+ from speech_io import transcribe_audio, synthesize_speech
12
 
13
+ # =====================================================
14
+ # INITIALISIERUNG (beim Start der Space einmalig)
15
+ # =====================================================
16
 
17
+ print("🔹 Lade Dokumente aus Supabase …")
18
+ _docs = load_documents()
19
 
20
+ print("🔹 Splitte Dokumente …")
21
+ _chunks = split_documents(_docs)
22
 
23
+ print("🔹 Baue VectorStore …")
24
+ _vs = build_vectorstore(_chunks)
25
 
26
+ print("🔹 Erzeuge Retriever …")
27
+ _retriever = get_retriever(_vs)
28
 
29
+ print("🔹 Lade OpenAI LLM …")
30
+ _llm = load_llm()
 
31
 
32
+ # =====================================================
33
+ # Quellen formatieren – Markdown im Chat
34
+ # =====================================================
35
 
36
+ def format_sources_markdown(sources):
37
+ if not sources:
38
+ return ""
 
 
39
 
40
+ lines = ["", "### 📚 Quellen (verwendete Dokumentstellen):"]
41
 
42
+ for s in sources:
43
+ sid = s["id"]
44
+ src = s["source"]
45
+ page = s["page"]
46
+ url = s["url"]
47
+ snippet = s["snippet"]
48
+
49
+ if page:
50
+ title = f"Quelle {sid} – {src}, Seite {page}"
51
+ else:
52
+ title = f"Quelle {sid} – {src}"
53
+
54
+ if url:
55
+ base = f"- [{title}]({url})"
56
+ else:
57
+ base = f"- {title}"
58
+
59
+ lines.append(base)
60
+ if snippet:
61
+ lines.append(f" > {snippet}")
62
+
63
+ return "\n".join(lines)
64
 
65
+ # =====================================================
66
+ # TEXT CHATBOT
67
+ # =====================================================
68
+
69
+ def chatbot_text(user_message, history):
70
+ if not user_message:
71
+ return history, ""
72
+
73
+ answer_text, sources = answer(
74
+ question=user_message,
75
+ retriever=_retriever,
76
+ chat_model=_llm,
77
  )
 
78
 
79
+ quellen_block = format_sources_markdown(sources)
80
+ bot_msg = answer_text + "\n\n" + quellen_block
81
 
82
+ history = history + [
83
+ {"role": "user", "content": user_message},
84
+ {"role": "assistant", "content": bot_msg},
85
+ ]
86
 
87
+ return history, ""
 
 
 
88
 
89
+ # =====================================================
90
+ # VOICE CHATBOT
91
+ # =====================================================
92
 
93
+ def chatbot_voice(audio_path, history):
94
+ text = transcribe_audio(audio_path)
95
+ if not text:
96
+ return history, None, ""
97
 
98
+ history = history + [{"role": "user", "content": text}]
 
 
 
99
 
100
+ answer_text, sources = answer(
101
+ question=text,
102
+ retriever=_retriever,
103
+ chat_model=_llm,
104
+ )
105
+
106
+ quellen_block = format_sources_markdown(sources)
107
+ bot_msg = answer_text + "\n\n" + quellen_block
108
+
109
+ history = history + [{"role": "assistant", "content": bot_msg}]
110
+
111
+ audio = synthesize_speech(bot_msg)
112
+
113
+ return history, audio, ""
114
 
115
+ # =====================================================
116
+ # Wieder-Vorlesen der letzten Antwort
117
+ # =====================================================
118
 
119
+ def read_last_answer(history):
120
+ if not history:
121
+ return None
122
 
123
+ for msg in reversed(history):
124
+ if msg["role"] == "assistant":
125
+ return synthesize_speech(msg["content"])
126
+ return None
127
 
128
+ # =====================================================
129
+ # UI (Gradio)
130
+ # =====================================================
131
 
132
+ with gr.Blocks(title="Prüfungsrechts-Chatbot (Supabase + OpenAI)") as demo:
133
+
134
+ gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot (Supabase RAG + OpenAI)")
135
+ gr.Markdown(
136
+ "Fragen zum Prüfungsrecht (Prüfungsordnung + Hochschulgesetz NRW). "
137
+ "Antworten mit Quellenangabe und Direktlinks."
138
+ )
139
 
140
  with gr.Row():
 
 
141
 
142
+ # ---------- LINKER BEREICH: CHAT ----------
143
+ with gr.Column(scale=2):
144
+
145
+ chatbot = gr.Chatbot(
146
+ type="messages",
147
+ label="Chat",
148
+ height=550,
149
+ )
150
 
151
+ msg = gr.Textbox(
152
+ label="Frage eingeben",
153
+ placeholder="Stelle deine Frage zum Prüfungsrecht …",
154
+ autofocus=True,
 
155
  )
156
+ msg.submit(chatbot_text, [msg, chatbot], [chatbot, msg])
157
 
158
+ send_btn = gr.Button("Senden (Text)")
159
+ send_btn.click(chatbot_text, [msg, chatbot], [chatbot, msg])
160
 
161
+ gr.Markdown("### 🎙️ Spracheingabe")
162
+
163
+ voice_in = gr.Audio(sources=["microphone"], type="filepath")
164
+ voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
165
 
166
+ send_voice_btn = gr.Button("Sprechen & Senden")
167
+ send_voice_btn.click(
168
+ chatbot_voice,
169
+ [voice_in, chatbot],
170
+ [chatbot, voice_out, msg],
171
+ )
172
+
173
+ read_btn = gr.Button("Antwort erneut vorlesen")
174
+ read_btn.click(read_last_answer, [chatbot], [voice_out])
175
+
176
+ clear_btn = gr.Button("Chat löschen")
177
+ clear_btn.click(lambda: [], None, chatbot)
178
+
179
+ # ---------- RECHTER BEREICH: VIEWER ----------
180
+ with gr.Column(scale=1):
181
 
182
+ gr.Markdown("### 📄 Prüfungsordnung (PDF)")
183
+ gr.HTML(
184
+ f"""
185
+ <iframe src="{PDF_URL}"
186
+ style="width:100%; height:330px; border:none;">
187
+ </iframe>
188
+ """
189
+ )
190
+
191
+ gr.Markdown("### 📘 Hochschulgesetz NRW (Paragraph-Viewer)")
192
+ gr.HTML(
193
+ f"""
194
+ <iframe src="{HG_HTML_URL}"
195
+ style="width:100%; height:330px; border:none;">
196
+ </iframe>
197
+ """
198
+ )
199
 
200
  if __name__ == "__main__":
201
+ demo.queue().launch(ssr_mode=False, show_error=True)
build_hg_viewer.py ADDED
@@ -0,0 +1,313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_hg_viewer.py
2
+ import os
3
+ from supabase import create_client
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+ SUPABASE_URL = os.environ["SUPABASE_URL"]
9
+ SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
10
+
11
+ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE:
12
+ raise RuntimeError("Missing SUPABASE_URL or SUPABASE_SERVICE_ROLE")
13
+
14
+ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
15
+
16
+ from upload_weblink_to_supabase import extract_paragraphs
17
+
18
+ # ======== HTML TEMPLATE ========
19
+ VIEW_TEMPLATE = """
20
+ <!DOCTYPE html>
21
+ <html lang="de">
22
+ <head>
23
+ <meta charset="UTF-8">
24
+ <title>Hochschulgesetz NRW – Paragraph Viewer</title>
25
+
26
+ <style>
27
+ body {
28
+ font-family: Arial, sans-serif;
29
+ margin: 0;
30
+ padding: 0;
31
+ display: flex;
32
+ }
33
+
34
+ /* ----------- SIDEBAR ------------- */
35
+ #sidebar {
36
+ width: 280px;
37
+ height: 100vh;
38
+ overflow-y: auto;
39
+ background: #f5f5f5;
40
+ border-right: 1px solid #ccc;
41
+ padding: 15px;
42
+ position: sticky;
43
+ top: 0;
44
+ }
45
+
46
+ #sidebar h2 {
47
+ margin-top: 0;
48
+ }
49
+
50
+ #searchBox {
51
+ width: 100%;
52
+ padding: 8px;
53
+ font-size: 15px;
54
+ margin-bottom: 10px;
55
+ border: 1px solid #aaa;
56
+ border-radius: 5px;
57
+ }
58
+
59
+ .sidebar-link {
60
+ display: block;
61
+ padding: 6px 8px;
62
+ margin-bottom: 4px;
63
+ text-decoration: none;
64
+ color: #003366;
65
+ border-radius: 4px;
66
+ }
67
+
68
+ .sidebar-link:hover {
69
+ background: #e0e7ff;
70
+ color: #001d4d;
71
+ }
72
+
73
+ /* ----------- CONTENT ------------- */
74
+ #content {
75
+ flex: 1;
76
+ padding: 25px;
77
+ max-width: 900px;
78
+ }
79
+
80
+ /* Absatz block */
81
+ .para {
82
+ padding: 20px 0;
83
+ border-bottom: 1px solid #ddd;
84
+ }
85
+
86
+ .para h2 {
87
+ color: #003366;
88
+ margin-bottom: 10px;
89
+ }
90
+
91
+ /* ----------- Fußnoten ------------- */
92
+ .fn-block {
93
+ background: #fafafa;
94
+ border-left: 4px solid #999;
95
+ padding: 12px;
96
+ margin-top: 10px;
97
+ margin-bottom: 25px;
98
+ }
99
+
100
+ .fn-toggle {
101
+ cursor: pointer;
102
+ font-weight: bold;
103
+ color: #003366;
104
+ margin-bottom: 5px;
105
+ }
106
+
107
+ .fn-content {
108
+ display: none;
109
+ padding-left: 10px;
110
+ }
111
+
112
+ .fn-title {
113
+ font-weight: bold;
114
+ margin-bottom: 6px;
115
+ }
116
+
117
+ .fn-item {
118
+ margin-bottom: 8px;
119
+ }
120
+
121
+ /* ----------- Highlight beim Öffnen ------------- */
122
+ .highlight {
123
+ animation: flash 2s ease-in-out;
124
+ background: #fff8c6 !important;
125
+ }
126
+
127
+ @keyframes flash {
128
+ 0% { background: #fff8c6; }
129
+ 100% { background: transparent; }
130
+ }
131
+
132
+ /* Keyword highlight */
133
+ .keyword {
134
+ background: yellow;
135
+ padding: 2px 3px;
136
+ border-radius: 3px;
137
+ }
138
+
139
+ /* Back to top button */
140
+ #topBtn {
141
+ position: fixed;
142
+ bottom: 25px;
143
+ right: 25px;
144
+ background: #003366;
145
+ color: white;
146
+ border-radius: 8px;
147
+ padding: 10px 14px;
148
+ cursor: pointer;
149
+ font-size: 16px;
150
+ display: none;
151
+ }
152
+ </style>
153
+
154
+ </head>
155
+ <body>
156
+
157
+ <div id="sidebar">
158
+ <h2>Inhaltsverzeichnis</h2>
159
+ <input type="text" id="searchBox" placeholder="Suchen nach § …">
160
+ <!-- SIDEBAR_LINKS -->
161
+ </div>
162
+
163
+ <div id="content">
164
+ <h1>Hochschulgesetz NRW – Paragraph Viewer</h1>
165
+ <!-- PARAGRAPH_CONTENT -->
166
+ </div>
167
+
168
+ <div id="topBtn" onclick="scrollToTop()">⬆️ Top</div>
169
+
170
+ <script>
171
+ // ------ TỰ ĐỘNG HIGHLIGHT Absatz khi có #anchor HIGHLIGHT ABSATZ & SCROLL ------
172
+ window.onload = function() {
173
+ const anchor = window.location.hash.substring(1);
174
+ const params = new URLSearchParams(window.location.search);
175
+ const keywords = params.get("k");
176
+
177
+ if (anchor) {
178
+ const el = document.getElementById(anchor);
179
+ if (el) {
180
+ el.classList.add("highlight");
181
+ el.scrollIntoView({ behavior: "smooth", block: "center" });
182
+ }
183
+ }
184
+
185
+ /* KEYWORD HIGHLIGHT */
186
+ if (keywords) {
187
+ const words = keywords.split("%20");
188
+ highlightKeywords(words);
189
+ }
190
+ };
191
+
192
+ /* --- KEYWORD HIGHLIGHT FUNCTION --- */
193
+ function highlightKeywords(words) {
194
+ const container = document.getElementById("content");
195
+ let html = container.innerHTML;
196
+
197
+ words.forEach(word => {
198
+ if (word.length < 2) return;
199
+ const regex = new RegExp(`(${decodeURIComponent(word)})`, "gi");
200
+ html = html.replace(regex, `<span class="keyword">$1</span>`);
201
+ });
202
+
203
+ container.innerHTML = html;
204
+ }
205
+
206
+ /* --- SEARCH IN SIDEBAR --- */
207
+ document.getElementById("searchBox").addEventListener("input", function() {
208
+ const q = this.value.toLowerCase();
209
+ document.querySelectorAll(".sidebar-link").forEach(link => {
210
+ const txt = link.innerText.toLowerCase();
211
+ link.style.display = txt.includes(q) ? "block" : "none";
212
+ });
213
+ });
214
+
215
+ /* --- COLLAPSIBLE FUSSNOTEN --- */
216
+ document.addEventListener("click", function(e) {
217
+ if (e.target.classList.contains("fn-toggle")) {
218
+ const content = e.target.nextElementSibling;
219
+ content.style.display = content.style.display === "block" ? "none" : "block";
220
+ }
221
+ });
222
+
223
+ /* --- BACK TO TOP BUTTON --- */
224
+ window.onscroll = function() {
225
+ document.getElementById("topBtn").style.display =
226
+ window.scrollY > 300 ? "block" : "none";
227
+ };
228
+
229
+ function scrollToTop() {
230
+ window.scrollTo({ top: 0, behavior: 'smooth' });
231
+ }
232
+
233
+ </script>
234
+
235
+ </body>
236
+ </html>
237
+ """
238
+
239
+ # -------------------------------------------------------------------
240
+ # 2. BUILD VIEWER
241
+ # -------------------------------------------------------------------
242
+
243
def build_html():
    """Assemble the full paragraph-viewer HTML page.

    Fetches all paragraphs from Supabase via extract_paragraphs(), builds
    a sidebar link plus a content block per paragraph, and splices both
    into VIEW_TEMPLATE. Returns the finished HTML string.
    """
    print(">>> Lade Paragraphs aus Supabase...")
    paras = extract_paragraphs()

    sidebar_links = ""
    content_html = ""

    for p in paras:
        pid = p["abs_id"]
        title = p["title"]
        body = p["content"]

        # Sidebar item
        sidebar_links += f'<a class="sidebar-link" href="#{pid}">{title}</a>\n'

        # Split off the footnotes (everything from the first line that
        # starts with "Fn 1", "Fn 2", ... onwards).
        lines = body.split("\n")
        main_text = []
        fn_text = []
        in_fn = False

        for line in lines:
            if line.startswith("Fn "):
                in_fn = True
            if in_fn:
                fn_text.append(line)
            else:
                main_text.append(line)

        footnotes_html = ""
        if fn_text:
            footnotes_html += '<div class="fn-block">'
            footnotes_html += '<div class="fn-title">Fußnoten:</div>'
            for fn in fn_text:
                footnotes_html += f'<div class="fn-item">{fn}</div>'
            footnotes_html += "</div>"

        # Paragraph block
        content_html += f"""
    <div class="para" id="{pid}">
        <h2>{title}</h2>
        <div>{'<br>'.join(main_text)}</div>
        {footnotes_html}
    </div>
    """

    html = VIEW_TEMPLATE.replace("<!-- SIDEBAR_LINKS -->", sidebar_links)
    html = html.replace("<!-- PARAGRAPH_CONTENT -->", content_html)

    return html
293
+
294
+ # -------------------------------------------------------------------
295
+ # 3. UPLOAD TO SUPABASE STORAGE
296
+ # -------------------------------------------------------------------
297
+
298
def upload_html():
    """Build the viewer page and upsert it into the hg_viewer storage bucket."""
    html = build_html()

    supabase.storage.from_("hg_viewer").update(
        "hg_clean.html",
        html.encode("utf-8"),
        {
            "content-type": "text/html",
            # x-upsert lets update() create the file on the first run as well.
            "x-upsert": "true"
        }
    )

    print("✔ hg_clean.html uploaded!")
311
+
312
+ if __name__ == "__main__":
313
+ upload_html()
embeddings.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # embeddings.py – OpenAI Version (text-embedding-3-small)
2
+
3
+ import os
4
+ from langchain_openai import OpenAIEmbeddings
5
+
6
+ EMBED_MODEL = "text-embedding-3-small"
7
+
8
def get_embeddings():
    """Return an OpenAIEmbeddings instance configured for EMBED_MODEL.

    Raises:
        RuntimeError: when OPENAI_API_KEY is missing from the environment.
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError(
            "OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
        )

    print(f">>> Lade OpenAI Embedding Model: {EMBED_MODEL}")
    return OpenAIEmbeddings(model=EMBED_MODEL, api_key=key)
21
+
22
+ if __name__ == "__main__":
23
+ e = get_embeddings()
24
+ print(e.embed_query("Test"))
ingest.py DELETED
@@ -1,94 +0,0 @@
1
- # ingest.py
2
- import os
3
- from io import BytesIO
4
- from bs4 import BeautifulSoup
5
- from pypdf import PdfReader
6
-
7
- from supabase_client import supabase, load_file_bytes
8
- from langchain_openai import OpenAIEmbeddings
9
- from langchain_text_splitters import RecursiveCharacterTextSplitter
10
- from langchain_core.documents import Document
11
-
12
- BUCKET = os.environ["SUPABASE_BUCKET"]
13
- SUPABASE_URL = os.environ["SUPABASE_URL"]
14
-
15
- PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
16
- OFFICIAL_HG_URL = (
17
- "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
18
- )
19
-
20
-
21
- # ---------------- Loaders ----------------
22
- def load_pdf_docs():
23
- pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
24
- reader = PdfReader(BytesIO(pdf_bytes))
25
-
26
- docs = []
27
- for i, p in enumerate(reader.pages):
28
- text = p.extract_text() or ""
29
- docs.append(
30
- Document(
31
- page_content=text,
32
- metadata={
33
- "source": "Prüfungsordnung (PDF)",
34
- "page": i + 1,
35
- "pdf_url": PDF_URL,
36
- },
37
- )
38
- )
39
- return docs
40
-
41
-
42
- def load_html_docs():
43
- html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
44
- soup = BeautifulSoup(html_bytes.decode("utf-8", "ignore"), "html.parser")
45
-
46
- return [
47
- Document(
48
- page_content=soup.get_text("\n"),
49
- metadata={"source": "Hochschulgesetz NRW", "url": OFFICIAL_HG_URL},
50
- )
51
- ]
52
-
53
-
54
- def chunk_docs(docs):
55
- splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=80)
56
- return splitter.split_documents(docs)
57
-
58
-
59
- # ---------------- Delete old data ----------------
60
- def delete_old_documents():
61
- print("🗑️ Lösche alte Daten…")
62
- supabase.table("documents").delete().gte(
63
- "id", "00000000-0000-0000-0000-000000000000"
64
- ).execute()
65
-
66
-
67
- # ---------------- Ingest ----------------
68
- def ingest():
69
- delete_old_documents()
70
-
71
- pdf_docs = load_pdf_docs()
72
- html_docs = load_html_docs()
73
-
74
- chunks = chunk_docs(pdf_docs + html_docs)
75
-
76
- embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
77
-
78
- print("📥 Speichere neue Dokumente…")
79
- for d in chunks:
80
- emb = embeddings.embed_query(d.page_content)
81
-
82
- supabase.table("documents").insert(
83
- {
84
- "content": d.page_content,
85
- "metadata": d.metadata,
86
- "embedding": emb,
87
- }
88
- ).execute()
89
-
90
- print("✅ Ingest abgeschlossen!")
91
-
92
-
93
- if __name__ == "__main__":
94
- ingest()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
llm.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llm.py – OpenAI Chatmodell für RAG
2
+
3
+ import os
4
+ from langchain_openai import ChatOpenAI
5
+
6
+ CHAT_MODEL = "gpt-4o-mini" # günstig & stark
7
+
8
def load_llm():
    """Return the configured OpenAI chat model for the RAG chain.

    Raises:
        RuntimeError: when OPENAI_API_KEY is missing from the environment.
    """
    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        raise RuntimeError(
            "OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
        )

    print(f">>> Lade OpenAI Chatmodell: {CHAT_MODEL}")

    # temperature=0.0 keeps answers deterministic and limits hallucination.
    return ChatOpenAI(
        model=CHAT_MODEL,
        temperature=0.0,
        api_key=key,
    )
23
+
24
+ if __name__ == "__main__":
25
+ llm = load_llm()
26
+ print(llm.invoke("Sag einen Satz zum Prüfungsrecht.").content)
load_documents.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import requests
4
+ import tempfile
5
+ from supabase import create_client
6
+ from langchain_core.documents import Document
7
+ from langchain_community.document_loaders import PyPDFLoader
8
+
9
+ # ---------------------------------------------------------
10
+ # ENV Variablen aus HuggingFace Space
11
+ # ---------------------------------------------------------
12
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
13
+ SUPABASE_ANON_KEY = os.getenv("SUPABASE_ANON_KEY")
14
+
15
+ if not SUPABASE_URL or not SUPABASE_ANON_KEY:
16
+ raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY in environment.")
17
+
18
+ supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
19
+
20
+ # ---------------------------------------------------------
21
+ # Prüfungsordnung PDF – liegt in Supabase Storage (public)
22
+ # ---------------------------------------------------------
23
+ PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
24
+ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/File%20PDF/{PDF_FILE}"
25
+
26
+ # ---------------------------------------------------------
27
+ # Statischer Paragraph-Viewer in HuggingFace Space
28
+ # -> hg_clean.html liegt als Datei im Repo!
29
+ # -> in der App: iframe src="file=hg_clean.html"
30
+ # -> für Links: "file=hg_clean.html#para_123"
31
+ # ---------------------------------------------------------
32
+ # HG_HTML_URL = "file=hg_clean.html" # WICHTIG: nicht absolut, Space kümmert sich
33
+ #HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/resolve/main/hg_clean.html"
34
+ #HG_HTML_URL = "https://huggingface.co/spaces/Nguyen5/chatbot/raw/main/hg_clean.html"
35
+
36
+ HG_HTML_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
37
+
38
def load_hg_nrw():
    """
    Fetch every paragraph from the public.hg_nrw table and wrap it as a
    LangChain Document. Each row provides:
      - title  : e.g. "§ 64 (Fn 35) Prüfungsordnungen"
      - content: full text including footnotes
      - abs_id : para_1, para_2, ... (matches the id= anchors in hg_clean.html)
    """
    print(">>> Lade Hochschulgesetz NRW (§) aus Supabase…")

    response = (
        supabase.table("hg_nrw")
        .select("*")
        .order("order_index")
        .execute()
    )
    rows = response.data or []

    print(f" - {len(rows)} Paragraphen geladen.")

    documents = []
    for row in rows:
        anchor = row["abs_id"]  # e.g. "para_64"
        heading = row.get("title", "")
        text = row.get("content", "")

        documents.append(
            Document(
                page_content=f"{heading}\n{text}",
                metadata={
                    "source": "Hochschulgesetz NRW",
                    "paragraph": heading,
                    "abs_id": anchor,
                    # Deep link into the static viewer — hg_clean.html uses
                    # identical ids (id="para_64").
                    "url": f"{HG_HTML_URL}#{anchor}",
                },
            )
        )

    return documents
79
+
80
def load_pdf():
    """
    Lädt die Prüfungsordnung (PDF) aus dem Supabase Storage, speichert sie
    temporär und liest die Seiten mit PyPDFLoader ein; das Splitten erfolgt
    später in split_documents.py.

    Returns:
        list: one LangChain Document per PDF page, annotated with
        source / page (0-based) / pdf_url metadata.

    Raises:
        requests.HTTPError: wenn der Download fehlschlägt.
    """
    print(">>> Lade Prüfungsordnung PDF …")

    # Timeout, damit ein hängender Download den Space-Start nicht blockiert.
    resp = requests.get(PDF_URL, timeout=60)
    resp.raise_for_status()

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(resp.content)
        path = tmp.name

    try:
        pages = PyPDFLoader(path).load()
    finally:
        # Temporäre Datei wieder entfernen (vorher blieb sie dauerhaft liegen).
        os.unlink(path)

    for i, p in enumerate(pages):
        p.metadata["source"] = "Prüfungsordnung (PDF)"
        p.metadata["page"] = i  # 0-basiert
        p.metadata["pdf_url"] = PDF_URL

    print(f" - {len(pages)} PDF-Seiten geladen.")
    return pages
103
+
104
def load_documents():
    """
    Master loader: combines
      - Hochschulgesetz NRW (Supabase table hg_nrw) and
      - Prüfungsordnung (PDF pages)
    into a single list of LangChain documents.
    """
    docs = load_hg_nrw() + load_pdf()
    print(f"✔ DOCUMENTS LOADED: {len(docs)}")
    return docs
115
+
116
+ if __name__ == "__main__":
117
+ docs = load_documents()
118
+ print(docs[0])
119
+ print("Total:", len(docs))
rag_pipeline.py CHANGED
@@ -1,131 +1,125 @@
1
- # rag_pipeline.py
2
- from typing import Any
3
- from datetime import date
4
 
5
- from supabase_client import supabase, match_documents
 
6
 
7
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
8
- from langchain_core.messages import (
9
- SystemMessage,
10
- HumanMessage,
11
- AIMessage,
12
- )
13
 
14
- emb = OpenAIEmbeddings(model="text-embedding-3-small")
15
- llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.0)
 
16
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- SYSTEM_PROMPT = """
19
- Du bist ein hochpräziser juristischer Assistent für Prüfungsrecht in NRW.
20
- - Nutze AUSSCHLIESSLICH Dokumente, die über das Tool geliefert werden.
21
- - Keine Spekulation.
22
- - Antwort strukturiert + verständlich.
23
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
 
25
 
26
- # ---------------- TOOL: Suche Dokumente ----------------
27
- def tool_suche_dokumente(query: str):
28
- vec = emb.embed_query(query)
29
- docs = match_documents(vec, k=4)
30
 
31
- results = []
 
 
 
 
32
  for i, d in enumerate(docs):
33
- meta = d["metadata"] or {}
34
- snippet = d["content"].replace("\n", " ")[:400]
35
-
36
- results.append(
37
- {
38
- "index": i + 1,
39
- "source": meta.get("source"),
40
- "page": meta.get("page"),
41
- "snippet": snippet,
42
- "content": d["content"],
43
- "metadata": meta,
44
- }
45
- )
46
-
47
- return {"results": results}
48
-
49
-
50
- TOOLS = [
51
- {
52
- "type": "function",
53
- "function": {
54
- "name": "suche_pruefungsrecht_dokumente",
55
- "description": "Sucht relevante Stellen im Prüfungsrecht.",
56
- "parameters": {
57
- "type": "object",
58
- "properties": {"query": {"type": "string"}},
59
- "required": ["query"],
60
- },
61
- },
62
- }
63
- ]
64
-
65
- llm_tools = llm.bind_tools(TOOLS)
66
-
67
-
68
- # ---------------- HISTORY LOG ----------------
69
- def save_message(role: str, content: str):
70
- supabase.table("chat_history").insert(
71
- {
72
- "session_date": date.today().isoformat(),
73
- "role": role,
74
- "message": content,
75
- }
76
- ).execute()
77
-
78
-
79
- def convert_history(hist):
80
- msgs = []
81
- for h in hist[-6:]:
82
- if h["role"] == "user":
83
- msgs.append(HumanMessage(content=h["content"]))
84
  else:
85
- msgs.append(AIMessage(content=h["content"]))
86
- return msgs
87
 
 
88
 
89
- # ---------------- AGENT ANSWER ----------------
90
- def agent_answer(query: str, history: Any):
91
- messages = [
92
- SystemMessage(content=SYSTEM_PROMPT),
93
- *convert_history(history),
94
- HumanMessage(content=query),
95
- ]
96
 
97
- first = llm_tools.invoke(messages)
98
-
99
- if first.tool_calls:
100
- call = first.tool_calls[0]
101
- if call["name"] == "suche_pruefungsrecht_dokumente":
102
- tool_res = tool_suche_dokumente(call["args"]["query"])
103
-
104
- messages.extend(
105
- [
106
- first,
107
- AIMessage(
108
- content=str(tool_res),
109
- name="suche_pruefungsrecht_dokumente",
110
- ),
111
- ]
112
- )
113
-
114
- final = llm.invoke(messages)
115
- answer = final.content
116
- docs = tool_res["results"]
117
- else:
118
- answer = "Tool nicht unterstützt."
119
- docs = []
120
- else:
121
- answer = first.content
122
- docs = []
123
 
124
- save_message("user", query)
125
- save_message("assistant", answer)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
- return answer, docs
 
 
128
 
 
 
129
 
130
- def rag_answer(query: str, history: Any):
131
- return agent_answer(query, history)
 
1
+ # rag_pipeline.py – OpenAI RAG mit Supabase-Dokumenten
 
 
2
 
3
+ from typing import List, Dict, Any, Tuple
4
+ from langchain_core.messages import SystemMessage, HumanMessage
5
 
6
+ MAX_CHARS = 900
 
 
 
 
 
7
 
8
+ # ------------------------------------------------------
9
+ # Quellen-Metadaten
10
+ # ------------------------------------------------------
11
 
12
def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
    """
    Build one structured source entry per retrieved chunk:
    - id      : 1-based position in the retrieval result
    - source  : originating document name
    - page    : 1-based PDF page (None for non-PDF sources)
    - url     : deep link (PDF page anchor or hg_clean.html#para_x)
    - snippet : short single-line preview of the chunk
    """
    entries = []

    for idx, doc in enumerate(docs):
        meta = doc.metadata
        source = meta.get("source")
        page = meta.get("page")
        preview = doc.page_content[:300].replace("\n", " ")

        if source == "Prüfungsordnung (PDF)":
            pdf_url = meta.get("pdf_url")
            # Deep link straight to the page inside the browser PDF viewer.
            if pdf_url and isinstance(page, int):
                link = f"{pdf_url}#page={page + 1}"
            else:
                link = pdf_url
        elif source == "Hochschulgesetz NRW":
            link = meta.get("url")
            page = None  # paragraphs carry no page numbers
        else:
            link = None

        entries.append({
            "id": idx + 1,
            "source": source,
            "page": page + 1 if isinstance(page, int) else None,
            "url": link,
            "snippet": preview,
        })

    return entries
52
 
53
+ # ------------------------------------------------------
54
+ # Kontextformatierung
55
+ # ------------------------------------------------------
 
56
 
57
def format_context(docs):
    """Join the retrieved chunks into one numbered context string.

    Each chunk is truncated to MAX_CHARS and labelled with its source
    (plus a 1-based page number for PDF chunks). Returns a placeholder
    string when nothing was retrieved.
    """
    if not docs:
        return "(Kein relevanter Kontext gefunden.)"

    blocks = []
    for idx, doc in enumerate(docs):
        body = doc.page_content[:MAX_CHARS]
        source = doc.metadata.get("source")
        page = doc.metadata.get("page")

        if source == "Prüfungsordnung (PDF)" and isinstance(page, int):
            label = f"{source}, Seite {page + 1}"
        else:
            label = source

        blocks.append(f"[KONTEXT {idx + 1}] ({label})\n{body}")

    return "\n\n".join(blocks)
 
 
 
 
 
 
75
 
76
+ SYSTEM_PROMPT = """
77
+ Du bist ein juristisch präziser Chatbot für Prüfungsrecht.
78
+ Du nutzt ausschließlich:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
+ - die Prüfungsordnung (PDF) und
81
+ - das Hochschulgesetz NRW (Paragraphen aus der Datenbank / hg_clean.html)
82
+
83
+ Regeln:
84
+
85
+ 1. Antworte nur auf Basis des gelieferten Kontextes.
86
+ 2. Wenn der Kontext keine sichere Antwort erlaubt, sage das klar.
87
+ 3. Antworte in gut verständlichem Deutsch, in ganzen Sätzen.
88
+ 4. Nenne, soweit möglich:
89
+ - Paragraphen oder Überschriften,
90
+ - das Dokument (Prüfungsordnung / Hochschulgesetz NRW),
91
+ - Seitenzahl (bei der Prüfungsordnung).
92
+ """
93
+
94
def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Run one RAG turn: retrieve chunks, build a context-restricted prompt,
    call the chat model, and return the answer together with its sources.

    Args:
        question: the user's question.
        retriever: LangChain retriever exposing .invoke(query) -> docs.
        chat_model: chat LLM exposing .invoke(messages) (see llm.py).

    Returns:
        (answer_text, sources) where sources is the structured list from
        build_sources_metadata().
    """
    # 1. Chunks holen
    docs = retriever.invoke(question)
    context_str = format_context(docs)

    # 2. Prompt bauen
    human = f"""
FRAGE:
{question}

NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
{context_str}

AUFGABE:
Erstelle eine juristisch korrekte Antwort ausschließlich auf Basis
des obigen Kontextes. Wenn der Kontext keine sichere Antwort zulässt,
sage das ausdrücklich und verzichte auf Spekulationen.
"""

    msgs = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(content=human),
    ]

    # 3. LLM aufrufen
    result = chat_model.invoke(msgs)
    answer_text = result.content.strip()

    # 4. Quellenliste
    sources = build_sources_metadata(docs)

    return answer_text, sources
 
requirements.txt CHANGED
@@ -1,10 +1,32 @@
 
 
 
 
 
 
 
 
 
 
1
  langchain
2
  langchain-community
 
3
  langchain-openai
4
- openai
5
- supabase
6
- gradio
 
 
7
  pypdf
 
8
  beautifulsoup4
9
- python-dotenv
10
- soundfile
 
 
 
 
 
 
 
 
 
1
+ # === UI ===
2
+ gradio
3
+
4
+ # === Supabase ===
5
+ supabase
6
+ postgrest
7
+ httpx
8
+ python-dotenv
9
+
10
+ # === LangChain Core ===
11
  langchain
12
  langchain-community
13
+ langchain-text-splitters
14
  langchain-openai
15
+
16
+ # === VectorStore ===
17
+ faiss-cpu
18
+
19
+ # === PDF + HTTP + HTML ===
20
  pypdf
21
+ requests
22
  beautifulsoup4
23
+
24
+ # === Audio (STT/TTS local) ===
25
+ transformers
26
+ accelerate
27
+ soundfile
28
+ scipy
29
+ numpy
30
+
31
+ # OpenAI offizielle Bibliothek (kommt i.d.R. mit langchain-openai, zur Sicherheit explizit)
32
+ openai
retriever.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BƯỚC 5: RETRIEVER
3
+ -----------------
4
+ Tạo LangChain Retriever từ FAISS VectorStore.
5
+
6
+ Retriever sẽ dùng trong bước RAG sau này:
7
+ - retriever.get_relevant_documents(query)
8
+ """
9
+
10
+ from langchain_community.vectorstores import FAISS
11
+
12
# Number of chunks fetched per query.
RETRIEVER_K = 4

def get_retriever(vectorstore: FAISS, k: int = RETRIEVER_K):
    """Wrap a FAISS vector store as a LangChain retriever returning *k* chunks.

    Args:
        vectorstore: The FAISS store to search.
        k: How many chunks to return per query (default ``RETRIEVER_K``).

    Returns:
        A LangChain retriever configured with ``search_kwargs={"k": k}``.
    """
    print(f">>> Creating retriever with k={k} ...")
    top_k_retriever = vectorstore.as_retriever(search_kwargs={"k": k})
    print(">>> Retriever ready.\n")
    return top_k_retriever
23
+
24
if __name__ == "__main__":
    # Smoke test for the whole pipeline: load -> split -> FAISS -> retrieval.
    from load_documents import load_documents
    from split_documents import split_documents
    from vectorstore import build_vectorstore

    print("=== TEST: retriever.get_relevant_documents ===\n")

    raw_docs = load_documents()
    doc_chunks = split_documents(raw_docs)
    store = build_vectorstore(doc_chunks)
    retriever = get_retriever(store, k=4)

    sample_query = "Wie lange habe ich Zeit, eine Prüfungsleistung zu wiederholen?"
    print("Test query:")
    print(" ", sample_query, "\n")

    hits = retriever.invoke(sample_query)

    print(f"Retriever returned {len(hits)} documents.")
    for rank, doc in enumerate(hits, start=1):
        print(f"\n=== DOC {rank} ===")
        print(doc.page_content[:400], "...")
        print("Metadata:", doc.metadata)
speech_io.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ speech_io.py
3
+
4
+ Sprachbasierte Ein-/Ausgabe:
5
+ - Speech-to-Text (STT) mit Whisper (transformers.pipeline)
6
+ - Text-to-Speech (TTS) mit MMS-TTS Deutsch
7
+
8
+ Dieses Modul ist auf CPU-only-Betrieb in Hugging Face Spaces ausgelegt.
9
+ """
10
+
11
+ from typing import Optional, Tuple
12
+ import numpy as np
13
+ import soundfile as sf
14
+ from scipy.signal import butter, filtfilt
15
+ from transformers import pipeline
16
+
17
+ # Modelle
18
+ ASR_MODEL_ID = "openai/whisper-small"
19
+ TTS_MODEL_ID = "facebook/mms-tts-deu"
20
+
21
+ _asr = None
22
+ _tts = None
23
+
24
+ # ========================================================
25
+ # STT PIPELINE
26
+ # ========================================================
27
+
28
def get_asr_pipeline():
    """Return the lazily-initialised Whisper ASR pipeline (module singleton)."""
    global _asr
    if _asr is not None:
        return _asr
    print(f">>> Lade ASR Modell: {ASR_MODEL_ID}")
    # CPU-only; return_timestamps together with chunk_length_s enables
    # transformers' automatic chunking for inputs longer than 30 seconds.
    _asr = pipeline(
        task="automatic-speech-recognition",
        model=ASR_MODEL_ID,
        device="cpu",
        return_timestamps=True,
        chunk_length_s=30,
    )
    return _asr
40
+
41
+ # ========================================================
42
+ # TTS PIPELINE
43
+ # ========================================================
44
+
45
def get_tts_pipeline():
    """Return the lazily-initialised MMS German TTS pipeline (module singleton)."""
    global _tts
    if _tts is not None:
        return _tts
    print(f">>> Lade TTS Modell: {TTS_MODEL_ID}")
    _tts = pipeline(
        task="text-to-speech",
        model=TTS_MODEL_ID,
    )
    return _tts
54
+
55
+ # ========================================================
56
+ # AUDIO FILTER – Noise Reduction + Highpass
57
+ # ========================================================
58
+
59
+ def butter_highpass_filter(data, cutoff=60, fs=16000, order=4):
60
+ nyq = 0.5 * fs
61
+ norm_cutoff = cutoff / nyq
62
+ b, a = butter(order, norm_cutoff, btype="high")
63
+ return filtfilt(b, a, data)
64
+
65
def apply_fade(audio, sr, duration_ms=10):
    """Apply a short linear fade-in/fade-out to suppress click artifacts.

    Args:
        audio: 1-D float sample array.
        sr: Sampling rate in Hz.
        duration_ms: Fade length in milliseconds at each end.

    Returns:
        A new array with both ends faded. If the signal is too short for two
        fades (or the fade length is zero), the input is returned unchanged.
    """
    fade_samples = int(sr * duration_ms / 1000)

    # Too short to fade both ends, or nothing to fade: return as-is.
    if fade_samples <= 0 or fade_samples * 2 >= len(audio):
        return audio

    # Fix: fade a copy — the previous version multiplied into the caller's
    # buffer in place, silently mutating the input array.
    faded = np.array(audio, copy=True)
    faded[:fade_samples] *= np.linspace(0.0, 1.0, fade_samples)
    faded[-fade_samples:] *= np.linspace(1.0, 0.0, fade_samples)
    return faded
78
+
79
+ # ========================================================
80
+ # SPEECH-TO-TEXT (STT)
81
+ # ========================================================
82
+
83
def transcribe_audio(audio_path: str) -> str:
    """Transcribe a WAV file to German text via the Whisper pipeline.

    Args:
        audio_path: Path to a WAV file (as produced by gr.Audio type="filepath").

    Returns:
        The transcribed text, or "" when no audio was provided.
    """
    if audio_path is None:
        return ""

    # soundfile decodes PCM reliably; sample_rate is the file's native rate.
    samples, sample_rate = sf.read(audio_path)

    # Down-mix to mono by keeping the first channel.
    if samples.ndim > 1:
        samples = samples[:, 0]

    # Cap the input at 30 s to stay inside Whisper's window.
    limit = sample_rate * 30
    if len(samples) > limit:
        samples = samples[:limit]

    recognizer = get_asr_pipeline()

    print(">>> Transkribiere Audio...")
    result = recognizer(
        {"array": samples, "sampling_rate": sample_rate},
    )

    text = result.get("text", "").strip()
    print("ASR:", text)
    return text
113
+
114
+ # ========================================================
115
+ # TEXT-TO-SPEECH (TTS)
116
+ # ========================================================
117
+
118
def synthesize_speech(text: str):
    """Synthesize German speech for *text* using the MMS-TTS pipeline.

    Args:
        text: Text to speak; empty or whitespace-only input yields no audio.

    Returns:
        Tuple ``(sample_rate, int16 mono samples)`` suitable for gr.Audio,
        or ``None`` when there is nothing to synthesize.
    """
    if not text or not text.strip():
        return None

    tts = get_tts_pipeline()
    out = tts(text)

    # Raw MMS output: float32 samples in [-1, 1].
    audio = np.array(out["audio"], dtype=np.float32)
    sr = out.get("sampling_rate", 16000)

    # Fall back to 16 kHz on missing/implausible sample rates.
    # NOTE(review): the 65535 upper bound presumably guards a 16-bit header
    # field — confirm before changing.
    if sr is None or sr <= 0 or sr > 65535:
        sr = 16000

    # Force mono: squeeze singleton dims, then keep the first channel.
    if audio.ndim > 1:
        audio = audio.squeeze()
    if audio.ndim > 1:
        audio = audio[:, 0]

    # Best-effort rumble removal; keep the unfiltered audio if filtering fails.
    # Fix: was a bare `except:`, which also swallowed SystemExit and
    # KeyboardInterrupt — narrowed to Exception.
    try:
        audio = butter_highpass_filter(audio, cutoff=60, fs=sr)
    except Exception:
        pass

    # Peak-normalize to [-1, 1].
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Short fades against clicks/pops at both ends.
    audio = apply_fade(audio, sr)

    # Convert to 16-bit PCM.
    audio_int16 = np.clip(audio * 32767, -32768, 32767).astype(np.int16)

    # Return value shape expected by the Gradio audio component.
    return (sr, audio_int16)
split_documents.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # split_documents.py – v2
2
+
3
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
4
+
5
+ CHUNK_SIZE = 1500
6
+ CHUNK_OVERLAP = 200
7
+
8
def split_documents(docs):
    """Split documents into overlapping chunks and tag each with its config.

    Args:
        docs: LangChain Documents to split.

    Returns:
        List of chunk Documents with chunk_size/chunk_overlap in metadata.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        # Prefer paragraph, then line, then sentence boundaries.
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    pieces = text_splitter.split_documents(docs)

    # Record the splitter configuration so chunks stay traceable downstream.
    for piece in pieces:
        piece.metadata["chunk_size"] = CHUNK_SIZE
        piece.metadata["chunk_overlap"] = CHUNK_OVERLAP

    return pieces
21
+
22
if __name__ == "__main__":
    # Quick manual check: load the corpus and report split statistics.
    from load_documents import load_documents

    source_docs = load_documents()
    split_chunks = split_documents(source_docs)
    print("Docs:", len(source_docs), "Chunks:", len(split_chunks))
    print(split_chunks[0].page_content[:300], split_chunks[0].metadata)
28
+
supabase_client.py DELETED
@@ -1,25 +0,0 @@
1
- # supabase_client.py
2
- import os
3
- from supabase import create_client
4
-
5
- SUPABASE_URL = os.environ["SUPABASE_URL"]
6
- SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
7
-
8
- supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
9
-
10
-
11
- def load_file_bytes(bucket: str, filename: str) -> bytes:
12
- return supabase.storage.from_(bucket).download(filename)
13
-
14
-
15
- def match_documents(embedding: list, k: int = 4):
16
- """
17
- Gọi trực tiếp RPC match_documents trong Supabase.
18
- Trả về list các rows: {content, metadata, embedding?}
19
- """
20
- resp = supabase.rpc(
21
- "match_documents",
22
- {"query_embedding": embedding, "match_count": k}
23
- ).execute()
24
-
25
- return resp.data or []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
upload_weblink_to_supabase.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ from supabase import create_client
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
+
9
+ SUPABASE_URL = os.environ["SUPABASE_URL"]
10
+ SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
11
+
12
+ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
13
+
14
+ LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
15
+
16
def extract_paragraphs():
    """Scrape the NRW Higher Education Act and split it into § sections.

    Returns:
        List of dicts with keys ``abs_id``, ``title``, ``content`` and
        ``order_index`` — one entry per § paragraph, in document order.
    """
    print(">>> Lade Hochschulgesetz NRW …")

    page_html = requests.get(LAW_URL, timeout=30).text
    soup = BeautifulSoup(page_html, "html.parser")

    paragraphs = []

    # Paragraph titles appear as <h2>/<h3>; only headings starting with "§"
    # are law paragraphs — everything else is skipped.
    for heading in soup.find_all(["h2", "h3"]):
        title = heading.get_text(" ", strip=True)

        if not title.startswith("§"):
            continue

        # Collect all sibling content from this heading up to the next <h2>/<h3>.
        body_parts = []
        node = heading.find_next_sibling()

        while node and node.name not in ["h2", "h3"]:
            chunk = node.get_text(" ", strip=True)
            if chunk:
                body_parts.append(chunk)
            node = node.find_next_sibling()

        # 1-based position doubles as the stable row id suffix.
        position = len(paragraphs) + 1
        paragraphs.append({
            "abs_id": f"para_{position}",
            "title": title,
            "content": "\n".join(body_parts).strip(),
            "order_index": position,
        })

    print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
    return paragraphs
59
+
60
def upload_to_supabase():
    """Replace the hg_nrw table contents with freshly scraped § paragraphs."""
    records = extract_paragraphs()

    # Wipe the table first so the upload is a clean full refresh.
    print(">>> Clear table hg_nrw …")
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Upload begin …")
    batch_size = 100
    for start in range(0, len(records), batch_size):
        chunk = records[start:start + batch_size]
        print(f" - Upload batch {start} – {start + len(chunk) - 1}")
        supabase.table("hg_nrw").upsert(chunk).execute()

    print("✔ DONE uploading complete NRW law.")

if __name__ == "__main__":
    upload_to_supabase()
vectorstore.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ BƯỚC 4: VECTORSTORE (FAISS in-memory)
3
+ -------------------------------------
4
+ Tạo FAISS index từ các CHUNK văn bản.
5
+
6
+ - Không ghi file .faiss nào, tất cả nằm trong RAM.
7
+ - Embeddings được lấy từ get_embeddings() (Bước 3).
8
+ """
9
+
10
+ from langchain_community.vectorstores import FAISS
11
+ from embeddings import get_embeddings
12
+
13
def build_vectorstore(chunks):
    """Build an in-memory FAISS index over the given chunk Documents.

    Args:
        chunks: Split LangChain Documents to embed.

    Returns:
        FAISS vector store held entirely in RAM (nothing written to disk).
    """
    print(">>> Initialising embedding model for FAISS index ...")
    embedding_model = get_embeddings()

    print(f">>> Building FAISS index from {len(chunks)} chunks ...")
    index = FAISS.from_documents(chunks, embedding_model)
    print(">>> FAISS index built.\n")
    return index
24
+
25
if __name__ == "__main__":
    # Smoke test for the pipeline: load -> split -> FAISS -> similarity_search.
    # Fix: removed the unused `from pprint import pprint` import.
    from load_documents import load_documents
    from split_documents import split_documents

    print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n")

    # 1) Load source documents (PDF + HTML).
    docs = load_documents()
    print(f"Loaded {len(docs)} raw documents.")

    # 2) Split into chunks.
    chunks = split_documents(docs)
    print(f"Split into {len(chunks)} chunks.\n")

    # 3) Build the FAISS vector store.
    vectorstore = build_vectorstore(chunks)

    # 4) Run a sample similarity search.
    query = "Fristen für die Prüfungsanmeldung im Bachelorstudium"
    print("Test query:")
    print(" ", query, "\n")

    results = vectorstore.similarity_search(query, k=3)

    print("Top-3 ähnliche Chunks aus dem VectorStore:")
    for i, doc in enumerate(results, start=1):
        print(f"\n=== RESULT {i} ===")
        print(doc.page_content[:400], "...")
        print("Metadata:", doc.metadata)
56
+