Nguyen5 commited on
Commit
b77e194
·
1 Parent(s): 612ca73
Files changed (6) hide show
  1. app.py +256 -149
  2. build_hg_viewer.py +677 -272
  3. load_documents.py +165 -95
  4. rag_pipeline.py +158 -155
  5. requirements.txt +3 -0
  6. upload_weblink_to_supabase.py +101 -43
app.py CHANGED
@@ -1,212 +1,319 @@
1
-
2
- # app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
3
- # Version 26.11 – ohne Modi, stabil für Text + Voice
4
 
5
  import gradio as gr
6
  from gradio_pdf import PDF
7
  from huggingface_hub import hf_hub_download
 
8
 
9
- from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
10
  from split_documents import split_documents
11
  from vectorstore import build_vectorstore
12
  from retriever import get_retriever
13
  from llm import load_llm
14
- from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
15
 
16
  from speech_io import transcribe_audio, synthesize_speech
17
 
18
  # =====================================================
19
- # INITIALISIERUNG (global)
20
  # =====================================================
21
 
22
- print("🔹 Lade Dokumente ...")
 
 
 
 
 
 
 
 
 
23
  _docs = load_documents()
24
 
25
- print("🔹 Splitte Dokumente ...")
26
  _chunks = split_documents(_docs)
27
 
28
- print("🔹 Baue VectorStore (FAISS) ...")
29
  _vs = build_vectorstore(_chunks)
30
 
31
- print("🔹 Erzeuge Retriever ...")
32
  _retriever = get_retriever(_vs)
33
 
34
- print("🔹 Lade LLM ...")
35
  _llm = load_llm()
36
 
37
- print("🔹 Lade Dateien für Viewer")
38
- _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
39
- _html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
 
 
 
 
 
 
40
 
41
  # =====================================================
42
- # Quellen formatieren – Markdown für Chat
43
  # =====================================================
44
 
45
  def format_sources_markdown(sources):
 
46
  if not sources:
47
  return ""
48
-
49
- lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
 
50
  for s in sources:
51
- sid = s["id"]
52
- src = s["source"]
53
  page = s["page"]
 
54
  url = s["url"]
55
- snippet = s["snippet"]
56
-
57
- title = f"Quelle {sid} – {src}"
58
-
59
  if url:
60
- base = f"- [{title}]({url})"
 
 
 
 
 
 
 
61
  else:
62
- base = f"- {title}"
63
-
64
- if page and "Prüfungsordnung" in src:
65
- base += f", Seite {page}"
66
-
67
- lines.append(base)
68
-
69
  if snippet:
70
- lines.append(f" > {snippet}")
71
-
72
  return "\n".join(lines)
73
 
74
  # =====================================================
75
- # TEXT CHATBOT
76
  # =====================================================
77
 
78
  def chatbot_text(user_message, history):
79
- if not user_message:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  return history, ""
81
-
82
- answer_text, sources = answer(
83
- question=user_message,
84
- retriever=_retriever,
85
- chat_model=_llm,
86
- )
87
-
88
- quellen_block = format_sources_markdown(sources)
89
-
90
- history = history + [
91
- {"role": "user", "content": user_message},
92
- {"role": "assistant", "content": answer_text + quellen_block},
93
- ]
94
-
95
- return history, ""
96
-
97
- # =====================================================
98
- # VOICE CHATBOT
99
- # =====================================================
100
 
101
  def chatbot_voice(audio_path, history):
102
- # 1. Speech → Text
 
 
 
 
103
  text = transcribe_audio(audio_path)
104
  if not text:
105
- return history, None, ""
106
-
107
- # Lưu vào lịch sử chat
108
- history = history + [{"role": "user", "content": text}]
109
-
110
- # 2. RAG trả lời
111
- answer_text, sources = answer(
112
- question=text,
113
- retriever=_retriever,
114
- chat_model=_llm,
115
- )
116
- quellen_block = format_sources_markdown(sources)
117
-
118
- bot_msg = answer_text + quellen_block
119
- history = history + [{"role": "assistant", "content": bot_msg}]
120
-
121
- # 3. Text → Speech
122
- audio = synthesize_speech(bot_msg)
123
-
124
- return history, audio, ""
125
-
126
- # =====================================================
127
- # LAST ANSWER → TTS
128
- # =====================================================
129
 
130
  def read_last_answer(history):
 
131
  if not history:
132
  return None
133
-
134
  for msg in reversed(history):
135
  if msg["role"] == "assistant":
136
- return synthesize_speech(msg["content"])
137
-
138
  return None
139
 
140
  # =====================================================
141
- # UI – GRADIO
142
  # =====================================================
143
 
144
- with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
145
- gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
146
- gr.Markdown(
147
- "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
148
- "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
149
- "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
150
- )
151
-
152
- with gr.Row():
153
- with gr.Column(scale=2):
154
- chatbot = gr.Chatbot(label="Chat", height=500)
155
-
156
- msg = gr.Textbox(
157
- label="Frage eingeben",
158
- placeholder="Stelle deine Frage zum Prüfungsrecht …",
159
- )
160
-
161
- # TEXT SENDEN
162
- msg.submit(
163
- chatbot_text,
164
- [msg, chatbot],
165
- [chatbot, msg]
166
- )
167
-
168
- send_btn = gr.Button("Senden (Text)")
169
- send_btn.click(
170
- chatbot_text,
171
- [msg, chatbot],
172
- [chatbot, msg]
173
- )
174
-
175
- # SPRACHEINGABE
176
- gr.Markdown("### 🎙️ Spracheingabe")
177
- voice_in = gr.Audio(sources=["microphone"], type="filepath")
178
- voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
179
-
180
- voice_btn = gr.Button("Sprechen & senden")
181
- voice_btn.click(
182
- chatbot_voice,
183
- [voice_in, chatbot],
184
- [chatbot, voice_out, msg]
185
- )
186
-
187
- read_btn = gr.Button("🔁 Antwort erneut vorlesen")
188
- read_btn.click(
189
- read_last_answer,
190
- [chatbot],
191
- [voice_out]
192
- )
193
-
194
- clear_btn = gr.Button("Chat zurücksetzen")
195
- clear_btn.click(lambda: [], None, chatbot)
196
-
197
- # =====================
198
- # RECHTE SPALTE: Viewer
199
- # =====================
200
-
201
- with gr.Column(scale=1):
202
- gr.Markdown("### 📄 Prüfungsordnung (PDF)")
203
- PDF(_pdf_path, height=350)
204
-
205
- gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
206
- gr.HTML(
207
- f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
208
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
 
 
 
210
 
211
  if __name__ == "__main__":
212
- demo.queue().launch(ssr_mode=False, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py – Aktualisierte Version mit verbessertem Viewer
3
+ """
4
 
5
  import gradio as gr
6
  from gradio_pdf import PDF
7
  from huggingface_hub import hf_hub_download
8
+ import os
9
 
10
+ from load_documents import load_documents, DATASET, PDF_FILE
11
  from split_documents import split_documents
12
  from vectorstore import build_vectorstore
13
  from retriever import get_retriever
14
  from llm import load_llm
15
+ from rag_pipeline import answer
16
 
17
  from speech_io import transcribe_audio, synthesize_speech
18
 
19
  # =====================================================
20
+ # KONFIGURATION
21
  # =====================================================
22
 
23
+ # Viewer URL (ersetze mit deiner Supabase URL)
24
+ SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://your-project.supabase.co")
25
+ LAW_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html"
26
+
27
+ # =====================================================
28
+ # INITIALISIERUNG
29
+ # =====================================================
30
+
31
+ print("🔹 Initialisiere System...")
32
+ print("1. Lade Dokumente...")
33
  _docs = load_documents()
34
 
35
+ print("2. Splitte Dokumente...")
36
  _chunks = split_documents(_docs)
37
 
38
+ print("3. Baue VectorStore...")
39
  _vs = build_vectorstore(_chunks)
40
 
41
+ print("4. Erzeuge Retriever...")
42
  _retriever = get_retriever(_vs)
43
 
44
+ print("5. Lade LLM...")
45
  _llm = load_llm()
46
 
47
+ print("6. Lade Dateien für Viewer...")
48
+ try:
49
+ _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
50
+ print(f"✅ PDF geladen: {_pdf_path}")
51
+ except Exception as e:
52
+ print(f"⚠️ PDF konnte nicht geladen werden: {e}")
53
+ _pdf_path = None
54
+
55
+ print("✅ System initialisiert!")
56
 
57
  # =====================================================
58
+ # HELPER FUNCTIONS
59
  # =====================================================
60
 
61
  def format_sources_markdown(sources):
62
+ """Formatiere Quellen als Markdown"""
63
  if not sources:
64
  return ""
65
+
66
+ lines = ["", "**📚 Quellenverweise:**", ""]
67
+
68
  for s in sources:
69
+ source_type = s["source"]
 
70
  page = s["page"]
71
+ para_id = s.get("paragraph_id", "")
72
  url = s["url"]
73
+ snippet = s.get("snippet", "")
74
+
75
+ # Build source line
 
76
  if url:
77
+ if "PDF" in source_type:
78
+ source_text = f"[{source_type}"
79
+ if page:
80
+ source_text += f", Seite {page}"
81
+ source_text += f"]({url})"
82
+ else:
83
+ display_name = para_id if para_id else "Hochschulgesetz NRW"
84
+ source_text = f"[{display_name}]({url})"
85
  else:
86
+ source_text = source_type
87
+
88
+ lines.append(f"- {source_text}")
89
+
 
 
 
90
  if snippet:
91
+ lines.append(f" > *{snippet}*")
92
+
93
  return "\n".join(lines)
94
 
95
  # =====================================================
96
+ # CHATBOT FUNCTIONS
97
  # =====================================================
98
 
99
  def chatbot_text(user_message, history):
100
+ """Text-Chatbot Funktion"""
101
+ if not user_message.strip():
102
+ return history, ""
103
+
104
+ try:
105
+ # Get answer from RAG pipeline
106
+ answer_text, sources = answer(
107
+ question=user_message,
108
+ retriever=_retriever,
109
+ chat_model=_llm
110
+ )
111
+
112
+ # Add sources
113
+ sources_text = format_sources_markdown(sources)
114
+ full_response = f"{answer_text}\n\n{sources_text}"
115
+
116
+ # Update history
117
+ history.append({"role": "user", "content": user_message})
118
+ history.append({"role": "assistant", "content": full_response})
119
+
120
+ return history, ""
121
+
122
+ except Exception as e:
123
+ error_msg = f"Fehler bei der Verarbeitung: {str(e)}"
124
+ history.append({"role": "user", "content": user_message})
125
+ history.append({"role": "assistant", "content": error_msg})
126
  return history, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
  def chatbot_voice(audio_path, history):
129
+ """Voice-Chatbot Funktion"""
130
+ if not audio_path:
131
+ return history, None, ""
132
+
133
+ # Transcribe audio
134
  text = transcribe_audio(audio_path)
135
  if not text:
136
+ return history, None, "Keine Sprache erkannt"
137
+
138
+ # Process with text chatbot
139
+ history, _ = chatbot_text(text, history)
140
+
141
+ # Get last response for TTS
142
+ last_response = None
143
+ for msg in reversed(history):
144
+ if msg["role"] == "assistant":
145
+ last_response = msg["content"]
146
+ break
147
+
148
+ # Generate audio
149
+ audio_output = None
150
+ if last_response:
151
+ audio_output = synthesize_speech(last_response.split("\n\n")[0]) # Nur erste Teil für TTS
152
+
153
+ return history, audio_output, text
 
 
 
 
 
 
154
 
155
  def read_last_answer(history):
156
+ """Lese letzte Antwort vor"""
157
  if not history:
158
  return None
159
+
160
  for msg in reversed(history):
161
  if msg["role"] == "assistant":
162
+ return synthesize_speech(msg["content"].split("\n\n")[0])
163
+
164
  return None
165
 
166
  # =====================================================
167
+ # GRADIO UI
168
  # =====================================================
169
 
170
+ def create_ui():
171
+ """Erstelle die Gradio Benutzeroberfläche"""
172
+
173
+ with gr.Blocks(
174
+ title="Prüfungsrechts-Chatbot NRW",
175
+ theme=gr.themes.Soft(),
176
+ css="""
177
+ .chatbot { min-height: 500px; }
178
+ .viewer-frame { border-radius: 10px; border: 1px solid #e0e0e0; }
179
+ """
180
+ ) as demo:
181
+
182
+ # Header
183
+ gr.Markdown("""
184
+ # 🧑‍⚖️ Prüfungsrechts-Chatbot für NRW Hochschulen
185
+
186
+ Dieser Chatbot beantwortet Fragen basierend auf:
187
+ - **Prüfungsordnung** (offizielles PDF)
188
+ - **Hochschulgesetz NRW** (aktuelle Fassung von recht.nrw.de)
189
+
190
+ Fragen können per Text oder Spracheingabe gestellt werden.
191
+ """)
192
+
193
+ with gr.Row():
194
+ # Left Column - Chat
195
+ with gr.Column(scale=2):
196
+ chatbot = gr.Chatbot(
197
+ label="Chat",
198
+ height=500,
199
+ bubble_full_width=False,
200
+ show_copy_button=True
201
+ )
202
+
203
+ with gr.Row():
204
+ msg = gr.Textbox(
205
+ label="Frage eingeben",
206
+ placeholder="Stellen Sie Ihre Frage zum Prüfungsrecht...",
207
+ scale=4,
208
+ container=False
209
+ )
210
+ send_btn = gr.Button("Senden", variant="primary", scale=1)
211
+
212
+ # Voice Input
213
+ with gr.Accordion("🎤 Spracheingabe", open=False):
214
+ with gr.Row():
215
+ voice_in = gr.Audio(
216
+ sources=["microphone"],
217
+ type="filepath",
218
+ label="Aufnahme",
219
+ scale=3
220
+ )
221
+ voice_btn = gr.Button("Sprechen & senden", scale=1)
222
+
223
+ voice_out = gr.Audio(
224
+ label="Antwort als Audio",
225
+ type="numpy",
226
+ visible=True
227
+ )
228
+
229
+ # Controls
230
+ with gr.Row():
231
+ read_btn = gr.Button("🔊 Antwort vorlesen")
232
+ clear_btn = gr.Button("🗑️ Chat leeren", variant="secondary")
233
+
234
+ # Right Column - Viewer
235
+ with gr.Column(scale=1):
236
+ # PDF Viewer
237
+ gr.Markdown("### 📄 Prüfungsordnung")
238
+ if _pdf_path:
239
+ pdf_viewer = PDF(_pdf_path, height=350, label="PDF Viewer")
240
+ else:
241
+ gr.Markdown("⚠️ PDF konnte nicht geladen werden")
242
+
243
+ # Law Viewer
244
+ gr.Markdown("### 📘 Hochschulgesetz NRW")
245
+ gr.HTML(f"""
246
+ <iframe
247
+ src="{LAW_VIEWER_URL}"
248
+ style="width:100%; height:400px; border:none; border-radius:10px;"
249
+ title="Hochschulgesetz NRW Viewer"
250
+ ></iframe>
251
+ """)
252
+
253
+ # Event Handlers
254
+ # Text input
255
+ msg.submit(
256
+ chatbot_text,
257
+ [msg, chatbot],
258
+ [chatbot, msg]
259
+ )
260
+
261
+ send_btn.click(
262
+ chatbot_text,
263
+ [msg, chatbot],
264
+ [chatbot, msg]
265
+ )
266
+
267
+ # Voice input
268
+ voice_btn.click(
269
+ chatbot_voice,
270
+ [voice_in, chatbot],
271
+ [chatbot, voice_out, msg]
272
+ )
273
+
274
+ # Controls
275
+ read_btn.click(
276
+ read_last_answer,
277
+ [chatbot],
278
+ [voice_out]
279
+ )
280
+
281
+ clear_btn.click(
282
+ lambda: [],
283
+ None,
284
+ [chatbot]
285
+ )
286
+
287
+ # Instructions
288
+ gr.Markdown("""
289
+ ### ℹ️ Nutzungshinweise
290
+
291
+ 1. **Präzise Fragen** stellen für bessere Antworten
292
+ 2. **Quellen** werden automatisch verlinkt
293
+ 3. **Klicken Sie auf Links** im Chat, um direkt zur Quelle zu springen
294
+ 4. **Spracheingabe** für hands-free Nutzung
295
+
296
+ ### ⚠️ Hinweis
297
+ Dies ist ein Assistenzsystem. Für verbindliche rechtliche Auskünfte wenden Sie sich bitte an die zuständigen Prüfungsämter.
298
+ """)
299
+
300
+ return demo
301
 
302
+ # =====================================================
303
+ # MAIN
304
+ # =====================================================
305
 
306
  if __name__ == "__main__":
307
+ demo = create_ui()
308
+
309
+ # Konfiguration für HuggingFace Spaces
310
+ demo.queue(
311
+ max_size=20,
312
+ api_open=False
313
+ ).launch(
314
+ server_name="0.0.0.0",
315
+ server_port=7860,
316
+ share=False,
317
+ show_error=True,
318
+ debug=False
319
+ )
build_hg_viewer.py CHANGED
@@ -1,7 +1,12 @@
1
- # build_hg_viewer.py
 
 
 
2
  import os
 
3
  from supabase import create_client
4
  from dotenv import load_dotenv
 
5
 
6
  load_dotenv()
7
 
@@ -13,301 +18,701 @@ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE:
13
 
14
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
15
 
16
- from upload_weblink_to_supabase import extract_paragraphs
17
-
18
- # ======== HTML TEMPLATE ========
19
- VIEW_TEMPLATE = """
20
- <!DOCTYPE html>
 
 
 
 
 
 
 
 
 
 
21
  <html lang="de">
22
  <head>
23
- <meta charset="UTF-8">
24
- <title>Hochschulgesetz NRW Paragraph Viewer</title>
25
-
26
- <style>
27
- body {
28
- font-family: Arial, sans-serif;
29
- margin: 0;
30
- padding: 0;
31
- display: flex;
32
- }
33
-
34
- /* ----------- SIDEBAR ------------- */
35
- #sidebar {
36
- width: 280px;
37
- height: 100vh;
38
- overflow-y: auto;
39
- background: #f5f5f5;
40
- border-right: 1px solid #ccc;
41
- padding: 15px;
42
- position: sticky;
43
- top: 0;
44
- }
45
-
46
- #sidebar h2 {
47
- margin-top: 0;
48
- }
49
-
50
- #searchBox {
51
- width: 100%;
52
- padding: 8px;
53
- font-size: 15px;
54
- margin-bottom: 10px;
55
- border: 1px solid #aaa;
56
- border-radius: 5px;
57
- }
58
-
59
- .sidebar-link {
60
- display: block;
61
- padding: 6px 8px;
62
- margin-bottom: 4px;
63
- text-decoration: none;
64
- color: #003366;
65
- border-radius: 4px;
66
- }
67
-
68
- .sidebar-link:hover {
69
- background: #e0e7ff;
70
- color: #001d4d;
71
- }
72
-
73
- /* ----------- CONTENT ------------- */
74
- #content {
75
- flex: 1;
76
- padding: 25px;
77
- max-width: 900px;
78
- }
79
-
80
- /* Absatz block */
81
- .para {
82
- padding: 20px 0;
83
- border-bottom: 1px solid #ddd;
84
- }
85
-
86
- .para h2 {
87
- color: #003366;
88
- margin-bottom: 10px;
89
- }
90
-
91
- /* ----------- Fußnoten ------------- */
92
- .fn-block {
93
- background: #fafafa;
94
- border-left: 4px solid #999;
95
- padding: 12px;
96
- margin-top: 10px;
97
- margin-bottom: 25px;
98
- }
99
-
100
- .fn-toggle {
101
- cursor: pointer;
102
- font-weight: bold;
103
- color: #003366;
104
- margin-bottom: 5px;
105
- }
106
-
107
- .fn-content {
108
- display: none;
109
- padding-left: 10px;
110
- }
111
-
112
- .fn-title {
113
- font-weight: bold;
114
- margin-bottom: 6px;
115
- }
116
-
117
- .fn-item {
118
- margin-bottom: 8px;
119
- }
120
-
121
- /* ----------- Highlight beim Öffnen ------------- */
122
- .highlight {
123
- animation: flash 2s ease-in-out;
124
- background: #fff8c6 !important;
125
- }
126
-
127
- @keyframes flash {
128
- 0% { background: #fff8c6; }
129
- 100% { background: transparent; }
130
- }
131
-
132
- /* Keyword highlight */
133
- .keyword {
134
- background: yellow;
135
- padding: 2px 3px;
136
- border-radius: 3px;
137
- }
138
-
139
- /* Back to top button */
140
- #topBtn {
141
- position: fixed;
142
- bottom: 25px;
143
- right: 25px;
144
- background: #003366;
145
- color: white;
146
- border-radius: 8px;
147
- padding: 10px 14px;
148
- cursor: pointer;
149
- font-size: 16px;
150
- display: none;
151
- }
152
- </style>
 
 
 
 
 
 
 
 
 
 
 
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  </head>
155
  <body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- <div id="sidebar">
158
- <h2>Inhaltsverzeichnis</h2>
159
- <input type="text" id="searchBox" placeholder="Suchen nach § …">
160
- <!-- SIDEBAR_LINKS -->
161
- </div>
162
-
163
- <div id="content">
164
- <h1>Hochschulgesetz NRW Paragraph Viewer</h1>
165
- <!-- PARAGRAPH_CONTENT -->
166
- </div>
167
-
168
- <div id="topBtn" onclick="scrollToTop()">⬆️ Top</div>
169
-
170
- <script>
171
- // ------ TỰ ĐỘNG HIGHLIGHT Absatz khi có #anchor HIGHLIGHT ABSATZ & SCROLL ------
172
- window.onload = function() {
173
- const anchor = window.location.hash.substring(1);
174
- const params = new URLSearchParams(window.location.search);
175
- const keywords = params.get("k");
176
-
177
- if (anchor) {
178
- const el = document.getElementById(anchor);
179
- if (el) {
180
- el.classList.add("highlight");
181
- el.scrollIntoView({ behavior: "smooth", block: "center" });
182
- }
183
- }
184
-
185
- /* KEYWORD HIGHLIGHT */
186
- if (keywords) {
187
- const words = keywords.split("%20");
188
- highlightKeywords(words);
189
- }
190
- };
191
-
192
- /* --- KEYWORD HIGHLIGHT FUNCTION --- */
193
- function highlightKeywords(words) {
194
- const container = document.getElementById("content");
195
- let html = container.innerHTML;
196
-
197
- words.forEach(word => {
198
- if (word.length < 2) return;
199
- const regex = new RegExp(`(${decodeURIComponent(word)})`, "gi");
200
- html = html.replace(regex, `<span class="keyword">$1</span>`);
201
- });
202
-
203
- container.innerHTML = html;
204
- }
205
-
206
- /* --- SEARCH IN SIDEBAR --- */
207
- document.getElementById("searchBox").addEventListener("input", function() {
208
- const q = this.value.toLowerCase();
209
- document.querySelectorAll(".sidebar-link").forEach(link => {
210
- const txt = link.innerText.toLowerCase();
211
- link.style.display = txt.includes(q) ? "block" : "none";
212
- });
213
- });
214
-
215
- /* --- COLLAPSIBLE FUSSNOTEN --- */
216
- document.addEventListener("click", function(e) {
217
- if (e.target.classList.contains("fn-toggle")) {
218
- const content = e.target.nextElementSibling;
219
- content.style.display = content.style.display === "block" ? "none" : "block";
220
- }
221
- });
222
-
223
- /* --- BACK TO TOP BUTTON --- */
224
- window.onscroll = function() {
225
- document.getElementById("topBtn").style.display =
226
- window.scrollY > 300 ? "block" : "none";
227
- };
228
-
229
- function scrollToTop() {
230
- window.scrollTo({ top: 0, behavior: 'smooth' });
231
- }
232
-
233
- </script>
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  </body>
236
  </html>
237
  """
238
 
239
  # -------------------------------------------------------------------
240
- # 2. BUILD VIEWER
241
  # -------------------------------------------------------------------
242
 
243
  def build_html():
244
- print(">>> Lade Paragraphs aus Supabase...")
245
- paras = extract_paragraphs()
246
-
247
- sidebar_links = ""
248
- content_html = ""
249
-
250
- for p in paras:
 
 
 
 
251
  pid = p["abs_id"]
252
  title = p["title"]
253
- body = p["content"]
254
-
255
- # Sidebar item
256
- sidebar_links += f'<a class="sidebar-link" href="#{pid}">{title}</a>\n'
257
-
258
- # Fußnoten tách riêng (bắt đầu bằng "Fn 1", "Fn 2", ...)
259
- lines = body.split("\n")
260
- main_text = []
261
- fn_text = []
262
- in_fn = False
263
-
 
 
 
 
 
264
  for line in lines:
265
- if line.startswith("Fn "):
266
- in_fn = True
267
- if in_fn:
268
- fn_text.append(line)
269
- else:
270
- main_text.append(line)
271
-
272
- footnotes_html = ""
273
- if fn_text:
274
- footnotes_html += '<div class="fn-block">'
275
- footnotes_html += '<div class="fn-title">Fußnoten:</div>'
276
- for fn in fn_text:
277
- footnotes_html += f'<div class="fn-item">{fn}</div>'
278
- footnotes_html += "</div>"
279
-
280
- # Paragraph block
281
- content_html += f"""
282
- <div class="para" id="{pid}">
283
- <h2>{title}</h2>
284
- <div>{'<br>'.join(main_text)}</div>
285
- {footnotes_html}
286
- </div>
287
- """
288
-
289
- html = VIEW_TEMPLATE.replace("<!-- SIDEBAR_LINKS -->", sidebar_links)
290
- html = html.replace("<!-- PARAGRAPH_CONTENT -->", content_html)
291
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  return html
293
 
294
  # -------------------------------------------------------------------
295
- # 3. UPLOAD TO SUPABASE STORAGE
296
  # -------------------------------------------------------------------
297
 
298
  def upload_html():
 
 
 
299
  html = build_html()
300
-
301
- supabase.storage.from_("hg_viewer").update(
302
- "hg_clean.html",
303
- html.encode("utf-8"),
304
- {
305
- "content-type": "text/html",
306
- "x-upsert": "true"
307
- }
308
- )
309
-
310
- print("✔ hg_clean.html uploaded!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  if __name__ == "__main__":
313
- upload_html()
 
1
+ """
2
+ build_hg_viewer.py
3
+ Tạo HTML viewer cho Hochschulgesetz NRW với định dạng chuyên nghiệp
4
+ """
5
  import os
6
+ import json
7
  from supabase import create_client
8
  from dotenv import load_dotenv
9
+ import re
10
 
11
  load_dotenv()
12
 
 
18
 
19
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
20
 
21
+ def get_paragraphs_from_supabase():
22
+ """Lấy paragraphs từ Supabase"""
23
+ print(">>> Lade Paragraphs aus Supabase...")
24
+ response = supabase.table("hg_nrw").select("*").order("order_index").execute()
25
+ paragraphs = response.data
26
+
27
+ if not paragraphs:
28
+ print("❌ Keine Paragraphs in der Datenbank gefunden.")
29
+ return []
30
+
31
+ print(f"✔ {len(paragraphs)} Paragraphs geladen.")
32
+ return paragraphs
33
+
34
+ # ======== HTML TEMPLATE MIT PROFESSIONELLEM DESIGN ========
35
+ VIEW_TEMPLATE = """<!DOCTYPE html>
36
  <html lang="de">
37
  <head>
38
+ <meta charset="UTF-8">
39
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
40
+ <title>Hochschulgesetz NRW – Offizielle Viewer</title>
41
+ <style>
42
+ * {
43
+ margin: 0;
44
+ padding: 0;
45
+ box-sizing: border-box;
46
+ }
47
+
48
+ body {
49
+ font-family: 'Segoe UI', 'Roboto', 'Arial', sans-serif;
50
+ line-height: 1.6;
51
+ color: #333;
52
+ background: #f8f9fa;
53
+ display: flex;
54
+ min-height: 100vh;
55
+ }
56
+
57
+ /* ----------- SIDEBAR ------------- */
58
+ #sidebar {
59
+ width: 320px;
60
+ background: #ffffff;
61
+ border-right: 1px solid #e0e0e0;
62
+ height: 100vh;
63
+ overflow-y: auto;
64
+ position: fixed;
65
+ left: 0;
66
+ top: 0;
67
+ box-shadow: 2px 0 5px rgba(0,0,0,0.1);
68
+ z-index: 1000;
69
+ }
70
+
71
+ .sidebar-header {
72
+ padding: 20px;
73
+ background: linear-gradient(135deg, #003366 0%, #00509e 100%);
74
+ color: white;
75
+ border-bottom: 1px solid #002244;
76
+ }
77
+
78
+ .sidebar-header h2 {
79
+ font-size: 1.4rem;
80
+ font-weight: 600;
81
+ margin-bottom: 10px;
82
+ }
83
+
84
+ .sidebar-header p {
85
+ font-size: 0.9rem;
86
+ opacity: 0.9;
87
+ }
88
+
89
+ #searchBox {
90
+ width: 100%;
91
+ padding: 12px 15px;
92
+ font-size: 14px;
93
+ border: 1px solid #ddd;
94
+ border-radius: 8px;
95
+ margin: 15px;
96
+ background: #f8f9fa;
97
+ transition: all 0.3s;
98
+ }
99
+
100
+ #searchBox:focus {
101
+ outline: none;
102
+ border-color: #003366;
103
+ box-shadow: 0 0 0 3px rgba(0, 51, 102, 0.1);
104
+ }
105
+
106
+ .paragraph-list {
107
+ padding: 0 15px 20px 15px;
108
+ }
109
+
110
+ .sidebar-link {
111
+ display: block;
112
+ padding: 12px 15px;
113
+ margin-bottom: 5px;
114
+ text-decoration: none;
115
+ color: #003366;
116
+ background: #f8f9fa;
117
+ border-left: 4px solid transparent;
118
+ border-radius: 6px;
119
+ font-size: 14px;
120
+ font-weight: 500;
121
+ transition: all 0.2s;
122
+ }
123
+
124
+ .sidebar-link:hover {
125
+ background: #e3f2fd;
126
+ border-left-color: #003366;
127
+ transform: translateX(3px);
128
+ }
129
+
130
+ .sidebar-link.active {
131
+ background: #e3f2fd;
132
+ border-left-color: #003366;
133
+ font-weight: 600;
134
+ }
135
+
136
+ /* ----------- MAIN CONTENT ------------- */
137
+ #content-wrapper {
138
+ flex: 1;
139
+ margin-left: 320px;
140
+ min-height: 100vh;
141
+ }
142
+
143
+ #content {
144
+ max-width: 900px;
145
+ margin: 0 auto;
146
+ padding: 30px;
147
+ background: white;
148
+ min-height: 100vh;
149
+ box-shadow: 0 0 20px rgba(0,0,0,0.05);
150
+ }
151
+
152
+ .page-header {
153
+ margin-bottom: 40px;
154
+ padding-bottom: 20px;
155
+ border-bottom: 2px solid #003366;
156
+ }
157
+
158
+ .page-header h1 {
159
+ color: #003366;
160
+ font-size: 2.2rem;
161
+ font-weight: 700;
162
+ margin-bottom: 10px;
163
+ }
164
+
165
+ .page-header .subtitle {
166
+ color: #666;
167
+ font-size: 1.1rem;
168
+ }
169
+
170
+ /* ----------- PARAGRAPH STYLES ------------- */
171
+ .paragraph {
172
+ margin-bottom: 50px;
173
+ padding: 25px;
174
+ background: #ffffff;
175
+ border-radius: 10px;
176
+ border-left: 5px solid #003366;
177
+ box-shadow: 0 2px 10px rgba(0,0,0,0.08);
178
+ transition: all 0.3s;
179
+ }
180
 
181
+ .paragraph.highlight {
182
+ animation: highlight-pulse 2s ease;
183
+ border-left-color: #ff9800;
184
+ box-shadow: 0 0 0 3px rgba(255, 152, 0, 0.2);
185
+ }
186
+
187
+ .paragraph-header {
188
+ margin-bottom: 20px;
189
+ }
190
+
191
+ .paragraph-title {
192
+ color: #003366;
193
+ font-size: 1.6rem;
194
+ font-weight: 700;
195
+ margin-bottom: 10px;
196
+ display: flex;
197
+ align-items: center;
198
+ gap: 10px;
199
+ }
200
+
201
+ .paragraph-title .anchor {
202
+ font-size: 0.8em;
203
+ color: #666;
204
+ text-decoration: none;
205
+ opacity: 0;
206
+ transition: opacity 0.2s;
207
+ }
208
+
209
+ .paragraph:hover .anchor {
210
+ opacity: 1;
211
+ }
212
+
213
+ .paragraph-content {
214
+ font-size: 1.05rem;
215
+ line-height: 1.8;
216
+ color: #333;
217
+ }
218
+
219
+ .paragraph-content p {
220
+ margin-bottom: 15px;
221
+ }
222
+
223
+ .paragraph-content ul, .paragraph-content ol {
224
+ margin: 15px 0 15px 25px;
225
+ }
226
+
227
+ .paragraph-content li {
228
+ margin-bottom: 8px;
229
+ }
230
+
231
+ /* ----------- FOOTNOTES ------------- */
232
+ .footnotes {
233
+ margin-top: 25px;
234
+ padding-top: 20px;
235
+ border-top: 1px solid #eee;
236
+ }
237
+
238
+ .footnotes-title {
239
+ font-weight: 600;
240
+ color: #666;
241
+ margin-bottom: 15px;
242
+ font-size: 0.95rem;
243
+ }
244
+
245
+ .footnote-item {
246
+ margin-bottom: 10px;
247
+ padding-left: 15px;
248
+ border-left: 2px solid #ddd;
249
+ font-size: 0.9rem;
250
+ color: #555;
251
+ }
252
+
253
+ /* ----------- HIGHLIGHT ANIMATION ------------- */
254
+ @keyframes highlight-pulse {
255
+ 0% { background-color: #fff8e1; }
256
+ 70% { background-color: #fff8e1; }
257
+ 100% { background-color: #ffffff; }
258
+ }
259
+
260
+ /* ----------- RESPONSIVE ------------- */
261
+ @media (max-width: 992px) {
262
+ body {
263
+ flex-direction: column;
264
+ }
265
+
266
+ #sidebar {
267
+ position: static;
268
+ width: 100%;
269
+ height: auto;
270
+ max-height: 50vh;
271
+ }
272
+
273
+ #content-wrapper {
274
+ margin-left: 0;
275
+ }
276
+ }
277
+
278
+ /* ----------- BACK TO TOP ------------- */
279
+ #back-to-top {
280
+ position: fixed;
281
+ bottom: 30px;
282
+ right: 30px;
283
+ width: 50px;
284
+ height: 50px;
285
+ background: #003366;
286
+ color: white;
287
+ border-radius: 50%;
288
+ display: none;
289
+ justify-content: center;
290
+ align-items: center;
291
+ cursor: pointer;
292
+ box-shadow: 0 2px 10px rgba(0,0,0,0.2);
293
+ transition: all 0.3s;
294
+ z-index: 1000;
295
+ }
296
+
297
+ #back-to-top:hover {
298
+ background: #00509e;
299
+ transform: translateY(-3px);
300
+ }
301
+
302
+ /* ----------- KEYWORD HIGHLIGHT ------------- */
303
+ .keyword-highlight {
304
+ background: #fff9c4;
305
+ padding: 2px 4px;
306
+ border-radius: 3px;
307
+ font-weight: 500;
308
+ }
309
+
310
+ /* ----------- PRINT STYLES ------------- */
311
+ @media print {
312
+ #sidebar {
313
+ display: none;
314
+ }
315
+
316
+ #content-wrapper {
317
+ margin-left: 0;
318
+ }
319
+
320
+ #back-to-top {
321
+ display: none !important;
322
+ }
323
+ }
324
+ </style>
325
  </head>
326
  <body>
327
+ <!-- SIDEBAR -->
328
+ <div id="sidebar">
329
+ <div class="sidebar-header">
330
+ <h2>Hochschulgesetz NRW</h2>
331
+ <p>Inhaltsverzeichnis</p>
332
+ </div>
333
+
334
+ <input type="text" id="searchBox" placeholder="Paragraph suchen (z.B. §1 oder Text)..."
335
+ title="Geben Sie eine Paragraphennummer oder Suchbegriff ein">
336
+
337
+ <div class="paragraph-list" id="paragraphList">
338
+ <!-- SIDEBAR_LINKS -->
339
+ </div>
340
+ </div>
341
+
342
+ <!-- MAIN CONTENT -->
343
+ <div id="content-wrapper">
344
+ <div id="content">
345
+ <div class="page-header">
346
+ <h1>Hochschulgesetz Nordrhein-Westfalen</h1>
347
+ <p class="subtitle">Gesetz über die Hochschulen des Landes Nordrhein-Westfalen (Hochschulgesetz – HG)</p>
348
+ <p class="subtitle" style="font-size: 0.9rem; color: #777;">
349
+ Stand: Aktuelle Fassung | Quelle: <a href="https://recht.nrw.de" target="_blank">recht.nrw.de</a>
350
+ </p>
351
+ </div>
352
+
353
+ <div id="paragraphContent">
354
+ <!-- PARAGRAPH_CONTENT -->
355
+ </div>
356
+ </div>
357
+ </div>
358
+
359
+ <!-- BACK TO TOP BUTTON -->
360
+ <div id="back-to-top" title="Zum Anfang">
361
+ ↑
362
+ </div>
363
+
364
+ <script>
365
+ // ========== GLOBAL VARIABLES ==========
366
+ let currentParagraphId = '';
367
+ let searchTimeout = null;
368
+
369
+ // ========== INITIALIZATION ==========
370
+ document.addEventListener('DOMContentLoaded', function() {
371
+ // Check for URL hash
372
+ const hash = window.location.hash.substring(1);
373
+ const urlParams = new URLSearchParams(window.location.search);
374
+ const keywords = urlParams.get('keywords');
375
+
376
+ if (hash) {
377
+ scrollToParagraph(hash);
378
+ }
379
+
380
+ if (keywords) {
381
+ highlightKeywords(decodeURIComponent(keywords));
382
+ }
383
+
384
+ setupEventListeners();
385
+ updateActiveLink();
386
+ });
387
+
388
+ // ========== SCROLL TO PARAGRAPH ==========
389
+ function scrollToParagraph(paragraphId, highlight = true) {
390
+ const element = document.getElementById(paragraphId);
391
+ if (!element) return;
392
+
393
+ // Remove previous highlight
394
+ document.querySelectorAll('.paragraph.highlight').forEach(el => {
395
+ el.classList.remove('highlight');
396
+ });
397
+
398
+ // Calculate position for smooth scroll
399
+ const sidebarHeight = document.getElementById('sidebar').offsetHeight;
400
+ const elementPosition = element.getBoundingClientRect().top;
401
+ const offsetPosition = elementPosition + window.pageYOffset - 100;
402
+
403
+ // Smooth scroll
404
+ window.scrollTo({
405
+ top: offsetPosition,
406
+ behavior: 'smooth'
407
+ });
408
+
409
+ // Highlight if requested
410
+ if (highlight) {
411
+ setTimeout(() => {
412
+ element.classList.add('highlight');
413
+
414
+ // Update URL without page reload
415
+ history.replaceState(null, null, `#${paragraphId}`);
416
+
417
+ // Update active link in sidebar
418
+ updateActiveLink(paragraphId);
419
+ }, 300);
420
+ }
421
+ }
422
 
423
+ // ========== SEARCH FUNCTIONALITY ==========
424
+ function setupEventListeners() {
425
+ const searchBox = document.getElementById('searchBox');
426
+
427
+ // Search input with debounce
428
+ searchBox.addEventListener('input', function() {
429
+ clearTimeout(searchTimeout);
430
+ searchTimeout = setTimeout(() => {
431
+ filterParagraphs(this.value);
432
+ }, 300);
433
+ });
434
+
435
+ // Enter key to jump to first result
436
+ searchBox.addEventListener('keypress', function(e) {
437
+ if (e.key === 'Enter') {
438
+ e.preventDefault();
439
+ jumpToFirstResult(this.value);
440
+ }
441
+ });
442
+
443
+ // Back to top button
444
+ const backToTop = document.getElementById('back-to-top');
445
+ backToTop.addEventListener('click', function() {
446
+ window.scrollTo({
447
+ top: 0,
448
+ behavior: 'smooth'
449
+ });
450
+ });
451
+
452
+ // Show/hide back to top button
453
+ window.addEventListener('scroll', function() {
454
+ if (window.scrollY > 500) {
455
+ backToTop.style.display = 'flex';
456
+ } else {
457
+ backToTop.style.display = 'none';
458
+ }
459
+
460
+ updateActiveLink();
461
+ });
462
+ }
463
+
464
+ function filterParagraphs(searchTerm) {
465
+ const links = document.querySelectorAll('.sidebar-link');
466
+ const searchLower = searchTerm.toLowerCase();
467
+ let hasVisible = false;
468
+
469
+ links.forEach(link => {
470
+ const text = link.textContent.toLowerCase();
471
+ if (text.includes(searchLower)) {
472
+ link.style.display = 'block';
473
+ hasVisible = true;
474
+ } else {
475
+ link.style.display = 'none';
476
+ }
477
+ });
478
+
479
+ // Update search box placeholder based on results
480
+ const searchBox = document.getElementById('searchBox');
481
+ if (!hasVisible && searchTerm) {
482
+ searchBox.title = 'Keine Ergebnisse gefunden';
483
+ } else {
484
+ searchBox.title = '';
485
+ }
486
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
+ function jumpToFirstResult(searchTerm) {
489
+ const links = document.querySelectorAll('.sidebar-link');
490
+ const searchLower = searchTerm.toLowerCase();
491
+
492
+ for (const link of links) {
493
+ if (link.style.display !== 'none') {
494
+ const paragraphId = link.getAttribute('href').substring(1);
495
+ scrollToParagraph(paragraphId);
496
+ break;
497
+ }
498
+ }
499
+ }
500
+
501
+ // ========== HIGHLIGHT KEYWORDS ==========
502
+ function highlightKeywords(keywords) {
503
+ const content = document.getElementById('paragraphContent');
504
+ const searchTerms = keywords.split(/[\s,]+/).filter(term => term.length > 2);
505
+
506
+ searchTerms.forEach(term => {
507
+ const regex = new RegExp(`(${escapeRegExp(term)})`, 'gi');
508
+ content.innerHTML = content.innerHTML.replace(regex,
509
+ '<span class="keyword-highlight">$1</span>');
510
+ });
511
+ }
512
+
513
+ function escapeRegExp(string) {
514
+ return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
515
+ }
516
+
517
+ // ========== UPDATE ACTIVE LINK ==========
518
+ function updateActiveLink(forceId = null) {
519
+ const links = document.querySelectorAll('.sidebar-link');
520
+ const paragraphs = document.querySelectorAll('.paragraph');
521
+
522
+ let activeId = forceId;
523
+
524
+ if (!activeId) {
525
+ // Find paragraph in viewport
526
+ const viewportHeight = window.innerHeight;
527
+ const viewportMiddle = window.scrollY + (viewportHeight / 2);
528
+
529
+ for (const paragraph of paragraphs) {
530
+ const rect = paragraph.getBoundingClientRect();
531
+ const paragraphTop = window.pageYOffset + rect.top;
532
+ const paragraphBottom = paragraphTop + rect.height;
533
+
534
+ if (viewportMiddle >= paragraphTop && viewportMiddle <= paragraphBottom) {
535
+ activeId = paragraph.id;
536
+ break;
537
+ }
538
+ }
539
+ }
540
+
541
+ // Update active state
542
+ links.forEach(link => {
543
+ const href = link.getAttribute('href').substring(1);
544
+ if (href === activeId) {
545
+ link.classList.add('active');
546
+ } else {
547
+ link.classList.remove('active');
548
+ }
549
+ });
550
+ }
551
+
552
+ // ========== FORMAT CONTENT ==========
553
+ function formatContent(text) {
554
+ // Replace multiple newlines with paragraphs
555
+ return text.split('\n\n').map(paragraph => {
556
+ if (paragraph.trim()) {
557
+ return `<p>${paragraph.trim()}</p>`;
558
+ }
559
+ return '';
560
+ }).join('');
561
+ }
562
+
563
+ // ========== COPY TO CLIPBOARD ==========
564
+ function copyParagraphLink(paragraphId) {
565
+ const url = window.location.origin + window.location.pathname + '#' + paragraphId;
566
+ navigator.clipboard.writeText(url).then(() => {
567
+ // Show temporary notification
568
+ const notification = document.createElement('div');
569
+ notification.textContent = 'Link kopiert!';
570
+ notification.style.cssText = `
571
+ position: fixed;
572
+ top: 20px;
573
+ right: 20px;
574
+ background: #4CAF50;
575
+ color: white;
576
+ padding: 10px 20px;
577
+ border-radius: 5px;
578
+ z-index: 10000;
579
+ animation: fadeInOut 2s ease;
580
+ `;
581
+ document.body.appendChild(notification);
582
+
583
+ setTimeout(() => {
584
+ document.body.removeChild(notification);
585
+ }, 2000);
586
+ });
587
+ }
588
+ </script>
589
  </body>
590
  </html>
591
  """
592
 
593
  # -------------------------------------------------------------------
594
+ # BUILD VIEWER
595
  # -------------------------------------------------------------------
596
 
597
  def build_html():
598
+ """Xây dựng HTML viewer từ dữ liệu Supabase"""
599
+ paragraphs = get_paragraphs_from_supabase()
600
+
601
+ if not paragraphs:
602
+ print("❌ Keine Paragraphs zum Erstellen des Viewers verfügbar.")
603
+ return None
604
+
605
+ sidebar_links = []
606
+ content_html = []
607
+
608
+ for p in paragraphs:
609
  pid = p["abs_id"]
610
  title = p["title"]
611
+ content = p["content"]
612
+
613
+ # Tạo link cho sidebar
614
+ sidebar_link = f'''
615
+ <a class="sidebar-link" href="#{pid}" onclick="scrollToParagraph('{pid}'); return false;">
616
+ {title}
617
+ </a>
618
+ '''
619
+ sidebar_links.append(sidebar_link)
620
+
621
+ # Tạo nội dung paragraph
622
+ # Phân loại footnote và nội dung chính
623
+ lines = content.split('\n')
624
+ main_content = []
625
+ footnotes = []
626
+
627
  for line in lines:
628
+ line = line.strip()
629
+ if line.lower().startswith('fn ') or line.lower().startswith('fussnote'):
630
+ footnotes.append(line)
631
+ elif line:
632
+ main_content.append(line)
633
+
634
+ # Format main content
635
+ formatted_content = '<br>'.join(main_content)
636
+
637
+ # Format footnotes
638
+ footnotes_html = ''
639
+ if footnotes:
640
+ footnotes_html = '''
641
+ <div class="footnotes">
642
+ <div class="footnotes-title">Fußnoten:</div>
643
+ ''' + ''.join(f'<div class="footnote-item">{fn}</div>' for fn in footnotes) + '''
644
+ </div>
645
+ '''
646
+
647
+ # Tạo paragraph block
648
+ paragraph_html = f'''
649
+ <div class="paragraph" id="{pid}">
650
+ <div class="paragraph-header">
651
+ <h3 class="paragraph-title">
652
+ {title}
653
+ <a href="#{pid}" class="anchor" onclick="copyParagraphLink('{pid}'); return false;"
654
+ title="Link zu diesem Paragraph kopieren">🔗</a>
655
+ </h3>
656
+ </div>
657
+ <div class="paragraph-content">
658
+ {formatted_content}
659
+ </div>
660
+ {footnotes_html}
661
+ </div>
662
+ '''
663
+ content_html.append(paragraph_html)
664
+
665
+ # Điền nội dung vào template
666
+ html = VIEW_TEMPLATE
667
+ html = html.replace('<!-- SIDEBAR_LINKS -->', '\n'.join(sidebar_links))
668
+ html = html.replace('<!-- PARAGRAPH_CONTENT -->', '\n'.join(content_html))
669
+
670
+ # Thêm metadata
671
+ html = html.replace(
672
+ 'Aktuelle Fassung',
673
+ f'Aktuelle Fassung - {len(paragraphs)} Paragraphs'
674
+ )
675
+
676
  return html
677
 
678
  # -------------------------------------------------------------------
679
+ # UPLOAD TO SUPABASE STORAGE
680
  # -------------------------------------------------------------------
681
 
682
  def upload_html():
683
+ """Tạo và tải lên HTML viewer"""
684
+ print(">>> Baue HTML Viewer...")
685
+
686
  html = build_html()
687
+ if not html:
688
+ print("❌ Konnte HTML nicht erstellen.")
689
+ return
690
+
691
+ try:
692
+ # Tạo bucket nếu chưa tồn tại
693
+ try:
694
+ supabase.storage.get_bucket("hg_viewer")
695
+ except:
696
+ supabase.storage.create_bucket("hg_viewer", {
697
+ "public": True,
698
+ "file_size_limit": 10485760 # 10MB
699
+ })
700
+
701
+ # Upload HTML file
702
+ supabase.storage.from_("hg_viewer").upload(
703
+ "hg_viewer.html",
704
+ html.encode("utf-8"),
705
+ {
706
+ "content-type": "text/html",
707
+ "cache-control": "public, max-age=3600"
708
+ }
709
+ )
710
+
711
+ print("✅ hg_viewer.html erfolgreich hochgeladen!")
712
+ print(f"📁 URL: {SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html")
713
+
714
+ except Exception as e:
715
+ print(f"❌ Fehler beim Upload: {e}")
716
 
717
  if __name__ == "__main__":
718
+ upload_html()
load_documents.py CHANGED
@@ -1,130 +1,200 @@
1
  """
2
- BƯỚC 1: LOAD DOCUMENTS
3
- -----------------------
4
- Debug-full version
5
-
6
- - Lädt Prüfungsordnung (PDF) seitenweise.
7
- - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
8
- und zerlegt es in einzelne Absätze (Document pro <p>).
9
  """
10
 
11
  from huggingface_hub import hf_hub_download, list_repo_files
12
  from langchain_community.document_loaders import PyPDFLoader
13
  from langchain_core.documents import Document
14
  from bs4 import BeautifulSoup
 
 
 
15
 
16
  DATASET = "Nguyen5/docs"
17
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
18
- HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- def _load_hg_paragraph_documents(html_path: str):
21
  """
22
- Liest das generierte Hochschulgesetz-HTML ein und erzeugt
23
- pro <p>-Element einen LangChain-Document mit:
24
- - page_content = Text des Absatzes
25
- - metadata:
26
- source = "Hochschulgesetz NRW (HTML)"
27
- filename = HTML_FILE
28
- paragraph_id = id-Attribut (z.B. 'hg_abs_12'), falls vorhanden
29
  """
30
- with open(html_path, "r", encoding="utf-8") as f:
31
- html = f.read()
32
-
33
- soup = BeautifulSoup(html, "html.parser")
34
- docs = []
35
-
36
- for p in soup.find_all("p"):
37
- text = p.get_text(" ", strip=True)
38
- if not text:
39
- continue
40
-
41
- pid = p.get("id")
42
-
43
- metadata = {
44
- "source": "Hochschulgesetz NRW (HTML)",
45
- "filename": HTML_FILE,
46
- }
47
- if pid:
48
- metadata["paragraph_id"] = pid
49
-
50
- docs.append(Document(page_content=text, metadata=metadata))
51
-
52
- print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
53
- return docs
54
-
55
- def load_documents():
56
- print("=== START: load_documents() ===\n")
57
-
58
- # -------------------------
59
- # Check files in dataset
60
- # -------------------------
61
- print(">>> Checking dataset file list from HuggingFace...")
62
- files = list_repo_files(DATASET, repo_type="dataset")
63
- print("Files in dataset:", files, "\n")
64
-
65
- docs = []
66
 
67
- # -------------------------
68
- # Load PDF
69
- # -------------------------
70
- print(">>> Step 1: Download PDF from HuggingFace...")
71
  try:
72
  pdf_path = hf_hub_download(
73
  repo_id=DATASET,
74
  filename=PDF_FILE,
75
  repo_type="dataset",
76
  )
77
- print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
78
- except Exception as e:
79
- print("ERROR downloading PDF:", e)
80
- return []
81
-
82
- print(">>> Step 1.1: Loading PDF pages...")
83
- try:
84
  pdf_docs = PyPDFLoader(pdf_path).load()
85
- print(f"Loaded {len(pdf_docs)} PDF pages.\n")
 
 
 
 
 
 
 
 
 
 
 
 
86
  except Exception as e:
87
- print("ERROR loading PDF:", e)
88
  return []
89
 
90
- for d in pdf_docs:
91
- d.metadata["source"] = "Prüfungsordnung (PDF)"
92
- d.metadata["filename"] = PDF_FILE
93
-
94
- docs.extend(pdf_docs)
95
-
96
- # -------------------------
97
- # Load HTML (Hochschulgesetz NRW)
98
- # -------------------------
99
- print(">>> Step 2: Download HTML from HuggingFace...")
 
 
 
 
100
  try:
101
  html_path = hf_hub_download(
102
  repo_id=DATASET,
103
  filename=HTML_FILE,
104
  repo_type="dataset",
105
  )
106
- print(f"Downloaded HTML to local cache:\n{html_path}\n")
107
- except Exception as e:
108
- print("ERROR downloading HTML:", e)
109
- return docs
110
-
111
- print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
112
- try:
113
  html_docs = _load_hg_paragraph_documents(html_path)
 
 
114
  except Exception as e:
115
- print("ERROR loading / parsing HTML:", e)
116
- return docs
117
-
118
- docs.extend(html_docs)
119
-
120
- print("=== DONE: load_documents() ===\n")
121
- return docs
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  if __name__ == "__main__":
124
- print("\n=== Running load_documents.py directly ===\n")
125
  docs = load_documents()
126
- print(f"\n>>> TOTAL documents loaded: {len(docs)}")
127
-
128
- if len(docs):
129
- print("\nExample metadata from 1st document:")
130
- print(docs[0].metadata)
 
1
  """
2
+ load_documents.py
3
+ Cải thiện việc load tài liệu với xử lý lỗi tốt hơn
 
 
 
 
 
4
  """
5
 
6
  from huggingface_hub import hf_hub_download, list_repo_files
7
  from langchain_community.document_loaders import PyPDFLoader
8
  from langchain_core.documents import Document
9
  from bs4 import BeautifulSoup
10
+ import requests
11
+ import re
12
+ from typing import List, Optional
13
 
14
  DATASET = "Nguyen5/docs"
15
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
16
+ HTML_FILE = "Hochschulgesetz_NRW.html"
17
+
18
+ def clean_html_content(text: str) -> str:
19
+ """Làm sạch nội dung HTML"""
20
+ # Loại bỏ khoảng trắng thừa
21
+ text = re.sub(r'\s+', ' ', text)
22
+ # Chuẩn hóa dấu câu
23
+ text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
24
+ # Đảm bảo chữ cái đầu câu viết hoa
25
+ sentences = text.split('. ')
26
+ sentences = [s.strip().capitalize() for s in sentences if s.strip()]
27
+ return '. '.join(sentences)
28
+
29
+ def load_recht_nrw_direct() -> List[Document]:
30
+ """Tải trực tiếp từ recht.nrw.de"""
31
+ print(">>> Lade Hochschulgesetz NRW direkt von recht.nrw.de...")
32
+
33
+ url = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
34
+
35
+ try:
36
+ headers = {
37
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
38
+ }
39
+ response = requests.get(url, headers=headers, timeout=60)
40
+ response.raise_for_status()
41
+
42
+ soup = BeautifulSoup(response.text, 'html.parser')
43
+ docs = []
44
+
45
+ # Tìm tất cả các paragraph
46
+ for i, element in enumerate(soup.find_all(['p', 'div', 'td'])):
47
+ text = element.get_text(" ", strip=True)
48
+
49
+ # Chỉ lấy các phần có chứa §
50
+ if '§' in text:
51
+ # Tách title và content
52
+ lines = text.split('\n')
53
+ title = lines[0].strip() if lines else f"§ {i+1}"
54
+ content = " ".join(lines[1:]) if len(lines) > 1 else text
55
+
56
+ metadata = {
57
+ "source": "Hochschulgesetz NRW (Website)",
58
+ "filename": "recht_nrw_direct.html",
59
+ "paragraph_id": f"hg_direct_{i+1}",
60
+ "url": url
61
+ }
62
+
63
+ doc = Document(
64
+ page_content=clean_html_content(content),
65
+ metadata=metadata
66
+ )
67
+ docs.append(doc)
68
+
69
+ print(f"✅ {len(docs)} Paragraphs direkt von recht.nrw.de geladen.")
70
+ return docs
71
+
72
+ except Exception as e:
73
+ print(f"❌ Fehler beim Laden von recht.nrw.de: {e}")
74
+ return []
75
 
76
+ def _load_hg_paragraph_documents(html_path: str) -> List[Document]:
77
  """
78
+ Lädt Paragraphs aus dem gespeicherten HTML
 
 
 
 
 
 
79
  """
80
+ try:
81
+ with open(html_path, "r", encoding="utf-8") as f:
82
+ html = f.read()
83
+
84
+ soup = BeautifulSoup(html, "html.parser")
85
+ docs = []
86
+
87
+ # Suche nach allen relevanten Inhalten
88
+ for i, p in enumerate(soup.find_all(['p', 'div', 'section'])):
89
+ text = p.get_text(" ", strip=True)
90
+ if not text or len(text) < 10:
91
+ continue
92
+
93
+ # Check if it's a paragraph
94
+ if '§' in text or 'Artikel' in text:
95
+ pid = p.get("id", f"hg_para_{i+1}")
96
+
97
+ metadata = {
98
+ "source": "Hochschulgesetz NRW (HTML)",
99
+ "filename": HTML_FILE,
100
+ "paragraph_id": pid,
101
+ "type": "paragraph"
102
+ }
103
+
104
+ docs.append(Document(
105
+ page_content=clean_html_content(text),
106
+ metadata=metadata
107
+ ))
108
+
109
+ print(f"✅ {len(docs)} Paragraphs aus HTML geladen.")
110
+ return docs
111
+
112
+ except Exception as e:
113
+ print(f"❌ Fehler beim Laden des HTML: {e}")
114
+ return []
 
115
 
116
+ def load_pdf_documents() -> List[Document]:
117
+ """Lädt PDF-Dokumente"""
118
+ print(">>> Lade PDF-Dokumente...")
119
+
120
  try:
121
  pdf_path = hf_hub_download(
122
  repo_id=DATASET,
123
  filename=PDF_FILE,
124
  repo_type="dataset",
125
  )
126
+ print(f"✅ PDF heruntergeladen: {pdf_path}")
127
+
128
+ # Load PDF with PyPDFLoader
 
 
 
 
129
  pdf_docs = PyPDFLoader(pdf_path).load()
130
+
131
+ # Enhance metadata
132
+ for i, doc in enumerate(pdf_docs):
133
+ doc.metadata.update({
134
+ "source": "Prüfungsordnung (PDF)",
135
+ "filename": PDF_FILE,
136
+ "document_type": "exam_regulation",
137
+ "chunk_index": i
138
+ })
139
+
140
+ print(f"✅ {len(pdf_docs)} Seiten aus PDF geladen.")
141
+ return pdf_docs
142
+
143
  except Exception as e:
144
+ print(f"❌ Fehler beim Laden des PDF: {e}")
145
  return []
146
 
147
+ def load_documents() -> List[Document]:
148
+ """
149
+ Hauptfunktion zum Laden aller Dokumente
150
+ """
151
+ print("=== START: load_documents() ===\n")
152
+
153
+ all_docs = []
154
+
155
+ # 1. Load PDF documents
156
+ pdf_docs = load_pdf_documents()
157
+ all_docs.extend(pdf_docs)
158
+
159
+ # 2. Try loading from dataset HTML
160
+ print(">>> Versuche, HTML aus Dataset zu laden...")
161
  try:
162
  html_path = hf_hub_download(
163
  repo_id=DATASET,
164
  filename=HTML_FILE,
165
  repo_type="dataset",
166
  )
167
+ print(f"✅ HTML heruntergeladen: {html_path}")
168
+
 
 
 
 
 
169
  html_docs = _load_hg_paragraph_documents(html_path)
170
+ all_docs.extend(html_docs)
171
+
172
  except Exception as e:
173
+ print(f"⚠️ Konnte HTML nicht aus Dataset laden: {e}")
174
+
175
+ # 3. Fallback: Load directly from website
176
+ print(">>> Fallback: Lade direkt von recht.nrw.de...")
177
+ web_docs = load_recht_nrw_direct()
178
+ all_docs.extend(web_docs)
179
+
180
+ print(f"\n=== DONE: {len(all_docs)} Dokumente geladen ===")
181
+
182
+ # Print summary
183
+ pdf_count = len([d for d in all_docs if "PDF" in d.metadata.get("source", "")])
184
+ html_count = len([d for d in all_docs if "HTML" in d.metadata.get("source", "")])
185
+ web_count = len([d for d in all_docs if "Website" in d.metadata.get("source", "")])
186
+
187
+ print(f"📊 Zusammenfassung:")
188
+ print(f" - PDF-Seiten: {pdf_count}")
189
+ print(f" - HTML-Paragraphs: {html_count}")
190
+ print(f" - Web-Paragraphs: {web_count}")
191
+
192
+ return all_docs
193
 
194
  if __name__ == "__main__":
 
195
  docs = load_documents()
196
+
197
+ if docs:
198
+ print(f"\nErstes Dokument (Beispiel):")
199
+ print(f"Content: {docs[0].page_content[:200]}...")
200
+ print(f"Metadata: {docs[0].metadata}")
rag_pipeline.py CHANGED
@@ -1,194 +1,197 @@
1
  """
2
- RAG PIPELINE – Version 26.11 (ohne Modi, stabil, juristisch korrekt)
3
  """
4
 
5
  from typing import List, Dict, Any, Tuple
6
  from langchain_core.messages import SystemMessage, HumanMessage
7
- from load_documents import DATASET, PDF_FILE, HTML_FILE
 
8
 
9
- # -------------------------------------------------------------------
10
  # URLs für Quellen
11
- # -------------------------------------------------------------------
12
-
13
- # Direktes PDF im Dataset (für #page)
14
- PDF_BASE_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{PDF_FILE}"
15
-
16
- # Hochschulgesetz-HTML im Dataset (enthält <p id="hg_abs_X"> …)
17
- LAW_DATASET_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{HTML_FILE}"
18
-
19
- # Offizielle Recht.NRW-Druckversion (für Viewer im Frontend)
20
- LAW_URL = (
21
- "https://recht.nrw.de/lmi/owa/br_bes_text?"
22
- "print=1&anw_nr=2&gld_nr=2&ugl_nr=221&val=28364&ver=0&"
23
- "aufgehoben=N&keyword=&bes_id=28364&show_preview=1"
24
- )
25
-
26
- MAX_CHARS = 900
27
-
28
- # -----------------------------
29
- # Quellen formatieren
30
- # -----------------------------
31
-
32
- def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
33
  """
34
- Erzeugt eine Liste strukturierter Quellen-Infos:
35
-
36
- [
37
- {
38
- "id": 1,
39
- "source": "Prüfungsordnung (PDF)" / "Hochschulgesetz NRW (HTML)",
40
- "page": 3, # nur bei PDF
41
- "url": "...", # direkter Klick-Link
42
- "snippet": "Erste 300 Zeichen des Chunks..."
43
- },
44
- ...
45
- ]
46
  """
47
- srcs = []
48
- for i, d in enumerate(docs):
49
- meta = d.metadata
50
- src = meta.get("source", "")
51
- page = meta.get("page")
52
- snippet = d.page_content[:300].replace("\n", " ")
53
-
54
- # PDF-Link
55
- if "Prüfungsordnung" in src:
 
 
 
 
 
 
 
56
  if isinstance(page, int):
57
- # PyPDFLoader: page ist 0-basiert, Anzeige 1-basiert
58
  url = f"{PDF_BASE_URL}#page={page + 1}"
59
  else:
60
  url = PDF_BASE_URL
61
-
62
- # NRW-Gesetz (HTML im Dataset mit Absatz-IDs)
63
- elif "Hochschulgesetz" in src:
64
- para_id = meta.get("paragraph_id")
65
  if para_id:
66
- # Klick führt direkt zum Absatz im Dataset-HTML
67
- url = f"{LAW_DATASET_URL}#{para_id}"
68
  else:
69
- # Fallback: offizielle Druckversion (ohne Absatz-Anker)
70
- url = LAW_URL
71
- page = None # keine Seitenangabe für Gesetz-HTML
72
-
73
- else:
74
- url = None
75
-
76
- srcs.append(
77
- {
78
- "id": i + 1,
79
- "source": src,
80
- "page": page + 1 if isinstance(page, int) else None,
81
- "url": url,
82
- "snippet": snippet,
83
- }
84
- )
85
- return srcs
86
-
87
- # -----------------------------
88
- # Kontext formatieren
89
- # -----------------------------
90
-
91
- def format_context(docs):
92
  if not docs:
93
- return "(Kein relevanter Kontext im Dokument gefunden.)"
94
-
95
- out = []
96
- for i, d in enumerate(docs):
97
- txt = d.page_content[:MAX_CHARS]
98
- src = d.metadata.get("source")
99
- page = d.metadata.get("page")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- if "Prüfungsordnung" in (src or "") and isinstance(page, int):
102
- src_str = f"{src}, Seite {page + 1}"
103
- else:
104
- src_str = src
105
 
106
- out.append(f"[KONTEXT {i+1}] ({src_str})\n{txt}")
107
 
108
- return "\n\n".join(out)
 
 
 
109
 
110
- # -----------------------------
111
- # Systemprompt verschärft
112
- # -----------------------------
 
113
 
114
- SYSTEM_PROMPT = """
115
- Du bist ein hochpräziser juristischer Chatbot für Prüfungsrecht
116
- mit Zugriff nur auf:
 
117
 
118
- - die Prüfungsordnung (als PDF) und
119
- - das Hochschulgesetz NRW (als HTML aus der offiziellen Druckversion).
 
 
120
 
121
- Strenge Regeln:
 
 
122
 
123
- 1. Antworte ausschließlich anhand des bereitgestellten Kontextes
124
- (KONTEXT-Abschnitte). Wenn die Information nicht im Kontext steht,
125
- sage ausdrücklich, dass dies aus den vorliegenden Dokumenten nicht
126
- hervorgeht und du dazu nichts Sicheres sagen kannst.
127
 
128
- 2.
129
- Keine Spekulationen, keine Vermutungen.
130
 
131
- 3. Antworte in zusammenhängenden, ganzen Sätzen. Verwende keine Mischung aus Deutsch und Englisch.
 
 
 
 
 
132
 
133
- 4. Nenne, soweit aus dem Kontext erkennbar,
134
- - die rechtliche Grundlage (z.B. Paragraph, Artikel),
135
- - das Dokument (Prüfungsordnung / Hochschulgesetz NRW),
136
- - die Seite (bei der Prüfungsordnung), wenn im Kontext vorhanden.
137
 
138
- 5. Füge KEINE externen Informationen hinzu, z.B. aus anderen Gesetzen,
139
- Webseiten oder allgemeinem Wissen. Nur das, was im Kontext steht,
140
- darf in der Antwort verwendet werden.
141
 
142
- Wenn der Kontext keine eindeutige Antwort zulässt, erkläre klar,
143
- warum keine sichere Antwort möglich ist und welche Informationen
144
- im Dokument fehlen.
145
- """
 
 
 
 
146
 
147
- # -----------------------------
148
- # Hauptfunktion
149
- # -----------------------------
150
 
151
  def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
152
  """
153
- Haupt-RAG-Funktion:
154
-
155
- - ruft retriever.invoke(question) auf,
156
- - baut einen präzisen Prompt mit KONTEXT,
157
- - ruft LLM auf,
158
- - gibt Antworttext + Quellenliste zurück.
159
  """
160
- # 1. Dokumente holen
161
  docs = retriever.invoke(question)
 
 
162
  context_str = format_context(docs)
163
-
164
- # 2. Prompt bauen
165
- human = f"""
166
- FRAGE:
167
- {question}
168
-
169
- NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
170
- {context_str}
171
-
172
- AUFGABE:
173
- Formuliere eine juristisch korrekte, gut verständliche Antwort
174
- ausschließlich anhand des obigen Kontextes.
175
-
176
- - Wenn der Kontext aus den Dokumenten eine klare Antwort erlaubt,
177
- erläutere diese strukturiert und in vollständigen Sätzen.
178
- - Wenn der Kontext KEINE klare Antwort erlaubt oder wichtige Informationen
179
- fehlen, erkläre das offen und formuliere KEINE Vermutung.
180
- """
181
-
182
- msgs = [
183
  SystemMessage(content=SYSTEM_PROMPT),
184
- HumanMessage(content=human),
185
  ]
186
-
187
- # 3. LLM aufrufen
188
- result = chat_model.invoke(msgs)
189
- answer_text = result.content.strip()
190
-
191
- # 4. Quellenliste bauen
 
 
 
 
 
 
 
 
192
  sources = build_sources_metadata(docs)
193
-
194
- return answer_text, sources
 
1
  """
2
+ RAG PIPELINE – Verbesserte Version mit präzisen Prompts
3
  """
4
 
5
  from typing import List, Dict, Any, Tuple
6
  from langchain_core.messages import SystemMessage, HumanMessage
7
+ from langchain_core.documents import Document
8
+ import re
9
 
 
10
  # URLs für Quellen
11
+ PDF_BASE_URL = "https://huggingface.co/datasets/Nguyen5/docs/resolve/main/f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
12
+ LAW_VIEWER_URL = "https://YOUR_SUPABASE_URL/storage/v1/object/public/hg_viewer/hg_viewer.html"
13
+
14
+ MAX_CHARS = 1000
15
+
16
+ def format_chunk_content(chunk: Document) -> str:
17
+ """Format chunk content for better readability"""
18
+ content = chunk.page_content
19
+
20
+ # Remove excessive whitespace
21
+ content = re.sub(r'\s+', ' ', content)
22
+
23
+ # Ensure proper sentence endings
24
+ if not content.strip().endswith(('.', '!', '?')):
25
+ content = content.strip() + '.'
26
+
27
+ return content[:MAX_CHARS]
28
+
29
+ def build_sources_metadata(docs: List[Document]) -> List[Dict[str, Any]]:
 
 
 
30
  """
31
+ Erzeugt strukturierte Quellen-Informationen
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
+ sources = []
34
+
35
+ for i, doc in enumerate(docs, 1):
36
+ metadata = doc.metadata
37
+ source_type = metadata.get("source", "")
38
+ page = metadata.get("page")
39
+ para_id = metadata.get("paragraph_id", "")
40
+
41
+ # Prepare snippet
42
+ snippet = format_chunk_content(doc)
43
+ if len(snippet) > 300:
44
+ snippet = snippet[:297] + "..."
45
+
46
+ # Determine URL
47
+ url = None
48
+ if "PDF" in source_type:
49
  if isinstance(page, int):
 
50
  url = f"{PDF_BASE_URL}#page={page + 1}"
51
  else:
52
  url = PDF_BASE_URL
53
+
54
+ elif "HTML" in source_type or "Website" in source_type:
 
 
55
  if para_id:
56
+ url = f"{LAW_VIEWER_URL}#{para_id}"
 
57
  else:
58
+ url = LAW_VIEWER_URL
59
+
60
+ # Build source info
61
+ source_info = {
62
+ "id": i,
63
+ "source": source_type,
64
+ "page": page + 1 if isinstance(page, int) else None,
65
+ "paragraph_id": para_id,
66
+ "url": url,
67
+ "snippet": snippet,
68
+ "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
69
+ }
70
+ sources.append(source_info)
71
+
72
+ return sources
73
+
74
+ def format_context(docs: List[Document]) -> str:
75
+ """
76
+ Formatiert den Kontext für den Prompt
77
+ """
 
 
 
78
  if not docs:
79
+ return "KEIN_RELEVANTER_KONTEXT_GEFUNDEN"
80
+
81
+ context_parts = []
82
+
83
+ for i, doc in enumerate(docs, 1):
84
+ content = format_chunk_content(doc)
85
+ metadata = doc.metadata
86
+
87
+ # Build source description
88
+ source_desc = metadata.get("source", "Unbekannte Quelle")
89
+ if "page" in metadata and metadata["page"] is not None:
90
+ source_desc += f", Seite {metadata['page'] + 1}"
91
+ if "paragraph_id" in metadata:
92
+ source_desc += f", {metadata['paragraph_id']}"
93
+
94
+ context_parts.append(f"【Quelle {i}】{source_desc}\n{content}")
95
+
96
+ return "\n\n".join(context_parts)
97
+
98
+ # ========== IMPROVED SYSTEM PROMPT ==========
99
+ SYSTEM_PROMPT = """
100
+ Du bist ein hochpräziser juristischer Assistenz-Chatbot für Prüfungsrecht an Hochschulen in Nordrhein-Westfalen.
101
 
102
+ Deine Wissensbasis umfasst ausschließlich:
103
+ 1. Die spezifische Prüfungsordnung (PDF-Dokument)
104
+ 2. Das Hochschulgesetz NRW (Hochschulgesetz - HG)
 
105
 
106
+ STRENGE ANWEISUNGEN:
107
 
108
+ 1. **AUSSCHLIESSLICHE KONTEXTNUTZUNG:**
109
+ - Verwende NUR die bereitgestellten Quellen aus der Wissensbasis.
110
+ - Wenn Informationen nicht im Kontext stehen, sage explizit: "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten."
111
+ - KEINE Vermutungen, Spekulationen oder externes Wissen.
112
 
113
+ 2. **PRÄZISE JURISTISCHE ANTWORTEN:**
114
+ - Formuliere in vollständigen, grammatikalisch korrekten Sätzen.
115
+ - Verwende präzise juristische Sprache, aber bleibe verständlich.
116
+ - Strukturiere komplexe Antworten mit Absätzen oder Aufzählungen.
117
 
118
+ 3. **QUELLENNACHWEISE:**
119
+ - Verweise immer auf die konkrete Quelle (Prüfungsordnung §X oder Hochschulgesetz §Y).
120
+ - Bei der Prüfungsordnung gib die Seite an.
121
+ - Beim Hochschulgesetz verweise auf den Paragraphen.
122
 
123
+ 4. **ANTWORTSTRUKTUR:**
124
+ a) Kurze präzise Antwort zuerst
125
+ b) Detaillierte Erklärung mit Quellenangaben
126
+ c) Falls relevant: praktische Hinweise basierend auf dem Kontext
127
 
128
+ 5. **FEHLENDE INFORMATIONEN:**
129
+ - Wenn der Kontext unvollständig ist, erkläre, welche Informationen fehlen.
130
+ - Biete an, nur die vorhandenen Informationen zusammenzufassen.
131
 
132
+ 6. **SPRACHE:**
133
+ - Verwende ausschließlich formelles Deutsch.
134
+ - Vermeide Umgangssprache und Abkürzungen.
 
135
 
136
+ Deine Antworten müssen rechtlich korrekt, vollständig und nachprüfbar sein.
137
+ """
138
 
139
+ def create_human_prompt(question: str, context: str) -> str:
140
+ """
141
+ Erstellt optimierten Human Prompt
142
+ """
143
+ return f"""FRAGE DES NUTZERS:
144
+ {question}
145
 
146
+ VERFÜGBARE RECHTSQUELLEN:
147
+ {context if context else "KEINE RELEVANTEN QUELLEN GEFUNDEN"}
 
 
148
 
149
+ AUFGABE:
150
+ Beantworte die Frage ausschließlich auf Basis der oben genannten Rechtsquellen.
 
151
 
152
+ ANFORDERUNGEN:
153
+ 1. Gib eine präzise juristische Antwort in vollständigen Sätzen.
154
+ 2. Zitiere konkret:
155
+ - Für die Prüfungsordnung: "Laut Prüfungsordnung, §X auf Seite Y, ..."
156
+ - Für das Hochschulgesetz: "Gemäß Hochschulgesetz NRW §Z, ..."
157
+ 3. Wenn mehrere Quellen relevant sind, erwähne alle.
158
+ 4. Wenn Informationen fehlen, erkläre dies klar.
159
+ 5. Strukturiere die Antwort logisch.
160
 
161
+ ANTWORT (auf Deutsch):"""
 
 
162
 
163
  def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
164
  """
165
+ Haupt-RAG-Funktion mit verbessertem Prompting
 
 
 
 
 
166
  """
167
+ # 1. Retrieve relevant documents
168
  docs = retriever.invoke(question)
169
+
170
+ # 2. Format context
171
  context_str = format_context(docs)
172
+
173
+ # 3. Create prompt
174
+ human_prompt = create_human_prompt(question, context_str)
175
+
176
+ # 4. Call LLM
177
+ messages = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  SystemMessage(content=SYSTEM_PROMPT),
179
+ HumanMessage(content=human_prompt)
180
  ]
181
+
182
+ try:
183
+ result = chat_model.invoke(messages)
184
+ answer_text = result.content.strip()
185
+
186
+ # Clean up answer
187
+ answer_text = re.sub(r'\n\s*\n+', '\n\n', answer_text) # Remove excessive newlines
188
+ answer_text = answer_text.replace("KEINE RELEVANTEN QUELLEN GEFUNDEN",
189
+ "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten.")
190
+
191
+ except Exception as e:
192
+ answer_text = f"Fehler bei der Generierung der Antwort: {str(e)}"
193
+
194
+ # 5. Build sources metadata
195
  sources = build_sources_metadata(docs)
196
+
197
+ return answer_text, sources
requirements.txt CHANGED
@@ -13,6 +13,7 @@ langchain
13
  langchain-community
14
  langchain-text-splitters
15
  langchain-openai
 
16
 
17
  # === VectorStore ===
18
  faiss-cpu
@@ -21,6 +22,7 @@ faiss-cpu
21
  pypdf
22
  requests
23
  beautifulsoup4
 
24
 
25
  # === Audio (STT/TTS local) ===
26
  transformers
@@ -29,6 +31,7 @@ soundfile
29
  scipy
30
  numpy
31
  torchaudio
 
32
 
33
  # OpenAI offizielle Bibliothek (kommt i.d.R. mit langchain-openai, zur Sicherheit explizit)
34
  openai
 
13
  langchain-community
14
  langchain-text-splitters
15
  langchain-openai
16
+ huggingface-hub
17
 
18
  # === VectorStore ===
19
  faiss-cpu
 
22
  pypdf
23
  requests
24
  beautifulsoup4
25
+ lxml
26
 
27
  # === Audio (STT/TTS local) ===
28
  transformers
 
31
  scipy
32
  numpy
33
  torchaudio
34
+ torch
35
 
36
  # OpenAI offizielle Bibliothek (kommt i.d.R. mit langchain-openai, zur Sicherheit explizit)
37
  openai
upload_weblink_to_supabase.py CHANGED
@@ -1,8 +1,14 @@
 
 
 
 
1
  import os
2
  import requests
 
3
  from bs4 import BeautifulSoup
4
  from supabase import create_client
5
  from dotenv import load_dotenv
 
6
 
7
  load_dotenv()
8
 
@@ -13,64 +19,116 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
13
 
14
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
15
 
 
 
 
 
 
 
 
 
 
 
 
16
  def extract_paragraphs():
17
  print(">>> Lade Hochschulgesetz NRW …")
18
 
19
- html = requests.get(LAW_URL, timeout=30).text
 
 
 
 
 
 
 
 
 
 
 
20
  soup = BeautifulSoup(html, "html.parser")
21
 
22
- # Tất cả tiêu đề Paragraph xuất hiện trong <h2> hoặc <h3>
23
- headers = soup.find_all(["h2", "h3"])
24
-
25
  paragraphs = []
26
  order = 1
27
 
28
- for header in headers:
29
- title = header.get_text(" ", strip=True)
30
-
31
- if not title.startswith("§"):
32
- continue # bỏ các h2/h3 không phải Paragraph
33
-
34
- # Gom toàn bộ nội dung từ header đến trước h2/h3 tiếp theo
35
- content_parts = []
36
- sibling = header.find_next_sibling()
37
-
38
- while sibling and sibling.name not in ["h2", "h3"]:
39
- text = sibling.get_text(" ", strip=True)
40
- if text:
41
- content_parts.append(text)
42
- sibling = sibling.find_next_sibling()
43
-
44
- full_content = "\n".join(content_parts).strip()
45
-
46
- para_id = f"para_{order}"
47
-
48
- paragraphs.append({
49
- "abs_id": para_id,
50
- "title": title,
51
- "content": full_content,
52
- "order_index": order
53
- })
54
-
55
- order += 1
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
 
 
 
 
 
 
 
58
  return paragraphs
59
 
60
  def upload_to_supabase():
61
  paras = extract_paragraphs()
62
 
63
- print(">>> Clear table hg_nrw …")
64
- supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
65
-
66
- print(">>> Upload begin …")
67
- BATCH = 100
68
- for i in range(0, len(paras), BATCH):
69
- batch = paras[i:i+BATCH]
70
- print(f" - Upload batch {i} – {i+len(batch)-1}")
71
- supabase.table("hg_nrw").upsert(batch).execute()
72
 
73
- print(" DONE uploading complete NRW law.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  if __name__ == "__main__":
76
- upload_to_supabase()
 
1
+ """
2
+ upload_weblink_to_supabase.py
3
+ Trích xuất và tải lên các paragraph từ trang web recht.nrw.de
4
+ """
5
  import os
6
  import requests
7
+ import re
8
  from bs4 import BeautifulSoup
9
  from supabase import create_client
10
  from dotenv import load_dotenv
11
+ import time
12
 
13
  load_dotenv()
14
 
 
19
 
20
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
21
 
22
+ def clean_text(text):
23
+ """Làm sạch và định dạng văn bản"""
24
+ # Loại bỏ khoảng trắng thừa
25
+ text = re.sub(r'\s+', ' ', text)
26
+ # Chuẩn hóa dấu câu
27
+ text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
28
+ # Đảm bảo chữ cái đầu câu viết hoa
29
+ sentences = text.split('. ')
30
+ sentences = [s.strip().capitalize() for s in sentences if s.strip()]
31
+ return '. '.join(sentences)
32
+
33
  def extract_paragraphs():
34
  print(">>> Lade Hochschulgesetz NRW …")
35
 
36
+ headers = {
37
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
38
+ }
39
+
40
+ try:
41
+ response = requests.get(LAW_URL, headers=headers, timeout=60)
42
+ response.raise_for_status()
43
+ except requests.RequestException as e:
44
+ print(f"❌ Fehler beim Laden der Seite: {e}")
45
+ return []
46
+
47
+ html = response.text
48
  soup = BeautifulSoup(html, "html.parser")
49
 
50
+ # Tìm tất cả các section chứa paragraph
 
 
51
  paragraphs = []
52
  order = 1
53
 
54
+ # Tìm các phần có chứa § (paragraph symbol)
55
+ pattern = re.compile(r'§\s*\d+')
56
+
57
+ # Tìm tất cả các element chứa paragraph
58
+ for element in soup.find_all(['p', 'div', 'td']):
59
+ text = element.get_text(" ", strip=True)
60
+
61
+ # Kiểm tra nếu có paragraph symbol
62
+ if pattern.search(text):
63
+ # Tách title và content
64
+ lines = text.split('\n')
65
+ title = lines[0].strip() if lines else ""
66
+
67
+ # Lấy nội dung
68
+ content = ""
69
+ if len(lines) > 1:
70
+ content = clean_text(" ".join(lines[1:]))
71
+
72
+ # Nếu title chưa có §, thêm từ nội dung
73
+ if '§' not in title and content:
74
+ # Tìm § trong content để thêm vào title
75
+ match = pattern.search(content)
76
+ if match:
77
+ title = match.group()
78
+ # Xóa title khỏi content
79
+ content = content.replace(title, "", 1).strip()
80
+
81
+ # Tạo ID cho paragraph
82
+ para_id = f"para_{order}"
83
+
84
+ paragraphs.append({
85
+ "abs_id": para_id,
86
+ "title": title if title else f"§ {order}",
87
+ "content": content if content else text,
88
+ "order_index": order,
89
+ "source_url": LAW_URL
90
+ })
91
+
92
+ order += 1
93
 
94
  print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
95
+
96
+ # In ra mẫu để kiểm tra
97
+ if paragraphs:
98
+ print("\nBeispiel Paragraph 1:")
99
+ print(f"Title: {paragraphs[0]['title']}")
100
+ print(f"Content (Auszug): {paragraphs[0]['content'][:200]}...\n")
101
+
102
  return paragraphs
103
 
104
  def upload_to_supabase():
105
  paras = extract_paragraphs()
106
 
107
+ if not paras:
108
+ print("❌ Keine Paragraphs gefunden. Upload abgebrochen.")
109
+ return
 
 
 
 
 
 
110
 
111
+ print(">>> Clear table hg_nrw ")
112
+ try:
113
+ # Xóa toàn bộ dữ liệu cũ
114
+ supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
115
+ print("✔ Tabelle geleert.")
116
+ except Exception as e:
117
+ print(f"⚠️ Fehler beim Leeren der Tabelle: {e}")
118
+
119
+ print(">>> Upload beginnt …")
120
+ BATCH_SIZE = 50
121
+
122
+ for i in range(0, len(paras), BATCH_SIZE):
123
+ batch = paras[i:i+BATCH_SIZE]
124
+ try:
125
+ result = supabase.table("hg_nrw").upsert(batch).execute()
126
+ print(f"✔ Batch {i//BATCH_SIZE + 1} hochgeladen ({len(batch)} Einträge)")
127
+ time.sleep(0.1) # Tránh rate limiting
128
+ except Exception as e:
129
+ print(f"❌ Fehler beim Upload von Batch {i//BATCH_SIZE + 1}: {e}")
130
+
131
+ print(f"✔ DONE - {len(paras)} Paragraphs erfolgreich hochgeladen.")
132
 
133
  if __name__ == "__main__":
134
+ upload_to_supabase()