Nguyen5 commited on
Commit
cae6054
·
1 Parent(s): d3ea7b2
Files changed (4) hide show
  1. app.py +150 -245
  2. build_hg_viewer.py +272 -677
  3. load_documents.py +95 -165
  4. rag_pipeline.py +155 -158
app.py CHANGED
@@ -1,307 +1,212 @@
1
- """
2
- app.pyAktualisierte Version mit verbessertem Viewer
3
- """
4
 
5
  import gradio as gr
6
  from gradio_pdf import PDF
7
  from huggingface_hub import hf_hub_download
8
- import os
9
 
10
- from load_documents import load_documents, DATASET, PDF_FILE
11
  from split_documents import split_documents
12
  from vectorstore import build_vectorstore
13
  from retriever import get_retriever
14
  from llm import load_llm
15
- from rag_pipeline import answer
16
 
17
  from speech_io import transcribe_audio, synthesize_speech
18
 
19
  # =====================================================
20
- # KONFIGURATION
21
  # =====================================================
22
 
23
- # Viewer URL (ersetze mit deiner Supabase URL)
24
- SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://your-project.supabase.co")
25
- LAW_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html"
26
-
27
- # =====================================================
28
- # INITIALISIERUNG
29
- # =====================================================
30
-
31
- print("🔹 Initialisiere System...")
32
- print("1. Lade Dokumente...")
33
  _docs = load_documents()
34
 
35
- print("2. Splitte Dokumente...")
36
  _chunks = split_documents(_docs)
37
 
38
- print("3. Baue VectorStore...")
39
  _vs = build_vectorstore(_chunks)
40
 
41
- print("4. Erzeuge Retriever...")
42
  _retriever = get_retriever(_vs)
43
 
44
- print("5. Lade LLM...")
45
  _llm = load_llm()
46
 
47
- print("6. Lade Dateien für Viewer...")
48
- try:
49
- _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
50
- print(f"✅ PDF geladen: {_pdf_path}")
51
- except Exception as e:
52
- print(f"⚠️ PDF konnte nicht geladen werden: {e}")
53
- _pdf_path = None
54
-
55
- print("✅ System initialisiert!")
56
 
57
  # =====================================================
58
- # HELPER FUNCTIONS
59
  # =====================================================
60
 
61
  def format_sources_markdown(sources):
62
- """Formatiere Quellen als Markdown"""
63
  if not sources:
64
  return ""
65
-
66
- lines = ["", "**📚 Quellenverweise:**", ""]
67
-
68
  for s in sources:
69
- source_type = s["source"]
 
70
  page = s["page"]
71
- para_id = s.get("paragraph_id", "")
72
  url = s["url"]
73
- snippet = s.get("snippet", "")
74
-
75
- # Build source line
 
76
  if url:
77
- if "PDF" in source_type:
78
- source_text = f"[{source_type}"
79
- if page:
80
- source_text += f", Seite {page}"
81
- source_text += f"]({url})"
82
- else:
83
- display_name = para_id if para_id else "Hochschulgesetz NRW"
84
- source_text = f"[{display_name}]({url})"
85
  else:
86
- source_text = source_type
87
-
88
- lines.append(f"- {source_text}")
89
-
 
 
 
90
  if snippet:
91
- lines.append(f" > *{snippet}*")
92
-
93
  return "\n".join(lines)
94
 
95
  # =====================================================
96
- # CHATBOT FUNCTIONS
97
  # =====================================================
98
 
99
  def chatbot_text(user_message, history):
100
- """Text-Chatbot Funktion"""
101
- if not user_message.strip():
102
- return history, ""
103
-
104
- try:
105
- # Get answer from RAG pipeline
106
- answer_text, sources = answer(
107
- question=user_message,
108
- retriever=_retriever,
109
- chat_model=_llm
110
- )
111
-
112
- # Add sources
113
- sources_text = format_sources_markdown(sources)
114
- full_response = f"{answer_text}\n\n{sources_text}"
115
-
116
- # Update history
117
- history.append({"role": "user", "content": user_message})
118
- history.append({"role": "assistant", "content": full_response})
119
-
120
- return history, ""
121
-
122
- except Exception as e:
123
- error_msg = f"Fehler bei der Verarbeitung: {str(e)}"
124
- history.append({"role": "user", "content": user_message})
125
- history.append({"role": "assistant", "content": error_msg})
126
  return history, ""
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  def chatbot_voice(audio_path, history):
129
- """Voice-Chatbot Funktion"""
130
- if not audio_path:
131
- return history, None, ""
132
-
133
- # Transcribe audio
134
  text = transcribe_audio(audio_path)
135
  if not text:
136
- return history, None, "Keine Sprache erkannt"
137
-
138
- # Process with text chatbot
139
- history, _ = chatbot_text(text, history)
140
-
141
- # Get last response for TTS
142
- last_response = None
143
- for msg in reversed(history):
144
- if msg["role"] == "assistant":
145
- last_response = msg["content"]
146
- break
147
-
148
- # Generate audio
149
- audio_output = None
150
- if last_response:
151
- audio_output = synthesize_speech(last_response.split("\n\n")[0]) # Nur erste Teil für TTS
152
-
153
- return history, audio_output, text
 
 
 
 
 
 
154
 
155
  def read_last_answer(history):
156
- """Lese letzte Antwort vor"""
157
  if not history:
158
  return None
159
-
160
  for msg in reversed(history):
161
  if msg["role"] == "assistant":
162
- return synthesize_speech(msg["content"].split("\n\n")[0])
163
-
164
  return None
165
 
166
  # =====================================================
167
- # GRADIO UI
168
  # =====================================================
169
 
170
- def create_ui():
171
- """Erstelle die Gradio Benutzeroberfläche"""
172
-
173
- with gr.Blocks(
174
- title="Prüfungsrechts-Chatbot NRW",
175
- ) as demo:
176
-
177
- # Header
178
- gr.Markdown("""
179
- # 🧑‍⚖️ Prüfungsrechts-Chatbot für NRW Hochschulen
180
-
181
- Dieser Chatbot beantwortet Fragen basierend auf:
182
- - **Prüfungsordnung** (offizielles PDF)
183
- - **Hochschulgesetz NRW** (aktuelle Fassung von recht.nrw.de)
184
-
185
- Fragen können per Text oder Spracheingabe gestellt werden.
186
- """)
187
-
188
- with gr.Row():
189
- # Left Column - Chat
190
- with gr.Column(scale=2):
191
- chatbot = gr.Chatbot(
192
- label="Chat",
193
- height=500
194
- )
195
-
196
- with gr.Row():
197
- msg = gr.Textbox(
198
- label="Frage eingeben",
199
- placeholder="Stellen Sie Ihre Frage zum Prüfungsrecht...",
200
- scale=4,
201
- container=False
202
- )
203
- send_btn = gr.Button("Senden", variant="primary", scale=1)
204
-
205
- # Voice Input
206
- with gr.Accordion("🎤 Spracheingabe", open=False):
207
- with gr.Row():
208
- voice_in = gr.Audio(
209
- sources=["microphone"],
210
- type="filepath",
211
- label="Aufnahme",
212
- scale=3
213
- )
214
- voice_btn = gr.Button("Sprechen & senden", scale=1)
215
-
216
- voice_out = gr.Audio(
217
- label="Antwort als Audio",
218
- type="numpy",
219
- visible=True
220
- )
221
-
222
- # Controls
223
- with gr.Row():
224
- read_btn = gr.Button("🔊 Antwort vorlesen")
225
- clear_btn = gr.Button("🗑️ Chat leeren", variant="secondary")
226
-
227
- # Right Column - Viewer
228
- with gr.Column(scale=1):
229
- # PDF Viewer
230
- gr.Markdown("### 📄 Prüfungsordnung")
231
- if _pdf_path:
232
- pdf_viewer = PDF(_pdf_path, height=350, label="PDF Viewer")
233
- else:
234
- gr.Markdown("⚠️ PDF konnte nicht geladen werden")
235
-
236
- # Law Viewer
237
- gr.Markdown("### 📘 Hochschulgesetz NRW")
238
- gr.HTML(f"""
239
- <iframe
240
- src="{LAW_VIEWER_URL}"
241
- style="width:100%; height:400px; border:none; border-radius:10px;"
242
- title="Hochschulgesetz NRW Viewer"
243
- ></iframe>
244
- """)
245
-
246
- # Event Handlers
247
- # Text input
248
- msg.submit(
249
- chatbot_text,
250
- [msg, chatbot],
251
- [chatbot, msg]
252
- )
253
-
254
- send_btn.click(
255
- chatbot_text,
256
- [msg, chatbot],
257
- [chatbot, msg]
258
- )
259
-
260
- # Voice input
261
- voice_btn.click(
262
- chatbot_voice,
263
- [voice_in, chatbot],
264
- [chatbot, voice_out, msg]
265
- )
266
-
267
- # Controls
268
- read_btn.click(
269
- read_last_answer,
270
- [chatbot],
271
- [voice_out]
272
- )
273
-
274
- clear_btn.click(
275
- lambda: [],
276
- None,
277
- [chatbot]
278
- )
279
-
280
- # Instructions
281
- gr.Markdown("""
282
- ### ℹ️ Nutzungshinweise
283
-
284
- 1. **Präzise Fragen** stellen für bessere Antworten
285
- 2. **Quellen** werden automatisch verlinkt
286
- 3. **Klicken Sie auf Links** im Chat, um direkt zur Quelle zu springen
287
- 4. **Spracheingabe** für hands-free Nutzung
288
-
289
- ### ⚠️ Hinweis
290
- Dies ist ein Assistenzsystem. Für verbindliche rechtliche Auskünfte wenden Sie sich bitte an die zuständigen Prüfungsämter.
291
- """)
292
-
293
- return demo
294
-
295
- # =====================================================
296
- # MAIN
297
- # =====================================================
298
 
299
  if __name__ == "__main__":
300
- demo = create_ui()
301
-
302
- # Konfiguration für HuggingFace Spaces
303
- demo.queue(
304
- max_size=20,
305
- api_open=False
306
- ).launch(
307
- )
 
1
+ # app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
2
+ # Version 26.11ohne Modi, stabil für Text + Voice
 
3
 
4
  import gradio as gr
5
  from gradio_pdf import PDF
6
  from huggingface_hub import hf_hub_download
 
7
 
8
+ from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
9
  from split_documents import split_documents
10
  from vectorstore import build_vectorstore
11
  from retriever import get_retriever
12
  from llm import load_llm
13
+ from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
14
 
15
  from speech_io import transcribe_audio, synthesize_speech
16
 
17
  # =====================================================
18
+ # INITIALISIERUNG (global)
19
  # =====================================================
20
 
21
+ print("🔹 Lade Dokumente ...")
 
 
 
 
 
 
 
 
 
22
  _docs = load_documents()
23
 
24
+ print("🔹 Splitte Dokumente ...")
25
  _chunks = split_documents(_docs)
26
 
27
+ print("🔹 Baue VectorStore (FAISS) ...")
28
  _vs = build_vectorstore(_chunks)
29
 
30
+ print("🔹 Erzeuge Retriever ...")
31
  _retriever = get_retriever(_vs)
32
 
33
+ print("🔹 Lade LLM ...")
34
  _llm = load_llm()
35
 
36
+ print("🔹 Lade Dateien für Viewer")
37
+ _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
38
+ _html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
 
 
 
 
 
 
39
 
40
  # =====================================================
41
+ # Quellen formatieren – Markdown für Chat
42
  # =====================================================
43
 
44
  def format_sources_markdown(sources):
 
45
  if not sources:
46
  return ""
47
+
48
+ lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
 
49
  for s in sources:
50
+ sid = s["id"]
51
+ src = s["source"]
52
  page = s["page"]
 
53
  url = s["url"]
54
+ snippet = s["snippet"]
55
+
56
+ title = f"Quelle {sid} – {src}"
57
+
58
  if url:
59
+ base = f"- [{title}]({url})"
 
 
 
 
 
 
 
60
  else:
61
+ base = f"- {title}"
62
+
63
+ if page and "Prüfungsordnung" in src:
64
+ base += f", Seite {page}"
65
+
66
+ lines.append(base)
67
+
68
  if snippet:
69
+ lines.append(f" > {snippet}")
70
+
71
  return "\n".join(lines)
72
 
73
  # =====================================================
74
+ # TEXT CHATBOT
75
  # =====================================================
76
 
77
  def chatbot_text(user_message, history):
78
+ if not user_message:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  return history, ""
80
 
81
+ answer_text, sources = answer(
82
+ question=user_message,
83
+ retriever=_retriever,
84
+ chat_model=_llm,
85
+ )
86
+
87
+ quellen_block = format_sources_markdown(sources)
88
+
89
+ history = history + [
90
+ {"role": "user", "content": user_message},
91
+ {"role": "assistant", "content": answer_text + quellen_block},
92
+ ]
93
+
94
+ return history, ""
95
+
96
+ # =====================================================
97
+ # VOICE CHATBOT
98
+ # =====================================================
99
+
100
  def chatbot_voice(audio_path, history):
101
+ # 1. Speech → Text
 
 
 
 
102
  text = transcribe_audio(audio_path)
103
  if not text:
104
+ return history, None, ""
105
+
106
+ # Lưu vào lịch sử chat
107
+ history = history + [{"role": "user", "content": text}]
108
+
109
+ # 2. RAG trả lời
110
+ answer_text, sources = answer(
111
+ question=text,
112
+ retriever=_retriever,
113
+ chat_model=_llm,
114
+ )
115
+ quellen_block = format_sources_markdown(sources)
116
+
117
+ bot_msg = answer_text + quellen_block
118
+ history = history + [{"role": "assistant", "content": bot_msg}]
119
+
120
+ # 3. Text → Speech
121
+ audio = synthesize_speech(bot_msg)
122
+
123
+ return history, audio, ""
124
+
125
+ # =====================================================
126
+ # LAST ANSWER → TTS
127
+ # =====================================================
128
 
129
  def read_last_answer(history):
 
130
  if not history:
131
  return None
132
+
133
  for msg in reversed(history):
134
  if msg["role"] == "assistant":
135
+ return synthesize_speech(msg["content"])
136
+
137
  return None
138
 
139
  # =====================================================
140
+ # UI – GRADIO
141
  # =====================================================
142
 
143
+ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
144
+ gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
145
+ gr.Markdown(
146
+ "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
147
+ "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
148
+ "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
149
+ )
150
+
151
+ with gr.Row():
152
+ with gr.Column(scale=2):
153
+ chatbot = gr.Chatbot(label="Chat", height=500)
154
+
155
+ msg = gr.Textbox(
156
+ label="Frage eingeben",
157
+ placeholder="Stelle deine Frage zum Prüfungsrecht …",
158
+ )
159
+
160
+ # TEXT SENDEN
161
+ msg.submit(
162
+ chatbot_text,
163
+ [msg, chatbot],
164
+ [chatbot, msg]
165
+ )
166
+
167
+ send_btn = gr.Button("Senden (Text)")
168
+ send_btn.click(
169
+ chatbot_text,
170
+ [msg, chatbot],
171
+ [chatbot, msg]
172
+ )
173
+
174
+ # SPRACHEINGABE
175
+ gr.Markdown("### 🎙️ Spracheingabe")
176
+ voice_in = gr.Audio(sources=["microphone"], type="filepath")
177
+ voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
178
+
179
+ voice_btn = gr.Button("Sprechen & senden")
180
+ voice_btn.click(
181
+ chatbot_voice,
182
+ [voice_in, chatbot],
183
+ [chatbot, voice_out, msg]
184
+ )
185
+
186
+ read_btn = gr.Button("🔁 Antwort erneut vorlesen")
187
+ read_btn.click(
188
+ read_last_answer,
189
+ [chatbot],
190
+ [voice_out]
191
+ )
192
+
193
+ clear_btn = gr.Button("Chat zurücksetzen")
194
+ clear_btn.click(lambda: [], None, chatbot)
195
+
196
+ # =====================
197
+ # RECHTE SPALTE: Viewer
198
+ # =====================
199
+
200
+ with gr.Column(scale=1):
201
+ gr.Markdown("### 📄 Prüfungsordnung (PDF)")
202
+ PDF(_pdf_path, height=350)
203
+
204
+ gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
205
+ gr.HTML(
206
+ f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
207
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
  if __name__ == "__main__":
210
+ demo.queue().launch(ssr_mode=False, show_error=True)
211
+
212
+
 
 
 
 
 
build_hg_viewer.py CHANGED
@@ -1,12 +1,7 @@
1
- """
2
- build_hg_viewer.py
3
- Tạo HTML viewer cho Hochschulgesetz NRW với định dạng chuyên nghiệp
4
- """
5
  import os
6
- import json
7
  from supabase import create_client
8
  from dotenv import load_dotenv
9
- import re
10
 
11
  load_dotenv()
12
 
@@ -18,701 +13,301 @@ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE:
18
 
19
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
20
 
21
- def get_paragraphs_from_supabase():
22
- """Lấy paragraphs từ Supabase"""
23
- print(">>> Lade Paragraphs aus Supabase...")
24
- response = supabase.table("hg_nrw").select("*").order("order_index").execute()
25
- paragraphs = response.data
26
-
27
- if not paragraphs:
28
- print("❌ Keine Paragraphs in der Datenbank gefunden.")
29
- return []
30
-
31
- print(f"✔ {len(paragraphs)} Paragraphs geladen.")
32
- return paragraphs
33
-
34
- # ======== HTML TEMPLATE MIT PROFESSIONELLEM DESIGN ========
35
- VIEW_TEMPLATE = """<!DOCTYPE html>
36
  <html lang="de">
37
  <head>
38
- <meta charset="UTF-8">
39
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
40
- <title>Hochschulgesetz NRW – Offizielle Viewer</title>
41
- <style>
42
- * {
43
- margin: 0;
44
- padding: 0;
45
- box-sizing: border-box;
46
- }
47
-
48
- body {
49
- font-family: 'Segoe UI', 'Roboto', 'Arial', sans-serif;
50
- line-height: 1.6;
51
- color: #333;
52
- background: #f8f9fa;
53
- display: flex;
54
- min-height: 100vh;
55
- }
56
-
57
- /* ----------- SIDEBAR ------------- */
58
- #sidebar {
59
- width: 320px;
60
- background: #ffffff;
61
- border-right: 1px solid #e0e0e0;
62
- height: 100vh;
63
- overflow-y: auto;
64
- position: fixed;
65
- left: 0;
66
- top: 0;
67
- box-shadow: 2px 0 5px rgba(0,0,0,0.1);
68
- z-index: 1000;
69
- }
70
-
71
- .sidebar-header {
72
- padding: 20px;
73
- background: linear-gradient(135deg, #003366 0%, #00509e 100%);
74
- color: white;
75
- border-bottom: 1px solid #002244;
76
- }
77
-
78
- .sidebar-header h2 {
79
- font-size: 1.4rem;
80
- font-weight: 600;
81
- margin-bottom: 10px;
82
- }
83
-
84
- .sidebar-header p {
85
- font-size: 0.9rem;
86
- opacity: 0.9;
87
- }
88
-
89
- #searchBox {
90
- width: 100%;
91
- padding: 12px 15px;
92
- font-size: 14px;
93
- border: 1px solid #ddd;
94
- border-radius: 8px;
95
- margin: 15px;
96
- background: #f8f9fa;
97
- transition: all 0.3s;
98
- }
99
-
100
- #searchBox:focus {
101
- outline: none;
102
- border-color: #003366;
103
- box-shadow: 0 0 0 3px rgba(0, 51, 102, 0.1);
104
- }
105
-
106
- .paragraph-list {
107
- padding: 0 15px 20px 15px;
108
- }
109
-
110
- .sidebar-link {
111
- display: block;
112
- padding: 12px 15px;
113
- margin-bottom: 5px;
114
- text-decoration: none;
115
- color: #003366;
116
- background: #f8f9fa;
117
- border-left: 4px solid transparent;
118
- border-radius: 6px;
119
- font-size: 14px;
120
- font-weight: 500;
121
- transition: all 0.2s;
122
- }
123
-
124
- .sidebar-link:hover {
125
- background: #e3f2fd;
126
- border-left-color: #003366;
127
- transform: translateX(3px);
128
- }
129
-
130
- .sidebar-link.active {
131
- background: #e3f2fd;
132
- border-left-color: #003366;
133
- font-weight: 600;
134
- }
135
-
136
- /* ----------- MAIN CONTENT ------------- */
137
- #content-wrapper {
138
- flex: 1;
139
- margin-left: 320px;
140
- min-height: 100vh;
141
- }
142
-
143
- #content {
144
- max-width: 900px;
145
- margin: 0 auto;
146
- padding: 30px;
147
- background: white;
148
- min-height: 100vh;
149
- box-shadow: 0 0 20px rgba(0,0,0,0.05);
150
- }
151
-
152
- .page-header {
153
- margin-bottom: 40px;
154
- padding-bottom: 20px;
155
- border-bottom: 2px solid #003366;
156
- }
157
-
158
- .page-header h1 {
159
- color: #003366;
160
- font-size: 2.2rem;
161
- font-weight: 700;
162
- margin-bottom: 10px;
163
- }
164
-
165
- .page-header .subtitle {
166
- color: #666;
167
- font-size: 1.1rem;
168
- }
169
-
170
- /* ----------- PARAGRAPH STYLES ------------- */
171
- .paragraph {
172
- margin-bottom: 50px;
173
- padding: 25px;
174
- background: #ffffff;
175
- border-radius: 10px;
176
- border-left: 5px solid #003366;
177
- box-shadow: 0 2px 10px rgba(0,0,0,0.08);
178
- transition: all 0.3s;
179
- }
180
 
181
- .paragraph.highlight {
182
- animation: highlight-pulse 2s ease;
183
- border-left-color: #ff9800;
184
- box-shadow: 0 0 0 3px rgba(255, 152, 0, 0.2);
185
- }
186
-
187
- .paragraph-header {
188
- margin-bottom: 20px;
189
- }
190
-
191
- .paragraph-title {
192
- color: #003366;
193
- font-size: 1.6rem;
194
- font-weight: 700;
195
- margin-bottom: 10px;
196
- display: flex;
197
- align-items: center;
198
- gap: 10px;
199
- }
200
-
201
- .paragraph-title .anchor {
202
- font-size: 0.8em;
203
- color: #666;
204
- text-decoration: none;
205
- opacity: 0;
206
- transition: opacity 0.2s;
207
- }
208
-
209
- .paragraph:hover .anchor {
210
- opacity: 1;
211
- }
212
-
213
- .paragraph-content {
214
- font-size: 1.05rem;
215
- line-height: 1.8;
216
- color: #333;
217
- }
218
-
219
- .paragraph-content p {
220
- margin-bottom: 15px;
221
- }
222
-
223
- .paragraph-content ul, .paragraph-content ol {
224
- margin: 15px 0 15px 25px;
225
- }
226
-
227
- .paragraph-content li {
228
- margin-bottom: 8px;
229
- }
230
-
231
- /* ----------- FOOTNOTES ------------- */
232
- .footnotes {
233
- margin-top: 25px;
234
- padding-top: 20px;
235
- border-top: 1px solid #eee;
236
- }
237
-
238
- .footnotes-title {
239
- font-weight: 600;
240
- color: #666;
241
- margin-bottom: 15px;
242
- font-size: 0.95rem;
243
- }
244
-
245
- .footnote-item {
246
- margin-bottom: 10px;
247
- padding-left: 15px;
248
- border-left: 2px solid #ddd;
249
- font-size: 0.9rem;
250
- color: #555;
251
- }
252
-
253
- /* ----------- HIGHLIGHT ANIMATION ------------- */
254
- @keyframes highlight-pulse {
255
- 0% { background-color: #fff8e1; }
256
- 70% { background-color: #fff8e1; }
257
- 100% { background-color: #ffffff; }
258
- }
259
-
260
- /* ----------- RESPONSIVE ------------- */
261
- @media (max-width: 992px) {
262
- body {
263
- flex-direction: column;
264
- }
265
-
266
- #sidebar {
267
- position: static;
268
- width: 100%;
269
- height: auto;
270
- max-height: 50vh;
271
- }
272
-
273
- #content-wrapper {
274
- margin-left: 0;
275
- }
276
- }
277
-
278
- /* ----------- BACK TO TOP ------------- */
279
- #back-to-top {
280
- position: fixed;
281
- bottom: 30px;
282
- right: 30px;
283
- width: 50px;
284
- height: 50px;
285
- background: #003366;
286
- color: white;
287
- border-radius: 50%;
288
- display: none;
289
- justify-content: center;
290
- align-items: center;
291
- cursor: pointer;
292
- box-shadow: 0 2px 10px rgba(0,0,0,0.2);
293
- transition: all 0.3s;
294
- z-index: 1000;
295
- }
296
-
297
- #back-to-top:hover {
298
- background: #00509e;
299
- transform: translateY(-3px);
300
- }
301
-
302
- /* ----------- KEYWORD HIGHLIGHT ------------- */
303
- .keyword-highlight {
304
- background: #fff9c4;
305
- padding: 2px 4px;
306
- border-radius: 3px;
307
- font-weight: 500;
308
- }
309
-
310
- /* ----------- PRINT STYLES ------------- */
311
- @media print {
312
- #sidebar {
313
- display: none;
314
- }
315
-
316
- #content-wrapper {
317
- margin-left: 0;
318
- }
319
-
320
- #back-to-top {
321
- display: none !important;
322
- }
323
- }
324
- </style>
325
  </head>
326
  <body>
327
- <!-- SIDEBAR -->
328
- <div id="sidebar">
329
- <div class="sidebar-header">
330
- <h2>Hochschulgesetz NRW</h2>
331
- <p>Inhaltsverzeichnis</p>
332
- </div>
333
-
334
- <input type="text" id="searchBox" placeholder="Paragraph suchen (z.B. §1 oder Text)..."
335
- title="Geben Sie eine Paragraphennummer oder Suchbegriff ein">
336
-
337
- <div class="paragraph-list" id="paragraphList">
338
- <!-- SIDEBAR_LINKS -->
339
- </div>
340
- </div>
341
-
342
- <!-- MAIN CONTENT -->
343
- <div id="content-wrapper">
344
- <div id="content">
345
- <div class="page-header">
346
- <h1>Hochschulgesetz Nordrhein-Westfalen</h1>
347
- <p class="subtitle">Gesetz über die Hochschulen des Landes Nordrhein-Westfalen (Hochschulgesetz – HG)</p>
348
- <p class="subtitle" style="font-size: 0.9rem; color: #777;">
349
- Stand: Aktuelle Fassung | Quelle: <a href="https://recht.nrw.de" target="_blank">recht.nrw.de</a>
350
- </p>
351
- </div>
352
-
353
- <div id="paragraphContent">
354
- <!-- PARAGRAPH_CONTENT -->
355
- </div>
356
- </div>
357
- </div>
358
-
359
- <!-- BACK TO TOP BUTTON -->
360
- <div id="back-to-top" title="Zum Anfang">
361
-
362
- </div>
363
-
364
- <script>
365
- // ========== GLOBAL VARIABLES ==========
366
- let currentParagraphId = '';
367
- let searchTimeout = null;
368
-
369
- // ========== INITIALIZATION ==========
370
- document.addEventListener('DOMContentLoaded', function() {
371
- // Check for URL hash
372
- const hash = window.location.hash.substring(1);
373
- const urlParams = new URLSearchParams(window.location.search);
374
- const keywords = urlParams.get('keywords');
375
-
376
- if (hash) {
377
- scrollToParagraph(hash);
378
- }
379
-
380
- if (keywords) {
381
- highlightKeywords(decodeURIComponent(keywords));
382
- }
383
-
384
- setupEventListeners();
385
- updateActiveLink();
386
- });
387
-
388
- // ========== SCROLL TO PARAGRAPH ==========
389
- function scrollToParagraph(paragraphId, highlight = true) {
390
- const element = document.getElementById(paragraphId);
391
- if (!element) return;
392
-
393
- // Remove previous highlight
394
- document.querySelectorAll('.paragraph.highlight').forEach(el => {
395
- el.classList.remove('highlight');
396
- });
397
-
398
- // Calculate position for smooth scroll
399
- const sidebarHeight = document.getElementById('sidebar').offsetHeight;
400
- const elementPosition = element.getBoundingClientRect().top;
401
- const offsetPosition = elementPosition + window.pageYOffset - 100;
402
-
403
- // Smooth scroll
404
- window.scrollTo({
405
- top: offsetPosition,
406
- behavior: 'smooth'
407
- });
408
-
409
- // Highlight if requested
410
- if (highlight) {
411
- setTimeout(() => {
412
- element.classList.add('highlight');
413
-
414
- // Update URL without page reload
415
- history.replaceState(null, null, `#${paragraphId}`);
416
-
417
- // Update active link in sidebar
418
- updateActiveLink(paragraphId);
419
- }, 300);
420
- }
421
- }
422
 
423
- // ========== SEARCH FUNCTIONALITY ==========
424
- function setupEventListeners() {
425
- const searchBox = document.getElementById('searchBox');
426
-
427
- // Search input with debounce
428
- searchBox.addEventListener('input', function() {
429
- clearTimeout(searchTimeout);
430
- searchTimeout = setTimeout(() => {
431
- filterParagraphs(this.value);
432
- }, 300);
433
- });
434
-
435
- // Enter key to jump to first result
436
- searchBox.addEventListener('keypress', function(e) {
437
- if (e.key === 'Enter') {
438
- e.preventDefault();
439
- jumpToFirstResult(this.value);
440
- }
441
- });
442
-
443
- // Back to top button
444
- const backToTop = document.getElementById('back-to-top');
445
- backToTop.addEventListener('click', function() {
446
- window.scrollTo({
447
- top: 0,
448
- behavior: 'smooth'
449
- });
450
- });
451
-
452
- // Show/hide back to top button
453
- window.addEventListener('scroll', function() {
454
- if (window.scrollY > 500) {
455
- backToTop.style.display = 'flex';
456
- } else {
457
- backToTop.style.display = 'none';
458
- }
459
-
460
- updateActiveLink();
461
- });
462
- }
463
-
464
- function filterParagraphs(searchTerm) {
465
- const links = document.querySelectorAll('.sidebar-link');
466
- const searchLower = searchTerm.toLowerCase();
467
- let hasVisible = false;
468
-
469
- links.forEach(link => {
470
- const text = link.textContent.toLowerCase();
471
- if (text.includes(searchLower)) {
472
- link.style.display = 'block';
473
- hasVisible = true;
474
- } else {
475
- link.style.display = 'none';
476
- }
477
- });
478
-
479
- // Update search box placeholder based on results
480
- const searchBox = document.getElementById('searchBox');
481
- if (!hasVisible && searchTerm) {
482
- searchBox.title = 'Keine Ergebnisse gefunden';
483
- } else {
484
- searchBox.title = '';
485
- }
486
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
- function jumpToFirstResult(searchTerm) {
489
- const links = document.querySelectorAll('.sidebar-link');
490
- const searchLower = searchTerm.toLowerCase();
491
-
492
- for (const link of links) {
493
- if (link.style.display !== 'none') {
494
- const paragraphId = link.getAttribute('href').substring(1);
495
- scrollToParagraph(paragraphId);
496
- break;
497
- }
498
- }
499
- }
500
-
501
- // ========== HIGHLIGHT KEYWORDS ==========
502
- function highlightKeywords(keywords) {
503
- const content = document.getElementById('paragraphContent');
504
- const searchTerms = keywords.split(/[\s,]+/).filter(term => term.length > 2);
505
-
506
- searchTerms.forEach(term => {
507
- const regex = new RegExp(`(${escapeRegExp(term)})`, 'gi');
508
- content.innerHTML = content.innerHTML.replace(regex,
509
- '<span class="keyword-highlight">$1</span>');
510
- });
511
- }
512
-
513
- function escapeRegExp(string) {
514
- return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
515
- }
516
-
517
- // ========== UPDATE ACTIVE LINK ==========
518
- function updateActiveLink(forceId = null) {
519
- const links = document.querySelectorAll('.sidebar-link');
520
- const paragraphs = document.querySelectorAll('.paragraph');
521
-
522
- let activeId = forceId;
523
-
524
- if (!activeId) {
525
- // Find paragraph in viewport
526
- const viewportHeight = window.innerHeight;
527
- const viewportMiddle = window.scrollY + (viewportHeight / 2);
528
-
529
- for (const paragraph of paragraphs) {
530
- const rect = paragraph.getBoundingClientRect();
531
- const paragraphTop = window.pageYOffset + rect.top;
532
- const paragraphBottom = paragraphTop + rect.height;
533
-
534
- if (viewportMiddle >= paragraphTop && viewportMiddle <= paragraphBottom) {
535
- activeId = paragraph.id;
536
- break;
537
- }
538
- }
539
- }
540
-
541
- // Update active state
542
- links.forEach(link => {
543
- const href = link.getAttribute('href').substring(1);
544
- if (href === activeId) {
545
- link.classList.add('active');
546
- } else {
547
- link.classList.remove('active');
548
- }
549
- });
550
- }
551
-
552
- // ========== FORMAT CONTENT ==========
553
- function formatContent(text) {
554
- // Replace multiple newlines with paragraphs
555
- return text.split('\n\n').map(paragraph => {
556
- if (paragraph.trim()) {
557
- return `<p>${paragraph.trim()}</p>`;
558
- }
559
- return '';
560
- }).join('');
561
- }
562
-
563
- // ========== COPY TO CLIPBOARD ==========
564
- function copyParagraphLink(paragraphId) {
565
- const url = window.location.origin + window.location.pathname + '#' + paragraphId;
566
- navigator.clipboard.writeText(url).then(() => {
567
- // Show temporary notification
568
- const notification = document.createElement('div');
569
- notification.textContent = 'Link kopiert!';
570
- notification.style.cssText = `
571
- position: fixed;
572
- top: 20px;
573
- right: 20px;
574
- background: #4CAF50;
575
- color: white;
576
- padding: 10px 20px;
577
- border-radius: 5px;
578
- z-index: 10000;
579
- animation: fadeInOut 2s ease;
580
- `;
581
- document.body.appendChild(notification);
582
-
583
- setTimeout(() => {
584
- document.body.removeChild(notification);
585
- }, 2000);
586
- });
587
- }
588
- </script>
589
  </body>
590
  </html>
591
  """
592
 
593
  # -------------------------------------------------------------------
594
- # BUILD VIEWER
595
  # -------------------------------------------------------------------
596
 
597
  def build_html():
598
- """Xây dựng HTML viewer từ dữ liệu Supabase"""
599
- paragraphs = get_paragraphs_from_supabase()
600
-
601
- if not paragraphs:
602
- print("❌ Keine Paragraphs zum Erstellen des Viewers verfügbar.")
603
- return None
604
-
605
- sidebar_links = []
606
- content_html = []
607
-
608
- for p in paragraphs:
609
  pid = p["abs_id"]
610
  title = p["title"]
611
- content = p["content"]
612
-
613
- # Tạo link cho sidebar
614
- sidebar_link = f'''
615
- <a class="sidebar-link" href="#{pid}" onclick="scrollToParagraph('{pid}'); return false;">
616
- {title}
617
- </a>
618
- '''
619
- sidebar_links.append(sidebar_link)
620
-
621
- # Tạo nội dung paragraph
622
- # Phân loại footnote và nội dung chính
623
- lines = content.split('\n')
624
- main_content = []
625
- footnotes = []
626
-
627
  for line in lines:
628
- line = line.strip()
629
- if line.lower().startswith('fn ') or line.lower().startswith('fussnote'):
630
- footnotes.append(line)
631
- elif line:
632
- main_content.append(line)
633
-
634
- # Format main content
635
- formatted_content = '<br>'.join(main_content)
636
-
637
- # Format footnotes
638
- footnotes_html = ''
639
- if footnotes:
640
- footnotes_html = '''
641
- <div class="footnotes">
642
- <div class="footnotes-title">Fußnoten:</div>
643
- ''' + ''.join(f'<div class="footnote-item">{fn}</div>' for fn in footnotes) + '''
644
- </div>
645
- '''
646
-
647
- # Tạo paragraph block
648
- paragraph_html = f'''
649
- <div class="paragraph" id="{pid}">
650
- <div class="paragraph-header">
651
- <h3 class="paragraph-title">
652
- {title}
653
- <a href="#{pid}" class="anchor" onclick="copyParagraphLink('{pid}'); return false;"
654
- title="Link zu diesem Paragraph kopieren">🔗</a>
655
- </h3>
656
- </div>
657
- <div class="paragraph-content">
658
- {formatted_content}
659
- </div>
660
- {footnotes_html}
661
- </div>
662
- '''
663
- content_html.append(paragraph_html)
664
-
665
- # Điền nội dung vào template
666
- html = VIEW_TEMPLATE
667
- html = html.replace('<!-- SIDEBAR_LINKS -->', '\n'.join(sidebar_links))
668
- html = html.replace('<!-- PARAGRAPH_CONTENT -->', '\n'.join(content_html))
669
-
670
- # Thêm metadata
671
- html = html.replace(
672
- 'Aktuelle Fassung',
673
- f'Aktuelle Fassung - {len(paragraphs)} Paragraphs'
674
- )
675
-
676
  return html
677
 
678
  # -------------------------------------------------------------------
679
- # UPLOAD TO SUPABASE STORAGE
680
  # -------------------------------------------------------------------
681
 
682
  def upload_html():
683
- """Tạo và tải lên HTML viewer"""
684
- print(">>> Baue HTML Viewer...")
685
-
686
  html = build_html()
687
- if not html:
688
- print("❌ Konnte HTML nicht erstellen.")
689
- return
690
-
691
- try:
692
- # Tạo bucket nếu chưa tồn tại
693
- try:
694
- supabase.storage.get_bucket("hg_viewer")
695
- except:
696
- supabase.storage.create_bucket("hg_viewer", {
697
- "public": True,
698
- "file_size_limit": 10485760 # 10MB
699
- })
700
-
701
- # Upload HTML file
702
- supabase.storage.from_("hg_viewer").upload(
703
- "hg_viewer.html",
704
- html.encode("utf-8"),
705
- {
706
- "content-type": "text/html",
707
- "cache-control": "public, max-age=3600"
708
- }
709
- )
710
-
711
- print("✅ hg_viewer.html erfolgreich hochgeladen!")
712
- print(f"📁 URL: {SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html")
713
-
714
- except Exception as e:
715
- print(f"❌ Fehler beim Upload: {e}")
716
 
717
  if __name__ == "__main__":
718
- upload_html()
 
1
+ # build_hg_viewer.py
 
 
 
2
  import os
 
3
  from supabase import create_client
4
  from dotenv import load_dotenv
 
5
 
6
  load_dotenv()
7
 
 
13
 
14
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
15
 
16
+ from upload_weblink_to_supabase import extract_paragraphs
17
+
18
+ # ======== HTML TEMPLATE ========
19
+ VIEW_TEMPLATE = """
20
+ <!DOCTYPE html>
 
 
 
 
 
 
 
 
 
 
21
  <html lang="de">
22
  <head>
23
+ <meta charset="UTF-8">
24
+ <title>Hochschulgesetz NRW Paragraph Viewer</title>
25
+
26
+ <style>
27
+ body {
28
+ font-family: Arial, sans-serif;
29
+ margin: 0;
30
+ padding: 0;
31
+ display: flex;
32
+ }
33
+
34
+ /* ----------- SIDEBAR ------------- */
35
+ #sidebar {
36
+ width: 280px;
37
+ height: 100vh;
38
+ overflow-y: auto;
39
+ background: #f5f5f5;
40
+ border-right: 1px solid #ccc;
41
+ padding: 15px;
42
+ position: sticky;
43
+ top: 0;
44
+ }
45
+
46
+ #sidebar h2 {
47
+ margin-top: 0;
48
+ }
49
+
50
+ #searchBox {
51
+ width: 100%;
52
+ padding: 8px;
53
+ font-size: 15px;
54
+ margin-bottom: 10px;
55
+ border: 1px solid #aaa;
56
+ border-radius: 5px;
57
+ }
58
+
59
+ .sidebar-link {
60
+ display: block;
61
+ padding: 6px 8px;
62
+ margin-bottom: 4px;
63
+ text-decoration: none;
64
+ color: #003366;
65
+ border-radius: 4px;
66
+ }
67
+
68
+ .sidebar-link:hover {
69
+ background: #e0e7ff;
70
+ color: #001d4d;
71
+ }
72
+
73
+ /* ----------- CONTENT ------------- */
74
+ #content {
75
+ flex: 1;
76
+ padding: 25px;
77
+ max-width: 900px;
78
+ }
79
+
80
+ /* Absatz block */
81
+ .para {
82
+ padding: 20px 0;
83
+ border-bottom: 1px solid #ddd;
84
+ }
85
+
86
+ .para h2 {
87
+ color: #003366;
88
+ margin-bottom: 10px;
89
+ }
90
+
91
+ /* ----------- Fußnoten ------------- */
92
+ .fn-block {
93
+ background: #fafafa;
94
+ border-left: 4px solid #999;
95
+ padding: 12px;
96
+ margin-top: 10px;
97
+ margin-bottom: 25px;
98
+ }
99
+
100
+ .fn-toggle {
101
+ cursor: pointer;
102
+ font-weight: bold;
103
+ color: #003366;
104
+ margin-bottom: 5px;
105
+ }
106
+
107
+ .fn-content {
108
+ display: none;
109
+ padding-left: 10px;
110
+ }
111
+
112
+ .fn-title {
113
+ font-weight: bold;
114
+ margin-bottom: 6px;
115
+ }
116
+
117
+ .fn-item {
118
+ margin-bottom: 8px;
119
+ }
120
+
121
+ /* ----------- Highlight beim Öffnen ------------- */
122
+ .highlight {
123
+ animation: flash 2s ease-in-out;
124
+ background: #fff8c6 !important;
125
+ }
126
+
127
+ @keyframes flash {
128
+ 0% { background: #fff8c6; }
129
+ 100% { background: transparent; }
130
+ }
131
+
132
+ /* Keyword highlight */
133
+ .keyword {
134
+ background: yellow;
135
+ padding: 2px 3px;
136
+ border-radius: 3px;
137
+ }
138
+
139
+ /* Back to top button */
140
+ #topBtn {
141
+ position: fixed;
142
+ bottom: 25px;
143
+ right: 25px;
144
+ background: #003366;
145
+ color: white;
146
+ border-radius: 8px;
147
+ padding: 10px 14px;
148
+ cursor: pointer;
149
+ font-size: 16px;
150
+ display: none;
151
+ }
152
+ </style>
 
 
 
 
 
 
 
 
 
 
 
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  </head>
155
  <body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
+ <div id="sidebar">
158
+ <h2>Inhaltsverzeichnis</h2>
159
+ <input type="text" id="searchBox" placeholder="Suchen nach § …">
160
+ <!-- SIDEBAR_LINKS -->
161
+ </div>
162
+
163
+ <div id="content">
164
+ <h1>Hochschulgesetz NRW Paragraph Viewer</h1>
165
+ <!-- PARAGRAPH_CONTENT -->
166
+ </div>
167
+
168
+ <div id="topBtn" onclick="scrollToTop()">⬆️ Top</div>
169
+
170
+ <script>
171
+ // ------ TỰ ĐỘNG HIGHLIGHT Absatz khi có #anchor HIGHLIGHT ABSATZ & SCROLL ------
172
+ window.onload = function() {
173
+ const anchor = window.location.hash.substring(1);
174
+ const params = new URLSearchParams(window.location.search);
175
+ const keywords = params.get("k");
176
+
177
+ if (anchor) {
178
+ const el = document.getElementById(anchor);
179
+ if (el) {
180
+ el.classList.add("highlight");
181
+ el.scrollIntoView({ behavior: "smooth", block: "center" });
182
+ }
183
+ }
184
+
185
+ /* KEYWORD HIGHLIGHT */
186
+ if (keywords) {
187
+ const words = keywords.split("%20");
188
+ highlightKeywords(words);
189
+ }
190
+ };
191
+
192
+ /* --- KEYWORD HIGHLIGHT FUNCTION --- */
193
+ function highlightKeywords(words) {
194
+ const container = document.getElementById("content");
195
+ let html = container.innerHTML;
196
+
197
+ words.forEach(word => {
198
+ if (word.length < 2) return;
199
+ const regex = new RegExp(`(${decodeURIComponent(word)})`, "gi");
200
+ html = html.replace(regex, `<span class="keyword">$1</span>`);
201
+ });
202
+
203
+ container.innerHTML = html;
204
+ }
205
+
206
+ /* --- SEARCH IN SIDEBAR --- */
207
+ document.getElementById("searchBox").addEventListener("input", function() {
208
+ const q = this.value.toLowerCase();
209
+ document.querySelectorAll(".sidebar-link").forEach(link => {
210
+ const txt = link.innerText.toLowerCase();
211
+ link.style.display = txt.includes(q) ? "block" : "none";
212
+ });
213
+ });
214
+
215
+ /* --- COLLAPSIBLE FUSSNOTEN --- */
216
+ document.addEventListener("click", function(e) {
217
+ if (e.target.classList.contains("fn-toggle")) {
218
+ const content = e.target.nextElementSibling;
219
+ content.style.display = content.style.display === "block" ? "none" : "block";
220
+ }
221
+ });
222
+
223
+ /* --- BACK TO TOP BUTTON --- */
224
+ window.onscroll = function() {
225
+ document.getElementById("topBtn").style.display =
226
+ window.scrollY > 300 ? "block" : "none";
227
+ };
228
+
229
+ function scrollToTop() {
230
+ window.scrollTo({ top: 0, behavior: 'smooth' });
231
+ }
232
+
233
+ </script>
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  </body>
236
  </html>
237
  """
238
 
239
  # -------------------------------------------------------------------
240
+ # 2. BUILD VIEWER
241
  # -------------------------------------------------------------------
242
 
243
  def build_html():
244
+ print(">>> Lade Paragraphs aus Supabase...")
245
+ paras = extract_paragraphs()
246
+
247
+ sidebar_links = ""
248
+ content_html = ""
249
+
250
+ for p in paras:
 
 
 
 
251
  pid = p["abs_id"]
252
  title = p["title"]
253
+ body = p["content"]
254
+
255
+ # Sidebar item
256
+ sidebar_links += f'<a class="sidebar-link" href="#{pid}">{title}</a>\n'
257
+
258
+ # Fußnoten tách riêng (bắt đầu bằng "Fn 1", "Fn 2", ...)
259
+ lines = body.split("\n")
260
+ main_text = []
261
+ fn_text = []
262
+ in_fn = False
263
+
 
 
 
 
 
264
  for line in lines:
265
+ if line.startswith("Fn "):
266
+ in_fn = True
267
+ if in_fn:
268
+ fn_text.append(line)
269
+ else:
270
+ main_text.append(line)
271
+
272
+ footnotes_html = ""
273
+ if fn_text:
274
+ footnotes_html += '<div class="fn-block">'
275
+ footnotes_html += '<div class="fn-title">Fußnoten:</div>'
276
+ for fn in fn_text:
277
+ footnotes_html += f'<div class="fn-item">{fn}</div>'
278
+ footnotes_html += "</div>"
279
+
280
+ # Paragraph block
281
+ content_html += f"""
282
+ <div class="para" id="{pid}">
283
+ <h2>{title}</h2>
284
+ <div>{'<br>'.join(main_text)}</div>
285
+ {footnotes_html}
286
+ </div>
287
+ """
288
+
289
+ html = VIEW_TEMPLATE.replace("<!-- SIDEBAR_LINKS -->", sidebar_links)
290
+ html = html.replace("<!-- PARAGRAPH_CONTENT -->", content_html)
291
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  return html
293
 
294
  # -------------------------------------------------------------------
295
+ # 3. UPLOAD TO SUPABASE STORAGE
296
  # -------------------------------------------------------------------
297
 
298
  def upload_html():
 
 
 
299
  html = build_html()
300
+
301
+ supabase.storage.from_("hg_viewer").update(
302
+ "hg_clean.html",
303
+ html.encode("utf-8"),
304
+ {
305
+ "content-type": "text/html",
306
+ "x-upsert": "true"
307
+ }
308
+ )
309
+
310
+ print(" hg_clean.html uploaded!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  if __name__ == "__main__":
313
+ upload_html()
load_documents.py CHANGED
@@ -1,200 +1,130 @@
1
  """
2
- load_documents.py
3
- Cải thiện việc load tài liệu với xử lý lỗi tốt hơn
 
 
 
 
 
4
  """
5
 
6
  from huggingface_hub import hf_hub_download, list_repo_files
7
  from langchain_community.document_loaders import PyPDFLoader
8
  from langchain_core.documents import Document
9
  from bs4 import BeautifulSoup
10
- import requests
11
- import re
12
- from typing import List, Optional
13
 
14
  DATASET = "Nguyen5/docs"
15
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
16
- HTML_FILE = "Hochschulgesetz_NRW.html"
17
-
18
- def clean_html_content(text: str) -> str:
19
- """Làm sạch nội dung HTML"""
20
- # Loại bỏ khoảng trắng thừa
21
- text = re.sub(r'\s+', ' ', text)
22
- # Chuẩn hóa dấu câu
23
- text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
24
- # Đảm bảo chữ cái đầu câu viết hoa
25
- sentences = text.split('. ')
26
- sentences = [s.strip().capitalize() for s in sentences if s.strip()]
27
- return '. '.join(sentences)
28
-
29
- def load_recht_nrw_direct() -> List[Document]:
30
- """Tải trực tiếp từ recht.nrw.de"""
31
- print(">>> Lade Hochschulgesetz NRW direkt von recht.nrw.de...")
32
-
33
- url = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
34
-
35
- try:
36
- headers = {
37
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
38
- }
39
- response = requests.get(url, headers=headers, timeout=60)
40
- response.raise_for_status()
41
-
42
- soup = BeautifulSoup(response.text, 'html.parser')
43
- docs = []
44
-
45
- # Tìm tất cả các paragraph
46
- for i, element in enumerate(soup.find_all(['p', 'div', 'td'])):
47
- text = element.get_text(" ", strip=True)
48
-
49
- # Chỉ lấy các phần có chứa §
50
- if '§' in text:
51
- # Tách title và content
52
- lines = text.split('\n')
53
- title = lines[0].strip() if lines else f"§ {i+1}"
54
- content = " ".join(lines[1:]) if len(lines) > 1 else text
55
-
56
- metadata = {
57
- "source": "Hochschulgesetz NRW (Website)",
58
- "filename": "recht_nrw_direct.html",
59
- "paragraph_id": f"hg_direct_{i+1}",
60
- "url": url
61
- }
62
-
63
- doc = Document(
64
- page_content=clean_html_content(content),
65
- metadata=metadata
66
- )
67
- docs.append(doc)
68
-
69
- print(f"✅ {len(docs)} Paragraphs direkt von recht.nrw.de geladen.")
70
- return docs
71
-
72
- except Exception as e:
73
- print(f"❌ Fehler beim Laden von recht.nrw.de: {e}")
74
- return []
75
 
76
- def _load_hg_paragraph_documents(html_path: str) -> List[Document]:
77
  """
78
- Lädt Paragraphs aus dem gespeicherten HTML
 
 
 
 
 
 
79
  """
80
- try:
81
- with open(html_path, "r", encoding="utf-8") as f:
82
- html = f.read()
83
-
84
- soup = BeautifulSoup(html, "html.parser")
85
- docs = []
86
-
87
- # Suche nach allen relevanten Inhalten
88
- for i, p in enumerate(soup.find_all(['p', 'div', 'section'])):
89
- text = p.get_text(" ", strip=True)
90
- if not text or len(text) < 10:
91
- continue
92
-
93
- # Check if it's a paragraph
94
- if '§' in text or 'Artikel' in text:
95
- pid = p.get("id", f"hg_para_{i+1}")
96
-
97
- metadata = {
98
- "source": "Hochschulgesetz NRW (HTML)",
99
- "filename": HTML_FILE,
100
- "paragraph_id": pid,
101
- "type": "paragraph"
102
- }
103
-
104
- docs.append(Document(
105
- page_content=clean_html_content(text),
106
- metadata=metadata
107
- ))
108
-
109
- print(f"✅ {len(docs)} Paragraphs aus HTML geladen.")
110
- return docs
111
-
112
- except Exception as e:
113
- print(f"❌ Fehler beim Laden des HTML: {e}")
114
- return []
115
 
116
- def load_pdf_documents() -> List[Document]:
117
- """Lädt PDF-Dokumente"""
118
- print(">>> Lade PDF-Dokumente...")
119
-
 
 
 
 
 
 
 
 
 
120
  try:
121
  pdf_path = hf_hub_download(
122
  repo_id=DATASET,
123
  filename=PDF_FILE,
124
  repo_type="dataset",
125
  )
126
- print(f" PDF heruntergeladen: {pdf_path}")
127
-
128
- # Load PDF with PyPDFLoader
 
 
 
 
129
  pdf_docs = PyPDFLoader(pdf_path).load()
130
-
131
- # Enhance metadata
132
- for i, doc in enumerate(pdf_docs):
133
- doc.metadata.update({
134
- "source": "Prüfungsordnung (PDF)",
135
- "filename": PDF_FILE,
136
- "document_type": "exam_regulation",
137
- "chunk_index": i
138
- })
139
-
140
- print(f"✅ {len(pdf_docs)} Seiten aus PDF geladen.")
141
- return pdf_docs
142
-
143
  except Exception as e:
144
- print(f" Fehler beim Laden des PDF: {e}")
145
  return []
146
 
147
- def load_documents() -> List[Document]:
148
- """
149
- Hauptfunktion zum Laden aller Dokumente
150
- """
151
- print("=== START: load_documents() ===\n")
152
-
153
- all_docs = []
154
-
155
- # 1. Load PDF documents
156
- pdf_docs = load_pdf_documents()
157
- all_docs.extend(pdf_docs)
158
-
159
- # 2. Try loading from dataset HTML
160
- print(">>> Versuche, HTML aus Dataset zu laden...")
161
  try:
162
  html_path = hf_hub_download(
163
  repo_id=DATASET,
164
  filename=HTML_FILE,
165
  repo_type="dataset",
166
  )
167
- print(f" HTML heruntergeladen: {html_path}")
168
-
 
 
 
 
 
169
  html_docs = _load_hg_paragraph_documents(html_path)
170
- all_docs.extend(html_docs)
171
-
172
  except Exception as e:
173
- print(f"⚠️ Konnte HTML nicht aus Dataset laden: {e}")
174
-
175
- # 3. Fallback: Load directly from website
176
- print(">>> Fallback: Lade direkt von recht.nrw.de...")
177
- web_docs = load_recht_nrw_direct()
178
- all_docs.extend(web_docs)
179
-
180
- print(f"\n=== DONE: {len(all_docs)} Dokumente geladen ===")
181
-
182
- # Print summary
183
- pdf_count = len([d for d in all_docs if "PDF" in d.metadata.get("source", "")])
184
- html_count = len([d for d in all_docs if "HTML" in d.metadata.get("source", "")])
185
- web_count = len([d for d in all_docs if "Website" in d.metadata.get("source", "")])
186
-
187
- print(f"📊 Zusammenfassung:")
188
- print(f" - PDF-Seiten: {pdf_count}")
189
- print(f" - HTML-Paragraphs: {html_count}")
190
- print(f" - Web-Paragraphs: {web_count}")
191
-
192
- return all_docs
193
 
194
  if __name__ == "__main__":
 
195
  docs = load_documents()
196
-
197
- if docs:
198
- print(f"\nErstes Dokument (Beispiel):")
199
- print(f"Content: {docs[0].page_content[:200]}...")
200
- print(f"Metadata: {docs[0].metadata}")
 
1
  """
2
+ BƯỚC 1: LOAD DOCUMENTS
3
+ -----------------------
4
+ Debug-full version
5
+
6
+ - Lädt Prüfungsordnung (PDF) seitenweise.
7
+ - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
8
+ und zerlegt es in einzelne Absätze (Document pro <p>).
9
  """
10
 
11
  from huggingface_hub import hf_hub_download, list_repo_files
12
  from langchain_community.document_loaders import PyPDFLoader
13
  from langchain_core.documents import Document
14
  from bs4 import BeautifulSoup
 
 
 
15
 
16
  DATASET = "Nguyen5/docs"
17
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
18
+ HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ def _load_hg_paragraph_documents(html_path: str):
21
  """
22
+ Liest das generierte Hochschulgesetz-HTML ein und erzeugt
23
+ pro <p>-Element einen LangChain-Document mit:
24
+ - page_content = Text des Absatzes
25
+ - metadata:
26
+ source = "Hochschulgesetz NRW (HTML)"
27
+ filename = HTML_FILE
28
+ paragraph_id = id-Attribut (z.B. 'hg_abs_12'), falls vorhanden
29
  """
30
+ with open(html_path, "r", encoding="utf-8") as f:
31
+ html = f.read()
32
+
33
+ soup = BeautifulSoup(html, "html.parser")
34
+ docs = []
35
+
36
+ for p in soup.find_all("p"):
37
+ text = p.get_text(" ", strip=True)
38
+ if not text:
39
+ continue
40
+
41
+ pid = p.get("id")
42
+
43
+ metadata = {
44
+ "source": "Hochschulgesetz NRW (HTML)",
45
+ "filename": HTML_FILE,
46
+ }
47
+ if pid:
48
+ metadata["paragraph_id"] = pid
49
+
50
+ docs.append(Document(page_content=text, metadata=metadata))
51
+
52
+ print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
53
+ return docs
54
+
55
+ def load_documents():
56
+ print("=== START: load_documents() ===\n")
 
 
 
 
 
 
 
 
57
 
58
+ # -------------------------
59
+ # Check files in dataset
60
+ # -------------------------
61
+ print(">>> Checking dataset file list from HuggingFace...")
62
+ files = list_repo_files(DATASET, repo_type="dataset")
63
+ print("Files in dataset:", files, "\n")
64
+
65
+ docs = []
66
+
67
+ # -------------------------
68
+ # Load PDF
69
+ # -------------------------
70
+ print(">>> Step 1: Download PDF from HuggingFace...")
71
  try:
72
  pdf_path = hf_hub_download(
73
  repo_id=DATASET,
74
  filename=PDF_FILE,
75
  repo_type="dataset",
76
  )
77
+ print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
78
+ except Exception as e:
79
+ print("ERROR downloading PDF:", e)
80
+ return []
81
+
82
+ print(">>> Step 1.1: Loading PDF pages...")
83
+ try:
84
  pdf_docs = PyPDFLoader(pdf_path).load()
85
+ print(f"Loaded {len(pdf_docs)} PDF pages.\n")
 
 
 
 
 
 
 
 
 
 
 
 
86
  except Exception as e:
87
+ print("ERROR loading PDF:", e)
88
  return []
89
 
90
+ for d in pdf_docs:
91
+ d.metadata["source"] = "Prüfungsordnung (PDF)"
92
+ d.metadata["filename"] = PDF_FILE
93
+
94
+ docs.extend(pdf_docs)
95
+
96
+ # -------------------------
97
+ # Load HTML (Hochschulgesetz NRW)
98
+ # -------------------------
99
+ print(">>> Step 2: Download HTML from HuggingFace...")
 
 
 
 
100
  try:
101
  html_path = hf_hub_download(
102
  repo_id=DATASET,
103
  filename=HTML_FILE,
104
  repo_type="dataset",
105
  )
106
+ print(f"Downloaded HTML to local cache:\n{html_path}\n")
107
+ except Exception as e:
108
+ print("ERROR downloading HTML:", e)
109
+ return docs
110
+
111
+ print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
112
+ try:
113
  html_docs = _load_hg_paragraph_documents(html_path)
 
 
114
  except Exception as e:
115
+ print("ERROR loading / parsing HTML:", e)
116
+ return docs
117
+
118
+ docs.extend(html_docs)
119
+
120
+ print("=== DONE: load_documents() ===\n")
121
+ return docs
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  if __name__ == "__main__":
124
+ print("\n=== Running load_documents.py directly ===\n")
125
  docs = load_documents()
126
+ print(f"\n>>> TOTAL documents loaded: {len(docs)}")
127
+
128
+ if len(docs):
129
+ print("\nExample metadata from 1st document:")
130
+ print(docs[0].metadata)
rag_pipeline.py CHANGED
@@ -1,197 +1,194 @@
1
  """
2
- RAG PIPELINE – Verbesserte Version mit präzisen Prompts
3
  """
4
 
5
  from typing import List, Dict, Any, Tuple
6
  from langchain_core.messages import SystemMessage, HumanMessage
7
- from langchain_core.documents import Document
8
- import re
9
 
 
10
  # URLs für Quellen
11
- PDF_BASE_URL = "https://huggingface.co/datasets/Nguyen5/docs/resolve/main/f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
12
- LAW_VIEWER_URL = "https://YOUR_SUPABASE_URL/storage/v1/object/public/hg_viewer/hg_viewer.html"
13
-
14
- MAX_CHARS = 1000
15
-
16
- def format_chunk_content(chunk: Document) -> str:
17
- """Format chunk content for better readability"""
18
- content = chunk.page_content
19
-
20
- # Remove excessive whitespace
21
- content = re.sub(r'\s+', ' ', content)
22
-
23
- # Ensure proper sentence endings
24
- if not content.strip().endswith(('.', '!', '?')):
25
- content = content.strip() + '.'
26
-
27
- return content[:MAX_CHARS]
28
-
29
- def build_sources_metadata(docs: List[Document]) -> List[Dict[str, Any]]:
 
 
 
30
  """
31
- Erzeugt strukturierte Quellen-Informationen
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
- sources = []
34
-
35
- for i, doc in enumerate(docs, 1):
36
- metadata = doc.metadata
37
- source_type = metadata.get("source", "")
38
- page = metadata.get("page")
39
- para_id = metadata.get("paragraph_id", "")
40
-
41
- # Prepare snippet
42
- snippet = format_chunk_content(doc)
43
- if len(snippet) > 300:
44
- snippet = snippet[:297] + "..."
45
-
46
- # Determine URL
47
- url = None
48
- if "PDF" in source_type:
49
  if isinstance(page, int):
 
50
  url = f"{PDF_BASE_URL}#page={page + 1}"
51
  else:
52
  url = PDF_BASE_URL
53
-
54
- elif "HTML" in source_type or "Website" in source_type:
 
 
55
  if para_id:
56
- url = f"{LAW_VIEWER_URL}#{para_id}"
 
57
  else:
58
- url = LAW_VIEWER_URL
59
-
60
- # Build source info
61
- source_info = {
62
- "id": i,
63
- "source": source_type,
64
- "page": page + 1 if isinstance(page, int) else None,
65
- "paragraph_id": para_id,
66
- "url": url,
67
- "snippet": snippet,
68
- "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
69
- }
70
- sources.append(source_info)
71
-
72
- return sources
73
-
74
- def format_context(docs: List[Document]) -> str:
75
- """
76
- Formatiert den Kontext für den Prompt
77
- """
 
 
 
78
  if not docs:
79
- return "KEIN_RELEVANTER_KONTEXT_GEFUNDEN"
80
-
81
- context_parts = []
82
-
83
- for i, doc in enumerate(docs, 1):
84
- content = format_chunk_content(doc)
85
- metadata = doc.metadata
86
-
87
- # Build source description
88
- source_desc = metadata.get("source", "Unbekannte Quelle")
89
- if "page" in metadata and metadata["page"] is not None:
90
- source_desc += f", Seite {metadata['page'] + 1}"
91
- if "paragraph_id" in metadata:
92
- source_desc += f", {metadata['paragraph_id']}"
93
-
94
- context_parts.append(f"【Quelle {i}】{source_desc}\n{content}")
95
-
96
- return "\n\n".join(context_parts)
97
-
98
- # ========== IMPROVED SYSTEM PROMPT ==========
99
- SYSTEM_PROMPT = """
100
- Du bist ein hochpräziser juristischer Assistenz-Chatbot für Prüfungsrecht an Hochschulen in Nordrhein-Westfalen.
101
 
102
- Deine Wissensbasis umfasst ausschließlich:
103
- 1. Die spezifische Prüfungsordnung (PDF-Dokument)
104
- 2. Das Hochschulgesetz NRW (Hochschulgesetz - HG)
 
 
105
 
106
- STRENGE ANWEISUNGEN:
 
 
 
107
 
108
- 1. **AUSSCHLIESSLICHE KONTEXTNUTZUNG:**
109
- - Verwende NUR die bereitgestellten Quellen aus der Wissensbasis.
110
- - Wenn Informationen nicht im Kontext stehen, sage explizit: "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten."
111
- - KEINE Vermutungen, Spekulationen oder externes Wissen.
112
 
113
- 2. **PRÄZISE JURISTISCHE ANTWORTEN:**
114
- - Formuliere in vollständigen, grammatikalisch korrekten Sätzen.
115
- - Verwende präzise juristische Sprache, aber bleibe verständlich.
116
- - Strukturiere komplexe Antworten mit Absätzen oder Aufzählungen.
117
 
118
- 3. **QUELLENNACHWEISE:**
119
- - Verweise immer auf die konkrete Quelle (Prüfungsordnung §X oder Hochschulgesetz §Y).
120
- - Bei der Prüfungsordnung gib die Seite an.
121
- - Beim Hochschulgesetz verweise auf den Paragraphen.
122
 
123
- 4. **ANTWORTSTRUKTUR:**
124
- a) Kurze präzise Antwort zuerst
125
- b) Detaillierte Erklärung mit Quellenangaben
126
- c) Falls relevant: praktische Hinweise basierend auf dem Kontext
127
 
128
- 5. **FEHLENDE INFORMATIONEN:**
129
- - Wenn der Kontext unvollständig ist, erkläre, welche Informationen fehlen.
130
- - Biete an, nur die vorhandenen Informationen zusammenzufassen.
131
 
132
- 6. **SPRACHE:**
133
- - Verwende ausschließlich formelles Deutsch.
134
- - Vermeide Umgangssprache und Abkürzungen.
135
 
136
- Deine Antworten müssen rechtlich korrekt, vollständig und nachprüfbar sein.
137
- """
 
 
138
 
139
- def create_human_prompt(question: str, context: str) -> str:
140
- """
141
- Erstellt optimierten Human Prompt
142
- """
143
- return f"""FRAGE DES NUTZERS:
144
- {question}
145
 
146
- VERFÜGBARE RECHTSQUELLEN:
147
- {context if context else "KEINE RELEVANTEN QUELLEN GEFUNDEN"}
148
 
149
- AUFGABE:
150
- Beantworte die Frage ausschließlich auf Basis der oben genannten Rechtsquellen.
 
 
 
 
 
 
151
 
152
- ANFORDERUNGEN:
153
- 1. Gib eine präzise juristische Antwort in vollständigen Sätzen.
154
- 2. Zitiere konkret:
155
- - Für die Prüfungsordnung: "Laut Prüfungsordnung, §X auf Seite Y, ..."
156
- - Für das Hochschulgesetz: "Gemäß Hochschulgesetz NRW §Z, ..."
157
- 3. Wenn mehrere Quellen relevant sind, erwähne alle.
158
- 4. Wenn Informationen fehlen, erkläre dies klar.
159
- 5. Strukturiere die Antwort logisch.
160
 
161
- ANTWORT (auf Deutsch):"""
 
 
162
 
163
  def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
164
  """
165
- Haupt-RAG-Funktion mit verbessertem Prompting
 
 
 
 
 
166
  """
167
- # 1. Retrieve relevant documents
168
  docs = retriever.invoke(question)
169
-
170
- # 2. Format context
171
  context_str = format_context(docs)
172
-
173
- # 3. Create prompt
174
- human_prompt = create_human_prompt(question, context_str)
175
-
176
- # 4. Call LLM
177
- messages = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  SystemMessage(content=SYSTEM_PROMPT),
179
- HumanMessage(content=human_prompt)
180
  ]
181
-
182
- try:
183
- result = chat_model.invoke(messages)
184
- answer_text = result.content.strip()
185
-
186
- # Clean up answer
187
- answer_text = re.sub(r'\n\s*\n+', '\n\n', answer_text) # Remove excessive newlines
188
- answer_text = answer_text.replace("KEINE RELEVANTEN QUELLEN GEFUNDEN",
189
- "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten.")
190
-
191
- except Exception as e:
192
- answer_text = f"Fehler bei der Generierung der Antwort: {str(e)}"
193
-
194
- # 5. Build sources metadata
195
  sources = build_sources_metadata(docs)
196
-
197
- return answer_text, sources
 
1
  """
2
+ RAG PIPELINE – Version 26.11 (ohne Modi, stabil, juristisch korrekt)
3
  """
4
 
5
  from typing import List, Dict, Any, Tuple
6
  from langchain_core.messages import SystemMessage, HumanMessage
7
+ from load_documents import DATASET, PDF_FILE, HTML_FILE
 
8
 
9
+ # -------------------------------------------------------------------
10
  # URLs für Quellen
11
+ # -------------------------------------------------------------------
12
+
13
+ # Direktes PDF im Dataset (für #page)
14
+ PDF_BASE_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{PDF_FILE}"
15
+
16
+ # Hochschulgesetz-HTML im Dataset (enthält <p id="hg_abs_X"> …)
17
+ LAW_DATASET_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{HTML_FILE}"
18
+
19
+ # Offizielle Recht.NRW-Druckversion (für Viewer im Frontend)
20
+ LAW_URL = (
21
+ "https://recht.nrw.de/lmi/owa/br_bes_text?"
22
+ "print=1&anw_nr=2&gld_nr=2&ugl_nr=221&val=28364&ver=0&"
23
+ "aufgehoben=N&keyword=&bes_id=28364&show_preview=1"
24
+ )
25
+
26
+ MAX_CHARS = 900
27
+
28
+ # -----------------------------
29
+ # Quellen formatieren
30
+ # -----------------------------
31
+
32
+ def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
33
  """
34
+ Erzeugt eine Liste strukturierter Quellen-Infos:
35
+
36
+ [
37
+ {
38
+ "id": 1,
39
+ "source": "Prüfungsordnung (PDF)" / "Hochschulgesetz NRW (HTML)",
40
+ "page": 3, # nur bei PDF
41
+ "url": "...", # direkter Klick-Link
42
+ "snippet": "Erste 300 Zeichen des Chunks..."
43
+ },
44
+ ...
45
+ ]
46
  """
47
+ srcs = []
48
+ for i, d in enumerate(docs):
49
+ meta = d.metadata
50
+ src = meta.get("source", "")
51
+ page = meta.get("page")
52
+ snippet = d.page_content[:300].replace("\n", " ")
53
+
54
+ # PDF-Link
55
+ if "Prüfungsordnung" in src:
 
 
 
 
 
 
 
56
  if isinstance(page, int):
57
+ # PyPDFLoader: page ist 0-basiert, Anzeige 1-basiert
58
  url = f"{PDF_BASE_URL}#page={page + 1}"
59
  else:
60
  url = PDF_BASE_URL
61
+
62
+ # NRW-Gesetz (HTML im Dataset mit Absatz-IDs)
63
+ elif "Hochschulgesetz" in src:
64
+ para_id = meta.get("paragraph_id")
65
  if para_id:
66
+ # Klick führt direkt zum Absatz im Dataset-HTML
67
+ url = f"{LAW_DATASET_URL}#{para_id}"
68
  else:
69
+ # Fallback: offizielle Druckversion (ohne Absatz-Anker)
70
+ url = LAW_URL
71
+ page = None # keine Seitenangabe für Gesetz-HTML
72
+
73
+ else:
74
+ url = None
75
+
76
+ srcs.append(
77
+ {
78
+ "id": i + 1,
79
+ "source": src,
80
+ "page": page + 1 if isinstance(page, int) else None,
81
+ "url": url,
82
+ "snippet": snippet,
83
+ }
84
+ )
85
+ return srcs
86
+
87
+ # -----------------------------
88
+ # Kontext formatieren
89
+ # -----------------------------
90
+
91
+ def format_context(docs):
92
  if not docs:
93
+ return "(Kein relevanter Kontext im Dokument gefunden.)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ out = []
96
+ for i, d in enumerate(docs):
97
+ txt = d.page_content[:MAX_CHARS]
98
+ src = d.metadata.get("source")
99
+ page = d.metadata.get("page")
100
 
101
+ if "Prüfungsordnung" in (src or "") and isinstance(page, int):
102
+ src_str = f"{src}, Seite {page + 1}"
103
+ else:
104
+ src_str = src
105
 
106
+ out.append(f"[KONTEXT {i+1}] ({src_str})\n{txt}")
 
 
 
107
 
108
+ return "\n\n".join(out)
 
 
 
109
 
110
+ # -----------------------------
111
+ # Systemprompt verschärft
112
+ # -----------------------------
 
113
 
114
+ SYSTEM_PROMPT = """
115
+ Du bist ein hochpräziser juristischer Chatbot für Prüfungsrecht
116
+ mit Zugriff nur auf:
 
117
 
118
+ - die Prüfungsordnung (als PDF) und
119
+ - das Hochschulgesetz NRW (als HTML aus der offiziellen Druckversion).
 
120
 
121
+ Strenge Regeln:
 
 
122
 
123
+ 1. Antworte ausschließlich anhand des bereitgestellten Kontextes
124
+ (KONTEXT-Abschnitte). Wenn die Information nicht im Kontext steht,
125
+ sage ausdrücklich, dass dies aus den vorliegenden Dokumenten nicht
126
+ hervorgeht und du dazu nichts Sicheres sagen kannst.
127
 
128
+ 2.
129
+ Keine Spekulationen, keine Vermutungen.
 
 
 
 
130
 
131
+ 3. Antworte in zusammenhängenden, ganzen Sätzen. Verwende keine Mischung aus Deutsch und Englisch.
 
132
 
133
+ 4. Nenne, soweit aus dem Kontext erkennbar,
134
+ - die rechtliche Grundlage (z.B. Paragraph, Artikel),
135
+ - das Dokument (Prüfungsordnung / Hochschulgesetz NRW),
136
+ - die Seite (bei der Prüfungsordnung), wenn im Kontext vorhanden.
137
+
138
+ 5. Füge KEINE externen Informationen hinzu, z.B. aus anderen Gesetzen,
139
+ Webseiten oder allgemeinem Wissen. Nur das, was im Kontext steht,
140
+ darf in der Antwort verwendet werden.
141
 
142
+ Wenn der Kontext keine eindeutige Antwort zulässt, erkläre klar,
143
+ warum keine sichere Antwort möglich ist und welche Informationen
144
+ im Dokument fehlen.
145
+ """
 
 
 
 
146
 
147
# -----------------------------
# Main entry point
# -----------------------------

def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
    """Run the full RAG chain for a single question.

    Retrieves matching chunks via ``retriever.invoke``, builds a strict
    context-only prompt, queries the chat model, and returns the stripped
    answer text together with the structured source list from
    ``build_sources_metadata``.
    """
    # Fetch relevant chunks and render them into [KONTEXT i] sections.
    retrieved = retriever.invoke(question)
    context_str = format_context(retrieved)

    # Human message: the question plus the only context the model may use.
    # (Prompt text is runtime behavior — kept verbatim in German.)
    user_prompt = f"""
FRAGE:
{question}

NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
{context_str}

AUFGABE:
Formuliere eine juristisch korrekte, gut verständliche Antwort
ausschließlich anhand des obigen Kontextes.

- Wenn der Kontext aus den Dokumenten eine klare Antwort erlaubt,
erläutere diese strukturiert und in vollständigen Sätzen.
- Wenn der Kontext KEINE klare Antwort erlaubt oder wichtige Informationen
fehlen, erkläre das offen und formuliere KEINE Vermutung.
"""

    messages = [
        SystemMessage(content=SYSTEM_PROMPT),
        HumanMessage(content=user_prompt),
    ]

    # Query the model, then assemble the (answer, sources) pair.
    response = chat_model.invoke(messages)
    return response.content.strip(), build_sources_metadata(retrieved)