Nguyen5 commited on
Commit
b77e194
·
1 Parent(s): 612ca73
Files changed (6) hide show
  1. app.py +256 -149
  2. build_hg_viewer.py +677 -272
  3. load_documents.py +165 -95
  4. rag_pipeline.py +158 -155
  5. requirements.txt +3 -0
  6. upload_weblink_to_supabase.py +101 -43
app.py CHANGED
@@ -1,212 +1,319 @@
1
-
2
- # app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
3
- # Version 26.11 – ohne Modi, stabil für Text + Voice
4
 
5
  import gradio as gr
6
  from gradio_pdf import PDF
7
  from huggingface_hub import hf_hub_download
 
8
 
9
- from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
10
  from split_documents import split_documents
11
  from vectorstore import build_vectorstore
12
  from retriever import get_retriever
13
  from llm import load_llm
14
- from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
15
 
16
  from speech_io import transcribe_audio, synthesize_speech
17
 
18
  # =====================================================
19
- # INITIALISIERUNG (global)
20
  # =====================================================
21
 
22
- print("🔹 Lade Dokumente ...")
 
 
 
 
 
 
 
 
 
23
  _docs = load_documents()
24
 
25
- print("🔹 Splitte Dokumente ...")
26
  _chunks = split_documents(_docs)
27
 
28
- print("🔹 Baue VectorStore (FAISS) ...")
29
  _vs = build_vectorstore(_chunks)
30
 
31
- print("🔹 Erzeuge Retriever ...")
32
  _retriever = get_retriever(_vs)
33
 
34
- print("🔹 Lade LLM ...")
35
  _llm = load_llm()
36
 
37
- print("🔹 Lade Dateien für Viewer")
38
- _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
39
- _html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
 
 
 
 
 
 
40
 
41
  # =====================================================
42
- # Quellen formatieren – Markdown für Chat
43
  # =====================================================
44
 
45
  def format_sources_markdown(sources):
 
46
  if not sources:
47
  return ""
48
-
49
- lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
 
50
  for s in sources:
51
- sid = s["id"]
52
- src = s["source"]
53
  page = s["page"]
 
54
  url = s["url"]
55
- snippet = s["snippet"]
56
-
57
- title = f"Quelle {sid} – {src}"
58
-
59
  if url:
60
- base = f"- [{title}]({url})"
 
 
 
 
 
 
 
61
  else:
62
- base = f"- {title}"
63
-
64
- if page and "Prüfungsordnung" in src:
65
- base += f", Seite {page}"
66
-
67
- lines.append(base)
68
-
69
  if snippet:
70
- lines.append(f" > {snippet}")
71
-
72
  return "\n".join(lines)
73
 
74
  # =====================================================
75
- # TEXT CHATBOT
76
  # =====================================================
77
 
78
  def chatbot_text(user_message, history):
79
- if not user_message:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  return history, ""
81
-
82
- answer_text, sources = answer(
83
- question=user_message,
84
- retriever=_retriever,
85
- chat_model=_llm,
86
- )
87
-
88
- quellen_block = format_sources_markdown(sources)
89
-
90
- history = history + [
91
- {"role": "user", "content": user_message},
92
- {"role": "assistant", "content": answer_text + quellen_block},
93
- ]
94
-
95
- return history, ""
96
-
97
- # =====================================================
98
- # VOICE CHATBOT
99
- # =====================================================
100
 
101
  def chatbot_voice(audio_path, history):
102
- # 1. Speech → Text
 
 
 
 
103
  text = transcribe_audio(audio_path)
104
  if not text:
105
- return history, None, ""
106
-
107
- # Lưu vào lịch sử chat
108
- history = history + [{"role": "user", "content": text}]
109
-
110
- # 2. RAG trả lời
111
- answer_text, sources = answer(
112
- question=text,
113
- retriever=_retriever,
114
- chat_model=_llm,
115
- )
116
- quellen_block = format_sources_markdown(sources)
117
-
118
- bot_msg = answer_text + quellen_block
119
- history = history + [{"role": "assistant", "content": bot_msg}]
120
-
121
- # 3. Text → Speech
122
- audio = synthesize_speech(bot_msg)
123
-
124
- return history, audio, ""
125
-
126
- # =====================================================
127
- # LAST ANSWER → TTS
128
- # =====================================================
129
 
130
  def read_last_answer(history):
 
131
  if not history:
132
  return None
133
-
134
  for msg in reversed(history):
135
  if msg["role"] == "assistant":
136
- return synthesize_speech(msg["content"])
137
-
138
  return None
139
 
140
  # =====================================================
141
- # UI – GRADIO
142
  # =====================================================
143
 
144
- with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
145
- gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
146
- gr.Markdown(
147
- "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
148
- "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
149
- "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
150
- )
151
-
152
- with gr.Row():
153
- with gr.Column(scale=2):
154
- chatbot = gr.Chatbot(label="Chat", height=500)
155
-
156
- msg = gr.Textbox(
157
- label="Frage eingeben",
158
- placeholder="Stelle deine Frage zum Prüfungsrecht …",
159
- )
160
-
161
- # TEXT SENDEN
162
- msg.submit(
163
- chatbot_text,
164
- [msg, chatbot],
165
- [chatbot, msg]
166
- )
167
-
168
- send_btn = gr.Button("Senden (Text)")
169
- send_btn.click(
170
- chatbot_text,
171
- [msg, chatbot],
172
- [chatbot, msg]
173
- )
174
-
175
- # SPRACHEINGABE
176
- gr.Markdown("### 🎙️ Spracheingabe")
177
- voice_in = gr.Audio(sources=["microphone"], type="filepath")
178
- voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
179
-
180
- voice_btn = gr.Button("Sprechen & senden")
181
- voice_btn.click(
182
- chatbot_voice,
183
- [voice_in, chatbot],
184
- [chatbot, voice_out, msg]
185
- )
186
-
187
- read_btn = gr.Button("🔁 Antwort erneut vorlesen")
188
- read_btn.click(
189
- read_last_answer,
190
- [chatbot],
191
- [voice_out]
192
- )
193
-
194
- clear_btn = gr.Button("Chat zurücksetzen")
195
- clear_btn.click(lambda: [], None, chatbot)
196
-
197
- # =====================
198
- # RECHTE SPALTE: Viewer
199
- # =====================
200
-
201
- with gr.Column(scale=1):
202
- gr.Markdown("### 📄 Prüfungsordnung (PDF)")
203
- PDF(_pdf_path, height=350)
204
-
205
- gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
206
- gr.HTML(
207
- f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
208
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
 
 
 
 
210
 
211
  if __name__ == "__main__":
212
- demo.queue().launch(ssr_mode=False, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ app.py – Aktualisierte Version mit verbessertem Viewer
3
+ """
4
 
5
  import gradio as gr
6
  from gradio_pdf import PDF
7
  from huggingface_hub import hf_hub_download
8
+ import os
9
 
10
+ from load_documents import load_documents, DATASET, PDF_FILE
11
  from split_documents import split_documents
12
  from vectorstore import build_vectorstore
13
  from retriever import get_retriever
14
  from llm import load_llm
15
+ from rag_pipeline import answer
16
 
17
  from speech_io import transcribe_audio, synthesize_speech
18
 
19
  # =====================================================
20
+ # KONFIGURATION
21
  # =====================================================
22
 
23
+ # Viewer URL (ersetze mit deiner Supabase URL)
24
+ SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://your-project.supabase.co")
25
+ LAW_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html"
26
+
27
+ # =====================================================
28
+ # INITIALISIERUNG
29
+ # =====================================================
30
+
31
+ print("🔹 Initialisiere System...")
32
+ print("1. Lade Dokumente...")
33
  _docs = load_documents()
34
 
35
+ print("2. Splitte Dokumente...")
36
  _chunks = split_documents(_docs)
37
 
38
+ print("3. Baue VectorStore...")
39
  _vs = build_vectorstore(_chunks)
40
 
41
+ print("4. Erzeuge Retriever...")
42
  _retriever = get_retriever(_vs)
43
 
44
+ print("5. Lade LLM...")
45
  _llm = load_llm()
46
 
47
+ print("6. Lade Dateien für Viewer...")
48
+ try:
49
+ _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
50
+ print(f"✅ PDF geladen: {_pdf_path}")
51
+ except Exception as e:
52
+ print(f"⚠️ PDF konnte nicht geladen werden: {e}")
53
+ _pdf_path = None
54
+
55
+ print("✅ System initialisiert!")
56
 
57
  # =====================================================
58
+ # HELPER FUNCTIONS
59
  # =====================================================
60
 
61
  def format_sources_markdown(sources):
62
+ """Formatiere Quellen als Markdown"""
63
  if not sources:
64
  return ""
65
+
66
+ lines = ["", "**📚 Quellenverweise:**", ""]
67
+
68
  for s in sources:
69
+ source_type = s["source"]
 
70
  page = s["page"]
71
+ para_id = s.get("paragraph_id", "")
72
  url = s["url"]
73
+ snippet = s.get("snippet", "")
74
+
75
+ # Build source line
 
76
  if url:
77
+ if "PDF" in source_type:
78
+ source_text = f"[{source_type}"
79
+ if page:
80
+ source_text += f", Seite {page}"
81
+ source_text += f"]({url})"
82
+ else:
83
+ display_name = para_id if para_id else "Hochschulgesetz NRW"
84
+ source_text = f"[{display_name}]({url})"
85
  else:
86
+ source_text = source_type
87
+
88
+ lines.append(f"- {source_text}")
89
+
 
 
 
90
  if snippet:
91
+ lines.append(f" > *{snippet}*")
92
+
93
  return "\n".join(lines)
94
 
95
  # =====================================================
96
+ # CHATBOT FUNCTIONS
97
  # =====================================================
98
 
99
  def chatbot_text(user_message, history):
100
+ """Text-Chatbot Funktion"""
101
+ if not user_message.strip():
102
+ return history, ""
103
+
104
+ try:
105
+ # Get answer from RAG pipeline
106
+ answer_text, sources = answer(
107
+ question=user_message,
108
+ retriever=_retriever,
109
+ chat_model=_llm
110
+ )
111
+
112
+ # Add sources
113
+ sources_text = format_sources_markdown(sources)
114
+ full_response = f"{answer_text}\n\n{sources_text}"
115
+
116
+ # Update history
117
+ history.append({"role": "user", "content": user_message})
118
+ history.append({"role": "assistant", "content": full_response})
119
+
120
+ return history, ""
121
+
122
+ except Exception as e:
123
+ error_msg = f"Fehler bei der Verarbeitung: {str(e)}"
124
+ history.append({"role": "user", "content": user_message})
125
+ history.append({"role": "assistant", "content": error_msg})
126
  return history, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
  def chatbot_voice(audio_path, history):
129
+ """Voice-Chatbot Funktion"""
130
+ if not audio_path:
131
+ return history, None, ""
132
+
133
+ # Transcribe audio
134
  text = transcribe_audio(audio_path)
135
  if not text:
136
+ return history, None, "Keine Sprache erkannt"
137
+
138
+ # Process with text chatbot
139
+ history, _ = chatbot_text(text, history)
140
+
141
+ # Get last response for TTS
142
+ last_response = None
143
+ for msg in reversed(history):
144
+ if msg["role"] == "assistant":
145
+ last_response = msg["content"]
146
+ break
147
+
148
+ # Generate audio
149
+ audio_output = None
150
+ if last_response:
151
+ audio_output = synthesize_speech(last_response.split("\n\n")[0]) # Nur erste Teil für TTS
152
+
153
+ return history, audio_output, text
 
 
 
 
 
 
154
 
155
  def read_last_answer(history):
156
+ """Lese letzte Antwort vor"""
157
  if not history:
158
  return None
159
+
160
  for msg in reversed(history):
161
  if msg["role"] == "assistant":
162
+ return synthesize_speech(msg["content"].split("\n\n")[0])
163
+
164
  return None
165
 
166
  # =====================================================
167
+ # GRADIO UI
168
  # =====================================================
169
 
170
+ def create_ui():
171
+ """Erstelle die Gradio Benutzeroberfläche"""
172
+
173
+ with gr.Blocks(
174
+ title="Prüfungsrechts-Chatbot NRW",
175
+ theme=gr.themes.Soft(),
176
+ css="""
177
+ .chatbot { min-height: 500px; }
178
+ .viewer-frame { border-radius: 10px; border: 1px solid #e0e0e0; }
179
+ """
180
+ ) as demo:
181
+
182
+ # Header
183
+ gr.Markdown("""
184
+ # 🧑‍⚖️ Prüfungsrechts-Chatbot für NRW Hochschulen
185
+
186
+ Dieser Chatbot beantwortet Fragen basierend auf:
187
+ - **Prüfungsordnung** (offizielles PDF)
188
+ - **Hochschulgesetz NRW** (aktuelle Fassung von recht.nrw.de)
189
+
190
+ Fragen können per Text oder Spracheingabe gestellt werden.
191
+ """)
192
+
193
+ with gr.Row():
194
+ # Left Column - Chat
195
+ with gr.Column(scale=2):
196
+ chatbot = gr.Chatbot(
197
+ label="Chat",
198
+ height=500,
199
+ bubble_full_width=False,
200
+ show_copy_button=True
201
+ )
202
+
203
+ with gr.Row():
204
+ msg = gr.Textbox(
205
+ label="Frage eingeben",
206
+ placeholder="Stellen Sie Ihre Frage zum Prüfungsrecht...",
207
+ scale=4,
208
+ container=False
209
+ )
210
+ send_btn = gr.Button("Senden", variant="primary", scale=1)
211
+
212
+ # Voice Input
213
+ with gr.Accordion("🎤 Spracheingabe", open=False):
214
+ with gr.Row():
215
+ voice_in = gr.Audio(
216
+ sources=["microphone"],
217
+ type="filepath",
218
+ label="Aufnahme",
219
+ scale=3
220
+ )
221
+ voice_btn = gr.Button("Sprechen & senden", scale=1)
222
+
223
+ voice_out = gr.Audio(
224
+ label="Antwort als Audio",
225
+ type="numpy",
226
+ visible=True
227
+ )
228
+
229
+ # Controls
230
+ with gr.Row():
231
+ read_btn = gr.Button("🔊 Antwort vorlesen")
232
+ clear_btn = gr.Button("🗑️ Chat leeren", variant="secondary")
233
+
234
+ # Right Column - Viewer
235
+ with gr.Column(scale=1):
236
+ # PDF Viewer
237
+ gr.Markdown("### 📄 Prüfungsordnung")
238
+ if _pdf_path:
239
+ pdf_viewer = PDF(_pdf_path, height=350, label="PDF Viewer")
240
+ else:
241
+ gr.Markdown("⚠️ PDF konnte nicht geladen werden")
242
+
243
+ # Law Viewer
244
+ gr.Markdown("### 📘 Hochschulgesetz NRW")
245
+ gr.HTML(f"""
246
+ <iframe
247
+ src="{LAW_VIEWER_URL}"
248
+ style="width:100%; height:400px; border:none; border-radius:10px;"
249
+ title="Hochschulgesetz NRW Viewer"
250
+ ></iframe>
251
+ """)
252
+
253
+ # Event Handlers
254
+ # Text input
255
+ msg.submit(
256
+ chatbot_text,
257
+ [msg, chatbot],
258
+ [chatbot, msg]
259
+ )
260
+
261
+ send_btn.click(
262
+ chatbot_text,
263
+ [msg, chatbot],
264
+ [chatbot, msg]
265
+ )
266
+
267
+ # Voice input
268
+ voice_btn.click(
269
+ chatbot_voice,
270
+ [voice_in, chatbot],
271
+ [chatbot, voice_out, msg]
272
+ )
273
+
274
+ # Controls
275
+ read_btn.click(
276
+ read_last_answer,
277
+ [chatbot],
278
+ [voice_out]
279
+ )
280
+
281
+ clear_btn.click(
282
+ lambda: [],
283
+ None,
284
+ [chatbot]
285
+ )
286
+
287
+ # Instructions
288
+ gr.Markdown("""
289
+ ### ℹ️ Nutzungshinweise
290
+
291
+ 1. **Präzise Fragen** stellen für bessere Antworten
292
+ 2. **Quellen** werden automatisch verlinkt
293
+ 3. **Klicken Sie auf Links** im Chat, um direkt zur Quelle zu springen
294
+ 4. **Spracheingabe** für hands-free Nutzung
295
+
296
+ ### ⚠️ Hinweis
297
+ Dies ist ein Assistenzsystem. Für verbindliche rechtliche Auskünfte wenden Sie sich bitte an die zuständigen Prüfungsämter.
298
+ """)
299
+
300
+ return demo
301
 
302
+ # =====================================================
303
+ # MAIN
304
+ # =====================================================
305
 
306
  if __name__ == "__main__":
307
+ demo = create_ui()
308
+
309
+ # Konfiguration für HuggingFace Spaces
310
+ demo.queue(
311
+ max_size=20,
312
+ api_open=False
313
+ ).launch(
314
+ server_name="0.0.0.0",
315
+ server_port=7860,
316
+ share=False,
317
+ show_error=True,
318
+ debug=False
319
+ )
build_hg_viewer.py CHANGED
@@ -1,7 +1,12 @@
1
- # build_hg_viewer.py
 
 
 
2
  import os
 
3
  from supabase import create_client
4
  from dotenv import load_dotenv
 
5
 
6
  load_dotenv()
7
 
@@ -13,301 +18,701 @@ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE:
13
 
14
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
15
 
16
- from upload_weblink_to_supabase import extract_paragraphs
17
-
18
- # ======== HTML TEMPLATE ========
19
- VIEW_TEMPLATE = """
20
- <!DOCTYPE html>
 
 
 
 
 
 
 
 
 
 
21
  <html lang="de">
22
  <head>
23
- <meta charset="UTF-8">
24
- <title>Hochschulgesetz NRW Paragraph Viewer</title>
25
-
26
- <style>
27
- body {
28
- font-family: Arial, sans-serif;
29
- margin: 0;
30
- padding: 0;
31
- display: flex;
32
- }
33
-
34
- /* ----------- SIDEBAR ------------- */
35
- #sidebar {
36
- width: 280px;
37
- height: 100vh;
38
- overflow-y: auto;
39
- background: #f5f5f5;
40
- border-right: 1px solid #ccc;
41
- padding: 15px;
42
- position: sticky;
43
- top: 0;
44
- }
45
-
46
- #sidebar h2 {
47
- margin-top: 0;
48
- }
49
-
50
- #searchBox {
51
- width: 100%;
52
- padding: 8px;
53
- font-size: 15px;
54
- margin-bottom: 10px;
55
- border: 1px solid #aaa;
56
- border-radius: 5px;
57
- }
58
-
59
- .sidebar-link {
60
- display: block;
61
- padding: 6px 8px;
62
- margin-bottom: 4px;
63
- text-decoration: none;
64
- color: #003366;
65
- border-radius: 4px;
66
- }
67
-
68
- .sidebar-link:hover {
69
- background: #e0e7ff;
70
- color: #001d4d;
71
- }
72
-
73
- /* ----------- CONTENT ------------- */
74
- #content {
75
- flex: 1;
76
- padding: 25px;
77
- max-width: 900px;
78
- }
79
-
80
- /* Absatz block */
81
- .para {
82
- padding: 20px 0;
83
- border-bottom: 1px solid #ddd;
84
- }
85
-
86
- .para h2 {
87
- color: #003366;
88
- margin-bottom: 10px;
89
- }
90
-
91
- /* ----------- Fußnoten ------------- */
92
- .fn-block {
93
- background: #fafafa;
94
- border-left: 4px solid #999;
95
- padding: 12px;
96
- margin-top: 10px;
97
- margin-bottom: 25px;
98
- }
99
-
100
- .fn-toggle {
101
- cursor: pointer;
102
- font-weight: bold;
103
- color: #003366;
104
- margin-bottom: 5px;
105
- }
106
-
107
- .fn-content {
108
- display: none;
109
- padding-left: 10px;
110
- }
111
-
112
- .fn-title {
113
- font-weight: bold;
114
- margin-bottom: 6px;
115
- }
116
-
117
- .fn-item {
118
- margin-bottom: 8px;
119
- }
120
-
121
- /* ----------- Highlight beim Öffnen ------------- */
122
- .highlight {
123
- animation: flash 2s ease-in-out;
124
- background: #fff8c6 !important;
125
- }
126
-
127
- @keyframes flash {
128
- 0% { background: #fff8c6; }
129
- 100% { background: transparent; }
130
- }
131
-
132
- /* Keyword highlight */
133
- .keyword {
134
- background: yellow;
135
- padding: 2px 3px;
136
- border-radius: 3px;
137
- }
138
-
139
- /* Back to top button */
140
- #topBtn {
141
- position: fixed;
142
- bottom: 25px;
143
- right: 25px;
144
- background: #003366;
145
- color: white;
146
- border-radius: 8px;
147
- padding: 10px 14px;
148
- cursor: pointer;
149
- font-size: 16px;
150
- display: none;
151
- }
152
- </style>
 
 
 
 
 
 
 
 
 
 
 
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  </head>
155
  <body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- <div id="sidebar">
158
- <h2>Inhaltsverzeichnis</h2>
159
- <input type="text" id="searchBox" placeholder="Suchen nach § …">
160
- <!-- SIDEBAR_LINKS -->
161
- </div>
162
-
163
- <div id="content">
164
- <h1>Hochschulgesetz NRW Paragraph Viewer</h1>
165
- <!-- PARAGRAPH_CONTENT -->
166
- </div>
167
-
168
- <div id="topBtn" onclick="scrollToTop()">⬆️ Top</div>
169
-
170
- <script>
171
- // ------ TỰ ĐỘNG HIGHLIGHT Absatz khi có #anchor HIGHLIGHT ABSATZ & SCROLL ------
172
- window.onload = function() {
173
- const anchor = window.location.hash.substring(1);
174
- const params = new URLSearchParams(window.location.search);
175
- const keywords = params.get("k");
176
-
177
- if (anchor) {
178
- const el = document.getElementById(anchor);
179
- if (el) {
180
- el.classList.add("highlight");
181
- el.scrollIntoView({ behavior: "smooth", block: "center" });
182
- }
183
- }
184
-
185
- /* KEYWORD HIGHLIGHT */
186
- if (keywords) {
187
- const words = keywords.split("%20");
188
- highlightKeywords(words);
189
- }
190
- };
191
-
192
- /* --- KEYWORD HIGHLIGHT FUNCTION --- */
193
- function highlightKeywords(words) {
194
- const container = document.getElementById("content");
195
- let html = container.innerHTML;
196
-
197
- words.forEach(word => {
198
- if (word.length < 2) return;
199
- const regex = new RegExp(`(${decodeURIComponent(word)})`, "gi");
200
- html = html.replace(regex, `<span class="keyword">$1</span>`);
201
- });
202
-
203
- container.innerHTML = html;
204
- }
205
-
206
- /* --- SEARCH IN SIDEBAR --- */
207
- document.getElementById("searchBox").addEventListener("input", function() {
208
- const q = this.value.toLowerCase();
209
- document.querySelectorAll(".sidebar-link").forEach(link => {
210
- const txt = link.innerText.toLowerCase();
211
- link.style.display = txt.includes(q) ? "block" : "none";
212
- });
213
- });
214
-
215
- /* --- COLLAPSIBLE FUSSNOTEN --- */
216
- document.addEventListener("click", function(e) {
217
- if (e.target.classList.contains("fn-toggle")) {
218
- const content = e.target.nextElementSibling;
219
- content.style.display = content.style.display === "block" ? "none" : "block";
220
- }
221
- });
222
-
223
- /* --- BACK TO TOP BUTTON --- */
224
- window.onscroll = function() {
225
- document.getElementById("topBtn").style.display =
226
- window.scrollY > 300 ? "block" : "none";
227
- };
228
-
229
- function scrollToTop() {
230
- window.scrollTo({ top: 0, behavior: 'smooth' });
231
- }
232
-
233
- </script>
234
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  </body>
236
  </html>
237
  """
238
 
239
  # -------------------------------------------------------------------
240
- # 2. BUILD VIEWER
241
  # -------------------------------------------------------------------
242
 
243
  def build_html():
244
- print(">>> Lade Paragraphs aus Supabase...")
245
- paras = extract_paragraphs()
246
-
247
- sidebar_links = ""
248
- content_html = ""
249
-
250
- for p in paras:
 
 
 
 
251
  pid = p["abs_id"]
252
  title = p["title"]
253
- body = p["content"]
254
-
255
- # Sidebar item
256
- sidebar_links += f'<a class="sidebar-link" href="#{pid}">{title}</a>\n'
257
-
258
- # Fußnoten tách riêng (bắt đầu bằng "Fn 1", "Fn 2", ...)
259
- lines = body.split("\n")
260
- main_text = []
261
- fn_text = []
262
- in_fn = False
263
-
 
 
 
 
 
264
  for line in lines:
265
- if line.startswith("Fn "):
266
- in_fn = True
267
- if in_fn:
268
- fn_text.append(line)
269
- else:
270
- main_text.append(line)
271
-
272
- footnotes_html = ""
273
- if fn_text:
274
- footnotes_html += '<div class="fn-block">'
275
- footnotes_html += '<div class="fn-title">Fußnoten:</div>'
276
- for fn in fn_text:
277
- footnotes_html += f'<div class="fn-item">{fn}</div>'
278
- footnotes_html += "</div>"
279
-
280
- # Paragraph block
281
- content_html += f"""
282
- <div class="para" id="{pid}">
283
- <h2>{title}</h2>
284
- <div>{'<br>'.join(main_text)}</div>
285
- {footnotes_html}
286
- </div>
287
- """
288
-
289
- html = VIEW_TEMPLATE.replace("<!-- SIDEBAR_LINKS -->", sidebar_links)
290
- html = html.replace("<!-- PARAGRAPH_CONTENT -->", content_html)
291
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  return html
293
 
294
  # -------------------------------------------------------------------
295
- # 3. UPLOAD TO SUPABASE STORAGE
296
  # -------------------------------------------------------------------
297
 
298
  def upload_html():
 
 
 
299
  html = build_html()
300
-
301
- supabase.storage.from_("hg_viewer").update(
302
- "hg_clean.html",
303
- html.encode("utf-8"),
304
- {
305
- "content-type": "text/html",
306
- "x-upsert": "true"
307
- }
308
- )
309
-
310
- print("✔ hg_clean.html uploaded!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  if __name__ == "__main__":
313
- upload_html()
 
1
+ """
2
+ build_hg_viewer.py
3
+ Tạo HTML viewer cho Hochschulgesetz NRW với định dạng chuyên nghiệp
4
+ """
5
  import os
6
+ import json
7
  from supabase import create_client
8
  from dotenv import load_dotenv
9
+ import re
10
 
11
  load_dotenv()
12
 
 
18
 
19
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
20
 
21
+ def get_paragraphs_from_supabase():
22
+ """Lấy paragraphs từ Supabase"""
23
+ print(">>> Lade Paragraphs aus Supabase...")
24
+ response = supabase.table("hg_nrw").select("*").order("order_index").execute()
25
+ paragraphs = response.data
26
+
27
+ if not paragraphs:
28
+ print("❌ Keine Paragraphs in der Datenbank gefunden.")
29
+ return []
30
+
31
+ print(f"✔ {len(paragraphs)} Paragraphs geladen.")
32
+ return paragraphs
33
+
34
+ # ======== HTML TEMPLATE MIT PROFESSIONELLEM DESIGN ========
35
+ VIEW_TEMPLATE = """<!DOCTYPE html>
36
  <html lang="de">
37
  <head>
38
+ <meta charset="UTF-8">
39
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
40
+ <title>Hochschulgesetz NRW – Offizielle Viewer</title>
41
+ <style>
42
+ * {
43
+ margin: 0;
44
+ padding: 0;
45
+ box-sizing: border-box;
46
+ }
47
+
48
+ body {
49
+ font-family: 'Segoe UI', 'Roboto', 'Arial', sans-serif;
50
+ line-height: 1.6;
51
+ color: #333;
52
+ background: #f8f9fa;
53
+ display: flex;
54
+ min-height: 100vh;
55
+ }
56
+
57
+ /* ----------- SIDEBAR ------------- */
58
+ #sidebar {
59
+ width: 320px;
60
+ background: #ffffff;
61
+ border-right: 1px solid #e0e0e0;
62
+ height: 100vh;
63
+ overflow-y: auto;
64
+ position: fixed;
65
+ left: 0;
66
+ top: 0;
67
+ box-shadow: 2px 0 5px rgba(0,0,0,0.1);
68
+ z-index: 1000;
69
+ }
70
+
71
+ .sidebar-header {
72
+ padding: 20px;
73
+ background: linear-gradient(135deg, #003366 0%, #00509e 100%);
74
+ color: white;
75
+ border-bottom: 1px solid #002244;
76
+ }
77
+
78
+ .sidebar-header h2 {
79
+ font-size: 1.4rem;
80
+ font-weight: 600;
81
+ margin-bottom: 10px;
82
+ }
83
+
84
+ .sidebar-header p {
85
+ font-size: 0.9rem;
86
+ opacity: 0.9;
87
+ }
88
+
89
+ #searchBox {
90
+ width: 100%;
91
+ padding: 12px 15px;
92
+ font-size: 14px;
93
+ border: 1px solid #ddd;
94
+ border-radius: 8px;
95
+ margin: 15px;
96
+ background: #f8f9fa;
97
+ transition: all 0.3s;
98
+ }
99
+
100
+ #searchBox:focus {
101
+ outline: none;
102
+ border-color: #003366;
103
+ box-shadow: 0 0 0 3px rgba(0, 51, 102, 0.1);
104
+ }
105
+
106
+ .paragraph-list {
107
+ padding: 0 15px 20px 15px;
108
+ }
109
+
110
+ .sidebar-link {
111
+ display: block;
112
+ padding: 12px 15px;
113
+ margin-bottom: 5px;
114
+ text-decoration: none;
115
+ color: #003366;
116
+ background: #f8f9fa;
117
+ border-left: 4px solid transparent;
118
+ border-radius: 6px;
119
+ font-size: 14px;
120
+ font-weight: 500;
121
+ transition: all 0.2s;
122
+ }
123
+
124
+ .sidebar-link:hover {
125
+ background: #e3f2fd;
126
+ border-left-color: #003366;
127
+ transform: translateX(3px);
128
+ }
129
+
130
+ .sidebar-link.active {
131
+ background: #e3f2fd;
132
+ border-left-color: #003366;
133
+ font-weight: 600;
134
+ }
135
+
136
+ /* ----------- MAIN CONTENT ------------- */
137
+ #content-wrapper {
138
+ flex: 1;
139
+ margin-left: 320px;
140
+ min-height: 100vh;
141
+ }
142
+
143
+ #content {
144
+ max-width: 900px;
145
+ margin: 0 auto;
146
+ padding: 30px;
147
+ background: white;
148
+ min-height: 100vh;
149
+ box-shadow: 0 0 20px rgba(0,0,0,0.05);
150
+ }
151
+
152
+ .page-header {
153
+ margin-bottom: 40px;
154
+ padding-bottom: 20px;
155
+ border-bottom: 2px solid #003366;
156
+ }
157
+
158
+ .page-header h1 {
159
+ color: #003366;
160
+ font-size: 2.2rem;
161
+ font-weight: 700;
162
+ margin-bottom: 10px;
163
+ }
164
+
165
+ .page-header .subtitle {
166
+ color: #666;
167
+ font-size: 1.1rem;
168
+ }
169
+
170
+ /* ----------- PARAGRAPH STYLES ------------- */
171
+ .paragraph {
172
+ margin-bottom: 50px;
173
+ padding: 25px;
174
+ background: #ffffff;
175
+ border-radius: 10px;
176
+ border-left: 5px solid #003366;
177
+ box-shadow: 0 2px 10px rgba(0,0,0,0.08);
178
+ transition: all 0.3s;
179
+ }
180
 
181
+ .paragraph.highlight {
182
+ animation: highlight-pulse 2s ease;
183
+ border-left-color: #ff9800;
184
+ box-shadow: 0 0 0 3px rgba(255, 152, 0, 0.2);
185
+ }
186
+
187
+ .paragraph-header {
188
+ margin-bottom: 20px;
189
+ }
190
+
191
+ .paragraph-title {
192
+ color: #003366;
193
+ font-size: 1.6rem;
194
+ font-weight: 700;
195
+ margin-bottom: 10px;
196
+ display: flex;
197
+ align-items: center;
198
+ gap: 10px;
199
+ }
200
+
201
+ .paragraph-title .anchor {
202
+ font-size: 0.8em;
203
+ color: #666;
204
+ text-decoration: none;
205
+ opacity: 0;
206
+ transition: opacity 0.2s;
207
+ }
208
+
209
+ .paragraph:hover .anchor {
210
+ opacity: 1;
211
+ }
212
+
213
+ .paragraph-content {
214
+ font-size: 1.05rem;
215
+ line-height: 1.8;
216
+ color: #333;
217
+ }
218
+
219
+ .paragraph-content p {
220
+ margin-bottom: 15px;
221
+ }
222
+
223
+ .paragraph-content ul, .paragraph-content ol {
224
+ margin: 15px 0 15px 25px;
225
+ }
226
+
227
+ .paragraph-content li {
228
+ margin-bottom: 8px;
229
+ }
230
+
231
+ /* ----------- FOOTNOTES ------------- */
232
+ .footnotes {
233
+ margin-top: 25px;
234
+ padding-top: 20px;
235
+ border-top: 1px solid #eee;
236
+ }
237
+
238
+ .footnotes-title {
239
+ font-weight: 600;
240
+ color: #666;
241
+ margin-bottom: 15px;
242
+ font-size: 0.95rem;
243
+ }
244
+
245
+ .footnote-item {
246
+ margin-bottom: 10px;
247
+ padding-left: 15px;
248
+ border-left: 2px solid #ddd;
249
+ font-size: 0.9rem;
250
+ color: #555;
251
+ }
252
+
253
+ /* ----------- HIGHLIGHT ANIMATION ------------- */
254
+ @keyframes highlight-pulse {
255
+ 0% { background-color: #fff8e1; }
256
+ 70% { background-color: #fff8e1; }
257
+ 100% { background-color: #ffffff; }
258
+ }
259
+
260
+ /* ----------- RESPONSIVE ------------- */
261
+ @media (max-width: 992px) {
262
+ body {
263
+ flex-direction: column;
264
+ }
265
+
266
+ #sidebar {
267
+ position: static;
268
+ width: 100%;
269
+ height: auto;
270
+ max-height: 50vh;
271
+ }
272
+
273
+ #content-wrapper {
274
+ margin-left: 0;
275
+ }
276
+ }
277
+
278
+ /* ----------- BACK TO TOP ------------- */
279
+ #back-to-top {
280
+ position: fixed;
281
+ bottom: 30px;
282
+ right: 30px;
283
+ width: 50px;
284
+ height: 50px;
285
+ background: #003366;
286
+ color: white;
287
+ border-radius: 50%;
288
+ display: none;
289
+ justify-content: center;
290
+ align-items: center;
291
+ cursor: pointer;
292
+ box-shadow: 0 2px 10px rgba(0,0,0,0.2);
293
+ transition: all 0.3s;
294
+ z-index: 1000;
295
+ }
296
+
297
+ #back-to-top:hover {
298
+ background: #00509e;
299
+ transform: translateY(-3px);
300
+ }
301
+
302
+ /* ----------- KEYWORD HIGHLIGHT ------------- */
303
+ .keyword-highlight {
304
+ background: #fff9c4;
305
+ padding: 2px 4px;
306
+ border-radius: 3px;
307
+ font-weight: 500;
308
+ }
309
+
310
+ /* ----------- PRINT STYLES ------------- */
311
+ @media print {
312
+ #sidebar {
313
+ display: none;
314
+ }
315
+
316
+ #content-wrapper {
317
+ margin-left: 0;
318
+ }
319
+
320
+ #back-to-top {
321
+ display: none !important;
322
+ }
323
+ }
324
+ </style>
325
  </head>
326
  <body>
327
+ <!-- SIDEBAR -->
328
+ <div id="sidebar">
329
+ <div class="sidebar-header">
330
+ <h2>Hochschulgesetz NRW</h2>
331
+ <p>Inhaltsverzeichnis</p>
332
+ </div>
333
+
334
+ <input type="text" id="searchBox" placeholder="Paragraph suchen (z.B. §1 oder Text)..."
335
+ title="Geben Sie eine Paragraphennummer oder Suchbegriff ein">
336
+
337
+ <div class="paragraph-list" id="paragraphList">
338
+ <!-- SIDEBAR_LINKS -->
339
+ </div>
340
+ </div>
341
+
342
+ <!-- MAIN CONTENT -->
343
+ <div id="content-wrapper">
344
+ <div id="content">
345
+ <div class="page-header">
346
+ <h1>Hochschulgesetz Nordrhein-Westfalen</h1>
347
+ <p class="subtitle">Gesetz über die Hochschulen des Landes Nordrhein-Westfalen (Hochschulgesetz – HG)</p>
348
+ <p class="subtitle" style="font-size: 0.9rem; color: #777;">
349
+ Stand: Aktuelle Fassung | Quelle: <a href="https://recht.nrw.de" target="_blank">recht.nrw.de</a>
350
+ </p>
351
+ </div>
352
+
353
+ <div id="paragraphContent">
354
+ <!-- PARAGRAPH_CONTENT -->
355
+ </div>
356
+ </div>
357
+ </div>
358
+
359
+ <!-- BACK TO TOP BUTTON -->
360
+ <div id="back-to-top" title="Zum Anfang">
361
+ ↑
362
+ </div>
363
+
364
+ <script>
365
+ // ========== GLOBAL VARIABLES ==========
366
+ let currentParagraphId = '';
367
+ let searchTimeout = null;
368
+
369
+ // ========== INITIALIZATION ==========
370
+ document.addEventListener('DOMContentLoaded', function() {
371
+ // Check for URL hash
372
+ const hash = window.location.hash.substring(1);
373
+ const urlParams = new URLSearchParams(window.location.search);
374
+ const keywords = urlParams.get('keywords');
375
+
376
+ if (hash) {
377
+ scrollToParagraph(hash);
378
+ }
379
+
380
+ if (keywords) {
381
+ highlightKeywords(decodeURIComponent(keywords));
382
+ }
383
+
384
+ setupEventListeners();
385
+ updateActiveLink();
386
+ });
387
+
388
+ // ========== SCROLL TO PARAGRAPH ==========
389
+ function scrollToParagraph(paragraphId, highlight = true) {
390
+ const element = document.getElementById(paragraphId);
391
+ if (!element) return;
392
+
393
+ // Remove previous highlight
394
+ document.querySelectorAll('.paragraph.highlight').forEach(el => {
395
+ el.classList.remove('highlight');
396
+ });
397
+
398
+ // Calculate position for smooth scroll
399
+ const sidebarHeight = document.getElementById('sidebar').offsetHeight;
400
+ const elementPosition = element.getBoundingClientRect().top;
401
+ const offsetPosition = elementPosition + window.pageYOffset - 100;
402
+
403
+ // Smooth scroll
404
+ window.scrollTo({
405
+ top: offsetPosition,
406
+ behavior: 'smooth'
407
+ });
408
+
409
+ // Highlight if requested
410
+ if (highlight) {
411
+ setTimeout(() => {
412
+ element.classList.add('highlight');
413
+
414
+ // Update URL without page reload
415
+ history.replaceState(null, null, `#${paragraphId}`);
416
+
417
+ // Update active link in sidebar
418
+ updateActiveLink(paragraphId);
419
+ }, 300);
420
+ }
421
+ }
422
 
423
+ // ========== SEARCH FUNCTIONALITY ==========
424
+ function setupEventListeners() {
425
+ const searchBox = document.getElementById('searchBox');
426
+
427
+ // Search input with debounce
428
+ searchBox.addEventListener('input', function() {
429
+ clearTimeout(searchTimeout);
430
+ searchTimeout = setTimeout(() => {
431
+ filterParagraphs(this.value);
432
+ }, 300);
433
+ });
434
+
435
+ // Enter key to jump to first result
436
+ searchBox.addEventListener('keypress', function(e) {
437
+ if (e.key === 'Enter') {
438
+ e.preventDefault();
439
+ jumpToFirstResult(this.value);
440
+ }
441
+ });
442
+
443
+ // Back to top button
444
+ const backToTop = document.getElementById('back-to-top');
445
+ backToTop.addEventListener('click', function() {
446
+ window.scrollTo({
447
+ top: 0,
448
+ behavior: 'smooth'
449
+ });
450
+ });
451
+
452
+ // Show/hide back to top button
453
+ window.addEventListener('scroll', function() {
454
+ if (window.scrollY > 500) {
455
+ backToTop.style.display = 'flex';
456
+ } else {
457
+ backToTop.style.display = 'none';
458
+ }
459
+
460
+ updateActiveLink();
461
+ });
462
+ }
463
+
464
+ function filterParagraphs(searchTerm) {
465
+ const links = document.querySelectorAll('.sidebar-link');
466
+ const searchLower = searchTerm.toLowerCase();
467
+ let hasVisible = false;
468
+
469
+ links.forEach(link => {
470
+ const text = link.textContent.toLowerCase();
471
+ if (text.includes(searchLower)) {
472
+ link.style.display = 'block';
473
+ hasVisible = true;
474
+ } else {
475
+ link.style.display = 'none';
476
+ }
477
+ });
478
+
479
+ // Update search box placeholder based on results
480
+ const searchBox = document.getElementById('searchBox');
481
+ if (!hasVisible && searchTerm) {
482
+ searchBox.title = 'Keine Ergebnisse gefunden';
483
+ } else {
484
+ searchBox.title = '';
485
+ }
486
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
+ function jumpToFirstResult(searchTerm) {
489
+ const links = document.querySelectorAll('.sidebar-link');
490
+ const searchLower = searchTerm.toLowerCase();
491
+
492
+ for (const link of links) {
493
+ if (link.style.display !== 'none') {
494
+ const paragraphId = link.getAttribute('href').substring(1);
495
+ scrollToParagraph(paragraphId);
496
+ break;
497
+ }
498
+ }
499
+ }
500
+
501
+ // ========== HIGHLIGHT KEYWORDS ==========
502
+ function highlightKeywords(keywords) {
503
+ const content = document.getElementById('paragraphContent');
504
+ const searchTerms = keywords.split(/[\s,]+/).filter(term => term.length > 2);
505
+
506
+ searchTerms.forEach(term => {
507
+ const regex = new RegExp(`(${escapeRegExp(term)})`, 'gi');
508
+ content.innerHTML = content.innerHTML.replace(regex,
509
+ '<span class="keyword-highlight">$1</span>');
510
+ });
511
+ }
512
+
513
+ function escapeRegExp(string) {
514
+ return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
515
+ }
516
+
517
+ // ========== UPDATE ACTIVE LINK ==========
518
+ function updateActiveLink(forceId = null) {
519
+ const links = document.querySelectorAll('.sidebar-link');
520
+ const paragraphs = document.querySelectorAll('.paragraph');
521
+
522
+ let activeId = forceId;
523
+
524
+ if (!activeId) {
525
+ // Find paragraph in viewport
526
+ const viewportHeight = window.innerHeight;
527
+ const viewportMiddle = window.scrollY + (viewportHeight / 2);
528
+
529
+ for (const paragraph of paragraphs) {
530
+ const rect = paragraph.getBoundingClientRect();
531
+ const paragraphTop = window.pageYOffset + rect.top;
532
+ const paragraphBottom = paragraphTop + rect.height;
533
+
534
+ if (viewportMiddle >= paragraphTop && viewportMiddle <= paragraphBottom) {
535
+ activeId = paragraph.id;
536
+ break;
537
+ }
538
+ }
539
+ }
540
+
541
+ // Update active state
542
+ links.forEach(link => {
543
+ const href = link.getAttribute('href').substring(1);
544
+ if (href === activeId) {
545
+ link.classList.add('active');
546
+ } else {
547
+ link.classList.remove('active');
548
+ }
549
+ });
550
+ }
551
+
552
+ // ========== FORMAT CONTENT ==========
553
+ function formatContent(text) {
554
+ // Replace multiple newlines with paragraphs
555
+ return text.split('\n\n').map(paragraph => {
556
+ if (paragraph.trim()) {
557
+ return `<p>${paragraph.trim()}</p>`;
558
+ }
559
+ return '';
560
+ }).join('');
561
+ }
562
+
563
+ // ========== COPY TO CLIPBOARD ==========
564
+ function copyParagraphLink(paragraphId) {
565
+ const url = window.location.origin + window.location.pathname + '#' + paragraphId;
566
+ navigator.clipboard.writeText(url).then(() => {
567
+ // Show temporary notification
568
+ const notification = document.createElement('div');
569
+ notification.textContent = 'Link kopiert!';
570
+ notification.style.cssText = `
571
+ position: fixed;
572
+ top: 20px;
573
+ right: 20px;
574
+ background: #4CAF50;
575
+ color: white;
576
+ padding: 10px 20px;
577
+ border-radius: 5px;
578
+ z-index: 10000;
579
+ animation: fadeInOut 2s ease;
580
+ `;
581
+ document.body.appendChild(notification);
582
+
583
+ setTimeout(() => {
584
+ document.body.removeChild(notification);
585
+ }, 2000);
586
+ });
587
+ }
588
+ </script>
589
  </body>
590
  </html>
591
  """
592
 
593
  # -------------------------------------------------------------------
594
+ # BUILD VIEWER
595
  # -------------------------------------------------------------------
596
 
597
  def build_html():
598
+ """Xây dựng HTML viewer từ dữ liệu Supabase"""
599
+ paragraphs = get_paragraphs_from_supabase()
600
+
601
+ if not paragraphs:
602
+ print("❌ Keine Paragraphs zum Erstellen des Viewers verfügbar.")
603
+ return None
604
+
605
+ sidebar_links = []
606
+ content_html = []
607
+
608
+ for p in paragraphs:
609
  pid = p["abs_id"]
610
  title = p["title"]
611
+ content = p["content"]
612
+
613
+ # Tạo link cho sidebar
614
+ sidebar_link = f'''
615
+ <a class="sidebar-link" href="#{pid}" onclick="scrollToParagraph('{pid}'); return false;">
616
+ {title}
617
+ </a>
618
+ '''
619
+ sidebar_links.append(sidebar_link)
620
+
621
+ # Tạo nội dung paragraph
622
+ # Phân loại footnote và nội dung chính
623
+ lines = content.split('\n')
624
+ main_content = []
625
+ footnotes = []
626
+
627
  for line in lines:
628
+ line = line.strip()
629
+ if line.lower().startswith('fn ') or line.lower().startswith('fussnote'):
630
+ footnotes.append(line)
631
+ elif line:
632
+ main_content.append(line)
633
+
634
+ # Format main content
635
+ formatted_content = '<br>'.join(main_content)
636
+
637
+ # Format footnotes
638
+ footnotes_html = ''
639
+ if footnotes:
640
+ footnotes_html = '''
641
+ <div class="footnotes">
642
+ <div class="footnotes-title">Fußnoten:</div>
643
+ ''' + ''.join(f'<div class="footnote-item">{fn}</div>' for fn in footnotes) + '''
644
+ </div>
645
+ '''
646
+
647
+ # Tạo paragraph block
648
+ paragraph_html = f'''
649
+ <div class="paragraph" id="{pid}">
650
+ <div class="paragraph-header">
651
+ <h3 class="paragraph-title">
652
+ {title}
653
+ <a href="#{pid}" class="anchor" onclick="copyParagraphLink('{pid}'); return false;"
654
+ title="Link zu diesem Paragraph kopieren">🔗</a>
655
+ </h3>
656
+ </div>
657
+ <div class="paragraph-content">
658
+ {formatted_content}
659
+ </div>
660
+ {footnotes_html}
661
+ </div>
662
+ '''
663
+ content_html.append(paragraph_html)
664
+
665
+ # Điền nội dung vào template
666
+ html = VIEW_TEMPLATE
667
+ html = html.replace('<!-- SIDEBAR_LINKS -->', '\n'.join(sidebar_links))
668
+ html = html.replace('<!-- PARAGRAPH_CONTENT -->', '\n'.join(content_html))
669
+
670
+ # Thêm metadata
671
+ html = html.replace(
672
+ 'Aktuelle Fassung',
673
+ f'Aktuelle Fassung - {len(paragraphs)} Paragraphs'
674
+ )
675
+
676
  return html
677
 
678
  # -------------------------------------------------------------------
679
+ # UPLOAD TO SUPABASE STORAGE
680
  # -------------------------------------------------------------------
681
 
682
  def upload_html():
683
+ """Tạo và tải lên HTML viewer"""
684
+ print(">>> Baue HTML Viewer...")
685
+
686
  html = build_html()
687
+ if not html:
688
+ print("❌ Konnte HTML nicht erstellen.")
689
+ return
690
+
691
+ try:
692
+ # Tạo bucket nếu chưa tồn tại
693
+ try:
694
+ supabase.storage.get_bucket("hg_viewer")
695
+ except:
696
+ supabase.storage.create_bucket("hg_viewer", {
697
+ "public": True,
698
+ "file_size_limit": 10485760 # 10MB
699
+ })
700
+
701
+ # Upload HTML file
702
+ supabase.storage.from_("hg_viewer").upload(
703
+ "hg_viewer.html",
704
+ html.encode("utf-8"),
705
+ {
706
+ "content-type": "text/html",
707
+ "cache-control": "public, max-age=3600"
708
+ }
709
+ )
710
+
711
+ print("✅ hg_viewer.html erfolgreich hochgeladen!")
712
+ print(f"📁 URL: {SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html")
713
+
714
+ except Exception as e:
715
+ print(f"❌ Fehler beim Upload: {e}")
716
 
717
  if __name__ == "__main__":
718
+ upload_html()
load_documents.py CHANGED
@@ -1,130 +1,200 @@
1
  """
2
- BƯỚC 1: LOAD DOCUMENTS
3
- -----------------------
4
- Debug-full version
5
-
6
- - Lädt Prüfungsordnung (PDF) seitenweise.
7
- - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
8
- und zerlegt es in einzelne Absätze (Document pro <p>).
9
  """
10
 
11
  from huggingface_hub import hf_hub_download, list_repo_files
12
  from langchain_community.document_loaders import PyPDFLoader
13
  from langchain_core.documents import Document
14
  from bs4 import BeautifulSoup
 
 
 
15
 
16
  DATASET = "Nguyen5/docs"
17
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
18
- HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
- def _load_hg_paragraph_documents(html_path: str):
21
  """
22
- Liest das generierte Hochschulgesetz-HTML ein und erzeugt
23
- pro <p>-Element einen LangChain-Document mit:
24
- - page_content = Text des Absatzes
25
- - metadata:
26
- source = "Hochschulgesetz NRW (HTML)"
27
- filename = HTML_FILE
28
- paragraph_id = id-Attribut (z.B. 'hg_abs_12'), falls vorhanden
29
  """
30
- with open(html_path, "r", encoding="utf-8") as f:
31
- html = f.read()
32
-
33
- soup = BeautifulSoup(html, "html.parser")
34
- docs = []
35
-
36
- for p in soup.find_all("p"):
37
- text = p.get_text(" ", strip=True)
38
- if not text:
39
- continue
40
-
41
- pid = p.get("id")
42
-
43
- metadata = {
44
- "source": "Hochschulgesetz NRW (HTML)",
45
- "filename": HTML_FILE,
46
- }
47
- if pid:
48
- metadata["paragraph_id"] = pid
49
-
50
- docs.append(Document(page_content=text, metadata=metadata))
51
-
52
- print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
53
- return docs
54
-
55
- def load_documents():
56
- print("=== START: load_documents() ===\n")
57
-
58
- # -------------------------
59
- # Check files in dataset
60
- # -------------------------
61
- print(">>> Checking dataset file list from HuggingFace...")
62
- files = list_repo_files(DATASET, repo_type="dataset")
63
- print("Files in dataset:", files, "\n")
64
-
65
- docs = []
66
 
67
- # -------------------------
68
- # Load PDF
69
- # -------------------------
70
- print(">>> Step 1: Download PDF from HuggingFace...")
71
  try:
72
  pdf_path = hf_hub_download(
73
  repo_id=DATASET,
74
  filename=PDF_FILE,
75
  repo_type="dataset",
76
  )
77
- print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
78
- except Exception as e:
79
- print("ERROR downloading PDF:", e)
80
- return []
81
-
82
- print(">>> Step 1.1: Loading PDF pages...")
83
- try:
84
  pdf_docs = PyPDFLoader(pdf_path).load()
85
- print(f"Loaded {len(pdf_docs)} PDF pages.\n")
 
 
 
 
 
 
 
 
 
 
 
 
86
  except Exception as e:
87
- print("ERROR loading PDF:", e)
88
  return []
89
 
90
- for d in pdf_docs:
91
- d.metadata["source"] = "Prüfungsordnung (PDF)"
92
- d.metadata["filename"] = PDF_FILE
93
-
94
- docs.extend(pdf_docs)
95
-
96
- # -------------------------
97
- # Load HTML (Hochschulgesetz NRW)
98
- # -------------------------
99
- print(">>> Step 2: Download HTML from HuggingFace...")
 
 
 
 
100
  try:
101
  html_path = hf_hub_download(
102
  repo_id=DATASET,
103
  filename=HTML_FILE,
104
  repo_type="dataset",
105
  )
106
- print(f"Downloaded HTML to local cache:\n{html_path}\n")
107
- except Exception as e:
108
- print("ERROR downloading HTML:", e)
109
- return docs
110
-
111
- print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
112
- try:
113
  html_docs = _load_hg_paragraph_documents(html_path)
 
 
114
  except Exception as e:
115
- print("ERROR loading / parsing HTML:", e)
116
- return docs
117
-
118
- docs.extend(html_docs)
119
-
120
- print("=== DONE: load_documents() ===\n")
121
- return docs
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  if __name__ == "__main__":
124
- print("\n=== Running load_documents.py directly ===\n")
125
  docs = load_documents()
126
- print(f"\n>>> TOTAL documents loaded: {len(docs)}")
127
-
128
- if len(docs):
129
- print("\nExample metadata from 1st document:")
130
- print(docs[0].metadata)
 
1
  """
2
+ load_documents.py
3
+ Cải thiện việc load tài liệu với xử lý lỗi tốt hơn
 
 
 
 
 
4
  """
5
 
6
  from huggingface_hub import hf_hub_download, list_repo_files
7
  from langchain_community.document_loaders import PyPDFLoader
8
  from langchain_core.documents import Document
9
  from bs4 import BeautifulSoup
10
+ import requests
11
+ import re
12
+ from typing import List, Optional
13
 
14
  DATASET = "Nguyen5/docs"
15
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
16
+ HTML_FILE = "Hochschulgesetz_NRW.html"
17
+
18
+ def clean_html_content(text: str) -> str:
19
+ """Làm sạch nội dung HTML"""
20
+ # Loại bỏ khoảng trắng thừa
21
+ text = re.sub(r'\s+', ' ', text)
22
+ # Chuẩn hóa dấu câu
23
+ text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
24
+ # Đảm bảo chữ cái đầu câu viết hoa
25
+ sentences = text.split('. ')
26
+ sentences = [s.strip().capitalize() for s in sentences if s.strip()]
27
+ return '. '.join(sentences)
28
+
29
+ def load_recht_nrw_direct() -> List[Document]:
30
+ """Tải trực tiếp từ recht.nrw.de"""
31
+ print(">>> Lade Hochschulgesetz NRW direkt von recht.nrw.de...")
32
+
33
+ url = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
34
+
35
+ try:
36
+ headers = {
37
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
38
+ }
39
+ response = requests.get(url, headers=headers, timeout=60)
40
+ response.raise_for_status()
41
+
42
+ soup = BeautifulSoup(response.text, 'html.parser')
43
+ docs = []
44
+
45
+ # Tìm tất cả các paragraph
46
+ for i, element in enumerate(soup.find_all(['p', 'div', 'td'])):
47
+ text = element.get_text(" ", strip=True)
48
+
49
+ # Chỉ lấy các phần có chứa §
50
+ if '§' in text:
51
+ # Tách title và content
52
+ lines = text.split('\n')
53
+ title = lines[0].strip() if lines else f"§ {i+1}"
54
+ content = " ".join(lines[1:]) if len(lines) > 1 else text
55
+
56
+ metadata = {
57
+ "source": "Hochschulgesetz NRW (Website)",
58
+ "filename": "recht_nrw_direct.html",
59
+ "paragraph_id": f"hg_direct_{i+1}",
60
+ "url": url
61
+ }
62
+
63
+ doc = Document(
64
+ page_content=clean_html_content(content),
65
+ metadata=metadata
66
+ )
67
+ docs.append(doc)
68
+
69
+ print(f"✅ {len(docs)} Paragraphs direkt von recht.nrw.de geladen.")
70
+ return docs
71
+
72
+ except Exception as e:
73
+ print(f"❌ Fehler beim Laden von recht.nrw.de: {e}")
74
+ return []
75
 
76
+ def _load_hg_paragraph_documents(html_path: str) -> List[Document]:
77
  """
78
+ Lädt Paragraphs aus dem gespeicherten HTML
 
 
 
 
 
 
79
  """
80
+ try:
81
+ with open(html_path, "r", encoding="utf-8") as f:
82
+ html = f.read()
83
+
84
+ soup = BeautifulSoup(html, "html.parser")
85
+ docs = []
86
+
87
+ # Suche nach allen relevanten Inhalten
88
+ for i, p in enumerate(soup.find_all(['p', 'div', 'section'])):
89
+ text = p.get_text(" ", strip=True)
90
+ if not text or len(text) < 10:
91
+ continue
92
+
93
+ # Check if it's a paragraph
94
+ if '§' in text or 'Artikel' in text:
95
+ pid = p.get("id", f"hg_para_{i+1}")
96
+
97
+ metadata = {
98
+ "source": "Hochschulgesetz NRW (HTML)",
99
+ "filename": HTML_FILE,
100
+ "paragraph_id": pid,
101
+ "type": "paragraph"
102
+ }
103
+
104
+ docs.append(Document(
105
+ page_content=clean_html_content(text),
106
+ metadata=metadata
107
+ ))
108
+
109
+ print(f"✅ {len(docs)} Paragraphs aus HTML geladen.")
110
+ return docs
111
+
112
+ except Exception as e:
113
+ print(f"❌ Fehler beim Laden des HTML: {e}")
114
+ return []
 
115
 
116
+ def load_pdf_documents() -> List[Document]:
117
+ """Lädt PDF-Dokumente"""
118
+ print(">>> Lade PDF-Dokumente...")
119
+
120
  try:
121
  pdf_path = hf_hub_download(
122
  repo_id=DATASET,
123
  filename=PDF_FILE,
124
  repo_type="dataset",
125
  )
126
+ print(f"✅ PDF heruntergeladen: {pdf_path}")
127
+
128
+ # Load PDF with PyPDFLoader
 
 
 
 
129
  pdf_docs = PyPDFLoader(pdf_path).load()
130
+
131
+ # Enhance metadata
132
+ for i, doc in enumerate(pdf_docs):
133
+ doc.metadata.update({
134
+ "source": "Prüfungsordnung (PDF)",
135
+ "filename": PDF_FILE,
136
+ "document_type": "exam_regulation",
137
+ "chunk_index": i
138
+ })
139
+
140
+ print(f"✅ {len(pdf_docs)} Seiten aus PDF geladen.")
141
+ return pdf_docs
142
+
143
  except Exception as e:
144
+ print(f"❌ Fehler beim Laden des PDF: {e}")
145
  return []
146
 
147
+ def load_documents() -> List[Document]:
148
+ """
149
+ Hauptfunktion zum Laden aller Dokumente
150
+ """
151
+ print("=== START: load_documents() ===\n")
152
+
153
+ all_docs = []
154
+
155
+ # 1. Load PDF documents
156
+ pdf_docs = load_pdf_documents()
157
+ all_docs.extend(pdf_docs)
158
+
159
+ # 2. Try loading from dataset HTML
160
+ print(">>> Versuche, HTML aus Dataset zu laden...")
161
  try:
162
  html_path = hf_hub_download(
163
  repo_id=DATASET,
164
  filename=HTML_FILE,
165
  repo_type="dataset",
166
  )
167
+ print(f"✅ HTML heruntergeladen: {html_path}")
168
+
 
 
 
 
 
169
  html_docs = _load_hg_paragraph_documents(html_path)
170
+ all_docs.extend(html_docs)
171
+
172
  except Exception as e:
173
+ print(f"⚠️ Konnte HTML nicht aus Dataset laden: {e}")
174
+
175
+ # 3. Fallback: Load directly from website
176
+ print(">>> Fallback: Lade direkt von recht.nrw.de...")
177
+ web_docs = load_recht_nrw_direct()
178
+ all_docs.extend(web_docs)
179
+
180
+ print(f"\n=== DONE: {len(all_docs)} Dokumente geladen ===")
181
+
182
+ # Print summary
183
+ pdf_count = len([d for d in all_docs if "PDF" in d.metadata.get("source", "")])
184
+ html_count = len([d for d in all_docs if "HTML" in d.metadata.get("source", "")])
185
+ web_count = len([d for d in all_docs if "Website" in d.metadata.get("source", "")])
186
+
187
+ print(f"📊 Zusammenfassung:")
188
+ print(f" - PDF-Seiten: {pdf_count}")
189
+ print(f" - HTML-Paragraphs: {html_count}")
190
+ print(f" - Web-Paragraphs: {web_count}")
191
+
192
+ return all_docs
193
 
194
  if __name__ == "__main__":
 
195
  docs = load_documents()
196
+
197
+ if docs:
198
+ print(f"\nErstes Dokument (Beispiel):")
199
+ print(f"Content: {docs[0].page_content[:200]}...")
200
+ print(f"Metadata: {docs[0].metadata}")
rag_pipeline.py CHANGED
@@ -1,194 +1,197 @@
1
  """
2
- RAG PIPELINE – Version 26.11 (ohne Modi, stabil, juristisch korrekt)
3
  """
4
 
5
  from typing import List, Dict, Any, Tuple
6
  from langchain_core.messages import SystemMessage, HumanMessage
7
- from load_documents import DATASET, PDF_FILE, HTML_FILE
 
8
 
9
- # -------------------------------------------------------------------
10
  # URLs für Quellen
11
- # -------------------------------------------------------------------
12
-
13
- # Direktes PDF im Dataset (für #page)
14
- PDF_BASE_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{PDF_FILE}"
15
-
16
- # Hochschulgesetz-HTML im Dataset (enthält <p id="hg_abs_X"> …)
17
- LAW_DATASET_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{HTML_FILE}"
18
-
19
- # Offizielle Recht.NRW-Druckversion (für Viewer im Frontend)
20
- LAW_URL = (
21
- "https://recht.nrw.de/lmi/owa/br_bes_text?"
22
- "print=1&anw_nr=2&gld_nr=2&ugl_nr=221&val=28364&ver=0&"
23
- "aufgehoben=N&keyword=&bes_id=28364&show_preview=1"
24
- )
25
-
26
- MAX_CHARS = 900
27
-
28
- # -----------------------------
29
- # Quellen formatieren
30
- # -----------------------------
31
-
32
- def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
33
  """
34
- Erzeugt eine Liste strukturierter Quellen-Infos:
35
-
36
- [
37
- {
38
- "id": 1,
39
- "source": "Prüfungsordnung (PDF)" / "Hochschulgesetz NRW (HTML)",
40
- "page": 3, # nur bei PDF
41
- "url": "...", # direkter Klick-Link
42
- "snippet": "Erste 300 Zeichen des Chunks..."
43
- },
44
- ...
45
- ]
46
  """
47
- srcs = []
48
- for i, d in enumerate(docs):
49
- meta = d.metadata
50
- src = meta.get("source", "")
51
- page = meta.get("page")
52
- snippet = d.page_content[:300].replace("\n", " ")
53
-
54
- # PDF-Link
55
- if "Prüfungsordnung" in src:
 
 
 
 
 
 
 
56
  if isinstance(page, int):
57
- # PyPDFLoader: page ist 0-basiert, Anzeige 1-basiert
58
  url = f"{PDF_BASE_URL}#page={page + 1}"
59
  else:
60
  url = PDF_BASE_URL
61
-
62
- # NRW-Gesetz (HTML im Dataset mit Absatz-IDs)
63
- elif "Hochschulgesetz" in src:
64
- para_id = meta.get("paragraph_id")
65
  if para_id:
66
- # Klick führt direkt zum Absatz im Dataset-HTML
67
- url = f"{LAW_DATASET_URL}#{para_id}"
68
  else:
69
- # Fallback: offizielle Druckversion (ohne Absatz-Anker)
70
- url = LAW_URL
71
- page = None # keine Seitenangabe für Gesetz-HTML
72
-
73
- else:
74
- url = None
75
-
76
- srcs.append(
77
- {
78
- "id": i + 1,
79
- "source": src,
80
- "page": page + 1 if isinstance(page, int) else None,
81
- "url": url,
82
- "snippet": snippet,
83
- }
84
- )
85
- return srcs
86
-
87
- # -----------------------------
88
- # Kontext formatieren
89
- # -----------------------------
90
-
91
- def format_context(docs):
92
  if not docs:
93
- return "(Kein relevanter Kontext im Dokument gefunden.)"
94
-
95
- out = []
96
- for i, d in enumerate(docs):
97
- txt = d.page_content[:MAX_CHARS]
98
- src = d.metadata.get("source")
99
- page = d.metadata.get("page")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- if "Prüfungsordnung" in (src or "") and isinstance(page, int):
102
- src_str = f"{src}, Seite {page + 1}"
103
- else:
104
- src_str = src
105
 
106
- out.append(f"[KONTEXT {i+1}] ({src_str})\n{txt}")
107
 
108
- return "\n\n".join(out)
 
 
 
109
 
110
- # -----------------------------
111
- # Systemprompt verschärft
112
- # -----------------------------
 
113
 
114
- SYSTEM_PROMPT = """
115
- Du bist ein hochpräziser juristischer Chatbot für Prüfungsrecht
116
- mit Zugriff nur auf:
 
117
 
118
- - die Prüfungsordnung (als PDF) und
119
- - das Hochschulgesetz NRW (als HTML aus der offiziellen Druckversion).
 
 
120
 
121
- Strenge Regeln:
 
 
122
 
123
- 1. Antworte ausschließlich anhand des bereitgestellten Kontextes
124
- (KONTEXT-Abschnitte). Wenn die Information nicht im Kontext steht,
125
- sage ausdrücklich, dass dies aus den vorliegenden Dokumenten nicht
126
- hervorgeht und du dazu nichts Sicheres sagen kannst.
127
 
128
- 2.
129
- Keine Spekulationen, keine Vermutungen.
130
 
131
- 3. Antworte in zusammenhängenden, ganzen Sätzen. Verwende keine Mischung aus Deutsch und Englisch.
 
 
 
 
 
132
 
133
- 4. Nenne, soweit aus dem Kontext erkennbar,
134
- - die rechtliche Grundlage (z.B. Paragraph, Artikel),
135
- - das Dokument (Prüfungsordnung / Hochschulgesetz NRW),
136
- - die Seite (bei der Prüfungsordnung), wenn im Kontext vorhanden.
137
 
138
- 5. Füge KEINE externen Informationen hinzu, z.B. aus anderen Gesetzen,
139
- Webseiten oder allgemeinem Wissen. Nur das, was im Kontext steht,
140
- darf in der Antwort verwendet werden.
141
 
142
- Wenn der Kontext keine eindeutige Antwort zulässt, erkläre klar,
143
- warum keine sichere Antwort möglich ist und welche Informationen
144
- im Dokument fehlen.
145
- """
 
 
 
 
146
 
147
- # -----------------------------
148
- # Hauptfunktion
149
- # -----------------------------
150
 
151
  def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
152
  """
153
- Haupt-RAG-Funktion:
154
-
155
- - ruft retriever.invoke(question) auf,
156
- - baut einen präzisen Prompt mit KONTEXT,
157
- - ruft LLM auf,
158
- - gibt Antworttext + Quellenliste zurück.
159
  """
160
- # 1. Dokumente holen
161
  docs = retriever.invoke(question)
 
 
162
  context_str = format_context(docs)
163
-
164
- # 2. Prompt bauen
165
- human = f"""
166
- FRAGE:
167
- {question}
168
-
169
- NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
170
- {context_str}
171
-
172
- AUFGABE:
173
- Formuliere eine juristisch korrekte, gut verständliche Antwort
174
- ausschließlich anhand des obigen Kontextes.
175
-
176
- - Wenn der Kontext aus den Dokumenten eine klare Antwort erlaubt,
177
- erläutere diese strukturiert und in vollständigen Sätzen.
178
- - Wenn der Kontext KEINE klare Antwort erlaubt oder wichtige Informationen
179
- fehlen, erkläre das offen und formuliere KEINE Vermutung.
180
- """
181
-
182
- msgs = [
183
  SystemMessage(content=SYSTEM_PROMPT),
184
- HumanMessage(content=human),
185
  ]
186
-
187
- # 3. LLM aufrufen
188
- result = chat_model.invoke(msgs)
189
- answer_text = result.content.strip()
190
-
191
- # 4. Quellenliste bauen
 
 
 
 
 
 
 
 
192
  sources = build_sources_metadata(docs)
193
-
194
- return answer_text, sources
 
1
  """
2
+ RAG PIPELINE – Verbesserte Version mit präzisen Prompts
3
  """
4
 
5
  from typing import List, Dict, Any, Tuple
6
  from langchain_core.messages import SystemMessage, HumanMessage
7
+ from langchain_core.documents import Document
8
+ import re
9
 
 
10
  # URLs für Quellen
11
+ PDF_BASE_URL = "https://huggingface.co/datasets/Nguyen5/docs/resolve/main/f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
12
+ LAW_VIEWER_URL = "https://YOUR_SUPABASE_URL/storage/v1/object/public/hg_viewer/hg_viewer.html"
13
+
14
+ MAX_CHARS = 1000
15
+
16
+ def format_chunk_content(chunk: Document) -> str:
17
+ """Format chunk content for better readability"""
18
+ content = chunk.page_content
19
+
20
+ # Remove excessive whitespace
21
+ content = re.sub(r'\s+', ' ', content)
22
+
23
+ # Ensure proper sentence endings
24
+ if not content.strip().endswith(('.', '!', '?')):
25
+ content = content.strip() + '.'
26
+
27
+ return content[:MAX_CHARS]
28
+
29
+ def build_sources_metadata(docs: List[Document]) -> List[Dict[str, Any]]:
 
 
 
30
  """
31
+ Erzeugt strukturierte Quellen-Informationen
 
 
 
 
 
 
 
 
 
 
 
32
  """
33
+ sources = []
34
+
35
+ for i, doc in enumerate(docs, 1):
36
+ metadata = doc.metadata
37
+ source_type = metadata.get("source", "")
38
+ page = metadata.get("page")
39
+ para_id = metadata.get("paragraph_id", "")
40
+
41
+ # Prepare snippet
42
+ snippet = format_chunk_content(doc)
43
+ if len(snippet) > 300:
44
+ snippet = snippet[:297] + "..."
45
+
46
+ # Determine URL
47
+ url = None
48
+ if "PDF" in source_type:
49
  if isinstance(page, int):
 
50
  url = f"{PDF_BASE_URL}#page={page + 1}"
51
  else:
52
  url = PDF_BASE_URL
53
+
54
+ elif "HTML" in source_type or "Website" in source_type:
 
 
55
  if para_id:
56
+ url = f"{LAW_VIEWER_URL}#{para_id}"
 
57
  else:
58
+ url = LAW_VIEWER_URL
59
+
60
+ # Build source info
61
+ source_info = {
62
+ "id": i,
63
+ "source": source_type,
64
+ "page": page + 1 if isinstance(page, int) else None,
65
+ "paragraph_id": para_id,
66
+ "url": url,
67
+ "snippet": snippet,
68
+ "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
69
+ }
70
+ sources.append(source_info)
71
+
72
+ return sources
73
+
74
+ def format_context(docs: List[Document]) -> str:
75
+ """
76
+ Formatiert den Kontext für den Prompt
77
+ """
 
 
 
78
  if not docs:
79
+ return "KEIN_RELEVANTER_KONTEXT_GEFUNDEN"
80
+
81
+ context_parts = []
82
+
83
+ for i, doc in enumerate(docs, 1):
84
+ content = format_chunk_content(doc)
85
+ metadata = doc.metadata
86
+
87
+ # Build source description
88
+ source_desc = metadata.get("source", "Unbekannte Quelle")
89
+ if "page" in metadata and metadata["page"] is not None:
90
+ source_desc += f", Seite {metadata['page'] + 1}"
91
+ if "paragraph_id" in metadata:
92
+ source_desc += f", {metadata['paragraph_id']}"
93
+
94
+ context_parts.append(f"【Quelle {i}】{source_desc}\n{content}")
95
+
96
+ return "\n\n".join(context_parts)
97
+
98
+ # ========== IMPROVED SYSTEM PROMPT ==========
99
+ SYSTEM_PROMPT = """
100
+ Du bist ein hochpräziser juristischer Assistenz-Chatbot für Prüfungsrecht an Hochschulen in Nordrhein-Westfalen.
101
 
102
+ Deine Wissensbasis umfasst ausschließlich:
103
+ 1. Die spezifische Prüfungsordnung (PDF-Dokument)
104
+ 2. Das Hochschulgesetz NRW (Hochschulgesetz - HG)
 
105
 
106
+ STRENGE ANWEISUNGEN:
107
 
108
+ 1. **AUSSCHLIESSLICHE KONTEXTNUTZUNG:**
109
+ - Verwende NUR die bereitgestellten Quellen aus der Wissensbasis.
110
+ - Wenn Informationen nicht im Kontext stehen, sage explizit: "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten."
111
+ - KEINE Vermutungen, Spekulationen oder externes Wissen.
112
 
113
+ 2. **PRÄZISE JURISTISCHE ANTWORTEN:**
114
+ - Formuliere in vollständigen, grammatikalisch korrekten Sätzen.
115
+ - Verwende präzise juristische Sprache, aber bleibe verständlich.
116
+ - Strukturiere komplexe Antworten mit Absätzen oder Aufzählungen.
117
 
118
+ 3. **QUELLENNACHWEISE:**
119
+ - Verweise immer auf die konkrete Quelle (Prüfungsordnung §X oder Hochschulgesetz §Y).
120
+ - Bei der Prüfungsordnung gib die Seite an.
121
+ - Beim Hochschulgesetz verweise auf den Paragraphen.
122
 
123
+ 4. **ANTWORTSTRUKTUR:**
124
+ a) Kurze präzise Antwort zuerst
125
+ b) Detaillierte Erklärung mit Quellenangaben
126
+ c) Falls relevant: praktische Hinweise basierend auf dem Kontext
127
 
128
+ 5. **FEHLENDE INFORMATIONEN:**
129
+ - Wenn der Kontext unvollständig ist, erkläre, welche Informationen fehlen.
130
+ - Biete an, nur die vorhandenen Informationen zusammenzufassen.
131
 
132
+ 6. **SPRACHE:**
133
+ - Verwende ausschließlich formelles Deutsch.
134
+ - Vermeide Umgangssprache und Abkürzungen.
 
135
 
136
+ Deine Antworten müssen rechtlich korrekt, vollständig und nachprüfbar sein.
137
+ """
138
 
139
+ def create_human_prompt(question: str, context: str) -> str:
140
+ """
141
+ Erstellt optimierten Human Prompt
142
+ """
143
+ return f"""FRAGE DES NUTZERS:
144
+ {question}
145
 
146
+ VERFÜGBARE RECHTSQUELLEN:
147
+ {context if context else "KEINE RELEVANTEN QUELLEN GEFUNDEN"}
 
 
148
 
149
+ AUFGABE:
150
+ Beantworte die Frage ausschließlich auf Basis der oben genannten Rechtsquellen.
 
151
 
152
+ ANFORDERUNGEN:
153
+ 1. Gib eine präzise juristische Antwort in vollständigen Sätzen.
154
+ 2. Zitiere konkret:
155
+ - Für die Prüfungsordnung: "Laut Prüfungsordnung, §X auf Seite Y, ..."
156
+ - Für das Hochschulgesetz: "Gemäß Hochschulgesetz NRW §Z, ..."
157
+ 3. Wenn mehrere Quellen relevant sind, erwähne alle.
158
+ 4. Wenn Informationen fehlen, erkläre dies klar.
159
+ 5. Strukturiere die Antwort logisch.
160
 
161
+ ANTWORT (auf Deutsch):"""
 
 
162
 
163
  def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
164
  """
165
+ Haupt-RAG-Funktion mit verbessertem Prompting
 
 
 
 
 
166
  """
167
+ # 1. Retrieve relevant documents
168
  docs = retriever.invoke(question)
169
+
170
+ # 2. Format context
171
  context_str = format_context(docs)
172
+
173
+ # 3. Create prompt
174
+ human_prompt = create_human_prompt(question, context_str)
175
+
176
+ # 4. Call LLM
177
+ messages = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  SystemMessage(content=SYSTEM_PROMPT),
179
+ HumanMessage(content=human_prompt)
180
  ]
181
+
182
+ try:
183
+ result = chat_model.invoke(messages)
184
+ answer_text = result.content.strip()
185
+
186
+ # Clean up answer
187
+ answer_text = re.sub(r'\n\s*\n+', '\n\n', answer_text) # Remove excessive newlines
188
+ answer_text = answer_text.replace("KEINE RELEVANTEN QUELLEN GEFUNDEN",
189
+ "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten.")
190
+
191
+ except Exception as e:
192
+ answer_text = f"Fehler bei der Generierung der Antwort: {str(e)}"
193
+
194
+ # 5. Build sources metadata
195
  sources = build_sources_metadata(docs)
196
+
197
+ return answer_text, sources
requirements.txt CHANGED
@@ -13,6 +13,7 @@ langchain
13
  langchain-community
14
  langchain-text-splitters
15
  langchain-openai
 
16
 
17
  # === VectorStore ===
18
  faiss-cpu
@@ -21,6 +22,7 @@ faiss-cpu
21
  pypdf
22
  requests
23
  beautifulsoup4
 
24
 
25
  # === Audio (STT/TTS local) ===
26
  transformers
@@ -29,6 +31,7 @@ soundfile
29
  scipy
30
  numpy
31
  torchaudio
 
32
 
33
  # OpenAI offizielle Bibliothek (kommt i.d.R. mit langchain-openai, zur Sicherheit explizit)
34
  openai
 
13
  langchain-community
14
  langchain-text-splitters
15
  langchain-openai
16
+ huggingface-hub
17
 
18
  # === VectorStore ===
19
  faiss-cpu
 
22
  pypdf
23
  requests
24
  beautifulsoup4
25
+ lxml
26
 
27
  # === Audio (STT/TTS local) ===
28
  transformers
 
31
  scipy
32
  numpy
33
  torchaudio
34
+ torch
35
 
36
  # OpenAI offizielle Bibliothek (kommt i.d.R. mit langchain-openai, zur Sicherheit explizit)
37
  openai
upload_weblink_to_supabase.py CHANGED
@@ -1,8 +1,14 @@
 
 
 
 
1
  import os
2
  import requests
 
3
  from bs4 import BeautifulSoup
4
  from supabase import create_client
5
  from dotenv import load_dotenv
 
6
 
7
  load_dotenv()
8
 
@@ -13,64 +19,116 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
13
 
14
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
15
 
 
 
 
 
 
 
 
 
 
 
 
16
  def extract_paragraphs():
17
  print(">>> Lade Hochschulgesetz NRW …")
18
 
19
- html = requests.get(LAW_URL, timeout=30).text
 
 
 
 
 
 
 
 
 
 
 
20
  soup = BeautifulSoup(html, "html.parser")
21
 
22
- # Tất cả tiêu đề Paragraph xuất hiện trong <h2> hoặc <h3>
23
- headers = soup.find_all(["h2", "h3"])
24
-
25
  paragraphs = []
26
  order = 1
27
 
28
- for header in headers:
29
- title = header.get_text(" ", strip=True)
30
-
31
- if not title.startswith("§"):
32
- continue # bỏ các h2/h3 không phải Paragraph
33
-
34
- # Gom toàn bộ nội dung từ header đến trước h2/h3 tiếp theo
35
- content_parts = []
36
- sibling = header.find_next_sibling()
37
-
38
- while sibling and sibling.name not in ["h2", "h3"]:
39
- text = sibling.get_text(" ", strip=True)
40
- if text:
41
- content_parts.append(text)
42
- sibling = sibling.find_next_sibling()
43
-
44
- full_content = "\n".join(content_parts).strip()
45
-
46
- para_id = f"para_{order}"
47
-
48
- paragraphs.append({
49
- "abs_id": para_id,
50
- "title": title,
51
- "content": full_content,
52
- "order_index": order
53
- })
54
-
55
- order += 1
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
 
 
 
 
 
 
 
58
  return paragraphs
59
 
60
  def upload_to_supabase():
61
  paras = extract_paragraphs()
62
 
63
- print(">>> Clear table hg_nrw …")
64
- supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
65
-
66
- print(">>> Upload begin …")
67
- BATCH = 100
68
- for i in range(0, len(paras), BATCH):
69
- batch = paras[i:i+BATCH]
70
- print(f" - Upload batch {i} – {i+len(batch)-1}")
71
- supabase.table("hg_nrw").upsert(batch).execute()
72
 
73
- print(" DONE uploading complete NRW law.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  if __name__ == "__main__":
76
- upload_to_supabase()
 
1
+ """
2
+ upload_weblink_to_supabase.py
3
+ Trích xuất và tải lên các paragraph từ trang web recht.nrw.de
4
+ """
5
  import os
6
  import requests
7
+ import re
8
  from bs4 import BeautifulSoup
9
  from supabase import create_client
10
  from dotenv import load_dotenv
11
+ import time
12
 
13
  load_dotenv()
14
 
 
19
 
20
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
21
 
22
+ def clean_text(text):
23
+ """Làm sạch và định dạng văn bản"""
24
+ # Loại bỏ khoảng trắng thừa
25
+ text = re.sub(r'\s+', ' ', text)
26
+ # Chuẩn hóa dấu câu
27
+ text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
28
+ # Đảm bảo chữ cái đầu câu viết hoa
29
+ sentences = text.split('. ')
30
+ sentences = [s.strip().capitalize() for s in sentences if s.strip()]
31
+ return '. '.join(sentences)
32
+
33
  def extract_paragraphs():
34
  print(">>> Lade Hochschulgesetz NRW …")
35
 
36
+ headers = {
37
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
38
+ }
39
+
40
+ try:
41
+ response = requests.get(LAW_URL, headers=headers, timeout=60)
42
+ response.raise_for_status()
43
+ except requests.RequestException as e:
44
+ print(f"❌ Fehler beim Laden der Seite: {e}")
45
+ return []
46
+
47
+ html = response.text
48
  soup = BeautifulSoup(html, "html.parser")
49
 
50
+ # Tìm tất cả các section chứa paragraph
 
 
51
  paragraphs = []
52
  order = 1
53
 
54
+ # Tìm các phần có chứa § (paragraph symbol)
55
+ pattern = re.compile(r'§\s*\d+')
56
+
57
+ # Tìm tất cả các element chứa paragraph
58
+ for element in soup.find_all(['p', 'div', 'td']):
59
+ text = element.get_text(" ", strip=True)
60
+
61
+ # Kiểm tra nếu có paragraph symbol
62
+ if pattern.search(text):
63
+ # Tách title và content
64
+ lines = text.split('\n')
65
+ title = lines[0].strip() if lines else ""
66
+
67
+ # Lấy nội dung
68
+ content = ""
69
+ if len(lines) > 1:
70
+ content = clean_text(" ".join(lines[1:]))
71
+
72
+ # Nếu title chưa có §, thêm từ nội dung
73
+ if '§' not in title and content:
74
+ # Tìm § trong content để thêm vào title
75
+ match = pattern.search(content)
76
+ if match:
77
+ title = match.group()
78
+ # Xóa title khỏi content
79
+ content = content.replace(title, "", 1).strip()
80
+
81
+ # Tạo ID cho paragraph
82
+ para_id = f"para_{order}"
83
+
84
+ paragraphs.append({
85
+ "abs_id": para_id,
86
+ "title": title if title else f"§ {order}",
87
+ "content": content if content else text,
88
+ "order_index": order,
89
+ "source_url": LAW_URL
90
+ })
91
+
92
+ order += 1
93
 
94
  print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
95
+
96
+ # In ra mẫu để kiểm tra
97
+ if paragraphs:
98
+ print("\nBeispiel Paragraph 1:")
99
+ print(f"Title: {paragraphs[0]['title']}")
100
+ print(f"Content (Auszug): {paragraphs[0]['content'][:200]}...\n")
101
+
102
  return paragraphs
103
 
104
  def upload_to_supabase():
105
  paras = extract_paragraphs()
106
 
107
+ if not paras:
108
+ print("❌ Keine Paragraphs gefunden. Upload abgebrochen.")
109
+ return
 
 
 
 
 
 
110
 
111
+ print(">>> Clear table hg_nrw ")
112
+ try:
113
+ # Xóa toàn bộ dữ liệu cũ
114
+ supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
115
+ print("✔ Tabelle geleert.")
116
+ except Exception as e:
117
+ print(f"⚠️ Fehler beim Leeren der Tabelle: {e}")
118
+
119
+ print(">>> Upload beginnt …")
120
+ BATCH_SIZE = 50
121
+
122
+ for i in range(0, len(paras), BATCH_SIZE):
123
+ batch = paras[i:i+BATCH_SIZE]
124
+ try:
125
+ result = supabase.table("hg_nrw").upsert(batch).execute()
126
+ print(f"✔ Batch {i//BATCH_SIZE + 1} hochgeladen ({len(batch)} Einträge)")
127
+ time.sleep(0.1) # Tránh rate limiting
128
+ except Exception as e:
129
+ print(f"❌ Fehler beim Upload von Batch {i//BATCH_SIZE + 1}: {e}")
130
+
131
+ print(f"✔ DONE - {len(paras)} Paragraphs erfolgreich hochgeladen.")
132
 
133
  if __name__ == "__main__":
134
+ upload_to_supabase()