Spaces:

Nguyen5
/

chatbot

Sleeping

App Files Files Community

Nguyen5 committed on Dec 4, 2025

Commit

cae6054

1 Parent(s): d3ea7b2

commit

Browse files

Files changed (4) hide show

app.py +150 -245
build_hg_viewer.py +272 -677
load_documents.py +95 -165
rag_pipeline.py +155 -158

app.py CHANGED Viewed

@@ -1,307 +1,212 @@
-"""
-app.py – Aktualisierte Version mit verbessertem Viewer
-"""
 import gradio as gr
 from gradio_pdf import PDF
 from huggingface_hub import hf_hub_download
-import os
-from load_documents import load_documents, DATASET, PDF_FILE
 from split_documents import split_documents
 from vectorstore import build_vectorstore
 from retriever import get_retriever
 from llm import load_llm
-from rag_pipeline import answer
 from speech_io import transcribe_audio, synthesize_speech
 # =====================================================
-# KONFIGURATION
 # =====================================================
-# Viewer URL (ersetze mit deiner Supabase URL)
-SUPABASE_URL = os.environ.get("SUPABASE_URL", "https://your-project.supabase.co")
-LAW_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html"
-# =====================================================
-# INITIALISIERUNG
-# =====================================================
-print("🔹 Initialisiere System...")
-print("1. Lade Dokumente...")
 _docs = load_documents()
-print("2. Splitte Dokumente...")
 _chunks = split_documents(_docs)
-print("3. Baue VectorStore...")
 _vs = build_vectorstore(_chunks)
-print("4. Erzeuge Retriever...")
 _retriever = get_retriever(_vs)
-print("5. Lade LLM...")
 _llm = load_llm()
-print("6. Lade Dateien für Viewer...")
-try:
-    _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
-    print(f"✅ PDF geladen: {_pdf_path}")
-except Exception as e:
-    print(f"⚠️ PDF konnte nicht geladen werden: {e}")
-    _pdf_path = None
-print("✅ System initialisiert!")
 # =====================================================
-# HELPER FUNCTIONS
 # =====================================================
 def format_sources_markdown(sources):
-    """Formatiere Quellen als Markdown"""
     if not sources:
         return ""
-    lines = ["", "**📚 Quellenverweise:**", ""]
     for s in sources:
-        source_type = s["source"]
         page = s["page"]
-        para_id = s.get("paragraph_id", "")
         url = s["url"]
-        snippet = s.get("snippet", "")
-        # Build source line
         if url:
-            if "PDF" in source_type:
-                source_text = f"[{source_type}"
-                if page:
-                    source_text += f", Seite {page}"
-                source_text += f"]({url})"
-            else:
-                display_name = para_id if para_id else "Hochschulgesetz NRW"
-                source_text = f"[{display_name}]({url})"
         else:
-            source_text = source_type
-        lines.append(f"- {source_text}")
         if snippet:
-            lines.append(f"  > *{snippet}*")
     return "\n".join(lines)
 # =====================================================
-# CHATBOT FUNCTIONS
 # =====================================================
 def chatbot_text(user_message, history):
-    """Text-Chatbot Funktion"""
-    if not user_message.strip():
-        return history, ""
-    try:
-        # Get answer from RAG pipeline
-        answer_text, sources = answer(
-            question=user_message,
-            retriever=_retriever,
-            chat_model=_llm
-        )
-        # Add sources
-        sources_text = format_sources_markdown(sources)
-        full_response = f"{answer_text}\n\n{sources_text}"
-        # Update history
-        history.append({"role": "user", "content": user_message})
-        history.append({"role": "assistant", "content": full_response})
-        return history, ""
-    except Exception as e:
-        error_msg = f"Fehler bei der Verarbeitung: {str(e)}"
-        history.append({"role": "user", "content": user_message})
-        history.append({"role": "assistant", "content": error_msg})
         return history, ""
 def chatbot_voice(audio_path, history):
-    """Voice-Chatbot Funktion"""
-    if not audio_path:
-        return history, None, ""
-    # Transcribe audio
     text = transcribe_audio(audio_path)
     if not text:
-        return history, None, "Keine Sprache erkannt"
-    # Process with text chatbot
-    history, _ = chatbot_text(text, history)
-    # Get last response for TTS
-    last_response = None
-    for msg in reversed(history):
-        if msg["role"] == "assistant":
-            last_response = msg["content"]
-            break
-    # Generate audio
-    audio_output = None
-    if last_response:
-        audio_output = synthesize_speech(last_response.split("\n\n")[0])  # Nur erste Teil für TTS
-    return history, audio_output, text
 def read_last_answer(history):
-    """Lese letzte Antwort vor"""
     if not history:
         return None
     for msg in reversed(history):
         if msg["role"] == "assistant":
-            return synthesize_speech(msg["content"].split("\n\n")[0])
     return None
 # =====================================================
-# GRADIO UI
 # =====================================================
-def create_ui():
-    """Erstelle die Gradio Benutzeroberfläche"""
-    with gr.Blocks(
-        title="Prüfungsrechts-Chatbot NRW",
-    ) as demo:
-        # Header
-        gr.Markdown("""
-        # 🧑‍⚖️ Prüfungsrechts-Chatbot für NRW Hochschulen
-        Dieser Chatbot beantwortet Fragen basierend auf:
-        - **Prüfungsordnung** (offizielles PDF)
-        - **Hochschulgesetz NRW** (aktuelle Fassung von recht.nrw.de)
-        Fragen können per Text oder Spracheingabe gestellt werden.
-        """)
-        with gr.Row():
-            # Left Column - Chat
-            with gr.Column(scale=2):
-                chatbot = gr.Chatbot(
-                    label="Chat",
-                    height=500
-                )
-                with gr.Row():
-                    msg = gr.Textbox(
-                        label="Frage eingeben",
-                        placeholder="Stellen Sie Ihre Frage zum Prüfungsrecht...",
-                        scale=4,
-                        container=False
-                    )
-                    send_btn = gr.Button("Senden", variant="primary", scale=1)
-                # Voice Input
-                with gr.Accordion("🎤 Spracheingabe", open=False):
-                    with gr.Row():
-                        voice_in = gr.Audio(
-                            sources=["microphone"],
-                            type="filepath",
-                            label="Aufnahme",
-                            scale=3
-                        )
-                        voice_btn = gr.Button("Sprechen & senden", scale=1)
-                    voice_out = gr.Audio(
-                        label="Antwort als Audio",
-                        type="numpy",
-                        visible=True
-                    )
-                # Controls
-                with gr.Row():
-                    read_btn = gr.Button("🔊 Antwort vorlesen")
-                    clear_btn = gr.Button("🗑️ Chat leeren", variant="secondary")
-            # Right Column - Viewer
-            with gr.Column(scale=1):
-                # PDF Viewer
-                gr.Markdown("### 📄 Prüfungsordnung")
-                if _pdf_path:
-                    pdf_viewer = PDF(_pdf_path, height=350, label="PDF Viewer")
-                else:
-                    gr.Markdown("⚠️ PDF konnte nicht geladen werden")
-                # Law Viewer
-                gr.Markdown("### 📘 Hochschulgesetz NRW")
-                gr.HTML(f"""
-                <iframe
-                    src="{LAW_VIEWER_URL}"
-                    style="width:100%; height:400px; border:none; border-radius:10px;"
-                    title="Hochschulgesetz NRW Viewer"
-                ></iframe>
-                """)
-        # Event Handlers
-        # Text input
-        msg.submit(
-            chatbot_text,
-            [msg, chatbot],
-            [chatbot, msg]
-        )
-        send_btn.click(
-            chatbot_text,
-            [msg, chatbot],
-            [chatbot, msg]
-        )
-        # Voice input
-        voice_btn.click(
-            chatbot_voice,
-            [voice_in, chatbot],
-            [chatbot, voice_out, msg]
-        )
-        # Controls
-        read_btn.click(
-            read_last_answer,
-            [chatbot],
-            [voice_out]
-        )
-        clear_btn.click(
-            lambda: [],
-            None,
-            [chatbot]
-        )
-        # Instructions
-        gr.Markdown("""
-        ### ℹ️ Nutzungshinweise
-        1. **Präzise Fragen** stellen für bessere Antworten
-        2. **Quellen** werden automatisch verlinkt
-        3. **Klicken Sie auf Links** im Chat, um direkt zur Quelle zu springen
-        4. **Spracheingabe** für hands-free Nutzung
-        ### ⚠️ Hinweis
-        Dies ist ein Assistenzsystem. Für verbindliche rechtliche Auskünfte wenden Sie sich bitte an die zuständigen Prüfungsämter.
-        """)
-    return demo
-# =====================================================
-# MAIN
-# =====================================================
 if __name__ == "__main__":
-    demo = create_ui()
-    # Konfiguration für HuggingFace Spaces
-    demo.queue(
-        max_size=20,
-        api_open=False
-    ).launch(
-    )

+# app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
+# Version 26.11 – ohne Modi, stabil für Text + Voice
 import gradio as gr
 from gradio_pdf import PDF
 from huggingface_hub import hf_hub_download
+from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
 from split_documents import split_documents
 from vectorstore import build_vectorstore
 from retriever import get_retriever
 from llm import load_llm
+from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
 from speech_io import transcribe_audio, synthesize_speech
 # =====================================================
+# INITIALISIERUNG (global)
 # =====================================================
+print("🔹 Lade Dokumente ...")
 _docs = load_documents()
+print("🔹 Splitte Dokumente ...")
 _chunks = split_documents(_docs)
+print("🔹 Baue VectorStore (FAISS) ...")
 _vs = build_vectorstore(_chunks)
+print("🔹 Erzeuge Retriever ...")
 _retriever = get_retriever(_vs)
+print("🔹 Lade LLM ...")
 _llm = load_llm()
+print("🔹 Lade Dateien für Viewer …")
+_pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
+_html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
 # =====================================================
+# Quellen formatieren – Markdown für Chat
 # =====================================================
 def format_sources_markdown(sources):
     if not sources:
         return ""
+    lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
     for s in sources:
+        sid = s["id"]
+        src = s["source"]
         page = s["page"]
         url = s["url"]
+        snippet = s["snippet"]
+        title = f"Quelle {sid} – {src}"
         if url:
+            base = f"- [{title}]({url})"
         else:
+            base = f"- {title}"
+        if page and "Prüfungsordnung" in src:
+            base += f", Seite {page}"
+        lines.append(base)
         if snippet:
+            lines.append(f"  > {snippet}")
     return "\n".join(lines)
 # =====================================================
+# TEXT CHATBOT
 # =====================================================
 def chatbot_text(user_message, history):
+    if not user_message:
         return history, ""
+    answer_text, sources = answer(
+        question=user_message,
+        retriever=_retriever,
+        chat_model=_llm,
+    )
+    quellen_block = format_sources_markdown(sources)
+    history = history + [
+        {"role": "user", "content": user_message},
+        {"role": "assistant", "content": answer_text + quellen_block},
+    ]
+    return history, ""
+# =====================================================
+# VOICE CHATBOT
+# =====================================================
 def chatbot_voice(audio_path, history):
+    # 1. Speech → Text
     text = transcribe_audio(audio_path)
     if not text:
+        return history, None, ""
+    # Append the transcribed user message to the chat history
+    history = history + [{"role": "user", "content": text}]
+    # 2. RAG answer
+    answer_text, sources = answer(
+        question=text,
+        retriever=_retriever,
+        chat_model=_llm,
+    )
+    quellen_block = format_sources_markdown(sources)
+    bot_msg = answer_text + quellen_block
+    history = history + [{"role": "assistant", "content": bot_msg}]
+    # 3. Text → Speech
+    audio = synthesize_speech(bot_msg)
+    return history, audio, ""
+# =====================================================
+# LAST ANSWER → TTS
+# =====================================================
 def read_last_answer(history):
     if not history:
         return None
     for msg in reversed(history):
         if msg["role"] == "assistant":
+            return synthesize_speech(msg["content"])
     return None
 # =====================================================
+# UI – GRADIO
 # =====================================================
+with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
+    gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
+    gr.Markdown(
+        "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
+        "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
+        "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            chatbot = gr.Chatbot(label="Chat", height=500)
+            msg = gr.Textbox(
+                label="Frage eingeben",
+                placeholder="Stelle deine Frage zum Prüfungsrecht …",
+            )
+            # TEXT SENDEN
+            msg.submit(
+                chatbot_text,
+                [msg, chatbot],
+                [chatbot, msg]
+            )
+            send_btn = gr.Button("Senden (Text)")
+            send_btn.click(
+                chatbot_text,
+                [msg, chatbot],
+                [chatbot, msg]
+            )
+            # SPRACHEINGABE
+            gr.Markdown("### 🎙️ Spracheingabe")
+            voice_in = gr.Audio(sources=["microphone"], type="filepath")
+            voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
+            voice_btn = gr.Button("Sprechen & senden")
+            voice_btn.click(
+                chatbot_voice,
+                [voice_in, chatbot],
+                [chatbot, voice_out, msg]
+            )
+            read_btn = gr.Button("🔁 Antwort erneut vorlesen")
+            read_btn.click(
+                read_last_answer,
+                [chatbot],
+                [voice_out]
+            )
+            clear_btn = gr.Button("Chat zurücksetzen")
+            clear_btn.click(lambda: [], None, chatbot)
+        # =====================
+        # RECHTE SPALTE: Viewer
+        # =====================
+        with gr.Column(scale=1):
+            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
+            PDF(_pdf_path, height=350)
+            gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
+            gr.HTML(
+                f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
+            )
 if __name__ == "__main__":
+    demo.queue().launch(ssr_mode=False, show_error=True)

build_hg_viewer.py CHANGED Viewed

@@ -1,12 +1,7 @@
-"""
-build_hg_viewer.py
-Tạo HTML viewer cho Hochschulgesetz NRW với định dạng chuyên nghiệp
-"""
 import os
-import json
 from supabase import create_client
 from dotenv import load_dotenv
-import re
 load_dotenv()
@@ -18,701 +13,301 @@ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE:
 supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
-def get_paragraphs_from_supabase():
-    """Lấy paragraphs từ Supabase"""
-    print(">>> Lade Paragraphs aus Supabase...")
-    response = supabase.table("hg_nrw").select("*").order("order_index").execute()
-    paragraphs = response.data
-    if not paragraphs:
-        print("❌ Keine Paragraphs in der Datenbank gefunden.")
-        return []
-    print(f"✔ {len(paragraphs)} Paragraphs geladen.")
-    return paragraphs
-# ======== HTML TEMPLATE MIT PROFESSIONELLEM DESIGN ========
-VIEW_TEMPLATE = """<!DOCTYPE html>
 <html lang="de">
 <head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Hochschulgesetz NRW – Offizielle Viewer</title>
-    <style>
-        * {
-            margin: 0;
-            padding: 0;
-            box-sizing: border-box;
-        }
-        body {
-            font-family: 'Segoe UI', 'Roboto', 'Arial', sans-serif;
-            line-height: 1.6;
-            color: #333;
-            background: #f8f9fa;
-            display: flex;
-            min-height: 100vh;
-        }
-        /* ----------- SIDEBAR ------------- */
-        #sidebar {
-            width: 320px;
-            background: #ffffff;
-            border-right: 1px solid #e0e0e0;
-            height: 100vh;
-            overflow-y: auto;
-            position: fixed;
-            left: 0;
-            top: 0;
-            box-shadow: 2px 0 5px rgba(0,0,0,0.1);
-            z-index: 1000;
-        }
-        .sidebar-header {
-            padding: 20px;
-            background: linear-gradient(135deg, #003366 0%, #00509e 100%);
-            color: white;
-            border-bottom: 1px solid #002244;
-        }
-        .sidebar-header h2 {
-            font-size: 1.4rem;
-            font-weight: 600;
-            margin-bottom: 10px;
-        }
-        .sidebar-header p {
-            font-size: 0.9rem;
-            opacity: 0.9;
-        }
-        #searchBox {
-            width: 100%;
-            padding: 12px 15px;
-            font-size: 14px;
-            border: 1px solid #ddd;
-            border-radius: 8px;
-            margin: 15px;
-            background: #f8f9fa;
-            transition: all 0.3s;
-        }
-        #searchBox:focus {
-            outline: none;
-            border-color: #003366;
-            box-shadow: 0 0 0 3px rgba(0, 51, 102, 0.1);
-        }
-        .paragraph-list {
-            padding: 0 15px 20px 15px;
-        }
-        .sidebar-link {
-            display: block;
-            padding: 12px 15px;
-            margin-bottom: 5px;
-            text-decoration: none;
-            color: #003366;
-            background: #f8f9fa;
-            border-left: 4px solid transparent;
-            border-radius: 6px;
-            font-size: 14px;
-            font-weight: 500;
-            transition: all 0.2s;
-        }
-        .sidebar-link:hover {
-            background: #e3f2fd;
-            border-left-color: #003366;
-            transform: translateX(3px);
-        }
-        .sidebar-link.active {
-            background: #e3f2fd;
-            border-left-color: #003366;
-            font-weight: 600;
-        }
-        /* ----------- MAIN CONTENT ------------- */
-        #content-wrapper {
-            flex: 1;
-            margin-left: 320px;
-            min-height: 100vh;
-        }
-        #content {
-            max-width: 900px;
-            margin: 0 auto;
-            padding: 30px;
-            background: white;
-            min-height: 100vh;
-            box-shadow: 0 0 20px rgba(0,0,0,0.05);
-        }
-        .page-header {
-            margin-bottom: 40px;
-            padding-bottom: 20px;
-            border-bottom: 2px solid #003366;
-        }
-        .page-header h1 {
-            color: #003366;
-            font-size: 2.2rem;
-            font-weight: 700;
-            margin-bottom: 10px;
-        }
-        .page-header .subtitle {
-            color: #666;
-            font-size: 1.1rem;
-        }
-        /* ----------- PARAGRAPH STYLES ------------- */
-        .paragraph {
-            margin-bottom: 50px;
-            padding: 25px;
-            background: #ffffff;
-            border-radius: 10px;
-            border-left: 5px solid #003366;
-            box-shadow: 0 2px 10px rgba(0,0,0,0.08);
-            transition: all 0.3s;
-        }
-        .paragraph.highlight {
-            animation: highlight-pulse 2s ease;
-            border-left-color: #ff9800;
-            box-shadow: 0 0 0 3px rgba(255, 152, 0, 0.2);
-        }
-        .paragraph-header {
-            margin-bottom: 20px;
-        }
-        .paragraph-title {
-            color: #003366;
-            font-size: 1.6rem;
-            font-weight: 700;
-            margin-bottom: 10px;
-            display: flex;
-            align-items: center;
-            gap: 10px;
-        }
-        .paragraph-title .anchor {
-            font-size: 0.8em;
-            color: #666;
-            text-decoration: none;
-            opacity: 0;
-            transition: opacity 0.2s;
-        }
-        .paragraph:hover .anchor {
-            opacity: 1;
-        }
-        .paragraph-content {
-            font-size: 1.05rem;
-            line-height: 1.8;
-            color: #333;
-        }
-        .paragraph-content p {
-            margin-bottom: 15px;
-        }
-        .paragraph-content ul, .paragraph-content ol {
-            margin: 15px 0 15px 25px;
-        }
-        .paragraph-content li {
-            margin-bottom: 8px;
-        }
-        /* ----------- FOOTNOTES ------------- */
-        .footnotes {
-            margin-top: 25px;
-            padding-top: 20px;
-            border-top: 1px solid #eee;
-        }
-        .footnotes-title {
-            font-weight: 600;
-            color: #666;
-            margin-bottom: 15px;
-            font-size: 0.95rem;
-        }
-        .footnote-item {
-            margin-bottom: 10px;
-            padding-left: 15px;
-            border-left: 2px solid #ddd;
-            font-size: 0.9rem;
-            color: #555;
-        }
-        /* ----------- HIGHLIGHT ANIMATION ------------- */
-        @keyframes highlight-pulse {
-            0% { background-color: #fff8e1; }
-            70% { background-color: #fff8e1; }
-            100% { background-color: #ffffff; }
-        }
-        /* ----------- RESPONSIVE ------------- */
-        @media (max-width: 992px) {
-            body {
-                flex-direction: column;
-            }
-            #sidebar {
-                position: static;
-                width: 100%;
-                height: auto;
-                max-height: 50vh;
-            }
-            #content-wrapper {
-                margin-left: 0;
-            }
-        }
-        /* ----------- BACK TO TOP ------------- */
-        #back-to-top {
-            position: fixed;
-            bottom: 30px;
-            right: 30px;
-            width: 50px;
-            height: 50px;
-            background: #003366;
-            color: white;
-            border-radius: 50%;
-            display: none;
-            justify-content: center;
-            align-items: center;
-            cursor: pointer;
-            box-shadow: 0 2px 10px rgba(0,0,0,0.2);
-            transition: all 0.3s;
-            z-index: 1000;
-        }
-        #back-to-top:hover {
-            background: #00509e;
-            transform: translateY(-3px);
-        }
-        /* ----------- KEYWORD HIGHLIGHT ------------- */
-        .keyword-highlight {
-            background: #fff9c4;
-            padding: 2px 4px;
-            border-radius: 3px;
-            font-weight: 500;
-        }
-        /* ----------- PRINT STYLES ------------- */
-        @media print {
-            #sidebar {
-                display: none;
-            }
-            #content-wrapper {
-                margin-left: 0;
-            }
-            #back-to-top {
-                display: none !important;
-            }
-        }
-    </style>
 </head>
 <body>
-    <!-- SIDEBAR -->
-    <div id="sidebar">
-        <div class="sidebar-header">
-            <h2>Hochschulgesetz NRW</h2>
-            <p>Inhaltsverzeichnis</p>
-        </div>
-        <input type="text" id="searchBox" placeholder="Paragraph suchen (z.B. §1 oder Text)..."
-               title="Geben Sie eine Paragraphennummer oder Suchbegriff ein">
-        <div class="paragraph-list" id="paragraphList">
-            <!-- SIDEBAR_LINKS -->
-        </div>
-    </div>
-    <!-- MAIN CONTENT -->
-    <div id="content-wrapper">
-        <div id="content">
-            <div class="page-header">
-                <h1>Hochschulgesetz Nordrhein-Westfalen</h1>
-                <p class="subtitle">Gesetz über die Hochschulen des Landes Nordrhein-Westfalen (Hochschulgesetz – HG)</p>
-                <p class="subtitle" style="font-size: 0.9rem; color: #777;">
-                    Stand: Aktuelle Fassung | Quelle: <a href="https://recht.nrw.de" target="_blank">recht.nrw.de</a>
-                </p>
-            </div>
-            <div id="paragraphContent">
-                <!-- PARAGRAPH_CONTENT -->
-            </div>
-        </div>
-    </div>
-    <!-- BACK TO TOP BUTTON -->
-    <div id="back-to-top" title="Zum Anfang">
-        ↑
-    </div>
-    <script>
-        // ========== GLOBAL VARIABLES ==========
-        let currentParagraphId = '';
-        let searchTimeout = null;
-        // ========== INITIALIZATION ==========
-        document.addEventListener('DOMContentLoaded', function() {
-            // Check for URL hash
-            const hash = window.location.hash.substring(1);
-            const urlParams = new URLSearchParams(window.location.search);
-            const keywords = urlParams.get('keywords');
-            if (hash) {
-                scrollToParagraph(hash);
-            }
-            if (keywords) {
-                highlightKeywords(decodeURIComponent(keywords));
-            }
-            setupEventListeners();
-            updateActiveLink();
-        });
-        // ========== SCROLL TO PARAGRAPH ==========
-        function scrollToParagraph(paragraphId, highlight = true) {
-            const element = document.getElementById(paragraphId);
-            if (!element) return;
-            // Remove previous highlight
-            document.querySelectorAll('.paragraph.highlight').forEach(el => {
-                el.classList.remove('highlight');
-            });
-            // Calculate position for smooth scroll
-            const sidebarHeight = document.getElementById('sidebar').offsetHeight;
-            const elementPosition = element.getBoundingClientRect().top;
-            const offsetPosition = elementPosition + window.pageYOffset - 100;
-            // Smooth scroll
-            window.scrollTo({
-                top: offsetPosition,
-                behavior: 'smooth'
-            });
-            // Highlight if requested
-            if (highlight) {
-                setTimeout(() => {
-                    element.classList.add('highlight');
-                    // Update URL without page reload
-                    history.replaceState(null, null, `#${paragraphId}`);
-                    // Update active link in sidebar
-                    updateActiveLink(paragraphId);
-                }, 300);
-            }
-        }
-        // ========== SEARCH FUNCTIONALITY ==========
-        function setupEventListeners() {
-            const searchBox = document.getElementById('searchBox');
-            // Search input with debounce
-            searchBox.addEventListener('input', function() {
-                clearTimeout(searchTimeout);
-                searchTimeout = setTimeout(() => {
-                    filterParagraphs(this.value);
-                }, 300);
-            });
-            // Enter key to jump to first result
-            searchBox.addEventListener('keypress', function(e) {
-                if (e.key === 'Enter') {
-                    e.preventDefault();
-                    jumpToFirstResult(this.value);
-                }
-            });
-            // Back to top button
-            const backToTop = document.getElementById('back-to-top');
-            backToTop.addEventListener('click', function() {
-                window.scrollTo({
-                    top: 0,
-                    behavior: 'smooth'
-                });
-            });
-            // Show/hide back to top button
-            window.addEventListener('scroll', function() {
-                if (window.scrollY > 500) {
-                    backToTop.style.display = 'flex';
-                } else {
-                    backToTop.style.display = 'none';
-                }
-                updateActiveLink();
-            });
-        }
-        function filterParagraphs(searchTerm) {
-            const links = document.querySelectorAll('.sidebar-link');
-            const searchLower = searchTerm.toLowerCase();
-            let hasVisible = false;
-            links.forEach(link => {
-                const text = link.textContent.toLowerCase();
-                if (text.includes(searchLower)) {
-                    link.style.display = 'block';
-                    hasVisible = true;
-                } else {
-                    link.style.display = 'none';
-                }
-            });
-            // Update search box placeholder based on results
-            const searchBox = document.getElementById('searchBox');
-            if (!hasVisible && searchTerm) {
-                searchBox.title = 'Keine Ergebnisse gefunden';
-            } else {
-                searchBox.title = '';
-            }
-        }
-        function jumpToFirstResult(searchTerm) {
-            const links = document.querySelectorAll('.sidebar-link');
-            const searchLower = searchTerm.toLowerCase();
-            for (const link of links) {
-                if (link.style.display !== 'none') {
-                    const paragraphId = link.getAttribute('href').substring(1);
-                    scrollToParagraph(paragraphId);
-                    break;
-                }
-            }
-        }
-        // ========== HIGHLIGHT KEYWORDS ==========
-        function highlightKeywords(keywords) {
-            const content = document.getElementById('paragraphContent');
-            const searchTerms = keywords.split(/[\s,]+/).filter(term => term.length > 2);
-            searchTerms.forEach(term => {
-                const regex = new RegExp(`(${escapeRegExp(term)})`, 'gi');
-                content.innerHTML = content.innerHTML.replace(regex,
-                    '<span class="keyword-highlight">$1</span>');
-            });
-        }
-        function escapeRegExp(string) {
-            return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-        }
-        // ========== UPDATE ACTIVE LINK ==========
-        function updateActiveLink(forceId = null) {
-            const links = document.querySelectorAll('.sidebar-link');
-            const paragraphs = document.querySelectorAll('.paragraph');
-            let activeId = forceId;
-            if (!activeId) {
-                // Find paragraph in viewport
-                const viewportHeight = window.innerHeight;
-                const viewportMiddle = window.scrollY + (viewportHeight / 2);
-                for (const paragraph of paragraphs) {
-                    const rect = paragraph.getBoundingClientRect();
-                    const paragraphTop = window.pageYOffset + rect.top;
-                    const paragraphBottom = paragraphTop + rect.height;
-                    if (viewportMiddle >= paragraphTop && viewportMiddle <= paragraphBottom) {
-                        activeId = paragraph.id;
-                        break;
-                    }
-                }
-            }
-            // Update active state
-            links.forEach(link => {
-                const href = link.getAttribute('href').substring(1);
-                if (href === activeId) {
-                    link.classList.add('active');
-                } else {
-                    link.classList.remove('active');
-                }
-            });
-        }
-        // ========== FORMAT CONTENT ==========
-        function formatContent(text) {
-            // Replace multiple newlines with paragraphs
-            return text.split('\n\n').map(paragraph => {
-                if (paragraph.trim()) {
-                    return `<p>${paragraph.trim()}</p>`;
-                }
-                return '';
-            }).join('');
-        }
-        // ========== COPY TO CLIPBOARD ==========
-        function copyParagraphLink(paragraphId) {
-            const url = window.location.origin + window.location.pathname + '#' + paragraphId;
-            navigator.clipboard.writeText(url).then(() => {
-                // Show temporary notification
-                const notification = document.createElement('div');
-                notification.textContent = 'Link kopiert!';
-                notification.style.cssText = `
-                    position: fixed;
-                    top: 20px;
-                    right: 20px;
-                    background: #4CAF50;
-                    color: white;
-                    padding: 10px 20px;
-                    border-radius: 5px;
-                    z-index: 10000;
-                    animation: fadeInOut 2s ease;
-                `;
-                document.body.appendChild(notification);
-                setTimeout(() => {
-                    document.body.removeChild(notification);
-                }, 2000);
-            });
-        }
-    </script>
 </body>
 </html>
 """
 # -------------------------------------------------------------------
-# BUILD VIEWER
 # -------------------------------------------------------------------
 def build_html():
-    """Xây dựng HTML viewer từ dữ liệu Supabase"""
-    paragraphs = get_paragraphs_from_supabase()
-    if not paragraphs:
-        print("❌ Keine Paragraphs zum Erstellen des Viewers verfügbar.")
-        return None
-    sidebar_links = []
-    content_html = []
-    for p in paragraphs:
         pid = p["abs_id"]
         title = p["title"]
-        content = p["content"]
-        # Tạo link cho sidebar
-        sidebar_link = f'''
-        <a class="sidebar-link" href="#{pid}" onclick="scrollToParagraph('{pid}'); return false;">
-            {title}
-        </a>
-        '''
-        sidebar_links.append(sidebar_link)
-        # Tạo nội dung paragraph
-        # Phân loại footnote và nội dung chính
-        lines = content.split('\n')
-        main_content = []
-        footnotes = []
         for line in lines:
-            line = line.strip()
-            if line.lower().startswith('fn ') or line.lower().startswith('fussnote'):
-                footnotes.append(line)
-            elif line:
-                main_content.append(line)
-        # Format main content
-        formatted_content = '<br>'.join(main_content)
-        # Format footnotes
-        footnotes_html = ''
-        if footnotes:
-            footnotes_html = '''
-            <div class="footnotes">
-                <div class="footnotes-title">Fußnoten:</div>
-                ''' + ''.join(f'<div class="footnote-item">{fn}</div>' for fn in footnotes) + '''
-            </div>
-            '''
-        # Tạo paragraph block
-        paragraph_html = f'''
-        <div class="paragraph" id="{pid}">
-            <div class="paragraph-header">
-                <h3 class="paragraph-title">
-                    {title}
-                    <a href="#{pid}" class="anchor" onclick="copyParagraphLink('{pid}'); return false;"
-                       title="Link zu diesem Paragraph kopieren">🔗</a>
-                </h3>
-            </div>
-            <div class="paragraph-content">
-                {formatted_content}
-            </div>
-            {footnotes_html}
-        </div>
-        '''
-        content_html.append(paragraph_html)
-    # Điền nội dung vào template
-    html = VIEW_TEMPLATE
-    html = html.replace('<!-- SIDEBAR_LINKS -->', '\n'.join(sidebar_links))
-    html = html.replace('<!-- PARAGRAPH_CONTENT -->', '\n'.join(content_html))
-    # Thêm metadata
-    html = html.replace(
-        'Aktuelle Fassung',
-        f'Aktuelle Fassung - {len(paragraphs)} Paragraphs'
-    )
     return html
 # -------------------------------------------------------------------
-# UPLOAD TO SUPABASE STORAGE
 # -------------------------------------------------------------------
 def upload_html():
-    """Tạo và tải lên HTML viewer"""
-    print(">>> Baue HTML Viewer...")
     html = build_html()
-    if not html:
-        print("❌ Konnte HTML nicht erstellen.")
-        return
-    try:
-        # Tạo bucket nếu chưa tồn tại
-        try:
-            supabase.storage.get_bucket("hg_viewer")
-        except:
-            supabase.storage.create_bucket("hg_viewer", {
-                "public": True,
-                "file_size_limit": 10485760  # 10MB
-            })
-        # Upload HTML file
-        supabase.storage.from_("hg_viewer").upload(
-            "hg_viewer.html",
-            html.encode("utf-8"),
-            {
-                "content-type": "text/html",
-                "cache-control": "public, max-age=3600"
-            }
-        )
-        print("✅ hg_viewer.html erfolgreich hochgeladen!")
-        print(f"📁 URL: {SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_viewer.html")
-    except Exception as e:
-        print(f"❌ Fehler beim Upload: {e}")
 if __name__ == "__main__":
-    upload_html()

+# build_hg_viewer.py
 import os
 from supabase import create_client
 from dotenv import load_dotenv
 load_dotenv()
 supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
+from upload_weblink_to_supabase import extract_paragraphs
+# ======== HTML TEMPLATE ========
+VIEW_TEMPLATE = """
+<!DOCTYPE html>
 <html lang="de">
 <head>
+<meta charset="UTF-8">
+<title>Hochschulgesetz NRW – Paragraph Viewer</title>
+<style>
+body {
+    font-family: Arial, sans-serif;
+    margin: 0;
+    padding: 0;
+    display: flex;
+}
+/* ----------- SIDEBAR ------------- */
+#sidebar {
+    width: 280px;
+    height: 100vh;
+    overflow-y: auto;
+    background: #f5f5f5;
+    border-right: 1px solid #ccc;
+    padding: 15px;
+    position: sticky;
+    top: 0;
+}
+#sidebar h2 {
+    margin-top: 0;
+}
+#searchBox {
+    width: 100%;
+    padding: 8px;
+    font-size: 15px;
+    margin-bottom: 10px;
+    border: 1px solid #aaa;
+    border-radius: 5px;
+}
+.sidebar-link {
+    display: block;
+    padding: 6px 8px;
+    margin-bottom: 4px;
+    text-decoration: none;
+    color: #003366;
+    border-radius: 4px;
+}
+.sidebar-link:hover {
+    background: #e0e7ff;
+    color: #001d4d;
+}
+/* ----------- CONTENT ------------- */
+#content {
+    flex: 1;
+    padding: 25px;
+    max-width: 900px;
+}
+/* Absatz block */
+.para {
+    padding: 20px 0;
+    border-bottom: 1px solid #ddd;
+}
+.para h2 {
+    color: #003366;
+    margin-bottom: 10px;
+}
+/* ----------- Fußnoten ------------- */
+.fn-block {
+    background: #fafafa;
+    border-left: 4px solid #999;
+    padding: 12px;
+    margin-top: 10px;
+    margin-bottom: 25px;
+}
+.fn-toggle {
+    cursor: pointer;
+    font-weight: bold;
+    color: #003366;
+    margin-bottom: 5px;
+}
+.fn-content {
+    display: none;
+    padding-left: 10px;
+}
+.fn-title {
+    font-weight: bold;
+    margin-bottom: 6px;
+}
+.fn-item {
+    margin-bottom: 8px;
+}
+/* ----------- Highlight beim Öffnen ------------- */
+.highlight {
+    animation: flash 2s ease-in-out;
+    background: #fff8c6 !important;
+}
+@keyframes flash {
+    0% { background: #fff8c6; }
+    100% { background: transparent; }
+}
+/* Keyword highlight */
+.keyword {
+    background: yellow;
+    padding: 2px 3px;
+    border-radius: 3px;
+}
+/* Back to top button */
+#topBtn {
+    position: fixed;
+    bottom: 25px;
+    right: 25px;
+    background: #003366;
+    color: white;
+    border-radius: 8px;
+    padding: 10px 14px;
+    cursor: pointer;
+    font-size: 16px;
+    display: none;
+}
+</style>
 </head>
 <body>
+<div id="sidebar">
+    <h2>Inhaltsverzeichnis</h2>
+    <input type="text" id="searchBox" placeholder="Suchen nach § …">
+    <!-- SIDEBAR_LINKS -->
+</div>
+<div id="content">
+    <h1>Hochschulgesetz NRW – Paragraph Viewer</h1>
+    <!-- PARAGRAPH_CONTENT -->
+</div>
+<div id="topBtn" onclick="scrollToTop()">⬆️ Top</div>
+<script>
// ------ On load: mark ?k= keywords, then highlight and scroll to the #anchor paragraph ------
// NOTE: this script is embedded in a non-raw Python string, so it deliberately
// contains no backslashes.
window.onload = function() {
    const anchor = window.location.hash.substring(1);
    const params = new URLSearchParams(window.location.search);
    const keywords = params.get("k");

    /* KEYWORD HIGHLIGHT */
    // Runs before the anchor lookup: highlighting may rebuild the nodes inside
    // #content, which would detach an element fetched earlier and cancel the scroll.
    if (keywords) {
        // URLSearchParams.get() returns the value already decoded, so keywords
        // are separated by real spaces. A literal "%20" only survives when the
        // link was double-encoded; normalise it to a space for compatibility.
        const words = keywords
            .split("%20").join(" ")
            .split(" ")
            .filter(word => word.length > 0);
        highlightKeywords(words);
    }

    if (anchor) {
        const el = document.getElementById(anchor);
        if (el) {
            el.classList.add("highlight");
            el.scrollIntoView({ behavior: "smooth", block: "center" });
        }
    }
};
/* --- KEYWORD HIGHLIGHT FUNCTION --- */
// Wraps each case-insensitive occurrence of the given keywords inside #content
// in <span class="keyword">.
//
// words: array of keyword strings (entries shorter than 2 characters are ignored).
//
// The search runs over text nodes instead of rewriting innerHTML, so a keyword
// such as "div" or "para" cannot match tag names or attribute values. Keywords
// are compared as plain text rather than compiled into a RegExp, so characters
// like "(" or "*" neither throw nor change the pattern.
// NOTE: this script is embedded in a non-raw Python string, so it deliberately
// contains no backslashes.
function highlightKeywords(words) {
    const container = document.getElementById("content");
    if (!container || !Array.isArray(words)) return;

    const terms = [];
    words.forEach(word => {
        let term = String(word);
        try {
            term = decodeURIComponent(term);
        } catch (err) {
            // A stray "%" is not a valid escape sequence; keep the raw text.
        }
        term = term.trim().toLowerCase();
        if (term.length >= 2 && terms.indexOf(term) === -1) {
            terms.push(term);
        }
    });
    if (terms.length === 0) return;

    // Collect the text nodes first: replacing nodes while the walker is still
    // running would make it skip or revisit parts of the tree.
    const walker = document.createTreeWalker(container, NodeFilter.SHOW_TEXT);
    const textNodes = [];
    while (walker.nextNode()) {
        textNodes.push(walker.currentNode);
    }

    textNodes.forEach(node => {
        const text = node.nodeValue;
        const lower = text.toLowerCase();
        // Lower-casing changes the length of a few characters; skip such nodes
        // so that indices found in "lower" remain valid for "text".
        if (lower.length !== text.length) return;

        const fragment = document.createDocumentFragment();
        let pos = 0;
        let matched = false;
        while (pos < text.length) {
            // Earliest match wins; on a tie the longer keyword is preferred.
            let start = -1;
            let length = 0;
            terms.forEach(term => {
                const idx = lower.indexOf(term, pos);
                if (idx === -1) return;
                if (start === -1 || idx < start || (idx === start && term.length > length)) {
                    start = idx;
                    length = term.length;
                }
            });
            if (start === -1) break;

            matched = true;
            if (start > pos) {
                fragment.appendChild(document.createTextNode(text.slice(pos, start)));
            }
            const mark = document.createElement("span");
            mark.className = "keyword";
            mark.textContent = text.slice(start, start + length);
            fragment.appendChild(mark);
            pos = start + length;
        }
        if (!matched) return;

        if (pos < text.length) {
            fragment.appendChild(document.createTextNode(text.slice(pos)));
        }
        node.parentNode.replaceChild(fragment, node);
    });
}
/* --- SEARCH IN SIDEBAR --- */
// Filters the table of contents while the user types: a sidebar link stays
// visible only if its text contains the query (case-insensitive).
document.getElementById("searchBox").addEventListener("input", function() {
    const needle = this.value.toLowerCase();
    for (const entry of document.querySelectorAll(".sidebar-link")) {
        const label = entry.innerText.toLowerCase();
        if (label.includes(needle)) {
            entry.style.display = "block";
        } else {
            entry.style.display = "none";
        }
    }
});
/* --- COLLAPSIBLE FOOTNOTES --- */
// Delegated click handler: clicking an element with class "fn-toggle" shows or
// hides the element that directly follows it.
document.addEventListener("click", function(e) {
    const target = e.target;
    // The click target is not guaranteed to expose classList (e.g. the document itself).
    if (!target || !target.classList || !target.classList.contains("fn-toggle")) {
        return;
    }
    const content = target.nextElementSibling;
    // A toggle without a following sibling has nothing to expand.
    if (!content) return;
    content.style.display = content.style.display === "block" ? "none" : "block";
});
/* --- BACK TO TOP BUTTON --- */
// The button is shown only once the page has been scrolled more than 300px.
window.onscroll = function() {
    const button = document.getElementById("topBtn");
    if (window.scrollY > 300) {
        button.style.display = "block";
    } else {
        button.style.display = "none";
    }
};
// Click handler of #topBtn: smooth-scrolls the window back to the top.
function scrollToTop() {
    const destination = { top: 0, behavior: 'smooth' };
    window.scrollTo(destination);
}
+</script>
 </body>
 </html>
 """
 # -------------------------------------------------------------------
+# 2. BUILD VIEWER
 # -------------------------------------------------------------------
 def build_html():
+    print(">>> Lade Paragraphs aus Supabase...")
+    paras = extract_paragraphs()
+    sidebar_links = ""
+    content_html = ""
+    for p in paras:
         pid = p["abs_id"]
         title = p["title"]
+        body = p["content"]
+        # Sidebar item
+        sidebar_links += f'<a class="sidebar-link" href="#{pid}">{title}</a>\n'
+        # Fußnoten tách riêng (bắt đầu bằng "Fn 1", "Fn 2", ...)
+        lines = body.split("\n")
+        main_text = []
+        fn_text = []
+        in_fn = False
         for line in lines:
+            if line.startswith("Fn "):
+                in_fn = True
+            if in_fn:
+                fn_text.append(line)
+            else:
+                main_text.append(line)
+        footnotes_html = ""
+        if fn_text:
+            footnotes_html += '<div class="fn-block">'
+            footnotes_html += '<div class="fn-title">Fußnoten:</div>'
+            for fn in fn_text:
+                footnotes_html += f'<div class="fn-item">{fn}</div>'
+            footnotes_html += "</div>"
+        # Paragraph block
+        content_html += f"""
+<div class="para" id="{pid}">
+    <h2>{title}</h2>
+    <div>{'<br>'.join(main_text)}</div>
+    {footnotes_html}
+</div>
+"""
+    html = VIEW_TEMPLATE.replace("<!-- SIDEBAR_LINKS -->", sidebar_links)
+    html = html.replace("<!-- PARAGRAPH_CONTENT -->", content_html)
     return html
 # -------------------------------------------------------------------
+# 3. UPLOAD TO SUPABASE STORAGE
 # -------------------------------------------------------------------
 def upload_html():
     html = build_html()
+    supabase.storage.from_("hg_viewer").update(
+        "hg_clean.html",
+        html.encode("utf-8"),
+        {
+            "content-type": "text/html",
+            "x-upsert": "true"
+        }
+    )
+    print("✔ hg_clean.html uploaded!")
 if __name__ == "__main__":
+    upload_html()

load_documents.py CHANGED Viewed

@@ -1,200 +1,130 @@
 """
-load_documents.py
-Cải thiện việc load tài liệu với xử lý lỗi tốt hơn
 """
 from huggingface_hub import hf_hub_download, list_repo_files
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_core.documents import Document
 from bs4 import BeautifulSoup
-import requests
-import re
-from typing import List, Optional
 DATASET = "Nguyen5/docs"
 PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
-HTML_FILE = "Hochschulgesetz_NRW.html"
-def clean_html_content(text: str) -> str:
-    """Làm sạch nội dung HTML"""
-    # Loại bỏ khoảng trắng thừa
-    text = re.sub(r'\s+', ' ', text)
-    # Chuẩn hóa dấu câu
-    text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
-    # Đảm bảo chữ cái đầu câu viết hoa
-    sentences = text.split('. ')
-    sentences = [s.strip().capitalize() for s in sentences if s.strip()]
-    return '. '.join(sentences)
-def load_recht_nrw_direct() -> List[Document]:
-    """Tải trực tiếp từ recht.nrw.de"""
-    print(">>> Lade Hochschulgesetz NRW direkt von recht.nrw.de...")
-    url = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
-    try:
-        headers = {
-            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
-        }
-        response = requests.get(url, headers=headers, timeout=60)
-        response.raise_for_status()
-        soup = BeautifulSoup(response.text, 'html.parser')
-        docs = []
-        # Tìm tất cả các paragraph
-        for i, element in enumerate(soup.find_all(['p', 'div', 'td'])):
-            text = element.get_text(" ", strip=True)
-            # Chỉ lấy các phần có chứa §
-            if '§' in text:
-                # Tách title và content
-                lines = text.split('\n')
-                title = lines[0].strip() if lines else f"§ {i+1}"
-                content = " ".join(lines[1:]) if len(lines) > 1 else text
-                metadata = {
-                    "source": "Hochschulgesetz NRW (Website)",
-                    "filename": "recht_nrw_direct.html",
-                    "paragraph_id": f"hg_direct_{i+1}",
-                    "url": url
-                }
-                doc = Document(
-                    page_content=clean_html_content(content),
-                    metadata=metadata
-                )
-                docs.append(doc)
-        print(f"✅ {len(docs)} Paragraphs direkt von recht.nrw.de geladen.")
-        return docs
-    except Exception as e:
-        print(f"❌ Fehler beim Laden von recht.nrw.de: {e}")
-        return []
-def _load_hg_paragraph_documents(html_path: str) -> List[Document]:
     """
-    Lädt Paragraphs aus dem gespeicherten HTML
     """
-    try:
-        with open(html_path, "r", encoding="utf-8") as f:
-            html = f.read()
-        soup = BeautifulSoup(html, "html.parser")
-        docs = []
-        # Suche nach allen relevanten Inhalten
-        for i, p in enumerate(soup.find_all(['p', 'div', 'section'])):
-            text = p.get_text(" ", strip=True)
-            if not text or len(text) < 10:
-                continue
-            # Check if it's a paragraph
-            if '§' in text or 'Artikel' in text:
-                pid = p.get("id", f"hg_para_{i+1}")
-                metadata = {
-                    "source": "Hochschulgesetz NRW (HTML)",
-                    "filename": HTML_FILE,
-                    "paragraph_id": pid,
-                    "type": "paragraph"
-                }
-                docs.append(Document(
-                    page_content=clean_html_content(text),
-                    metadata=metadata
-                ))
-        print(f"✅ {len(docs)} Paragraphs aus HTML geladen.")
-        return docs
-    except Exception as e:
-        print(f"❌ Fehler beim Laden des HTML: {e}")
-        return []
-def load_pdf_documents() -> List[Document]:
-    """Lädt PDF-Dokumente"""
-    print(">>> Lade PDF-Dokumente...")
     try:
         pdf_path = hf_hub_download(
             repo_id=DATASET,
             filename=PDF_FILE,
             repo_type="dataset",
         )
-        print(f"✅ PDF heruntergeladen: {pdf_path}")
-        # Load PDF with PyPDFLoader
         pdf_docs = PyPDFLoader(pdf_path).load()
-        # Enhance metadata
-        for i, doc in enumerate(pdf_docs):
-            doc.metadata.update({
-                "source": "Prüfungsordnung (PDF)",
-                "filename": PDF_FILE,
-                "document_type": "exam_regulation",
-                "chunk_index": i
-            })
-        print(f"✅ {len(pdf_docs)} Seiten aus PDF geladen.")
-        return pdf_docs
     except Exception as e:
-        print(f"❌ Fehler beim Laden des PDF: {e}")
         return []
-def load_documents() -> List[Document]:
-    """
-    Hauptfunktion zum Laden aller Dokumente
-    """
-    print("=== START: load_documents() ===\n")
-    all_docs = []
-    # 1. Load PDF documents
-    pdf_docs = load_pdf_documents()
-    all_docs.extend(pdf_docs)
-    # 2. Try loading from dataset HTML
-    print(">>> Versuche, HTML aus Dataset zu laden...")
     try:
         html_path = hf_hub_download(
             repo_id=DATASET,
             filename=HTML_FILE,
             repo_type="dataset",
         )
-        print(f"✅ HTML heruntergeladen: {html_path}")
         html_docs = _load_hg_paragraph_documents(html_path)
-        all_docs.extend(html_docs)
     except Exception as e:
-        print(f"⚠️ Konnte HTML nicht aus Dataset laden: {e}")
-        # 3. Fallback: Load directly from website
-        print(">>> Fallback: Lade direkt von recht.nrw.de...")
-        web_docs = load_recht_nrw_direct()
-        all_docs.extend(web_docs)
-    print(f"\n=== DONE: {len(all_docs)} Dokumente geladen ===")
-    # Print summary
-    pdf_count = len([d for d in all_docs if "PDF" in d.metadata.get("source", "")])
-    html_count = len([d for d in all_docs if "HTML" in d.metadata.get("source", "")])
-    web_count = len([d for d in all_docs if "Website" in d.metadata.get("source", "")])
-    print(f"📊 Zusammenfassung:")
-    print(f"   - PDF-Seiten: {pdf_count}")
-    print(f"   - HTML-Paragraphs: {html_count}")
-    print(f"   - Web-Paragraphs: {web_count}")
-    return all_docs
 if __name__ == "__main__":
     docs = load_documents()
-    if docs:
-        print(f"\nErstes Dokument (Beispiel):")
-        print(f"Content: {docs[0].page_content[:200]}...")
-        print(f"Metadata: {docs[0].metadata}")

 """
+BƯỚC 1: LOAD DOCUMENTS
+-----------------------
+Debug-full version
+- Lädt Prüfungsordnung (PDF) seitenweise.
+- Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
+  und zerlegt es in einzelne Absätze (Document pro <p>).
 """
 from huggingface_hub import hf_hub_download, list_repo_files
 from langchain_community.document_loaders import PyPDFLoader
 from langchain_core.documents import Document
 from bs4 import BeautifulSoup
 DATASET = "Nguyen5/docs"
 PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
+HTML_FILE = "Hochschulgesetz_NRW.html"  # konsistent mit hg_nrw.py
def _load_hg_paragraph_documents(html_path: str):
    """Turn the stored Hochschulgesetz HTML into one Document per ``<p>`` element.

    Elements whose text is empty are skipped. Each Document carries:
      - page_content: the text of the ``<p>`` element
      - metadata:
          source       = "Hochschulgesetz NRW (HTML)"
          filename     = HTML_FILE
          paragraph_id = the element's id attribute (e.g. 'hg_abs_12'),
                         only when the element has one
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, "html.parser")

    docs = []
    for node in soup.find_all("p"):
        text = node.get_text(" ", strip=True)
        if text:
            metadata = {
                "source": "Hochschulgesetz NRW (HTML)",
                "filename": HTML_FILE,
            }
            element_id = node.get("id")
            if element_id:
                metadata["paragraph_id"] = element_id
            docs.append(Document(page_content=text, metadata=metadata))

    print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
    return docs
def _load_pdf_page_documents():
    """Download the exam-regulation PDF and return its pages, or [] on any failure."""
    print(">>> Step 1: Download PDF from HuggingFace...")
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as e:
        print("ERROR downloading PDF:", e)
        return []

    print(">>> Step 1.1: Loading PDF pages...")
    try:
        pdf_docs = PyPDFLoader(pdf_path).load()
        print(f"Loaded {len(pdf_docs)} PDF pages.\n")
    except Exception as e:
        print("ERROR loading PDF:", e)
        return []

    for d in pdf_docs:
        d.metadata["source"] = "Prüfungsordnung (PDF)"
        d.metadata["filename"] = PDF_FILE
    return pdf_docs


def _download_hg_paragraph_documents():
    """Download the Hochschulgesetz HTML and return its paragraphs, or [] on any failure."""
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as e:
        print("ERROR downloading HTML:", e)
        return []

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        return _load_hg_paragraph_documents(html_path)
    except Exception as e:
        print("ERROR loading / parsing HTML:", e)
        return []


def load_documents():
    """Load all source documents: exam-regulation PDF pages plus Hochschulgesetz paragraphs.

    Loading is best-effort per source: a source that cannot be downloaded or
    parsed is reported on stdout and contributes no documents, while the other
    source is still loaded. (Previously a PDF failure returned an empty list
    without even attempting the HTML.)

    Returns:
        A list of Documents, PDF pages first, then HTML paragraphs; empty if
        both sources failed.
    """
    print("=== START: load_documents() ===\n")

    # The file listing is diagnostic output only, so a failure here (e.g. a
    # network error) must not abort the actual loading.
    print(">>> Checking dataset file list from HuggingFace...")
    try:
        files = list_repo_files(DATASET, repo_type="dataset")
        print("Files in dataset:", files, "\n")
    except Exception as e:
        print("ERROR listing dataset files:", e)

    docs = []
    docs.extend(_load_pdf_page_documents())
    docs.extend(_download_hg_paragraph_documents())

    print("=== DONE: load_documents() ===\n")
    return docs
 if __name__ == "__main__":
+    print("\n=== Running load_documents.py directly ===\n")
     docs = load_documents()
+    print(f"\n>>> TOTAL documents loaded: {len(docs)}")
+    if len(docs):
+        print("\nExample metadata from 1st document:")
+        print(docs[0].metadata)

rag_pipeline.py CHANGED Viewed

@@ -1,197 +1,194 @@
 """
-RAG PIPELINE – Verbesserte Version mit präzisen Prompts
 """
 from typing import List, Dict, Any, Tuple
 from langchain_core.messages import SystemMessage, HumanMessage
-from langchain_core.documents import Document
-import re
 # URLs für Quellen
-PDF_BASE_URL = "https://huggingface.co/datasets/Nguyen5/docs/resolve/main/f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
-LAW_VIEWER_URL = "https://YOUR_SUPABASE_URL/storage/v1/object/public/hg_viewer/hg_viewer.html"
-MAX_CHARS = 1000
-def format_chunk_content(chunk: Document) -> str:
-    """Format chunk content for better readability"""
-    content = chunk.page_content
-    # Remove excessive whitespace
-    content = re.sub(r'\s+', ' ', content)
-    # Ensure proper sentence endings
-    if not content.strip().endswith(('.', '!', '?')):
-        content = content.strip() + '.'
-    return content[:MAX_CHARS]
-def build_sources_metadata(docs: List[Document]) -> List[Dict[str, Any]]:
     """
-    Erzeugt strukturierte Quellen-Informationen
     """
-    sources = []
-    for i, doc in enumerate(docs, 1):
-        metadata = doc.metadata
-        source_type = metadata.get("source", "")
-        page = metadata.get("page")
-        para_id = metadata.get("paragraph_id", "")
-        # Prepare snippet
-        snippet = format_chunk_content(doc)
-        if len(snippet) > 300:
-            snippet = snippet[:297] + "..."
-        # Determine URL
-        url = None
-        if "PDF" in source_type:
             if isinstance(page, int):
                 url = f"{PDF_BASE_URL}#page={page + 1}"
             else:
                 url = PDF_BASE_URL
-        elif "HTML" in source_type or "Website" in source_type:
             if para_id:
-                url = f"{LAW_VIEWER_URL}#{para_id}"
             else:
-                url = LAW_VIEWER_URL
-        # Build source info
-        source_info = {
-            "id": i,
-            "source": source_type,
-            "page": page + 1 if isinstance(page, int) else None,
-            "paragraph_id": para_id,
-            "url": url,
-            "snippet": snippet,
-            "content_preview": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
-        }
-        sources.append(source_info)
-    return sources
-def format_context(docs: List[Document]) -> str:
-    """
-    Formatiert den Kontext für den Prompt
-    """
     if not docs:
-        return "KEIN_RELEVANTER_KONTEXT_GEFUNDEN"
-    context_parts = []
-    for i, doc in enumerate(docs, 1):
-        content = format_chunk_content(doc)
-        metadata = doc.metadata
-        # Build source description
-        source_desc = metadata.get("source", "Unbekannte Quelle")
-        if "page" in metadata and metadata["page"] is not None:
-            source_desc += f", Seite {metadata['page'] + 1}"
-        if "paragraph_id" in metadata:
-            source_desc += f", {metadata['paragraph_id']}"
-        context_parts.append(f"【Quelle {i}】{source_desc}\n{content}")
-    return "\n\n".join(context_parts)
-# ========== IMPROVED SYSTEM PROMPT ==========
-SYSTEM_PROMPT = """
-Du bist ein hochpräziser juristischer Assistenz-Chatbot für Prüfungsrecht an Hochschulen in Nordrhein-Westfalen.
-Deine Wissensbasis umfasst ausschließlich:
-1. Die spezifische Prüfungsordnung (PDF-Dokument)
-2. Das Hochschulgesetz NRW (Hochschulgesetz - HG)
-❗ STRENGE ANWEISUNGEN:
-1. **AUSSCHLIESSLICHE KONTEXTNUTZUNG:**
-   - Verwende NUR die bereitgestellten Quellen aus der Wissensbasis.
-   - Wenn Informationen nicht im Kontext stehen, sage explizit: "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten."
-   - KEINE Vermutungen, Spekulationen oder externes Wissen.
-2. **PRÄZISE JURISTISCHE ANTWORTEN:**
-   - Formuliere in vollständigen, grammatikalisch korrekten Sätzen.
-   - Verwende präzise juristische Sprache, aber bleibe verständlich.
-   - Strukturiere komplexe Antworten mit Absätzen oder Aufzählungen.
-3. **QUELLENNACHWEISE:**
-   - Verweise immer auf die konkrete Quelle (Prüfungsordnung §X oder Hochschulgesetz §Y).
-   - Bei der Prüfungsordnung gib die Seite an.
-   - Beim Hochschulgesetz verweise auf den Paragraphen.
-4. **ANTWORTSTRUKTUR:**
-   a) Kurze präzise Antwort zuerst
-   b) Detaillierte Erklärung mit Quellenangaben
-   c) Falls relevant: praktische Hinweise basierend auf dem Kontext
-5. **FEHLENDE INFORMATIONEN:**
-   - Wenn der Kontext unvollständig ist, erkläre, welche Informationen fehlen.
-   - Biete an, nur die vorhandenen Informationen zusammenzufassen.
-6. **SPRACHE:**
-   - Verwende ausschließlich formelles Deutsch.
-   - Vermeide Umgangssprache und Abkürzungen.
-Deine Antworten müssen rechtlich korrekt, vollständig und nachprüfbar sein.
-"""
-def create_human_prompt(question: str, context: str) -> str:
-    """
-    Erstellt optimierten Human Prompt
-    """
-    return f"""FRAGE DES NUTZERS:
-{question}
-VERFÜGBARE RECHTSQUELLEN:
-{context if context else "KEINE RELEVANTEN QUELLEN GEFUNDEN"}
-AUFGABE:
-Beantworte die Frage ausschließlich auf Basis der oben genannten Rechtsquellen.
-ANFORDERUNGEN:
-1. Gib eine präzise juristische Antwort in vollständigen Sätzen.
-2. Zitiere konkret:
-   - Für die Prüfungsordnung: "Laut Prüfungsordnung, §X auf Seite Y, ..."
-   - Für das Hochschulgesetz: "Gemäß Hochschulgesetz NRW §Z, ..."
-3. Wenn mehrere Quellen relevant sind, erwähne alle.
-4. Wenn Informationen fehlen, erkläre dies klar.
-5. Strukturiere die Antwort logisch.
-ANTWORT (auf Deutsch):"""
 def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
     """
-    Haupt-RAG-Funktion mit verbessertem Prompting
     """
-    # 1. Retrieve relevant documents
     docs = retriever.invoke(question)
-    # 2. Format context
     context_str = format_context(docs)
-    # 3. Create prompt
-    human_prompt = create_human_prompt(question, context_str)
-    # 4. Call LLM
-    messages = [
         SystemMessage(content=SYSTEM_PROMPT),
-        HumanMessage(content=human_prompt)
     ]
-    try:
-        result = chat_model.invoke(messages)
-        answer_text = result.content.strip()
-        # Clean up answer
-        answer_text = re.sub(r'\n\s*\n+', '\n\n', answer_text)  # Remove excessive newlines
-        answer_text = answer_text.replace("KEINE RELEVANTEN QUELLEN GEFUNDEN",
-                                         "Auf Basis der vorliegenden Dokumente kann ich diese Frage nicht sicher beantworten.")
-    except Exception as e:
-        answer_text = f"Fehler bei der Generierung der Antwort: {str(e)}"
-    # 5. Build sources metadata
     sources = build_sources_metadata(docs)
-    return answer_text, sources

 """
+RAG PIPELINE – Version 26.11 (ohne Modi, stabil, juristisch korrekt)
 """
 from typing import List, Dict, Any, Tuple
 from langchain_core.messages import SystemMessage, HumanMessage
+from load_documents import DATASET, PDF_FILE, HTML_FILE
+# -------------------------------------------------------------------
 # URLs für Quellen
+# -------------------------------------------------------------------
+# Direktes PDF im Dataset (für #page)
+PDF_BASE_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{PDF_FILE}"
+# Hochschulgesetz-HTML im Dataset (enthält <p id="hg_abs_X"> …)
+LAW_DATASET_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{HTML_FILE}"
+# Offizielle Recht.NRW-Druckversion (für Viewer im Frontend)
+LAW_URL = (
+    "https://recht.nrw.de/lmi/owa/br_bes_text?"
+    "print=1&anw_nr=2&gld_nr=2&ugl_nr=221&val=28364&ver=0&"
+    "aufgehoben=N&keyword=&bes_id=28364&show_preview=1"
+)
+MAX_CHARS = 900
+# -----------------------------
+# Quellen formatieren
+# -----------------------------
def build_sources_metadata(docs: List, snippet_chars: int = 300) -> List[Dict[str, Any]]:
    """Build a list of structured source descriptions for the retrieved chunks.

    Args:
        docs: Retrieved chunks; each needs ``metadata`` (dict) and
            ``page_content`` (str). ``None`` or an empty list yields ``[]``.
        snippet_chars: Maximum length of the ``snippet`` preview.

    Returns:
        One dict per chunk, in input order:
        [
          {
            "id": 1,            # 1-based position in ``docs``
            "source": "Prüfungsordnung (PDF)" / "Hochschulgesetz NRW (HTML)",
            "page": 3,          # 1-based; None for the law HTML or when unknown
            "url": "...",       # direct click link, None for unknown sources
            "snippet": "first characters of the chunk, newlines as spaces"
          },
          ...
        ]
    """
    srcs = []
    for i, d in enumerate(docs or [], start=1):
        meta = d.metadata
        # "source" may be missing or explicitly None; normalise both to "" so
        # the substring checks below cannot raise.
        src = meta.get("source") or ""
        page = meta.get("page")
        snippet = d.page_content[:snippet_chars].replace("\n", " ")

        # PDF link
        if "Prüfungsordnung" in src:
            if isinstance(page, int):
                # PyPDFLoader: page is 0-based, the viewer anchor is 1-based
                url = f"{PDF_BASE_URL}#page={page + 1}"
            else:
                url = PDF_BASE_URL
        # NRW law (HTML in the dataset with paragraph ids)
        elif "Hochschulgesetz" in src:
            para_id = meta.get("paragraph_id")
            if para_id:
                # The click leads straight to the paragraph in the dataset HTML
                url = f"{LAW_DATASET_URL}#{para_id}"
            else:
                # Fallback: official print version (no paragraph anchor)
                url = LAW_URL
            page = None  # no page number for the law HTML
        else:
            url = None

        srcs.append(
            {
                "id": i,
                "source": src,
                "page": page + 1 if isinstance(page, int) else None,
                "url": url,
                "snippet": snippet,
            }
        )
    return srcs
+# -----------------------------
+# Kontext formatieren
+# -----------------------------
def format_context(docs, max_chars=None):
    """Render retrieved documents as numbered context sections for the prompt.

    Every document becomes a section of the form
    ``[KONTEXT n] (<source label>)`` followed by the (truncated) chunk text;
    sections are separated by a blank line.

    Args:
        docs: Retrieved documents; each must expose ``metadata`` (a mapping)
            and ``page_content`` (a string).
        max_chars: Maximum number of characters taken from each chunk.
            ``None`` (the default) uses the module-level ``MAX_CHARS``.

    Returns:
        The formatted context string, or a fixed placeholder sentence when
        ``docs`` is empty or ``None``.
    """
    if not docs:
        return "(Kein relevanter Kontext im Dokument gefunden.)"

    limit = MAX_CHARS if max_chars is None else max_chars
    sections = []
    for number, doc in enumerate(docs, start=1):
        text = doc.page_content[:limit]
        src = doc.metadata.get("source")
        page = doc.metadata.get("page")
        if "Prüfungsordnung" in (src or "") and isinstance(page, int):
            # Page metadata is 0-based; show it 1-based to the model.
            label = f"{src}, Seite {page + 1}"
        else:
            # Avoid leaking a literal "None" into the prompt when the source
            # metadata is missing.
            label = src or "Unbekannte Quelle"
        sections.append(f"[KONTEXT {number}] ({label})\n{text}")
    return "\n\n".join(sections)
+# -----------------------------
+# Systemprompt — verschärft
+# -----------------------------
# System prompt (German) passed as the SystemMessage on every call to
# answer(). It restricts the model to the supplied KONTEXT sections, forbids
# speculation and outside knowledge, and asks it to cite the legal basis, the
# document and (for the examination regulations) the page where available.
# The text is runtime behaviour: changing any character changes the prompt.
SYSTEM_PROMPT = """
Du bist ein hochpräziser juristischer Chatbot für Prüfungsrecht
mit Zugriff nur auf:
- die Prüfungsordnung (als PDF) und
- das Hochschulgesetz NRW (als HTML aus der offiziellen Druckversion).
Strenge Regeln:
1. Antworte ausschließlich anhand des bereitgestellten Kontextes
   (KONTEXT-Abschnitte). Wenn die Information nicht im Kontext steht,
   sage ausdrücklich, dass dies aus den vorliegenden Dokumenten nicht
   hervorgeht und du dazu nichts Sicheres sagen kannst.
2.
   Keine Spekulationen, keine Vermutungen.
3. Antworte in zusammenhängenden, ganzen Sätzen. Verwende keine Mischung aus Deutsch und Englisch.
4. Nenne, soweit aus dem Kontext erkennbar,
   - die rechtliche Grundlage (z.B. Paragraph, Artikel),
   - das Dokument (Prüfungsordnung / Hochschulgesetz NRW),
   - die Seite (bei der Prüfungsordnung), wenn im Kontext vorhanden.
5. Füge KEINE externen Informationen hinzu, z.B. aus anderen Gesetzen,
   Webseiten oder allgemeinem Wissen. Nur das, was im Kontext steht,
   darf in der Antwort verwendet werden.
Wenn der Kontext keine eindeutige Antwort zulässt, erkläre klar,
warum keine sichere Antwort möglich ist und welche Informationen
im Dokument fehlen.
"""
+# -----------------------------
+# Hauptfunktion
+# -----------------------------
 def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
     """
+    Haupt-RAG-Funktion:
+    - ruft retriever.invoke(question) auf,
+    - baut einen präzisen Prompt mit KONTEXT,
+    - ruft LLM auf,
+    - gibt Antworttext + Quellenliste zurück.
     """
+    # 1. Dokumente holen
     docs = retriever.invoke(question)
     context_str = format_context(docs)
+    # 2. Prompt bauen
+    human = f"""
+FRAGE:
+{question}
+NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
+{context_str}
+AUFGABE:
+Formuliere eine juristisch korrekte, gut verständliche Antwort
+ausschließlich anhand des obigen Kontextes.
+- Wenn der Kontext aus den Dokumenten eine klare Antwort erlaubt,
+  erläutere diese strukturiert und in vollständigen Sätzen.
+- Wenn der Kontext KEINE klare Antwort erlaubt oder wichtige Informationen
+  fehlen, erkläre das offen und formuliere KEINE Vermutung.
+"""
+    msgs = [
         SystemMessage(content=SYSTEM_PROMPT),
+        HumanMessage(content=human),
     ]
+    # 3. LLM aufrufen
+    result = chat_model.invoke(msgs)
+    answer_text = result.content.strip()
+    # 4. Quellenliste bauen
     sources = build_sources_metadata(docs)
+    return answer_text, sources