Spaces:

Nguyen5
/

chatbot

Sleeping

App Files Files Community

Nguyen5 committed on Dec 4, 2025

Commit

24753ba

1 Parent(s): 93b6370

commit

Browse files

Files changed (2) hide show

app.py +172 -129
load_documents.py +89 -50

app.py CHANGED Viewed

@@ -1,169 +1,212 @@
-"""
-load_documents.py – Improved Clean Version
-------------------------------------------
-Lädt:
-1) Prüfungsordnung (PDF) seitenweise.
-2) Hochschulgesetz NRW aus generierter HTML-Datei
-   (hg_clean.html oder Hochschulgesetz_NRW.html)
-   und erzeugt pro Absatz (<p>) ein Document.
-Verbesserungen:
-- Keine HTML-Rohartefakte
-- Kein Abbrechen in der Mitte von Sätzen
-- Entfernt doppelte Leerzeichen
-- metadata.paragraph_id wird sauber übernommen
-"""
-from huggingface_hub import hf_hub_download, list_repo_files
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_core.documents import Document
-from bs4 import BeautifulSoup
-DATASET = "Nguyen5/docs"
-PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
-HTML_FILE = "Hochschulgesetz_NRW.html"  # stored inside dataset
-# ================================================================
-# Hilfsfunktion: lädt HG-Absätze sauber & robust
-# ================================================================
-def _load_hg_paragraph_documents(html_path: str):
-    """
-    Liest Hochschulgesetz NRW HTML ein und erzeugt pro <p>-Tag ein Document.
-    Verbesserungen:
-    - Entfernt doppelte Leerzeichen -> " ".join(text.split())
-    - Entfernt leere Texte
-    - Übernimmt paragraph_id (id="hg_abs_12" oder id="para_12")
-    """
-    with open(html_path, "r", encoding="utf-8") as f:
-        html = f.read()
-    soup = BeautifulSoup(html, "html.parser")
-    docs = []
-    for p in soup.find_all("p"):
-        text = p.get_text(" ", strip=True)
-        if not text:
-            continue
-        # normalize whitespace
-        text = " ".join(text.split())
-        paragraph_id = p.get("id")
-        metadata = {
-            "source": "Hochschulgesetz NRW (HTML)",
-            "filename": HTML_FILE,
-        }
-        if paragraph_id:
-            metadata["paragraph_id"] = paragraph_id
-        docs.append(
-            Document(
-                page_content=text,
-                metadata=metadata
-            )
-        )
-    print(f"[HG] Loaded {len(docs)} paragraph Documents.\n")
-    return docs
-# ================================================================
-# Hauptfunktion: lädt PDF + HG-HTML
-# ================================================================
-def load_documents():
-    print("\n=== START: load_documents() ===\n")
-    docs = []
-    # ------------------------------------------------------------
-    # 1) Dateien prüfen
-    # ------------------------------------------------------------
-    print(">>> Checking dataset on HuggingFace ...")
-    files = list_repo_files(DATASET, repo_type="dataset")
-    print("Files found:", files, "\n")
-    # ------------------------------------------------------------
-    # 2) PDF laden
-    # ------------------------------------------------------------
-    print(">>> Downloading Prüfungsordnung PDF ...")
-    try:
-        pdf_path = hf_hub_download(
-            repo_id=DATASET,
-            filename=PDF_FILE,
-            repo_type="dataset",
-        )
-        print(f"PDF downloaded:\n{pdf_path}\n")
-    except Exception as e:
-        print("ERROR downloading PDF:", e)
-        return []
-    print(">>> Loading PDF pages ...")
-    try:
-        pdf_docs = PyPDFLoader(pdf_path).load()
-    except Exception as e:
-        print("ERROR loading PDF:", e)
-        return []
-    print(f"Loaded {len(pdf_docs)} PDF pages.\n")
-    # metadata ergänzen
-    for d in pdf_docs:
-        d.metadata["source"] = "Prüfungsordnung (PDF)"
-        d.metadata["filename"] = PDF_FILE
-    docs.extend(pdf_docs)
-    # ------------------------------------------------------------
-    # 3) HTML laden
-    # ------------------------------------------------------------
-    print(">>> Downloading Hochschulgesetz HTML ...")
-    try:
-        html_path = hf_hub_download(
-            repo_id=DATASET,
-            filename=HTML_FILE,
-            repo_type="dataset",
-        )
-        print(f"HTML downloaded:\n{html_path}\n")
-    except Exception as e:
-        print("ERROR downloading HTML:", e)
-        return docs  # PDF at least loaded
-    print(">>> Parsing HG HTML into paragraphs ...")
-    try:
-        html_docs = _load_hg_paragraph_documents(html_path)
-    except Exception as e:
-        print("ERROR parsing HTML:", e)
-        return docs
-    docs.extend(html_docs)
-    print(f"=== DONE: load_documents() → total {len(docs)} documents ===\n")
-    return docs
-# ================================================================
-# Debug
-# ================================================================
 if __name__ == "__main__":
-    print("\n=== Running load_documents.py ===\n")
-    documents = load_documents()
-    print(f"\n>>> TOTAL documents loaded: {len(documents)}")
-    if len(documents):
-        print("\nExample Document:")
-        print(documents[0].page_content[:300])
-        print("Metadata:", documents[0].metadata)

+# app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
+# Version 26.11 – ohne Modi, stabil für Text + Voice
+import gradio as gr
+from gradio_pdf import PDF
+from huggingface_hub import hf_hub_download
+from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
+from split_documents import split_documents
+from vectorstore import build_vectorstore
+from retriever import get_retriever
+from llm import load_llm
+from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
+from speech_io import transcribe_audio, synthesize_speech
+# =====================================================
+# INITIALISIERUNG (global)
+# =====================================================
+# Module-level pipeline setup: these statements run once, at import time.
+# Each step consumes the result of the previous one
+# (documents -> chunks -> vector store -> retriever).
+print("🔹 Lade Dokumente ...")
+_docs = load_documents()
+print("🔹 Splitte Dokumente ...")
+_chunks = split_documents(_docs)
+print("🔹 Baue VectorStore (FAISS) ...")
+_vs = build_vectorstore(_chunks)
+print("🔹 Erzeuge Retriever ...")
+_retriever = get_retriever(_vs)
+print("🔹 Lade LLM ...")
+_llm = load_llm()
+print("🔹 Lade Dateien für Viewer …")
+# Fetch the viewer files from the Hugging Face dataset repo. These calls are
+# not wrapped in try/except, so a failed download aborts the import.
+_pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
+# NOTE(review): _html_path is not referenced again in the visible code (the
+# viewer embeds LAW_URL in an iframe instead) — confirm it is still needed.
+_html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
+# =====================================================
+# Quellen formatieren – Markdown für Chat
+# =====================================================
+def format_sources_markdown(sources):
+    # Render the source entries as a Markdown block for the chat window.
+    # Returns "" when `sources` is empty or None.
+    # Every entry is indexed with the keys "id", "source", "page", "url" and
+    # "snippet"; a missing key raises KeyError.
+    if not sources:
+        return ""
+    # The leading "" makes the joined result start with a newline, which
+    # separates this block from the answer text it is concatenated to.
+    lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
+    for s in sources:
+        sid = s["id"]
+        src = s["source"]
+        page = s["page"]
+        url = s["url"]
+        snippet = s["snippet"]
+        title = f"Quelle {sid} – {src}"
+        # Link the title only when a URL is available.
+        if url:
+            base = f"- [{title}]({url})"
+        else:
+            base = f"- {title}"
+        # The page number is shown only for sources whose name contains
+        # "Prüfungsordnung"; a falsy page (None, 0, "") is omitted.
+        if page and "Prüfungsordnung" in src:
+            base += f", Seite {page}"
+        lines.append(base)
+        # The snippet is emitted as an indented Markdown blockquote line.
+        if snippet:
+            lines.append(f"  > {snippet}")
+    return "\n".join(lines)
+# =====================================================
+# TEXT CHATBOT
+# =====================================================
+def chatbot_text(user_message, history):
+    # Handle a typed question: call `answer` with the module-level retriever
+    # and LLM, then append the user/assistant turn to the chat history.
+    # Returns (history, ""); in the UI wiring the second value is routed to
+    # the `msg` textbox, so the input field is emptied after each call.
+    # Empty input: leave the history unchanged.
+    if not user_message:
+        return history, ""
+    answer_text, sources = answer(
+        question=user_message,
+        retriever=_retriever,
+        chat_model=_llm,
+    )
+    quellen_block = format_sources_markdown(sources)
+    # `history + [...]` builds a new list instead of mutating the argument.
+    # Messages are dicts with "role" and "content" keys.
+    history = history + [
+        {"role": "user", "content": user_message},
+        {"role": "assistant", "content": answer_text + quellen_block},
+    ]
+    return history, ""
+# =====================================================
+# VOICE CHATBOT
+# =====================================================
+def chatbot_voice(audio_path, history):
+    # Handle a spoken question: transcribe it, answer it via `answer`, and
+    # synthesize the reply as speech.
+    # Returns (history, audio, ""); in the UI wiring these are routed to the
+    # chatbot, the `voice_out` player and the `msg` textbox respectively.
+    # 1. Speech → Text
+    text = transcribe_audio(audio_path)
+    # Nothing transcribed: leave the history unchanged and return no audio.
+    if not text:
+        return history, None, ""
+    # Store the transcribed question in the chat history
+    history = history + [{"role": "user", "content": text}]
+    # 2. RAG answer
+    answer_text, sources = answer(
+        question=text,
+        retriever=_retriever,
+        chat_model=_llm,
+    )
+    quellen_block = format_sources_markdown(sources)
+    bot_msg = answer_text + quellen_block
+    history = history + [{"role": "assistant", "content": bot_msg}]
+    # 3. Text → Speech
+    # NOTE(review): bot_msg includes the Markdown sources block (titles, links,
+    # snippets), so that text is passed to TTS as well — confirm this is intended.
+    audio = synthesize_speech(bot_msg)
+    return history, audio, ""
+# =====================================================
+# LAST ANSWER → TTS
+# =====================================================
+def read_last_answer(history):
+    # Re-synthesize the most recent assistant message as speech.
+    # Returns None when the history is empty/None or contains no message
+    # whose "role" is "assistant".
+    if not history:
+        return None
+    # Walk the history backwards so the newest assistant message is found first.
+    for msg in reversed(history):
+        if msg["role"] == "assistant":
+            return synthesize_speech(msg["content"])
+    return None
+# =====================================================
+# UI – GRADIO
+# =====================================================
+# Gradio layout: the left column holds the chat and its controls, the right
+# column holds the PDF viewer and an iframe pointing at LAW_URL.
+with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
+    gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
+    gr.Markdown(
+        "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
+        "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
+        "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            # NOTE(review): no `type=` is passed here, while the handlers emit
+            # {"role": ..., "content": ...} dicts — confirm the installed Gradio
+            # version treats that message format as the default.
+            chatbot = gr.Chatbot(label="Chat", height=500)
+            msg = gr.Textbox(
+                label="Frage eingeben",
+                placeholder="Stelle deine Frage zum Prüfungsrecht …",
+            )
+            # SEND TEXT: pressing Enter and clicking the button both call
+            # chatbot_text with the same inputs and outputs.
+            msg.submit(
+                chatbot_text,
+                [msg, chatbot],
+                [chatbot, msg]
+            )
+            send_btn = gr.Button("Senden (Text)")
+            send_btn.click(
+                chatbot_text,
+                [msg, chatbot],
+                [chatbot, msg]
+            )
+            # VOICE INPUT
+            gr.Markdown("### 🎙️ Spracheingabe")
+            voice_in = gr.Audio(sources=["microphone"], type="filepath")
+            voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
+            voice_btn = gr.Button("Sprechen & senden")
+            # chatbot_voice returns (history, audio, ""), mapped onto the three outputs.
+            voice_btn.click(
+                chatbot_voice,
+                [voice_in, chatbot],
+                [chatbot, voice_out, msg]
+            )
+            read_btn = gr.Button("🔁 Antwort erneut vorlesen")
+            read_btn.click(
+                read_last_answer,
+                [chatbot],
+                [voice_out]
+            )
+            clear_btn = gr.Button("Chat zurücksetzen")
+            # Reset the chat by replacing the history with an empty list.
+            clear_btn.click(lambda: [], None, chatbot)
+        # =====================
+        # RIGHT COLUMN: viewer
+        # =====================
+        with gr.Column(scale=1):
+            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
+            PDF(_pdf_path, height=350)
+            gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
+            gr.HTML(
+                f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
+            )
 if __name__ == "__main__":
+    demo.queue().launch(ssr_mode=False, show_error=True)

load_documents.py CHANGED Viewed

@@ -1,11 +1,18 @@
 """
-BƯỚC 1: LOAD DOCUMENTS
------------------------
-Debug-full version
-- Lädt Prüfungsordnung (PDF) seitenweise.
-- Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
-  und zerlegt es in einzelne Absätze (Document pro <p>).
 """
 from huggingface_hub import hf_hub_download, list_repo_files
@@ -15,22 +22,28 @@ from bs4 import BeautifulSoup
 DATASET = "Nguyen5/docs"
 PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
-HTML_FILE = "Hochschulgesetz_NRW.html"  # konsistent mit hg_nrw.py
 def _load_hg_paragraph_documents(html_path: str):
     """
-    Liest das generierte Hochschulgesetz-HTML ein und erzeugt
-    pro <p>-Element einen LangChain-Document mit:
-      - page_content = Text des Absatzes
-      - metadata:
-          source       = "Hochschulgesetz NRW (HTML)"
-          filename     = HTML_FILE
-          paragraph_id = id-Attribut (z.B. 'hg_abs_12'), falls vorhanden
     """
     with open(html_path, "r", encoding="utf-8") as f:
         html = f.read()
     soup = BeautifulSoup(html, "html.parser")
     docs = []
     for p in soup.find_all("p"):
@@ -38,93 +51,119 @@ def _load_hg_paragraph_documents(html_path: str):
         if not text:
             continue
-        pid = p.get("id")
         metadata = {
             "source": "Hochschulgesetz NRW (HTML)",
             "filename": HTML_FILE,
         }
-        if pid:
-            metadata["paragraph_id"] = pid
-        docs.append(Document(page_content=text, metadata=metadata))
-    print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
     return docs
 def load_documents():
-    print("=== START: load_documents() ===\n")
-    # -------------------------
-    # Check files in dataset
-    # -------------------------
-    print(">>> Checking dataset file list from HuggingFace...")
     files = list_repo_files(DATASET, repo_type="dataset")
-    print("Files in dataset:", files, "\n")
-    docs = []
-    # -------------------------
-    # Load PDF
-    # -------------------------
-    print(">>> Step 1: Download PDF from HuggingFace...")
     try:
         pdf_path = hf_hub_download(
             repo_id=DATASET,
             filename=PDF_FILE,
             repo_type="dataset",
         )
-        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
     except Exception as e:
         print("ERROR downloading PDF:", e)
         return []
-    print(">>> Step 1.1: Loading PDF pages...")
     try:
         pdf_docs = PyPDFLoader(pdf_path).load()
-        print(f"Loaded {len(pdf_docs)} PDF pages.\n")
     except Exception as e:
         print("ERROR loading PDF:", e)
         return []
     for d in pdf_docs:
         d.metadata["source"] = "Prüfungsordnung (PDF)"
         d.metadata["filename"] = PDF_FILE
     docs.extend(pdf_docs)
-    # -------------------------
-    # Load HTML (Hochschulgesetz NRW)
-    # -------------------------
-    print(">>> Step 2: Download HTML from HuggingFace...")
     try:
         html_path = hf_hub_download(
             repo_id=DATASET,
             filename=HTML_FILE,
             repo_type="dataset",
         )
-        print(f"Downloaded HTML to local cache:\n{html_path}\n")
     except Exception as e:
         print("ERROR downloading HTML:", e)
-        return docs
-    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
     try:
         html_docs = _load_hg_paragraph_documents(html_path)
     except Exception as e:
-        print("ERROR loading / parsing HTML:", e)
         return docs
     docs.extend(html_docs)
-    print("=== DONE: load_documents() ===\n")
     return docs
-if __name__ == "__main__":
-    print("\n=== Running load_documents.py directly ===\n")
-    docs = load_documents()
-    print(f"\n>>> TOTAL documents loaded: {len(docs)}")
-    if len(docs):
-        print("\nExample metadata from 1st document:")
-        print(docs[0].metadata)

 """
+load_documents.py – Improved Clean Version
+------------------------------------------
+Lädt:
+1) Prüfungsordnung (PDF) seitenweise.
+2) Hochschulgesetz NRW aus generierter HTML-Datei
+   (hg_clean.html oder Hochschulgesetz_NRW.html)
+   und erzeugt pro Absatz (<p>) ein Document.
+Verbesserungen:
+- Keine HTML-Rohartefakte
+- Kein Abbrechen in der Mitte von Sätzen
+- Entfernt doppelte Leerzeichen
+- metadata.paragraph_id wird sauber übernommen
 """
 from huggingface_hub import hf_hub_download, list_repo_files
 DATASET = "Nguyen5/docs"
 PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
+HTML_FILE = "Hochschulgesetz_NRW.html"  # stored inside dataset
+# ================================================================
+# Hilfsfunktion: lädt HG-Absätze sauber & robust
+# ================================================================
 def _load_hg_paragraph_documents(html_path: str):
     """
+    Liest Hochschulgesetz NRW HTML ein und erzeugt pro <p>-Tag ein Document.
+    Verbesserungen:
+    - Entfernt doppelte Leerzeichen -> " ".join(text.split())
+    - Entfernt leere Texte
+    - Übernimmt paragraph_id (id="hg_abs_12" oder id="para_12")
     """
     with open(html_path, "r", encoding="utf-8") as f:
         html = f.read()
     soup = BeautifulSoup(html, "html.parser")
     docs = []
     for p in soup.find_all("p"):
         if not text:
             continue
+        # normalize whitespace
+        text = " ".join(text.split())
+        paragraph_id = p.get("id")
         metadata = {
             "source": "Hochschulgesetz NRW (HTML)",
             "filename": HTML_FILE,
         }
+        if paragraph_id:
+            metadata["paragraph_id"] = paragraph_id
+        docs.append(
+            Document(
+                page_content=text,
+                metadata=metadata
+            )
+        )
+    print(f"[HG] Loaded {len(docs)} paragraph Documents.\n")
     return docs
+# ================================================================
+# Hauptfunktion: lädt PDF + HG-HTML
+# ================================================================
 def load_documents():
+    print("\n=== START: load_documents() ===\n")
+    docs = []
+    # ------------------------------------------------------------
+    # 1) Dateien prüfen
+    # ------------------------------------------------------------
+    print(">>> Checking dataset on HuggingFace ...")
     files = list_repo_files(DATASET, repo_type="dataset")
+    print("Files found:", files, "\n")
+    # ------------------------------------------------------------
+    # 2) PDF laden
+    # ------------------------------------------------------------
+    print(">>> Downloading Prüfungsordnung PDF ...")
     try:
         pdf_path = hf_hub_download(
             repo_id=DATASET,
             filename=PDF_FILE,
             repo_type="dataset",
         )
+        print(f"PDF downloaded:\n{pdf_path}\n")
     except Exception as e:
         print("ERROR downloading PDF:", e)
         return []
+    print(">>> Loading PDF pages ...")
     try:
         pdf_docs = PyPDFLoader(pdf_path).load()
     except Exception as e:
         print("ERROR loading PDF:", e)
         return []
+    print(f"Loaded {len(pdf_docs)} PDF pages.\n")
+    # metadata ergänzen
     for d in pdf_docs:
         d.metadata["source"] = "Prüfungsordnung (PDF)"
         d.metadata["filename"] = PDF_FILE
     docs.extend(pdf_docs)
+    # ------------------------------------------------------------
+    # 3) HTML laden
+    # ------------------------------------------------------------
+    print(">>> Downloading Hochschulgesetz HTML ...")
     try:
         html_path = hf_hub_download(
             repo_id=DATASET,
             filename=HTML_FILE,
             repo_type="dataset",
         )
+        print(f"HTML downloaded:\n{html_path}\n")
     except Exception as e:
         print("ERROR downloading HTML:", e)
+        return docs  # PDF at least loaded
+    print(">>> Parsing HG HTML into paragraphs ...")
     try:
         html_docs = _load_hg_paragraph_documents(html_path)
     except Exception as e:
+        print("ERROR parsing HTML:", e)
         return docs
     docs.extend(html_docs)
+    print(f"=== DONE: load_documents() → total {len(docs)} documents ===\n")
     return docs
+# ================================================================
+# Debug
+# ================================================================
+if __name__ == "__main__":
+    print("\n=== Running load_documents.py ===\n")
+    documents = load_documents()
+    print(f"\n>>> TOTAL documents loaded: {len(documents)}")
+    if len(documents):
+        print("\nExample Document:")
+        print(documents[0].page_content[:300])
+        print("Metadata:", documents[0].metadata)