File size: 6,043 Bytes
3a9ed51
 
 
6548bf5
80c3670
3a9ed51
f2e421a
3a9ed51
c411e11
 
 
 
3a9ed51
3dd5086
3a9ed51
fcc2090
 
c411e11
fcc2090
c411e11
3a9ed51
 
c411e11
3a9ed51
 
c411e11
3a9ed51
 
c411e11
3a9ed51
 
c411e11
3a9ed51
 
c411e11
3a9ed51
 
 
c411e11
 
3a9ed51
c411e11
ed0df67
3a9ed51
 
c411e11
f2e421a
3a9ed51
 
 
 
 
 
 
f2e421a
3a9ed51
b8e573b
3a9ed51
 
 
 
7c86ca3
3a9ed51
 
7c86ca3
3a9ed51
6548bf5
3a9ed51
 
 
 
c411e11
 
3a9ed51
c411e11
3a9ed51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c411e11
 
3a9ed51
c411e11
3a9ed51
80c3670
3a9ed51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80c3670
3a9ed51
 
ed0df67
c411e11
3a9ed51
c411e11
3a9ed51
80c3670
f2e421a
c411e11
3a9ed51
c411e11
3a9ed51
80c3670
3a9ed51
c411e11
3dd5086
6bb0f73
3a9ed51
6bb0f73
3a9ed51
 
 
 
 
 
 
 
 
 
 
533ef4b
3a9ed51
 
 
 
c411e11
3a9ed51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80c3670
3a9ed51
 
 
 
 
 
80c3670
3a9ed51
 
 
 
 
 
 
 
 
 
 
 
80c3670
3a9ed51
 
 
 
c411e11
11e64e1
6548bf5
80c3670
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
# Version 26.11 – ohne Modi, stabil für Text + Voice

import gradio as gr
from gradio_pdf import PDF
from huggingface_hub import hf_hub_download

from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
from split_documents import split_documents
from vectorstore import build_vectorstore
from retriever import get_retriever
from llm import load_llm
from rag_pipeline import answer, PDF_BASE_URL, LAW_URL

from speech_io import transcribe_audio, synthesize_speech

# =====================================================
# INITIALISIERUNG (global)
# =====================================================

# Build the whole RAG stack once at import time. The steps depend on each
# other in exactly this order: documents -> chunks -> vector store ->
# retriever. The chat handlers further down reuse `_retriever` and `_llm`
# for every request instead of rebuilding them.
print("🔹 Lade Dokumente ...")
_docs = load_documents()

print("🔹 Splitte Dokumente ...")
_chunks = split_documents(_docs)

print("🔹 Baue VectorStore (FAISS) ...")
_vs = build_vectorstore(_chunks)

print("🔹 Erzeuge Retriever ...")
_retriever = get_retriever(_vs)

print("🔹 Lade LLM ...")
_llm = load_llm()

# Fetch the viewer files from the Hugging Face dataset repo; each call
# returns a path to the downloaded file.
# NOTE(review): `_html_path` is not referenced anywhere else in the visible
# code (the UI embeds LAW_URL in an iframe instead) — confirm it is still
# needed.
print("🔹 Lade Dateien für Viewer …")
_pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
_html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")

# =====================================================
# Quellen formatieren – Markdown für Chat
# =====================================================

def format_sources_markdown(sources):
    """Render the cited sources as a Markdown block for the chat answer.

    Args:
        sources: Iterable of dicts describing the cited passages, or a falsy
            value when nothing was cited. The keys read here are ``id``,
            ``source``, ``page``, ``url`` and ``snippet``; a missing or
            empty key simply omits the corresponding part of the output.

    Returns:
        An empty string when there are no sources. Otherwise a string that
        starts with a newline (so it can be appended directly to the answer
        text), followed by a heading and one bullet per source. A bullet is
        a Markdown link when a URL is present, gets a ``, Seite N`` suffix
        for sources whose name contains "Prüfungsordnung", and is followed
        by an indented block quote when a snippet is present.
    """
    if not sources:
        return ""

    lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
    for s in sources:
        # Use .get so one incomplete source entry cannot crash the answer.
        sid = s.get("id")
        src = str(s.get("source") or "")
        page = s.get("page")
        url = s.get("url")
        snippet = s.get("snippet")

        # Keep the id and the source name visually separated; concatenating
        # them directly produced titles like "Quelle 1Prüfungsordnung".
        label = "Quelle" if sid is None else f"Quelle {sid}"
        title = f"{label} – {src}" if src else label

        if url:
            base = f"- [{title}]({url})"
        else:
            base = f"- {title}"

        # Page numbers are only shown for the examination regulations.
        if page and "Prüfungsordnung" in src:
            base += f", Seite {page}"

        lines.append(base)

        # Collapse newlines and repeated whitespace so the snippet stays
        # inside a single block-quote line under its bullet.
        quote = " ".join(str(snippet).split()) if snippet else ""
        if quote:
            lines.append(f"  > {quote}")

    return "\n".join(lines)

# =====================================================
# TEXT CHATBOT
# =====================================================

def chatbot_text(user_message, history):
    """Answer a typed question via the RAG pipeline and extend the chat.

    Args:
        user_message: Text from the input box; may be empty or None.
        history: Current chat history as a list of ``{"role", "content"}``
            dicts, or None when no chat exists yet.

    Returns:
        A ``(history, "")`` tuple: the (possibly extended) history for the
        chatbot component and an empty string that clears the text box.
    """
    # A missing history must not break the list concatenation below.
    history = history or []

    # Whitespace-only input is treated like empty input so it does not
    # trigger a retrieval + LLM call for a blank question.
    question = (user_message or "").strip()
    if not question:
        return history, ""

    answer_text, sources = answer(
        question=question,
        retriever=_retriever,
        chat_model=_llm,
    )

    quellen_block = format_sources_markdown(sources)

    # Build a new list instead of mutating the history that was passed in.
    history = history + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer_text + quellen_block},
    ]

    return history, ""

# =====================================================
# VOICE CHATBOT
# =====================================================

def chatbot_voice(audio_path, history):
    """Answer a spoken question: speech -> text -> RAG answer -> speech.

    Args:
        audio_path: Path of the recorded audio file, or None/empty when
            nothing was recorded.
        history: Current chat history as a list of ``{"role", "content"}``
            dicts, or None when no chat exists yet.

    Returns:
        A ``(history, audio, "")`` tuple: the (possibly extended) history,
        the result of ``synthesize_speech`` for the answer (None when there
        was nothing to answer), and an empty string for the text box.
    """
    # A missing history must not break the list concatenation below.
    history = history or []

    # Nothing recorded (button pressed without audio): skip transcription.
    if not audio_path:
        return history, None, ""

    # 1. Speech -> text
    text = transcribe_audio(audio_path)
    if not text:
        return history, None, ""

    # Store the transcribed question in the chat history.
    history = history + [{"role": "user", "content": text}]

    # 2. RAG answer
    answer_text, sources = answer(
        question=text,
        retriever=_retriever,
        chat_model=_llm,
    )
    quellen_block = format_sources_markdown(sources)

    bot_msg = answer_text + quellen_block
    history = history + [{"role": "assistant", "content": bot_msg}]

    # 3. Text -> speech
    # NOTE(review): the spoken text includes the Markdown sources block
    # (links, snippets) — confirm that reading it aloud is intended.
    audio = synthesize_speech(bot_msg)

    return history, audio, ""

# =====================================================
# LAST ANSWER → TTS
# =====================================================

def read_last_answer(history):
    """Synthesize speech for the most recent assistant message.

    Args:
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            may be empty or None.

    Returns:
        The result of ``synthesize_speech`` for the newest entry whose role
        is ``"assistant"``, or None when the history is empty or contains
        no assistant message.
    """
    # Walk the history backwards and stop at the first assistant entry.
    latest = next(
        (entry for entry in reversed(history or []) if entry["role"] == "assistant"),
        None,
    )
    if latest is None:
        return None
    return synthesize_speech(latest["content"])

# =====================================================
# UI – GRADIO
# =====================================================

# Declarative Gradio layout: components appear in the page in the order they
# are created here, so the statement order below is part of the UI design.
with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
    gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
    gr.Markdown(
        "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
        "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
        "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
    )

    with gr.Row():
        # Left column (wider): chat, text input and voice controls.
        with gr.Column(scale=2):
            # NOTE(review): the handlers in this file return history entries
            # as {"role", "content"} dicts — confirm the installed Gradio
            # version accepts that format for a Chatbot created without an
            # explicit `type` argument.
            chatbot = gr.Chatbot(label="Chat", height=500)

            msg = gr.Textbox(
                label="Frage eingeben",
                placeholder="Stelle deine Frage zum Prüfungsrecht …",
            )

            # Send text: pressing Enter in the text box and clicking the
            # button both run chatbot_text, which returns the new history
            # and an empty string that clears the text box.
            msg.submit(
                chatbot_text,
                [msg, chatbot],
                [chatbot, msg]
            )

            send_btn = gr.Button("Senden (Text)")
            send_btn.click(
                chatbot_text,
                [msg, chatbot],
                [chatbot, msg]
            )

            # Voice input: the microphone recording is handed to the handler
            # as a file path; voice_out plays the synthesized answer.
            gr.Markdown("### 🎙️ Spracheingabe")
            voice_in = gr.Audio(sources=["microphone"], type="filepath")
            voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")

            # chatbot_voice returns (history, audio, ""), mapped onto the
            # chat, the audio player and the text box respectively.
            voice_btn = gr.Button("Sprechen & senden")
            voice_btn.click(
                chatbot_voice,
                [voice_in, chatbot],
                [chatbot, voice_out, msg]
            )

            # Re-read the most recent assistant message from the history.
            read_btn = gr.Button("🔁 Antwort erneut vorlesen")
            read_btn.click(
                read_last_answer,
                [chatbot],
                [voice_out]
            )

            # Reset the chat by replacing the history with an empty list
            # (the handler takes no inputs).
            clear_btn = gr.Button("Chat zurücksetzen")
            clear_btn.click(lambda: [], None, chatbot)

        # =====================
        # RIGHT COLUMN: viewer
        # =====================

        with gr.Column(scale=1):
            # PDF viewer for the file downloaded during initialisation.
            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
            PDF(_pdf_path, height=350)

            # The law text is embedded live from LAW_URL via an iframe.
            gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
            gr.HTML(
                f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
            )

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()