Nguyen5 committed on
Commit
24753ba
·
1 Parent(s): 93b6370
Files changed (2) hide show
  1. app.py +172 -129
  2. load_documents.py +89 -50
app.py CHANGED
@@ -1,169 +1,212 @@
1
- """
2
- load_documents.py – Improved Clean Version
3
- ------------------------------------------
4
- Lädt:
5
 
6
- 1) Prüfungsordnung (PDF) seitenweise.
7
- 2) Hochschulgesetz NRW aus generierter HTML-Datei
8
- (hg_clean.html oder Hochschulgesetz_NRW.html)
9
- und erzeugt pro Absatz (<p>) ein Document.
10
 
11
- Verbesserungen:
12
- - Keine HTML-Rohartefakte
13
- - Kein Abbrechen in der Mitte von Sätzen
14
- - Entfernt doppelte Leerzeichen
15
- - metadata.paragraph_id wird sauber übernommen
16
- """
17
 
18
- from huggingface_hub import hf_hub_download, list_repo_files
19
- from langchain_community.document_loaders import PyPDFLoader
20
- from langchain_core.documents import Document
21
- from bs4 import BeautifulSoup
22
 
23
- DATASET = "Nguyen5/docs"
24
- PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
25
- HTML_FILE = "Hochschulgesetz_NRW.html" # stored inside dataset
26
 
 
 
27
 
28
- # ================================================================
29
- # Hilfsfunktion: lädt HG-Absätze sauber & robust
30
- # ================================================================
31
 
32
- def _load_hg_paragraph_documents(html_path: str):
33
- """
34
- Liest Hochschulgesetz NRW HTML ein und erzeugt pro <p>-Tag ein Document.
35
 
36
- Verbesserungen:
37
- - Entfernt doppelte Leerzeichen -> " ".join(text.split())
38
- - Entfernt leere Texte
39
- - Übernimmt paragraph_id (id="hg_abs_12" oder id="para_12")
40
- """
41
 
42
- with open(html_path, "r", encoding="utf-8") as f:
43
- html = f.read()
44
 
45
- soup = BeautifulSoup(html, "html.parser")
 
 
46
 
47
- docs = []
 
 
48
 
49
- for p in soup.find_all("p"):
50
- text = p.get_text(" ", strip=True)
51
- if not text:
52
- continue
53
 
54
- # normalize whitespace
55
- text = " ".join(text.split())
 
 
 
 
 
56
 
57
- paragraph_id = p.get("id")
58
 
59
- metadata = {
60
- "source": "Hochschulgesetz NRW (HTML)",
61
- "filename": HTML_FILE,
62
- }
63
 
64
- if paragraph_id:
65
- metadata["paragraph_id"] = paragraph_id
66
 
67
- docs.append(
68
- Document(
69
- page_content=text,
70
- metadata=metadata
71
- )
72
- )
 
 
 
 
 
 
 
 
73
 
74
- print(f"[HG] Loaded {len(docs)} paragraph Documents.\n")
75
- return docs
 
 
 
76
 
 
77
 
78
- # ================================================================
79
- # Hauptfunktion: lädt PDF + HG-HTML
80
- # ================================================================
 
81
 
82
- def load_documents():
83
- print("\n=== START: load_documents() ===\n")
84
 
85
- docs = []
 
 
86
 
87
- # ------------------------------------------------------------
88
- # 1) Dateien prüfen
89
- # ------------------------------------------------------------
90
- print(">>> Checking dataset on HuggingFace ...")
91
- files = list_repo_files(DATASET, repo_type="dataset")
92
- print("Files found:", files, "\n")
93
 
94
- # ------------------------------------------------------------
95
- # 2) PDF laden
96
- # ------------------------------------------------------------
97
- print(">>> Downloading Prüfungsordnung PDF ...")
98
 
99
- try:
100
- pdf_path = hf_hub_download(
101
- repo_id=DATASET,
102
- filename=PDF_FILE,
103
- repo_type="dataset",
104
- )
105
- print(f"PDF downloaded:\n{pdf_path}\n")
106
- except Exception as e:
107
- print("ERROR downloading PDF:", e)
108
- return []
109
 
110
- print(">>> Loading PDF pages ...")
 
111
 
112
- try:
113
- pdf_docs = PyPDFLoader(pdf_path).load()
114
- except Exception as e:
115
- print("ERROR loading PDF:", e)
116
- return []
117
 
118
- print(f"Loaded {len(pdf_docs)} PDF pages.\n")
119
 
120
- # metadata ergänzen
121
- for d in pdf_docs:
122
- d.metadata["source"] = "Prüfungsordnung (PDF)"
123
- d.metadata["filename"] = PDF_FILE
124
 
125
- docs.extend(pdf_docs)
 
 
126
 
127
- # ------------------------------------------------------------
128
- # 3) HTML laden
129
- # ------------------------------------------------------------
130
- print(">>> Downloading Hochschulgesetz HTML ...")
131
 
132
- try:
133
- html_path = hf_hub_download(
134
- repo_id=DATASET,
135
- filename=HTML_FILE,
136
- repo_type="dataset",
137
- )
138
- print(f"HTML downloaded:\n{html_path}\n")
139
- except Exception as e:
140
- print("ERROR downloading HTML:", e)
141
- return docs # PDF at least loaded
142
 
143
- print(">>> Parsing HG HTML into paragraphs ...")
 
 
144
 
145
- try:
146
- html_docs = _load_hg_paragraph_documents(html_path)
147
- except Exception as e:
148
- print("ERROR parsing HTML:", e)
149
- return docs
 
 
150
 
151
- docs.extend(html_docs)
 
 
152
 
153
- print(f"=== DONE: load_documents() → total {len(docs)} documents ===\n")
154
- return docs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
- # ================================================================
158
- # Debug
159
- # ================================================================
160
 
161
  if __name__ == "__main__":
162
- print("\n=== Running load_documents.py ===\n")
163
- documents = load_documents()
164
- print(f"\n>>> TOTAL documents loaded: {len(documents)}")
165
-
166
- if len(documents):
167
- print("\nExample Document:")
168
- print(documents[0].page_content[:300])
169
- print("Metadata:", documents[0].metadata)
 
1
+
2
+ # app.py – Prüfungsrechts-Chatbot (RAG + Sprachmodus)
3
+ # Version 26.11 – ohne Modi, stabil für Text + Voice
 
4
 
5
+ import gradio as gr
6
+ from gradio_pdf import PDF
7
+ from huggingface_hub import hf_hub_download
 
8
 
9
+ from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
10
+ from split_documents import split_documents
11
+ from vectorstore import build_vectorstore
12
+ from retriever import get_retriever
13
+ from llm import load_llm
14
+ from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
15
 
16
+ from speech_io import transcribe_audio, synthesize_speech
 
 
 
17
 
18
+ # =====================================================
19
+ # INITIALISIERUNG (global)
20
+ # =====================================================
21
 
22
+ print("🔹 Lade Dokumente ...")
23
+ _docs = load_documents()
24
 
25
+ print("🔹 Splitte Dokumente ...")
26
+ _chunks = split_documents(_docs)
 
27
 
28
+ print("🔹 Baue VectorStore (FAISS) ...")
29
+ _vs = build_vectorstore(_chunks)
 
30
 
31
+ print("🔹 Erzeuge Retriever ...")
32
+ _retriever = get_retriever(_vs)
 
 
 
33
 
34
+ print("🔹 Lade LLM ...")
35
+ _llm = load_llm()
36
 
37
+ print("🔹 Lade Dateien für Viewer …")
38
+ _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
39
+ _html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
40
 
41
+ # =====================================================
42
+ # Quellen formatieren – Markdown für Chat
43
+ # =====================================================
44
 
45
def format_sources_markdown(sources):
    """Render the sources used for an answer as a Markdown block for the chat.

    Args:
        sources: Iterable of dicts describing the document passages that were
            used. The keys read are ``id``, ``source``, ``page``, ``url`` and
            ``snippet``; a missing or empty key is simply left out of the
            rendered entry instead of raising ``KeyError``.

    Returns:
        A Markdown string consisting of a leading line break, a bold heading
        and one bullet per source (optionally followed by a quoted snippet),
        or ``""`` when ``sources`` is empty or ``None``.
    """
    if not sources:
        return ""

    # The leading empty entry makes the joined block start with a newline so
    # it can be appended directly to the answer text.
    lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
    for s in sources:
        sid = s.get("id", "")
        src = s.get("source") or ""
        page = s.get("page")
        url = s.get("url")
        snippet = s.get("snippet")

        title = f"Quelle {sid} – {src}"
        base = f"- [{title}]({url})" if url else f"- {title}"

        # Page numbers are only shown for the PDF examination regulations.
        if page and "Prüfungsordnung" in src:
            base += f", Seite {page}"

        lines.append(base)

        if snippet:
            # Collapse line breaks/runs of whitespace: a raw newline inside
            # the snippet would end the Markdown blockquote prematurely.
            flat_snippet = " ".join(str(snippet).split())
            if flat_snippet:
                lines.append(f" > {flat_snippet}")

    return "\n".join(lines)
73
+
74
+ # =====================================================
75
+ # TEXT CHATBOT
76
+ # =====================================================
77
+
78
def chatbot_text(user_message, history):
    """Answer a typed question and append the exchange to the chat history.

    Args:
        user_message: Text entered by the user. Empty or whitespace-only
            input is ignored.
        history: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts; ``None`` is treated as an empty history.

    Returns:
        A tuple ``(history, "")``: the (possibly extended) history and an
        empty string that clears the input textbox.
    """
    # Work on a copy so the caller's list is never mutated and a ``None``
    # history does not break the concatenation below.
    history = list(history or [])

    if not user_message or not user_message.strip():
        return history, ""

    answer_text, sources = answer(
        question=user_message,
        retriever=_retriever,
        chat_model=_llm,
    )

    quellen_block = format_sources_markdown(sources)

    history = history + [
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": answer_text + quellen_block},
    ]

    return history, ""
 
96
 
97
+ # =====================================================
98
+ # VOICE CHATBOT
99
+ # =====================================================
100
 
101
def chatbot_voice(audio_path, history):
    """Answer a spoken question: speech -> text -> RAG answer -> speech.

    Args:
        audio_path: File path of the recorded audio, or ``None`` when nothing
            was recorded.
        history: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts; ``None`` is treated as an empty history.

    Returns:
        A tuple ``(history, audio, "")``: the (possibly extended) history, the
        result of ``synthesize_speech`` for the answer (``None`` when there
        was nothing to answer) and an empty string for the textbox.
    """
    # Copy so the caller's list is untouched and ``None`` is tolerated.
    history = list(history or [])

    # Button pressed without a recording: nothing to transcribe.
    if not audio_path:
        return history, None, ""

    # 1. Speech -> text
    text = transcribe_audio(audio_path)
    if not text:
        return history, None, ""

    # Store the transcribed question in the chat history.
    history = history + [{"role": "user", "content": text}]

    # 2. RAG answer
    answer_text, sources = answer(
        question=text,
        retriever=_retriever,
        chat_model=_llm,
    )
    quellen_block = format_sources_markdown(sources)

    bot_msg = answer_text + quellen_block
    history = history + [{"role": "assistant", "content": bot_msg}]

    # 3. Text -> speech
    audio = synthesize_speech(bot_msg)

    return history, audio, ""
125
 
126
+ # =====================================================
127
+ # LAST ANSWER → TTS
128
+ # =====================================================
 
129
 
130
def read_last_answer(history):
    """Synthesize speech for the most recent assistant message.

    Args:
        history: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts; may be empty or ``None``.

    Returns:
        The result of ``synthesize_speech`` for the latest assistant message,
        or ``None`` when there is no assistant message or it has no content.
    """
    for msg in reversed(history or []):
        # Skip malformed entries instead of failing with KeyError/TypeError.
        if isinstance(msg, dict) and msg.get("role") == "assistant":
            content = msg.get("content")
            # Nothing to read aloud for an empty answer.
            return synthesize_speech(content) if content else None

    return None
 
 
 
 
 
 
 
 
 
139
 
140
+ # =====================================================
141
+ # UI – GRADIO
142
+ # =====================================================
143
 
144
+ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
145
+ gr.Markdown("# 🧑‍⚖️ Prüfungsrechts-Chatbot")
146
+ gr.Markdown(
147
+ "Dieser Chatbot beantwortet Fragen **ausschließlich** aus der "
148
+ "Prüfungsordnung (PDF) und dem Hochschulgesetz NRW (Website). "
149
+ "Du kannst Text eingeben oder direkt ins Mikrofon sprechen."
150
+ )
151
 
152
+ with gr.Row():
153
+ with gr.Column(scale=2):
154
+ chatbot = gr.Chatbot(label="Chat", height=500)
155
 
156
+ msg = gr.Textbox(
157
+ label="Frage eingeben",
158
+ placeholder="Stelle deine Frage zum Prüfungsrecht …",
159
+ )
160
+
161
+ # TEXT SENDEN
162
+ msg.submit(
163
+ chatbot_text,
164
+ [msg, chatbot],
165
+ [chatbot, msg]
166
+ )
167
+
168
+ send_btn = gr.Button("Senden (Text)")
169
+ send_btn.click(
170
+ chatbot_text,
171
+ [msg, chatbot],
172
+ [chatbot, msg]
173
+ )
174
 
175
+ # SPRACHEINGABE
176
+ gr.Markdown("### 🎙️ Spracheingabe")
177
+ voice_in = gr.Audio(sources=["microphone"], type="filepath")
178
+ voice_out = gr.Audio(label="Vorgelesene Antwort", type="numpy")
179
+
180
+ voice_btn = gr.Button("Sprechen & senden")
181
+ voice_btn.click(
182
+ chatbot_voice,
183
+ [voice_in, chatbot],
184
+ [chatbot, voice_out, msg]
185
+ )
186
+
187
+ read_btn = gr.Button("🔁 Antwort erneut vorlesen")
188
+ read_btn.click(
189
+ read_last_answer,
190
+ [chatbot],
191
+ [voice_out]
192
+ )
193
+
194
+ clear_btn = gr.Button("Chat zurücksetzen")
195
+ clear_btn.click(lambda: [], None, chatbot)
196
+
197
+ # =====================
198
+ # RECHTE SPALTE: Viewer
199
+ # =====================
200
+
201
+ with gr.Column(scale=1):
202
+ gr.Markdown("### 📄 Prüfungsordnung (PDF)")
203
+ PDF(_pdf_path, height=350)
204
+
205
+ gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
206
+ gr.HTML(
207
+ f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
208
+ )
209
 
 
 
 
210
 
211
  if __name__ == "__main__":
212
+ demo.queue().launch(ssr_mode=False, show_error=True)
 
 
 
 
 
 
 
load_documents.py CHANGED
@@ -1,11 +1,18 @@
1
  """
2
- BƯỚC 1: LOAD DOCUMENTS
3
- -----------------------
4
- Debug-full version
5
-
6
- - Lädt Prüfungsordnung (PDF) seitenweise.
7
- - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
8
- und zerlegt es in einzelne Absätze (Document pro <p>).
 
 
 
 
 
 
 
9
  """
10
 
11
  from huggingface_hub import hf_hub_download, list_repo_files
@@ -15,22 +22,28 @@ from bs4 import BeautifulSoup
15
 
16
  DATASET = "Nguyen5/docs"
17
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
18
- HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py
 
 
 
 
 
19
 
20
  def _load_hg_paragraph_documents(html_path: str):
21
  """
22
- Liest das generierte Hochschulgesetz-HTML ein und erzeugt
23
- pro <p>-Element einen LangChain-Document mit:
24
- - page_content = Text des Absatzes
25
- - metadata:
26
- source = "Hochschulgesetz NRW (HTML)"
27
- filename = HTML_FILE
28
- paragraph_id = id-Attribut (z.B. 'hg_abs_12'), falls vorhanden
29
  """
 
30
  with open(html_path, "r", encoding="utf-8") as f:
31
  html = f.read()
32
 
33
  soup = BeautifulSoup(html, "html.parser")
 
34
  docs = []
35
 
36
  for p in soup.find_all("p"):
@@ -38,93 +51,119 @@ def _load_hg_paragraph_documents(html_path: str):
38
  if not text:
39
  continue
40
 
41
- pid = p.get("id")
 
 
 
42
 
43
  metadata = {
44
  "source": "Hochschulgesetz NRW (HTML)",
45
  "filename": HTML_FILE,
46
  }
47
- if pid:
48
- metadata["paragraph_id"] = pid
49
 
50
- docs.append(Document(page_content=text, metadata=metadata))
 
 
 
 
 
 
 
 
51
 
52
- print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
53
  return docs
54
 
 
 
 
 
 
55
  def load_documents():
56
- print("=== START: load_documents() ===\n")
57
 
58
- # -------------------------
59
- # Check files in dataset
60
- # -------------------------
61
- print(">>> Checking dataset file list from HuggingFace...")
 
 
62
  files = list_repo_files(DATASET, repo_type="dataset")
63
- print("Files in dataset:", files, "\n")
64
 
65
- docs = []
 
 
 
66
 
67
- # -------------------------
68
- # Load PDF
69
- # -------------------------
70
- print(">>> Step 1: Download PDF from HuggingFace...")
71
  try:
72
  pdf_path = hf_hub_download(
73
  repo_id=DATASET,
74
  filename=PDF_FILE,
75
  repo_type="dataset",
76
  )
77
- print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
78
  except Exception as e:
79
  print("ERROR downloading PDF:", e)
80
  return []
81
 
82
- print(">>> Step 1.1: Loading PDF pages...")
 
83
  try:
84
  pdf_docs = PyPDFLoader(pdf_path).load()
85
- print(f"Loaded {len(pdf_docs)} PDF pages.\n")
86
  except Exception as e:
87
  print("ERROR loading PDF:", e)
88
  return []
89
 
 
 
 
90
  for d in pdf_docs:
91
  d.metadata["source"] = "Prüfungsordnung (PDF)"
92
  d.metadata["filename"] = PDF_FILE
93
 
94
  docs.extend(pdf_docs)
95
 
96
- # -------------------------
97
- # Load HTML (Hochschulgesetz NRW)
98
- # -------------------------
99
- print(">>> Step 2: Download HTML from HuggingFace...")
 
100
  try:
101
  html_path = hf_hub_download(
102
  repo_id=DATASET,
103
  filename=HTML_FILE,
104
  repo_type="dataset",
105
  )
106
- print(f"Downloaded HTML to local cache:\n{html_path}\n")
107
  except Exception as e:
108
  print("ERROR downloading HTML:", e)
109
- return docs
 
 
110
 
111
- print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
112
  try:
113
  html_docs = _load_hg_paragraph_documents(html_path)
114
  except Exception as e:
115
- print("ERROR loading / parsing HTML:", e)
116
  return docs
117
 
118
  docs.extend(html_docs)
119
 
120
- print("=== DONE: load_documents() ===\n")
121
  return docs
122
 
123
- if __name__ == "__main__":
124
- print("\n=== Running load_documents.py directly ===\n")
125
- docs = load_documents()
126
- print(f"\n>>> TOTAL documents loaded: {len(docs)}")
127
 
128
- if len(docs):
129
- print("\nExample metadata from 1st document:")
130
- print(docs[0].metadata)
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ load_documents.py Improved Clean Version
3
+ ------------------------------------------
4
+ Lädt:
5
+
6
+ 1) Prüfungsordnung (PDF) seitenweise.
7
+ 2) Hochschulgesetz NRW aus generierter HTML-Datei
8
+ (hg_clean.html oder Hochschulgesetz_NRW.html)
9
+ und erzeugt pro Absatz (<p>) ein Document.
10
+
11
+ Verbesserungen:
12
+ - Keine HTML-Rohartefakte
13
+ - Kein Abbrechen in der Mitte von Sätzen
14
+ - Entfernt doppelte Leerzeichen
15
+ - metadata.paragraph_id wird sauber übernommen
16
  """
17
 
18
  from huggingface_hub import hf_hub_download, list_repo_files
 
22
 
23
  DATASET = "Nguyen5/docs"
24
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
25
+ HTML_FILE = "Hochschulgesetz_NRW.html" # stored inside dataset
26
+
27
+
28
+ # ================================================================
29
+ # Hilfsfunktion: lädt HG-Absätze sauber & robust
30
+ # ================================================================
31
 
32
  def _load_hg_paragraph_documents(html_path: str):
33
  """
34
+ Liest Hochschulgesetz NRW HTML ein und erzeugt pro <p>-Tag ein Document.
35
+
36
+ Verbesserungen:
37
+ - Entfernt doppelte Leerzeichen -> " ".join(text.split())
38
+ - Entfernt leere Texte
39
+ - Übernimmt paragraph_id (id="hg_abs_12" oder id="para_12")
 
40
  """
41
+
42
  with open(html_path, "r", encoding="utf-8") as f:
43
  html = f.read()
44
 
45
  soup = BeautifulSoup(html, "html.parser")
46
+
47
  docs = []
48
 
49
  for p in soup.find_all("p"):
 
51
  if not text:
52
  continue
53
 
54
+ # normalize whitespace
55
+ text = " ".join(text.split())
56
+
57
+ paragraph_id = p.get("id")
58
 
59
  metadata = {
60
  "source": "Hochschulgesetz NRW (HTML)",
61
  "filename": HTML_FILE,
62
  }
 
 
63
 
64
+ if paragraph_id:
65
+ metadata["paragraph_id"] = paragraph_id
66
+
67
+ docs.append(
68
+ Document(
69
+ page_content=text,
70
+ metadata=metadata
71
+ )
72
+ )
73
 
74
+ print(f"[HG] Loaded {len(docs)} paragraph Documents.\n")
75
  return docs
76
 
77
+
78
+ # ================================================================
79
+ # Hauptfunktion: lädt PDF + HG-HTML
80
+ # ================================================================
81
+
82
  def load_documents():
83
+ print("\n=== START: load_documents() ===\n")
84
 
85
+ docs = []
86
+
87
+ # ------------------------------------------------------------
88
+ # 1) Dateien prüfen
89
+ # ------------------------------------------------------------
90
+ print(">>> Checking dataset on HuggingFace ...")
91
  files = list_repo_files(DATASET, repo_type="dataset")
92
+ print("Files found:", files, "\n")
93
 
94
+ # ------------------------------------------------------------
95
+ # 2) PDF laden
96
+ # ------------------------------------------------------------
97
+ print(">>> Downloading Prüfungsordnung PDF ...")
98
 
 
 
 
 
99
  try:
100
  pdf_path = hf_hub_download(
101
  repo_id=DATASET,
102
  filename=PDF_FILE,
103
  repo_type="dataset",
104
  )
105
+ print(f"PDF downloaded:\n{pdf_path}\n")
106
  except Exception as e:
107
  print("ERROR downloading PDF:", e)
108
  return []
109
 
110
+ print(">>> Loading PDF pages ...")
111
+
112
  try:
113
  pdf_docs = PyPDFLoader(pdf_path).load()
 
114
  except Exception as e:
115
  print("ERROR loading PDF:", e)
116
  return []
117
 
118
+ print(f"Loaded {len(pdf_docs)} PDF pages.\n")
119
+
120
+ # metadata ergänzen
121
  for d in pdf_docs:
122
  d.metadata["source"] = "Prüfungsordnung (PDF)"
123
  d.metadata["filename"] = PDF_FILE
124
 
125
  docs.extend(pdf_docs)
126
 
127
+ # ------------------------------------------------------------
128
+ # 3) HTML laden
129
+ # ------------------------------------------------------------
130
+ print(">>> Downloading Hochschulgesetz HTML ...")
131
+
132
  try:
133
  html_path = hf_hub_download(
134
  repo_id=DATASET,
135
  filename=HTML_FILE,
136
  repo_type="dataset",
137
  )
138
+ print(f"HTML downloaded:\n{html_path}\n")
139
  except Exception as e:
140
  print("ERROR downloading HTML:", e)
141
+ return docs # PDF at least loaded
142
+
143
+ print(">>> Parsing HG HTML into paragraphs ...")
144
 
 
145
  try:
146
  html_docs = _load_hg_paragraph_documents(html_path)
147
  except Exception as e:
148
+ print("ERROR parsing HTML:", e)
149
  return docs
150
 
151
  docs.extend(html_docs)
152
 
153
+ print(f"=== DONE: load_documents() → total {len(docs)} documents ===\n")
154
  return docs
155
 
 
 
 
 
156
 
157
+ # ================================================================
158
+ # Debug
159
+ # ================================================================
160
+
161
+ if __name__ == "__main__":
162
+ print("\n=== Running load_documents.py ===\n")
163
+ documents = load_documents()
164
+ print(f"\n>>> TOTAL documents loaded: {len(documents)}")
165
+
166
+ if len(documents):
167
+ print("\nExample Document:")
168
+ print(documents[0].page_content[:300])
169
+ print("Metadata:", documents[0].metadata)