Nguyen5 committed on
Commit
1fbd132
·
1 Parent(s): a195b72
Files changed (2) hide show
  1. app.py +81 -55
  2. ingest.py +74 -98
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py – fixed Quelle links
2
  import os
3
  import base64
4
  import gradio as gr
@@ -9,22 +9,29 @@ from rag_pipeline import rag_answer
9
 
10
  client = OpenAI()
11
  BUCKET = os.environ["SUPABASE_BUCKET"]
12
-
13
  SUPABASE_URL = os.environ["SUPABASE_URL"]
14
- PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
15
- HG_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
16
-
17
 
18
- def encode_pdf_src():
19
- pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
20
- return f"data:application/pdf;base64,{base64.b64encode(pdf_bytes).decode('utf-8')}"
21
 
 
 
22
 
23
- def encode_html():
24
- html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
25
- return html_bytes.decode("utf-8", errors="ignore")
26
 
 
 
 
 
 
 
 
27
 
 
 
 
28
  def transcribe(audio_path):
29
  if audio_path is None:
30
  return ""
@@ -32,92 +39,111 @@ def transcribe(audio_path):
32
  result = client.audio.transcriptions.create(
33
  model="whisper-1",
34
  file=f,
35
- language="de",
36
  temperature=0.0
37
  )
38
  return (result.text or "").strip()
39
 
40
-
 
 
41
  def chat_fn(text, audio, history):
42
  text = (text or "").strip()
43
 
 
44
  if text:
45
  question = text
46
- elif audio:
47
  question = transcribe(audio)
48
  else:
49
- return history, "<p>Bitte Text oder Mikrofon benutzen.</p>", None
50
 
51
  if not question:
52
- return history, "<p>Spracherkennung fehlgeschlagen.</p>", None
53
 
 
54
  answer, docs = rag_answer(question, history or [])
55
 
56
- html = "<ol>"
 
57
  for i, d in enumerate(docs):
58
- meta = d["metadata"]
59
- src = meta.get("source")
60
- page = meta.get("page")
61
- anchor = meta.get("anchor_id")
62
-
63
- # PDF vs HTML
64
- if src == "Prüfungsordnung (PDF)":
65
- link = f"{PDF_URL}#page={page+1}" if isinstance(page, int) else PDF_URL
 
 
 
 
 
 
 
 
66
  else:
67
- link = f"{HG_URL}#{anchor}" if anchor else HG_URL
 
 
 
 
 
 
68
 
69
- snippet = d["content"][:200].replace("\n", " ")
 
70
 
71
- html += f"""
72
- <li>
73
- <a href="{link}" target="_blank">
74
- <b>Quelle {i+1}: {src}</b>
75
- </a><br>
76
- {snippet}...
77
- </li>
78
- """
79
- html += "</ol>"
80
 
81
  new_history = (history or []) + [
82
  {"role": "user", "content": question},
83
- {"role": "assistant", "content": answer},
84
  ]
85
 
86
- return new_history, html, gr.update(value=None)
87
-
88
 
 
 
 
89
  with gr.Blocks() as demo:
90
- gr.Markdown("# ⚖️ Prüfungsrechts-Chatbot (RAG mit Supabase)")
91
 
92
  with gr.Row():
93
-
94
  with gr.Column(scale=3):
95
- chatbot = gr.Chatbot()
 
96
  text_input = gr.Textbox(label="Frage eingeben")
97
- audio_input = gr.Audio(
98
- type="filepath", label="Spracheingabe (Mikrofon)"
99
- )
100
  send_btn = gr.Button("Senden")
101
 
 
 
 
 
102
  with gr.Column(scale=2):
103
- gr.Markdown("### Prüfungsordnung (PDF)")
104
  gr.HTML(
105
- f"<iframe src='{encode_pdf_src()}' width='100%' height='260px'></iframe>"
106
  )
107
 
108
- gr.Markdown("### Hochschulgesetz NRW")
109
  gr.HTML(
110
- f"<div style='overflow:auto;height:260px;'>{encode_html()}</div>"
111
  )
112
 
113
- sources_html = gr.HTML()
114
-
115
  send_btn.click(
116
  chat_fn,
117
- [text_input, audio_input, chatbot],
118
- [chatbot, sources_html, audio_input]
119
  )
120
 
121
  if __name__ == "__main__":
122
- demo.launch(ssr_mode=False)
123
-
 
1
+ # app.py
2
  import os
3
  import base64
4
  import gradio as gr
 
9
 
10
  client = OpenAI()
11
  BUCKET = os.environ["SUPABASE_BUCKET"]
 
12
  SUPABASE_URL = os.environ["SUPABASE_URL"]
 
 
 
13
 
14
+ # ------------------------------------------
15
+ # URLs for the Prüfungsordnung (PDF) + HG NRW
16
+ # ------------------------------------------
17
 
18
+ # The PDF is stored in Supabase (as before)
19
+ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
20
 
21
+ # ⚠️ This is the official link to the Hochschulgesetz NRW on recht.nrw.de
22
+ HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
 
23
 
24
# ------------------------------------------
# Inline PDF viewer source (base64 data URI)
# ------------------------------------------
def encode_pdf_src():
    """Return the Prüfungsordnung PDF as a base64 ``data:`` URI.

    The file ``pruefungsordnung.pdf`` is fetched from ``BUCKET`` through
    ``load_file_bytes``; the returned string is used as the ``src`` of an
    ``<iframe>`` in the UI.
    """
    raw = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    encoded = base64.b64encode(raw).decode("utf-8")
    return "data:application/pdf;base64," + encoded
31
 
32
+ # ------------------------------------------
33
+ # Speech-to-text (Whisper)
34
+ # ------------------------------------------
35
  def transcribe(audio_path):
36
  if audio_path is None:
37
  return ""
 
39
  result = client.audio.transcriptions.create(
40
  model="whisper-1",
41
  file=f,
42
+ language="de", # tiếng Đức
43
  temperature=0.0
44
  )
45
  return (result.text or "").strip()
46
 
47
# ------------------------------------------
# Main chat handler
# ------------------------------------------
def _format_sources(docs):
    """Render the retrieved documents as a Markdown "Quellen" block.

    Each entry in ``docs`` is expected to be a dict with a ``metadata`` dict
    (keys ``source``, ``page``, ``anchor_id``) and a ``content`` string, as
    produced by the ingest step. Missing or ``None`` values are tolerated.
    """
    lines = ["", "### 📚 Quellen (verwendete Dokumentstellen):"]
    for number, doc in enumerate(docs or [], start=1):
        meta = doc.get("metadata", {}) or {}
        # ``or "?"`` also covers a stored ``None`` so ``startswith`` cannot fail.
        src = meta.get("source") or "?"
        page = meta.get("page")
        anchor_id = meta.get("anchor_id")

        if src.startswith("Prüfungsordnung"):
            # The ingest step stores the page index ZERO-based, while the
            # ``#page=`` fragment of a PDF URL is one-based, so add 1. Testing
            # the type (not truthiness) keeps the very first page (index 0).
            has_page = (
                isinstance(page, int)
                and not isinstance(page, bool)
                and page >= 0
            )
            if has_page:
                page_num = page + 1
                url = f"{PDF_URL}#page={page_num}"
                title = f"Quelle {number} – {src}, Seite {page_num}"
            else:
                url = PDF_URL
                title = f"Quelle {number} – {src}"
        else:
            # NOTE(review): anchor ids are assigned per chunk by the ingest
            # step (``hg_<n>``); whether the official page exposes matching
            # anchors is not visible here — confirm.
            url = f"{HG_URL}#{anchor_id}" if anchor_id else HG_URL
            title = f"Quelle {number} – Hochschulgesetz NRW"

        snippet = (doc.get("content") or "")[:200].replace("\n", " ")
        lines.append(f"- [{title}]({url})")
        lines.append(f" > {snippet}")

    return "\n".join(lines)


def chat_fn(text, audio, history):
    """Answer one user turn and append it to the chat history.

    Typed text takes priority; the audio recording is transcribed only when
    no text was entered.

    Args:
        text: Content of the text box (may be ``None`` or blank).
        audio: File path of the recording, or ``None``.
        history: Previous chat messages (list of role/content dicts) or
            ``None``.

    Returns:
        A 3-tuple ``(history, markdown, audio_update)``: the updated message
        list for the chatbot, the Markdown of the latest answer including its
        sources, and a value that resets the audio input.
    """
    text = (text or "").strip()

    # 1) Pick the question: text first, audio only as a fallback.
    if text:
        question = text
    elif audio is not None:
        question = transcribe(audio)
    else:
        return history, "", None  # no input at all

    if not question:
        return history, "Spracherkennung fehlgeschlagen.", None

    # 2) Run retrieval-augmented generation.
    answer, docs = rag_answer(question, history or [])

    # 3) Merge answer and sources into a single chatbot message.
    bot_msg = answer + "\n\n" + _format_sources(docs)

    new_history = (history or []) + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": bot_msg},
    ]

    # History for the chatbot, Markdown for the preview, and an audio reset.
    return new_history, bot_msg, gr.update(value=None)
110
 
111
# ------------------------------------------
# User interface
# ------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# ⚖️ Sprachbasierter Chatbot für Prüfungsrecht")

    with gr.Row():
        # Left column: the chat itself.
        with gr.Column(scale=3):
            # type="messages" takes role/content dicts, matching chat_fn's history.
            chatbot = gr.Chatbot(type="messages", label="Chat (RAG)", height=500)
            text_input = gr.Textbox(label="Frage eingeben")
            audio_input = gr.Audio(type="filepath", label="Spracheingabe (Mikrofon)")
            send_btn = gr.Button("Senden")

            # Optional Markdown preview of the most recent answer.
            answer_preview = gr.Markdown("")

        # Right column: document viewers.
        with gr.Column(scale=2):
            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
            pdf_frame = f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
            gr.HTML(pdf_frame)

            gr.Markdown("### 📘 Hochschulgesetz NRW (offizielle Seite)")
            hg_frame = f"<iframe src='{HG_URL}' width='100%' height='250' style='border:none;'></iframe>"
            gr.HTML(hg_frame)

    # Wire the send button to the chat handler.
    send_btn.click(
        fn=chat_fn,
        inputs=[text_input, audio_input, chatbot],
        outputs=[chatbot, answer_preview, audio_input],
    )

if __name__ == "__main__":
    demo.queue().launch(ssr_mode=False)
 
ingest.py CHANGED
@@ -1,128 +1,104 @@
1
- # app.py – Quelle clickable & styled viewer (recht.nrw.de)
2
-
3
  import os
4
- import base64
5
- import gradio as gr
6
- from openai import OpenAI
7
 
8
- from supabase_client import load_file_bytes
9
- from rag_pipeline import rag_answer
 
 
10
 
11
- client = OpenAI()
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
13
-
14
  SUPABASE_URL = os.environ["SUPABASE_URL"]
15
 
16
- # Prüfungsordnung PDF – Supabase public URL
17
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
 
18
 
19
- # Hochschulgesetz NRW – offizielle recht.nrw.de URL (WICHTIG)
20
- HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
21
 
22
-
23
- # ------------------------------------------
24
- # PDF inline viewer (Base64)
25
- # ------------------------------------------
26
- def encode_pdf_src():
27
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
28
- b64 = base64.b64encode(pdf_bytes).decode("utf-8")
29
- return f"data:application/pdf;base64,{b64}"
30
-
31
-
32
- # ------------------------------------------
33
- # Speech-to-text
34
- # ------------------------------------------
35
- def transcribe(audio_path):
36
- if audio_path is None:
37
- return ""
38
- with open(audio_path, "rb") as f:
39
- result = client.audio.transcriptions.create(
40
- model="whisper-1",
41
- file=f,
42
- language="de",
43
- temperature=0.0
44
  )
45
- return (result.text or "").strip()
46
 
47
 
48
- # ------------------------------------------
49
- # Chat logic with inline Quelle links
50
- # ------------------------------------------
51
- def chat_fn(text, audio, history):
52
- text = (text or "").strip()
53
 
54
- if text:
55
- question = text
56
- elif audio:
57
- question = transcribe(audio)
58
- else:
59
- return history, "<p>Bitte Text oder Mikrofon benutzen.</p>", None
60
 
61
- answer, docs = rag_answer(question, history or [])
62
-
63
- # Quellenblock (NOW CLICKABLE IN CHAT)
64
- quellen_md = "### 📚 Quellen\n"
65
- for i, d in enumerate(docs):
66
- meta = d["metadata"]
67
- src = meta.get("source")
68
-
69
- # PDF
70
- if src.startswith("Prüfungsordnung"):
71
- page = meta.get("page", 0) + 1
72
- url = f"{PDF_URL}#page={page}"
73
- title = f"Quelle {i+1}: {src}, Seite {page}"
74
 
75
- # HTML – Rechtsportal NRW (WICHTIG)
76
- else:
77
- anchor = meta.get("anchor_id")
78
- url = f"{HG_URL}#{anchor}"
79
- title = f"Quelle {i+1}: Hochschulgesetz NRW (§)"
80
 
81
- snippet = d["content"][:120].replace("\n", " ")
 
 
 
 
 
82
 
83
- quellen_md += f"- [{title}]({url})\n > {snippet}\n"
84
 
85
- bot_msg = f"{answer}\n\n{quellen_md}"
 
 
86
 
87
- new_history = (history or []) + [
88
- {"role": "user", "content": question},
89
- {"role": "assistant", "content": bot_msg},
90
- ]
91
 
92
- return new_history, bot_msg, gr.update(value=None)
 
93
 
 
 
94
 
95
- # ------------------------------------------
96
- # UI
97
- # ------------------------------------------
98
- with gr.Blocks() as demo:
99
- gr.Markdown("# ⚖️ Prüfungsrechts-Chatbot (Quelle-clickable)")
 
100
 
101
- with gr.Row():
 
 
102
 
103
- with gr.Column(scale=2):
104
- chatbot = gr.Chatbot(type="messages", height=550)
105
- text_input = gr.Textbox(label="Frage")
106
- audio_input = gr.Audio(type="filepath", label="Mikrofon")
107
- send_btn = gr.Button("Senden")
108
- answer_preview = gr.Markdown("")
109
 
110
- send_btn.click(
111
- chat_fn,
112
- inputs=[text_input, audio_input, chatbot],
113
- outputs=[chatbot, answer_preview, audio_input]
114
- )
 
 
115
 
116
- with gr.Column(scale=1):
117
- gr.Markdown("### Prüfungsordnung (PDF)")
118
- gr.HTML(
119
- f"<iframe src='{encode_pdf_src()}' width='100%' height='330px' style='border:none'></iframe>"
120
- )
121
 
122
- gr.Markdown("### Hochschulgesetz NRW (offizielle Seite)")
123
- gr.HTML(
124
- f"<iframe src='{HG_URL}' width='100%' height='330px' style='border:none'></iframe>"
125
- )
126
 
127
  if __name__ == "__main__":
128
- demo.launch(ssr_mode=False)
 
1
+ # ingest.py
 
2
  import os
3
+ from io import BytesIO
4
+ from bs4 import BeautifulSoup
5
+ from pypdf import PdfReader
6
 
7
+ from supabase_client import supabase, load_file_bytes
8
+ from langchain_openai import OpenAIEmbeddings
9
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+ from langchain_core.documents import Document
11
 
 
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
 
13
  SUPABASE_URL = os.environ["SUPABASE_URL"]
14
 
 
15
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
16
+ HG_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
17
 
 
 
18
 
19
def load_pdf_docs():
    """Load the Prüfungsordnung PDF and return one ``Document`` per page.

    Each document carries the extracted page text (empty string when the
    extractor returns nothing) and metadata with the source label, the
    zero-based page index and the public base URL of the PDF.
    """
    reader = PdfReader(BytesIO(load_file_bytes(BUCKET, "pruefungsordnung.pdf")))
    return [
        Document(
            page_content=pdf_page.extract_text() or "",
            metadata={
                "source": "Prüfungsordnung (PDF)",
                "page": page_index,  # zero-based: printed page = page_index + 1
                "pdf_url": PDF_URL,  # base URL
            },
        )
        for page_index, pdf_page in enumerate(reader.pages)
    ]
38
 
39
 
40
def load_html_docs():
    """Load the Hochschulgesetz HTML file and return it as a single ``Document``.

    The markup is decoded as UTF-8 (undecodable bytes are ignored) and reduced
    to plain text with newline separators.
    """
    raw = load_file_bytes(BUCKET, "hochschulgesetz.html")
    markup = raw.decode("utf-8", errors="ignore")
    plain_text = BeautifulSoup(markup, "html.parser").get_text(separator="\n")

    # The HTML is not split into paragraphs here; chunking happens later,
    # and anchor_id is only assigned once the chunks exist.
    law_doc = Document(
        page_content=plain_text,
        metadata={"source": "Hochschulgesetz NRW"},
    )
    return [law_doc]
 
 
 
57
 
 
 
 
 
 
58
 
59
def chunk_docs(docs):
    """Split ``docs`` into chunks of at most 900 characters with 100 overlap."""
    return RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=100,
    ).split_documents(docs)
65
 
 
66
 
67
def ingest():
    """Chunk both source documents, embed them and store them in Supabase.

    Steps:
      1. Load the PDF pages and the HTML text and split them into chunks.
      2. Give every chunk a running ``anchor_id`` (``po_<n>`` for the
         Prüfungsordnung, ``hg_<n>`` for everything else); Hochschulgesetz
         chunks additionally get a full ``url`` including that anchor.
      3. Embed all chunk texts in one batched call and insert the rows into
         the ``documents`` table in batches instead of one request per chunk.

    NOTE(review): rows are only inserted, never replaced — running this twice
    presumably duplicates the table content; confirm against the table setup.
    """
    chunks = chunk_docs(load_pdf_docs() + load_html_docs())

    # Running counters per anchor prefix; numbering starts at 1.
    counters = {"po": 0, "hg": 0}
    for chunk in chunks:
        src = chunk.metadata["source"]
        prefix = "po" if src == "Prüfungsordnung (PDF)" else "hg"
        counters[prefix] += 1
        anchor_id = f"{prefix}_{counters[prefix]}"
        chunk.metadata["anchor_id"] = anchor_id

        # Only the HTML source gets a complete URL with its anchor.
        if src == "Hochschulgesetz NRW":
            chunk.metadata["url"] = f"{HG_URL}#{anchor_id}"

    if chunks:
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
        # One batched embedding call instead of one API request per chunk.
        vectors = embeddings.embed_documents([c.page_content for c in chunks])

        rows = [
            {
                "content": chunk.page_content,
                "metadata": chunk.metadata,
                "embedding": vector,
            }
            for chunk, vector in zip(chunks, vectors)
        ]

        # Moderate batch size keeps each request body small while still
        # avoiding a round-trip per row.
        batch_size = 50
        for start in range(0, len(rows), batch_size):
            supabase.table("documents").insert(
                rows[start:start + batch_size]
            ).execute()

    print("OK ✔ ingest xong – PDF + HTML mit Quelle-URL")
 
 
 
 
101
 
 
 
 
 
102
 
103
  if __name__ == "__main__":
104
+ ingest()