Nguyen5 committed on
Commit
8308ad9
·
1 Parent(s): 9534de3
Files changed (3) hide show
  1. app.py +47 -132
  2. ingest.py +32 -60
  3. rag_pipeline.py +29 -86
app.py CHANGED
@@ -1,163 +1,78 @@
1
- # app.py — UI mit klickbaren Quellen & Voice-Eingabe
2
- import os
3
  import gradio as gr
 
4
  from openai import OpenAI
5
-
6
- from supabase_client import supabase
7
  from rag_pipeline import rag_answer
 
8
 
9
  client = OpenAI()
10
- BUCKET = os.environ["SUPABASE_BUCKET"]
11
-
12
-
13
- # --------------------------------------------------------
14
- # Viewer HTML aus Supabase-Dokumenten bauen
15
- # --------------------------------------------------------
16
- def build_viewer_html():
17
- """Baut HTML-Viewer aus Tabelle documents mit anchor_id."""
18
- resp = supabase.table("documents").select("content, metadata").limit(2000).execute()
19
- data = resp.data or []
20
-
21
- po_blocks = []
22
- hg_blocks = []
23
-
24
- for row in data:
25
- content = row.get("content") or ""
26
- meta = row.get("metadata") or {}
27
- src = meta.get("source", "")
28
- anchor_id = meta.get("anchor_id")
29
- page = meta.get("page", None)
30
- page_info = f"(Seite {page})" if page else ""
31
-
32
- block_html = (
33
- f"<div id='{anchor_id}' style='margin-bottom: 1rem;'>"
34
- f"<b>{src} {page_info}</b><br>{content}</div>"
35
- )
36
 
 
 
 
 
 
 
 
 
 
 
 
37
  if "Prüfungsordnung" in src:
38
- po_blocks.append(block_html)
39
- elif "Hochschulgesetz" in src:
40
- hg_blocks.append(block_html)
41
-
42
- po_html = "<h3>Prüfungsordnung</h3>" + "".join(po_blocks)
43
- hg_html = "<h3>Hochschulgesetz NRW</h3>" + "".join(hg_blocks)
44
-
45
- return po_html, hg_html
46
-
47
-
48
- PO_HTML, HG_HTML = build_viewer_html()
49
 
 
50
 
51
- # --------------------------------------------------------
52
- # Speech-to-Text (Whisper, DE)
53
- # --------------------------------------------------------
54
- def transcribe(audio_path: str) -> str:
55
- if not audio_path:
56
  return ""
57
- with open(audio_path, "rb") as f:
58
- result = client.audio.transcriptions.create(
59
- model="whisper-1",
60
- file=f,
61
- language="de",
62
- temperature=0.0,
63
  )
64
- return (result.text or "").strip()
65
 
66
-
67
- # --------------------------------------------------------
68
- # Chat-Funktion
69
- # --------------------------------------------------------
70
  def chat_fn(text, audio, history):
71
- text = (text or "").strip()
72
-
73
- # 1) Priorität: Text. Nur wenn kein Text → Audio
74
  if text:
75
- question = text
76
- elif audio is not None:
77
- question = transcribe(audio)
78
  else:
79
- return history, "<p>Bitte Text eingeben oder Mikrofon benutzen.</p>", None
80
-
81
- if not question:
82
- return history, "<p>Spracherkennung fehlgeschlagen. Bitte erneut sprechen.</p>", None
83
 
84
- # 2) RAG-Antwort
85
- answer, docs = rag_answer(question, history or [])
86
 
87
- # 3) Quellen-HTML mit klickbaren Anchors
88
  html = "<ol>"
89
  for i, d in enumerate(docs):
90
- meta = d.get("metadata") or {}
91
- src = meta.get("source", "?")
92
- page = meta.get("page", None)
93
- page_info = f"(Seite {page})" if page else ""
94
- anchor_id = meta.get("anchor_id")
95
- snippet = (d.get("content") or "")[:200]
96
-
97
- if anchor_id:
98
- link = f"#{anchor_id}"
99
- html += (
100
- f"<li>"
101
- f"<a href='{link}'><b>Quelle {i+1}: {src} {page_info}</b></a><br>"
102
- f"{snippet}..."
103
- f"</li>"
104
- )
105
- else:
106
- html += (
107
- f"<li><b>Quelle {i+1}: {src} {page_info}</b><br>"
108
- f"{snippet}...</li>"
109
- )
110
  html += "</ol>"
111
 
112
- # 4) History im messages-Format (für Gradio)
113
- new_history = (history or []) + [
114
- {"role": "user", "content": question},
115
- {"role": "assistant", "content": answer},
116
  ]
117
 
118
- # Reset Audio nach dem Senden
119
- return new_history, html, gr.update(value=None)
120
-
121
 
122
- # --------------------------------------------------------
123
- # UI Layout
124
- # --------------------------------------------------------
125
  with gr.Blocks() as demo:
126
- gr.Markdown("# ⚖️ Sprachbasierter Chatbot für Prüfungsrecht")
127
-
128
- with gr.Row():
129
- with gr.Column(scale=3):
130
- chatbot = gr.Chatbot(label="Chat (Prüfungsrecht)")
131
- text_input = gr.Textbox(
132
- label="Text-Eingabe",
133
- placeholder="Frage hier eintippen ..."
134
- )
135
- audio_input = gr.Audio(
136
- type="filepath",
137
- label="Spracheingabe (Mikrofon)"
138
- )
139
- send_btn = gr.Button("Senden")
140
-
141
- with gr.Column(scale=2):
142
- gr.Markdown("### 📄 Prüfungsordnung (mit Ankern)")
143
- gr.HTML(
144
- f"<div style='overflow:auto; height:250px; "
145
- f"border:1px solid #ccc; padding:10px;'>{PO_HTML}</div>"
146
- )
147
-
148
- gr.Markdown("### 📜 Hochschulgesetz NRW (mit Ankern)")
149
- gr.HTML(
150
- f"<div style='overflow:auto; height:250px; "
151
- f"border:1px solid #ccc; padding:10px;'>{HG_HTML}</div>"
152
- )
153
 
154
- sources_html = gr.HTML()
 
 
155
 
156
- send_btn.click(
157
  chat_fn,
158
  inputs=[text_input, audio_input, chatbot],
159
- outputs=[chatbot, sources_html, audio_input],
160
  )
161
 
162
- if __name__ == "__main__":
163
- demo.launch(ssr_mode=False)
 
1
+ # app.py
 
2
  import gradio as gr
3
+ import os
4
  from openai import OpenAI
 
 
5
  from rag_pipeline import rag_answer
6
+ from supabase_client import supabase
7
 
8
  client = OpenAI()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
+ def build_viewer():
11
+ resp = supabase.table("documents").select("content, metadata").execute()
12
+ items = resp.data or []
13
+ po_html = []
14
+ hg_html = []
15
+ for row in items:
16
+ meta = row["metadata"]
17
+ src = meta["source"]
18
+ anchor = meta["anchor_id"]
19
+ page = meta.get("page", "")
20
+ block_html = f"<div id='{anchor}'><b>{src} {page}</b><br>{row['content']}</div>"
21
  if "Prüfungsordnung" in src:
22
+ po_html.append(block_html)
23
+ else:
24
+ hg_html.append(block_html)
25
+ return "".join(po_html), "".join(hg_html)
 
 
 
 
 
 
 
26
 
27
+ PO_HTML, HG_HTML = build_viewer()
28
 
29
+ def transcribe(audio):
30
+ if audio is None:
 
 
 
31
  return ""
32
+ with open(audio, "rb") as f:
33
+ res = client.audio.transcriptions.create(
34
+ model="whisper-1", file=f, language="de", temperature=0
 
 
 
35
  )
36
+ return res.text.strip()
37
 
 
 
 
 
38
  def chat_fn(text, audio, history):
39
+ text = text.strip() if text else ""
 
 
40
  if text:
41
+ q = text
 
 
42
  else:
43
+ q = transcribe(audio)
 
 
 
44
 
45
+ answer, docs = rag_answer(q, history or [])
 
46
 
 
47
  html = "<ol>"
48
  for i, d in enumerate(docs):
49
+ meta = d["metadata"]
50
+ anchor = meta["anchor_id"]
51
+ snippet = d["content"][:200]
52
+ html += f"<li><a href='#{anchor}'><b>Quelle {i+1}</b></a><br>{snippet}...</li>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  html += "</ol>"
54
 
55
+ new_hist = (history or []) + [
56
+ {"role": "user", "content": q},
57
+ {"role": "assistant", "content": answer}
 
58
  ]
59
 
60
+ return new_hist, html, gr.update(value=None) # reset audio
 
 
61
 
 
 
 
62
  with gr.Blocks() as demo:
63
+ chatbot = gr.Chatbot()
64
+ text_input = gr.Textbox(label="Text Eingabe")
65
+ audio_input = gr.Audio(type="filepath", label="Mikrofon")
66
+ send = gr.Button("Senden")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
+ po_view = gr.HTML(f"<div style='height:250px; overflow:auto'>{PO_HTML}</div>")
69
+ hg_view = gr.HTML(f"<div style='height:250px; overflow:auto'>{HG_HTML}</div>")
70
+ sources = gr.HTML()
71
 
72
+ send.click(
73
  chat_fn,
74
  inputs=[text_input, audio_input, chatbot],
75
+ outputs=[chatbot, sources, audio_input]
76
  )
77
 
78
+ demo.launch()
 
ingest.py CHANGED
@@ -1,4 +1,4 @@
1
- # ingest.py — Ingest mit anchor_id für jeden Absatz
2
  import os
3
  from io import BytesIO
4
  from bs4 import BeautifulSoup
@@ -6,96 +6,68 @@ from pypdf import PdfReader
6
 
7
  from supabase_client import supabase, load_file_bytes
8
  from langchain_openai import OpenAIEmbeddings
9
- from langchain_core.documents import Document
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
 
11
 
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
13
 
14
-
15
  def load_pdf_docs():
16
- """Lädt Prüfungsordnung.pdf aus Supabase (in-memory) und erzeugt pro Seite ein Document."""
17
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
18
  reader = PdfReader(BytesIO(pdf_bytes))
19
-
20
  docs = []
21
  for i, page in enumerate(reader.pages):
22
  text = page.extract_text() or ""
23
- docs.append(
24
- Document(
25
- page_content=text,
26
- metadata={
27
- "source": "Prüfungsordnung",
28
- "page": i + 1,
29
- },
30
- )
31
- )
32
  return docs
33
 
34
-
35
  def load_html_docs():
36
- """Lädt hochschulgesetz.html aus Supabase und extrahiert reinen Text."""
37
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
38
- html_str = html_bytes.decode("utf-8", errors="ignore")
39
- soup = BeautifulSoup(html_str, "html.parser")
40
  text = soup.get_text(separator="\n")
41
-
42
- return [
43
- Document(
44
- page_content=text,
45
- metadata={"source": "Hochschulgesetz NRW"},
46
- )
47
- ]
48
-
49
 
50
  def chunk_docs(docs):
51
- """Chunking in sinnvolle Absätze."""
52
  splitter = RecursiveCharacterTextSplitter(
53
- chunk_size=800,
54
- chunk_overlap=150,
55
- )
56
  return splitter.split_documents(docs)
57
 
58
-
59
  def ingest():
60
- print("📥 Lade Dokumente aus Supabase...")
61
  pdf_docs = load_pdf_docs()
62
  hg_docs = load_html_docs()
63
- all_docs = pdf_docs + hg_docs
64
 
65
- print(f"📄 Rohdokumente geladen: {len(all_docs)}")
66
- chunks = chunk_docs(all_docs)
67
- print(f"✂️ Zu Chunks gesplittet: {len(chunks)}")
68
 
69
- # anchor_id vergeben
70
- po_idx = 1
71
- hg_idx = 1
72
  for d in chunks:
73
- src = d.metadata.get("source", "")
74
- if "Prüfungsordnung" in src:
75
- d.metadata["anchor_id"] = f"po_{po_idx}"
76
- po_idx += 1
77
- elif "Hochschulgesetz" in src:
78
- d.metadata["anchor_id"] = f"hg_{hg_idx}"
79
- hg_idx += 1
80
 
81
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
82
 
83
- print("🧠 Erzeuge Embeddings & schreibe nach Supabase (Tabelle documents)...")
84
- for i, d in enumerate(chunks):
85
  emb = embeddings.embed_query(d.page_content)
86
- supabase.table("documents").insert(
87
- {
88
- "content": d.page_content,
89
- "metadata": d.metadata,
90
- "embedding": emb,
91
- }
92
- ).execute()
93
-
94
- if (i + 1) % 50 == 0:
95
- print(f" → {i+1}/{len(chunks)} Chunks gespeichert")
96
-
97
- print("✅ Ingest abgeschlossen – Dokumente mit anchor_id in Supabase gespeichert.")
98
 
 
99
 
100
  if __name__ == "__main__":
101
  ingest()
 
1
+ # ingest.py
2
  import os
3
  from io import BytesIO
4
  from bs4 import BeautifulSoup
 
6
 
7
  from supabase_client import supabase, load_file_bytes
8
  from langchain_openai import OpenAIEmbeddings
 
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
+ from langchain_core.documents import Document
11
 
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
13
 
 
14
  def load_pdf_docs():
 
15
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
16
  reader = PdfReader(BytesIO(pdf_bytes))
 
17
  docs = []
18
  for i, page in enumerate(reader.pages):
19
  text = page.extract_text() or ""
20
+ docs.append(Document(
21
+ page_content=text,
22
+ metadata={"source": "Prüfungsordnung", "page": i + 1},
23
+ ))
 
 
 
 
 
24
  return docs
25
 
 
26
  def load_html_docs():
 
27
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
28
+ html = html_bytes.decode("utf-8", errors="ignore")
29
+ soup = BeautifulSoup(html, "html.parser")
30
  text = soup.get_text(separator="\n")
31
+ return [Document(
32
+ page_content=text,
33
+ metadata={"source": "Hochschulgesetz NRW"},
34
+ )]
 
 
 
 
35
 
36
  def chunk_docs(docs):
 
37
  splitter = RecursiveCharacterTextSplitter(
38
+ chunk_size=900, chunk_overlap=100)
 
 
39
  return splitter.split_documents(docs)
40
 
 
41
  def ingest():
 
42
  pdf_docs = load_pdf_docs()
43
  hg_docs = load_html_docs()
44
+ chunks = chunk_docs(pdf_docs + hg_docs)
45
 
46
+ # gán anchor_id
47
+ po_index = 1
48
+ hg_index = 1
49
 
 
 
 
50
  for d in chunks:
51
+ src = d.metadata["source"]
52
+ if src == "Prüfungsordnung":
53
+ d.metadata["anchor_id"] = f"po_{po_index}"
54
+ po_index += 1
55
+ else:
56
+ d.metadata["anchor_id"] = f"hg_{hg_index}"
57
+ hg_index += 1
58
 
59
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
60
 
61
+ # insert thủ công
62
+ for d in chunks:
63
  emb = embeddings.embed_query(d.page_content)
64
+ supabase.table("documents").insert({
65
+ "content": d.page_content,
66
+ "metadata": d.metadata,
67
+ "embedding": emb
68
+ }).execute()
 
 
 
 
 
 
 
69
 
70
+ print("OK ✔ ingest xong – đã có anchor_id cho tất cả documents")
71
 
72
  if __name__ == "__main__":
73
  ingest()
rag_pipeline.py CHANGED
@@ -1,108 +1,51 @@
1
- # rag_pipeline.py — RAG mit Supabase RPC & anchor_id
2
  import os
3
  from datetime import date
4
- from typing import Any, List
5
-
6
  from openai import OpenAI
7
- from langchain_openai import OpenAIEmbeddings
8
  from supabase_client import supabase
 
9
 
10
  client = OpenAI()
11
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
12
 
13
-
14
- def get_relevant_docs(query: str, k: int = 6) -> List[dict]:
15
- """Ruft match_documents in Supabase auf und liefert die besten k Treffer."""
16
- embedding = embedder.embed_query(query)
17
- resp = supabase.rpc(
18
- "match_documents",
19
- {"query_embedding": embedding, "filter": {}},
20
- ).execute()
21
- data = resp.data or []
22
- return data[:k]
23
-
24
-
25
- def save_message(role: str, message: str) -> None:
26
- """Speichert Nachrichten nach Datum gruppiert in chat_history."""
27
- today = date.today().isoformat()
28
- supabase.table("chat_history").insert(
29
- {
30
- "session_date": today,
31
- "role": role,
32
- "message": message,
33
- }
34
- ).execute()
35
-
36
-
37
- def format_history(history: Any) -> str:
38
- """History (list von dict oder tuples) zu einfachem Text für den Prompt."""
39
- if not history:
40
- return ""
41
- out = ""
42
- for turn in history:
43
- if isinstance(turn, dict) and "role" in turn and "content" in turn:
44
- r = turn["role"]
45
- c = str(turn["content"])
46
- if r == "user":
47
- out += f"User: {c}\n"
48
- elif r == "assistant":
49
- out += f"Assistant: {c}\n"
50
- elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
51
- out += f"User: {turn[0]}\nAssistant: {turn[1]}\n"
52
- return out
53
-
54
-
55
- def rag_answer(question: str, history: Any):
56
- """Gibt (Antworttext, Liste von Dokumentdicts) zurück."""
57
- docs = get_relevant_docs(question)
58
-
59
- # Kontext
60
- context_parts = []
61
  for i, d in enumerate(docs):
62
- meta = d.get("metadata") or {}
63
- src = meta.get("source", "Quelle")
64
  page = meta.get("page")
65
  page_info = f"(Seite {page})" if page else ""
66
- text = d.get("content") or ""
67
- context_parts.append(
68
- f"[Quelle {i+1}] {src} {page_info}\n{text}"
69
- )
70
- context = "\n\n".join(context_parts) if context_parts else "Keine relevanten Dokumente gefunden."
71
-
72
- history_text = format_history(history)
73
-
74
- system_prompt = (
75
- "Du bist ein spezialisierter Chatbot für Prüfungsrecht an einer Hochschule. "
76
- "Du antwortest ausschließlich auf Basis der bereitgestellten Dokumente "
77
- "(Prüfungsordnung, Hochschulgesetz NRW). "
78
- "Wenn die Dokumente keine klare Antwort liefern, sag ehrlich, dass es in den vorhandenen Unterlagen nicht eindeutig geregelt ist. "
79
- "Zitiere Quellen immer im Format [Quelle X] und nenne, ob sie aus der Prüfungsordnung oder dem Hochschulgesetz stammen."
80
- )
81
-
82
- user_content = (
83
- f"Frage: {question}\n\n"
84
- f"Bisheriger Chatverlauf:\n{history_text}\n\n"
85
- f"Relevante Auszüge aus den Dokumenten:\n{context}\n\n"
86
- "Formuliere eine klare, juristisch saubere Antwort. "
87
- "Gib am Ende deiner Antwort eine Liste der verwendeten Quellen im Format:\n"
88
- "[Quelle 1: Prüfungsordnung, Seite ..., ggf. Paragraph]\n"
89
- "[Quelle 2: Hochschulgesetz NRW, Seite ..., ggf. Paragraph]\n"
90
- )
91
 
92
  messages = [
93
- {"role": "system", "content": system_prompt},
94
- {"role": "user", "content": user_content},
95
  ]
96
 
97
- completion = client.chat.completions.create(
98
  model="gpt-4.1-mini",
99
  messages=messages,
100
- temperature=0.1,
101
  )
102
 
103
- answer = completion.choices[0].message.content
104
-
105
- save_message("user", question)
106
  save_message("assistant", answer)
107
 
108
  return answer, docs
 
1
+ # rag_pipeline.py
2
  import os
3
  from datetime import date
 
 
4
  from openai import OpenAI
 
5
  from supabase_client import supabase
6
+ from langchain_openai import OpenAIEmbeddings
7
 
8
  client = OpenAI()
9
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
10
 
11
+ def get_relevant_docs(query, k=4):
12
+ emb = embedder.embed_query(query)
13
+ resp = supabase.rpc("match_documents", {
14
+ "query_embedding": emb,
15
+ "filter": {}
16
+ }).execute()
17
+ return (resp.data or [])[:k]
18
+
19
+ def save_message(role, content):
20
+ supabase.table("chat_history").insert({
21
+ "session_date": date.today().isoformat(),
22
+ "role": role,
23
+ "message": content
24
+ }).execute()
25
+
26
+ def rag_answer(query, history):
27
+ docs = get_relevant_docs(query)
28
+ context = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  for i, d in enumerate(docs):
30
+ meta = d["metadata"]
31
+ src = meta["source"]
32
  page = meta.get("page")
33
  page_info = f"(Seite {page})" if page else ""
34
+ context += f"[Quelle {i+1}] {src} {page_info}\n{d['content']}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  messages = [
37
+ {"role": "system", "content": "Du bist Chatbot für Prüfungsrecht…"},
38
+ {"role": "user", "content": f"Frage: {query}\n\nDokumente:\n{context}"}
39
  ]
40
 
41
+ res = client.chat.completions.create(
42
  model="gpt-4.1-mini",
43
  messages=messages,
44
+ temperature=0
45
  )
46
 
47
+ answer = res.choices[0].message.content
48
+ save_message("user", query)
 
49
  save_message("assistant", answer)
50
 
51
  return answer, docs