Nguyen5 committed on
Commit
b586b7b
·
1 Parent(s): f6325e1
Files changed (4) hide show
  1. app.py +64 -153
  2. ingest.py +27 -91
  3. rag_pipeline.py +58 -130
  4. supabase_client.py +11 -20
app.py CHANGED
@@ -2,16 +2,16 @@
2
  import os
3
  import re
4
  import base64
 
 
5
 
6
  import gradio as gr
7
  from openai import OpenAI
8
 
9
  from supabase_client import load_file_bytes
10
- from rag_pipeline import rag_answer # agent_answer alias
 
11
 
12
- # -------------------------------------------------------------------
13
- # OpenAI client cho Whisper (Speech-to-Text)
14
- # -------------------------------------------------------------------
15
  client = OpenAI()
16
 
17
  BUCKET = os.environ["SUPABASE_BUCKET"]
@@ -21,183 +21,94 @@ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pd
21
  HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
22
 
23
 
24
- # -------------------------------------------------------------------
25
- # PDF Viewer (Base64 iframe)
26
- # -------------------------------------------------------------------
27
- def encode_pdf_src() -> str:
28
- pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
29
- b64 = base64.b64encode(pdf_bytes).decode("utf-8")
30
- return f"data:application/pdf;base64,{b64}"
31
-
32
-
33
- # -------------------------------------------------------------------
34
- # Speech-to-Text (Whisper) + cleaning
35
- # -------------------------------------------------------------------
36
- FILLER = [
37
- "äh",
38
- "ähm",
39
- "uh",
40
- "hmm",
41
- "mmh",
42
- "ah",
43
- "oh",
44
- "also",
45
- "sozusagen",
46
- "halt",
47
- "irgendwie",
48
- ]
49
-
50
-
51
- def clean_transcript(t: str) -> str:
52
- if not t:
53
- return ""
54
- t = t.lower().strip()
55
- for f in FILLER:
56
- t = re.sub(rf"\b{re.escape(f)}\b", "", t)
57
- t = re.sub(r"[^a-zA-ZäöüÄÖÜß0-9,.? ]+", " ", t)
58
- t = re.sub(r"\s+", " ", t).strip()
59
- if len(t) > 1:
60
- t = t[0].upper() + t[1:]
61
- return t
62
-
63
-
64
- def transcribe(audio_path: str) -> str:
65
- if audio_path is None:
66
  return ""
67
- with open(audio_path, "rb") as f:
68
- result = client.audio.transcriptions.create(
69
- model="whisper-1",
70
- file=f,
71
- language="de",
72
- temperature=0.0,
73
- )
74
- raw = (result.text or "").strip()
75
- cleaned = clean_transcript(raw)
76
- return cleaned if len(cleaned) >= 3 else ""
77
-
78
-
79
- # -------------------------------------------------------------------
80
- # Hàm CHAT chính – gọi Agent (rag_answer)
81
- # -------------------------------------------------------------------
82
  def chat_fn(mode, text, audio, history):
83
  history = history or []
84
 
85
- # 1) Chọn câu hỏi theo mode
86
  if mode == "text":
87
- if not (text or "").strip():
88
- return history, "Bitte Text eingeben.", None
89
- question = text.strip()
90
- else: # mode == "audio"
91
- if audio is None:
92
- return history, "Bitte ins Mikrofon sprechen.", None
93
- question = transcribe(audio)
94
- if not question:
95
- return (
96
- history,
97
- "Spracherkennung fehlgeschlagen. Bitte erneut versuchen.",
98
- None,
99
- )
100
 
101
- # 2) Gọi Agent (RAG + Tools)
102
- answer, docs = rag_answer(question, history)
103
 
104
- # 3) Xây block Quellen (UI-friendly)
105
- quellen_md_lines = ["", "### 📚 Verwendete Quellen"]
106
 
 
107
  for i, d in enumerate(docs):
108
- meta = d.get("metadata", {}) or {}
109
- src = meta.get("source", "?")
110
- page = meta.get("page", None)
111
-
112
- # Prüfungsordnung – nhảy đúng Seite
113
- if isinstance(src, str) and src.startswith("Prüfungsordnung"):
114
- page_num = page if isinstance(page, int) else None
115
- if page_num:
116
- url = f"{PDF_URL}#page={page_num}"
117
- title = f"Quelle {i+1}: Prüfungsordnung (Seite {page_num})"
118
- else:
119
- url = PDF_URL
120
- title = f"Quelle {i+1}: Prüfungsordnung"
121
- # Hochschulgesetz – link trang chính thức
122
  else:
123
  url = HG_URL
124
- title = f"Quelle {i+1}: Hochschulgesetz NRW"
125
 
126
- snippet = (d.get("content") or "").strip().replace("\n", " ")
127
- snippet = snippet[:200] + ("…" if len(snippet) > 200 else "")
128
 
129
- quellen_md_lines.append(
130
- f"- [{title}]({url})\n"
131
- f" - **Ausschnitt:** „{snippet}“"
132
- )
133
 
134
- quellen_md = "\n".join(quellen_md_lines)
 
 
 
135
 
136
- bot_msg = answer + "\n\n" + quellen_md
137
 
138
- new_history = history + [
139
- {"role": "user", "content": question},
140
- {"role": "assistant", "content": bot_msg},
141
- ]
142
-
143
- return new_history, bot_msg, gr.update(value=None)
144
-
145
-
146
- # -------------------------------------------------------------------
147
- # Giao diện Gradio – UI thân thiện
148
- # -------------------------------------------------------------------
149
  with gr.Blocks() as demo:
150
- gr.Markdown(
151
- """
152
- # ⚖️ Prüfungsrechts-Assistent (NRW)
153
-
154
- Willkommen!
155
- Ich beantworte Ihre Fragen auf Basis der **offiziellen Dokumente**:
156
-
157
- - 📘 *Prüfungsordnung Ihrer Hochschule*
158
- - 📗 *Hochschulgesetz NRW (recht.nrw.de)*
159
-
160
- Wählen Sie unten: **Text** oder **Sprache**.
161
- """
162
- )
163
 
164
  with gr.Row():
165
- # LEFT: Chat
166
  with gr.Column(scale=3):
167
- chatbot = gr.Chatbot(label="Chatverlauf")
168
 
169
- mode_select = gr.Radio(
170
- ["text", "audio"],
171
- value="text",
172
- label="Eingabemodus",
173
- info="Wähle zwischen Text oder Sprache",
174
- )
175
 
176
- text_input = gr.Textbox(label="Text eingeben")
177
- audio_input = gr.Audio(
178
- type="filepath", label="Spracheingabe (Mikrofon)"
 
 
179
  )
180
 
181
- send_btn = gr.Button("Senden")
182
- answer_preview = gr.Markdown("")
183
 
184
- # RIGHT: Viewer
185
  with gr.Column(scale=2):
186
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
187
- gr.HTML(
188
- f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
189
- )
190
 
191
- gr.Markdown("### 📘 Hochschulgesetz NRW (offizielle Seite)")
192
- gr.HTML(
193
- f"<iframe src='{HG_URL}' width='100%' height='250' style='border:none;'></iframe>"
194
- )
195
 
196
- send_btn.click(
197
- chat_fn,
198
- inputs=[mode_select, text_input, audio_input, chatbot],
199
- outputs=[chatbot, answer_preview, audio_input],
200
- )
201
 
202
  if __name__ == "__main__":
203
- demo.queue().launch(ssr_mode=False)
 
2
  import os
3
  import re
4
  import base64
5
+ import io
6
+ import soundfile as sf
7
 
8
  import gradio as gr
9
  from openai import OpenAI
10
 
11
  from supabase_client import load_file_bytes
12
+ from rag_pipeline import rag_answer
13
+
14
 
 
 
 
15
  client = OpenAI()
16
 
17
  BUCKET = os.environ["SUPABASE_BUCKET"]
 
21
  HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
22
 
23
 
def encode_pdf_src():
    """Return the Prüfungsordnung PDF as a base64 ``data:`` URI.

    The bytes are fetched with ``load_file_bytes`` from the bucket named by
    ``BUCKET`` and embedded inline, so the result can be used directly as the
    ``src`` of an iframe.
    """
    pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    encoded = base64.b64encode(pdf_bytes).decode()
    return f"data:application/pdf;base64,{encoded}"
27
+
28
+
# Whisper cleanup
def clean_text(t):
    """Normalise a Whisper transcript before it is used as a question.

    The text is lower-cased, every run of characters other than word
    characters, spaces and ``,.?-`` is replaced by a single space, repeated
    spaces are collapsed, and the first letter is capitalised.

    Args:
        t: Raw transcript; ``None`` and ``""`` are accepted.

    Returns:
        The cleaned transcript, or ``""`` when there is nothing to clean.
    """
    if not t:
        return ""

    t = t.lower()
    t = re.sub(r"[^\wäöüß ,.?-]+", " ", t)
    # A replaced run next to an existing space leaves two spaces in a row
    # ("hallo!!! welt" -> "hallo  welt"); collapse them again.
    t = re.sub(r" {2,}", " ", t)
    return t.strip().capitalize()
34
+
35
+
def transcribe(audio):
    """Transcribe a microphone recording to cleaned German text via Whisper.

    Args:
        audio: Value delivered by the ``gr.Audio(type="numpy")`` input, i.e. a
            ``(sample_rate, samples)`` tuple, or ``None`` when nothing was
            recorded.

    Returns:
        The transcript passed through ``clean_text``, or ``""`` when no audio
        was supplied.
    """
    if audio is None:
        return ""

    # Gradio's numpy audio format puts the sample rate first and the sample
    # array second; unpacking it the other way round hands soundfile an int
    # as data and an array as sample rate.
    sr, audio_data = audio

    buf = io.BytesIO()
    sf.write(buf, audio_data, sr, format="WAV")

    # transcriptions.create() has no ``filename`` keyword. The upload name
    # (needed so the API can detect the container format) is supplied through
    # the (filename, bytes, content_type) tuple form of ``file``.
    result = client.audio.transcriptions.create(
        model="whisper-1",
        file=("audio.wav", buf.getvalue(), "audio/wav"),
        language="de",
    )
    return clean_text(result.text or "")
48
+
49
+
 
 
 
 
def _format_quelle(d):
    """Render one retrieved document as a Markdown bullet for the sources list."""
    src = str(d.get("source") or "?")
    pg = d.get("page")

    if src.startswith("Prüfungsordnung"):
        # Deep-link to the page only when a page number is known.
        url = f"{PDF_URL}#page={pg}" if pg else PDF_URL
    else:
        url = HG_URL

    # Documents without a page number must not render as "Seite None".
    seite = f" (Seite {pg})" if pg else ""

    full = d.get("snippet") or ""
    snippet = full[:200]
    if len(full) > 200:
        # Mark the cut only when text was actually dropped.
        snippet += "…"

    return f"- **{src}**{seite} [{url}]({url}) \n „{snippet}”"


def chat_fn(mode, text, audio, history):
    """Answer one question from the UI and append the exchange to the history.

    Args:
        mode: ``"text"`` to use ``text``; any other value transcribes ``audio``.
        text: Content of the text box (may be ``None`` or empty).
        audio: Microphone value, forwarded to ``transcribe``.
        history: Chat history as a list of ``{"role", "content"}`` dicts, or
            ``None`` for a fresh conversation.

    Returns:
        A 3-tuple ``(history, message, audio_update)``: the (possibly extended)
        history, the Markdown shown in the preview, and an update that clears
        the audio input (``None`` when no valid input was recognised).
    """
    history = history or []

    if mode == "text":
        # The text box can deliver None; treat that like empty input.
        q = (text or "").strip()
    else:
        q = transcribe(audio)

    if not q:
        return history, "Keine gültige Eingabe erkannt.", None

    answer, docs = rag_answer(q, history)

    quellen = ["", "### 📚 Verwendete Quellen"]
    quellen.extend(_format_quelle(d) for d in docs)

    bot = answer + "\n\n" + "\n".join(quellen)

    new_history = history + [
        {"role": "user", "content": q},
        {"role": "assistant", "content": bot},
    ]
    return new_history, bot, gr.update(value=None)
82
 
 
83
 
 
 
 
 
 
 
 
 
 
 
 
# Gradio UI: chat controls in the wider left column, the two source documents
# embedded as iframes in the right column.
with gr.Blocks() as demo:
    gr.Markdown("# ⚖️ Prüfungsrechts-Assistent NRW")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot()

            mode = gr.Radio(
                choices=["text", "audio"],
                value="text",
                label="Eingabemodus",
            )
            text = gr.Textbox(label="Text eingeben")

            audio = gr.Audio(
                sources=["microphone"],
                type="numpy",
                format="wav",
                label="Spracheingabe (Mikrofon)",
            )

            send = gr.Button("Senden")
            preview = gr.Markdown()

        with gr.Column(scale=2):
            gr.Markdown("### 📄 Prüfungsordnung (PDF)")
            pdf_frame = f"<iframe src='{encode_pdf_src()}' width='100%' height='260'></iframe>"
            gr.HTML(pdf_frame)

            gr.Markdown("### 📘 Hochschulgesetz NRW")
            hg_frame = f"<iframe src='{HG_URL}' width='100%' height='260'></iframe>"
            gr.HTML(hg_frame)

    # chat_fn returns (history, preview text, audio reset) in this order.
    send.click(
        fn=chat_fn,
        inputs=[mode, text, audio, chatbot],
        outputs=[chatbot, preview, audio],
    )


if __name__ == "__main__":
    demo.queue().launch()
ingest.py CHANGED
@@ -1,53 +1,37 @@
1
  # ingest.py
2
  import os
3
  from io import BytesIO
4
-
5
  from bs4 import BeautifulSoup
6
  from pypdf import PdfReader
7
 
8
  from supabase_client import supabase, load_file_bytes
9
-
10
  from langchain_openai import OpenAIEmbeddings
11
  from langchain_text_splitters import RecursiveCharacterTextSplitter
12
  from langchain_core.documents import Document
13
- from langchain_community.vectorstores import SupabaseVectorStore
14
 
15
- # -------------------------------------------------------------------
16
- # ENV + URLs
17
- # -------------------------------------------------------------------
18
  BUCKET = os.environ["SUPABASE_BUCKET"]
19
  SUPABASE_URL = os.environ["SUPABASE_URL"]
20
 
21
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
22
- HG_STORAGE_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
23
-
24
  OFFICIAL_HG_URL = (
25
  "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
26
  )
27
 
28
 
29
- # -------------------------------------------------------------------
30
- # Loader PDF Prüfungsordnung
31
- # -------------------------------------------------------------------
32
  def load_pdf_docs():
33
- """
34
- Đọc Prüfungsordnung.pdf từ Supabase Storage và tạo 1 Document cho mỗi
35
- trang (page 1-based).
36
- """
37
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
38
  reader = PdfReader(BytesIO(pdf_bytes))
39
 
40
  docs = []
41
- for i, page in enumerate(reader.pages):
42
- text = page.extract_text() or ""
43
- page_num = i + 1
44
-
45
  docs.append(
46
  Document(
47
  page_content=text,
48
  metadata={
49
  "source": "Prüfungsordnung (PDF)",
50
- "page": page_num,
51
  "pdf_url": PDF_URL,
52
  },
53
  )
@@ -55,103 +39,55 @@ def load_pdf_docs():
55
  return docs
56
 
57
 
58
- # -------------------------------------------------------------------
59
- # Loader HTML Hochschulgesetz
60
- # -------------------------------------------------------------------
61
  def load_html_docs():
62
- """
63
- Đọc hochschulgesetz.html từ Supabase Storage, parse bằng BeautifulSoup,
64
- lấy toàn bộ text thành 1 Document lớn (chunk sau).
65
- """
66
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
67
- html = html_bytes.decode("utf-8", errors="ignore")
68
-
69
- soup = BeautifulSoup(html, "html.parser")
70
- text = soup.get_text(separator="\n")
71
 
72
  return [
73
  Document(
74
- page_content=text,
75
- metadata={
76
- "source": "Hochschulgesetz NRW",
77
- "official_url": OFFICIAL_HG_URL,
78
- },
79
  )
80
  ]
81
 
82
 
83
- # -------------------------------------------------------------------
84
- # Chunking – RecursiveCharacterTextSplitter
85
- # -------------------------------------------------------------------
86
  def chunk_docs(docs):
87
- splitter = RecursiveCharacterTextSplitter(
88
- chunk_size=900,
89
- chunk_overlap=100,
90
- )
91
  return splitter.split_documents(docs)
92
 
93
 
94
- # -------------------------------------------------------------------
95
- # Xoá dữ liệu cũ trong bảng documents
96
- # -------------------------------------------------------------------
97
- def delete_old_data():
98
- """
99
- Xoá toàn bộ rows trong bảng 'documents'.
100
-
101
- Cột id là UUID, nên dùng điều kiện >= với UUID nhỏ nhất để tránh lỗi
102
- 'invalid input syntax for type uuid'.
103
- """
104
- print("🔄 Lösche alte Daten aus Tabelle 'documents' ...")
105
  supabase.table("documents").delete().gte(
106
  "id", "00000000-0000-0000-0000-000000000000"
107
  ).execute()
108
- print("✔ Alte Daten in 'documents' gelöscht.")
109
 
110
 
111
- # -------------------------------------------------------------------
112
- # Ingest chính
113
- # -------------------------------------------------------------------
114
  def ingest():
115
- print("🚀 Starte Ingest (PDF + Hochschulgesetz) ...")
116
 
117
- # 1) Xoá data cũ
118
- delete_old_data()
119
-
120
- # 2) Load nguồn
121
  pdf_docs = load_pdf_docs()
122
- hg_docs = load_html_docs()
123
 
124
- # 3) Chunk
125
- chunks = chunk_docs(pdf_docs + hg_docs)
126
 
127
- # 4) Gắn anchor_id & URL meta
128
- po_idx = 1
129
- hg_idx = 1
130
- for d in chunks:
131
- src = d.metadata.get("source")
132
 
133
- if src == "Prüfungsordnung (PDF)":
134
- d.metadata["anchor_id"] = f"po_{po_idx}"
135
- po_idx += 1
136
- else:
137
- d.metadata["anchor_id"] = f"hg_{hg_idx}"
138
- hg_idx += 1
139
- d.metadata["url"] = OFFICIAL_HG_URL
140
 
141
- # 5) Embeddings + SupabaseVectorStore
142
- embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
 
 
 
 
 
143
 
144
- print("🔍 Erstelle Embeddings und speichere in SupabaseVectorStore ...")
145
- SupabaseVectorStore.from_documents(
146
- chunks,
147
- embeddings,
148
- client=supabase,
149
- table_name="documents",
150
- query_name="match_documents",
151
- chunk_size=500, # batch size khi insert
152
- )
153
-
154
- print("🎉 Ingest fertig – 'documents' ist frisch aufgebaut.")
155
 
156
 
157
  if __name__ == "__main__":
 
1
  # ingest.py
2
  import os
3
  from io import BytesIO
 
4
  from bs4 import BeautifulSoup
5
  from pypdf import PdfReader
6
 
7
  from supabase_client import supabase, load_file_bytes
 
8
  from langchain_openai import OpenAIEmbeddings
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
  from langchain_core.documents import Document
 
11
 
 
 
 
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
13
  SUPABASE_URL = os.environ["SUPABASE_URL"]
14
 
15
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
 
 
16
  OFFICIAL_HG_URL = (
17
  "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
18
  )
19
 
20
 
21
+ # ---------------- Loaders ----------------
 
 
22
  def load_pdf_docs():
 
 
 
 
23
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
24
  reader = PdfReader(BytesIO(pdf_bytes))
25
 
26
  docs = []
27
+ for i, p in enumerate(reader.pages):
28
+ text = p.extract_text() or ""
 
 
29
  docs.append(
30
  Document(
31
  page_content=text,
32
  metadata={
33
  "source": "Prüfungsordnung (PDF)",
34
+ "page": i + 1,
35
  "pdf_url": PDF_URL,
36
  },
37
  )
 
39
  return docs
40
 
41
 
 
 
 
def load_html_docs():
    """Load the Hochschulgesetz HTML from storage as a single text Document.

    The file is decoded as UTF-8 (undecodable bytes are ignored), parsed with
    BeautifulSoup, and its text is extracted with newline separators.

    Returns:
        A one-element list holding the Document, tagged with its source name
        and the official URL.
    """
    raw = load_file_bytes(BUCKET, "hochschulgesetz.html")
    markup = raw.decode("utf-8", "ignore")
    text = BeautifulSoup(markup, "html.parser").get_text("\n")

    meta = {"source": "Hochschulgesetz NRW", "url": OFFICIAL_HG_URL}
    return [Document(page_content=text, metadata=meta)]
52
 
53
 
 
 
 
def chunk_docs(docs):
    """Split *docs* with a recursive character splitter.

    The splitter is configured with ``chunk_size=900`` and
    ``chunk_overlap=80``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=80,
    ).split_documents(docs)
57
 
58
 
# ---------------- Delete old data ----------------
def delete_old_documents():
    """Delete the existing rows of the ``documents`` table.

    The delete is filtered with ``id >= <all-zero UUID>``, which matches every
    UUID id while still giving the request a syntactically valid UUID filter.
    """
    print("🗑️ Lösche alte Daten…")
    lowest_uuid = "00000000-0000-0000-0000-000000000000"
    request = supabase.table("documents").delete().gte("id", lowest_uuid)
    request.execute()
 
65
 
66
 
# ---------------- Ingest ----------------
def ingest(batch_size=100):
    """Rebuild the ``documents`` table from the PDF and HTML sources.

    All documents are loaded, chunked and embedded first; the old rows are
    deleted only once the replacement rows are ready, so a failure while
    loading or embedding does not leave the table empty.

    Args:
        batch_size: Number of rows sent per insert request (must be >= 1).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    pdf_docs = load_pdf_docs()
    html_docs = load_html_docs()

    chunks = chunk_docs(pdf_docs + html_docs)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    # Embed all chunks through the batch API instead of one request per chunk.
    texts = [d.page_content for d in chunks]
    vectors = embeddings.embed_documents(texts) if texts else []

    rows = [
        {
            "content": d.page_content,
            "metadata": d.metadata,
            "embedding": vec,
        }
        for d, vec in zip(chunks, vectors)
    ]

    # Wipe the table only now that the new rows exist in memory.
    delete_old_documents()

    print("📥 Speichere neue Dokumente…")
    for start in range(0, len(rows), batch_size):
        batch = rows[start:start + batch_size]
        supabase.table("documents").insert(batch).execute()

    print(" Ingest abgeschlossen!")
 
 
 
 
 
 
 
 
 
 
91
 
92
 
93
  if __name__ == "__main__":
rag_pipeline.py CHANGED
@@ -1,110 +1,72 @@
1
  # rag_pipeline.py
2
- from typing import Any, List, Dict
3
  from datetime import date
4
 
5
- from supabase_client import supabase
6
 
7
  from langchain_openai import ChatOpenAI, OpenAIEmbeddings
8
- from langchain_community.vectorstores import SupabaseVectorStore
9
  from langchain_core.messages import (
 
10
  HumanMessage,
11
  AIMessage,
12
- SystemMessage,
13
  )
14
 
15
- # ================================================================
16
- # INITIALIZATION
17
- # ================================================================
18
- _embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
19
 
20
- _vector_store = SupabaseVectorStore(
21
- embedding=_embeddings,
22
- client=supabase,
23
- table_name="documents",
24
- query_name="match_documents",
25
- )
26
 
27
- _retriever = _vector_store.as_retriever(search_kwargs={"k": 4})
28
-
29
- _llm = ChatOpenAI(
30
- model="gpt-4o-mini",
31
- temperature=0.0
32
- )
33
-
34
- # ================================================================
35
- # SYSTEM PROMPT (LEGAL GUARDRAILS)
36
- # ================================================================
37
  SYSTEM_PROMPT = """
38
  Du bist ein hochpräziser juristischer Assistent für Prüfungsrecht in NRW.
39
-
40
- DEINE REGELN:
41
- 1) Nutze AUSSCHLIESSLICH Inhalte aus den Dokumenten, die über das Tool
42
- 'suche_pruefungsrecht_dokumente' geliefert werden.
43
- 2) Keine Spekulation – wenn im Kontext nicht vorhanden, antworte:
44
- „Dazu liegen im bereitgestellten Dokumentenkontext keine Informationen vor.“
45
- 3) Antworte strukturiert:
46
- (a) Einordnung
47
- (b) Rechtsgrundlage (sinngemäß)
48
- (c) Bedingungen / Ausnahmen
49
- (d) Konsequenz für Studierende
50
- 4) Keine eigenen Quellenlinks – nur Sachverhalt erklären.
51
  """
52
 
53
- # ================================================================
54
- # DEFINE TOOL (OPENAI FUNCTION-CALLING)
55
- # ================================================================
56
- def tool_suche_dokumente(query: str) -> Dict:
57
- """
58
- Tool thực hiện RAG retrieval.
59
- Trả về list các docs + metadata.
60
- """
61
- docs = _retriever.invoke(query)
62
-
63
- out_docs = []
64
  for i, d in enumerate(docs):
65
- meta = d.metadata or {}
66
- snippet = d.page_content.replace("\n", " ").strip()
67
- snippet = snippet[:500]
68
 
69
- out_docs.append(
70
  {
71
  "index": i + 1,
72
- "source": meta.get("source", "?"),
73
  "page": meta.get("page"),
74
  "snippet": snippet,
 
75
  "metadata": meta,
76
- "content": d.page_content,
77
  }
78
  )
79
 
80
- return {"results": out_docs}
81
 
82
 
83
- # OpenAI tools definition
84
  TOOLS = [
85
  {
86
  "type": "function",
87
  "function": {
88
  "name": "suche_pruefungsrecht_dokumente",
89
- "description": "Sucht relevante Stellen in Prüfungsordnung und Hochschulgesetz NRW.",
90
  "parameters": {
91
  "type": "object",
92
- "properties": {
93
- "query": {"type": "string"}
94
- },
95
  "required": ["query"],
96
  },
97
  },
98
  }
99
  ]
100
 
101
- # Bind tools to LLM
102
- llm_with_tools = _llm.bind_tools(TOOLS)
103
 
104
- # ================================================================
105
- # HISTORY LOGGING (optional)
106
- # ================================================================
107
- def save_message(role: str, content: str) -> None:
108
  supabase.table("chat_history").insert(
109
  {
110
  "session_date": date.today().isoformat(),
@@ -113,91 +75,57 @@ def save_message(role: str, content: str) -> None:
113
  }
114
  ).execute()
115
 
116
- # ================================================================
117
- # CONVERT HISTORY
118
- # ================================================================
119
- def _convert_history(history):
120
  msgs = []
121
- if not isinstance(history, list):
122
- return msgs
123
-
124
- for h in history[-8:]:
125
- role = h.get("role")
126
- content = h.get("content")
127
- if not content:
128
- continue
129
-
130
- if role == "user":
131
- msgs.append(HumanMessage(content=content))
132
- elif role == "assistant":
133
- msgs.append(AIMessage(content=content))
134
  return msgs
135
 
136
 
137
- # ================================================================
138
- # MAIN — AGENT ANSWER
139
- # ================================================================
140
  def agent_answer(query: str, history: Any):
141
- """
142
- 1. Gửi prompt + query vào model.
143
- 2. Nếu model đòi gọi tool → thực thi tool → lấy kết quả → gửi lại vào LLM.
144
- 3. Trích nguồn để UI hiển thị trong phần Quellen.
145
- """
146
-
147
- chat_history_msgs = _convert_history(history)
148
-
149
- # -------- 1) Gửi câu hỏi lần đầu ----------
150
  messages = [
151
  SystemMessage(content=SYSTEM_PROMPT),
152
- *chat_history_msgs,
153
  HumanMessage(content=query),
154
  ]
155
 
156
- first_response = llm_with_tools.invoke(messages)
157
-
158
- # Nếu model muốn call tool
159
- if first_response.tool_calls:
160
- tc = first_response.tool_calls[0]
161
- if tc["name"] == "suche_pruefungsrecht_dokumente":
162
- tool_result = tool_suche_dokumente(tc["args"]["query"])
163
-
164
- # Gửi tool result vào LLM để trả lời cuối cùng
165
- messages.append(first_response)
166
- messages.append(
167
- AIMessage(
168
- content=str(tool_result),
169
- name="suche_pruefungsrecht_dokumente"
170
- )
171
  )
172
 
173
- final_response = _llm.invoke(messages)
174
- answer = final_response.content
175
-
176
- # CREATE docs_info FOR UI
177
- docs_info = tool_result["results"]
178
-
179
  else:
180
  answer = "Tool nicht unterstützt."
181
- docs_info = []
182
-
183
  else:
184
- answer = first_response.content
185
- docs_info = []
186
-
187
- # Safety Hinweis
188
- if "keine informationen" in answer.lower():
189
- answer = (
190
- "⚠️ **Hinweis:** Die Frage kann anhand des bereitgestellten Dokumentenkontextes "
191
- "nur eingeschränkt beantwortet werden.\n\n"
192
- + answer
193
- )
194
 
195
  save_message("user", query)
196
  save_message("assistant", answer)
197
 
198
- return answer, docs_info
199
 
200
 
201
- # Alias để app.py dùng như cũ
202
  def rag_answer(query: str, history: Any):
203
  return agent_answer(query, history)
 
1
  # rag_pipeline.py
2
+ from typing import Any
3
  from datetime import date
4
 
5
+ from supabase_client import supabase, match_documents
6
 
7
  from langchain_openai import ChatOpenAI, OpenAIEmbeddings
 
8
  from langchain_core.messages import (
9
+ SystemMessage,
10
  HumanMessage,
11
  AIMessage,
 
12
  )
13
 
# Shared model clients: the embedding model is used to embed search queries,
# the chat model (temperature 0.0) produces the answers.
emb = OpenAIEmbeddings(model="text-embedding-3-small")
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
)

# System prompt (German): restricts the assistant to tool-supplied documents,
# forbids speculation and asks for structured, understandable answers.
SYSTEM_PROMPT = """
Du bist ein hochpräziser juristischer Assistent für Prüfungsrecht in NRW.
- Nutze AUSSCHLIESSLICH Dokumente, die über das Tool geliefert werden.
- Keine Spekulation.
- Antwort strukturiert + verständlich.
"""
24
 
25
+
# ---------------- TOOL: Suche Dokumente ----------------
def tool_suche_dokumente(query: str):
    """Retrieve the four best-matching document chunks for *query*.

    The query is embedded with ``emb`` and passed to ``match_documents``.

    Args:
        query: Search text.

    Returns:
        ``{"results": [...]}`` where each entry carries ``index`` (1-based),
        ``source``, ``page``, ``snippet`` (first 400 characters with newlines
        replaced by spaces), the full ``content`` and the raw ``metadata``.
    """
    vec = emb.embed_query(query)
    docs = match_documents(vec, k=4)

    results = []
    for i, d in enumerate(docs, start=1):
        meta = d.get("metadata") or {}
        # A row without content must not crash the snippet construction.
        content = d.get("content") or ""

        results.append(
            {
                "index": i,
                # Always a string: chat_fn calls ``startswith`` on this value.
                "source": meta.get("source") or "?",
                "page": meta.get("page"),
                "snippet": content.replace("\n", " ")[:400],
                "content": content,
                "metadata": meta,
            }
        )

    return {"results": results}
48
 
49
 
 
# JSON schema of the single argument the search tool accepts.
_SEARCH_PARAMETERS = {
    "type": "object",
    "properties": {"query": {"type": "string"}},
    "required": ["query"],
}

# Tool definitions offered to the chat model (function-calling format).
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "suche_pruefungsrecht_dokumente",
            "description": "Sucht relevante Stellen im Prüfungsrecht.",
            "parameters": _SEARCH_PARAMETERS,
        },
    }
]

# Chat model variant that may request the tools above.
llm_tools = llm.bind_tools(TOOLS)
 
66
 
67
+
68
+ # ---------------- HISTORY LOG ----------------
69
+ def save_message(role: str, content: str):
 
70
  supabase.table("chat_history").insert(
71
  {
72
  "session_date": date.today().isoformat(),
 
75
  }
76
  ).execute()
77
 
78
+
def convert_history(hist):
    """Convert the last six chat entries into LangChain messages.

    Args:
        hist: Chat history as a list of ``{"role", "content"}`` dicts. Anything
            that is not a list (e.g. ``None``) is treated as an empty history.

    Returns:
        A list of ``HumanMessage`` / ``AIMessage`` objects. Entries that are
        not dicts, have no content, or carry a role other than ``"user"`` or
        ``"assistant"`` are skipped.
    """
    msgs = []
    if not isinstance(hist, list):
        return msgs

    for h in hist[-6:]:
        # Tuple-style history entries have no role/content keys; skip them
        # instead of failing on the subscript.
        if not isinstance(h, dict):
            continue

        content = h.get("content")
        if not content:
            continue

        role = h.get("role")
        if role == "user":
            msgs.append(HumanMessage(content=content))
        elif role == "assistant":
            msgs.append(AIMessage(content=content))

    return msgs
87
 
88
 
# ---------------- AGENT ANSWER ----------------
def agent_answer(query: str, history: Any):
    """Answer *query*, letting the model call the document-search tool.

    Flow: the tool-enabled model sees the system prompt, the converted history
    and the query. If it requests tool calls, each one is executed and
    answered with a ``ToolMessage`` bound to its ``tool_call_id``; the plain
    model then writes the final answer from those results. Both the query and
    the answer are stored via ``save_message``.

    Args:
        query: The user's question.
        history: Chat history, passed to ``convert_history``.

    Returns:
        ``(answer, docs)`` where ``docs`` is the list of retrieved result
        dicts (empty when no search was performed).
    """
    import json

    from langchain_core.messages import ToolMessage

    messages = [
        SystemMessage(content=SYSTEM_PROMPT),
        *convert_history(history),
        HumanMessage(content=query),
    ]

    first = llm_tools.invoke(messages)

    if first.tool_calls:
        messages.append(first)
        docs = []
        searched = False

        # Every requested call needs its own tool reply; an assistant message
        # with tool calls that is not followed by matching tool messages is
        # rejected by the chat API.
        for call in first.tool_calls:
            if call["name"] == "suche_pruefungsrecht_dokumente":
                # Fall back to the user's question if the model left out the
                # argument.
                search_query = call["args"].get("query") or query
                tool_res = tool_suche_dokumente(search_query)
                docs.extend(tool_res["results"])
                payload = json.dumps(tool_res, ensure_ascii=False)
                searched = True
            else:
                payload = "Tool nicht unterstützt."

            messages.append(
                ToolMessage(content=payload, tool_call_id=call["id"])
            )

        if searched:
            final = llm.invoke(messages)
            answer = final.content
        else:
            answer = "Tool nicht unterstützt."
            docs = []
    else:
        answer = first.content
        docs = []

    save_message("user", query)
    save_message("assistant", answer)

    return answer, docs
128
 
129
 
 
def rag_answer(query: str, history: Any):
    """Entry point imported by app.py; delegates unchanged to ``agent_answer``."""
    result = agent_answer(query, history)
    return result
supabase_client.py CHANGED
@@ -2,13 +2,6 @@
2
  import os
3
  from supabase import create_client
4
 
5
- """
6
- Supabase-Client (Service-Role) – dùng chung cho:
7
- - ingest.py (đọc Storage + ghi embeddings vào bảng documents)
8
- - rag_pipeline.py (tạo SupabaseVectorStore cho Agent)
9
- - app.py (PDF-Viewer)
10
- """
11
-
12
  SUPABASE_URL = os.environ["SUPABASE_URL"]
13
  SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
14
 
@@ -16,19 +9,17 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
16
 
17
 
18
  def load_file_bytes(bucket: str, filename: str) -> bytes:
19
- """
20
- Tải file từ Supabase Storage (PDF, HTML, …) và trả về bytes.
21
 
22
- Parameters
23
- ----------
24
- bucket : str
25
- Tên bucket trong Supabase Storage.
26
- filename : str
27
- Tên / đường dẫn file trong bucket.
28
 
29
- Returns
30
- -------
31
- bytes
32
- Nội dung file.
33
  """
34
- return supabase.storage.from_(bucket).download(filename)
 
 
 
 
 
 
 
 
 
2
  import os
3
  from supabase import create_client
4
 
 
 
 
 
 
 
 
5
  SUPABASE_URL = os.environ["SUPABASE_URL"]
6
  SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
7
 
 
9
 
10
 
def load_file_bytes(bucket: str, filename: str) -> bytes:
    """Download *filename* from the Supabase Storage *bucket* and return it."""
    storage_bucket = supabase.storage.from_(bucket)
    return storage_bucket.download(filename)
 
13
 
 
 
 
 
 
 
14
 
def match_documents(embedding: list, k: int = 4):
    """Call the ``match_documents`` RPC in Supabase directly.

    Args:
        embedding: Query embedding, sent as ``query_embedding``.
        k: Sent as ``match_count``.

    Returns:
        The rows from the response's ``data`` (content, metadata and possibly
        the embedding), or ``[]`` when the response carries no data.
    """
    params = {"query_embedding": embedding, "match_count": k}
    response = supabase.rpc("match_documents", params).execute()

    if response.data:
        return response.data
    return []