Nguyen5 committed on
Commit
029df55
·
1 Parent(s): c2d2189
Files changed (4) hide show
  1. app.py +87 -50
  2. ingest.py +50 -47
  3. rag_pipeline.py +158 -110
  4. supabase_client.py +10 -6
app.py CHANGED
@@ -2,12 +2,16 @@
2
  import os
3
  import re
4
  import base64
 
5
  import gradio as gr
6
  from openai import OpenAI
7
 
8
  from supabase_client import load_file_bytes
9
- from rag_pipeline import rag_answer
10
 
 
 
 
11
  client = OpenAI()
12
 
13
  BUCKET = os.environ["SUPABASE_BUCKET"]
@@ -18,32 +22,46 @@ HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=1000000000000000065
18
 
19
 
20
  # -------------------------------------------------------------------
21
- # PDF BASE64 để nhúng iframe
22
  # -------------------------------------------------------------------
23
- def encode_pdf_src():
24
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
25
  b64 = base64.b64encode(pdf_bytes).decode("utf-8")
26
  return f"data:application/pdf;base64,{b64}"
27
 
28
 
29
  # -------------------------------------------------------------------
30
- # CLEAN STT
31
  # -------------------------------------------------------------------
32
- FILLER = ["äh", "ähm", "uh", "hmm", "mmh", "ah", "oh", "also", "sozusagen", "halt"]
33
-
34
-
35
- def clean_transcript(t):
 
 
 
 
 
 
 
 
 
 
 
 
36
  if not t:
37
  return ""
38
  t = t.lower().strip()
39
  for f in FILLER:
40
- t = re.sub(rf"\b{f}\b", "", t)
41
  t = re.sub(r"[^a-zA-ZäöüÄÖÜß0-9,.? ]+", " ", t)
42
  t = re.sub(r"\s+", " ", t).strip()
43
- return t.capitalize()
 
 
44
 
45
 
46
- def transcribe(audio_path):
47
  if audio_path is None:
48
  return ""
49
  with open(audio_path, "rb") as f:
@@ -53,53 +71,69 @@ def transcribe(audio_path):
53
  language="de",
54
  temperature=0.0,
55
  )
56
- return clean_transcript(result.text or "")
 
 
57
 
58
 
59
  # -------------------------------------------------------------------
60
- # CHAT FUNCTION KHÔNG ƯU TIÊN TEXT/AUDIO
61
- # User CHỌN mode: "text" hoặc "audio"
62
  # -------------------------------------------------------------------
63
  def chat_fn(mode, text, audio, history):
64
  history = history or []
65
 
66
- # --- MODE: TEXT ---
67
  if mode == "text":
68
  if not (text or "").strip():
69
  return history, "Bitte Text eingeben.", None
70
  question = text.strip()
71
-
72
- # --- MODE: SPRACHE ---
73
- if mode == "audio":
74
  if audio is None:
75
  return history, "Bitte ins Mikrofon sprechen.", None
76
-
77
  question = transcribe(audio)
78
  if not question:
79
- return history, "Spracherkennung fehlgeschlagen. Bitte erneut versuchen.", None
 
 
 
 
80
 
81
- # --- RAG ---
82
  answer, docs = rag_answer(question, history)
83
 
84
- # --- Quellen ---
85
- quellen = ["", "### 📚 Quellen:"]
86
- for i, d in enumerate(docs):
87
- meta = d["metadata"]
88
- src = meta.get("source")
89
 
90
- if src.startswith("Prüfungsordnung"):
91
- page = meta.get("page")
92
- url = f"{PDF_URL}#page={page}"
93
- title = f"Quelle {i+1} – Prüfungsordnung, Seite {page}"
 
 
 
 
 
 
 
 
 
 
 
94
  else:
95
  url = HG_URL
96
- title = f"Quelle {i+1} Hochschulgesetz NRW"
97
 
98
- snip = d["content"][:160].replace("\n", " ")
99
- quellen.append(f"- [{title}]({url})")
100
- quellen.append(f" > {snip}")
101
 
102
- bot_msg = answer + "\n\n" + "\n".join(quellen)
 
 
 
 
 
 
 
103
 
104
  new_history = history + [
105
  {"role": "user", "content": question},
@@ -110,22 +144,26 @@ def chat_fn(mode, text, audio, history):
110
 
111
 
112
  # -------------------------------------------------------------------
113
- # UI GIỐNG HÌNH ĐÍNH KÈM
114
  # -------------------------------------------------------------------
115
  with gr.Blocks() as demo:
 
 
 
116
 
117
- gr.Markdown("""
118
- # ⚖️ Sprachbasierter Chatbot für Prüfungsrecht
119
- Wähle eine Eingabemethode: Text oder Sprache.
120
- """)
121
 
122
- with gr.Row():
 
123
 
124
- # ======================
125
- # LEFT SIDE: CHAT UI
126
- # ======================
127
- with gr.Column(scale=3):
128
 
 
 
 
129
  chatbot = gr.Chatbot(label="Chatverlauf")
130
 
131
  mode_select = gr.Radio(
@@ -136,16 +174,15 @@ with gr.Blocks() as demo:
136
  )
137
 
138
  text_input = gr.Textbox(label="Text eingeben")
139
- audio_input = gr.Audio(type="filepath", label="Spracheingabe (Mikrofon)")
 
 
140
 
141
  send_btn = gr.Button("Senden")
142
  answer_preview = gr.Markdown("")
143
 
144
- # ======================
145
- # RIGHT SIDE: VIEWER
146
- # ======================
147
  with gr.Column(scale=2):
148
-
149
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
150
  gr.HTML(
151
  f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
 
2
  import os
3
  import re
4
  import base64
5
+
6
  import gradio as gr
7
  from openai import OpenAI
8
 
9
  from supabase_client import load_file_bytes
10
+ from rag_pipeline import rag_answer # agent_answer alias
11
 
12
+ # -------------------------------------------------------------------
13
+ # OpenAI client cho Whisper (Speech-to-Text)
14
+ # -------------------------------------------------------------------
15
  client = OpenAI()
16
 
17
  BUCKET = os.environ["SUPABASE_BUCKET"]
 
22
 
23
 
24
  # -------------------------------------------------------------------
25
+ # PDF Viewer (Base64 iframe)
26
  # -------------------------------------------------------------------
27
def encode_pdf_src() -> str:
    """Return the examination-regulations PDF as a base64 ``data:`` URI.

    Downloads ``pruefungsordnung.pdf`` from the Supabase bucket named by
    ``BUCKET`` and encodes it so the result can be used directly as the
    ``src`` of an iframe.
    """
    raw_pdf = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    payload = base64.b64encode(raw_pdf).decode("utf-8")
    return "data:application/pdf;base64," + payload
31
 
32
 
33
  # -------------------------------------------------------------------
34
+ # Speech-to-Text (Whisper) + cleaning
35
  # -------------------------------------------------------------------
36
# Filler words that are removed from speech transcripts.
FILLER = [
    "äh",
    "ähm",
    "uh",
    "hmm",
    "mmh",
    "ah",
    "oh",
    "also",
    "sozusagen",
    "halt",
    "irgendwie",
]

# All fillers as one whole-word alternation, compiled once at import time.
# The optional trailing comma is consumed together with the filler so that
# "äh, wie ..." does not leave an orphan ", wie ..." behind.
_FILLER_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(word) for word in sorted(FILLER, key=len, reverse=True))
    + r")\b(?:\s*,)?"
)
# Anything that is not a letter (incl. German umlauts/ß), digit, basic
# punctuation or a space is replaced by a space.
_DISALLOWED_RE = re.compile(r"[^a-zA-ZäöüÄÖÜß0-9,.? ]+")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_transcript(t: str) -> str:
    """Normalise a raw speech-to-text transcript.

    The text is lower-cased, filler words from ``FILLER`` are dropped,
    unsupported characters are replaced by spaces, runs of whitespace are
    collapsed, and the first character is upper-cased.

    Parameters
    ----------
    t : str
        Raw transcript; ``None`` or an empty string is accepted.

    Returns
    -------
    str
        The cleaned transcript, or ``""`` when nothing usable remains.
    """
    if not t:
        return ""
    t = t.lower().strip()
    t = _FILLER_RE.sub("", t)
    t = _DISALLOWED_RE.sub(" ", t)
    t = _WHITESPACE_RE.sub(" ", t).strip()
    # Slicing keeps this safe for empty and one-character results alike.
    return t[:1].upper() + t[1:]
62
 
63
 
64
+ def transcribe(audio_path: str) -> str:
65
  if audio_path is None:
66
  return ""
67
  with open(audio_path, "rb") as f:
 
71
  language="de",
72
  temperature=0.0,
73
  )
74
+ raw = (result.text or "").strip()
75
+ cleaned = clean_transcript(raw)
76
+ return cleaned if len(cleaned) >= 3 else ""
77
 
78
 
79
  # -------------------------------------------------------------------
80
+ # Hàm CHAT chính gọi Agent (rag_answer)
 
81
  # -------------------------------------------------------------------
82
  def chat_fn(mode, text, audio, history):
83
  history = history or []
84
 
85
+ # 1) Chọn câu hỏi theo mode
86
  if mode == "text":
87
  if not (text or "").strip():
88
  return history, "Bitte Text eingeben.", None
89
  question = text.strip()
90
+ else: # mode == "audio"
 
 
91
  if audio is None:
92
  return history, "Bitte ins Mikrofon sprechen.", None
 
93
  question = transcribe(audio)
94
  if not question:
95
+ return (
96
+ history,
97
+ "Spracherkennung fehlgeschlagen. Bitte erneut versuchen.",
98
+ None,
99
+ )
100
 
101
+ # 2) Gọi Agent (RAG + Tools)
102
  answer, docs = rag_answer(question, history)
103
 
104
+ # 3) Xây block Quellen (UI-friendly)
105
+ quellen_md_lines = ["", "### 📚 Verwendete Quellen"]
 
 
 
106
 
107
+ for i, d in enumerate(docs):
108
+ meta = d.get("metadata", {}) or {}
109
+ src = meta.get("source", "?")
110
+ page = meta.get("page", None)
111
+
112
+ # Prüfungsordnung – nhảy đúng Seite
113
+ if isinstance(src, str) and src.startswith("Prüfungsordnung"):
114
+ page_num = page if isinstance(page, int) else None
115
+ if page_num:
116
+ url = f"{PDF_URL}#page={page_num}"
117
+ title = f"Quelle {i+1}: Prüfungsordnung (Seite {page_num})"
118
+ else:
119
+ url = PDF_URL
120
+ title = f"Quelle {i+1}: Prüfungsordnung"
121
+ # Hochschulgesetz – link trang chính thức
122
  else:
123
  url = HG_URL
124
+ title = f"Quelle {i+1}: Hochschulgesetz NRW"
125
 
126
+ snippet = (d.get("content") or "").strip().replace("\n", " ")
127
+ snippet = snippet[:200] + ("…" if len(snippet) > 200 else "")
 
128
 
129
+ quellen_md_lines.append(
130
+ f"- [{title}]({url})\n"
131
+ f" - **Ausschnitt:** „{snippet}“"
132
+ )
133
+
134
+ quellen_md = "\n".join(quellen_md_lines)
135
+
136
+ bot_msg = answer + "\n\n" + quellen_md
137
 
138
  new_history = history + [
139
  {"role": "user", "content": question},
 
144
 
145
 
146
  # -------------------------------------------------------------------
147
+ # Giao diện Gradio UI thân thiện
148
  # -------------------------------------------------------------------
149
  with gr.Blocks() as demo:
150
+ gr.Markdown(
151
+ """
152
+ # ⚖️ Prüfungsrechts-Assistent (NRW)
153
 
154
+ Willkommen!
155
+ Ich beantworte Ihre Fragen auf Basis der **offiziellen Dokumente**:
 
 
156
 
157
+ - 📘 *Prüfungsordnung Ihrer Hochschule*
158
+ - 📗 *Hochschulgesetz NRW (recht.nrw.de)*
159
 
160
+ Wählen Sie unten: **Text** oder **Sprache**.
161
+ """
162
+ )
 
163
 
164
+ with gr.Row():
165
+ # LEFT: Chat
166
+ with gr.Column(scale=3):
167
  chatbot = gr.Chatbot(label="Chatverlauf")
168
 
169
  mode_select = gr.Radio(
 
174
  )
175
 
176
  text_input = gr.Textbox(label="Text eingeben")
177
+ audio_input = gr.Audio(
178
+ type="filepath", label="Spracheingabe (Mikrofon)"
179
+ )
180
 
181
  send_btn = gr.Button("Senden")
182
  answer_preview = gr.Markdown("")
183
 
184
+ # RIGHT: Viewer
 
 
185
  with gr.Column(scale=2):
 
186
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
187
  gr.HTML(
188
  f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
ingest.py CHANGED
@@ -6,9 +6,11 @@ from bs4 import BeautifulSoup
6
  from pypdf import PdfReader
7
 
8
  from supabase_client import supabase, load_file_bytes
 
9
  from langchain_openai import OpenAIEmbeddings
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
  from langchain_core.documents import Document
 
12
 
13
  # -------------------------------------------------------------------
14
  # ENV + URLs
@@ -16,12 +18,12 @@ from langchain_core.documents import Document
16
  BUCKET = os.environ["SUPABASE_BUCKET"]
17
  SUPABASE_URL = os.environ["SUPABASE_URL"]
18
 
19
- # Public URLs trong Supabase Storage (chỉ dùng để tham chiếu / Quelle)
20
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
21
  HG_STORAGE_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
22
 
23
- # (In App dùng link chính thức của HG NRW, còn đây chỉ để meta nếu cần)
24
- OFFICIAL_HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
 
25
 
26
 
27
  # -------------------------------------------------------------------
@@ -29,13 +31,8 @@ OFFICIAL_HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=1000000000
29
  # -------------------------------------------------------------------
30
  def load_pdf_docs():
31
  """
32
- PDF Prüfungsordnung:
33
- - Đọc từ Supabase Storage
34
- - Trích text từng trang
35
- - Mỗi trang là 1 Document với metadata:
36
- - source: "Prüfungsordnung (PDF)"
37
- - page: SỐ TRANG 1-based (Seite 1, 2, 3, ...)
38
- - pdf_url: URL public của PDF trong Supabase (không #page)
39
  """
40
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
41
  reader = PdfReader(BytesIO(pdf_bytes))
@@ -43,8 +40,6 @@ def load_pdf_docs():
43
  docs = []
44
  for i, page in enumerate(reader.pages):
45
  text = page.extract_text() or ""
46
-
47
- # Lưu page 1-based để sau dùng trực tiếp trong UI
48
  page_num = i + 1
49
 
50
  docs.append(
@@ -52,8 +47,8 @@ def load_pdf_docs():
52
  page_content=text,
53
  metadata={
54
  "source": "Prüfungsordnung (PDF)",
55
- "page": page_num, # 1-based
56
- "pdf_url": PDF_URL, # Basis-URL
57
  },
58
  )
59
  )
@@ -61,15 +56,12 @@ def load_pdf_docs():
61
 
62
 
63
  # -------------------------------------------------------------------
64
- # Loader HTML Hochschulgesetz (từ Storage)
65
  # -------------------------------------------------------------------
66
  def load_html_docs():
67
  """
68
- Hochschulgesetz NRW (giữ 1 Document lớn, chunk sau).
69
- Lưu ý:
70
- - Ta load bản HTML từ Supabase Storage (trước đó đã crawl/lưu).
71
- - get_text(separator="\\n") để giữ cấu trúc tương đối.
72
- - Việc chunk sẽ do TextSplitter xử lý.
73
  """
74
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
75
  html = html_bytes.decode("utf-8", errors="ignore")
@@ -82,7 +74,6 @@ def load_html_docs():
82
  page_content=text,
83
  metadata={
84
  "source": "Hochschulgesetz NRW",
85
- # anchor_id sẽ được gán sau khi chunk
86
  "official_url": OFFICIAL_HG_URL,
87
  },
88
  )
@@ -90,14 +81,9 @@ def load_html_docs():
90
 
91
 
92
  # -------------------------------------------------------------------
93
- # Text-Splitter chung
94
  # -------------------------------------------------------------------
95
  def chunk_docs(docs):
96
- """
97
- Dùng RecursiveCharacterTextSplitter để chia nhỏ nội dung.
98
- - chunk_size: 900
99
- - chunk_overlap: 100
100
- """
101
  splitter = RecursiveCharacterTextSplitter(
102
  chunk_size=900,
103
  chunk_overlap=100,
@@ -106,20 +92,41 @@ def chunk_docs(docs):
106
 
107
 
108
  # -------------------------------------------------------------------
109
- # Ingest vào Supabase (bảng documents)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  # -------------------------------------------------------------------
111
  def ingest():
112
- # 1) Load nguồn
 
 
 
 
 
113
  pdf_docs = load_pdf_docs()
114
  hg_docs = load_html_docs()
115
 
116
- # 2) Chunk
117
  chunks = chunk_docs(pdf_docs + hg_docs)
118
 
119
- # 3) Thêm anchor_id cho từng chunk để nhận diện
120
  po_idx = 1
121
  hg_idx = 1
122
-
123
  for d in chunks:
124
  src = d.metadata.get("source")
125
 
@@ -129,26 +136,22 @@ def ingest():
129
  else:
130
  d.metadata["anchor_id"] = f"hg_{hg_idx}"
131
  hg_idx += 1
132
-
133
- # Thêm URL cho HG nếu muốn dùng sau
134
- if src == "Hochschulgesetz NRW":
135
  d.metadata["url"] = OFFICIAL_HG_URL
136
 
137
- # 4) Embeddings
138
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
139
 
140
- for d in chunks:
141
- emb = embeddings.embed_query(d.page_content)
142
-
143
- supabase.table("documents").insert(
144
- {
145
- "content": d.page_content,
146
- "metadata": d.metadata,
147
- "embedding": emb,
148
- }
149
- ).execute()
150
 
151
- print("OK ingest xong Prüfungsordnung (PDF) + Hochschulgesetz (HTML)")
152
 
153
 
154
  if __name__ == "__main__":
 
6
  from pypdf import PdfReader
7
 
8
  from supabase_client import supabase, load_file_bytes
9
+
10
  from langchain_openai import OpenAIEmbeddings
11
  from langchain_text_splitters import RecursiveCharacterTextSplitter
12
  from langchain_core.documents import Document
13
+ from langchain_community.vectorstores import SupabaseVectorStore
14
 
15
  # -------------------------------------------------------------------
16
  # ENV + URLs
 
18
  BUCKET = os.environ["SUPABASE_BUCKET"]
19
  SUPABASE_URL = os.environ["SUPABASE_URL"]
20
 
 
21
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
22
  HG_STORAGE_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
23
 
24
+ OFFICIAL_HG_URL = (
25
+ "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
26
+ )
27
 
28
 
29
  # -------------------------------------------------------------------
 
31
  # -------------------------------------------------------------------
32
  def load_pdf_docs():
33
  """
34
+ Đọc Prüfungsordnung.pdf từ Supabase Storage và tạo 1 Document cho mỗi
35
+ trang (page 1-based).
 
 
 
 
 
36
  """
37
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
38
  reader = PdfReader(BytesIO(pdf_bytes))
 
40
  docs = []
41
  for i, page in enumerate(reader.pages):
42
  text = page.extract_text() or ""
 
 
43
  page_num = i + 1
44
 
45
  docs.append(
 
47
  page_content=text,
48
  metadata={
49
  "source": "Prüfungsordnung (PDF)",
50
+ "page": page_num,
51
+ "pdf_url": PDF_URL,
52
  },
53
  )
54
  )
 
56
 
57
 
58
  # -------------------------------------------------------------------
59
+ # Loader HTML Hochschulgesetz
60
  # -------------------------------------------------------------------
61
  def load_html_docs():
62
  """
63
+ Đọc hochschulgesetz.html từ Supabase Storage, parse bằng BeautifulSoup,
64
+ lấy toàn bộ text thành 1 Document lớn (chunk sau).
 
 
 
65
  """
66
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
67
  html = html_bytes.decode("utf-8", errors="ignore")
 
74
  page_content=text,
75
  metadata={
76
  "source": "Hochschulgesetz NRW",
 
77
  "official_url": OFFICIAL_HG_URL,
78
  },
79
  )
 
81
 
82
 
83
  # -------------------------------------------------------------------
84
+ # Chunking – RecursiveCharacterTextSplitter
85
  # -------------------------------------------------------------------
86
  def chunk_docs(docs):
 
 
 
 
 
87
  splitter = RecursiveCharacterTextSplitter(
88
  chunk_size=900,
89
  chunk_overlap=100,
 
92
 
93
 
94
  # -------------------------------------------------------------------
95
+ # Xoá dữ liệu cũ trong bảng documents
96
+ # -------------------------------------------------------------------
97
def delete_old_data():
    """Delete every row of the ``documents`` table.

    The ``id`` column is a UUID, so the delete is filtered with "greater than
    or equal to the smallest UUID"; this avoids the
    'invalid input syntax for type uuid' error.
    """
    lowest_uuid = "00000000-0000-0000-0000-000000000000"

    print("🔄 Lösche alte Daten aus Tabelle 'documents' ...")
    delete_query = supabase.table("documents").delete()
    delete_query.gte("id", lowest_uuid).execute()
    print("✔ Alte Daten in 'documents' gelöscht.")
109
+
110
+
111
+ # -------------------------------------------------------------------
112
+ # Ingest chính
113
  # -------------------------------------------------------------------
114
  def ingest():
115
+ print("🚀 Starte Ingest (PDF + Hochschulgesetz) ...")
116
+
117
+ # 1) Xoá data cũ
118
+ delete_old_data()
119
+
120
+ # 2) Load nguồn
121
  pdf_docs = load_pdf_docs()
122
  hg_docs = load_html_docs()
123
 
124
+ # 3) Chunk
125
  chunks = chunk_docs(pdf_docs + hg_docs)
126
 
127
+ # 4) Gắn anchor_id & URL meta
128
  po_idx = 1
129
  hg_idx = 1
 
130
  for d in chunks:
131
  src = d.metadata.get("source")
132
 
 
136
  else:
137
  d.metadata["anchor_id"] = f"hg_{hg_idx}"
138
  hg_idx += 1
 
 
 
139
  d.metadata["url"] = OFFICIAL_HG_URL
140
 
141
+ # 5) Embeddings + SupabaseVectorStore
142
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
143
 
144
+ print("🔍 Erstelle Embeddings und speichere in SupabaseVectorStore ...")
145
+ SupabaseVectorStore.from_documents(
146
+ chunks,
147
+ embeddings,
148
+ client=supabase,
149
+ table_name="documents",
150
+ query_name="match_documents",
151
+ chunk_size=500, # batch size khi insert
152
+ )
 
153
 
154
+ print("🎉 Ingest fertig'documents' ist frisch aufgebaut.")
155
 
156
 
157
  if __name__ == "__main__":
rag_pipeline.py CHANGED
@@ -1,70 +1,112 @@
1
  # rag_pipeline.py
2
- from typing import List, Dict, Any
3
  from datetime import date
4
 
5
- from openai import OpenAI
6
  from supabase_client import supabase
7
- from langchain_openai import OpenAIEmbeddings
 
 
 
 
 
 
8
 
9
  # -------------------------------------------------------------------
10
- # OpenAI + Embeddings
11
  # -------------------------------------------------------------------
12
- client = OpenAI()
13
- embedder = OpenAIEmbeddings(model="text-embedding-3-small")
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # -------------------------------------------------------------------
16
- # System Prompt (Rất quan trọng cho độ chính xác)
17
  # -------------------------------------------------------------------
18
  SYSTEM_PROMPT = """
19
- Du bist ein hochpräziser, fachlich korrekter Chatbot für Prüfungsrecht in NRW.
20
- Du beantwortest ausschließlich auf Grundlage der offiziellen Rechtsquellen:
21
-
22
- - Prüfungsordnung (PDF)
23
- - Hochschulgesetz NRW (recht.nrw.de)
24
-
25
- REGELN:
26
- 1. Verwende NUR Informationen aus den bereitgestellten Dokumenten (RAG-Kontext).
27
- 2. Spekuliere nie. Wenn etwas nicht im Dokument steht, sage explizit, dass es dort nicht geregelt ist.
28
- 3. Antworte in klaren, gut strukturierten Sätzen auf Deutsch.
29
- 4. Füge am Ende deiner Antwort keine eigenen Quellen hinzu – die Quellen werden separat im UI angezeigt.
30
- 5. Zitiere sinngemäß, nicht wortwörtlich.
31
- 6. Wenn die Frage unklar ist, bitte freundlich um Präzisierung.
32
- 7. Wenn mehrere Dokumentstellen relevant sind, vergleiche sie kurz.
33
-
34
- Wenn du dir unsicher bist, sag offen, dass du es auf Basis der vorliegenden Dokumente nicht sicher beantworten kannst.
35
- """
 
 
 
 
 
 
 
 
 
 
 
 
36
 
 
 
 
 
37
 
38
  # -------------------------------------------------------------------
39
- # Helper: DB RPC – match_documents
40
  # -------------------------------------------------------------------
41
- def get_relevant_docs(query: str, k: int = 4) -> List[Dict[str, Any]]:
42
- """
43
- Ruft die RPC-Funktion `match_documents` in Supabase auf, um die relevantesten
44
- Dokument-Chunks für eine Query zu finden.
45
- """
46
- emb = embedder.embed_query(query)
47
-
48
- resp = (
49
- supabase.rpc(
50
- "match_documents",
51
- {"query_embedding": emb, "filter": {}},
52
- )
53
- .execute()
54
- )
 
 
 
 
 
 
 
 
 
 
 
55
 
56
- data = resp.data or []
57
- return data[:k]
 
58
 
59
 
60
  # -------------------------------------------------------------------
61
- # Helper: Chat-History in DB speichern
62
  # -------------------------------------------------------------------
63
  def save_message(role: str, content: str) -> None:
64
- """
65
- Speichert eine Chatnachricht (role, content) zusammen mit dem heutigen Datum
66
- in der Tabelle `chat_history`.
67
- """
68
  supabase.table("chat_history").insert(
69
  {
70
  "session_date": date.today().isoformat(),
@@ -75,80 +117,86 @@ def save_message(role: str, content: str) -> None:
75
 
76
 
77
  # -------------------------------------------------------------------
78
- # Hauptfunktion: RAG-Antwort generieren
79
  # -------------------------------------------------------------------
80
- def rag_answer(query: str, history: Any):
81
  """
82
- Generiert eine Antwort mit RAG:
83
-
84
- 1. Hole relevante Dokumente aus Supabase (Vektorsuche).
85
- 2. Baue einen kompakten Kontext-String mit Metadaten + Ausschnitten.
86
- 3. Erzeuge eine Chat-Completion mit SYSTEM_PROMPT + Nutzerfrage + Kontext.
87
- 4. Speichere User- und Assistant-Nachricht in chat_history.
88
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # 1) Relevante Dokumente
91
- docs = get_relevant_docs(query)
92
 
93
- # 2) Kontext aus Dokumenten bauen (gekürzt, um "Context Noise" zu vermeiden)
94
- context = ""
95
- for i, d in enumerate(docs):
96
- meta = d.get("metadata", {}) or {}
97
- src = meta.get("source", "Unbekannte Quelle")
98
- page = meta.get("page")
99
-
100
- # Seitenangabe (falls vorhanden)
101
- if isinstance(page, int):
102
- page_info = f"(Seite {page})"
103
- else:
104
- page_info = ""
105
-
106
- # Text-Ausschnitt
107
- snippet = (d.get("content") or "").replace("\n", " ").strip()
108
- short = snippet[:450] # Kontext absichtlich begrenzen
109
-
110
- context += f"[Quelle {i+1}] {src} {page_info}\n{short}\n\n"
111
-
112
- # Optional: kurzen bisherigen Verlauf (für mehr Kontext), nur letzte 6 Einträge
113
- history_text = ""
114
- if isinstance(history, list):
115
- for h in history[-6:]:
116
- if isinstance(h, dict):
117
- r = h.get("role")
118
- c = h.get("content")
119
- if r in ("user", "assistant") and c:
120
- history_text += f"{r}: {c}\n"
121
-
122
- # 3) Messages für OpenAI
123
- user_prompt = f"""
124
- Bisheriger Chatverlauf (kurz):
125
-
126
- {history_text}
127
-
128
- Aktuelle Frage des Nutzers:
129
- {query}
130
-
131
- Relevante Dokumentauszüge:
132
- {context}
133
 
134
- Bitte beantworte die aktuelle Frage ausschließlich auf Basis der Dokumentauszüge.
135
- """
 
 
 
 
136
 
137
- messages = [
138
- {"role": "system", "content": SYSTEM_PROMPT},
139
- {"role": "user", "content": user_prompt},
140
- ]
141
 
142
- res = client.chat.completions.create(
143
- model="gpt-4o-mini",
144
- messages=messages,
145
- temperature=0.0,
146
- )
 
 
147
 
148
- answer = res.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
149
 
150
- # 4) Verlauf in DB speichern
151
  save_message("user", query)
152
  save_message("assistant", answer)
153
 
154
- return answer, docs
 
 
 
 
 
 
 
1
  # rag_pipeline.py
2
+ from typing import Any, List, Dict
3
  from datetime import date
4
 
 
5
  from supabase_client import supabase
6
+
7
+ from langchain_openai import ChatOpenAI, OpenAIEmbeddings
8
+ from langchain_community.vectorstores import SupabaseVectorStore
9
+ from langchain.tools.retriever import create_retriever_tool
10
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
11
+ from langchain_core.messages import HumanMessage, AIMessage
12
+ from langchain.agents import create_openai_tools_agent, AgentExecutor
13
 
14
  # -------------------------------------------------------------------
15
+ # LLM, Embeddings, VectorStore, Retriever
16
  # -------------------------------------------------------------------
17
# Embedding model used to embed queries for the vector search.
_embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Vector store backed by the Supabase table "documents"; similarity search
# goes through the "match_documents" query.
_vector_store = SupabaseVectorStore(
    embedding=_embeddings,
    client=supabase,
    table_name="documents",
    query_name="match_documents",
)

# Retriever returning the 4 best-matching chunks per query.
_retriever = _vector_store.as_retriever(search_kwargs={"k": 4})

# Chat model that drives the agent; temperature 0.0 is requested.
_llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
)
32
 
33
  # -------------------------------------------------------------------
34
+ # Prompt engineering legal guardrails
35
  # -------------------------------------------------------------------
36
  SYSTEM_PROMPT = """
37
+ Du bist ein hochpräziser juristischer Assistent für Prüfungsrecht in NRW.
38
+ Du arbeitest ausschließlich auf Grundlage der folgenden Dokumente:
39
+
40
+ 1. Prüfungsordnung (PDF)
41
+ 2. Hochschulgesetz NRW (offizielle Fassung auf recht.nrw.de)
42
+
43
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
44
+ REGELN FÜR DEINE ANTWORT
45
+ ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
46
+
47
+ 1) Nutze AUSSCHLIESSLICH die Dokumentauszüge, die du über das Tool
48
+ 'suche_pruefungsrecht_dokumente' erhältst.
49
+ - Wenn eine Information NICHT im Kontext steht, antworte:
50
+ „Dazu liegen im bereitgestellten Dokumentenkontext keine Informationen vor.“
51
+
52
+ 2) Spekuliere nicht, erfinde nichts, nutze keine externen Quellen.
53
+
54
+ 3) Antworte strukturiert:
55
+ (a) kurze Einordnung,
56
+ (b) Kernaussage / Rechtsgrundlage,
57
+ (c) wichtige Bedingungen oder Ausnahmen,
58
+ (d) praktische Konsequenz für Studierende.
59
+
60
+ 4) Du fügst selbst KEINE Quellenlinks hinzu.
61
+ - Die UI zeigt die Quellen separat an.
62
+ - Du kannst aber sinngemäß auf „die Prüfungsordnung“ oder „das Hochschulgesetz“
63
+ verweisen.
64
+
65
+ 5) Wenn mehrere Dokumentstellen relevant sind, vergleiche sie kurz.
66
 
67
+ 6) Wenn die Frage unklar ist, bitte freundlich um Präzisierung.
68
+
69
+ 7) Schreibe so, dass Studierende ohne Jurastudium dich verstehen.
70
+ """
71
 
72
  # -------------------------------------------------------------------
73
+ # Retriever Tool cho Agent
74
  # -------------------------------------------------------------------
75
# Expose the retriever to the agent as a tool. The tool name must stay in sync
# with the name quoted in SYSTEM_PROMPT.
retriever_tool = create_retriever_tool(
    _retriever,
    name="suche_pruefungsrecht_dokumente",
    description=(
        "Suche in der Prüfungsordnung (PDF) und im Hochschulgesetz NRW "
        "nach relevanten Gesetzesstellen zum Prüfungsrecht. "
        "Nutze dieses Tool IMMER, bevor du eine Antwort gibst."
    ),
)

tools = [retriever_tool]

# Agent prompt: system rules, prior conversation, the current question, and
# the scratchpad in which the agent's tool calls and tool results accumulate.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_PROMPT),
        MessagesPlaceholder("chat_history"),
        (
            "user",
            "Aktuelle Frage:\n{input}\n\n"
            "Nutze das Tool, um relevante Dokumentstellen zu finden, "
            "und beantworte die Frage ausschließlich anhand dieses Kontextes.",
        ),
        # Required by create_openai_tools_agent: without this placeholder the
        # agent cannot be constructed ("Prompt missing required variables").
        MessagesPlaceholder("agent_scratchpad"),
    ]
)

# Build the agent and the executor that runs its tool-calling loop.
agent = create_openai_tools_agent(_llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)
104
 
105
 
106
  # -------------------------------------------------------------------
107
+ # Lưu chat_history (logging) vào Supabase
108
  # -------------------------------------------------------------------
109
  def save_message(role: str, content: str) -> None:
 
 
 
 
110
  supabase.table("chat_history").insert(
111
  {
112
  "session_date": date.today().isoformat(),
 
117
 
118
 
119
  # -------------------------------------------------------------------
120
+ # Convert history của Gradio → chat_history cho Agent
121
  # -------------------------------------------------------------------
122
+ def _convert_history(history: Any):
123
  """
124
+ Gradio history: list[{"role": "user"/"assistant", "content": str}, ...]
125
+ → list[HumanMessage/AIMessage] cho MessagesPlaceholder.
 
 
 
 
126
  """
127
+ msgs: List[Any] = []
128
+ if not isinstance(history, list):
129
+ return msgs
130
+
131
+ for h in history[-8:]: # chỉ lấy ~8 lượt gần nhất
132
+ if not isinstance(h, dict):
133
+ continue
134
+ role = h.get("role")
135
+ content = h.get("content")
136
+ if not content:
137
+ continue
138
+ if role == "user":
139
+ msgs.append(HumanMessage(content=content))
140
+ elif role == "assistant":
141
+ msgs.append(AIMessage(content=content))
142
+ return msgs
143
 
 
 
144
 
145
+ # -------------------------------------------------------------------
146
+ # Hàm chính: Agent-Antwort + Dokumente für Quellen
147
+ # -------------------------------------------------------------------
148
def agent_answer(query: str, history: Any):
    """Answer a question with the tools agent and collect documents for the UI.

    The agent may call the ``suche_pruefungsrecht_dokumente`` retriever tool
    while producing its answer. Independently of that, the retriever is
    queried once more with the raw ``query`` to obtain the documents that
    are returned for the sources section of the UI.

    Parameters
    ----------
    query : str
        The user's current question.
    history : Any
        Gradio chat history; converted with ``_convert_history``.

    Returns
    -------
    answer : str
        The agent's answer, prefixed with a warning when the agent itself
        states that the context contains no information.
    docs_info : list[dict]
        One dict per retrieved document with the keys ``content``,
        ``metadata`` and ``score``.
    """
    chat_history_msgs = _convert_history(history)

    result = agent_executor.invoke(
        {
            "input": query,
            "chat_history": chat_history_msgs,
        }
    )
    answer: str = result["output"]

    # SYSTEM_PROMPT tells the model to use a fixed sentence ending in
    # "keine Informationen vor" when the context is insufficient; flag
    # such answers with a visible warning.
    if "keine informationen vor" in answer.lower():
        answer = (
            "⚠️ **Hinweis:** Die Frage kann anhand des bereitgestellten "
            "Dokumentenkontextes nur eingeschränkt beantwortet werden.\n\n"
            + answer
        )

    # Fetch the documents for the UI with the same retriever the agent uses.
    # `invoke` replaces the deprecated `get_relevant_documents`.
    retrieved_docs = _retriever.invoke(query)
    docs_info: List[Dict[str, Any]] = [
        {
            "content": doc.page_content,
            "metadata": doc.metadata or {},
            "score": 0.0,  # not used here; field kept for compatibility
        }
        for doc in retrieved_docs
    ]

    # Log both sides of the exchange.
    save_message("user", query)
    save_message("assistant", answer)

    return answer, docs_info
197
+
198
+
199
+ # Backward-compatible alias for the previous version of this module
200
def rag_answer(query: str, history: Any):
    """Alias of ``agent_answer`` so app.py can keep importing ``rag_answer``."""
    return agent_answer(query=query, history=history)
supabase_client.py CHANGED
@@ -2,9 +2,13 @@
2
  import os
3
  from supabase import create_client
4
 
5
- # -------------------------------------------------------------------
6
- # Supabase Client (Service-Role, dùng cho đọc/ghi DB + Storage)
7
- # -------------------------------------------------------------------
 
 
 
 
8
  SUPABASE_URL = os.environ["SUPABASE_URL"]
9
  SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
10
 
@@ -13,18 +17,18 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
13
 
14
  def load_file_bytes(bucket: str, filename: str) -> bytes:
15
  """
16
- Tải file từ Supabase Storage KHÔNG ghi ra local – trả về bytes.
17
 
18
  Parameters
19
  ----------
20
  bucket : str
21
  Tên bucket trong Supabase Storage.
22
  filename : str
23
- Đường dẫn/tên file bên trong bucket.
24
 
25
  Returns
26
  -------
27
  bytes
28
- Nội dung file ở dạng bytes.
29
  """
30
  return supabase.storage.from_(bucket).download(filename)
 
2
  import os
3
  from supabase import create_client
4
 
5
+ """
6
+ Supabase-Client (Service-Role) dùng chung cho:
7
+ - ingest.py (đọc Storage + ghi embeddings vào bảng documents)
8
+ - rag_pipeline.py (tạo SupabaseVectorStore cho Agent)
9
+ - app.py (PDF-Viewer)
10
+ """
11
+
12
  SUPABASE_URL = os.environ["SUPABASE_URL"]
13
  SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
14
 
 
17
 
18
def load_file_bytes(bucket: str, filename: str) -> bytes:
    """
    Download a file (PDF, HTML, ...) from Supabase Storage and return its bytes.

    Parameters
    ----------
    bucket : str
        Name of the bucket in Supabase Storage.
    filename : str
        Name / path of the file inside the bucket.

    Returns
    -------
    bytes
        The file content.
    """
    storage_bucket = supabase.storage.from_(bucket)
    return storage_bucket.download(filename)