Nguyen5 committed on
Commit
c2d2189
·
1 Parent(s): 192f4b1
Files changed (4) hide show
  1. app.py +94 -77
  2. ingest.py +65 -14
  3. rag_pipeline.py +116 -17
  4. supabase_client.py +19 -1
app.py CHANGED
@@ -1,5 +1,6 @@
1
  # app.py
2
  import os
 
3
  import base64
4
  import gradio as gr
5
  from openai import OpenAI
@@ -8,30 +9,40 @@ from supabase_client import load_file_bytes
8
  from rag_pipeline import rag_answer
9
 
10
  client = OpenAI()
 
11
  BUCKET = os.environ["SUPABASE_BUCKET"]
12
  SUPABASE_URL = os.environ["SUPABASE_URL"]
13
 
14
- # ------------------------------------------
15
- # URLs cho Prüfungsordnung (PDF) + HG NRW
16
- # ------------------------------------------
17
-
18
- # PDF nằm trong Supabase (như trước)
19
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
 
20
 
21
- # ⚠️ Đây là link chính thức của Hochschulgesetz NRW trên recht.nrw.de
22
- HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
23
 
24
- # ------------------------------------------
25
- # Viewer PDF base64
26
- # ------------------------------------------
27
  def encode_pdf_src():
28
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
29
  b64 = base64.b64encode(pdf_bytes).decode("utf-8")
30
  return f"data:application/pdf;base64,{b64}"
31
 
32
- # ------------------------------------------
33
- # Speech-to-text (Whisper)
34
- # ------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def transcribe(audio_path):
36
  if audio_path is None:
37
  return ""
@@ -39,95 +50,102 @@ def transcribe(audio_path):
39
  result = client.audio.transcriptions.create(
40
  model="whisper-1",
41
  file=f,
42
- language="de", # tiếng Đức
43
- temperature=0.0
44
  )
45
- return (result.text or "").strip()
46
-
47
- # ------------------------------------------
48
- # HÀM CHAT CHÍNH
49
- # ------------------------------------------
50
- def chat_fn(text, audio, history):
51
- text = (text or "").strip()
52
-
53
- # 1) Ưu tiên TEXT; chỉ dùng audio nếu không có text
54
- if text:
55
- question = text
56
- elif audio is not None:
57
- question = transcribe(audio)
58
- else:
59
- return history, "", None # không có input
 
 
 
 
 
60
 
61
- if not question:
62
- return history, "Spracherkennung fehlgeschlagen.", None
 
63
 
64
- # 2) Gọi RAG
65
- answer, docs = rag_answer(question, history or [])
66
 
67
- # 3) Xây block Quellen ở dạng Markdown, DÙNG META từ docs
68
- quellen_md_lines = ["", "### 📚 Quellen (verwendete Dokumentstellen):"]
69
  for i, d in enumerate(docs):
70
- meta = d.get("metadata", {}) or {}
71
- src = meta.get("source", "?")
72
- page = meta.get("page", None)
73
- anchor_id = meta.get("anchor_id")
74
 
75
- # Prüfungsordnung (PDF) – nhảy đúng Seite
76
  if src.startswith("Prüfungsordnung"):
77
- # trong ingest page lưu 1-based; nếu bạn dùng 0-based thì +1 ở đây
78
- page_num = page if isinstance(page, int) else None
79
- if page_num:
80
- url = f"{PDF_URL}#page={page_num}"
81
- title = f"Quelle {i+1} – {src}, Seite {page_num}"
82
- else:
83
- url = PDF_URL
84
- title = f"Quelle {i+1} – {src}"
85
- # Hochschulgesetz NRW – dùng URL chính thức + anchor_id (para)
86
  else:
87
- if anchor_id:
88
- url = f"{HG_URL}#{anchor_id}"
89
- else:
90
- url = HG_URL
91
  title = f"Quelle {i+1} – Hochschulgesetz NRW"
92
 
93
- snippet = (d.get("content") or "")[:200].replace("\n", " ")
94
-
95
- quellen_md_lines.append(f"- [{title}]({url})")
96
- quellen_md_lines.append(f" > {snippet}")
97
 
98
- quellen_md = "\n".join(quellen_md_lines)
99
 
100
- # 4) GỘP câu trả lời + Quellen vào NỘI DUNG CHATBOT
101
- bot_msg = answer + "\n\n" + quellen_md
102
-
103
- new_history = (history or []) + [
104
  {"role": "user", "content": question},
105
  {"role": "assistant", "content": bot_msg},
106
  ]
107
 
108
- # Trả về history (hiển thị trong Chatbot) + block Markdown (nếu muốn xem riêng) + reset audio
109
  return new_history, bot_msg, gr.update(value=None)
110
 
111
- # ------------------------------------------
112
- # GIAO DIỆN
113
- # ------------------------------------------
 
114
  with gr.Blocks() as demo:
115
- gr.Markdown("# ⚖️ Sprachbasierter Chatbot für Prüfungsrecht")
 
 
 
 
116
 
117
  with gr.Row():
118
- # Bên trái: Chat
 
 
 
119
  with gr.Column(scale=3):
120
- # Chatbot RENDER Markdown (type="messages")
121
- chatbot = gr.Chatbot()
122
- text_input = gr.Textbox(label="Frage eingeben")
 
 
 
 
 
 
 
 
123
  audio_input = gr.Audio(type="filepath", label="Spracheingabe (Mikrofon)")
124
- send_btn = gr.Button("Senden")
125
 
126
- # Preview Markdown của câu trả lời cuối (tuỳ chọn)
127
  answer_preview = gr.Markdown("")
128
 
129
- # Bên phải: Viewer
 
 
130
  with gr.Column(scale=2):
 
131
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
132
  gr.HTML(
133
  f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
@@ -138,10 +156,9 @@ with gr.Blocks() as demo:
138
  f"<iframe src='{HG_URL}' width='100%' height='250' style='border:none;'></iframe>"
139
  )
140
 
141
- # Nút gửi
142
  send_btn.click(
143
  chat_fn,
144
- inputs=[text_input, audio_input, chatbot],
145
  outputs=[chatbot, answer_preview, audio_input],
146
  )
147
 
 
1
  # app.py
2
  import os
3
+ import re
4
  import base64
5
  import gradio as gr
6
  from openai import OpenAI
 
9
  from rag_pipeline import rag_answer
10
 
11
  client = OpenAI()
12
+
13
  BUCKET = os.environ["SUPABASE_BUCKET"]
14
  SUPABASE_URL = os.environ["SUPABASE_URL"]
15
 
 
 
 
 
 
16
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
17
+ HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
18
 
 
 
19
 
20
# -------------------------------------------------------------------
# PDF as a base64 data URI, for embedding in an iframe
# -------------------------------------------------------------------
def encode_pdf_src():
    """Return the Prüfungsordnung PDF as a ``data:`` URI.

    The file ``pruefungsordnung.pdf`` is fetched from the configured bucket
    via ``load_file_bytes`` and base64-encoded so it can be used directly as
    an iframe ``src`` value.
    """
    raw = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    encoded = str(base64.b64encode(raw), "utf-8")
    return "data:application/pdf;base64," + encoded
27
 
28
+
29
+ # -------------------------------------------------------------------
30
+ # CLEAN STT
31
+ # -------------------------------------------------------------------
32
FILLER = ["äh", "ähm", "uh", "hmm", "mmh", "ah", "oh", "also", "sozusagen", "halt"]

# All filler words as ONE pre-compiled, case-insensitive alternation. A comma
# directly after the filler is removed with it, so "Äh, wie ..." does not
# leave a dangling ", wie ..." behind.
_FILLER_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in FILLER) + r")\b,?",
    re.IGNORECASE,
)

# Characters allowed in a cleaned question. "§" and "-" are kept on purpose:
# both carry meaning in legal questions ("§ 5", "Master-Arbeit").
_DISALLOWED_RE = re.compile(r"[^a-zA-ZäöüÄÖÜß0-9,.?§\- ]+")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_transcript(t):
    """Normalise a speech-to-text transcript into a clean question string.

    Steps:
      1. Remove filler words listed in ``FILLER`` (case-insensitive).
      2. Replace every character outside the allowed set with a space.
      3. Collapse runs of whitespace and trim the ends.
      4. Upper-case the first character only.

    The original casing of the remaining words is preserved; German nouns
    are capitalised, and lower-casing them would lose information.

    Args:
        t: Raw transcript text; may be ``None`` or empty.

    Returns:
        The cleaned transcript, or ``""`` when ``t`` is falsy or nothing
        remains after cleaning.
    """
    if not t:
        return ""

    cleaned = _FILLER_RE.sub("", t.strip())
    cleaned = _DISALLOWED_RE.sub(" ", cleaned)
    cleaned = _WHITESPACE_RE.sub(" ", cleaned).strip()

    # Only touch the first character; str.capitalize() would lower-case
    # everything after it.
    return cleaned[:1].upper() + cleaned[1:]
44
+
45
+
46
  def transcribe(audio_path):
47
  if audio_path is None:
48
  return ""
 
50
  result = client.audio.transcriptions.create(
51
  model="whisper-1",
52
  file=f,
53
+ language="de",
54
+ temperature=0.0,
55
  )
56
+ return clean_transcript(result.text or "")
57
+
58
+
59
+ # -------------------------------------------------------------------
60
+ # CHAT FUNCTION — no priority between text and audio
61
+ # The user CHOOSES the mode: "text" or "audio"
62
+ # -------------------------------------------------------------------
63
def _format_sources(docs):
    """Build the Markdown "Quellen" section for the retrieved chunks."""
    lines = ["", "### 📚 Quellen:"]

    for number, doc in enumerate(docs, start=1):
        # Retrieved rows may lack metadata/content; never crash on that.
        meta = doc.get("metadata") or {}
        src = meta.get("source") or ""

        if src.startswith("Prüfungsordnung"):
            page = meta.get("page")
            if isinstance(page, int) and page > 0:
                url = f"{PDF_URL}#page={page}"
                title = f"Quelle {number} – Prüfungsordnung, Seite {page}"
            else:
                # No usable page number: link to the PDF itself instead of
                # emitting "#page=None" / "Seite None".
                url = PDF_URL
                title = f"Quelle {number} – Prüfungsordnung"
        else:
            url = HG_URL
            title = f"Quelle {number} – Hochschulgesetz NRW"

        snip = (doc.get("content") or "")[:160].replace("\n", " ")
        lines.append(f"- [{title}]({url})")
        lines.append(f" > {snip}")

    return "\n".join(lines)


def chat_fn(mode, text, audio, history):
    """Handle one chat turn for the input mode selected by the user.

    Args:
        mode: ``"text"`` or ``"audio"``; decides which input is used.
        text: Typed question, used when ``mode == "text"``.
        audio: Audio input forwarded to ``transcribe`` when
            ``mode == "audio"``; ``None`` if nothing was recorded.
        history: Previous chat messages (list of ``role``/``content``
            dicts) or ``None``.

    Returns:
        A 3-tuple ``(history, message, audio_update)``.

        * Input problem (empty text, no audio, failed transcription,
          unknown mode): the unchanged history, a German hint message and
          ``None``.
        * Success: the history extended by the user question and the
          assistant reply, the reply text (answer plus sources section) and
          ``gr.update(value=None)``.
    """
    history = history or []

    # if/elif/else guarantees `question` is bound before it is used; an
    # unexpected mode previously fell through to an UnboundLocalError.
    if mode == "text":
        question = (text or "").strip()
        if not question:
            return history, "Bitte Text eingeben.", None
    elif mode == "audio":
        if audio is None:
            return history, "Bitte ins Mikrofon sprechen.", None
        question = transcribe(audio)
        if not question:
            return history, "Spracherkennung fehlgeschlagen. Bitte erneut versuchen.", None
    else:
        return history, "Unbekannter Eingabemodus.", None

    answer, docs = rag_answer(question, history)

    # The model reply may be None; treat it as empty text.
    bot_msg = (answer or "") + "\n\n" + _format_sources(docs or [])

    new_history = history + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": bot_msg},
    ]

    return new_history, bot_msg, gr.update(value=None)
110
 
111
+
112
+ # -------------------------------------------------------------------
113
+ # UI — MATCHES THE ATTACHED SCREENSHOT
114
+ # -------------------------------------------------------------------
115
  with gr.Blocks() as demo:
116
+
117
+ gr.Markdown("""
118
+ # ⚖️ Sprachbasierter Chatbot für Prüfungsrecht
119
+ Wähle eine Eingabemethode: Text oder Sprache.
120
+ """)
121
 
122
  with gr.Row():
123
+
124
+ # ======================
125
+ # LEFT SIDE: CHAT UI
126
+ # ======================
127
  with gr.Column(scale=3):
128
+
129
+ chatbot = gr.Chatbot(label="Chatverlauf")
130
+
131
+ mode_select = gr.Radio(
132
+ ["text", "audio"],
133
+ value="text",
134
+ label="Eingabemodus",
135
+ info="Wähle zwischen Text oder Sprache",
136
+ )
137
+
138
+ text_input = gr.Textbox(label="Text eingeben")
139
  audio_input = gr.Audio(type="filepath", label="Spracheingabe (Mikrofon)")
 
140
 
141
+ send_btn = gr.Button("Senden")
142
  answer_preview = gr.Markdown("")
143
 
144
+ # ======================
145
+ # RIGHT SIDE: VIEWER
146
+ # ======================
147
  with gr.Column(scale=2):
148
+
149
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
150
  gr.HTML(
151
  f"<iframe src='{encode_pdf_src()}' width='100%' height='250' style='border:none;'></iframe>"
 
156
  f"<iframe src='{HG_URL}' width='100%' height='250' style='border:none;'></iframe>"
157
  )
158
 
 
159
  send_btn.click(
160
  chat_fn,
161
+ inputs=[mode_select, text_input, audio_input, chatbot],
162
  outputs=[chatbot, answer_preview, audio_input],
163
  )
164
 
ingest.py CHANGED
@@ -1,6 +1,7 @@
1
  # ingest.py
2
  import os
3
  from io import BytesIO
 
4
  from bs4 import BeautifulSoup
5
  from pypdf import PdfReader
6
 
@@ -9,14 +10,33 @@ from langchain_openai import OpenAIEmbeddings
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
  from langchain_core.documents import Document
11
 
 
 
 
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
13
  SUPABASE_URL = os.environ["SUPABASE_URL"]
14
 
 
15
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
16
- HG_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
 
 
 
17
 
18
 
 
 
 
19
  def load_pdf_docs():
 
 
 
 
 
 
 
 
 
20
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
21
  reader = PdfReader(BytesIO(pdf_bytes))
22
 
@@ -24,39 +44,60 @@ def load_pdf_docs():
24
  for i, page in enumerate(reader.pages):
25
  text = page.extract_text() or ""
26
 
 
 
 
27
  docs.append(
28
  Document(
29
  page_content=text,
30
  metadata={
31
  "source": "Prüfungsordnung (PDF)",
32
- "page": i, # ZERO-based: Seite = i+1
33
- "pdf_url": PDF_URL, # Basis-URL
34
  },
35
  )
36
  )
37
  return docs
38
 
39
 
 
 
 
40
  def load_html_docs():
 
 
 
 
 
 
 
41
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
42
  html = html_bytes.decode("utf-8", errors="ignore")
43
 
44
  soup = BeautifulSoup(html, "html.parser")
45
  text = soup.get_text(separator="\n")
46
 
47
- # HTML nicht in Paragraphen getrennt → wir chunk’en später
48
  return [
49
  Document(
50
  page_content=text,
51
  metadata={
52
  "source": "Hochschulgesetz NRW",
53
- # anchor_id wird erst beim Chunken vergeben
 
54
  },
55
  )
56
  ]
57
 
58
 
 
 
 
59
  def chunk_docs(docs):
 
 
 
 
 
60
  splitter = RecursiveCharacterTextSplitter(
61
  chunk_size=900,
62
  chunk_overlap=100,
@@ -64,17 +105,23 @@ def chunk_docs(docs):
64
  return splitter.split_documents(docs)
65
 
66
 
 
 
 
67
  def ingest():
 
68
  pdf_docs = load_pdf_docs()
69
  hg_docs = load_html_docs()
70
 
 
71
  chunks = chunk_docs(pdf_docs + hg_docs)
72
 
 
73
  po_idx = 1
74
  hg_idx = 1
75
 
76
  for d in chunks:
77
- src = d.metadata["source"]
78
 
79
  if src == "Prüfungsordnung (PDF)":
80
  d.metadata["anchor_id"] = f"po_{po_idx}"
@@ -83,21 +130,25 @@ def ingest():
83
  d.metadata["anchor_id"] = f"hg_{hg_idx}"
84
  hg_idx += 1
85
 
86
- # HTML Quelle als vollständige URL
87
  if src == "Hochschulgesetz NRW":
88
- d.metadata["url"] = f"{HG_URL}#{d.metadata['anchor_id']}"
89
 
 
90
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
91
 
92
  for d in chunks:
93
  emb = embeddings.embed_query(d.page_content)
94
- supabase.table("documents").insert({
95
- "content": d.page_content,
96
- "metadata": d.metadata,
97
- "embedding": emb
98
- }).execute()
99
 
100
- print("OK ✔ ingest xong – PDF + HTML mit Quelle-URL")
 
 
 
 
 
 
 
 
101
 
102
 
103
  if __name__ == "__main__":
 
1
  # ingest.py
2
  import os
3
  from io import BytesIO
4
+
5
  from bs4 import BeautifulSoup
6
  from pypdf import PdfReader
7
 
 
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
  from langchain_core.documents import Document
12
 
13
+ # -------------------------------------------------------------------
14
+ # ENV + URLs
15
+ # -------------------------------------------------------------------
16
  BUCKET = os.environ["SUPABASE_BUCKET"]
17
  SUPABASE_URL = os.environ["SUPABASE_URL"]
18
 
19
+ # Public URLs trong Supabase Storage (chỉ dùng để tham chiếu / Quelle)
20
  PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
21
+ HG_STORAGE_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
22
+
23
+ # (In App dùng link chính thức của HG NRW, còn đây chỉ để meta nếu cần)
24
+ OFFICIAL_HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
25
 
26
 
27
+ # -------------------------------------------------------------------
28
+ # Loader PDF Prüfungsordnung
29
+ # -------------------------------------------------------------------
30
  def load_pdf_docs():
31
+ """
32
+ PDF Prüfungsordnung:
33
+ - Đọc từ Supabase Storage
34
+ - Trích text từng trang
35
+ - Mỗi trang là 1 Document với metadata:
36
+ - source: "Prüfungsordnung (PDF)"
37
+ - page: SỐ TRANG 1-based (Seite 1, 2, 3, ...)
38
+ - pdf_url: URL public của PDF trong Supabase (không #page)
39
+ """
40
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
41
  reader = PdfReader(BytesIO(pdf_bytes))
42
 
 
44
  for i, page in enumerate(reader.pages):
45
  text = page.extract_text() or ""
46
 
47
+ # Lưu page 1-based để sau dùng trực tiếp trong UI
48
+ page_num = i + 1
49
+
50
  docs.append(
51
  Document(
52
  page_content=text,
53
  metadata={
54
  "source": "Prüfungsordnung (PDF)",
55
+ "page": page_num, # 1-based
56
+ "pdf_url": PDF_URL, # Basis-URL
57
  },
58
  )
59
  )
60
  return docs
61
 
62
 
63
# -------------------------------------------------------------------
# HTML loader for the Hochschulgesetz (read from Storage)
# -------------------------------------------------------------------
def load_html_docs():
    """Load the Hochschulgesetz NRW as one large Document (chunked later).

    Notes:
      - The HTML file ``hochschulgesetz.html`` is read from Supabase Storage
        via ``load_file_bytes``.
      - Bytes are decoded as UTF-8; undecodable bytes are dropped
        (``errors="ignore"``).
      - The text is extracted with a newline separator so the rough
        structure of the page is kept.
      - Splitting into chunks is left to the text splitter.

    Returns:
        A one-element list holding the Document for the whole law text.
    """
    raw_html = load_file_bytes(BUCKET, "hochschulgesetz.html").decode(
        "utf-8", errors="ignore"
    )
    plain_text = BeautifulSoup(raw_html, "html.parser").get_text(separator="\n")

    # anchor_id is assigned later, after chunking.
    meta = {
        "source": "Hochschulgesetz NRW",
        "official_url": OFFICIAL_HG_URL,
    }
    return [Document(page_content=plain_text, metadata=meta)]
90
 
91
 
92
+ # -------------------------------------------------------------------
93
+ # Text-Splitter chung
94
+ # -------------------------------------------------------------------
95
  def chunk_docs(docs):
96
+ """
97
+ Dùng RecursiveCharacterTextSplitter để chia nhỏ nội dung.
98
+ - chunk_size: 900
99
+ - chunk_overlap: 100
100
+ """
101
  splitter = RecursiveCharacterTextSplitter(
102
  chunk_size=900,
103
  chunk_overlap=100,
 
105
  return splitter.split_documents(docs)
106
 
107
 
108
+ # -------------------------------------------------------------------
109
+ # Ingest vào Supabase (bảng documents)
110
+ # -------------------------------------------------------------------
111
  def ingest():
112
+ # 1) Load nguồn
113
  pdf_docs = load_pdf_docs()
114
  hg_docs = load_html_docs()
115
 
116
+ # 2) Chunk
117
  chunks = chunk_docs(pdf_docs + hg_docs)
118
 
119
+ # 3) Thêm anchor_id cho từng chunk để nhận diện
120
  po_idx = 1
121
  hg_idx = 1
122
 
123
  for d in chunks:
124
+ src = d.metadata.get("source")
125
 
126
  if src == "Prüfungsordnung (PDF)":
127
  d.metadata["anchor_id"] = f"po_{po_idx}"
 
130
  d.metadata["anchor_id"] = f"hg_{hg_idx}"
131
  hg_idx += 1
132
 
133
+ # Thêm URL cho HG nếu muốn dùng sau
134
  if src == "Hochschulgesetz NRW":
135
+ d.metadata["url"] = OFFICIAL_HG_URL
136
 
137
+ # 4) Embeddings
138
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
139
 
140
  for d in chunks:
141
  emb = embeddings.embed_query(d.page_content)
 
 
 
 
 
142
 
143
+ supabase.table("documents").insert(
144
+ {
145
+ "content": d.page_content,
146
+ "metadata": d.metadata,
147
+ "embedding": emb,
148
+ }
149
+ ).execute()
150
+
151
+ print("OK ✔ ingest xong – Prüfungsordnung (PDF) + Hochschulgesetz (HTML)")
152
 
153
 
154
  if __name__ == "__main__":
rag_pipeline.py CHANGED
@@ -1,44 +1,142 @@
1
  # rag_pipeline.py
2
  from typing import List, Dict, Any
3
  from datetime import date
 
4
  from openai import OpenAI
5
  from supabase_client import supabase
6
  from langchain_openai import OpenAIEmbeddings
7
 
 
 
 
8
  client = OpenAI()
9
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- def get_relevant_docs(query, k=4):
 
 
 
 
 
 
 
13
  emb = embedder.embed_query(query)
14
- resp = supabase.rpc("match_documents",
15
- {"query_embedding": emb, "filter": {}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  ).execute()
17
- return (resp.data or [])[:k]
18
 
19
 
20
- def save_message(role, content):
21
- supabase.table("chat_history").insert({
22
- "session_date": date.today().isoformat(),
23
- "role": role,
24
- "message": content
25
- }).execute()
26
 
 
 
 
 
 
27
 
28
- def rag_answer(query, history):
29
  docs = get_relevant_docs(query)
30
 
 
31
  context = ""
32
  for i, d in enumerate(docs):
33
- meta = d["metadata"]
34
- src = meta.get("source")
35
  page = meta.get("page")
36
- page_info = f"(Seite {page+1})" if isinstance(page, int) else ""
37
- context += f"[Quelle {i+1}] {src} {page_info}\n{d['content']}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  messages = [
40
- {"role": "system", "content": "Du bist Chatbot für Prüfungsrecht…"},
41
- {"role": "user", "content": f"Frage: {query}\n\nDokumente:\n{context}"}
42
  ]
43
 
44
  res = client.chat.completions.create(
@@ -49,6 +147,7 @@ def rag_answer(query, history):
49
 
50
  answer = res.choices[0].message.content
51
 
 
52
  save_message("user", query)
53
  save_message("assistant", answer)
54
 
 
1
  # rag_pipeline.py
2
  from typing import List, Dict, Any
3
  from datetime import date
4
+
5
  from openai import OpenAI
6
  from supabase_client import supabase
7
  from langchain_openai import OpenAIEmbeddings
8
 
9
+ # -------------------------------------------------------------------
10
+ # OpenAI + Embeddings
11
+ # -------------------------------------------------------------------
12
  client = OpenAI()
13
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
14
 
15
+ # -------------------------------------------------------------------
16
+ # System Prompt (Rất quan trọng cho độ chính xác)
17
+ # -------------------------------------------------------------------
18
+ SYSTEM_PROMPT = """
19
+ Du bist ein hochpräziser, fachlich korrekter Chatbot für Prüfungsrecht in NRW.
20
+ Du beantwortest ausschließlich auf Grundlage der offiziellen Rechtsquellen:
21
+
22
+ - Prüfungsordnung (PDF)
23
+ - Hochschulgesetz NRW (recht.nrw.de)
24
+
25
+ REGELN:
26
+ 1. Verwende NUR Informationen aus den bereitgestellten Dokumenten (RAG-Kontext).
27
+ 2. Spekuliere nie. Wenn etwas nicht im Dokument steht, sage explizit, dass es dort nicht geregelt ist.
28
+ 3. Antworte in klaren, gut strukturierten Sätzen auf Deutsch.
29
+ 4. Füge am Ende deiner Antwort keine eigenen Quellen hinzu – die Quellen werden separat im UI angezeigt.
30
+ 5. Zitiere sinngemäß, nicht wortwörtlich.
31
+ 6. Wenn die Frage unklar ist, bitte freundlich um Präzisierung.
32
+ 7. Wenn mehrere Dokumentstellen relevant sind, vergleiche sie kurz.
33
+
34
+ Wenn du dir unsicher bist, sag offen, dass du es auf Basis der vorliegenden Dokumente nicht sicher beantworten kannst.
35
+ """
36
+
37
 
38
+ # -------------------------------------------------------------------
39
+ # Helper: DB RPC – match_documents
40
+ # -------------------------------------------------------------------
41
def get_relevant_docs(query: str, k: int = 4) -> List[Dict[str, Any]]:
    """Return up to ``k`` document chunks matching ``query``.

    The query is embedded with ``embedder`` and passed to the Supabase RPC
    function ``match_documents`` with an empty filter. The result list is
    truncated to ``k`` entries on the client side.

    Args:
        query: The user question to search for.
        k: Maximum number of rows to return.

    Returns:
        The first ``k`` rows of the RPC result, or an empty list when the
        response carries no data.
    """
    query_vector = embedder.embed_query(query)
    rpc_params = {"query_embedding": query_vector, "filter": {}}

    response = supabase.rpc("match_documents", rpc_params).execute()

    rows = response.data
    if not rows:
        return []
    return rows[:k]
58
+
59
+
60
+ # -------------------------------------------------------------------
61
+ # Helper: Chat-History in DB speichern
62
+ # -------------------------------------------------------------------
63
def save_message(role: str, content: str) -> None:
    """Insert one chat message into the ``chat_history`` table.

    The row stores today's date (ISO format) as ``session_date``, the given
    ``role`` and the message text under ``message``.

    Args:
        role: Role of the speaker, stored as-is.
        content: Message text, stored as-is.
    """
    row = {
        "session_date": date.today().isoformat(),
        "role": role,
        "message": content,
    }
    chat_table = supabase.table("chat_history")
    chat_table.insert(row).execute()
 
75
 
76
 
77
+ # -------------------------------------------------------------------
78
+ # Hauptfunktion: RAG-Antwort generieren
79
+ # -------------------------------------------------------------------
80
+ def rag_answer(query: str, history: Any):
81
+ """
82
+ Generiert eine Antwort mit RAG:
83
 
84
+ 1. Hole relevante Dokumente aus Supabase (Vektorsuche).
85
+ 2. Baue einen kompakten Kontext-String mit Metadaten + Ausschnitten.
86
+ 3. Erzeuge eine Chat-Completion mit SYSTEM_PROMPT + Nutzerfrage + Kontext.
87
+ 4. Speichere User- und Assistant-Nachricht in chat_history.
88
+ """
89
 
90
+ # 1) Relevante Dokumente
91
  docs = get_relevant_docs(query)
92
 
93
+ # 2) Kontext aus Dokumenten bauen (gekürzt, um "Context Noise" zu vermeiden)
94
  context = ""
95
  for i, d in enumerate(docs):
96
+ meta = d.get("metadata", {}) or {}
97
+ src = meta.get("source", "Unbekannte Quelle")
98
  page = meta.get("page")
99
+
100
+ # Seitenangabe (falls vorhanden)
101
+ if isinstance(page, int):
102
+ page_info = f"(Seite {page})"
103
+ else:
104
+ page_info = ""
105
+
106
+ # Text-Ausschnitt
107
+ snippet = (d.get("content") or "").replace("\n", " ").strip()
108
+ short = snippet[:450] # Kontext absichtlich begrenzen
109
+
110
+ context += f"[Quelle {i+1}] {src} {page_info}\n{short}\n\n"
111
+
112
+ # Optional: kurzen bisherigen Verlauf (für mehr Kontext), nur letzte 6 Einträge
113
+ history_text = ""
114
+ if isinstance(history, list):
115
+ for h in history[-6:]:
116
+ if isinstance(h, dict):
117
+ r = h.get("role")
118
+ c = h.get("content")
119
+ if r in ("user", "assistant") and c:
120
+ history_text += f"{r}: {c}\n"
121
+
122
+ # 3) Messages für OpenAI
123
+ user_prompt = f"""
124
+ Bisheriger Chatverlauf (kurz):
125
+
126
+ {history_text}
127
+
128
+ Aktuelle Frage des Nutzers:
129
+ {query}
130
+
131
+ Relevante Dokumentauszüge:
132
+ {context}
133
+
134
+ Bitte beantworte die aktuelle Frage ausschließlich auf Basis der Dokumentauszüge.
135
+ """
136
 
137
  messages = [
138
+ {"role": "system", "content": SYSTEM_PROMPT},
139
+ {"role": "user", "content": user_prompt},
140
  ]
141
 
142
  res = client.chat.completions.create(
 
147
 
148
  answer = res.choices[0].message.content
149
 
150
+ # 4) Verlauf in DB speichern
151
  save_message("user", query)
152
  save_message("assistant", answer)
153
 
supabase_client.py CHANGED
@@ -2,11 +2,29 @@
2
  import os
3
  from supabase import create_client
4
 
 
 
 
5
  SUPABASE_URL = os.environ["SUPABASE_URL"]
6
  SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
7
 
8
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
9
 
 
10
  def load_file_bytes(bucket: str, filename: str) -> bytes:
11
- """Tải file từ Supabase Storage nhưng KHÔNG ghi ra local – trả về bytes."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  return supabase.storage.from_(bucket).download(filename)
 
2
  import os
3
  from supabase import create_client
4
 
5
# -------------------------------------------------------------------
# Supabase client (service role; used for DB and Storage read/write)
# -------------------------------------------------------------------
SUPABASE_URL = os.environ["SUPABASE_URL"]
SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]

supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)


def load_file_bytes(bucket: str, filename: str) -> bytes:
    """Download a file from Supabase Storage and return its content.

    Nothing is written to the local file system; the downloaded content is
    handed back to the caller directly.

    Args:
        bucket: Name of the bucket in Supabase Storage.
        filename: Path/name of the file inside the bucket.

    Returns:
        The file content as bytes.
    """
    storage_bucket = supabase.storage.from_(bucket)
    return storage_bucket.download(filename)