Nguyen5 committed on
Commit
f2f54b4
·
1 Parent(s): 85a2072
Files changed (3) hide show
  1. app.py +36 -64
  2. ingest.py +50 -19
  3. rag_pipeline.py +13 -9
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py
2
  import os
3
  import base64
4
  import gradio as gr
@@ -10,30 +10,21 @@ from rag_pipeline import rag_answer
10
  client = OpenAI()
11
  BUCKET = os.environ["SUPABASE_BUCKET"]
12
 
13
- # ------------------------------------------
14
- # Public URLs để mở PDF/HTML khi nhấn Quelle
15
- # ------------------------------------------
16
- PDF_URL = f"{os.environ['SUPABASE_URL']}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
17
- HG_URL = f"{os.environ['SUPABASE_URL']}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
18
 
19
- # ------------------------------------------
20
- # Viewer PDF base64
21
- # ------------------------------------------
22
  def encode_pdf_src():
23
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
24
- b64 = base64.b64encode(pdf_bytes).decode("utf-8")
25
- return f"data:application/pdf;base64,{b64}"
26
 
27
- # ------------------------------------------
28
- # HTML viewer
29
- # ------------------------------------------
30
  def encode_html():
31
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
32
  return html_bytes.decode("utf-8", errors="ignore")
33
 
34
- # ------------------------------------------
35
- # Speech-to-text FIXED
36
- # ------------------------------------------
37
  def transcribe(audio_path):
38
  if audio_path is None:
39
  return ""
@@ -41,21 +32,18 @@ def transcribe(audio_path):
41
  result = client.audio.transcriptions.create(
42
  model="whisper-1",
43
  file=f,
44
- language="de", # ép tiếng Đức
45
- temperature=0.0 # ổn định kết quả
46
  )
47
  return (result.text or "").strip()
48
 
49
- # ------------------------------------------
50
- # MAIN CHAT FUNCTION
51
- # ------------------------------------------
52
  def chat_fn(text, audio, history):
53
  text = (text or "").strip()
54
 
55
- # 1) Ưu tiên text, không dùng audio nếu text có
56
  if text:
57
  question = text
58
- elif audio is not None:
59
  question = transcribe(audio)
60
  else:
61
  return history, "<p>Bitte Text oder Mikrofon benutzen.</p>", None
@@ -63,88 +51,72 @@ def chat_fn(text, audio, history):
63
  if not question:
64
  return history, "<p>Spracherkennung fehlgeschlagen.</p>", None
65
 
66
- # 2) RAG
67
  answer, docs = rag_answer(question, history or [])
68
 
69
- # 3) Build Quellen (click được, phân biệt PDF vs HTML)
70
  html = "<ol>"
71
  for i, d in enumerate(docs):
72
- meta = d.get("metadata", {}) or {}
73
- src = meta.get("source", "?")
74
- page = meta.get("page", None)
75
- anchor_id = meta.get("anchor_id") # từ ingest.py
76
-
77
- # Chọn link tùy nguồn
78
- if "Prüfungsordnung" in src:
79
- # cố gắng nhảy đúng Seite
80
- if page:
81
- link = f"{PDF_URL}#page={page}"
82
- else:
83
- link = PDF_URL
84
- page_info = f"(Seite {page})" if page else ""
85
  else:
86
- # Hochschulgesetz NRW dùng anchor_id trong hochschulgesetz.html
87
- if anchor_id:
88
- link = f"{HG_URL}#{anchor_id}"
89
- else:
90
- link = HG_URL
91
- page_info = "" # HTML không có page
92
 
93
- snippet = (d.get("content") or "")[:200]
94
 
95
  html += f"""
96
  <li>
97
  <a href="{link}" target="_blank">
98
- <b>Quelle {i+1}: {src} {page_info}</b>
99
  </a><br>
100
  {snippet}...
101
  </li>
102
  """
103
  html += "</ol>"
104
 
105
- # 4) Gradio message history (kiểu messages)
106
  new_history = (history or []) + [
107
  {"role": "user", "content": question},
108
  {"role": "assistant", "content": answer},
109
  ]
110
 
111
- # Reset audio input (xóa sóng cũ)
112
  return new_history, html, gr.update(value=None)
113
 
114
- # ------------------------------------------
115
- # UI LAYOUT
116
- # ------------------------------------------
117
  with gr.Blocks() as demo:
118
- gr.Markdown("# ⚖️ Sprachbasierter Chatbot für Prüfungsrecht")
119
 
120
  with gr.Row():
 
121
  with gr.Column(scale=3):
122
- chatbot = gr.Chatbot(label="Chat (RAG)")
123
- text_input = gr.Textbox(label="Text Eingabe")
124
  audio_input = gr.Audio(
125
- type="filepath",
126
- label="Spracheingabe (Mikrofon)"
127
  )
128
  send_btn = gr.Button("Senden")
129
 
130
  with gr.Column(scale=2):
131
- gr.Markdown("### 📄 Prüfungsordnung PDF")
132
  gr.HTML(
133
- f"<iframe src='{encode_pdf_src()}' width='100%' height='250'></iframe>"
134
  )
135
 
136
- gr.Markdown("### 📜 Hochschulgesetz NRW")
137
  gr.HTML(
138
- f"<div style='overflow:auto;height:250px;'>{encode_html()}</div>"
139
  )
140
 
141
  sources_html = gr.HTML()
142
 
143
  send_btn.click(
144
  chat_fn,
145
- inputs=[text_input, audio_input, chatbot],
146
- outputs=[chatbot, sources_html, audio_input],
147
  )
148
 
149
  if __name__ == "__main__":
150
- demo.launch(ssr_mode=False)
 
1
+ # app.py – fixed Quelle links
2
  import os
3
  import base64
4
  import gradio as gr
 
10
  client = OpenAI()
11
  BUCKET = os.environ["SUPABASE_BUCKET"]
12
 
13
+ SUPABASE_URL = os.environ["SUPABASE_URL"]
14
+ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
15
+ HG_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
16
+
 
17
 
 
 
 
18
  def encode_pdf_src():
19
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
20
+ return f"data:application/pdf;base64,{base64.b64encode(pdf_bytes).decode('utf-8')}"
21
+
22
 
 
 
 
23
  def encode_html():
24
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
25
  return html_bytes.decode("utf-8", errors="ignore")
26
 
27
+
 
 
28
  def transcribe(audio_path):
29
  if audio_path is None:
30
  return ""
 
32
  result = client.audio.transcriptions.create(
33
  model="whisper-1",
34
  file=f,
35
+ language="de",
36
+ temperature=0.0
37
  )
38
  return (result.text or "").strip()
39
 
40
+
 
 
41
  def chat_fn(text, audio, history):
42
  text = (text or "").strip()
43
 
 
44
  if text:
45
  question = text
46
+ elif audio:
47
  question = transcribe(audio)
48
  else:
49
  return history, "<p>Bitte Text oder Mikrofon benutzen.</p>", None
 
51
  if not question:
52
  return history, "<p>Spracherkennung fehlgeschlagen.</p>", None
53
 
 
54
  answer, docs = rag_answer(question, history or [])
55
 
 
56
  html = "<ol>"
57
  for i, d in enumerate(docs):
58
+ meta = d["metadata"]
59
+ src = meta.get("source")
60
+ page = meta.get("page")
61
+ anchor = meta.get("anchor_id")
62
+
63
+ # PDF vs HTML
64
+ if src == "Prüfungsordnung (PDF)":
65
+ link = f"{PDF_URL}#page={page+1}" if isinstance(page, int) else PDF_URL
 
 
 
 
 
66
  else:
67
+ link = f"{HG_URL}#{anchor}" if anchor else HG_URL
 
 
 
 
 
68
 
69
+ snippet = d["content"][:200].replace("\n", " ")
70
 
71
  html += f"""
72
  <li>
73
  <a href="{link}" target="_blank">
74
+ <b>Quelle {i+1}: {src}</b>
75
  </a><br>
76
  {snippet}...
77
  </li>
78
  """
79
  html += "</ol>"
80
 
 
81
  new_history = (history or []) + [
82
  {"role": "user", "content": question},
83
  {"role": "assistant", "content": answer},
84
  ]
85
 
 
86
  return new_history, html, gr.update(value=None)
87
 
88
+
 
 
89
  with gr.Blocks() as demo:
90
+ gr.Markdown("# ⚖️ Prüfungsrechts-Chatbot (RAG mit Supabase)")
91
 
92
  with gr.Row():
93
+
94
  with gr.Column(scale=3):
95
+ chatbot = gr.Chatbot(type="messages", label="Chat")
96
+ text_input = gr.Textbox(label="Frage eingeben")
97
  audio_input = gr.Audio(
98
+ type="filepath", label="Spracheingabe (Mikrofon)"
 
99
  )
100
  send_btn = gr.Button("Senden")
101
 
102
  with gr.Column(scale=2):
103
+ gr.Markdown("### Prüfungsordnung (PDF)")
104
  gr.HTML(
105
+ f"<iframe src='{encode_pdf_src()}' width='100%' height='260px'></iframe>"
106
  )
107
 
108
+ gr.Markdown("### Hochschulgesetz NRW")
109
  gr.HTML(
110
+ f"<div style='overflow:auto;height:260px;'>{encode_html()}</div>"
111
  )
112
 
113
  sources_html = gr.HTML()
114
 
115
  send_btn.click(
116
  chat_fn,
117
+ [text_input, audio_input, chatbot],
118
+ [chatbot, sources_html, audio_input]
119
  )
120
 
121
  if __name__ == "__main__":
122
+ demo.queue().launch(ssr_mode=False, show_error=True)
ingest.py CHANGED
@@ -10,55 +10,85 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
10
  from langchain_core.documents import Document
11
 
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
 
 
 
 
 
13
 
14
  def load_pdf_docs():
15
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
16
  reader = PdfReader(BytesIO(pdf_bytes))
 
17
  docs = []
18
  for i, page in enumerate(reader.pages):
19
  text = page.extract_text() or ""
20
- docs.append(Document(
21
- page_content=text,
22
- metadata={"source": "Prüfungsordnung", "page": i + 1},
23
- ))
 
 
 
 
 
 
 
24
  return docs
25
 
 
26
  def load_html_docs():
27
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
28
  html = html_bytes.decode("utf-8", errors="ignore")
 
29
  soup = BeautifulSoup(html, "html.parser")
30
  text = soup.get_text(separator="\n")
31
- return [Document(
32
- page_content=text,
33
- metadata={"source": "Hochschulgesetz NRW"},
34
- )]
 
 
 
 
 
 
 
 
35
 
36
  def chunk_docs(docs):
37
  splitter = RecursiveCharacterTextSplitter(
38
- chunk_size=900, chunk_overlap=100)
 
 
39
  return splitter.split_documents(docs)
40
 
 
41
  def ingest():
42
  pdf_docs = load_pdf_docs()
43
  hg_docs = load_html_docs()
 
44
  chunks = chunk_docs(pdf_docs + hg_docs)
45
 
46
- # gán anchor_id
47
- po_index = 1
48
- hg_index = 1
49
 
50
  for d in chunks:
51
  src = d.metadata["source"]
52
- if src == "Prüfungsordnung":
53
- d.metadata["anchor_id"] = f"po_{po_index}"
54
- po_index += 1
 
55
  else:
56
- d.metadata["anchor_id"] = f"hg_{hg_index}"
57
- hg_index += 1
 
 
 
 
58
 
59
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
60
 
61
- # insert thủ công
62
  for d in chunks:
63
  emb = embeddings.embed_query(d.page_content)
64
  supabase.table("documents").insert({
@@ -67,7 +97,8 @@ def ingest():
67
  "embedding": emb
68
  }).execute()
69
 
70
- print("OK ✔ ingest xong – đã anchor_id cho tất cả documents")
 
71
 
72
  if __name__ == "__main__":
73
  ingest()
 
10
  from langchain_core.documents import Document
11
 
12
  BUCKET = os.environ["SUPABASE_BUCKET"]
13
+ SUPABASE_URL = os.environ["SUPABASE_URL"]
14
+
15
+ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
16
+ HG_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
17
+
18
 
19
  def load_pdf_docs():
20
  pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
21
  reader = PdfReader(BytesIO(pdf_bytes))
22
+
23
  docs = []
24
  for i, page in enumerate(reader.pages):
25
  text = page.extract_text() or ""
26
+
27
+ docs.append(
28
+ Document(
29
+ page_content=text,
30
+ metadata={
31
+ "source": "Prüfungsordnung (PDF)",
32
+ "page": i, # ZERO-based: Seite = i+1
33
+ "pdf_url": PDF_URL, # Basis-URL
34
+ },
35
+ )
36
+ )
37
  return docs
38
 
39
+
40
  def load_html_docs():
41
  html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
42
  html = html_bytes.decode("utf-8", errors="ignore")
43
+
44
  soup = BeautifulSoup(html, "html.parser")
45
  text = soup.get_text(separator="\n")
46
+
47
+ # HTML nicht in Paragraphen getrennt → wir chunk’en später
48
+ return [
49
+ Document(
50
+ page_content=text,
51
+ metadata={
52
+ "source": "Hochschulgesetz NRW",
53
+ # anchor_id wird erst beim Chunken vergeben
54
+ },
55
+ )
56
+ ]
57
+
58
 
59
  def chunk_docs(docs):
60
  splitter = RecursiveCharacterTextSplitter(
61
+ chunk_size=900,
62
+ chunk_overlap=100,
63
+ )
64
  return splitter.split_documents(docs)
65
 
66
+
67
  def ingest():
68
  pdf_docs = load_pdf_docs()
69
  hg_docs = load_html_docs()
70
+
71
  chunks = chunk_docs(pdf_docs + hg_docs)
72
 
73
+ po_idx = 1
74
+ hg_idx = 1
 
75
 
76
  for d in chunks:
77
  src = d.metadata["source"]
78
+
79
+ if src == "Prüfungsordnung (PDF)":
80
+ d.metadata["anchor_id"] = f"po_{po_idx}"
81
+ po_idx += 1
82
  else:
83
+ d.metadata["anchor_id"] = f"hg_{hg_idx}"
84
+ hg_idx += 1
85
+
86
+ # HTML Quelle als vollständige URL
87
+ if src == "Hochschulgesetz NRW":
88
+ d.metadata["url"] = f"{HG_URL}#{d.metadata['anchor_id']}"
89
 
90
  embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
91
 
 
92
  for d in chunks:
93
  emb = embeddings.embed_query(d.page_content)
94
  supabase.table("documents").insert({
 
97
  "embedding": emb
98
  }).execute()
99
 
100
+ print("OK ✔ ingest xong – PDF + HTML mit Quelle-URL")
101
+
102
 
103
  if __name__ == "__main__":
104
  ingest()
rag_pipeline.py CHANGED
@@ -1,5 +1,5 @@
1
  # rag_pipeline.py
2
- import os
3
  from datetime import date
4
  from openai import OpenAI
5
  from supabase_client import supabase
@@ -8,14 +8,15 @@ from langchain_openai import OpenAIEmbeddings
8
  client = OpenAI()
9
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
10
 
 
11
  def get_relevant_docs(query, k=4):
12
  emb = embedder.embed_query(query)
13
- resp = supabase.rpc("match_documents", {
14
- "query_embedding": emb,
15
- "filter": {}
16
- }).execute()
17
  return (resp.data or [])[:k]
18
 
 
19
  def save_message(role, content):
20
  supabase.table("chat_history").insert({
21
  "session_date": date.today().isoformat(),
@@ -23,14 +24,16 @@ def save_message(role, content):
23
  "message": content
24
  }).execute()
25
 
 
26
  def rag_answer(query, history):
27
  docs = get_relevant_docs(query)
 
28
  context = ""
29
  for i, d in enumerate(docs):
30
  meta = d["metadata"]
31
- src = meta["source"]
32
  page = meta.get("page")
33
- page_info = f"(Seite {page})" if page else ""
34
  context += f"[Quelle {i+1}] {src} {page_info}\n{d['content']}\n\n"
35
 
36
  messages = [
@@ -39,12 +42,13 @@ def rag_answer(query, history):
39
  ]
40
 
41
  res = client.chat.completions.create(
42
- model="gpt-4.1-mini",
43
  messages=messages,
44
- temperature=0
45
  )
46
 
47
  answer = res.choices[0].message.content
 
48
  save_message("user", query)
49
  save_message("assistant", answer)
50
 
 
1
  # rag_pipeline.py
2
+ from typing import List, Dict, Any
3
  from datetime import date
4
  from openai import OpenAI
5
  from supabase_client import supabase
 
8
  client = OpenAI()
9
  embedder = OpenAIEmbeddings(model="text-embedding-3-small")
10
 
11
+
12
  def get_relevant_docs(query, k=4):
13
  emb = embedder.embed_query(query)
14
+ resp = supabase.rpc("match_documents",
15
+ {"query_embedding": emb, "filter": {}}
16
+ ).execute()
 
17
  return (resp.data or [])[:k]
18
 
19
+
20
  def save_message(role, content):
21
  supabase.table("chat_history").insert({
22
  "session_date": date.today().isoformat(),
 
24
  "message": content
25
  }).execute()
26
 
27
+
28
  def rag_answer(query, history):
29
  docs = get_relevant_docs(query)
30
+
31
  context = ""
32
  for i, d in enumerate(docs):
33
  meta = d["metadata"]
34
+ src = meta.get("source")
35
  page = meta.get("page")
36
+ page_info = f"(Seite {page+1})" if isinstance(page, int) else ""
37
  context += f"[Quelle {i+1}] {src} {page_info}\n{d['content']}\n\n"
38
 
39
  messages = [
 
42
  ]
43
 
44
  res = client.chat.completions.create(
45
+ model="gpt-4o-mini",
46
  messages=messages,
47
+ temperature=0.0,
48
  )
49
 
50
  answer = res.choices[0].message.content
51
+
52
  save_message("user", query)
53
  save_message("assistant", answer)
54