Nguyen5 committed
Commit 1e7df8d · 1 Parent(s): d65949b
Files changed (15)
  1. app.py +65 -127
  2. chat_history.py +0 -70
  3. embeddings.py +0 -24
  4. hg_nrw_supabase.py +0 -99
  5. ingest.py +73 -0
  6. llm.py +0 -27
  7. load_documents.py +0 -104
  8. rag_pipeline.py +62 -96
  9. requirements.txt +3 -15
  10. retriever.py +0 -48
  11. speech_io.py +0 -52
  12. split_documents.py +0 -28
  13. supabase_client.py +12 -0
  14. vectorstore.py +0 -55
  15. viewer.py +0 -76
app.py CHANGED
@@ -1,131 +1,69 @@
- # app.py
- import os
- from typing import List, Tuple
-
- import gradio as gr
- from langchain_core.documents import Document
- from langchain_text_splitters import RecursiveCharacterTextSplitter
- from langchain_community.vectorstores import FAISS
- from langchain_openai import ChatOpenAI, OpenAIEmbeddings
-
- from load_documents import load_documents
- from speech_io import transcribe_audio, synthesize_speech
-
-
- # ========== 1. Load documents ==========
- print("🔹 Lade Dokumente aus Supabase …")
- docs: List[Document] = load_documents()
- print("✔ DOCUMENTS LOADED:", len(docs))
-
- print("🔹 Splitte Dokumente …")
- text_splitter = RecursiveCharacterTextSplitter(
-     chunk_size=800,
-     chunk_overlap=200,
- )
- chunks = text_splitter.split_documents(docs)
- print(f" - {len(chunks)} Chunks erzeugt.")
-
- print("🔹 Erzeuge VectorStore …")
- embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
- vectorstore = FAISS.from_documents(chunks, embeddings)
- retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
- print(">>> Retriever ready.")
-
- print("🔹 Lade OpenAI LLM …")
- llm = ChatOpenAI(
-     model="gpt-4o-mini",
-     temperature=0.1,
- )
-
-
- # ========== 2. RAG ==========
- def build_context(docs: List[Document]) -> str:
-     parts = []
-     for i, d in enumerate(docs, 1):
-         meta = d.metadata
-         src = meta.get("source")
-         page = meta.get("page")
-         abs_id = meta.get("abs_id")
-
-         label = f"[Quelle {i}] {src}"
-         if page:
-             label += f", Seite {page}"
-         if abs_id:
-             label += f", Abs. {abs_id}"
-
-         parts.append(f"{label}\n{d.page_content}")
-     return "\n\n".join(parts)
-
-
- def rag_answer(query: str, mode: str):
-     retrieved = retriever.invoke(query)
-     ctx = build_context(retrieved)
-
-     modes = {
-         "Kurz": "Antworte sehr kurz (max. 3 Sätze).",
-         "Standard": "Antworte ausführlich und verständlich.",
-         "Juristisch Präzise": "Formuliere juristisch präzise.",
-     }
-
-     messages = [
-         {
-             "role": "system",
-             "content": "Du bist ein Chatbot für Prüfungsrecht. Antworte nur auf Deutsch."
-         },
-         {
-             "role": "user",
-             "content": f"FRAGE:\n{query}\n\nKONTEXT:\n{ctx}\n\n{modes[mode]}"
-         }
-     ]
-
-     resp = llm.invoke(messages)
-     return resp.content, retrieved
-
-
- # ========== 3. Chatbot functions (Gradio 4.x – tuples) ==========
- def chatbot_text(user_input: str, history: List[Tuple[str, str]], mode: str):
-     answer, _ = rag_answer(user_input, mode)
-     history = history + [(user_input, answer)]
-     return history, history
-
-
- def chatbot_voice(audio_file: str, history: List[Tuple[str, str]], mode: str, language_hint: str):
-     user_text = transcribe_audio(audio_file, language_hint or None)
-     answer, _ = rag_answer(user_text, mode)
-     audio_out = synthesize_speech(answer)
-
-     history = history + [(user_text, answer)]
-     return history, audio_out, user_text, history
-
-
- # ========== 4. UI ==========
- with gr.Blocks(title="Prüfungsrechts-Chatbot") as demo:
-
-     with gr.Tab("💬 Text-Chat"):
-         mode = gr.Radio(["Kurz", "Standard", "Juristisch Präzise"], value="Standard")
-         chat = gr.Chatbot()
-         state = gr.State([])
-         inp = gr.Textbox(label="Frage eingeben")
-         send = gr.Button("Senden")
-
-         send.click(chatbot_text, [inp, state, mode], [chat, state])
-
-     with gr.Tab("🎙️ Sprach-Chat"):
-         mode_v = gr.Radio(["Kurz", "Standard", "Juristisch Präzise"], value="Standard")
-         chat_v = gr.Chatbot()
-         state_v = gr.State([])
-
-         mic = gr.Audio(sources=["microphone"], type="filepath")
-         lang = gr.Textbox(label="Sprache (optional: de/en/vi)")
-         out_audio = gr.Audio()
-         transcript = gr.Textbox(label="Transkript")
-
-         btn = gr.Button("Sprechen")
-         btn.click(
-             chatbot_voice,
-             [mic, state_v, mode_v, lang],
-             [chat_v, out_audio, transcript, state_v]
-         )
-
- if __name__ == "__main__":
-     demo.launch()
+ # app.py
+ import gradio as gr
+ from openai import OpenAI
+ import os
+
+ from rag_pipeline import rag_answer
+
+ client = OpenAI()
+
+ PDF_URL = os.environ["PDF_URL"]
+ HG_URL = os.environ["HG_URL"]
+
+ def transcribe(audio_path):
+     if audio_path is None:
+         return ""
+
+     with open(audio_path, "rb") as f:
+         result = client.audio.transcriptions.create(
+             model="whisper-1",
+             file=f,
+         )
+     return result.text
+
+ def chat_fn(text, audio, history):
+     # Microphone input → text
+     spoken_text = transcribe(audio)
+
+     if text and spoken_text:
+         question = f"{text}\n(Gesprochen: {spoken_text})"
+     elif spoken_text:
+         question = spoken_text
+     else:
+         question = text or ""
+
+     if not question:
+         return history, "<p>Bitte Text oder Mikrofon benutzen.</p>"
+
+     answer, docs = rag_answer(question, history or [])
+
+     # Prepare sources
+     html = "<ol>"
+     for i, d in enumerate(docs):
+         src = d.metadata.get("source", "?")
+         page = d.metadata.get("page", "")
+         url = PDF_URL if "Prüfungsordnung" in src else HG_URL
+         html += f"<li><a target='_blank' href='{url}'>{src} {page}</a><br>{d.page_content[:200]}...</li>"
+     html += "</ol>"
+
+     history.append((question, answer))
+     return history, html
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# ⚖️ Sprachbasierter Chatbot für Prüfungsrecht")
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             chat = gr.Chatbot()
+             text = gr.Textbox(label="Text Eingabe")
+             audio = gr.Audio(source="microphone", type="filepath")
+             send = gr.Button("Senden")
+
+         with gr.Column(scale=2):
+             gr.HTML(f"<iframe src='{PDF_URL}' width='100%' height='250'></iframe>")
+             gr.HTML(f"<iframe src='{HG_URL}' width='100%' height='250'></iframe>")
+             sources = gr.HTML()
+
+     send.click(chat_fn, inputs=[text, audio, chat], outputs=[chat, sources])
+
+ demo.launch()
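
Note on the microphone widget: the deleted version created it with sources=["microphone"], the Gradio 4.x keyword, while the new app.py passes source="microphone", which is the pre-4.x parameter name. If the Space runs Gradio 4.x, a minimal sketch of the same input (only the keyword changes; nothing else is assumed):

    audio = gr.Audio(sources=["microphone"], type="filepath")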
chat_history.py DELETED
@@ -1,70 +0,0 @@
- # chat_history.py – Supabase chat history (messages format for Gradio)
-
- import uuid
- import os
- from supabase import create_client
-
- SUPABASE_URL = os.environ["SUPABASE_URL"]
- SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
-
- supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
-
-
- # ------------------------------------------------------
- # Create a session
- # ------------------------------------------------------
-
- def create_session() -> str:
-     session_id = str(uuid.uuid4())
-
-     supabase.table("chat_sessions").insert({
-         "id": session_id
-     }).execute()
-
-     return session_id
-
-
- # ------------------------------------------------------
- # Save a message
- # ------------------------------------------------------
-
- def save_message(session_id: str, role: str, content: str):
-     if not session_id or session_id == "None":
-         print("⚠ WARN: invalid session_id, skip save_message")
-         return
-
-     supabase.table("chat_messages").insert({
-         "session_id": session_id,
-         "role": role,
-         "content": content,
-     }).execute()
-
-
- # ------------------------------------------------------
- # Load history – format: [{role: ..., content: ...}, ...]
- # ------------------------------------------------------
-
- def load_history(session_id: str):
-     if not session_id or session_id == "None":
-         return []
-
-     res = (
-         supabase.table("chat_messages")
-         .select("*")
-         .eq("session_id", session_id)
-         .order("created_at")
-         .execute()
-     )
-
-     rows = res.data or []
-
-     history = []
-     for r in rows:
-         history.append(
-             {
-                 "role": r["role"],
-                 "content": r["content"],
-             }
-         )
-
-     return history
embeddings.py DELETED
@@ -1,24 +0,0 @@
- # embeddings.py – OpenAI version (text-embedding-3-small)
-
- import os
- from langchain_openai import OpenAIEmbeddings
-
- EMBED_MODEL = "text-embedding-3-small"
-
- def get_embeddings():
-     api_key = os.environ.get("OPENAI_API_KEY")
-     if not api_key:
-         raise RuntimeError(
-             "OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
-         )
-
-     print(f">>> Lade OpenAI Embedding Model: {EMBED_MODEL}")
-     emb = OpenAIEmbeddings(
-         model=EMBED_MODEL,
-         api_key=api_key,
-     )
-     return emb
-
- if __name__ == "__main__":
-     e = get_embeddings()
-     print(e.embed_query("Test"))
hg_nrw_supabase.py DELETED
@@ -1,99 +0,0 @@
- """
- hg_nrw_supabase.py
-
- Loads the Hochschulgesetz NRW from recht.nrw.de,
- extracts all sections (§ …) and writes them to
- the Supabase table public.hg_nrw.
-
- Expected columns in hg_nrw:
-   - abs_id      text (e.g. 'para_64')
-   - title       text (e.g. '§ 64 Prüfungsordnungen')
-   - content     text (full text)
-   - order_index int4 (sort order)
-   - source_url  text (always the original URL on recht.nrw.de)
- """
-
- import os
- import requests
- from bs4 import BeautifulSoup
- from supabase import create_client
- from dotenv import load_dotenv
-
- load_dotenv()
-
- SUPABASE_URL = os.environ["SUPABASE_URL"]
- SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
-
- supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
-
- # Only THIS URL, not the print version:
- LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
-
-
- def extract_paragraphs():
-     print(">>> Lade Hochschulgesetz NRW von recht.nrw.de …")
-
-     html = requests.get(LAW_URL, timeout=30).text
-     soup = BeautifulSoup(html, "html.parser")
-
-     # All <h2>/<h3> headings; many of them are §§
-     headers = soup.find_all(["h2", "h3"])
-
-     paragraphs = []
-     order = 1
-
-     for header in headers:
-         title = header.get_text(" ", strip=True)
-
-         # Only headings that start with "§"
-         if not title.startswith("§"):
-             continue
-
-         # Content from this heading up to the next h2/h3
-         content_parts = []
-         sibling = header.find_next_sibling()
-
-         while sibling and sibling.name not in ["h2", "h3"]:
-             text = sibling.get_text(" ", strip=True)
-             if text:
-                 content_parts.append(text)
-             sibling = sibling.find_next_sibling()
-
-         full_content = "\n".join(content_parts).strip()
-         abs_id = f"para_{order}"
-
-         paragraphs.append(
-             {
-                 "abs_id": abs_id,
-                 "title": title,
-                 "content": full_content,
-                 "order_index": order,
-                 # use the original web link directly, no anchor added
-                 "source_url": LAW_URL,
-             }
-         )
-
-         order += 1
-
-     print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
-     return paragraphs
-
-
- def upload_to_supabase():
-     paras = extract_paragraphs()
-
-     print(">>> Clear table hg_nrw …")
-     supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
-
-     print(">>> Upload begin …")
-     BATCH = 100
-     for i in range(0, len(paras), BATCH):
-         batch = paras[i : i + BATCH]
-         print(f" - Upload batch {i} – {i + len(batch) - 1}")
-         supabase.table("hg_nrw").upsert(batch).execute()
-
-     print("✔ DONE uploading complete NRW law.")
-
-
- if __name__ == "__main__":
-     upload_to_supabase()
ingest.py ADDED
@@ -0,0 +1,73 @@
+ # ingest.py
+ import os
+ from io import BytesIO
+ from bs4 import BeautifulSoup
+ from pypdf import PdfReader
+
+ from supabase_client import supabase, load_file_bytes
+ from langchain_openai import OpenAIEmbeddings
+ from langchain_community.vectorstores import SupabaseVectorStore
+ from langchain_core.documents import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+ BUCKET = os.environ["SUPABASE_BUCKET"]
+
+ def load_pdf_docs():
+     pdf_bytes = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
+     reader = PdfReader(BytesIO(pdf_bytes))
+
+     docs = []
+     for i, page in enumerate(reader.pages):
+         text = page.extract_text() or ""
+         docs.append(
+             Document(
+                 page_content=text,
+                 metadata={"source": "Prüfungsordnung", "page": i + 1},
+             )
+         )
+     return docs
+
+ def load_html_docs():
+     html_bytes = load_file_bytes(BUCKET, "hochschulgesetz.html")
+     html_str = html_bytes.decode("utf-8", errors="ignore")
+     soup = BeautifulSoup(html_str, "html.parser")
+     text = soup.get_text(separator="\n")
+
+     return [
+         Document(
+             page_content=text,
+             metadata={"source": "Hochschulgesetz NRW"},
+         )
+     ]
+
+ def chunk_docs(docs):
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=150
+     )
+     return splitter.split_documents(docs)
+
+ def main():
+     pdf_docs = load_pdf_docs()
+     html_docs = load_html_docs()
+     all_docs = pdf_docs + html_docs
+
+     chunks = chunk_docs(all_docs)
+
+     embeddings = OpenAIEmbeddings(
+         model="text-embedding-3-small"
+     )
+
+     SupabaseVectorStore.from_documents(
+         chunks,
+         embeddings,
+         client=supabase,
+         table_name="documents",
+         query_name="match_documents",
+         chunk_size=200,
+     )
+
+     print("Ingest OK (no local files).")
+
+ if __name__ == "__main__":
+     main()
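
The SupabaseVectorStore.from_documents call above assumes the Supabase project already provides the "documents" table and the "match_documents" query function named in its arguments. A minimal sketch of how the ingested store could be queried afterwards, reusing the repo's own supabase_client and the same table/query names (the German search string is purely illustrative):

    from langchain_openai import OpenAIEmbeddings
    from langchain_community.vectorstores import SupabaseVectorStore
    from supabase_client import supabase

    # Re-open the store that ingest.py populated
    store = SupabaseVectorStore(
        embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
        client=supabase,
        table_name="documents",
        query_name="match_documents",
    )

    # Print source metadata and a snippet for the top matches
    for doc in store.similarity_search("Wiederholung einer Prüfung", k=2):
        print(doc.metadata, doc.page_content[:80])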
llm.py DELETED
@@ -1,27 +0,0 @@
- # llm.py – OpenAI chat model for RAG
-
- import os
- from langchain_openai import ChatOpenAI
-
- CHAT_MODEL = "gpt-4o-mini"  # cheap & capable
-
- def load_llm():
-     api_key = os.environ.get("OPENAI_API_KEY")
-     if not api_key:
-         raise RuntimeError(
-             "OPENAI_API_KEY fehlt. Bitte als Secret im HuggingFace Space setzen."
-         )
-
-     print(f">>> Lade OpenAI Chatmodell: {CHAT_MODEL}")
-
-     llm = ChatOpenAI(
-         model=CHAT_MODEL,
-         temperature=0.0,  # deterministic, little hallucination
-         api_key=api_key,
-     )
-     return llm
-
- if __name__ == "__main__":
-     llm = load_llm()
-     print(llm.invoke("Sag einen Satz zum Prüfungsrecht.").content)
-
load_documents.py DELETED
@@ -1,104 +0,0 @@
- # load_documents.py
-
- import os
- from io import BytesIO
- from typing import List
-
- from dotenv import load_dotenv
- from supabase import create_client, Client
- from pypdf import PdfReader
- from langchain_core.documents import Document
-
- load_dotenv()
-
-
- # ============== Supabase init ==============
- def get_supabase_client() -> Client:
-     url = os.getenv("SUPABASE_URL")
-     key = (
-         os.getenv("SUPABASE_SERVICE_ROLE_KEY")
-         or os.getenv("SUPABASE_SERVICE_ROLE")
-         or os.getenv("SUPABASE_KEY")
-     )
-     if not url or not key:
-         raise RuntimeError("Supabase ENV fehlen.")
-
-     return create_client(url, key)
-
-
- # ============== HG NRW sections (§) ==============
- def load_hg_paragraphs(supabase: Client) -> List[Document]:
-     print(">>> Lade Hochschulgesetz NRW (§) aus Supabase…")
-
-     table = os.getenv("HG_TABLE_NAME", "hg_nrw")
-     rows = supabase.table(table).select("*").order("order_index").execute().data or []
-
-     docs = []
-     for row in rows:
-         text = (row.get("title", "") + "\n\n" + row.get("content", "")).strip()
-         if not text:
-             continue
-
-         docs.append(Document(
-             page_content=text,
-             metadata={
-                 "source": "Hochschulgesetz NRW",
-                 "abs_id": row.get("abs_id"),
-                 "order_index": row.get("order_index"),
-                 "url": "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654",
-                 "type": "law",
-             }
-         ))
-
-     print(f" - {len(docs)} Paragraphen geladen.")
-     return docs
-
-
- # ============== Prüfungsordnung PDF ==============
- def load_pruefungsordnung_from_storage(supabase: Client) -> List[Document]:
-     bucket = os.getenv("PRUEF_BUCKET")
-     pdf_path = os.getenv("PRUEF_PDF_PATH")
-
-     if not bucket or not pdf_path:
-         print(">>> Keine Prüfungsordnung-PDF definiert.")
-         return []
-
-     print(">>> Lade Prüfungsordnung PDF …")
-
-     try:
-         data = supabase.storage.from_(bucket).download(pdf_path)
-     except Exception as e:
-         print(" Fehler beim PDF Download:", e)
-         return []
-
-     reader = PdfReader(BytesIO(data))
-     docs = []
-
-     for i, page in enumerate(reader.pages):
-         text = (page.extract_text() or "").strip()
-         if not text:
-             continue
-
-         docs.append(Document(
-             page_content=text,
-             metadata={
-                 "source": "Prüfungsordnung (PDF)",
-                 "page": i + 1,
-                 "type": "pruefungsordnung",
-             }
-         ))
-
-     print(f" - {len(docs)} PDF-Seiten geladen.")
-     return docs
-
-
- # ============== Main loader ==============
- def load_documents() -> List[Document]:
-     supabase = get_supabase_client()
-     docs = []
-
-     docs += load_hg_paragraphs(supabase)
-     docs += load_pruefungsordnung_from_storage(supabase)
-
-     print(f"✔ DOCUMENTS LOADED: {len(docs)}")
-     return docs
rag_pipeline.py CHANGED
@@ -1,100 +1,66 @@
- # rag_pipeline.py – OpenAI RAG with Supabase documents
-
- from typing import List, Dict, Any, Tuple
- from langchain_core.messages import SystemMessage, HumanMessage
-
- MAX_CHARS = 900
-
-
- def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
-     srcs = []
-
-     for i, d in enumerate(docs):
-         meta = d.metadata
-         src = meta.get("source")
-         page = meta.get("page")
-         snippet = d.page_content[:300].replace("\n", " ")
-
-         if src == "Prüfungsordnung (PDF)":
-             pdf_url = meta.get("pdf_url")
-             if isinstance(page, int) and pdf_url:
-                 url = f"{pdf_url}#page={page + 1}"
-             else:
-                 url = pdf_url
-
-         elif src == "Hochschulgesetz NRW":
-             url = meta.get("url")
-             page = None
-
-         else:
-             url = None
-
-         srcs.append({
-             "id": i + 1,
-             "source": src,
-             "page": page + 1 if isinstance(page, int) else None,
-             "url": url,
-             "snippet": snippet,
-         })
-
-     return srcs
-
-
- def format_context(docs):
-     if not docs:
-         return "(Kein relevanter Kontext gefunden.)"
-
-     out = []
-     for i, d in enumerate(docs):
-         txt = d.page_content[:MAX_CHARS]
-         src = d.metadata.get("source")
-         page = d.metadata.get("page")
-
-         if src == "Prüfungsordnung (PDF)" and isinstance(page, int):
-             src_str = f"{src}, Seite {page + 1}"
-         else:
-             src_str = src
-
-         out.append(f"[KONTEXT {i+1}] ({src_str})\n{txt}")
-
-     return "\n\n".join(out)
-
-
- SYSTEM_PROMPT = """
- Du bist ein juristisch präziser Chatbot für Prüfungsrecht.
- Du nutzt ausschließlich die Prüfungsordnung (PDF) und das Hochschulgesetz NRW.
-
- Regeln:
- 1. Antworte nur auf Basis des gelieferten Kontextes.
- 2. Wenn der Kontext keine sichere Antwort erlaubt, sage das klar.
- 3. Antworte in verständlichem, korrektem Deutsch.
- 4. Nenne Paragraphen, Dokumente und Seitenzahlen (bei PDF), wo möglich.
- """
-
-
- def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, Any]]]:
-     docs = retriever.invoke(question)
-     context_str = format_context(docs)
-
-     human = f"""
- FRAGE:
- {question}
-
- NUTZE AUSSCHLIESSLICH DIESEN KONTEXT:
- {context_str}
-
- AUFGABE:
- Erstelle eine juristisch korrekte Antwort ausschließlich basierend
- auf diesem Kontext. Falls der Kontext unzureichend ist, sage das klar.
- """
-
-     msgs = [
-         SystemMessage(content=SYSTEM_PROMPT),
-         HumanMessage(content=human),
-     ]
-
-     result = chat_model.invoke(msgs)
-     answer_text = result.content.strip()
-
-     sources = build_sources_metadata(docs)
-     return answer_text, sources
+ # rag_pipeline.py
+ import os
+ from datetime import date
+ from supabase_client import supabase
+
+ from langchain_community.vectorstores import SupabaseVectorStore
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
+
+ def get_vectorstore():
+     embeddings = OpenAIEmbeddings(
+         model="text-embedding-3-small"
+     )
+     return SupabaseVectorStore(
+         embedding=embeddings,
+         client=supabase,
+         table_name="documents",
+         query_name="match_documents",
+     )
+
+ def save_message(role, message):
+     supabase.table("chat_history").insert({
+         "session_date": date.today().isoformat(),
+         "role": role,
+         "message": message
+     }).execute()
+
+ def rag_answer(question, history):
+     retriever = get_vectorstore().as_retriever(search_kwargs={"k": 4})
+     docs = retriever.get_relevant_documents(question)
+
+     # Build context
+     context = ""
+     for i, d in enumerate(docs):
+         src = d.metadata.get("source", "?")
+         pg = d.metadata.get("page", "")
+         pg = f"(Seite {pg})" if pg else ""
+         context += f"[Quelle {i+1}] {src} {pg}\n{d.page_content}\n\n"
+
+     # Build history text
+     hist = ""
+     for u, b in history:
+         hist += f"User: {u}\nAssistant: {b}\n"
+
+     system_prompt = (
+         "Du bist ein Sprachbasierter Chatbot für Prüfungsrecht. "
+         "Nutze NUR die bereitgestellten Dokumente."
+         "Zitiere immer [Quelle X]."
+     )
+
+     llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0.1)
+
+     msg = [
+         ("system", system_prompt),
+         ("user",
+          f"Frage: {question}\n\n"
+          f"Vorheriger Chatverlauf:\n{hist}\n\n"
+          f"Dokumente:\n{context}"
+         ),
+     ]
+
+     answer = llm.invoke(msg).content
+
+     save_message("user", question)
+     save_message("assistant", answer)
+
+     return answer, docs
requirements.txt CHANGED
@@ -1,21 +1,9 @@
- # === UI ===
- gradio
- gradio_pdf
-
- # === Core RAG + LangChain ===
- langchain
- langchain-community
- langchain-text-splitters
- langchain-openai
-
- # === OpenAI SDK (LLM, embeddings, audio) ===
- openai>=1.35.0
-
- # === VectorStore ===
- faiss-cpu
-
- # === Supabase + document loading ===
- supabase
- pypdf
- requests
- python-dotenv
+ langchain
+ langchain-community
+ langchain-openai
+ openai
+ supabase
+ gradio
+ pypdf
+ beautifulsoup4
+ python-dotenv
retriever.py DELETED
@@ -1,48 +0,0 @@
- """
- STEP 5: RETRIEVER
- -----------------
- Builds a LangChain retriever from the FAISS vector store.
-
- The retriever is used in the later RAG step:
-   - retriever.get_relevant_documents(query)
- """
-
- from langchain_community.vectorstores import FAISS
-
- # number of chunks to fetch per question
- RETRIEVER_K = 4
-
- def get_retriever(vectorstore: FAISS, k: int = RETRIEVER_K):
-     """
-     Create a retriever from the FAISS vector store.
-     """
-     print(f">>> Creating retriever with k={k} ...")
-     retriever = vectorstore.as_retriever(search_kwargs={"k": k})
-     print(">>> Retriever ready.\n")
-     return retriever
-
- if __name__ == "__main__":
-     # Test: load -> split -> FAISS -> retriever.get_relevant_documents()
-     from load_documents import load_documents
-     from split_documents import split_documents
-     from vectorstore import build_vectorstore
-
-     print("=== TEST: retriever.get_relevant_documents ===\n")
-
-     docs = load_documents()
-     chunks = split_documents(docs)
-     vs = build_vectorstore(chunks)
-     retriever = get_retriever(vs, k=4)
-
-     query = "Wie lange habe ich Zeit, eine Prüfungsleistung zu wiederholen?"
-     print("Test query:")
-     print(" ", query, "\n")
-
-     retrieved_docs = retriever.invoke(query)
-
-     print(f"Retriever returned {len(retrieved_docs)} documents.")
-     for i, d in enumerate(retrieved_docs, start=1):
-         print(f"\n=== DOC {i} ===")
-         print(d.page_content[:400], "...")
-         print("Metadata:", d.metadata)
-
speech_io.py DELETED
@@ -1,52 +0,0 @@
- # speech_io.py
- import os
- from tempfile import NamedTemporaryFile
- from typing import Optional
- from openai import OpenAI
-
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
-
-
- # ======================
- # 1. Speech-to-Text (STT)
- # ======================
- def transcribe_audio(file_path: str, language: Optional[str] = None) -> str:
-     """
-     Transcribes audio via the OpenAI audio transcription API (gpt-4o-mini-transcribe).
-     """
-     print(">>> Transkribiere Audio via OpenAI Audio API …")
-
-     with open(file_path, "rb") as f:
-         resp = client.audio.transcriptions.create(
-             model="gpt-4o-mini-transcribe",
-             file=f,
-             language=language,
-         )
-
-     return resp.text
-
-
- # ======================
- # 2. Text-to-Speech (TTS)
- # ======================
- def synthesize_speech(text: str, voice: str = "alloy") -> str:
-     """
-     Converts text to speech (OpenAI TTS – gpt-4o-mini-tts),
-     saves an MP3 file and returns its path.
-     """
-     print(">>> Synthesizing speech via OpenAI TTS …")
-
-     response = client.audio.speech.create(
-         model="gpt-4o-mini-tts",
-         voice=voice,
-         input=text,
-     )
-
-     # HF Spaces + OpenAI SDK v2.x → raw bytes
-     audio_bytes = response.read()
-
-     tmp = NamedTemporaryFile(delete=False, suffix=".mp3")
-     tmp.write(audio_bytes)
-     tmp.close()
-
-     return tmp.name
split_documents.py DELETED
@@ -1,28 +0,0 @@
- # split_documents.py – v2
-
- from langchain_text_splitters import RecursiveCharacterTextSplitter
-
- CHUNK_SIZE = 1500
- CHUNK_OVERLAP = 200
-
- def split_documents(docs):
-     splitter = RecursiveCharacterTextSplitter(
-         chunk_size=CHUNK_SIZE,
-         chunk_overlap=CHUNK_OVERLAP,
-         separators=["\n\n", "\n", ". ", " ", ""],
-     )
-     chunks = splitter.split_documents(docs)
-
-     for c in chunks:
-         c.metadata["chunk_size"] = CHUNK_SIZE
-         c.metadata["chunk_overlap"] = CHUNK_OVERLAP
-
-     return chunks
-
- if __name__ == "__main__":
-     from load_documents import load_documents
-     docs = load_documents()
-     chunks = split_documents(docs)
-     print("Docs:", len(docs), "Chunks:", len(chunks))
-     print(chunks[0].page_content[:300], chunks[0].metadata)
-
supabase_client.py ADDED
@@ -0,0 +1,12 @@
+ # supabase_client.py
+ import os
+ from supabase import create_client
+
+ SUPABASE_URL = os.environ["SUPABASE_URL"]
+ SUPABASE_SERVICE_KEY = os.environ["SUPABASE_SERVICE_KEY"]
+
+ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
+
+ def load_file_bytes(bucket: str, filename: str) -> bytes:
+     """Download a file from Supabase Storage WITHOUT writing it locally – returns bytes."""
+     return supabase.storage.from_(bucket).download(filename)
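
A short usage sketch for load_file_bytes, assuming the bucket and file name that ingest.py expects via SUPABASE_BUCKET (adjust both to the actual storage layout):

    import os
    from supabase_client import load_file_bytes

    # The download stays in memory; nothing is written to disk
    pdf_bytes = load_file_bytes(os.environ["SUPABASE_BUCKET"], "pruefungsordnung.pdf")
    print(len(pdf_bytes), "bytes")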
vectorstore.py DELETED
@@ -1,55 +0,0 @@
- """
- STEP 4: VECTORSTORE (FAISS in-memory)
- -------------------------------------
- Builds a FAISS index from the text CHUNKS.
-
- - No .faiss file is written; everything stays in RAM.
- - Embeddings come from get_embeddings() (step 3).
- """
-
- from langchain_community.vectorstores import FAISS
- from embeddings import get_embeddings
-
- def build_vectorstore(chunks):
-     """
-     Takes a list of (already split) Documents and returns a FAISS vector store.
-     """
-     print(">>> Initialising embedding model for FAISS index ...")
-     embeddings = get_embeddings()
-
-     print(f">>> Building FAISS index from {len(chunks)} chunks ...")
-     vs = FAISS.from_documents(chunks, embeddings)
-     print(">>> FAISS index built.\n")
-     return vs
-
- if __name__ == "__main__":
-     # Test the whole pipeline: load -> split -> FAISS -> similarity_search
-     from load_documents import load_documents
-     from split_documents import split_documents
-
-     print("=== TEST: load_documents -> split_documents -> FAISS.similarity_search ===\n")
-
-     # 1) Load documents (PDF + HTML)
-     docs = load_documents()
-
-     # 2) Split into chunks
-     from pprint import pprint
-     print(f"Loaded {len(docs)} raw documents.")
-     chunks = split_documents(docs)
-     print(f"Split into {len(chunks)} chunks.\n")
-
-     # 3) Build the FAISS vector store
-     vectorstore = build_vectorstore(chunks)
-
-     # 4) Test similarity_search
-     query = "Fristen für die Prüfungsanmeldung im Bachelorstudium"
-     print("Test query:")
-     print(" ", query, "\n")
-
-     results = vectorstore.similarity_search(query, k=3)
-
-     print("Top-3 ähnliche Chunks aus dem VectorStore:")
-     for i, doc in enumerate(results, start=1):
-         print(f"\n=== RESULT {i} ===")
-         print(doc.page_content[:400], "...")
-         print("Metadata:", doc.metadata)
viewer.py DELETED
@@ -1,76 +0,0 @@
- # viewer.py – dynamic HTML viewer for the Hochschulgesetz NRW
-
- import os
- from supabase import create_client
-
- SUPABASE_URL = os.getenv("SUPABASE_URL")
- SUPABASE_ANON_KEY = os.getenv("SUPABASE_ANON_KEY")
-
- if not SUPABASE_URL or not SUPABASE_ANON_KEY:
-     raise RuntimeError("Missing SUPABASE_URL / SUPABASE_ANON_KEY")
-
- supabase = create_client(SUPABASE_URL, SUPABASE_ANON_KEY)
-
-
- def generate_hg_viewer() -> str:
-     """
-     Renders the full law as HTML; each section gets id="para_xx"
-     so that /hg_view#para_xx can scroll to the right passage.
-     """
-     rows = (
-         supabase
-         .table("hg_nrw")
-         .select("*")
-         .order("order_index")
-         .execute()
-     ).data or []
-
-     html_parts = [
-         """
-         <!DOCTYPE html>
-         <html>
-         <head>
-         <meta charset="utf-8">
-         <title>Hochschulgesetz NRW</title>
-         <style>
-         body {
-             font-family: -apple-system, BlinkMacSystemFont, sans-serif;
-             padding: 20px;
-             line-height: 1.6;
-         }
-         h1 { margin-bottom: 10px; }
-         h2 {
-             margin-top: 30px;
-             scroll-margin-top: 20px;
-         }
-         .para-block {
-             margin-bottom: 20px;
-             padding-bottom: 10px;
-             border-bottom: 1px solid #eee;
-         }
-         .subtitle {
-             color: #555;
-             font-size: 14px;
-         }
-         </style>
-         </head>
-         <body>
-         <h1>Hochschulgesetz NRW</h1>
-         <p class="subtitle">Dynamisch geladen aus Supabase (Tabelle hg_nrw)</p>
-         """
-     ]
-
-     for row in rows:
-         abs_id = row["abs_id"]
-         title = row["title"]
-         content = row["content"]
-
-         html_parts.append(f"""
-         <div class="para-block" id="{abs_id}">
-             <h2>{title}</h2>
-             <p>{content}</p>
-         </div>
-         """)
-
-     html_parts.append("</body></html>")
-     return "\n".join(html_parts)