Nguyen5 committed on
Commit
a195b72
·
1 Parent(s): 8e10bd7
Files changed (1) hide show
  1. ingest.py +98 -74
ingest.py CHANGED
@@ -1,104 +1,128 @@
1
- # ingest.py
 
2
  import os
3
- from io import BytesIO
4
- from bs4 import BeautifulSoup
5
- from pypdf import PdfReader
6
 
7
- from supabase_client import supabase, load_file_bytes
8
- from langchain_openai import OpenAIEmbeddings
9
- from langchain_text_splitters import RecursiveCharacterTextSplitter
10
- from langchain_core.documents import Document
11
 
 
12
# Supabase storage bucket name and project URL, read from the environment.
# Indexing os.environ directly raises KeyError at import time if one is unset.
BUCKET = os.environ["SUPABASE_BUCKET"]
SUPABASE_URL = os.environ["SUPABASE_URL"]

# Storage URLs (under the "public" object path) of the two source files.
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"
HG_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/hochschulgesetz.html"
17
 
 
 
18
 
19
def load_pdf_docs():
    """Load ``pruefungsordnung.pdf`` from the bucket, one Document per page.

    Returns:
        A list of ``Document`` objects. Each holds the text extracted from one
        page (an empty string when extraction yields nothing) and metadata
        with the source label, the zero-based page index and the PDF base URL.
    """
    raw = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    pdf = PdfReader(BytesIO(raw))

    return [
        Document(
            page_content=page.extract_text() or "",
            metadata={
                "source": "Prüfungsordnung (PDF)",
                # Zero-based index; the human-readable page is page_no + 1.
                "page": page_no,
                # Base URL without a page fragment.
                "pdf_url": PDF_URL,
            },
        )
        for page_no, page in enumerate(pdf.pages)
    ]
38
 
39
 
40
def load_html_docs():
    """Load ``hochschulgesetz.html`` from the bucket as a single Document.

    The bytes are decoded as UTF-8 (undecodable bytes are ignored) and the
    markup is reduced to plain text with newline separators.

    Returns:
        A one-element list containing the whole text as one ``Document``.
    """
    raw = load_file_bytes(BUCKET, "hochschulgesetz.html")
    markup = raw.decode("utf-8", errors="ignore")
    plain_text = BeautifulSoup(markup, "html.parser").get_text(separator="\n")

    # The HTML is not split into paragraphs here; it is chunked later, and
    # anchor_id is only assigned during chunking.
    whole_law = Document(
        page_content=plain_text,
        metadata={"source": "Hochschulgesetz NRW"},
    )
    return [whole_law]
57
 
 
 
 
 
 
58
 
59
def chunk_docs(docs):
    """Split *docs* with a RecursiveCharacterTextSplitter.

    The splitter is configured with ``chunk_size=900`` and
    ``chunk_overlap=100``; the result of ``split_documents`` is returned as is.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=900,
        chunk_overlap=100,
    ).split_documents(docs)
65
 
 
66
 
67
def _assign_anchors(chunks):
    """Number the chunks per source and store ``anchor_id`` (and ``url``) in place.

    PDF chunks get ``po_<n>``; every other chunk gets ``hg_<n>``. Chunks whose
    source is the Hochschulgesetz additionally get a full URL with the anchor
    as fragment.
    """
    counters = {"po": 0, "hg": 0}
    for chunk in chunks:
        source = chunk.metadata["source"]
        prefix = "po" if source == "Prüfungsordnung (PDF)" else "hg"
        counters[prefix] += 1
        chunk.metadata["anchor_id"] = f"{prefix}_{counters[prefix]}"

        if source == "Hochschulgesetz NRW":
            chunk.metadata["url"] = f"{HG_URL}#{chunk.metadata['anchor_id']}"


def ingest(batch_size=100):
    """Load, chunk, embed and store both source documents.

    Chunks are embedded and inserted into the ``documents`` table in batches
    instead of one request per chunk, which cuts the number of round trips to
    the embeddings API and to the database.

    Args:
        batch_size: Number of chunks embedded and inserted per request.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunks = chunk_docs(load_pdf_docs() + load_html_docs())
    _assign_anchors(chunks)

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        vectors = embeddings.embed_documents([c.page_content for c in batch])
        rows = [
            {
                "content": chunk.page_content,
                "metadata": chunk.metadata,
                "embedding": vector,
            }
            for chunk, vector in zip(batch, vectors)
        ]
        supabase.table("documents").insert(rows).execute()

    print("OK ✔ ingest xong – PDF + HTML mit Quelle-URL")


if __name__ == "__main__":
    ingest()
 
1
+ # app.py – Quelle clickable & styled viewer (recht.nrw.de)
2
+
3
  import os
4
+ import base64
5
+ import gradio as gr
6
+ from openai import OpenAI
7
 
8
+ from supabase_client import load_file_bytes
9
+ from rag_pipeline import rag_answer
 
 
10
 
11
# OpenAI client created without explicit arguments.
# NOTE(review): presumably picks up credentials from the environment — confirm.
client = OpenAI()

# Supabase storage bucket name and project URL, read from the environment.
# Indexing os.environ directly raises KeyError at import time if one is unset.
BUCKET = os.environ["SUPABASE_BUCKET"]
SUPABASE_URL = os.environ["SUPABASE_URL"]

# Prüfungsordnung (examination regulations) PDF – Supabase public URL
PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{BUCKET}/pruefungsordnung.pdf"

# Hochschulgesetz NRW – official recht.nrw.de URL (important)
HG_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
21
 
22
+
23
+ # ------------------------------------------
24
+ # PDF inline viewer (Base64)
25
+ # ------------------------------------------
26
def encode_pdf_src():
    """Return ``pruefungsordnung.pdf`` from the bucket as a base64 ``data:`` URI.

    The result is suitable as the ``src`` attribute of an inline viewer.
    """
    raw_pdf = load_file_bytes(BUCKET, "pruefungsordnung.pdf")
    payload = base64.b64encode(raw_pdf).decode("utf-8")
    return "data:application/pdf;base64," + payload
30
+
31
+
32
+ # ------------------------------------------
33
+ # Speech-to-text
34
+ # ------------------------------------------
35
def transcribe(audio_path):
    """Transcribe an audio file with the ``whisper-1`` model (language ``de``).

    Args:
        audio_path: Path of the recorded audio file, or a falsy value
            (``None`` / empty string) when nothing was recorded.

    Returns:
        The stripped transcription text, or ``""`` when no path was given or
        the API returned no text.
    """
    # Treat an empty path like None; open("") would raise FileNotFoundError.
    if not audio_path:
        return ""

    with open(audio_path, "rb") as f:
        result = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            language="de",
            temperature=0.0,
        )
    return (result.text or "").strip()
46
 
47
 
48
+ # ------------------------------------------
49
+ # Chat logic with inline Quelle links
50
+ # ------------------------------------------
51
def _format_sources(docs):
    """Render retrieved documents as a Markdown block of clickable sources."""
    parts = ["### 📚 Quellen\n"]
    for number, doc in enumerate(docs, start=1):
        meta = doc["metadata"] or {}
        # A missing source must not crash startswith(); it falls through to
        # the Hochschulgesetz branch like any other non-PDF source.
        src = meta.get("source") or ""

        if src.startswith("Prüfungsordnung"):
            # Stored page index is zero-based; viewers count from 1.
            page = (meta.get("page") or 0) + 1
            url = f"{PDF_URL}#page={page}"
            title = f"Quelle {number}: {src}, Seite {page}"
        else:
            anchor = meta.get("anchor_id")
            # Only append a fragment when an anchor exists (avoids "#None").
            url = f"{HG_URL}#{anchor}" if anchor else HG_URL
            title = f"Quelle {number}: Hochschulgesetz NRW (§)"

        snippet = (doc["content"] or "")[:120].replace("\n", " ")
        parts.append(f"- [{title}]({url})\n > {snippet}\n")

    return "".join(parts)


def chat_fn(text, audio, history):
    """Answer a typed or spoken question and append the exchange to the chat.

    Typed text takes precedence; otherwise the audio file is transcribed.

    Args:
        text: Content of the text box (may be ``None`` or blank).
        audio: Path of the recorded audio file, or a falsy value.
        history: Existing chat messages as ``{"role", "content"}`` dicts,
            or ``None``.

    Returns:
        A tuple ``(history, message, audio_value)``. When there is no usable
        question (no text, no audio, or an empty transcription) the history
        is returned unchanged together with a hint message; otherwise the
        extended history, the answer including its source list, and an
        update that clears the audio input.
    """
    text = (text or "").strip()

    if text:
        question = text
    elif audio:
        question = transcribe(audio)
    else:
        question = ""

    # Also covers an empty transcription, which must not be sent to the RAG
    # pipeline or recorded as an empty user message.
    if not question:
        return history, "<p>Bitte Text oder Mikrofon benutzen.</p>", None

    answer, docs = rag_answer(question, history or [])

    quellen_md = _format_sources(docs or [])
    bot_msg = f"{answer}\n\n{quellen_md}"

    new_history = (history or []) + [
        {"role": "user", "content": question},
        {"role": "assistant", "content": bot_msg},
    ]

    return new_history, bot_msg, gr.update(value=None)
 
93
 
 
 
 
 
 
 
94
 
95
+ # ------------------------------------------
96
+ # UI
97
+ # ------------------------------------------
98
# Gradio layout: chat on the left, the two source documents on the right.
with gr.Blocks() as demo:
    gr.Markdown("# ⚖️ Prüfungsrechts-Chatbot (Quelle-clickable)")

    with gr.Row():

        # Left column: conversation, inputs and a preview of the last answer.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(type="messages", height=550)
            text_input = gr.Textbox(label="Frage")
            audio_input = gr.Audio(type="filepath", label="Mikrofon")
            send_btn = gr.Button("Senden")
            answer_preview = gr.Markdown("")

            # chat_fn returns (history, message, audio value) in this order.
            send_btn.click(
                fn=chat_fn,
                inputs=[text_input, audio_input, chatbot],
                outputs=[chatbot, answer_preview, audio_input],
            )

        # Right column: embedded viewers for both sources.
        with gr.Column(scale=1):
            gr.Markdown("### Prüfungsordnung (PDF)")
            # The PDF is fetched and base64-encoded once, while the UI is built.
            pdf_viewer_html = (
                f"<iframe src='{encode_pdf_src()}' width='100%' height='330px' style='border:none'></iframe>"
            )
            gr.HTML(pdf_viewer_html)

            gr.Markdown("### Hochschulgesetz NRW (offizielle Seite)")
            hg_viewer_html = (
                f"<iframe src='{HG_URL}' width='100%' height='330px' style='border:none'></iframe>"
            )
            gr.HTML(hg_viewer_html)

if __name__ == "__main__":
    demo.launch(ssr_mode=False)