Nguyen5 committed on
Commit
9370c0a
·
1 Parent(s): a34dda6
Files changed (7) hide show
  1. app.py +49 -44
  2. build_hg_viewer.py +11 -5
  3. llm.py +11 -1
  4. load_documents.py +96 -105
  5. rag_pipeline.py +54 -84
  6. requirements.txt +2 -1
  7. upload_weblink_to_supabase.py +81 -26
app.py CHANGED
@@ -5,70 +5,59 @@ import gradio as gr
5
  from gradio_pdf import PDF
6
  from huggingface_hub import hf_hub_download
7
 
8
- from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
 
 
 
 
 
 
 
 
 
9
  from split_documents import split_documents
10
  from vectorstore import build_vectorstore
11
  from retriever import get_retriever
12
  from llm import load_llm
13
- from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
14
-
15
  from speech_io import transcribe_audio, synthesize_speech
16
 
17
  # =====================================================
18
  # INITIALISIERUNG (global)
19
  # =====================================================
20
 
21
- print("🔹 Lade Dokumente ...")
22
- _docs = load_documents()
23
 
24
- print("🔹 Splitte Dokumente ...")
25
- _chunks = split_documents(_docs)
26
 
27
- print("🔹 Baue VectorStore (FAISS) ...")
28
- _vs = build_vectorstore(_chunks)
29
 
30
- print("🔹 Erzeuge Retriever ...")
31
- _retriever = get_retriever(_vs)
32
 
33
- print("🔹 Lade LLM ...")
34
- _llm = load_llm()
35
-
36
- print("🔹 Lade Dateien für Viewer …")
37
- _pdf_path = hf_hub_download(DATASET, PDF_FILE, repo_type="dataset")
38
- _html_path = hf_hub_download(DATASET, HTML_FILE, repo_type="dataset")
39
 
40
  # =====================================================
41
  # Quellen formatieren – Markdown für Chat
42
  # =====================================================
43
 
44
- def format_sources_markdown(sources):
45
- if not sources:
46
  return ""
47
 
48
- lines = ["", "**📚 Quellen (genutzte Dokumentstellen):**"]
49
- for s in sources:
50
- sid = s["id"]
51
- src = s["source"]
52
- page = s["page"]
53
- url = s["url"]
54
- snippet = s["snippet"]
55
-
56
- title = f"Quelle {sid} – {src}"
57
-
58
- if url:
59
- base = f"- [{title}]({url})"
60
- else:
61
- base = f"- {title}"
62
 
63
- if page and "Prüfungsordnung" in src:
64
- base += f", Seite {page}"
 
 
 
65
 
66
- lines.append(base)
67
-
68
- if snippet:
69
- lines.append(f" > {snippet}")
70
-
71
- return "\n".join(lines)
72
 
73
  # =====================================================
74
  # TEXT CHATBOT
@@ -197,15 +186,31 @@ with gr.Blocks(title="Prüfungsrechts-Chatbot (RAG + Sprache)") as demo:
197
  # RECHTE SPALTE: Viewer
198
  # =====================
199
 
 
 
 
 
 
 
 
 
 
200
  with gr.Column(scale=1):
201
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
202
- PDF(_pdf_path, height=350)
 
 
 
 
 
 
203
 
204
- gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
205
  gr.HTML(
206
- f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
 
207
  )
208
 
 
209
  if __name__ == "__main__":
210
  demo.queue().launch(ssr_mode=False, show_error=True)
211
 
 
5
  from gradio_pdf import PDF
6
  from huggingface_hub import hf_hub_download
7
 
8
+ # from load_documents import load_documents, DATASET, PDF_FILE, HTML_FILE
9
+ # from split_documents import split_documents
10
+ # from vectorstore import build_vectorstore
11
+ # from retriever import get_retriever
12
+ # from llm import load_llm
13
+ # from rag_pipeline import answer, PDF_BASE_URL, LAW_URL
14
+
15
+ # from speech_io import transcribe_audio, synthesize_speech
16
+
17
+ from load_documents import load_all_documents
18
  from split_documents import split_documents
19
  from vectorstore import build_vectorstore
20
  from retriever import get_retriever
21
  from llm import load_llm
22
+ from rag_pipeline import answer
 
23
  from speech_io import transcribe_audio, synthesize_speech
24
 
25
  # =====================================================
26
  # INITIALISIERUNG (global)
27
  # =====================================================
28
 
29
+ print("📚 Lade Dokumente")
30
+ docs = load_all_documents()
31
 
32
+ print("🔪 Splitte Dokumente")
33
+ chunks = split_documents(docs)
34
 
35
+ print("🔍 Erstelle VectorStore")
36
+ vs = build_vectorstore(chunks)
37
 
38
+ print("🔎 Erzeuge Retriever")
39
+ retriever = get_retriever(vs)
40
 
41
+ print("🤖 Lade LLM")
42
+ llm = load_llm()
 
 
 
 
43
 
44
  # =====================================================
45
  # Quellen formatieren – Markdown für Chat
46
  # =====================================================
47
 
48
def format_sources(src):
    """Render source entries as a Markdown list for the chat window.

    Args:
        src: Sequence of dicts. Each entry must provide ``source`` (display
            name) and may provide ``url`` (link target) and ``page``.

    Returns:
        A Markdown string consisting of a leading blank line, a "Quellen"
        heading and one bullet per entry, or ``""`` when ``src`` is empty
        or ``None``.
    """
    if not src:
        return ""

    out = ["", "## 📚 Quellen"]

    for s in src:
        name = s["source"]
        url = s.get("url")
        # Only emit a link when a target exists; otherwise the chat would
        # render a dead "[name](None)" link.
        line = f"- [{name}]({url})" if url else f"- {name}"
        if s.get("page"):
            line += f" (Seite {s['page']})"
        out.append(line)

    return "\n".join(out)
 
 
 
 
 
61
 
62
  # =====================================================
63
  # TEXT CHATBOT
 
186
  # RECHTE SPALTE: Viewer
187
  # =====================
188
 
189
+ # with gr.Column(scale=1):
190
+ # gr.Markdown("### 📄 Prüfungsordnung (PDF)")
191
+ # PDF(_pdf_path, height=350)
192
+
193
+ # gr.Markdown("### 📘 Hochschulgesetz NRW (Website)")
194
+ # gr.HTML(
195
+ # f'<iframe src="{LAW_URL}" style="width:100%;height:350px;border:none;"></iframe>'
196
+ # )
197
+
198
  with gr.Column(scale=1):
199
  gr.Markdown("### 📄 Prüfungsordnung (PDF)")
200
+ # PDF đã được load_documents cung cấp pdf_url — dùng metadata trực tiếp
201
+ pdf_meta = next(d.metadata for d in docs if d.metadata["type"] == "pdf")
202
+ PDF(pdf_meta["pdf_url"], height=350)
203
+
204
+ gr.Markdown("### 📘 Hochschulgesetz NRW")
205
+ hg_meta = next(d.metadata for d in docs if d.metadata["type"] == "hg")
206
+ hg_view_url = hg_meta["viewer_url"].split("#")[0]
207
 
 
208
  gr.HTML(
209
+ f'<iframe src="{hg_view_url}" '
210
+ 'style="width:100%;height:350px;border:none;"></iframe>'
211
  )
212
 
213
+
214
  if __name__ == "__main__":
215
  demo.queue().launch(ssr_mode=False, show_error=True)
216
 
build_hg_viewer.py CHANGED
@@ -13,7 +13,7 @@ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE:
13
 
14
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
15
 
16
- from upload_weblink_to_supabase import extract_paragraphs
17
 
18
  # ======== HTML TEMPLATE ========
19
  VIEW_TEMPLATE = """
@@ -240,9 +240,15 @@ function scrollToTop() {
240
  # 2. BUILD VIEWER
241
  # -------------------------------------------------------------------
242
 
243
- def build_html():
244
- print(">>> Lade Paragraphs aus Supabase...")
245
- paras = extract_paragraphs()
 
 
 
 
 
 
246
 
247
  sidebar_links = ""
248
  content_html = ""
@@ -296,7 +302,7 @@ def build_html():
296
  # -------------------------------------------------------------------
297
 
298
  def upload_html():
299
- html = build_html()
300
 
301
  supabase.storage.from_("hg_viewer").update(
302
  "hg_clean.html",
 
13
 
14
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
15
 
16
+ #from upload_weblink_to_supabase import extract_paragraphs
17
 
18
  # ======== HTML TEMPLATE ========
19
  VIEW_TEMPLATE = """
 
240
  # 2. BUILD VIEWER
241
  # -------------------------------------------------------------------
242
 
243
+ def build_html_from_db():
244
+ """
245
+ Liest alle Paragraphen aus hg_nrw und baut daraus HTML.
246
+ """
247
+ print(">>> Lade Paragraphen aus Supabase (hg_nrw) …")
248
+ #paras = extract_paragraphs()
249
+ # 5.12_2:13
250
+ res = supabase.table("hg_nrw").select("*").order("order_index").execute()
251
+ rows = res.data or []
252
 
253
  sidebar_links = ""
254
  content_html = ""
 
302
  # -------------------------------------------------------------------
303
 
304
  def upload_html():
305
+ html = build_html_from_db()
306
 
307
  supabase.storage.from_("hg_viewer").update(
308
  "hg_clean.html",
llm.py CHANGED
@@ -14,9 +14,19 @@ def load_llm():
14
 
15
  print(f">>> Lade OpenAI Chatmodell: {CHAT_MODEL}")
16
 
 
 
 
 
 
 
 
17
  llm = ChatOpenAI(
18
  model=CHAT_MODEL,
19
- temperature=0.0, # deterministisch, wenig Halluzination
 
 
 
20
  api_key=api_key,
21
  )
22
  return llm
 
14
 
15
  print(f">>> Lade OpenAI Chatmodell: {CHAT_MODEL}")
16
 
17
+ # llm = ChatOpenAI(
18
+ # model=CHAT_MODEL,
19
+ # temperature=0.0, # deterministisch, wenig Halluzination
20
+ # api_key=api_key,
21
+ # )
22
+ # return llm
23
+ # 5.12_2:13
24
  llm = ChatOpenAI(
25
  model=CHAT_MODEL,
26
+ temperature=0.0,
27
+ top_p=1.0,
28
+ presence_penalty=0.0,
29
+ frequency_penalty=0.0,
30
  api_key=api_key,
31
  )
32
  return llm
load_documents.py CHANGED
@@ -1,130 +1,121 @@
1
  """
2
- BƯỚC 1: LOAD DOCUMENTS
3
- -----------------------
4
- Debug-full version
5
 
6
- - Lädt Prüfungsordnung (PDF) seitenweise.
7
- - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
8
- und zerlegt es in einzelne Absätze (Document pro <p>).
 
9
  """
10
 
11
- from huggingface_hub import hf_hub_download, list_repo_files
 
 
12
  from langchain_community.document_loaders import PyPDFLoader
13
  from langchain_core.documents import Document
14
- from bs4 import BeautifulSoup
15
 
16
- DATASET = "Nguyen5/docs"
 
 
 
 
 
 
 
 
 
17
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
18
- HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py
19
-
20
- def _load_hg_paragraph_documents(html_path: str):
21
- """
22
- Liest das generierte Hochschulgesetz-HTML ein und erzeugt
23
- pro <p>-Element einen LangChain-Document mit:
24
- - page_content = Text des Absatzes
25
- - metadata:
26
- source = "Hochschulgesetz NRW (HTML)"
27
- filename = HTML_FILE
28
- paragraph_id = id-Attribut (z.B. 'hg_abs_12'), falls vorhanden
29
- """
30
- with open(html_path, "r", encoding="utf-8") as f:
31
- html = f.read()
32
-
33
- soup = BeautifulSoup(html, "html.parser")
34
- docs = []
35
 
36
- for p in soup.find_all("p"):
37
- text = p.get_text(" ", strip=True)
38
- if not text:
39
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- pid = p.get("id")
 
 
 
42
 
43
- metadata = {
44
- "source": "Hochschulgesetz NRW (HTML)",
45
- "filename": HTML_FILE,
 
 
 
 
 
 
46
  }
47
- if pid:
48
- metadata["paragraph_id"] = pid
49
 
50
- docs.append(Document(page_content=text, metadata=metadata))
 
51
 
52
- print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
53
- return docs
54
 
55
- def load_documents():
56
- print("=== START: load_documents() ===\n")
 
57
 
58
- # -------------------------
59
- # Check files in dataset
60
- # -------------------------
61
- print(">>> Checking dataset file list from HuggingFace...")
62
- files = list_repo_files(DATASET, repo_type="dataset")
63
- print("Files in dataset:", files, "\n")
64
 
 
 
 
 
 
 
 
65
  docs = []
66
 
67
- # -------------------------
68
- # Load PDF
69
- # -------------------------
70
- print(">>> Step 1: Download PDF from HuggingFace...")
71
- try:
72
- pdf_path = hf_hub_download(
73
- repo_id=DATASET,
74
- filename=PDF_FILE,
75
- repo_type="dataset",
 
 
 
 
 
 
 
 
 
76
  )
77
- print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
78
- except Exception as e:
79
- print("ERROR downloading PDF:", e)
80
- return []
81
-
82
- print(">>> Step 1.1: Loading PDF pages...")
83
- try:
84
- pdf_docs = PyPDFLoader(pdf_path).load()
85
- print(f"Loaded {len(pdf_docs)} PDF pages.\n")
86
- except Exception as e:
87
- print("ERROR loading PDF:", e)
88
- return []
89
-
90
- for d in pdf_docs:
91
- d.metadata["source"] = "Prüfungsordnung (PDF)"
92
- d.metadata["filename"] = PDF_FILE
93
-
94
- docs.extend(pdf_docs)
95
-
96
- # -------------------------
97
- # Load HTML (Hochschulgesetz NRW)
98
- # -------------------------
99
- print(">>> Step 2: Download HTML from HuggingFace...")
100
- try:
101
- html_path = hf_hub_download(
102
- repo_id=DATASET,
103
- filename=HTML_FILE,
104
- repo_type="dataset",
105
- )
106
- print(f"Downloaded HTML to local cache:\n{html_path}\n")
107
- except Exception as e:
108
- print("ERROR downloading HTML:", e)
109
- return docs
110
 
111
- print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
112
- try:
113
- html_docs = _load_hg_paragraph_documents(html_path)
114
- except Exception as e:
115
- print("ERROR loading / parsing HTML:", e)
116
- return docs
117
 
118
- docs.extend(html_docs)
 
 
119
 
120
- print("=== DONE: load_documents() ===\n")
121
- return docs
 
 
122
 
123
- if __name__ == "__main__":
124
- print("\n=== Running load_documents.py directly ===\n")
125
- docs = load_documents()
126
- print(f"\n>>> TOTAL documents loaded: {len(docs)}")
127
 
128
- if len(docs):
129
- print("\nExample metadata from 1st document:")
130
- print(docs[0].metadata)
 
 
1
  """
2
+ LOAD_DOCUMENTS SINGLE SOURCE OF TRUTH
 
 
3
 
4
+ Nhiệm vụ:
5
+ 1) Lade Prüfungsordnung PDF direkt aus Supabase-Storage.
6
+ 2) Lade Hochschulgesetz NRW aus Supabase-Tabelle hg_nrw.
7
+ 3) Cung cấp metadata đầy đủ để các file khác KHÔNG PHẢI tính lại URL.
8
  """
9
 
10
+ import os
11
+ import tempfile
12
+ from dotenv import load_dotenv
13
  from langchain_community.document_loaders import PyPDFLoader
14
  from langchain_core.documents import Document
15
+ from supabase import create_client
16
 
17
+ load_dotenv()
18
+
19
+ # ===== Supabase config =====
20
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
21
+ SUPABASE_SERVICE_ROLE = os.getenv("SUPABASE_SERVICE_ROLE")
22
+
23
+ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
24
+
25
+ # ===== Storage Config =====
26
+ PDF_BUCKET = "File PDF"
27
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
+ PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{PDF_BUCKET}/{PDF_FILE}"
30
+
31
+ # ===== Viewer URL =====
32
+ HG_VIEWER_URL = (
33
+ f"{SUPABASE_URL}/storage/v1/object/public/hg_viewer/hg_clean.html"
34
+ )
35
+
36
+
37
+ # ============================================================
38
+ # 1) PDF aus Supabase laden
39
+ # ============================================================
40
+
41
def load_pdf_from_supabase() -> list[Document]:
    """Download the examination regulations PDF and load it page by page.

    The file ``PDF_FILE`` is fetched from the Supabase storage bucket
    ``PDF_BUCKET``, written to a temporary file so that ``PyPDFLoader`` can
    read it from disk, and the temporary file is removed again afterwards.

    Returns:
        One ``Document`` per PDF page. The loader's metadata is replaced by a
        dict with ``type`` ("pdf"), ``source``, ``page`` (0-based index),
        ``pdf_url`` (public URL with a 1-based ``#page=`` fragment) and
        ``filename``.

    Raises:
        ValueError: If the download returns nothing.
    """
    print("📥 Lade Prüfungsordnung PDF aus Supabase...")

    response = supabase.storage.from_(PDF_BUCKET).download(PDF_FILE)
    # Treat an empty payload like a failed download instead of handing an
    # empty file to the PDF parser.
    if not response:
        raise ValueError("❌ Konnte PDF nicht laden!")

    # delete=False keeps the file on disk after the handle is closed, so the
    # loader can reopen it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(response)
        temp_pdf_path = tmp.name

    try:
        pages = PyPDFLoader(temp_pdf_path).load()
    finally:
        # Remove the temporary copy even when parsing fails; previously it
        # was left behind on every call.
        os.unlink(temp_pdf_path)

    for i, p in enumerate(pages):
        p.metadata = {
            "type": "pdf",
            "source": "Prüfungsordnung",
            "page": i,
            "pdf_url": f"{PDF_URL}#page={i+1}",
            "filename": PDF_FILE,
        }

    print(f"✔ {len(pages)} PDF-Seiten geladen.")
    return pages
66
 
 
 
67
 
68
+ # ============================================================
69
+ # 2) HG aus Tabelle laden
70
+ # ============================================================
71
 
72
def load_hg_from_supabase() -> list[Document]:
    """Load the Hochschulgesetz NRW paragraphs from the ``hg_nrw`` table.

    Rows are read in ascending ``order_index`` order and each row becomes one
    ``Document``.

    Returns:
        One ``Document`` per table row, with the row's ``content`` as page
        content and metadata holding ``type`` ("hg"), ``source``, ``abs_id``,
        ``title`` and ``viewer_url`` (viewer page URL plus ``#<abs_id>``).
    """
    print("📥 Lade Hochschulgesetz NRW aus Tabelle hg_nrw...")

    query = supabase.table("hg_nrw").select("*").order("order_index", desc=False)
    records = query.execute().data or []

    paragraphs = []
    for record in records:
        anchor, heading, body = record["abs_id"], record["title"], record["content"]
        metadata = {
            "type": "hg",
            "source": "Hochschulgesetz NRW",
            "abs_id": anchor,
            "title": heading,
            # The fragment lets the viewer jump straight to this paragraph.
            "viewer_url": f"{HG_VIEWER_URL}#{anchor}",
        }
        paragraphs.append(Document(page_content=body, metadata=metadata))

    print(f" {len(paragraphs)} HG-Absätze geladen.")
    return paragraphs
106
+
 
 
 
107
 
108
+ # ============================================================
109
+ # 3) ALLES LADEN
110
+ # ============================================================
111
 
112
def load_all_documents():
    """Load every document source and return them as one list.

    Returns:
        The PDF page documents followed by the Hochschulgesetz documents.
    """
    combined = list(load_pdf_from_supabase())
    combined.extend(load_hg_from_supabase())
    return combined
116
 
 
 
 
 
117
 
118
+ if __name__ == "__main__":
119
+ docs = load_all_documents()
120
+ print("📚 Gesamt:", len(docs))
121
+ print("🔎 Beispiel metadata:", docs[0].metadata)
rag_pipeline.py CHANGED
@@ -2,108 +2,78 @@
2
  RAG PIPELINE – Version 26.11 (ohne Modi, stabil, juristisch korrekt)
3
  """
4
 
 
 
 
 
5
  from typing import List, Dict, Any, Tuple
 
6
  from langchain_core.messages import SystemMessage, HumanMessage
7
- from load_documents import DATASET, PDF_FILE, HTML_FILE
8
 
9
- # -------------------------------------------------------------------
10
- # URLs für Quellen
11
- # -------------------------------------------------------------------
12
-
13
- # Direktes PDF im Dataset (für #page)
14
- PDF_BASE_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{PDF_FILE}"
15
-
16
- # Hochschulgesetz-HTML im Dataset (enthält <p id="hg_abs_X"> …)
17
- LAW_DATASET_URL = f"https://huggingface.co/datasets/{DATASET}/resolve/main/{HTML_FILE}"
18
-
19
- # Offizielle Recht.NRW-Druckversion (für Viewer im Frontend)
20
- LAW_URL = (
21
- "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
22
- )
23
 
24
  MAX_CHARS = 900
25
 
26
- # -----------------------------
27
- # Quellen formatieren
28
- # -----------------------------
29
 
30
  def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
31
- """
32
- Erzeugt eine Liste strukturierter Quellen-Infos:
33
-
34
- [
35
- {
36
- "id": 1,
37
- "source": "Prüfungsordnung (PDF)" / "Hochschulgesetz NRW (HTML)",
38
- "page": 3, # nur bei PDF
39
- "url": "...", # direkter Klick-Link
40
- "snippet": "Erste 300 Zeichen des Chunks..."
41
- },
42
- ...
43
- ]
44
- """
45
- srcs = []
46
- for i, d in enumerate(docs):
47
  meta = d.metadata
48
- src = meta.get("source", "")
49
- page = meta.get("page")
50
  snippet = d.page_content[:300].replace("\n", " ")
51
 
52
- # PDF-Link
53
- if "Prüfungsordnung" in src:
54
- if isinstance(page, int):
55
- # PyPDFLoader: page ist 0-basiert, Anzeige 1-basiert
56
- url = f"{PDF_BASE_URL}#page={page + 1}"
57
- else:
58
- url = PDF_BASE_URL
59
-
60
- # NRW-Gesetz (HTML im Dataset mit Absatz-IDs)
61
- elif "Hochschulgesetz" in src:
62
- para_id = meta.get("paragraph_id")
63
- if para_id:
64
- # Klick führt direkt zum Absatz im Dataset-HTML
65
- url = f"{LAW_DATASET_URL}#{para_id}"
66
- else:
67
- # Fallback: offizielle Druckversion (ohne Absatz-Anker)
68
- url = LAW_URL
69
- page = None # keine Seitenangabe für Gesetz-HTML
70
-
71
- else:
72
- url = None
73
-
74
- srcs.append(
75
- {
76
- "id": i + 1,
77
- "source": src,
78
- "page": page + 1 if isinstance(page, int) else None,
79
- "url": url,
80
  "snippet": snippet,
81
- }
82
- )
83
- return srcs
 
 
 
 
 
 
 
 
 
 
84
 
85
- # -----------------------------
86
- # Kontext formatieren
87
- # -----------------------------
 
 
88
 
89
- def format_context(docs):
90
  if not docs:
91
- return "(Kein relevanter Kontext im Dokument gefunden.)"
 
 
92
 
93
- out = []
94
  for i, d in enumerate(docs):
95
- txt = d.page_content[:MAX_CHARS]
96
- src = d.metadata.get("source")
97
- page = d.metadata.get("page")
98
 
99
- if "Prüfungsordnung" in (src or "") and isinstance(page, int):
100
- src_str = f"{src}, Seite {page + 1}"
101
- else:
102
- src_str = src
103
 
104
- out.append(f"[KONTEXT {i+1}] ({src_str})\n{txt}")
 
 
 
 
 
 
105
 
106
- return "\n\n".join(out)
107
 
108
  # -----------------------------
109
  # Systemprompt — verschärft
@@ -160,7 +130,7 @@ def answer(question: str, retriever, chat_model) -> Tuple[str, List[Dict[str, An
160
  context_str = format_context(docs)
161
 
162
  # 2. Prompt bauen
163
- human = f"""
164
  FRAGE:
165
  {question}
166
 
@@ -179,7 +149,7 @@ ausschließlich anhand des obigen Kontextes.
179
 
180
  msgs = [
181
  SystemMessage(content=SYSTEM_PROMPT),
182
- HumanMessage(content=human),
183
  ]
184
 
185
  # 3. LLM aufrufen
 
2
  RAG PIPELINE – Version 26.11 (ohne Modi, stabil, juristisch korrekt)
3
  """
4
 
5
+ # from typing import List, Dict, Any, Tuple
6
+ # from langchain_core.messages import SystemMessage, HumanMessage
7
+ # from load_documents import DATASET, PDF_FILE, HTML_FILE
8
+ # 5.12_2:13
9
  from typing import List, Dict, Any, Tuple
10
+ import os
11
  from langchain_core.messages import SystemMessage, HumanMessage
12
+ from load_documents import DATASET, PDF_FILE
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  MAX_CHARS = 900
16
 
17
+ # ============================================================
18
+ # Quellenaufbereitung – NUR metadata verwenden!
19
+ # ============================================================
20
 
21
def build_sources_metadata(docs: List) -> List[Dict[str, Any]]:
    """Build structured source entries from the retrieved documents.

    Only metadata already attached to each document is used; URLs are taken
    over unchanged and never rebuilt here.

    Args:
        docs: Retrieved documents exposing ``metadata`` (dict) and
            ``page_content`` (str).

    Returns:
        One dict per document of type "pdf" or "hg" with the keys ``id``
        (1-based position in ``docs``), ``source``, ``page`` (1-based page
        number for PDFs, otherwise ``None``), ``url`` and ``snippet`` (first
        300 characters with newlines replaced by spaces). Documents of any
        other type are skipped.
    """
    sources = []

    for idx, d in enumerate(docs):
        meta = d.metadata
        doc_type = meta.get("type")
        snippet = d.page_content[:300].replace("\n", " ")

        if doc_type == "pdf":
            page = meta.get("page")
            sources.append({
                "id": idx + 1,
                "source": "Prüfungsordnung (PDF)",
                # The stored index is 0-based (format_context adds 1 as
                # well). Report the 1-based page so the first page is not
                # dropped as falsy and the number matches the #page= link.
                "page": page + 1 if isinstance(page, int) else None,
                "url": meta.get("pdf_url"),  # reuse as-is, do not rebuild
                "snippet": snippet,
            })
        elif doc_type == "hg":
            sources.append({
                "id": idx + 1,
                "source": "Hochschulgesetz NRW",
                "page": None,
                "url": meta.get("viewer_url"),  # reuse as-is, do not rebuild
                "snippet": snippet,
            })

    return sources
51
+
52
+ # ============================================================
53
+ # Kontextaufbereitung
54
+ # ============================================================
55
 
56
def format_context(docs: List) -> str:
    """Format the retrieved documents as numbered context blocks for the prompt.

    Args:
        docs: Retrieved documents exposing ``metadata`` (dict) and
            ``page_content`` (str).

    Returns:
        The blocks joined by blank lines. Each block has a
        ``[KONTEXT n] (label)`` header followed by at most ``MAX_CHARS``
        characters of content. A fixed placeholder text is returned when
        ``docs`` is empty.
    """
    if not docs:
        return "(Kein relevanter Kontext gefunden.)"

    def _label(meta):
        # Anything that is not a PDF page is labelled as Hochschulgesetz.
        if meta.get("type") != "pdf":
            return "Hochschulgesetz NRW"
        page = meta.get("page")
        # Stored page index is 0-based; show the 1-based page number.
        suffix = f", Seite {page+1}" if isinstance(page, int) else ""
        return "Prüfungsordnung" + suffix

    return "\n\n".join(
        f"[KONTEXT {number}] ({_label(doc.metadata)})\n{doc.page_content[:MAX_CHARS]}"
        for number, doc in enumerate(docs, start=1)
    )
77
 
78
  # -----------------------------
79
  # Systemprompt — verschärft
 
130
  context_str = format_context(docs)
131
 
132
  # 2. Prompt bauen
133
+ user_prompt = f"""
134
  FRAGE:
135
  {question}
136
 
 
149
 
150
  msgs = [
151
  SystemMessage(content=SYSTEM_PROMPT),
152
+ HumanMessage(content=user_prompt),
153
  ]
154
 
155
  # 3. LLM aufrufen
requirements.txt CHANGED
@@ -33,5 +33,6 @@ numpy
33
  torchaudio
34
  torch
35
 
36
- # OpenAI offizielle Bibliothek (kommt i.d.R. mit langchain-openai, zur Sicherheit explizit)
37
  openai
 
 
33
  torchaudio
34
  torch
35
 
36
+ # === OpenAI + HF Hub ===
37
  openai
38
+ huggingface_hub
upload_weblink_to_supabase.py CHANGED
@@ -11,13 +11,34 @@ SUPABASE_SERVICE_ROLE = os.environ["SUPABASE_SERVICE_ROLE"]
11
 
12
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
13
 
 
14
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
15
 
16
  def extract_paragraphs():
17
- print(">>> Lade Hochschulgesetz NRW …")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- html = requests.get(LAW_URL, timeout=30).text
20
- soup = BeautifulSoup(html, "html.parser")
21
 
22
  # Tất cả tiêu đề Paragraph xuất hiện trong <h2> hoặc <h3>
23
  headers = soup.find_all(["h2", "h3"])
@@ -25,45 +46,79 @@ def extract_paragraphs():
25
  paragraphs = []
26
  order = 1
27
 
28
- for header in headers:
29
- title = header.get_text(" ", strip=True)
30
 
31
- if not title.startswith("§"):
32
- continue # bỏ các h2/h3 không phải Paragraph
33
 
34
- # Gom toàn bộ nội dung từ header đến trước h2/h3 tiếp theo
35
- content_parts = []
36
- sibling = header.find_next_sibling()
37
 
38
- while sibling and sibling.name not in ["h2", "h3"]:
39
- text = sibling.get_text(" ", strip=True)
40
- if text:
41
- content_parts.append(text)
42
- sibling = sibling.find_next_sibling()
43
 
44
- full_content = "\n".join(content_parts).strip()
45
 
46
- para_id = f"para_{order}"
47
 
48
- paragraphs.append({
49
- "abs_id": para_id,
50
- "title": title,
51
- "content": full_content,
52
- "order_index": order
53
- })
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  order += 1
56
 
57
- print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
58
  return paragraphs
59
 
60
  def upload_to_supabase():
61
  paras = extract_paragraphs()
62
 
63
- print(">>> Clear table hg_nrw …")
64
  supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
65
 
66
- print(">>> Upload begin …")
67
  BATCH = 100
68
  for i in range(0, len(paras), BATCH):
69
  batch = paras[i:i+BATCH]
 
11
 
12
  supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
13
 
14
+ # URL CHÍNH THỨC – không dùng Druckversion
15
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
16
 
17
def extract_paragraphs():
    """
    Download the current Hochschulgesetz NRW from recht.nrw.de
    (br_text_anzeigen) and extract its paragraphs.

    A ``<p>``, ``<b>`` or ``<strong>`` element whose text starts with "§" is
    treated as a paragraph heading; the text of all following sibling
    elements up to the next such heading becomes the paragraph content.

    Returns:
        List of dicts with:
        - abs_id: para_1, para_2, ...
        - title: heading text, e.g. "§ 1 ...", "§ 2 ..."
        - content: text of the paragraph, one line per sibling element
        - order_index: running number, starting at 1

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    print(">>> Lade offizielles Hochschulgesetz NRW von recht.nrw.de …")

    resp = requests.get(LAW_URL, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Paragraph headings are often found in <p>, <b> or <strong>.
    heading_tags = ["p", "b", "strong"]

    def _is_heading(node):
        """Return True if *node* is a candidate tag whose text starts with "§"."""
        return (
            node.name in heading_tags
            and node.get_text(" ", strip=True).startswith("§")
        )

    paragraphs = []
    order = 1

    for tag in soup.find_all(heading_tags):
        title = tag.get_text(" ", strip=True)
        if not title.startswith("§"):
            continue

        # A <b>/<strong> heading nested inside a heading <p> matches twice
        # (outer and inner tag). Keep only the outermost match so the same
        # paragraph is not stored again with empty content.
        if any(_is_heading(parent) for parent in tag.find_parents(heading_tags)):
            continue

        # Collect everything between this heading and the next one.
        content_parts = []
        sibling = tag.find_next_sibling()
        while sibling and not _is_heading(sibling):
            txt = sibling.get_text(" ", strip=True)
            if txt:
                content_parts.append(txt)
            sibling = sibling.find_next_sibling()

        paragraphs.append(
            {
                "abs_id": f"para_{order}",
                "title": title,
                "content": "\n".join(content_parts).strip(),
                "order_index": order,
            }
        )
        order += 1

    print(f"✔ {len(paragraphs)} Paragraphen extrahiert.")
    return paragraphs
114
 
115
  def upload_to_supabase():
116
  paras = extract_paragraphs()
117
 
118
+ print(">>> Leere Tabelle hg_nrw …")
119
  supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
120
 
121
+ print(">>> Upload nach Supabase …")
122
  BATCH = 100
123
  for i in range(0, len(paras), BATCH):
124
  batch = paras[i:i+BATCH]