Nguyen5 committed on
Commit
4da3e87
·
1 Parent(s): 3a9ed51
Files changed (1) hide show
  1. load_documents.py +106 -103
load_documents.py CHANGED
@@ -1,128 +1,131 @@
1
  """
2
- LOAD_DOCUMENTS SINGLE SOURCE OF TRUTH
3
- Nhiệm vụ:
4
- 1) Lade Prüfungsordnung PDF direkt aus Supabase-Storage.
5
- 2) Lade Hochschulgesetz NRW aus Supabase-Tabelle hg_nrw.
6
- 3) Cung cấp metadata đầy đủ để các file khác KHÔNG PHẢI tính lại URL.
 
 
7
  """
8
 
9
- import os
10
- import tempfile
11
- from dotenv import load_dotenv
12
  from langchain_community.document_loaders import PyPDFLoader
13
  from langchain_core.documents import Document
14
- from supabase import create_client
15
-
16
- load_dotenv()
17
-
18
- import urllib.parse
19
-
20
- # ===== Supabase config =====
21
- SUPABASE_URL = os.getenv("SUPABASE_URL")
22
- SUPABASE_SERVICE_ROLE = os.getenv("SUPABASE_SERVICE_ROLE")
23
-
24
- supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
25
-
26
- # ===== Storage Config =====
27
 
28
- #import urllib.parse
29
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- PDF_BUCKET = "File PDF"
32
- ENC_BUCKET = urllib.parse.quote(PDF_BUCKET) # "File%20PDF"
33
-
34
- #PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{PDF_BUCKET}/{PDF_FILE}"
35
- PDF_URL = f"{SUPABASE_URL}/storage/v1/object/public/{ENC_BUCKET}/{PDF_FILE}"
36
-
37
-
38
- # ===== Viewer URL =====
39
- HG_VIEWER_BUCKET = "hg_viewer"
40
- HG_VIEWER_FILE = "hg_clean.html"
41
- HG_VIEWER_URL = f"{SUPABASE_URL}/storage/v1/object/public/{HG_VIEWER_BUCKET}/{HG_VIEWER_FILE}"
42
-
43
-
44
- # ============================================================
45
- # 1) PDF aus Supabase laden
46
- # ============================================================
47
-
48
def load_pdf_from_supabase() -> list[Document]:
    """Download the Prüfungsordnung PDF from Supabase storage and load it page by page.

    Returns:
        One Document per PDF page.  Metadata on every page includes a deep
        link (``pdf_url``) into the public copy of the file plus the 0-based
        ``page`` index and the original ``filename``.

    Raises:
        ValueError: if the storage download returns no data.
    """
    print("📥 Lade Prüfungsordnung PDF aus Supabase...")

    response = supabase.storage.from_(PDF_BUCKET).download(PDF_FILE)
    if response is None:
        raise ValueError("❌ Konnte PDF nicht laden!")

    # PyPDFLoader needs a file path, so spill the downloaded bytes to disk.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(response)
        temp_pdf_path = tmp.name

    try:
        pages = PyPDFLoader(temp_pdf_path).load()
    finally:
        # BUG FIX: the temp file used to leak (delete=False, never removed);
        # clean it up once the pages are in memory.
        os.unlink(temp_pdf_path)

    for i, p in enumerate(pages):
        p.metadata = {
            "type": "pdf",
            "source": "Prüfungsordnung",
            "page": i,
            # BUG FIX: the #page= open-parameter is 1-based in PDF viewers,
            # while enumerate() is 0-based — #page=0 pointed nowhere.
            "pdf_url": f"{PDF_URL}#page={i + 1}",
            "filename": PDF_FILE,
        }

    print(f"✔ {len(pages)} PDF-Seiten geladen.")
    return pages
73
 
 
 
74
 
75
- # ============================================================
76
- # 2) HG aus Tabelle laden
77
- # ============================================================
78
 
79
def load_hg_from_supabase() -> list[Document]:
    """Fetch the Hochschulgesetz NRW paragraphs from the ``hg_nrw`` table.

    Rows are read in ``order_index`` order; each row becomes one Document
    whose metadata carries a deep link (``viewer_url``) into the public
    HTML viewer, anchored on the row's ``abs_id``.
    """
    print("📥 Lade Hochschulgesetz NRW aus Tabelle hg_nrw...")

    result = (
        supabase.table("hg_nrw")
        .select("*")
        .order("order_index", desc=False)
        .execute()
    )

    documents = []
    for record in result.data or []:
        anchor = record["abs_id"]
        documents.append(
            Document(
                page_content=record["content"],
                metadata={
                    "type": "hg",
                    "source": "Hochschulgesetz NRW",
                    "abs_id": anchor,
                    "title": record["title"],
                    "viewer_url": f"{HG_VIEWER_URL}#{anchor}",
                },
            )
        )

    print(f" {len(documents)} HG-Absätze geladen.")
    return documents
 
 
 
 
113
 
 
114
 
115
- # ============================================================
116
- # 3) ALLES LADEN
117
- # ============================================================
118
 
119
def load_all_documents():
    """Load everything: all PDF pages first, followed by the HG paragraphs."""
    return load_pdf_from_supabase() + load_hg_from_supabase()
123
 
 
 
 
124
 
125
if __name__ == "__main__":
    # Quick smoke test: load everything and show a sample of the metadata.
    docs = load_all_documents()
    print("📚 Gesamt:", len(docs))
    # BUG FIX: guard against an empty result before indexing docs[0],
    # which previously raised IndexError when nothing was loaded.
    if docs:
        print("🔎 Beispiel metadata:", docs[0].metadata)
 
1
  """
2
+ BƯỚC 1: LOAD DOCUMENTS
3
+ -----------------------
4
+ Debug-full version
5
+
6
+ - Lädt Prüfungsordnung (PDF) seitenweise.
7
+ - Lädt Hochschulgesetz NRW aus dem im Dataset gespeicherten HTML,
8
+ und zerlegt es in einzelne Absätze (Document pro <p>).
9
  """
10
 
11
+ from huggingface_hub import hf_hub_download, list_repo_files
 
 
12
  from langchain_community.document_loaders import PyPDFLoader
13
  from langchain_core.documents import Document
14
+ from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
+ DATASET = "Nguyen5/docs"
17
  PDF_FILE = "f10_bpo_ifb_tei_mif_wii_2021-01-04.pdf"
18
+ HTML_FILE = "Hochschulgesetz_NRW.html" # konsistent mit hg_nrw.py
19
+
20
def _load_hg_paragraph_documents(html_path: str):
    """Parse the Hochschulgesetz HTML into one Document per ``<p>`` element.

    Every non-empty paragraph yields a Document whose page_content is the
    paragraph text.  Metadata records the source label and the HTML
    filename; when the ``<p>`` tag carries an ``id`` attribute (e.g.
    ``hg_abs_12``) it is stored as ``paragraph_id`` for deep linking.
    """
    with open(html_path, "r", encoding="utf-8") as handle:
        markup = handle.read()

    paragraphs = BeautifulSoup(markup, "html.parser").find_all("p")

    docs = []
    for node in paragraphs:
        content = node.get_text(" ", strip=True)
        if not content:
            continue  # skip purely structural / empty paragraphs

        meta = {
            "source": "Hochschulgesetz NRW (HTML)",
            "filename": HTML_FILE,
        }
        anchor = node.get("id")
        if anchor:
            meta["paragraph_id"] = anchor

        docs.append(Document(page_content=content, metadata=meta))

    print(f"Loaded {len(docs)} paragraph Documents from HG-HTML.\n")
    return docs
54
 
55
def load_documents():
    """Load all source documents from the HuggingFace dataset.

    Downloads the Prüfungsordnung PDF (one Document per page) and the
    Hochschulgesetz NRW HTML (one Document per paragraph) and returns them
    as a single list.  Errors are reported via print and degrade gracefully:
    a PDF failure returns ``[]``; an HTML failure returns whatever PDF
    pages were already loaded.
    """
    print("=== START: load_documents() ===\n")

    # -------------------------
    # Check files in dataset
    # -------------------------
    print(">>> Checking dataset file list from HuggingFace...")
    # BUG FIX: this listing is purely diagnostic — a network hiccup here
    # used to crash the whole loader before any download was attempted.
    try:
        files = list_repo_files(DATASET, repo_type="dataset")
        print("Files in dataset:", files, "\n")
    except Exception as e:
        print("WARNING: could not list dataset files:", e)

    docs = []

    # -------------------------
    # Load PDF
    # -------------------------
    print(">>> Step 1: Download PDF from HuggingFace...")
    try:
        pdf_path = hf_hub_download(
            repo_id=DATASET,
            filename=PDF_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded PDF to local cache:\n{pdf_path}\n")
    except Exception as e:
        print("ERROR downloading PDF:", e)
        return []

    print(">>> Step 1.1: Loading PDF pages...")
    try:
        pdf_docs = PyPDFLoader(pdf_path).load()
        print(f"Loaded {len(pdf_docs)} PDF pages.\n")
    except Exception as e:
        print("ERROR loading PDF:", e)
        return []

    # Tag every page so downstream code can tell the two corpora apart.
    for d in pdf_docs:
        d.metadata["source"] = "Prüfungsordnung (PDF)"
        d.metadata["filename"] = PDF_FILE

    docs.extend(pdf_docs)

    # -------------------------
    # Load HTML (Hochschulgesetz NRW)
    # -------------------------
    print(">>> Step 2: Download HTML from HuggingFace...")
    try:
        html_path = hf_hub_download(
            repo_id=DATASET,
            filename=HTML_FILE,
            repo_type="dataset",
        )
        print(f"Downloaded HTML to local cache:\n{html_path}\n")
    except Exception as e:
        print("ERROR downloading HTML:", e)
        return docs

    print(">>> Step 2.1: Loading HG-HTML and splitting into paragraphs...")
    try:
        html_docs = _load_hg_paragraph_documents(html_path)
    except Exception as e:
        print("ERROR loading / parsing HTML:", e)
        return docs

    docs.extend(html_docs)

    print("=== DONE: load_documents() ===\n")
    return docs
 
122
 
123
if __name__ == "__main__":
    # Manual smoke test: run the full loading pipeline and show a sample.
    print("\n=== Running load_documents.py directly ===\n")

    docs = load_documents()
    print(f"\n>>> TOTAL documents loaded: {len(docs)}")

    if docs:
        print("\nExample metadata from 1st document:")
        print(docs[0].metadata)
131