Nguyen5 commited on
Commit
bdc9240
·
1 Parent(s): 2102c78
Files changed (2) hide show
  1. app.py +0 -1
  2. upload_weblink_to_supabase.py +43 -101
app.py CHANGED
@@ -172,7 +172,6 @@ def create_ui():
172
 
173
  with gr.Blocks(
174
  title="Prüfungsrechts-Chatbot NRW",
175
- theme=gr.themes.Soft(),
176
  css="""
177
  .chatbot { min-height: 500px; }
178
  .viewer-frame { border-radius: 10px; border: 1px solid #e0e0e0; }
 
172
 
173
  with gr.Blocks(
174
  title="Prüfungsrechts-Chatbot NRW",
 
175
  css="""
176
  .chatbot { min-height: 500px; }
177
  .viewer-frame { border-radius: 10px; border: 1px solid #e0e0e0; }
upload_weblink_to_supabase.py CHANGED
@@ -1,14 +1,8 @@
1
- """
2
- upload_weblink_to_supabase.py
3
- Trích xuất và tải lên các paragraph từ trang web recht.nrw.de
4
- """
5
  import os
6
  import requests
7
- import re
8
  from bs4 import BeautifulSoup
9
  from supabase import create_client
10
  from dotenv import load_dotenv
11
- import time
12
 
13
  load_dotenv()
14
 
@@ -19,116 +13,64 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
19
 
20
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
21
 
22
- def clean_text(text):
23
- """Làm sạch và định dạng văn bản"""
24
- # Loại bỏ khoảng trắng thừa
25
- text = re.sub(r'\s+', ' ', text)
26
- # Chuẩn hóa dấu câu
27
- text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
28
- # Đảm bảo chữ cái đầu câu viết hoa
29
- sentences = text.split('. ')
30
- sentences = [s.strip().capitalize() for s in sentences if s.strip()]
31
- return '. '.join(sentences)
32
-
33
  def extract_paragraphs():
34
  print(">>> Lade Hochschulgesetz NRW …")
35
 
36
- headers = {
37
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
38
- }
39
-
40
- try:
41
- response = requests.get(LAW_URL, headers=headers, timeout=60)
42
- response.raise_for_status()
43
- except requests.RequestException as e:
44
- print(f"❌ Fehler beim Laden der Seite: {e}")
45
- return []
46
-
47
- html = response.text
48
  soup = BeautifulSoup(html, "html.parser")
49
 
50
- # Tìm tất cả các section chứa paragraph
 
 
51
  paragraphs = []
52
  order = 1
53
 
54
- # Tìm các phần có chứa § (paragraph symbol)
55
- pattern = re.compile(r'§\s*\d+')
56
-
57
- # Tìm tất cả các element chứa paragraph
58
- for element in soup.find_all(['p', 'div', 'td']):
59
- text = element.get_text(" ", strip=True)
60
-
61
- # Kiểm tra nếu có paragraph symbol
62
- if pattern.search(text):
63
- # Tách title và content
64
- lines = text.split('\n')
65
- title = lines[0].strip() if lines else ""
66
-
67
- # Lấy nội dung
68
- content = ""
69
- if len(lines) > 1:
70
- content = clean_text(" ".join(lines[1:]))
71
-
72
- # Nếu title chưa có §, thêm từ nội dung
73
- if '§' not in title and content:
74
- # Tìm § trong content để thêm vào title
75
- match = pattern.search(content)
76
- if match:
77
- title = match.group()
78
- # Xóa title khỏi content
79
- content = content.replace(title, "", 1).strip()
80
-
81
- # Tạo ID cho paragraph
82
- para_id = f"para_{order}"
83
-
84
- paragraphs.append({
85
- "abs_id": para_id,
86
- "title": title if title else f"§ {order}",
87
- "content": content if content else text,
88
- "order_index": order,
89
- "source_url": LAW_URL
90
- })
91
-
92
- order += 1
93
 
94
  print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
95
-
96
- # In ra mẫu để kiểm tra
97
- if paragraphs:
98
- print("\nBeispiel Paragraph 1:")
99
- print(f"Title: {paragraphs[0]['title']}")
100
- print(f"Content (Auszug): {paragraphs[0]['content'][:200]}...\n")
101
-
102
  return paragraphs
103
 
104
  def upload_to_supabase():
105
  paras = extract_paragraphs()
106
 
107
- if not paras:
108
- print("❌ Keine Paragraphs gefunden. Upload abgebrochen.")
109
- return
110
-
111
  print(">>> Clear table hg_nrw …")
112
- try:
113
- # Xóa toàn bộ dữ liệu cũ
114
- supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
115
- print("✔ Tabelle geleert.")
116
- except Exception as e:
117
- print(f"⚠️ Fehler beim Leeren der Tabelle: {e}")
118
-
119
- print(">>> Upload beginnt …")
120
- BATCH_SIZE = 50
121
-
122
- for i in range(0, len(paras), BATCH_SIZE):
123
- batch = paras[i:i+BATCH_SIZE]
124
- try:
125
- result = supabase.table("hg_nrw").upsert(batch).execute()
126
- print(f"✔ Batch {i//BATCH_SIZE + 1} hochgeladen ({len(batch)} Einträge)")
127
- time.sleep(0.1) # Tránh rate limiting
128
- except Exception as e:
129
- print(f"❌ Fehler beim Upload von Batch {i//BATCH_SIZE + 1}: {e}")
130
-
131
- print(f"✔ DONE - {len(paras)} Paragraphs erfolgreich hochgeladen.")
132
 
133
  if __name__ == "__main__":
134
- upload_to_supabase()
 
 
 
 
 
1
  import os
2
  import requests
 
3
  from bs4 import BeautifulSoup
4
  from supabase import create_client
5
  from dotenv import load_dotenv
 
6
 
7
  load_dotenv()
8
 
 
13
 
14
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
15
 
 
 
 
 
 
 
 
 
 
 
 
16
def extract_paragraphs():
    """Scrape the NRW Hochschulgesetz page and split it into paragraphs (§).

    Returns:
        list[dict]: One dict per paragraph with keys ``abs_id``, ``title``,
        ``content`` and ``order_index``. Empty list only if no ``§`` heading
        was found in the page.

    Raises:
        requests.HTTPError: if the law page responds with an error status.
        requests.RequestException: on network failure / timeout.
    """
    print(">>> Lade Hochschulgesetz NRW …")

    # Fail loudly on HTTP errors instead of silently parsing an error page
    # (the previous revision checked the status; keep that safety net).
    response = requests.get(LAW_URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Every paragraph title appears as an <h2> or <h3> heading.
    headers = soup.find_all(["h2", "h3"])

    paragraphs = []
    order = 1

    for header in headers:
        title = header.get_text(" ", strip=True)

        # Skip h2/h3 headings that are not paragraph titles.
        if not title.startswith("§"):
            continue

        # Collect all content from this heading up to the next h2/h3.
        content_parts = []
        sibling = header.find_next_sibling()
        while sibling and sibling.name not in ("h2", "h3"):
            text = sibling.get_text(" ", strip=True)
            if text:
                content_parts.append(text)
            sibling = sibling.find_next_sibling()

        paragraphs.append({
            "abs_id": f"para_{order}",
            "title": title,
            "content": "\n".join(content_parts).strip(),
            "order_index": order,
        })
        order += 1

    print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
    return paragraphs
59
 
60
def upload_to_supabase():
    """Replace the contents of the ``hg_nrw`` table with freshly scraped data.

    Scrapes the paragraphs first; if the scrape yields nothing, the upload
    is aborted BEFORE the destructive delete, so a transient scrape failure
    cannot wipe the existing table contents.
    """
    paras = extract_paragraphs()

    # Guard restored from the previous revision: never clear the table when
    # the scrape came back empty — that would destroy all existing rows.
    if not paras:
        print("❌ Keine Paragraphs gefunden. Upload abgebrochen.")
        return

    print(">>> Clear table hg_nrw …")
    # delete().neq("abs_id", "") matches every row, i.e. a full table clear.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Upload begin …")
    BATCH = 100  # rows per upsert request, to keep payloads small
    for i in range(0, len(paras), BATCH):
        batch = paras[i:i + BATCH]
        print(f" - Upload batch {i} – {i+len(batch)-1}")
        supabase.table("hg_nrw").upsert(batch).execute()

    print("✔ DONE uploading complete NRW law.")
 
 
 
 
 
 
 
 
 
 
74
 
75
# Script entry point: scrape the law text and push it to Supabase.
if __name__ == "__main__":
    upload_to_supabase()