Nguyen5 commited on
Commit
bdc9240
·
1 Parent(s): 2102c78
Files changed (2) hide show
  1. app.py +0 -1
  2. upload_weblink_to_supabase.py +43 -101
app.py CHANGED
@@ -172,7 +172,6 @@ def create_ui():
172
 
173
  with gr.Blocks(
174
  title="Prüfungsrechts-Chatbot NRW",
175
- theme=gr.themes.Soft(),
176
  css="""
177
  .chatbot { min-height: 500px; }
178
  .viewer-frame { border-radius: 10px; border: 1px solid #e0e0e0; }
 
172
 
173
  with gr.Blocks(
174
  title="Prüfungsrechts-Chatbot NRW",
 
175
  css="""
176
  .chatbot { min-height: 500px; }
177
  .viewer-frame { border-radius: 10px; border: 1px solid #e0e0e0; }
upload_weblink_to_supabase.py CHANGED
@@ -1,14 +1,8 @@
1
- """
2
- upload_weblink_to_supabase.py
3
- Trích xuất và tải lên các paragraph từ trang web recht.nrw.de
4
- """
5
  import os
6
  import requests
7
- import re
8
  from bs4 import BeautifulSoup
9
  from supabase import create_client
10
  from dotenv import load_dotenv
11
- import time
12
 
13
  load_dotenv()
14
 
@@ -19,116 +13,64 @@ supabase = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE)
19
 
20
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
21
 
22
- def clean_text(text):
23
- """Làm sạch và định dạng văn bản"""
24
- # Loại bỏ khoảng trắng thừa
25
- text = re.sub(r'\s+', ' ', text)
26
- # Chuẩn hóa dấu câu
27
- text = re.sub(r'\s*([.,;:!?])\s*', r'\1 ', text)
28
- # Đảm bảo chữ cái đầu câu viết hoa
29
- sentences = text.split('. ')
30
- sentences = [s.strip().capitalize() for s in sentences if s.strip()]
31
- return '. '.join(sentences)
32
-
33
  def extract_paragraphs():
34
  print(">>> Lade Hochschulgesetz NRW …")
35
 
36
- headers = {
37
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
38
- }
39
-
40
- try:
41
- response = requests.get(LAW_URL, headers=headers, timeout=60)
42
- response.raise_for_status()
43
- except requests.RequestException as e:
44
- print(f"❌ Fehler beim Laden der Seite: {e}")
45
- return []
46
-
47
- html = response.text
48
  soup = BeautifulSoup(html, "html.parser")
49
 
50
- # Tìm tất cả các section chứa paragraph
 
 
51
  paragraphs = []
52
  order = 1
53
 
54
- # Tìm các phần có chứa § (paragraph symbol)
55
- pattern = re.compile(r'§\s*\d+')
56
-
57
- # Tìm tất cả các element chứa paragraph
58
- for element in soup.find_all(['p', 'div', 'td']):
59
- text = element.get_text(" ", strip=True)
60
-
61
- # Kiểm tra nếu có paragraph symbol
62
- if pattern.search(text):
63
- # Tách title và content
64
- lines = text.split('\n')
65
- title = lines[0].strip() if lines else ""
66
-
67
- # Lấy nội dung
68
- content = ""
69
- if len(lines) > 1:
70
- content = clean_text(" ".join(lines[1:]))
71
-
72
- # Nếu title chưa có §, thêm từ nội dung
73
- if '§' not in title and content:
74
- # Tìm § trong content để thêm vào title
75
- match = pattern.search(content)
76
- if match:
77
- title = match.group()
78
- # Xóa title khỏi content
79
- content = content.replace(title, "", 1).strip()
80
-
81
- # Tạo ID cho paragraph
82
- para_id = f"para_{order}"
83
-
84
- paragraphs.append({
85
- "abs_id": para_id,
86
- "title": title if title else f"§ {order}",
87
- "content": content if content else text,
88
- "order_index": order,
89
- "source_url": LAW_URL
90
- })
91
-
92
- order += 1
93
 
94
  print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
95
-
96
- # In ra mẫu để kiểm tra
97
- if paragraphs:
98
- print("\nBeispiel Paragraph 1:")
99
- print(f"Title: {paragraphs[0]['title']}")
100
- print(f"Content (Auszug): {paragraphs[0]['content'][:200]}...\n")
101
-
102
  return paragraphs
103
 
104
  def upload_to_supabase():
105
  paras = extract_paragraphs()
106
 
107
- if not paras:
108
- print("❌ Keine Paragraphs gefunden. Upload abgebrochen.")
109
- return
110
-
111
  print(">>> Clear table hg_nrw …")
112
- try:
113
- # Xóa toàn bộ dữ liệu cũ
114
- supabase.table("hg_nrw").delete().neq("abs_id", "").execute()
115
- print("✔ Tabelle geleert.")
116
- except Exception as e:
117
- print(f"⚠️ Fehler beim Leeren der Tabelle: {e}")
118
-
119
- print(">>> Upload beginnt …")
120
- BATCH_SIZE = 50
121
-
122
- for i in range(0, len(paras), BATCH_SIZE):
123
- batch = paras[i:i+BATCH_SIZE]
124
- try:
125
- result = supabase.table("hg_nrw").upsert(batch).execute()
126
- print(f"✔ Batch {i//BATCH_SIZE + 1} hochgeladen ({len(batch)} Einträge)")
127
- time.sleep(0.1) # Tránh rate limiting
128
- except Exception as e:
129
- print(f"❌ Fehler beim Upload von Batch {i//BATCH_SIZE + 1}: {e}")
130
-
131
- print(f"✔ DONE - {len(paras)} Paragraphs erfolgreich hochgeladen.")
132
 
133
  if __name__ == "__main__":
134
- upload_to_supabase()
 
 
 
 
 
1
  import os
2
  import requests
 
3
  from bs4 import BeautifulSoup
4
  from supabase import create_client
5
  from dotenv import load_dotenv
 
6
 
7
  load_dotenv()
8
 
 
13
 
14
  LAW_URL = "https://recht.nrw.de/lmi/owa/br_text_anzeigen?v_id=10000000000000000654"
15
 
 
 
 
 
 
 
 
 
 
 
 
16
def extract_paragraphs():
    """Scrape the NRW Hochschulgesetz page and split it into paragraphs (§).

    Returns:
        list[dict]: One dict per paragraph with keys ``abs_id``, ``title``,
        ``content`` and ``order_index``. Empty list only if no ``§`` heading
        was found in the page.

    Raises:
        requests.HTTPError: if the law page responds with an error status.
        requests.RequestException: on network failure / timeout.
    """
    print(">>> Lade Hochschulgesetz NRW …")

    # Fail loudly on HTTP errors instead of silently parsing an error page
    # (the previous revision checked the status; keep that safety net).
    response = requests.get(LAW_URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Every paragraph title appears as an <h2> or <h3> heading.
    headers = soup.find_all(["h2", "h3"])

    paragraphs = []
    order = 1

    for header in headers:
        title = header.get_text(" ", strip=True)

        # Skip h2/h3 headings that are not paragraph titles.
        if not title.startswith("§"):
            continue

        # Collect all content from this heading up to the next h2/h3.
        content_parts = []
        sibling = header.find_next_sibling()
        while sibling and sibling.name not in ("h2", "h3"):
            text = sibling.get_text(" ", strip=True)
            if text:
                content_parts.append(text)
            sibling = sibling.find_next_sibling()

        paragraphs.append({
            "abs_id": f"para_{order}",
            "title": title,
            "content": "\n".join(content_parts).strip(),
            "order_index": order,
        })
        order += 1

    print(f"✔ Extracted {len(paragraphs)} paragraphs (§).")
    return paragraphs
59
 
60
def upload_to_supabase():
    """Replace the contents of the ``hg_nrw`` table with freshly scraped data.

    Scrapes the paragraphs first; if the scrape yields nothing, the upload
    is aborted BEFORE the destructive delete, so a transient scrape failure
    cannot wipe the existing table contents.
    """
    paras = extract_paragraphs()

    # Guard restored from the previous revision: never clear the table when
    # the scrape came back empty — that would destroy all existing rows.
    if not paras:
        print("❌ Keine Paragraphs gefunden. Upload abgebrochen.")
        return

    print(">>> Clear table hg_nrw …")
    # delete().neq("abs_id", "") matches every row, i.e. a full table clear.
    supabase.table("hg_nrw").delete().neq("abs_id", "").execute()

    print(">>> Upload begin …")
    BATCH = 100  # rows per upsert request, to keep payloads small
    for i in range(0, len(paras), BATCH):
        batch = paras[i:i + BATCH]
        print(f" - Upload batch {i} – {i+len(batch)-1}")
        supabase.table("hg_nrw").upsert(batch).execute()

    print("✔ DONE uploading complete NRW law.")
 
 
 
 
 
 
 
 
 
 
74
 
75
# Script entry point: scrape the law text and push it to Supabase.
if __name__ == "__main__":
    upload_to_supabase()