Spaces:

Ines1994
/

ENA-Chatbot

Sleeping

App Files Files Community

Ines1994 committed on Apr 11

Commit

8dcf3a7

verified ·

1 Parent(s): 70a9baf

Upload 3 files

Browse files

Files changed (3) hide show

build_chroma.py +3 -4
ena_full_data.json +0 -0
scraper_api.py +20 -2

build_chroma.py CHANGED Viewed

@@ -21,11 +21,10 @@ def build():
     print(f"Loaded {len(pages)} pages.")
-    # Intelligent Chunking
-    # We use specific separators to avoid breaking administrative lists (numbered items)
     text_splitter = RecursiveCharacterTextSplitter(
-        chunk_size=500,
-        chunk_overlap=100,
         separators=["\n\n", "\n", " - ", " * ", ". ", " "]
     )

     print(f"Loaded {len(pages)} pages.")
+    # Intelligent Chunking — larger chunks to keep legal lists intact
     text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=1000,   # was 500 — doubled to avoid splitting condition lists
+        chunk_overlap=200, # was 100 — increased to preserve context between chunks
         separators=["\n\n", "\n", " - ", " * ", ". ", " "]
     )

ena_full_data.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

scraper_api.py CHANGED Viewed

@@ -91,11 +91,29 @@ SKIP_SLUGS = [
 # 🛠️ HELPERS
 # ══════════════════════════════════════════════════════════
-def get_category(url: str) -> str:
     ul = url.lower()
     for p, c in CATS.items():
         if p in ul:
             return c
     return "other"
 def get_lang(url: str) -> str:
@@ -180,7 +198,7 @@ def process_api_items(items: list[dict], content_type: str) -> list[dict]:
             "url": link,
             "source": "ena.tn-api",
             "langue": get_lang(link),
-            "category": get_category(link),
             "content_type": content_type,
             "date": date,
             "content": full_content,

 # 🛠️ HELPERS
 # ══════════════════════════════════════════════════════════
# Keyword lists used for content-based categorisation.
#
# get_category() lowercases the page text and returns the FIRST category
# (in insertion order) that has any keyword occurring as a substring, so:
#   * every keyword here must already be lowercase;
#   * the order of the entries is significant — more specific categories
#     have to come before broader ones that share a substring.
CONTENT_CATS = {
    "concours_ar": [
        "مناظرة", "ترشح", "شروط الدخول", "بقاع",
        "اختبار", "مرحلة عليا", "مرحلة متوسطة", "أعوان",
    ],
    "concours_fr": [
        "concours", "candidature", "cycle supérieur",
        "cycle moyen", "épreuve", "places",
    ],
    # Must stay ahead of "formation_ar": the generic keyword "تكوين" there is
    # a substring of the more specific "تكوين مستمر" / "تكوين عن بعد" here.
    "formation_continue": [
        "تكوين مستمر", "formation continue", "fad", "تكوين عن بعد",
    ],
    "formation_ar": [
        "تكوين", "برنامج", "تأهيل", "cycle de formation",
    ],
    "news_ar": [
        "إعلان", "بلاغ", "أخبار", "مستجدات", "إعلام",
    ],
    "news_fr": [
        "actualité", "communiqué", "annonce", "information",
    ],
    # NOTE(review): "2026" is a bare year, so any text mentioning it that did
    # not match an earlier category is labelled "inscription" — confirm that
    # this is intended.
    "inscription": [
        "تسجيل", "inscription", "2026",
    ],
    # NOTE(review): the keyword "fad" is also listed under
    # "formation_continue", which is checked first, so text containing "fad"
    # never reaches this entry through that keyword — confirm the intended
    # priority between the two categories.
    "fad": [
        "fad", "تعليم عن بعد", "formation à distance",
    ],
}
def get_category(url: str, text: str = "") -> str:
    """Return a category label for a page, from its URL or else its text.

    The lowercased URL is checked against the keys of ``CATS`` first; the
    value of the first key found as a substring is returned. If no URL
    pattern matches and ``text`` is non-empty, the lowercased text is
    checked against the keyword lists in ``CONTENT_CATS`` and the first
    category with any keyword present as a substring is returned.

    Args:
        url: Page URL; matched case-insensitively against ``CATS`` keys.
        text: Optional page content used as a fallback signal. Omitting it
            (or passing an empty string) limits matching to the URL.

    Returns:
        The matching category label, or ``"other"`` when nothing matches.
    """
    lowered_url = url.lower()

    # URL patterns take priority over anything found in the content.
    for pattern, label in CATS.items():
        if pattern in lowered_url:
            return label

    # Without content there is nothing left to try.
    if not text:
        return "other"

    # Content fallback: the first category (in dict order) with a keyword
    # hit wins.
    lowered_text = text.lower()
    for label, keywords in CONTENT_CATS.items():
        for keyword in keywords:
            if keyword in lowered_text:
                return label

    return "other"
 def get_lang(url: str) -> str:
             "url": link,
             "source": "ena.tn-api",
             "langue": get_lang(link),
+            "category": get_category(link, full_content),
             "content_type": content_type,
             "date": date,
             "content": full_content,