Spaces:

Ines1994
/

ENA-Chatbot

Sleeping

App Files Files Community

Ines1994 committed on Apr 10

Commit

08db80a

verified ·

1 Parent(s): 15242ce

Upload 16 files

Browse files

Files changed (3) hide show

build_chroma.py +6 -6
ena_full_data.json +0 -0
scraper.py +167 -55

build_chroma.py CHANGED Viewed

@@ -13,13 +13,13 @@ load_dotenv()
 def build():
     # Load Data
     if not os.path.exists("ena_full_data.json"):
-        print("❌ Error: ena_full_data.json not found!")
         return
     with open("ena_full_data.json", "r", encoding="utf-8") as f:
         pages = json.load(f)
-    print(f"📄 Loaded {len(pages)} pages.")
     # Intelligent Chunking
     # We use specific separators to avoid breaking administrative lists (numbered items)
@@ -43,7 +43,7 @@ def build():
                 "content": chunk
             })
-    print(f"✅ Created {len(all_chunks)} chunks.")
     # Embeddings
     embeddings = HuggingFaceEmbeddings(
@@ -56,7 +56,7 @@ def build():
     client = chromadb.PersistentClient(path=CHROMA_PATH)
     try:
         client.delete_collection(COLLECTION_NAME)
-        print("🗑️ Old collection deleted.")
     except:
         pass
@@ -79,9 +79,9 @@ def build():
                 "category": c["category"]
             } for c in batch]
         )
-        print(f"📥 Inserted {min(i+BATCH_SIZE, len(all_chunks))}/{len(all_chunks)}")
-    print(f"🚀 Success! Total documents: {vector_store._collection.count()}")
 if __name__ == "__main__":
     build()

 def build():
     # Load Data
     if not os.path.exists("ena_full_data.json"):
+        print("Error: ena_full_data.json not found!")
         return
     with open("ena_full_data.json", "r", encoding="utf-8") as f:
         pages = json.load(f)
+    print(f"Loaded {len(pages)} pages.")
     # Intelligent Chunking
     # We use specific separators to avoid breaking administrative lists (numbered items)
                 "content": chunk
             })
+    print(f"Created {len(all_chunks)} chunks.")
     # Embeddings
     embeddings = HuggingFaceEmbeddings(
     client = chromadb.PersistentClient(path=CHROMA_PATH)
     try:
         client.delete_collection(COLLECTION_NAME)
+        print("Old collection deleted.")
     except:
         pass
                 "category": c["category"]
             } for c in batch]
         )
+        print(f"Inserted {min(i+BATCH_SIZE, len(all_chunks))}/{len(all_chunks)}")
+    print(f"Success! Total documents: {vector_store._collection.count()}")
 if __name__ == "__main__":
     build()

ena_full_data.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

scraper.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
-Crawl www.ena.tn (ar + fr seeds) and save structured text to ena_full_data.json.
-Run: pip install requests beautifulsoup4 && python scraper.py
 """
 from __future__ import annotations
@@ -8,12 +9,45 @@ import json
 import re
 from collections import deque
 from typing import Optional
-from urllib.parse import urljoin, urlparse
 import requests
 from bs4 import BeautifulSoup
-BASE_URLS = [
     "https://www.ena.tn/ar/",
     "https://www.ena.tn/fr/",
 ]
@@ -24,15 +58,36 @@ HEADERS = {
 }
 CATS = {
-    "/concours/": "concours_fr",
-    "/formation/": "formation_fr",
-    "/gouvernance/": "gouvernance",
-    "/ar/concours": "concours_ar",
-    "/ar/formation": "formation_ar",
-    "/ar/service": "service_ar",
-    "/actualites/": "news",
 }
 def _get_cat(u: str) -> str:
     ul = u.lower()
@@ -54,32 +109,60 @@ def normalize_url(url: str) -> Optional[str]:
     host = p.netloc.lower()
     if "ena.tn" not in host:
         return None
     path = p.path or "/"
     query = f"?{p.query}" if p.query else ""
     return f"https://{host}{path}{query}"
-def _should_skip_href(url: str) -> bool:
-    low = url.lower().split("?")[0]
-    return bool(
-        re.search(r"\.(pdf|jpg|jpeg|png|gif|zip|css|js|ico|svg|woff2?)(\?|$)", low)
-    )
 def page_lang(url: str) -> str:
-    u = url.lower()
-    if "/ar/" in u or "/ar" in urlparse(u).path[:5]:
         return "ar"
-    if "/fr/" in u or "/fr" in urlparse(u).path[:5]:
         return "fr"
     return "fr"
-def extract_text(soup: BeautifulSoup) -> str:
-    for t in soup(["script", "style", "nav", "footer", "header"]):
-        t.decompose()
-    return soup.get_text(" ", strip=True)
 def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
     all_data: list[dict] = []
@@ -88,48 +171,52 @@ def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
     for base in base_list:
         nu = normalize_url(base)
-        if nu:
             queue.append((nu, 0))
     while queue:
         url, depth = queue.popleft()
-        if url in visited:
-            continue
-        if depth > max_depth:
             continue
         visited.add(url)
         try:
-            r = requests.get(url, headers=HEADERS, timeout=25)
             r.raise_for_status()
         except (requests.RequestException, OSError) as e:
-            print(f"skip {url}: {e}")
             continue
         ctype = (r.headers.get("Content-Type") or "").lower()
-        if "html" not in ctype and "xml" not in ctype:
             continue
         soup = BeautifulSoup(r.text, "html.parser")
         text = extract_text(soup)
-        if len(text) < 50:
             continue
-        path = urlparse(url).path.strip("/")
-        page_name = path.split("/")[-1] if path else "home"
-        all_data.append(
-            {
-                "page_name": page_name,
-                "url": url,
-                "source": "ena.tn",
-                "langue": page_lang(url),
-                "category": _get_cat(url),
-                "content": text,
-                "chars": len(text),
-            }
-        )
         if depth < max_depth:
             for a in soup.find_all("a", href=True):
                 href = (a.get("href") or "").strip()
@@ -137,29 +224,54 @@ def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
                     continue
                 next_u = urljoin(url, href)
                 nu = normalize_url(next_u)
-                if not nu or _should_skip_href(nu):
-                    continue
-                if nu not in visited:
                     queue.append((nu, depth + 1))
     return all_data
 if __name__ == "__main__":
-    print("Crawling ENA website (ar + fr seeds)...\n")
     all_data = crawl(BASE_URLS, max_depth=3)
     unique: list[dict] = []
     seen_texts: set[str] = set()
     for page in all_data:
-        if page["content"] not in seen_texts:
             unique.append(page)
             seen_texts.add(page["content"])
     with open("ena_full_data.json", "w", encoding="utf-8") as f:
         json.dump(unique, f, ensure_ascii=False, indent=2)
-    print(f"\nPages: {len(unique)}")
-    print(f"Characters: {sum(p['chars'] for p in unique):,}")
-    print("Saved to ena_full_data.json")

 """
+ENA Chatbot — Scraper v3.0 Final
+Crawl www.ena.tn (ar + fr) and save structured text to ena_full_data.json.
+Run: python scraper.py
 """
 from __future__ import annotations
 import re
 from collections import deque
 from typing import Optional
+from urllib.parse import urljoin, urlparse, unquote
 import requests
 from bs4 import BeautifulSoup
+# ══════════════════════════════════════════════════════════
+# ⚙️ CONFIG
+# ══════════════════════════════════════════════════════════
+# Pages known to load reliably — seeded first, ahead of the general crawl
+PRIORITY_URLS = [
+    # Competitive exams (concours) — Arabic
+    "https://www.ena.tn/ar/concours-ar/cycle-superieur-arr/concours-entree-cycle-superieur-ar/",
+    "https://www.ena.tn/ar/concours-ar/informations-generales-ar/",
+    "https://www.ena.tn/ar/concours-ar/cycle-moyen-ar/entree-au-cycle-de-formation-des-cadres-moyens-ar/",
+    "https://www.ena.tn/ar/concours-ar/agents-categorie-a3-ar/",
+    "https://www.ena.tn/ar/preparation-au-concours-ar/",
+    # Competitive exams (concours) — French
+    "https://www.ena.tn/fr/concours/cycle-superieur/le-concours-dentree-au-cycle-superieur/",
+    "https://www.ena.tn/fr/concours/informations-generales/",
+    "https://www.ena.tn/fr/concours/cycle-moyen/concours-dentree-au-cycle-de-formation-des-cadres-moyens-de-la-sous-categorie-a2-2/",
+    "https://www.ena.tn/fr/concours/agents-de-la-sous-categorie-a3/",
+    "https://www.ena.tn/fr/concours/cycle-superieur/preparation-au-concours/",
+    # Continuing education — Arabic
+    "https://www.ena.tn/ar/formation-continue-ar/formation-continue-a-distance-et-presentielle-ar/",
+    "https://www.ena.tn/ar/formation-continue-ar/developpement-de-competences-ar/",
+    # Continuing education — French
+    "https://www.ena.tn/fr/formation-continue/formation-continue-a-distance-et-presentielle/",
+    # Updates and news
+    "https://www.ena.tn/ar/actualites-ar/",
+    "https://www.ena.tn/fr/actualites-fr/",
+    "https://www.ena.tn/ar/%d9%85%d8%b3%d8%aa%d8%ac%d8%af%d8%a7%d8%aa/",
+    # Important 2026 pages
+    "https://www.ena.tn/ar/inscription2026/",
+    "https://www.ena.tn/ar/ouverturefad2026/",
+    "https://www.ena.tn/ar/fad2026/",
+]
+BASE_URLS = PRIORITY_URLS + [
     "https://www.ena.tn/ar/",
     "https://www.ena.tn/fr/",
 ]
 }
 CATS = {  # key -> category label; presumably matched against the lowercased URL by _get_cat (body not visible here) — confirm
+    "/concours/":         "concours_fr",
+    "/concours-ar":       "concours_ar",
+    "/ar/concours":       "concours_ar",
+    "/formation/":        "formation_fr",
+    "/ar/formation":      "formation_ar",
+    "/formation-continue":"formation_continue",  # NOTE(review): Arabic URLs such as /ar/formation-continue-ar/ also contain "/ar/formation" above — confirm which key is meant to win
+    "/gouvernance/":      "gouvernance",
+    "/ar/service":        "service_ar",
+    "/actualites/":       "news_fr",
+    "/actualites-fr/":    "news_fr",
+    "/actualites-ar/":    "news_ar",
+    "/evenement":         "news_fr",
+    "/evenement-ar":      "news_ar",  # NOTE(review): any URL containing this key also contains "/evenement" above — confirm _get_cat's match order
+    "/leadership":        "leadership",
+    "/inscription":       "inscription",
+    "/fad":               "fad",
 }
+# URL fragments to ignore (pages with no useful content); _should_skip matches them as substrings of the lowercased URL
+SKIP_PATTERNS = [
+    "wp-admin", "wp-login", "wp-json", "xmlrpc",
+    "woocommerce", "cart", "checkout", "my-account",
+    "politique-de-confidentialite", "page-d-exemple",
+    "elementor", "gravatar", "automattic",
+    "log_file", "attachment",
+]
+# ══════════════════════════════════════════════════════════
+# 🛠️ HELPERS
+# ══════════════════════════════════════════════════════════
 def _get_cat(u: str) -> str:
     ul = u.lower()
     host = p.netloc.lower()
     if "ena.tn" not in host:
         return None
+    # تجاهل روابط IP الداخلية
+    if host.startswith("193."):
+        return None
     path = p.path or "/"
     query = f"?{p.query}" if p.query else ""
     return f"https://{host}{path}{query}"
def _should_skip(url: str) -> bool:
    """Return True when *url* should not be fetched by the crawler.

    A URL is skipped when its path ends in a static-asset or media file
    extension, or when the lowercased URL contains any entry of
    ``SKIP_PATTERNS`` as a substring.

    The extension test runs on the parsed path only, so neither a query
    string nor a ``#fragment`` following the file name can hide the
    extension.
    """
    low = url.lower()
    # Skip static assets and downloadable files.
    if re.search(
        r"\.(pdf|jpg|jpeg|png|gif|zip|css|js|ico|svg|woff2?|txt|mp4|mp3)$",
        urlparse(low).path,
    ):
        return True
    # Skip pages that carry no useful content.
    return any(pattern in low for pattern in SKIP_PATTERNS)
 def page_lang(url: str) -> str:
     """Return the language code (``"ar"`` or ``"fr"``) implied by *url*'s path.

     The first path segment that is exactly ``ar`` or ``fr`` (compared
     case-insensitively) decides the language. Whole segments are compared,
     so slugs that merely begin with those letters, such as ``/archives/``,
     are not mistaken for the Arabic section. URLs with no language segment
     default to ``"fr"``.
     """
     for segment in urlparse(url.lower()).path.split("/"):
         if segment in ("ar", "fr"):
             return segment
     return "fr"
def get_page_name(url: str) -> str:
    """Return the last path segment of *url*, percent-decoded.

    Percent-encoded UTF-8 (for example Arabic slugs) is decoded to readable
    text. A URL whose path is empty or just ``/`` yields ``"home"``.
    """
    path = urlparse(url).path.strip("/")
    if not path:
        return "home"
    # unquote() decodes with errors="replace", so it does not raise on str
    # input; no fallback branch is needed.
    return unquote(path.rsplit("/", 1)[-1])
def extract_text(soup: BeautifulSoup) -> str:
    """Return the text of *soup* with scripts and navigation chrome removed.

    Mutates *soup*: script/style/nav/footer/header/aside elements, and any
    element whose class attribute matches ``breadcrumb``, ``menu`` or
    ``sidebar`` (case-insensitive), are decomposed before the text is
    collected.
    """
    chrome_tags = ["script", "style", "nav", "footer", "header", "aside"]
    for element in soup.find_all(chrome_tags):
        element.decompose()

    # Drop breadcrumbs, menus and sidebars identified by their CSS class.
    chrome_class = re.compile(r"breadcrumb|menu|sidebar", re.I)
    for element in soup.find_all(class_=chrome_class):
        element.decompose()

    # Shrink runs of three or more whitespace characters to two spaces.
    collected = soup.get_text(" ", strip=True)
    return re.sub(r"\s{3,}", "  ", collected)
+# ══════════════════════════════════════════════════════════
+# 🕷️ CRAWLER
+# ══════════════════════════════════════════════════════════
 def crawl(base_list: list[str], max_depth: int = 3) -> list[dict]:
     all_data: list[dict] = []
     for base in base_list:
         nu = normalize_url(base)
+        if nu and not _should_skip(nu):
             queue.append((nu, 0))
+    total_fetched = 0
     while queue:
         url, depth = queue.popleft()
+        if url in visited or depth > max_depth:
             continue
         visited.add(url)
         try:
+            r = requests.get(url, headers=HEADERS, timeout=25, allow_redirects=True)
             r.raise_for_status()
         except (requests.RequestException, OSError) as e:
+            print(f"  skip {url[:60]}: {e}")
             continue
         ctype = (r.headers.get("Content-Type") or "").lower()
+        if "html" not in ctype:
             continue
         soup = BeautifulSoup(r.text, "html.parser")
         text = extract_text(soup)
+        if len(text) < 100:
             continue
+        page_name = get_page_name(url)
+        category  = _get_cat(url)
+        lang      = page_lang(url)
+        all_data.append({
+            "page_name": page_name,
+            "url":       url,
+            "source":    "ena.tn",
+            "langue":    lang,
+            "category":  category,
+            "content":   text,
+            "chars":     len(text),
+        })
+        total_fetched += 1
+        if total_fetched % 20 == 0:
+            print(f"  {total_fetched} pages fetched...")
+        # تابع الـ links إذا ما وصلناش للعمق الأقصى
         if depth < max_depth:
             for a in soup.find_all("a", href=True):
                 href = (a.get("href") or "").strip()
                     continue
                 next_u = urljoin(url, href)
                 nu = normalize_url(next_u)
+                if nu and not _should_skip(nu) and nu not in visited:
                     queue.append((nu, depth + 1))
     return all_data
+# ══════════════════════════════════════════════════════════
+# 🚀 MAIN
+# ══════════════════════════════════════════════════════════
 if __name__ == "__main__":
     # Script entry point: crawl the seed URLs, drop duplicate pages, print
     # summary statistics, then write the result to ena_full_data.json.
     from collections import Counter

     banner = "=" * 60
     print(banner)
     print("ENA Scraper v3.0 -- Starting crawl...")
     print(f"   Priority URLs: {len(PRIORITY_URLS)}")
     print(banner)

     all_data = crawl(BASE_URLS, max_depth=3)

     # Remove duplicates: keep a page only if neither its text nor its URL
     # has been kept already.
     unique: list[dict] = []
     seen_texts: set[str] = set()
     seen_urls: set[str] = set()
     for record in all_data:
         if record["content"] in seen_texts or record["url"] in seen_urls:
             continue
         unique.append(record)
         seen_texts.add(record["content"])
         seen_urls.add(record["url"])

     # Statistics
     print("\n" + banner)
     print(f"OK. Pages collected: {len(unique)}")
     print(f"Total characters: {sum(p['chars'] for p in unique):,}")

     by_category = Counter(p["category"] for p in unique)
     by_language = Counter(p["langue"] for p in unique)

     print("\nBy category:")
     for cat, count in by_category.most_common():
         print(f"   {cat}: {count}")

     print("\nBy language:")
     for lang, count in by_language.items():
         print(f"   {lang}: {count}")

     # Save
     with open("ena_full_data.json", "w", encoding="utf-8") as out:
         json.dump(unique, out, ensure_ascii=False, indent=2)

     print("\nSaved to ena_full_data.json")
     print(banner)
     print("Done! Now run: python build_chroma.py")