""" ENA Chatbot — Scraper v4.0 (API-Based) يستخدم WordPress REST API + RSS Feed + Sitemap بدل الـ scraping التقليدي Run: python scraper_api.py """ from __future__ import annotations import json import re import time import xml.etree.ElementTree as ET from urllib.parse import urlparse, unquote from html import unescape import requests from bs4 import BeautifulSoup # ══════════════════════════════════════════════════════════ # ⚙️ CONFIG # ══════════════════════════════════════════════════════════ BASE = "https://www.ena.tn" API_ENDPOINTS = { "posts_ar": f"{BASE}/wp-json/wp/v2/posts?per_page=100&lang=ar", "posts_fr": f"{BASE}/wp-json/wp/v2/posts?per_page=100&lang=fr", "posts_all": f"{BASE}/wp-json/wp/v2/posts?per_page=100", "pages_ar": f"{BASE}/wp-json/wp/v2/pages?per_page=100&lang=ar", "pages_fr": f"{BASE}/wp-json/wp/v2/pages?per_page=100&lang=fr", "pages_all": f"{BASE}/wp-json/wp/v2/pages?per_page=100", } RSS_FEEDS = [ f"{BASE}/feed/", f"{BASE}/ar/feed/", f"{BASE}/fr/feed/", ] SITEMAPS = [ f"{BASE}/ar/wp-sitemap-posts-post-1.xml", f"{BASE}/ar/wp-sitemap-posts-page-1.xml", f"{BASE}/fr/wp-sitemap-posts-post-1.xml", f"{BASE}/fr/wp-sitemap-posts-page-1.xml", ] # صفحات مهمة نجيبها مباشرة بالـ scraping (ما تظهرش في الـ API) PRIORITY_PAGES = [ f"{BASE}/ar/concours-ar/cycle-superieur-arr/concours-entree-cycle-superieur-ar/", f"{BASE}/ar/concours-ar/informations-generales-ar/", f"{BASE}/ar/concours-ar/cycle-moyen-ar/entree-au-cycle-de-formation-des-cadres-moyens-ar/", f"{BASE}/ar/concours-ar/agents-categorie-a3-ar/", f"{BASE}/ar/preparation-au-concours-ar/", f"{BASE}/fr/concours/cycle-superieur/le-concours-dentree-au-cycle-superieur/", f"{BASE}/fr/concours/informations-generales/", f"{BASE}/fr/concours/cycle-moyen/concours-dentree-au-cycle-de-formation-des-cadres-moyens-de-la-sous-categorie-a2-2/", f"{BASE}/fr/concours/agents-de-la-sous-categorie-a3/", f"{BASE}/ar/formation-continue-ar/formation-continue-a-distance-et-presentielle-ar/", f"{BASE}/fr/formation-continue/formation-continue-a-distance-et-presentielle/", f"{BASE}/ar/inscription2026/", f"{BASE}/ar/ouverturefad2026/", f"{BASE}/ar/fad2026/", ] HEADERS = { "User-Agent": "Mozilla/5.0 (compatible; ENA-Chatbot/4.0; +https://www.ena.tn)", "Accept": "application/json, text/html", } CATS = { "/concours/": "concours_fr", "/concours-ar": "concours_ar", "/ar/concours": "concours_ar", "/formation/": "formation_fr", "/ar/formation": "formation_ar", "/formation-continue": "formation_continue", "/gouvernance/": "gouvernance", "/actualites/": "news_fr", "/actualites-ar/": "news_ar", "/leadership": "leadership", "/inscription": "inscription", "/fad": "fad", } SKIP_SLUGS = [ "page-dexemple", "sample-page", "politique-de-confidentialite", "shop", "cart", "checkout", "my-account", "woocommerce", "default-kit", "elementor", "log-file", ] # ══════════════════════════════════════════════════════════ # 🛠️ HELPERS # ══════════════════════════════════════════════════════════ CONTENT_CATS = { "concours_ar": ["مناظرة", "ترشح", "شروط الدخول", "بقاع", "اختبار", "مرحلة عليا", "مرحلة متوسطة", "أعوان"], "concours_fr": ["concours", "candidature", "cycle supérieur", "cycle moyen", "épreuve", "places"], "formation_continue": ["تكوين مستمر", "formation continue", "fad", "تكوين عن بعد"], "formation_ar": ["تكوين", "برنامج", "تأهيل", "cycle de formation"], "news_ar": ["إعلان", "بلاغ", "أخبار", "مستجدات", "إعلام"], "news_fr": ["actualité", "communiqué", "annonce", "information"], "inscription": ["تسجيل", "inscription", "2026"], "fad": ["fad", "تعليم عن بعد", "formation à distance"], } def get_category(url: str, text: str = "") -> str: ul = url.lower() # First try URL-based matching (most reliable) for p, c in CATS.items(): if p in ul: return c # Then try content-based matching for API pages if text: tl = text.lower() for cat, keywords in CONTENT_CATS.items(): if any(kw in tl for kw in keywords): return cat return "other" def get_lang(url: str) -> str: path = urlparse(url.lower()).path if "/ar/" in path or path.startswith("/ar"): return "ar" return "fr" def clean_html(html_text: str) -> str: """إزالة HTML tags وتنظيف النص""" if not html_text: return "" soup = BeautifulSoup(html_text, "html.parser") text = soup.get_text(" ", strip=True) text = unescape(text) text = re.sub(r"\s{3,}", " ", text) return text.strip() def should_skip(slug: str, title: str) -> bool: slug_lower = slug.lower() title_lower = title.lower() return any(s in slug_lower or s in title_lower for s in SKIP_SLUGS) # ══════════════════════════════════════════════════════════ # 📡 1. WordPress REST API # ══════════════════════════════════════════════════════════ def fetch_api(endpoint: str) -> list[dict]: """يجيب البيانات من WordPress API""" all_items = [] page = 1 while True: url = f"{endpoint}&page={page}" try: r = requests.get(url, headers=HEADERS, timeout=20) if r.status_code == 400: # No more pages break r.raise_for_status() items = r.json() if not items: break all_items.extend(items) # إذا أقل من 100 → آخر صفحة if len(items) < 100: break page += 1 time.sleep(0.5) # respectful delay except Exception as e: print(f" API error {url[:60]}: {e}") break return all_items def process_api_items(items: list[dict], content_type: str) -> list[dict]: """تحويل API items لصيغة موحّدة""" results = [] for item in items: slug = item.get("slug", "") title_raw = item.get("title", {}).get("rendered", "") title = clean_html(title_raw) content_raw = item.get("content", {}).get("rendered", "") excerpt_raw = item.get("excerpt", {}).get("rendered", "") link = item.get("link", "") date = item.get("date", "")[:10] # YYYY-MM-DD if should_skip(slug, title): continue # نجمع المحتوى الكامل content = clean_html(content_raw) if not content or len(content) < 50: content = clean_html(excerpt_raw) if not content or len(content) < 50: continue # نضيف العنوان في بداية المحتوى full_content = f"{title}\n\n{content}" if title else content results.append({ "page_name": unquote(slug), "url": link, "source": "ena.tn-api", "langue": get_lang(link), "category": get_category(link, full_content), "content_type": content_type, "date": date, "content": full_content, "chars": len(full_content), }) return results # ══════════════════════════════════════════════════════════ # 📰 2. RSS Feed # ══════════════════════════════════════════════════════════ def fetch_rss(feed_url: str) -> list[dict]: """يجيب آخر الأخبار من RSS""" results = [] try: r = requests.get(feed_url, headers=HEADERS, timeout=15) r.raise_for_status() # Clean potential weird characters at start content = r.content.strip() root = ET.fromstring(content) # RSS namespace ns = {"content": "http://purl.org/rss/1.0/modules/content/"} for item in root.findall(".//item"): title = item.findtext("title", "").strip() link = item.findtext("link", "").strip() desc = item.findtext("description", "") date = item.findtext("pubDate", "")[:16] # محتوى كامل إذا متوفر content_encoded = item.find("content:encoded", ns) if content_encoded is not None and content_encoded.text: content = clean_html(content_encoded.text) else: content = clean_html(desc) if not content or len(content) < 50: continue full_content = f"{title}\n\n{content}" if title else content slug = urlparse(link).path.strip("/").split("/")[-1] results.append({ "page_name": unquote(slug), "url": link, "source": "ena.tn-rss", "langue": get_lang(link), "category": "news_ar" if "/ar/" in link else "news_fr", "content_type": "news", "date": date, "content": full_content, "chars": len(full_content), }) except Exception as e: print(f" RSS error {feed_url}: {e}") return results # ══════════════════════════════════════════════════════════ # 🗺️ 3. Sitemap → Scrape important pages # ══════════════════════════════════════════════════════════ def fetch_sitemap_urls(sitemap_url: str) -> list[str]: """يجيب كل URLs من الـ sitemap""" urls = [] try: r = requests.get(sitemap_url, headers=HEADERS, timeout=15) r.raise_for_status() root = ET.fromstring(r.content) ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"} for loc in root.findall(".//sm:loc", ns): if loc.text: urls.append(loc.text.strip()) except Exception as e: print(f" Sitemap error {sitemap_url}: {e}") return urls def scrape_page(url: str) -> dict | None: """يجيب محتوى صفحة واحدة بالـ scraping""" try: r = requests.get(url, headers=HEADERS, timeout=20, allow_redirects=True) r.raise_for_status() soup = BeautifulSoup(r.text, "html.parser") # إزالة العناصر غير المفيدة for tag in soup(["script", "style", "nav", "footer", "header", "aside"]): tag.decompose() for tag in soup.find_all(class_=re.compile(r"breadcrumb|menu|sidebar|widget", re.I)): tag.decompose() text = soup.get_text(" ", strip=True) text = re.sub(r"\s{3,}", " ", text) if len(text) < 100: return None slug = urlparse(url).path.strip("/").split("/")[-1] return { "page_name": unquote(slug), "url": url, "source": "ena.tn-scrape", "langue": get_lang(url), "category": get_category(url), "content_type": "page", "date": "", "content": text, "chars": len(text), } except Exception as e: print(f" skip {url[:60]}: {e}") return None # ══════════════════════════════════════════════════════════ # 🚀 MAIN # ══════════════════════════════════════════════════════════ if __name__ == "__main__": print("=" * 60) print("Step: ENA Scraper v4.0 -- API + RSS + Sitemap") print("=" * 60) all_data: list[dict] = [] seen_urls: set[str] = set() seen_texts: set[str] = set() def add_unique(items: list[dict]): for item in items: url = item.get("url", "") text = item.get("content", "") if url not in seen_urls and text not in seen_texts and len(text) > 50: all_data.append(item) seen_urls.add(url) seen_texts.add(text) # ── 1. WordPress API ── print("\nStep 1: WordPress REST API...") for name, endpoint in API_ENDPOINTS.items(): print(f" Fetching {name}...") items = fetch_api(endpoint) processed = process_api_items(items, "post" if "posts" in name else "page") add_unique(processed) print(f" OK: {len(processed)} items from {name}") # ── 2. RSS Feed ── print("\nStep 2: RSS Feeds...") for feed_url in RSS_FEEDS: print(f" Fetching {feed_url}...") items = fetch_rss(feed_url) add_unique(items) print(f" OK: {len(items)} items from RSS") # ── 3. Sitemap URLs ── print("\nStep 3: Sitemap pages...") sitemap_urls = [] for sm in SITEMAPS: urls = fetch_sitemap_urls(sm) sitemap_urls.extend(urls) print(f" Found {len(urls)} URLs in {sm.split('/')[-1]}") # Scrape sitemap pages not already fetched new_urls = [u for u in sitemap_urls if u not in seen_urls] print(f" Scraping {len(new_urls)} new pages from sitemap...") for i, url in enumerate(new_urls): page = scrape_page(url) if page: add_unique([page]) if (i + 1) % 20 == 0: print(f" {i + 1}/{len(new_urls)} scraped...") time.sleep(0.3) # ── 4. Priority Pages ── print("\nStep 4: Priority pages (concours, conditions)...") priority_new = [u for u in PRIORITY_PAGES if u not in seen_urls] for url in priority_new: page = scrape_page(url) if page: add_unique([page]) print(f" OK: {page['page_name']}") time.sleep(0.3) # ── Stats ── print("\n" + "=" * 60) print(f"OK. Total pages: {len(all_data)}") print(f"Total characters: {sum(p['chars'] for p in all_data):,}") from collections import Counter cats = Counter(p["category"] for p in all_data) langs = Counter(p["langue"] for p in all_data) srcs = Counter(p["source"] for p in all_data) print("\nBy category:") for cat, count in cats.most_common(): print(f" {cat}: {count}") print("\nBy language:") for lang, count in langs.items(): print(f" {lang}: {count}") print("\nBy source:") for src, count in srcs.items(): print(f" {src}: {count}") # ── Save ── with open("ena_full_data.json", "w", encoding="utf-8") as f: json.dump(all_data, f, ensure_ascii=False, indent=2) print("\nSaved to ena_full_data.json") print("=" * 60) print("Done! Now run: python build_chroma.py")