# Spaces: Sleeping  (hosting-page status text captured with the file; not program code)
"""
ENA Chatbot — Scraper v4.0 (API-Based)
Uses the WordPress REST API + RSS feed + sitemap instead of traditional scraping.
Run: python scraper_api.py
"""
from __future__ import annotations

import json
import re
import time
import xml.etree.ElementTree as ET
from email.utils import parsedate_to_datetime
from html import unescape
from urllib.parse import urlparse, unquote

import requests
from bs4 import BeautifulSoup
# ══════════════════════════════════════════════════════════
# ⚙️ CONFIG
# ══════════════════════════════════════════════════════════
# Site root; every URL below is built from it.
BASE = "https://www.ena.tn"

# WordPress REST API collections to crawl, keyed by a label whose prefix
# ("posts" / "pages") tells the main script which content type to record.
# The `lang` query parameter is presumably handled by the site's
# multilingual plugin — TODO confirm.
API_ENDPOINTS = {
    "posts_ar": f"{BASE}/wp-json/wp/v2/posts?per_page=100&lang=ar",
    "posts_fr": f"{BASE}/wp-json/wp/v2/posts?per_page=100&lang=fr",
    "posts_all": f"{BASE}/wp-json/wp/v2/posts?per_page=100",
    "pages_ar": f"{BASE}/wp-json/wp/v2/pages?per_page=100&lang=ar",
    "pages_fr": f"{BASE}/wp-json/wp/v2/pages?per_page=100&lang=fr",
    "pages_all": f"{BASE}/wp-json/wp/v2/pages?per_page=100",
}

# RSS feeds polled for recent news items.
RSS_FEEDS = [
    f"{BASE}/feed/",
    f"{BASE}/ar/feed/",
    f"{BASE}/fr/feed/",
]

# XML sitemaps whose <loc> entries are scraped when not already collected.
SITEMAPS = [
    f"{BASE}/ar/wp-sitemap-posts-post-1.xml",
    f"{BASE}/ar/wp-sitemap-posts-page-1.xml",
    f"{BASE}/fr/wp-sitemap-posts-post-1.xml",
    f"{BASE}/fr/wp-sitemap-posts-page-1.xml",
]

# Important pages fetched directly by scraping (they do not appear in the API).
PRIORITY_PAGES = [
    f"{BASE}/ar/concours-ar/cycle-superieur-arr/concours-entree-cycle-superieur-ar/",
    f"{BASE}/ar/concours-ar/informations-generales-ar/",
    f"{BASE}/ar/concours-ar/cycle-moyen-ar/entree-au-cycle-de-formation-des-cadres-moyens-ar/",
    f"{BASE}/ar/concours-ar/agents-categorie-a3-ar/",
    f"{BASE}/ar/preparation-au-concours-ar/",
    f"{BASE}/fr/concours/cycle-superieur/le-concours-dentree-au-cycle-superieur/",
    f"{BASE}/fr/concours/informations-generales/",
    f"{BASE}/fr/concours/cycle-moyen/concours-dentree-au-cycle-de-formation-des-cadres-moyens-de-la-sous-categorie-a2-2/",
    f"{BASE}/fr/concours/agents-de-la-sous-categorie-a3/",
    f"{BASE}/ar/formation-continue-ar/formation-continue-a-distance-et-presentielle-ar/",
    f"{BASE}/fr/formation-continue/formation-continue-a-distance-et-presentielle/",
    f"{BASE}/ar/inscription2026/",
    f"{BASE}/ar/ouverturefad2026/",
    f"{BASE}/ar/fad2026/",
]

# Headers sent with every HTTP request made by this script.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; ENA-Chatbot/4.0; +https://www.ena.tn)",
    "Accept": "application/json, text/html",
}
# URL fragment -> category label.
# Lookup is "first fragment contained in the URL wins" (dict order), so the
# entries are listed from most specific to most generic: a generic fragment
# such as "/concours/" or "/formation/" is also contained in the matching
# "/ar/..." URL and would otherwise shadow the Arabic entry, and
# "/ar/formation" would shadow "/formation-continue" for Arabic URLs.
CATS = {
    "/concours-ar": "concours_ar",
    "/ar/concours": "concours_ar",
    "/concours/": "concours_fr",
    "/formation-continue": "formation_continue",
    "/ar/formation": "formation_ar",
    "/formation/": "formation_fr",
    "/gouvernance/": "gouvernance",
    "/actualites-ar/": "news_ar",
    "/actualites/": "news_fr",
    "/leadership": "leadership",
    "/inscription": "inscription",
    "/fad": "fad",
}
# Markers of boilerplate / plugin-generated entries (sample pages, shop and
# page-builder artefacts) that are excluded from the collected data.
SKIP_SLUGS = [
    "page-dexemple", "sample-page", "politique-de-confidentialite",
    "shop", "cart", "checkout", "my-account", "woocommerce",
    "default-kit", "elementor", "log-file",
]
# ══════════════════════════════════════════════════════════
# 🛠️ HELPERS
# ══════════════════════════════════════════════════════════
# Category label -> keywords searched (as lowercase substrings) in an item's
# text when its URL did not identify a category. The first category with any
# matching keyword wins, so the order of the entries is significant.
# NOTE(review): "fad" is listed under both "formation_continue" and "fad";
# because "formation_continue" comes first, text containing "fad" is always
# classified as "formation_continue" — confirm this is intended.
CONTENT_CATS = {
    "concours_ar": ["مناظرة", "ترشح", "شروط الدخول", "بقاع", "اختبار", "مرحلة عليا", "مرحلة متوسطة", "أعوان"],
    "concours_fr": ["concours", "candidature", "cycle supérieur", "cycle moyen", "épreuve", "places"],
    "formation_continue": ["تكوين مستمر", "formation continue", "fad", "تكوين عن بعد"],
    "formation_ar": ["تكوين", "برنامج", "تأهيل", "cycle de formation"],
    "news_ar": ["إعلان", "بلاغ", "أخبار", "مستجدات", "إعلام"],
    "news_fr": ["actualité", "communiqué", "annonce", "information"],
    "inscription": ["تسجيل", "inscription", "2026"],
    "fad": ["fad", "تعليم عن بعد", "formation à distance"],
}
def get_category(url: str, text: str = "") -> str:
    """Return the category label for a page.

    The URL is checked first: the first fragment of ``CATS`` (in dict order)
    contained in the lowercased URL decides the category. When no fragment
    matches and *text* is given, the first entry of ``CONTENT_CATS`` having
    any keyword contained in the lowercased text is used instead.

    Args:
        url: Page URL.
        text: Optional page text used as a fallback signal.

    Returns:
        The matched category label, or ``"other"`` when nothing matches.
    """
    url_lower = url.lower()

    # URL-based matching first (most reliable).
    for fragment, category in CATS.items():
        if fragment in url_lower:
            return category

    # Content-based fallback, used for API items whose URL is not telling.
    if text:
        text_lower = text.lower()
        for category, keywords in CONTENT_CATS.items():
            if any(keyword in text_lower for keyword in keywords):
                return category

    return "other"
def get_lang(url: str) -> str:
    """Return ``"ar"`` when the URL path has an ``ar`` segment, else ``"fr"``.

    The path is matched on whole segments (``/ar`` exactly, or ``/ar/``
    anywhere in the path). A plain prefix test would also classify paths such
    as ``/archives/...`` or ``/articles/...`` as Arabic.

    Args:
        url: Page URL; matching is case-insensitive.

    Returns:
        ``"ar"`` or ``"fr"`` (the default, including for an empty URL).
    """
    path = urlparse(url.lower()).path
    if path == "/ar" or "/ar/" in path:
        return "ar"
    return "fr"
def clean_html(html_text: str) -> str:
    """Strip HTML tags from *html_text* and tidy the resulting text.

    Args:
        html_text: HTML fragment; a falsy value (``""``, ``None``) is accepted.

    Returns:
        The text content with nodes joined by single spaces, HTML entities
        unescaped, runs of three or more whitespace characters collapsed to
        one space, and surrounding whitespace removed. ``""`` for falsy input.
    """
    if not html_text:
        return ""
    soup = BeautifulSoup(html_text, "html.parser")
    text = soup.get_text(" ", strip=True)
    text = unescape(text)
    # Only runs of 3+ whitespace characters are collapsed; shorter runs are kept.
    text = re.sub(r"\s{3,}", " ", text)
    return text.strip()
def _contains_marker(text: str, marker: str) -> bool:
    """Return True if *marker* occurs in *text* delimited by non-alphanumerics."""
    start = text.find(marker)
    while start != -1:
        end = start + len(marker)
        left_ok = start == 0 or not text[start - 1].isalnum()
        right_ok = end == len(text) or not text[end].isalnum()
        if left_ok and right_ok:
            return True
        start = text.find(marker, start + 1)
    return False


def should_skip(slug: str, title: str) -> bool:
    """Tell whether an item is boilerplate that must not be collected.

    An item is skipped when any ``SKIP_SLUGS`` marker appears in its slug or
    title (case-insensitive) as a delimited token, i.e. not glued to other
    letters or digits. A raw substring test would also reject legitimate
    content: "cart" is contained in "carte", "shop" in "workshop".

    Args:
        slug: Item slug.
        title: Item title (plain text).

    Returns:
        True when the item should be skipped.
    """
    haystacks = (slug.lower(), title.lower())
    return any(
        _contains_marker(text, marker)
        for marker in SKIP_SLUGS
        for text in haystacks
    )
# ══════════════════════════════════════════════════════════
# 📡 1. WordPress REST API
# ══════════════════════════════════════════════════════════
def fetch_api(endpoint: str) -> list[dict]:
    """Download every page of a WordPress REST API collection.

    Pages are requested one after another (``page=1, 2, ...``) with a short
    pause in between. Paging stops on HTTP 400 (page number past the end), on
    an empty or non-list payload, on the last page, or on the first error.
    Errors are printed and whatever was collected so far is returned.

    Args:
        endpoint: Collection URL, with or without an existing query string.

    Returns:
        The concatenated list of items returned by the API.
    """
    all_items: list[dict] = []
    page = 1
    # Append the page parameter correctly whether or not a query already exists.
    separator = "&" if "?" in endpoint else "?"

    while True:
        url = f"{endpoint}{separator}page={page}"
        try:
            r = requests.get(url, headers=HEADERS, timeout=20)
            if r.status_code == 400:  # No more pages
                break
            r.raise_for_status()
            items = r.json()
        except Exception as e:  # best-effort crawl: report and keep partial data
            print(f" API error {url[:60]}: {e}")
            break

        # An error object (dict) or any non-list payload is not a page of items.
        if not isinstance(items, list) or not items:
            break
        all_items.extend(items)

        # Prefer the page count announced by WordPress; fall back to the
        # "short page means last page" heuristic (endpoints ask for 100/page).
        total_pages = r.headers.get("X-WP-TotalPages", "")
        if total_pages.isdigit():
            if page >= int(total_pages):
                break
        elif len(items) < 100:
            break

        page += 1
        time.sleep(0.5)  # respectful delay

    return all_items
def _rendered(item: dict, field: str) -> str:
    """Return ``item[field]["rendered"]`` as a string, or "" when absent/null."""
    value = item.get(field)
    if isinstance(value, dict):
        return value.get("rendered") or ""
    return ""


def process_api_items(items: list[dict], content_type: str) -> list[dict]:
    """Convert raw WordPress API items into the script's unified record format.

    Items flagged by ``should_skip`` are dropped, as are items whose cleaned
    content (or, failing that, cleaned excerpt) is shorter than 50 characters.
    Entries that are not dicts, and null or malformed fields, are tolerated.

    Args:
        items: Items as returned by the WordPress REST API.
        content_type: Value stored in each record's ``content_type`` field.

    Returns:
        One record per kept item with the keys ``page_name``, ``url``,
        ``source``, ``langue``, ``category``, ``content_type``, ``date``,
        ``content`` and ``chars``.
    """
    results: list[dict] = []
    for item in items:
        if not isinstance(item, dict):
            continue

        slug = item.get("slug") or ""
        title = clean_html(_rendered(item, "title"))
        if should_skip(slug, title):
            continue

        # Full content first; fall back to the excerpt when it is too short.
        content = clean_html(_rendered(item, "content"))
        if not content or len(content) < 50:
            content = clean_html(_rendered(item, "excerpt"))
        if not content or len(content) < 50:
            continue

        link = item.get("link") or ""
        date = (item.get("date") or "")[:10]  # YYYY-MM-DD

        # The title is prepended to the content.
        full_content = f"{title}\n\n{content}" if title else content
        results.append({
            "page_name": unquote(slug),
            "url": link,
            "source": "ena.tn-api",
            "langue": get_lang(link),
            "category": get_category(link, full_content),
            "content_type": content_type,
            "date": date,
            "content": full_content,
            "chars": len(full_content),
        })
    return results
# ══════════════════════════════════════════════════════════
# 📰 2. RSS Feed
# ══════════════════════════════════════════════════════════
def _rss_date(pub_date: str) -> str:
    """Convert an RSS ``pubDate`` to ``YYYY-MM-DD`` (the format used for API items).

    Falls back to the first 16 characters of the raw value when it cannot be
    parsed as an RFC 2822 date.
    """
    try:
        return parsedate_to_datetime(pub_date).strftime("%Y-%m-%d")
    except (TypeError, ValueError):
        return pub_date[:16]


def fetch_rss(feed_url: str) -> list[dict]:
    """Fetch the latest news items from an RSS feed.

    Each ``<item>`` becomes one record; the full ``content:encoded`` body is
    preferred over ``description`` when present. Items whose cleaned text is
    shorter than 50 characters are dropped. Any error is printed and the
    records collected so far are returned.

    Args:
        feed_url: URL of the RSS feed.

    Returns:
        Records with the keys ``page_name``, ``url``, ``source``, ``langue``,
        ``category``, ``content_type``, ``date``, ``content`` and ``chars``.
    """
    results: list[dict] = []
    try:
        r = requests.get(feed_url, headers=HEADERS, timeout=15)
        r.raise_for_status()
        # Leading whitespace before the XML declaration would break parsing.
        raw_xml = r.content.strip()
        root = ET.fromstring(raw_xml)

        # RSS namespace
        ns = {"content": "http://purl.org/rss/1.0/modules/content/"}

        for item in root.findall(".//item"):
            title = item.findtext("title", "").strip()
            link = item.findtext("link", "").strip()
            desc = item.findtext("description", "")
            date = _rss_date(item.findtext("pubDate", "").strip())

            # Full content when available.
            encoded = item.find("content:encoded", ns)
            if encoded is not None and encoded.text:
                body = clean_html(encoded.text)
            else:
                body = clean_html(desc)
            if not body or len(body) < 50:
                continue

            full_content = f"{title}\n\n{body}" if title else body
            slug = urlparse(link).path.strip("/").split("/")[-1]
            results.append({
                "page_name": unquote(slug),
                "url": link,
                "source": "ena.tn-rss",
                "langue": get_lang(link),
                "category": "news_ar" if "/ar/" in link else "news_fr",
                "content_type": "news",
                "date": date,
                "content": full_content,
                "chars": len(full_content),
            })
    except Exception as e:  # best-effort crawl: report and keep partial data
        print(f" RSS error {feed_url}: {e}")
    return results
# ══════════════════════════════════════════════════════════
# 🗺️ 3. Sitemap → Scrape important pages
# ══════════════════════════════════════════════════════════
def fetch_sitemap_urls(sitemap_url: str) -> list[str]:
    """Return every URL listed in an XML sitemap.

    All ``<loc>`` elements in the sitemaps.org namespace are collected, in
    document order. Any error is printed and the URLs found so far returned.

    Args:
        sitemap_url: URL of the sitemap document.

    Returns:
        The non-empty, whitespace-stripped ``<loc>`` values.
    """
    urls: list[str] = []
    try:
        r = requests.get(sitemap_url, headers=HEADERS, timeout=15)
        r.raise_for_status()
        root = ET.fromstring(r.content)
        ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
        for loc in root.findall(".//sm:loc", ns):
            # Strip before testing so a whitespace-only <loc> yields no entry.
            location = (loc.text or "").strip()
            if location:
                urls.append(location)
    except Exception as e:  # best-effort crawl: report and keep partial data
        print(f" Sitemap error {sitemap_url}: {e}")
    return urls
def scrape_page(url: str) -> dict | None:
    """Download one page and extract its visible text.

    Scripts, styles and page chrome (nav, header, footer, aside, plus elements
    whose class looks like a breadcrumb, menu, sidebar or widget) are removed
    before the text is extracted.

    Args:
        url: Page URL.

    Returns:
        A record with the keys ``page_name``, ``url``, ``source``, ``langue``,
        ``category``, ``content_type``, ``date``, ``content`` and ``chars``;
        ``None`` when the request fails or fewer than 100 characters remain.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=20, allow_redirects=True)
        r.raise_for_status()

        # Without a charset in the Content-Type header, requests decodes
        # text/* as ISO-8859-1, which garbles Arabic; detect the encoding
        # from the body in that case.
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = r.apparent_encoding

        soup = BeautifulSoup(r.text, "html.parser")

        # Remove elements that carry no useful content.
        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
            tag.decompose()
        # NOTE(review): the pattern is searched inside each class name, so it
        # also hits page-builder classes such as "elementor-widget-*"; confirm
        # real content is not being dropped on such pages.
        for tag in soup.find_all(class_=re.compile(r"breadcrumb|menu|sidebar|widget", re.I)):
            tag.decompose()

        text = soup.get_text(" ", strip=True)
        text = re.sub(r"\s{3,}", " ", text)
        if len(text) < 100:
            return None

        slug = urlparse(url).path.strip("/").split("/")[-1]
        return {
            "page_name": unquote(slug),
            "url": url,
            "source": "ena.tn-scrape",
            "langue": get_lang(url),
            "category": get_category(url),
            "content_type": "page",
            "date": "",
            "content": text,
            "chars": len(text),
        }
    except Exception as e:  # best-effort crawl: report and skip the page
        print(f" skip {url[:60]}: {e}")
        return None
# ══════════════════════════════════════════════════════════
# 🚀 MAIN
# ══════════════════════════════════════════════════════════
if __name__ == "__main__":
    # Collect records from the API, the RSS feeds, the sitemaps and the
    # priority pages (in that order), de-duplicate them by URL and by text,
    # print summary statistics and write everything to ena_full_data.json.
    from collections import Counter

    print("=" * 60)
    print("Step: ENA Scraper v4.0 -- API + RSS + Sitemap")
    print("=" * 60)

    all_data: list[dict] = []
    seen_urls: set[str] = set()
    seen_texts: set[str] = set()

    def add_unique(items: list[dict]) -> None:
        """Append items whose URL and text are both new and whose text exceeds 50 chars."""
        for item in items:
            url = item.get("url", "")
            text = item.get("content", "")
            if len(text) <= 50 or text in seen_texts:
                continue
            # An empty URL identifies nothing, so it must not block other
            # URL-less items; those are de-duplicated by text only.
            if url and url in seen_urls:
                continue
            all_data.append(item)
            if url:
                seen_urls.add(url)
            seen_texts.add(text)

    # ── 1. WordPress API ──
    print("\nStep 1: WordPress REST API...")
    for name, endpoint in API_ENDPOINTS.items():
        print(f" Fetching {name}...")
        items = fetch_api(endpoint)
        processed = process_api_items(items, "post" if "posts" in name else "page")
        add_unique(processed)
        print(f" OK: {len(processed)} items from {name}")

    # ── 2. RSS Feed ──
    print("\nStep 2: RSS Feeds...")
    for feed_url in RSS_FEEDS:
        print(f" Fetching {feed_url}...")
        items = fetch_rss(feed_url)
        add_unique(items)
        print(f" OK: {len(items)} items from RSS")

    # ── 3. Sitemap URLs ──
    print("\nStep 3: Sitemap pages...")
    sitemap_urls: list[str] = []
    for sm in SITEMAPS:
        urls = fetch_sitemap_urls(sm)
        sitemap_urls.extend(urls)
        print(f" Found {len(urls)} URLs in {sm.split('/')[-1]}")

    # Scrape sitemap pages not already fetched; dict.fromkeys drops URLs listed
    # in more than one sitemap (keeping order) so each is requested only once.
    new_urls = list(dict.fromkeys(u for u in sitemap_urls if u not in seen_urls))
    print(f" Scraping {len(new_urls)} new pages from sitemap...")
    for i, url in enumerate(new_urls):
        page = scrape_page(url)
        if page:
            add_unique([page])
        if (i + 1) % 20 == 0:
            print(f" {i + 1}/{len(new_urls)} scraped...")
        time.sleep(0.3)

    # ── 4. Priority Pages ──
    print("\nStep 4: Priority pages (concours, conditions)...")
    priority_new = list(dict.fromkeys(u for u in PRIORITY_PAGES if u not in seen_urls))
    for url in priority_new:
        page = scrape_page(url)
        if page:
            add_unique([page])
            print(f" OK: {page['page_name']}")
        time.sleep(0.3)

    # ── Stats ──
    print("\n" + "=" * 60)
    print(f"OK. Total pages: {len(all_data)}")
    print(f"Total characters: {sum(p['chars'] for p in all_data):,}")

    cats = Counter(p["category"] for p in all_data)
    langs = Counter(p["langue"] for p in all_data)
    srcs = Counter(p["source"] for p in all_data)

    print("\nBy category:")
    for cat, count in cats.most_common():
        print(f" {cat}: {count}")
    print("\nBy language:")
    for lang, count in langs.items():
        print(f" {lang}: {count}")
    print("\nBy source:")
    for src, count in srcs.items():
        print(f" {src}: {count}")

    # ── Save ──
    with open("ena_full_data.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print("\nSaved to ena_full_data.json")
    print("=" * 60)
    print("Done! Now run: python build_chroma.py")