Spaces:

Ines1994
/

ENA-Chatbot

Sleeping

App Files Files Community

ENA-Chatbot / scraper_api.py

Ines1994

Upload 3 files

8dcf3a7 verified about 1 month ago

raw

history blame contribute delete

16 kB

	"""
	ENA Chatbot — Scraper v4.0 (API-Based)
	Uses the WordPress REST API + RSS feed + sitemap instead of traditional scraping
	Run: python scraper_api.py
	"""
	from __future__ import annotations

	import json
	import re
	import time
	import xml.etree.ElementTree as ET
	from urllib.parse import urlparse, unquote
	from html import unescape

	import requests
	from bs4 import BeautifulSoup

	# ══════════════════════════════════════════════════════════
	# ⚙️ CONFIG
	# ══════════════════════════════════════════════════════════

	# Root of the target site; every endpoint, feed, sitemap and page URL below is built from it.
	BASE = "https://www.ena.tn"

	# WordPress REST API collection URLs, keyed by a label.
	# The main script treats any label containing "posts" as content_type "post", otherwise "page".
	# Every URL already carries a query string and requests per_page=100.
	API_ENDPOINTS = {
	"posts_ar": f"{BASE}/wp-json/wp/v2/posts?per_page=100&lang=ar",
	"posts_fr": f"{BASE}/wp-json/wp/v2/posts?per_page=100&lang=fr",
	"posts_all": f"{BASE}/wp-json/wp/v2/posts?per_page=100",
	"pages_ar": f"{BASE}/wp-json/wp/v2/pages?per_page=100&lang=ar",
	"pages_fr": f"{BASE}/wp-json/wp/v2/pages?per_page=100&lang=fr",
	"pages_all": f"{BASE}/wp-json/wp/v2/pages?per_page=100",
	}

	# RSS feeds read by fetch_rss().
	RSS_FEEDS = [
	f"{BASE}/feed/",
	f"{BASE}/ar/feed/",
	f"{BASE}/fr/feed/",
	]

	# Sitemap XML files; the URLs they list are scraped by the main script.
	SITEMAPS = [
	f"{BASE}/ar/wp-sitemap-posts-post-1.xml",
	f"{BASE}/ar/wp-sitemap-posts-page-1.xml",
	f"{BASE}/fr/wp-sitemap-posts-post-1.xml",
	f"{BASE}/fr/wp-sitemap-posts-page-1.xml",
	]

	# Important pages fetched directly by scraping (they do not show up in the API)
	PRIORITY_PAGES = [
	f"{BASE}/ar/concours-ar/cycle-superieur-arr/concours-entree-cycle-superieur-ar/",
	f"{BASE}/ar/concours-ar/informations-generales-ar/",
	f"{BASE}/ar/concours-ar/cycle-moyen-ar/entree-au-cycle-de-formation-des-cadres-moyens-ar/",
	f"{BASE}/ar/concours-ar/agents-categorie-a3-ar/",
	f"{BASE}/ar/preparation-au-concours-ar/",
	f"{BASE}/fr/concours/cycle-superieur/le-concours-dentree-au-cycle-superieur/",
	f"{BASE}/fr/concours/informations-generales/",
	f"{BASE}/fr/concours/cycle-moyen/concours-dentree-au-cycle-de-formation-des-cadres-moyens-de-la-sous-categorie-a2-2/",
	f"{BASE}/fr/concours/agents-de-la-sous-categorie-a3/",
	f"{BASE}/ar/formation-continue-ar/formation-continue-a-distance-et-presentielle-ar/",
	f"{BASE}/fr/formation-continue/formation-continue-a-distance-et-presentielle/",
	f"{BASE}/ar/inscription2026/",
	f"{BASE}/ar/ouverturefad2026/",
	f"{BASE}/ar/fad2026/",
	]

	# HTTP headers sent with every requests.get() call in this file.
	HEADERS = {
	"User-Agent": "Mozilla/5.0 (compatible; ENA-Chatbot/4.0; +https://www.ena.tn)",
	"Accept": "application/json, text/html",
	}

	# URL substring -> category label. get_category() lower-cases the URL and returns the
	# label of the FIRST pattern (in insertion order) found in it.
	# NOTE(review): "/ar/formation" is listed before "/formation-continue", so Arabic
	# ".../ar/formation-continue-ar/..." URLs resolve to "formation_ar" - confirm this is intended.
	CATS = {
	"/concours/": "concours_fr",
	"/concours-ar": "concours_ar",
	"/ar/concours": "concours_ar",
	"/formation/": "formation_fr",
	"/ar/formation": "formation_ar",
	"/formation-continue": "formation_continue",
	"/gouvernance/": "gouvernance",
	"/actualites/": "news_fr",
	"/actualites-ar/": "news_ar",
	"/leadership": "leadership",
	"/inscription": "inscription",
	"/fad": "fad",
	}

	# Lower-case markers checked by should_skip() against item slugs and titles
	# to filter items out of the API results.
	SKIP_SLUGS = [
	"page-dexemple", "sample-page", "politique-de-confidentialite",
	"shop", "cart", "checkout", "my-account", "woocommerce",
	"default-kit", "elementor", "log-file",
	]

	# ══════════════════════════════════════════════════════════
	# 🛠️ HELPERS
	# ══════════════════════════════════════════════════════════

	# Category label -> keywords used by get_category() as a content-based fallback
	# when no URL pattern from CATS matches. The text is lower-cased before the
	# substring test, so keywords are written in lower case; the first category
	# (in insertion order) with any keyword present in the text wins.
	CONTENT_CATS = {
	"concours_ar": ["مناظرة", "ترشح", "شروط الدخول", "بقاع", "اختبار", "مرحلة عليا", "مرحلة متوسطة", "أعوان"],
	"concours_fr": ["concours", "candidature", "cycle supérieur", "cycle moyen", "épreuve", "places"],
	"formation_continue": ["تكوين مستمر", "formation continue", "fad", "تكوين عن بعد"],
	"formation_ar": ["تكوين", "برنامج", "تأهيل", "cycle de formation"],
	"news_ar": ["إعلان", "بلاغ", "أخبار", "مستجدات", "إعلام"],
	"news_fr": ["actualité", "communiqué", "annonce", "information"],
	"inscription": ["تسجيل", "inscription", "2026"],
	"fad": ["fad", "تعليم عن بعد", "formation à distance"],
	}

	def get_category(url: str, text: str = "") -> str:
	    """Return the category label for a page.

	    URL patterns from ``CATS`` are tried first, against the lower-cased URL.
	    If none matches and *text* is non-empty, the keyword lists in
	    ``CONTENT_CATS`` are tried against the lower-cased text.

	    Args:
	        url: Page URL.
	        text: Optional page text used for the keyword fallback.

	    Returns:
	        The first matching label, or ``"other"`` when nothing matches.
	    """
	    lowered_url = url.lower()
	    url_label = next(
	        (label for pattern, label in CATS.items() if pattern in lowered_url),
	        None,
	    )
	    if url_label is not None:
	        return url_label

	    # No URL hit: fall back to keywords, but only when text was supplied.
	    if not text:
	        return "other"

	    lowered_text = text.lower()
	    for label, keywords in CONTENT_CATS.items():
	        for keyword in keywords:
	            if keyword in lowered_text:
	                return label
	    return "other"

	def get_lang(url: str) -> str:
	    """Return the language code (``"ar"`` or ``"fr"``) implied by a URL's path.

	    A URL is Arabic when its lower-cased path contains an ``/ar/`` segment
	    or is exactly ``/ar``. Everything else is treated as French.

	    Args:
	        url: Absolute or relative page URL.

	    Returns:
	        ``"ar"`` or ``"fr"``.
	    """
	    path = urlparse(url.lower()).path
	    # Compare whole segments: a plain startswith("/ar") would also match
	    # French paths such as "/archives/" or "/article-...".
	    if "/ar/" in path or path == "/ar":
	        return "ar"
	    return "fr"

	def clean_html(html_text: str) -> str:
	    """Strip HTML tags from *html_text* and tidy the remaining text.

	    Text nodes are joined with single spaces, HTML entities are unescaped,
	    and any run of three or more whitespace characters becomes one space.

	    Args:
	        html_text: HTML fragment; a falsy value yields ``""``.

	    Returns:
	        The cleaned plain text, stripped at both ends.
	    """
	    if not html_text:
	        return ""
	    plain = BeautifulSoup(html_text, "html.parser").get_text(" ", strip=True)
	    collapsed = re.sub(r"\s{3,}", " ", unescape(plain))
	    return collapsed.strip()

	def should_skip(slug: str, title: str) -> bool:
	    """Tell whether an item should be skipped, based on its slug or title.

	    Each marker in ``SKIP_SLUGS`` is matched as a whole token, i.e. not
	    directly preceded or followed by a letter or digit. This keeps the
	    marker "cart" from rejecting a title containing "carte", or "shop"
	    from rejecting "workshop", while "my-cart-page" or "Elementor #12"
	    still match. Both inputs are lower-cased before matching.

	    Args:
	        slug: Item slug.
	        title: Item title as plain text.

	    Returns:
	        ``True`` if any marker occurs in the slug or the title.
	    """
	    haystacks = (slug.lower(), title.lower())
	    for marker in SKIP_SLUGS:
	        # [^\W_] means "any Unicode letter or digit"; the look-arounds
	        # require that none sits immediately next to the marker.
	        pattern = rf"(?<![^\W_]){re.escape(marker)}(?![^\W_])"
	        if any(re.search(pattern, haystack) for haystack in haystacks):
	            return True
	    return False

	# ══════════════════════════════════════════════════════════
	# 📡 1. WordPress REST API
	# ══════════════════════════════════════════════════════════

	def fetch_api(endpoint: str) -> list[dict]:
	    """Fetch every page of a WordPress REST API collection.

	    Pages are requested one after another, with a 0.5 s pause between
	    requests, until one of these happens:

	    * the server answers HTTP 400 (requested page is past the end);
	    * the payload is empty or is not a JSON list;
	    * the ``X-WP-TotalPages`` header says the last page was reached, or,
	      without that header, a page is shorter than the requested size;
	    * a network, HTTP or JSON error occurs (reported on stdout).

	    Args:
	        endpoint: Collection URL, with or without a query string. Its
	            ``per_page`` value, if present, is used as the page size.

	    Returns:
	        All item dicts collected before the loop stopped (possibly empty).
	    """
	    # Read the page size from the endpoint instead of assuming 100;
	    # 10 is WordPress's documented default when per_page is omitted.
	    size_match = re.search(r"[?&]per_page=(\d+)", endpoint)
	    page_size = int(size_match.group(1)) if size_match else 10
	    separator = "&" if "?" in endpoint else "?"

	    all_items: list[dict] = []
	    page = 1

	    while True:
	        url = f"{endpoint}{separator}page={page}"
	        try:
	            r = requests.get(url, headers=HEADERS, timeout=20)
	            if r.status_code == 400:  # No more pages
	                break
	            r.raise_for_status()
	            items = r.json()
	        except (requests.RequestException, ValueError) as e:
	            print(f" API error {url[:60]}: {e}")
	            break

	        # An error object (a dict) would otherwise be extended into the
	        # result as bare key strings and break later processing.
	        if not isinstance(items, list):
	            print(f" API error {url[:60]}: unexpected payload type {type(items).__name__}")
	            break
	        if not items:
	            break
	        all_items.extend(item for item in items if isinstance(item, dict))

	        # Prefer the server's page count; otherwise a short page is the last one.
	        total_pages = r.headers.get("X-WP-TotalPages", "")
	        if total_pages.isdigit():
	            if page >= int(total_pages):
	                break
	        elif len(items) < page_size:
	            break

	        page += 1
	        time.sleep(0.5)  # respectful delay

	    return all_items

	def process_api_items(items: list[dict], content_type: str) -> list[dict]:
	    """Convert raw WordPress REST API items into the scraper's record format.

	    Items rejected by ``should_skip`` are dropped. So are items whose
	    cleaned body and cleaned excerpt are both shorter than 50 characters.

	    Args:
	        items: Item dicts as returned by the API.
	        content_type: Value stored under each record's ``content_type`` key.

	    Returns:
	        One record dict per kept item, in input order.
	    """
	    records = []
	    for entry in items:
	        slug = entry.get("slug", "")
	        title = clean_html(entry.get("title", {}).get("rendered", ""))
	        body_html = entry.get("content", {}).get("rendered", "")
	        excerpt_html = entry.get("excerpt", {}).get("rendered", "")
	        link = entry.get("link", "")
	        date = entry.get("date", "")[:10]  # YYYY-MM-DD

	        if should_skip(slug, title):
	            continue

	        # Use the full body; fall back to the excerpt when the body is too short.
	        text = clean_html(body_html)
	        if len(text) < 50:
	            text = clean_html(excerpt_html)
	            if len(text) < 50:
	                continue

	        # Put the title at the start of the content.
	        full_content = text
	        if title:
	            full_content = f"{title}\n\n{text}"

	        record = {
	            "page_name": unquote(slug),
	            "url": link,
	            "source": "ena.tn-api",
	            "langue": get_lang(link),
	            "category": get_category(link, full_content),
	            "content_type": content_type,
	            "date": date,
	            "content": full_content,
	            "chars": len(full_content),
	        }
	        records.append(record)

	    return records

	# ══════════════════════════════════════════════════════════
	# 📰 2. RSS Feed
	# ══════════════════════════════════════════════════════════

	def fetch_rss(feed_url: str) -> list[dict]:
	    """Fetch the news items of one RSS feed as scraper records.

	    The text of each ``<item>`` comes from ``content:encoded`` when
	    present, otherwise from ``description``. Items with fewer than 50
	    characters of cleaned text are dropped. ``pubDate`` is stored as
	    ``YYYY-MM-DD``, the same format used for API records; if it cannot
	    be parsed, its first 16 characters are stored instead.

	    Any error while downloading or parsing is reported on stdout, and
	    the records gathered so far are returned.

	    Args:
	        feed_url: URL of the RSS feed.

	    Returns:
	        Record dicts for the feed's items (possibly empty).
	    """
	    # Imported here so this function does not depend on a file-level import.
	    from email.utils import parsedate_to_datetime

	    results = []
	    try:
	        r = requests.get(feed_url, headers=HEADERS, timeout=15)
	        r.raise_for_status()
	        # Clean potential stray whitespace around the document before parsing
	        root = ET.fromstring(r.content.strip())

	        # RSS namespace
	        ns = {"content": "http://purl.org/rss/1.0/modules/content/"}

	        for item in root.findall(".//item"):
	            title = item.findtext("title", "").strip()
	            link = item.findtext("link", "").strip()
	            desc = item.findtext("description", "")
	            pub_date = item.findtext("pubDate", "")

	            # Normalise the RFC 2822 pubDate to YYYY-MM-DD.
	            try:
	                date = parsedate_to_datetime(pub_date.strip()).date().isoformat()
	            except (TypeError, ValueError):
	                date = pub_date[:16]

	            # Full content if available
	            content_encoded = item.find("content:encoded", ns)
	            if content_encoded is not None and content_encoded.text:
	                body = clean_html(content_encoded.text)
	            else:
	                body = clean_html(desc)

	            if not body or len(body) < 50:
	                continue

	            full_content = f"{title}\n\n{body}" if title else body
	            slug = urlparse(link).path.strip("/").split("/")[-1]

	            results.append({
	                "page_name": unquote(slug),
	                "url": link,
	                "source": "ena.tn-rss",
	                "langue": get_lang(link),
	                "category": "news_ar" if "/ar/" in link else "news_fr",
	                "content_type": "news",
	                "date": date,
	                "content": full_content,
	                "chars": len(full_content),
	            })

	    except Exception as e:
	        # Best effort per feed: report and return what was gathered so far.
	        print(f" RSS error {feed_url}: {e}")

	    return results

	# ══════════════════════════════════════════════════════════
	# 🗺️ 3. Sitemap → Scrape important pages
	# ══════════════════════════════════════════════════════════

	def fetch_sitemap_urls(sitemap_url: str) -> list[str]:
	    """Return every ``<loc>`` URL listed in one sitemap XML file.

	    Any error while downloading or parsing is reported on stdout.

	    Args:
	        sitemap_url: URL of the sitemap document.

	    Returns:
	        The stripped URLs in document order; empty if the request or
	        the XML parsing failed.
	    """
	    sitemap_ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
	    found: list[str] = []
	    try:
	        response = requests.get(sitemap_url, headers=HEADERS, timeout=15)
	        response.raise_for_status()
	        tree = ET.fromstring(response.content)
	        found.extend(
	            node.text.strip()
	            for node in tree.findall(".//sm:loc", sitemap_ns)
	            if node.text
	        )
	    except Exception as e:
	        print(f" Sitemap error {sitemap_url}: {e}")
	    return found

	def scrape_page(url: str) -> dict | None:
	    """Download one page and return its visible text as a scraper record.

	    ``script``, ``style``, ``nav``, ``footer``, ``header`` and ``aside``
	    elements are removed, as are elements whose CSS class matches
	    ``breadcrumb``, ``menu``, ``sidebar`` or ``widget`` (case-insensitive).

	    Args:
	        url: Page URL; redirects are followed.

	    Returns:
	        A record dict, or ``None`` when the remaining text is shorter
	        than 100 characters or any error occurs (reported on stdout).
	    """
	    try:
	        r = requests.get(url, headers=HEADERS, timeout=20, allow_redirects=True)
	        r.raise_for_status()
	        soup = BeautifulSoup(r.text, "html.parser")

	        # Remove elements that carry no useful content
	        for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
	            tag.decompose()
	        # The alternation must use a bare "|": an escaped "\|" matches only
	        # a literal pipe character, so the filter would never fire.
	        # NOTE(review): "widget" may also match page-builder classes such as
	        # "elementor-widget-container" - check against live markup that real
	        # content is not removed.
	        layout_classes = re.compile(r"breadcrumb|menu|sidebar|widget", re.I)
	        for tag in soup.find_all(class_=layout_classes):
	            tag.decompose()

	        text = soup.get_text(" ", strip=True)
	        text = re.sub(r"\s{3,}", " ", text)

	        if len(text) < 100:
	            return None

	        slug = urlparse(url).path.strip("/").split("/")[-1]
	        return {
	            "page_name": unquote(slug),
	            "url": url,
	            "source": "ena.tn-scrape",
	            "langue": get_lang(url),
	            "category": get_category(url),
	            "content_type": "page",
	            "date": "",
	            "content": text,
	            "chars": len(text),
	        }
	    except Exception as e:
	        # Best effort per page: report and let the caller move on.
	        print(f" skip {url[:60]}: {e}")
	        return None

	# ══════════════════════════════════════════════════════════
	# 🚀 MAIN
	# ══════════════════════════════════════════════════════════

	def main() -> None:
	    """Collect pages from every source, de-duplicate them and save them as JSON.

	    Runs four steps in order: WordPress REST API, RSS feeds, sitemap pages,
	    priority pages. A record is kept only if its URL and its exact text
	    are both new and the text is longer than 50 characters. Prints
	    progress and summary statistics, then writes ``ena_full_data.json``
	    to the current working directory.
	    """
	    from collections import Counter

	    print("=" * 60)
	    print("Step: ENA Scraper v4.0 -- API + RSS + Sitemap")
	    print("=" * 60)

	    all_data: list[dict] = []
	    seen_urls: set[str] = set()
	    seen_texts: set[str] = set()

	    def add_unique(items: list[dict]) -> None:
	        """Append records whose URL and text have not been seen before."""
	        for item in items:
	            url = item.get("url", "")
	            text = item.get("content", "")
	            if url not in seen_urls and text not in seen_texts and len(text) > 50:
	                all_data.append(item)
	                seen_urls.add(url)
	                seen_texts.add(text)

	    # ── 1. WordPress API ──
	    print("\nStep 1: WordPress REST API...")
	    for name, endpoint in API_ENDPOINTS.items():
	        print(f" Fetching {name}...")
	        items = fetch_api(endpoint)
	        processed = process_api_items(items, "post" if "posts" in name else "page")
	        add_unique(processed)
	        print(f" OK: {len(processed)} items from {name}")

	    # ── 2. RSS Feed ──
	    print("\nStep 2: RSS Feeds...")
	    for feed_url in RSS_FEEDS:
	        print(f" Fetching {feed_url}...")
	        items = fetch_rss(feed_url)
	        add_unique(items)
	        print(f" OK: {len(items)} items from RSS")

	    # ── 3. Sitemap URLs ──
	    print("\nStep 3: Sitemap pages...")
	    sitemap_urls: list[str] = []
	    for sm in SITEMAPS:
	        urls = fetch_sitemap_urls(sm)
	        sitemap_urls.extend(urls)
	        print(f" Found {len(urls)} URLs in {sm.split('/')[-1]}")

	    # Scrape sitemap pages not already fetched. dict.fromkeys() removes
	    # URLs listed in more than one sitemap, keeping their order, so no
	    # page is requested twice.
	    new_urls = [u for u in dict.fromkeys(sitemap_urls) if u not in seen_urls]
	    print(f" Scraping {len(new_urls)} new pages from sitemap...")
	    for i, url in enumerate(new_urls):
	        page = scrape_page(url)
	        if page:
	            add_unique([page])
	        if (i + 1) % 20 == 0:
	            print(f" {i + 1}/{len(new_urls)} scraped...")
	        time.sleep(0.3)

	    # ── 4. Priority Pages ──
	    print("\nStep 4: Priority pages (concours, conditions)...")
	    priority_new = [u for u in dict.fromkeys(PRIORITY_PAGES) if u not in seen_urls]
	    for url in priority_new:
	        page = scrape_page(url)
	        if page:
	            add_unique([page])
	            print(f" OK: {page['page_name']}")
	        time.sleep(0.3)

	    # ── Stats ──
	    print("\n" + "=" * 60)
	    print(f"OK. Total pages: {len(all_data)}")
	    print(f"Total characters: {sum(p['chars'] for p in all_data):,}")

	    cats = Counter(p["category"] for p in all_data)
	    langs = Counter(p["langue"] for p in all_data)
	    srcs = Counter(p["source"] for p in all_data)

	    print("\nBy category:")
	    for cat, count in cats.most_common():
	        print(f" {cat}: {count}")
	    print("\nBy language:")
	    for lang, count in langs.items():
	        print(f" {lang}: {count}")
	    print("\nBy source:")
	    for src, count in srcs.items():
	        print(f" {src}: {count}")

	    # ── Save ──
	    with open("ena_full_data.json", "w", encoding="utf-8") as f:
	        json.dump(all_data, f, ensure_ascii=False, indent=2)

	    print("\nSaved to ena_full_data.json")
	    print("=" * 60)
	    print("Done! Now run: python build_chroma.py")


	if __name__ == "__main__":
	    main()