Spaces:

lljz66
/

scrape.gov

Runtime error

App Files Files Community

scrape.gov / app.py

lljz66

Update app.py

1f16459 verified about 1 month ago

raw

history blame contribute delete

12.4 kB

	import asyncio
	import sqlite3
	import json
	import os
	import time
	from datetime import datetime
	from pathlib import Path
	from playwright.async_api import async_playwright
	from bs4 import BeautifulSoup
	import pandas as pd

	# ========== Settings ==========
	# SQLite database file shared by the schema, scraping, backup and export code.
	DB_PATH = "/tmp/marches_publics.db"
	# Output root: CSV exports and the database backup are written here.
	DATA_DIR = Path("/app/data")
	# Raw HTML snapshots are stored below this folder, one sub-folder per source.
	RAW_DIR = DATA_DIR.joinpath("raw")
	# Create the whole output tree as soon as the module is loaded.
	RAW_DIR.mkdir(exist_ok=True, parents=True)

	# ========== All the correct links ==========
	# Maps each source name to the URL that scrape_all() opens for it. The name is
	# also used as the raw-HTML sub-folder (save_page) and as the CSV filename
	# prefix (export_all_csv).
	# NOTE(review): several keys below point to the exact same URL (the two
	# "searchAnnCons" entries, the five "&AllAnn" entries, and the two
	# "consAnnulee=1" entries), so the same page is fetched once per key —
	# confirm whether distinct search filters were intended.
	URLS = {
	"consultations_en_cours": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&searchAnnCons",
	"recherche_avancee_consultations": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&searchAnnCons",
	"avis_achat_en_cours": "https://www.marchespublics.gov.ma/bdc/entreprise/consultation/",
	"bons_commande_attribues": "https://www.marchespublics.gov.ma/index.php?page=entreprise.ListeAnnonceAnnuelle&typeAnnonce=1",
	"toutes_annonces_info": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&AvisInformation",
	"tous_extraits_pv": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&AllAnn",
	"tous_resultats_definitifs": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&AllAnn",
	"tous_rapports_achevement": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&AllAnn",
	"tous_rapports_presentation": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&AvisRapportPresentation",
	"toutes_decisions_resiliation": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&AllAnn",
	"recherche_avancee_toutes": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&AllAnn",
	# Direct list pages (no search form to submit; see scrape_all's URL check).
	"programme_previsionnel": "https://www.marchespublics.gov.ma/index.php?page=entreprise.ListePPs",
	"synthese_rapport_audit": "https://www.marchespublics.gov.ma/index.php?page=entreprise.ListeSRA",
	"marches_attribues": "https://www.marchespublics.gov.ma/index.php?page=entreprise.ListeAnnonceAnnuelle&typeAnnonce=2",
	"conventions_contrats": "https://www.marchespublics.gov.ma/index.php?page=entreprise.ListeAnnonceAnnuelle&typeAnnonce=3",
	"consultations_annulees": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&searchAnnCons&consAnnulee=1",
	"recherche_avancee_annulees": "https://www.marchespublics.gov.ma/index.php?page=entreprise.EntrepriseAdvancedSearch&searchAnnCons&consAnnulee=1",
	}

	# ========== Database functions ==========
	def init_db():
	    """Create the SQLite schema in ``DB_PATH`` if it does not exist yet.

	    Creates the ``scraped_data`` and ``pagination_state`` tables and two
	    lookup indexes on ``scraped_data``. Every statement is guarded by
	    ``IF NOT EXISTS``, so calling this function repeatedly is harmless.

	    Raises:
	        sqlite3.Error: if the database cannot be opened or a statement fails.
	    """
	    conn = sqlite3.connect(DB_PATH)
	    # try/finally guarantees the connection is released even when one of the
	    # DDL statements raises.
	    try:
	        conn.execute('''
	            CREATE TABLE IF NOT EXISTS scraped_data (
	                id INTEGER PRIMARY KEY AUTOINCREMENT,
	                source_name TEXT NOT NULL,
	                source_url TEXT NOT NULL,
	                scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
	                page_number INTEGER,
	                total_pages INTEGER,
	                raw_html TEXT,
	                parsed_data JSON,
	                status TEXT DEFAULT 'pending',
	                error_message TEXT
	            )
	        ''')
	        conn.execute('''
	            CREATE TABLE IF NOT EXISTS pagination_state (
	                source_name TEXT PRIMARY KEY,
	                last_page INTEGER DEFAULT 0,
	                total_items INTEGER,
	                completed BOOLEAN DEFAULT 0,
	                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
	            )
	        ''')
	        conn.execute('CREATE INDEX IF NOT EXISTS idx_source ON scraped_data(source_name)')
	        conn.execute('CREATE INDEX IF NOT EXISTS idx_status ON scraped_data(status)')
	        conn.commit()
	    finally:
	        conn.close()

	def backup_db():
	    """Copy the SQLite file to ``DATA_DIR/marches_publics_backup.db``.

	    Best-effort: any failure is reported on stdout and swallowed, so a failed
	    backup never interrupts the caller.
	    """
	    from shutil import copy2

	    try:
	        destination = DATA_DIR / "marches_publics_backup.db"
	        copy2(DB_PATH, destination)
	        print("🗄️ Database backed up")
	    except Exception as e:
	        print(f"❌ Backup error: {e}")

	def export_all_csv():
	    """Export the parsed items of every source in ``URLS`` to CSV files.

	    For each source name, the ``completed`` rows of ``scraped_data`` are
	    loaded, each row's ``parsed_data`` JSON is decoded, and every entry of its
	    ``items`` list is tagged with ``source``, ``page`` and ``scraped_at``.
	    The items are flattened with ``pandas.json_normalize`` and written to
	    ``DATA_DIR/<source>_<YYYYmmdd_HHMMSS>.csv`` (``utf-8-sig`` encoding).
	    Sources without any usable item produce no file.

	    Rows whose payload cannot be decoded or does not have the expected shape
	    are skipped with a warning instead of being silently ignored.
	    """
	    query = (
	        "SELECT source_name, page_number, parsed_data, scraped_at FROM scraped_data WHERE source_name = ? AND status = 'completed'"
	    )
	    conn = sqlite3.connect(DB_PATH)
	    # try/finally so the connection is closed even if pandas or the CSV
	    # write raises part-way through the export.
	    try:
	        for source in URLS:
	            df = pd.read_sql_query(query, conn, params=(source,))
	            if df.empty:
	                continue

	            records = []
	            for _, row in df.iterrows():
	                try:
	                    payload = json.loads(row['parsed_data'])
	                    for item in payload.get('items', []):
	                        item['source'] = row['source_name']
	                        item['page'] = row['page_number']
	                        item['scraped_at'] = row['scraped_at']
	                        records.append(item)
	                except (ValueError, TypeError, AttributeError) as exc:
	                    # ValueError: invalid JSON; TypeError: NULL payload or a
	                    # non-dict item; AttributeError: payload is not a dict.
	                    print(f"⚠️ Skipping page {row['page_number']} of {source}: {exc}")

	            if not records:
	                continue

	            out_df = pd.json_normalize(records)
	            filename = f"{source}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
	            out_df.to_csv(DATA_DIR / filename, index=False, encoding='utf-8-sig')
	            print(f"📁 Exported: {filename}")
	    finally:
	        conn.close()

	# ========== Data extraction ==========
	def _unique_key(name, index, taken):
	    """Return ``name``, or a variant made unique with ``index`` if already in ``taken``."""
	    if name not in taken:
	        return name
	    candidate = f'{name}_{index}' if name else f'col_{index}'
	    while candidate in taken:
	        candidate = f'{candidate}_{index}'
	    return candidate


	def extract_data(soup, source_name):
	    """Extract table rows (or, failing that, result blocks) from a parsed page.

	    Every ``<table>`` is read row by row: the cells of its first row supply
	    the column names and each following row becomes one dict. A cell that
	    contains links is stored as ``{'text', 'href', 'links'}``; any other cell
	    is stored as its stripped text. Cells beyond the header width are keyed
	    ``col_<index>``.

	    When two columns would share the same key (duplicate or empty header
	    text), the first keeps the header name and later ones get an
	    index-suffixed key, so no cell overwrites another.

	    If no table yields any row, ``<div>`` elements whose class contains
	    ``result`` or ``item`` (case-insensitive) are collected as
	    ``{'text': ...}`` entries instead.

	    Args:
	        soup: parsed document exposing BeautifulSoup's ``find_all`` API.
	        source_name: label copied into the ``source`` field of the result.

	    Returns:
	        dict with ``source``, ``extracted_at`` (ISO timestamp),
	        ``item_count`` and ``items``.
	    """
	    data = []
	    for table in soup.find_all('table'):
	        rows = table.find_all('tr')
	        if not rows:
	            continue
	        headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]
	        for row in rows[1:]:
	            row_data = {}
	            for i, cell in enumerate(row.find_all(['td', 'th'])):
	                name = headers[i] if i < len(headers) else f'col_{i}'
	                key = _unique_key(name, i, row_data)
	                links = cell.find_all('a')
	                if links:
	                    row_data[key] = {
	                        'text': cell.get_text(strip=True),
	                        'href': links[0].get('href'),
	                        'links': [a.get('href') for a in links],
	                    }
	                else:
	                    row_data[key] = cell.get_text(strip=True)
	            if row_data:
	                data.append(row_data)

	    if not data:
	        # Fallback for pages that render results as div blocks, not tables.
	        items = soup.find_all('div', class_=lambda x: x and ('result' in x.lower() or 'item' in x.lower()))
	        for item in items:
	            data.append({'text': item.get_text(strip=True)})

	    return {
	        'source': source_name,
	        'extracted_at': datetime.now().isoformat(),
	        'item_count': len(data),
	        'items': data,
	    }

	# ========== Scraping functions ==========
	async def advanced_search_with_retry(page, url, max_retries=2):
	    """Open an advanced-search page, submit its form, and return the HTML.

	    Each attempt navigates to ``url``. If a submit control is found, the
	    first one is clicked and a results container is awaited; otherwise the
	    function simply waits five seconds. The page HTML is returned as soon as
	    an attempt completes. A failed attempt is logged and followed by a
	    two-second pause. Once every attempt has failed, whatever HTML the page
	    currently holds is returned.
	    """
	    submit_selector = 'input[type=submit], button[type=submit], input#rechercher, a.rechercher, form[method=post] input[type=submit]'
	    results_selector = 'table, #contenu, .resultats, .liste, .annonce'

	    for attempt in range(max_retries):
	        try:
	            print(f" ⟳ Attempt {attempt+1}...")
	            await page.goto(url, wait_until='networkidle', timeout=60000)
	            search_button = page.locator(submit_selector)
	            if await search_button.count() == 0:
	                # No form to submit: give the page a moment to settle.
	                await page.wait_for_timeout(5000)
	            else:
	                await search_button.first.click()
	                await page.wait_for_selector(results_selector, timeout=15000)
	            return await page.content()
	        except Exception as e:
	            print(f" ⚠️ محاولة {attempt+1} فشلت: {e}")
	            await asyncio.sleep(2)

	    # All attempts failed: hand back the page as it currently is.
	    return await page.content()

	async def scrape_direct_list(page, url):
	    """Load a page that shows its results directly and return its HTML.

	    Navigates to ``url``, waits (up to 20 s) for a results container to
	    appear, pauses two more seconds, then returns the page content.
	    """
	    results_selector = 'table, #contenu, .resultats, .liste'
	    await page.goto(url, timeout=60000, wait_until='networkidle')
	    await page.wait_for_selector(results_selector, timeout=20000)
	    await page.wait_for_timeout(2000)
	    html = await page.content()
	    return html

	# ========== Saving a page ==========
	def save_page(source, page_num, html):
	    """Write one page of raw HTML to ``RAW_DIR/<source>/page_<page_num>.html``.

	    The per-source folder is created on first use; an existing file for the
	    same page is overwritten.
	    """
	    target_dir = RAW_DIR / source
	    target_dir.mkdir(exist_ok=True)
	    target_file = target_dir / f"page_{page_num}.html"
	    target_file.write_text(html, encoding='utf-8')
	    print(f" 💾 حفظ {source}/page_{page_num}.html")

	# ========== Main scraping loop ==========
	async def _fetch_html(page, url):
	    """Fetch ``url`` with the strategy that matches its page type."""
	    if "ListeAnnonceAnnuelle" in url or "ListePPs" in url or "ListeSRA" in url:
	        return await scrape_direct_list(page, url)
	    return await advanced_search_with_retry(page, url)


	def _count_pages(soup):
	    """Return the largest numeric label among links whose href contains ``page=`` (1 if none)."""
	    labels = (a.get_text(strip=True) for a in soup.select('a[href*="page="]'))
	    # isdecimal() accepts exactly the characters int() can parse, unlike
	    # isdigit(), which also accepts e.g. superscript digits.
	    numbers = [int(label) for label in labels if label.isdecimal()]
	    return max(numbers) if numbers else 1


	def _store_page(conn, src_name, page_url, page_num, total_pages, html, soup):
	    """Save the raw HTML, insert a 'completed' row, and return the item count."""
	    save_page(src_name, page_num, html)
	    data = extract_data(soup, src_name)
	    conn.execute('''
	        INSERT INTO scraped_data (source_name, source_url, page_number, total_pages, raw_html, parsed_data, status)
	        VALUES (?,?,?,?,?,?,?)
	    ''', (src_name, page_url, page_num, total_pages, html, json.dumps(data, ensure_ascii=False), 'completed'))
	    conn.commit()
	    return data['item_count']


	async def _scrape_source(conn, page, src_name, url):
	    """Scrape every page of one source and record its pagination state."""
	    await asyncio.sleep(2)
	    html = await _fetch_html(page, url)
	    soup = BeautifulSoup(html, 'html.parser')
	    total_pages = _count_pages(soup)

	    print(f" 📊 إجمالي الصفحات: {total_pages}")
	    total_items = _store_page(conn, src_name, page.url, 1, total_pages, html, soup)

	    for pgnum in range(2, total_pages + 1):
	        await asyncio.sleep(2)
	        # NOTE(review): most base URLs already carry a `page=` query
	        # parameter (index.php?page=entreprise...), so this appends a second
	        # one, and the `a[href*="page="]` selector above also matches such
	        # links — verify against the site's real pagination mechanism.
	        next_url = f"{url}&page={pgnum}" if '?' in url else f"{url}?page={pgnum}"
	        html = await _fetch_html(page, next_url)
	        total_items += _store_page(
	            conn, src_name, page.url, pgnum, total_pages, html,
	            BeautifulSoup(html, 'html.parser'),
	        )
	        print(f" ✅ صفحة {pgnum}")
	        if pgnum % 10 == 0:
	            backup_db()

	    # total_items is the number of extracted items summed over all pages.
	    conn.execute('''
	        INSERT OR REPLACE INTO pagination_state (source_name, last_page, total_items, completed, updated_at)
	        VALUES (?,?,?,1, CURRENT_TIMESTAMP)
	    ''', (src_name, total_pages, total_items))
	    conn.commit()


	async def scrape_all():
	    """Scrape every source in ``URLS`` and persist the results.

	    Initialises the database, launches headless Chromium, and for each
	    source: fetches the first page, derives the page count from its links,
	    stores every page (raw HTML file plus a ``scraped_data`` row), and
	    finally records the source in ``pagination_state``. A failure in one
	    source is logged, stored as an ``error`` row, and does not stop the
	    remaining sources. When all sources are done the database is backed up
	    and exported to CSV.

	    The SQLite connection and the browser are closed even if setup or the
	    loop raises.
	    """
	    print("🚀 بدء عملية الكشط الكاملة...")
	    init_db()
	    async with async_playwright() as p:
	        browser = await p.chromium.launch(
	            headless=True,
	            args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--single-process']
	        )
	        conn = None
	        try:
	            context = await browser.new_context(
	                viewport={'width': 1920, 'height': 1080},
	                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
	                locale='fr-FR',
	                timezone_id='Africa/Casablanca'
	            )
	            await context.add_init_script("""
	                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
	                window.chrome = {runtime: {}};
	            """)
	            conn = sqlite3.connect(DB_PATH)

	            for src_name, url in URLS.items():
	                print(f"\n📄 {src_name}")
	                page = await context.new_page()
	                try:
	                    await _scrape_source(conn, page, src_name, url)
	                except Exception as e:
	                    # Per-source boundary: record the failure and move on.
	                    print(f" ❌ خطأ: {e}")
	                    conn.execute('''
	                        INSERT INTO scraped_data (source_name, source_url, status, error_message)
	                        VALUES (?,?,?,?)
	                    ''', (src_name, url, 'error', str(e)))
	                    conn.commit()
	                finally:
	                    await page.close()
	        finally:
	            if conn is not None:
	                conn.close()
	            await browser.close()

	    backup_db()
	    export_all_csv()
	    print("✅ اكتمل الكشط.")

	if __name__ == "__main__":
	    # Run one complete scrape, then keep the process alive indefinitely by
	    # sleeping in one-hour steps (the message printed below announces this
	    # idle mode).
	    asyncio.run(scrape_all())
	    print("🔄 البرنامج في وضع السكون لمنع الإغلاق.")
	    idle_seconds = 3600
	    while True:
	        time.sleep(idle_seconds)