Spaces:
Runtime error
Runtime error
| import asyncio | |
| import sqlite3 | |
| import json | |
| import os | |
| import time | |
| from datetime import datetime | |
| from pathlib import Path | |
| from playwright.async_api import async_playwright | |
| from bs4 import BeautifulSoup | |
| import pandas as pd | |
# ========== Settings ==========
# SQLite database file the scraper reads from and writes to.
DB_PATH = "/tmp/marches_publics.db"
# Root folder that receives the database backup and the CSV exports.
DATA_DIR = Path("/app/data")
# Raw HTML pages are saved below this folder, one subfolder per source.
RAW_DIR = DATA_DIR / "raw"
# Runs at import time; parents=True means DATA_DIR is created as well.
RAW_DIR.mkdir(parents=True, exist_ok=True)
# ========== All source URLs ==========
# Shared URL prefixes; the mapping below only spells out what differs.
_PORTAL = "https://www.marchespublics.gov.ma"
_INDEX = _PORTAL + "/index.php?page=entreprise."
_SEARCH = _INDEX + "EntrepriseAdvancedSearch&"
_ANNUAL = _INDEX + "ListeAnnonceAnnuelle&typeAnnonce="
_SEARCH_CONSULTATIONS = _SEARCH + "searchAnnCons"
_SEARCH_CANCELLED = _SEARCH_CONSULTATIONS + "&consAnnulee=1"
_SEARCH_ALL = _SEARCH + "AllAnn"

# Source name -> entry URL. Insertion order is the order sources are scraped.
# NOTE(review): several names resolve to the same URL (every `AllAnn` entry and
# both members of each consultation pair) - confirm that this is intended.
URLS = {
    "consultations_en_cours": _SEARCH_CONSULTATIONS,
    "recherche_avancee_consultations": _SEARCH_CONSULTATIONS,
    "avis_achat_en_cours": _PORTAL + "/bdc/entreprise/consultation/",
    "bons_commande_attribues": _ANNUAL + "1",
    "toutes_annonces_info": _SEARCH + "AvisInformation",
    "tous_extraits_pv": _SEARCH_ALL,
    "tous_resultats_definitifs": _SEARCH_ALL,
    "tous_rapports_achevement": _SEARCH_ALL,
    "tous_rapports_presentation": _SEARCH + "AvisRapportPresentation",
    "toutes_decisions_resiliation": _SEARCH_ALL,
    "recherche_avancee_toutes": _SEARCH_ALL,
    "programme_previsionnel": _INDEX + "ListePPs",
    "synthese_rapport_audit": _INDEX + "ListeSRA",
    "marches_attribues": _ANNUAL + "2",
    "conventions_contrats": _ANNUAL + "3",
    "consultations_annulees": _SEARCH_CANCELLED,
    "recherche_avancee_annulees": _SEARCH_CANCELLED,
}
# ========== Database functions ==========
def init_db():
    """Create the scraper's SQLite schema if it does not exist yet.

    Opens the database at ``DB_PATH`` and makes sure the ``scraped_data`` and
    ``pagination_state`` tables and the two indexes on ``scraped_data`` exist.
    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    harmless.

    Raises:
        sqlite3.Error: if the database cannot be opened or a statement fails.
    """
    statements = (
        '''
        CREATE TABLE IF NOT EXISTS scraped_data (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            source_name TEXT NOT NULL,
            source_url TEXT NOT NULL,
            scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            page_number INTEGER,
            total_pages INTEGER,
            raw_html TEXT,
            parsed_data JSON,
            status TEXT DEFAULT 'pending',
            error_message TEXT
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS pagination_state (
            source_name TEXT PRIMARY KEY,
            last_page INTEGER DEFAULT 0,
            total_items INTEGER,
            completed BOOLEAN DEFAULT 0,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        ''',
        'CREATE INDEX IF NOT EXISTS idx_source ON scraped_data(source_name)',
        'CREATE INDEX IF NOT EXISTS idx_status ON scraped_data(status)',
    )
    conn = sqlite3.connect(DB_PATH)
    try:
        for statement in statements:
            conn.execute(statement)
        conn.commit()
    finally:
        # Close even when a statement fails so the connection is not leaked.
        conn.close()
def backup_db():
    """Copy the SQLite file at ``DB_PATH`` into ``DATA_DIR`` as a backup.

    The copy is best-effort: any failure is printed and swallowed so that a
    failed backup never interrupts the caller.
    """
    from shutil import copy2

    try:
        backup_target = DATA_DIR / "marches_publics_backup.db"
        copy2(DB_PATH, backup_target)
        print("🗄️ Database backed up")
    except Exception as exc:
        print(f"❌ Backup error: {exc}")
def export_all_csv():
    """Export the completed scrape rows to one CSV file per source.

    For every source name in ``URLS`` the rows of ``scraped_data`` with status
    ``'completed'`` are loaded, the ``items`` list of each row's JSON payload
    is flattened, and each item is tagged with ``source``, ``page`` and
    ``scraped_at``. Sources that yield no items produce no file. Files are
    written to ``DATA_DIR`` as ``<source>_<YYYYmmdd_HHMMSS>.csv`` with a UTF-8
    BOM.

    A row whose payload cannot be parsed is reported and skipped instead of
    aborting the export.
    """
    query = (
        "SELECT source_name, page_number, parsed_data, scraped_at "
        "FROM scraped_data WHERE source_name = ? AND status = 'completed'"
    )
    conn = sqlite3.connect(DB_PATH)
    try:
        for source in URLS:
            df = pd.read_sql_query(query, conn, params=(source,))
            if df.empty:
                continue

            records = []
            for _, row in df.iterrows():
                try:
                    payload = json.loads(row['parsed_data'])
                    for item in payload.get('items', []):
                        item['source'] = row['source_name']
                        item['page'] = row['page_number']
                        item['scraped_at'] = row['scraped_at']
                        records.append(item)
                except (TypeError, ValueError, AttributeError) as exc:
                    # NULL payload, invalid JSON, or a payload/item that is
                    # not a dict: skip the row but say so.
                    print(f"⚠️ Skipped unreadable row ({source}, page {row['page_number']}): {exc}")

            if not records:
                continue

            out_df = pd.json_normalize(records)
            filename = f"{source}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            out_df.to_csv(DATA_DIR / filename, index=False, encoding='utf-8-sig')
            print(f"📁 Exported: {filename}")
    finally:
        # Release the connection even if pandas or the file system fails.
        conn.close()
# ========== Data extraction ==========
def extract_data(soup, source_name):
    """Extract table rows (or result blocks as a fallback) from a parsed page.

    The first row of every ``<table>`` supplies the column names; each later
    row becomes a dict keyed by those names. A cell containing links is stored
    as ``{'text', 'href', 'links'}`` (``href`` is the first link), any other
    cell as its stripped text. When no table row is found at all, every
    ``<div>`` whose class contains ``result`` or ``item`` (case-insensitive)
    is stored as ``{'text': ...}``.

    Cells are never dropped because of their header: a cell whose header is
    missing or empty is keyed ``col_<index>``, and a repeated header gets the
    column index appended, so two columns cannot overwrite each other.

    Args:
        soup: parsed document exposing the BeautifulSoup ``find_all`` API.
        source_name: label copied into the result under ``'source'``.

    Returns:
        dict with ``source``, ``extracted_at`` (ISO timestamp), ``item_count``
        and ``items`` (the list of extracted dicts).
    """
    data = []
    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        if not rows:
            continue
        headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]
        for row in rows[1:]:
            row_data = {}
            for i, cell in enumerate(row.find_all(['td', 'th'])):
                key = headers[i] if i < len(headers) and headers[i] else f'col_{i}'
                # Disambiguate repeated headers instead of overwriting a cell.
                while key in row_data:
                    key = f'{key}_{i}'
                links = cell.find_all('a')
                if links:
                    row_data[key] = {
                        'text': cell.get_text(strip=True),
                        'href': links[0].get('href'),
                        'links': [a.get('href') for a in links],
                    }
                else:
                    row_data[key] = cell.get_text(strip=True)
            if row_data:
                data.append(row_data)

    if not data:
        # Fallback for pages that render results as <div> blocks, not tables.
        blocks = soup.find_all(
            'div',
            class_=lambda x: x and ('result' in x.lower() or 'item' in x.lower()),
        )
        data.extend({'text': block.get_text(strip=True)} for block in blocks)

    return {
        'source': source_name,
        'extracted_at': datetime.now().isoformat(),
        'item_count': len(data),
        'items': data,
    }
# ========== Scraping functions ==========
async def advanced_search_with_retry(page, url, max_retries=2):
    """Open an advanced-search page, submit its form, and return the HTML.

    Each attempt navigates to ``url``. If a submit control is found it is
    clicked and the function waits for a results container; otherwise it just
    pauses five seconds. A failed attempt is printed and followed by a
    two-second sleep before the next one. Once every attempt has failed, the
    page's current content is returned as-is.
    """
    submit_selector = 'input[type=submit], button[type=submit], input#rechercher, a.rechercher, form[method=post] input[type=submit]'
    results_selector = 'table, #contenu, .resultats, .liste, .annonce'

    for attempt_no in range(1, max_retries + 1):
        try:
            print(f" ⟳ Attempt {attempt_no}...")
            await page.goto(url, wait_until='networkidle', timeout=60000)
            submit_controls = page.locator(submit_selector)
            if await submit_controls.count() == 0:
                # No form to submit: give the page time to render on its own.
                await page.wait_for_timeout(5000)
            else:
                await submit_controls.first.click()
                await page.wait_for_selector(results_selector, timeout=15000)
            return await page.content()
        except Exception as err:
            print(f" ⚠️ محاولة {attempt_no} فشلت: {err}")
            await asyncio.sleep(2)

    # All attempts failed: hand back whatever the page currently holds.
    return await page.content()
async def scrape_direct_list(page, url):
    """Load a page that shows its results directly and return its HTML.

    Navigates to ``url``, waits for a results container to appear, pauses two
    more seconds, then returns the page content. Navigation or wait failures
    propagate to the caller.
    """
    results_selector = 'table, #contenu, .resultats, .liste'
    await page.goto(url, wait_until='networkidle', timeout=60000)
    await page.wait_for_selector(results_selector, timeout=20000)
    # Extra settle time after the container is present.
    await page.wait_for_timeout(2000)
    html = await page.content()
    return html
# ========== Saving a page ==========
def save_page(source, page_num, html):
    """Write one page of raw HTML to ``RAW_DIR/<source>/page_<page_num>.html``.

    The per-source folder is created on first use and an existing file for
    the same page is overwritten. The text is written as UTF-8.
    """
    file_name = f"page_{page_num}.html"
    source_dir = RAW_DIR / source
    source_dir.mkdir(exist_ok=True)
    target = source_dir / file_name
    target.write_text(html, encoding='utf-8')
    print(f" 💾 حفظ {source}/{file_name}")
# ========== Main scraping loop ==========
# URL fragments of pages that list their results without a search submit.
_DIRECT_LIST_MARKERS = ("ListeAnnonceAnnuelle", "ListePPs", "ListeSRA")

_INSERT_PAGE_SQL = '''
    INSERT INTO scraped_data (source_name, source_url, page_number, total_pages, raw_html, parsed_data, status)
    VALUES (?,?,?,?,?,?,?)
'''


async def _fetch_html(page, url):
    """Fetch ``url`` with the scraper that matches its page type."""
    if any(marker in url for marker in _DIRECT_LIST_MARKERS):
        return await scrape_direct_list(page, url)
    return await advanced_search_with_retry(page, url)


def _count_pages(soup):
    """Return the largest numeric pager label found in ``soup``, or 1 if none."""
    # NOTE(review): every index.php URL in URLS already carries a `page=`
    # query parameter, so this selector may also match ordinary navigation
    # links - verify the pager markup against the live site.
    pager_links = soup.select('a[href*="page="]')
    return max((int(a.text) for a in pager_links if a.text.isdigit()), default=1)


def _record_page(conn, src_name, page_url, page_num, total_pages, html, soup):
    """Save one page to disk and to ``scraped_data``; return its item count."""
    save_page(src_name, page_num, html)
    data = extract_data(soup, src_name)
    conn.execute(
        _INSERT_PAGE_SQL,
        (src_name, page_url, page_num, total_pages, html,
         json.dumps(data, ensure_ascii=False), 'completed'),
    )
    conn.commit()
    return data['item_count']


async def _scrape_source(conn, page, src_name, url):
    """Scrape every page of one source and record its pagination state."""
    await asyncio.sleep(2)
    html = await _fetch_html(page, url)
    soup = BeautifulSoup(html, 'html.parser')
    total_pages = _count_pages(soup)
    print(f" 📊 إجمالي الصفحات: {total_pages}")
    total_items = _record_page(conn, src_name, page.url, 1, total_pages, html, soup)

    for pgnum in range(2, total_pages + 1):
        await asyncio.sleep(2)
        # NOTE(review): for index.php URLs this appends a second `page=`
        # parameter next to the routing one - confirm the site pages this way.
        next_url = f"{url}&page={pgnum}" if '?' in url else f"{url}?page={pgnum}"
        html = await _fetch_html(page, next_url)
        total_items += _record_page(
            conn, src_name, page.url, pgnum, total_pages, html,
            BeautifulSoup(html, 'html.parser'),
        )
        print(f" ✅ صفحة {pgnum}")
        if pgnum % 10 == 0:
            backup_db()

    # total_items is the number of extracted items, not the page count.
    conn.execute('''
        INSERT OR REPLACE INTO pagination_state (source_name, last_page, total_items, completed, updated_at)
        VALUES (?,?,?,1, CURRENT_TIMESTAMP)
    ''', (src_name, total_pages, total_items))
    conn.commit()


async def scrape_all():
    """Scrape every source in ``URLS``, store the results, then back up and export.

    Initialises the database, launches headless Chromium and processes the
    sources one after another, each in its own browser page. Every fetched
    page is saved as raw HTML and inserted into ``scraped_data``; a source
    that fails is recorded with status ``'error'`` and the run continues with
    the next one. The browser and the database connection are released even
    if an unexpected error escapes the per-source handling. After the loop
    the database is backed up and the CSV exports are written.
    """
    print("🚀 بدء عملية الكشط الكاملة...")
    init_db()
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--single-process']
        )
        try:
            context = await browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                locale='fr-FR',
                timezone_id='Africa/Casablanca'
            )
            # Runs in every page before the site's own scripts.
            await context.add_init_script("""
                Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
                window.chrome = {runtime: {}};
            """)
            conn = sqlite3.connect(DB_PATH)
            try:
                for src_name, url in URLS.items():
                    print(f"\n📄 {src_name}")
                    page = await context.new_page()
                    try:
                        await _scrape_source(conn, page, src_name, url)
                    except Exception as e:
                        # Best effort: log the failure and move on.
                        print(f" ❌ خطأ: {e}")
                        conn.execute('''
                            INSERT INTO scraped_data (source_name, source_url, status, error_message)
                            VALUES (?,?,?,?)
                        ''', (src_name, url, 'error', str(e)))
                        conn.commit()
                    finally:
                        await page.close()
            finally:
                conn.close()
        finally:
            await browser.close()
    backup_db()
    export_all_csv()
    print("✅ اكتمل الكشط.")
if __name__ == "__main__":
    # Run one complete scrape before anything else.
    asyncio.run(scrape_all())
    print("🔄 البرنامج في وضع السكون لمنع الإغلاق.")
    # Sleep forever in one-hour steps so the process does not exit once the
    # scrape has finished (the message above announces this idle mode).
    while True:
        time.sleep(3600)