# Fichier: app.py
# ===================================================================================
# WAHIS SCRAPER - VERSION FINALE, SIMPLE ET ROBUSTE (FILTRES ET TABLEAU)
# ===================================================================================
import gradio as gr
import pandas as pd
import json
from datetime import datetime
from pathlib import Path
import warnings
import traceback
import asyncio
import subprocess
import zipfile
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async
# Browser installation (runs once at import time, before the app starts).
def install_playwright_browsers():
    """Install the Chromium browser bundle used by Playwright.

    Runs ``playwright install chromium`` as a subprocess and re-raises on
    failure so the app never starts without a usable browser.

    Raises:
        subprocess.CalledProcessError: if the installer exits non-zero.
        Exception: any other failure launching the installer.
    """
    print("Vérification et installation des navigateurs Playwright...")
    try:
        subprocess.run(["playwright", "install", "chromium"], capture_output=True, text=True, check=True)
        print("Installation de Chromium terminée avec succès.")
    except subprocess.CalledProcessError as e:
        # With capture_output=True the installer's stderr is swallowed by
        # default; print it so the real failure cause is visible in the logs.
        print(f"Erreur lors de l'installation de Chromium : {e}")
        if e.stderr:
            print(e.stderr)
        raise
    except Exception as e:
        print(f"Erreur lors de l'installation de Chromium : {e}")
        raise
# Module-level bootstrap: ensure Chromium is present before serving the UI.
install_playwright_browsers()
warnings.filterwarnings('ignore')
# Directory where the Excel exports and the downloadable ZIP are written.
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)
class WAHISScraper:
    """Scrapes outbreak data from the WAHIS (WOAH) site in three phases.

    Strategy: open the real site in a stealth-patched Chromium page to pass
    the Cloudflare challenge, then call the site's internal JSON APIs with
    ``fetch`` executed *inside* the page (via ``page.evaluate``) so requests
    reuse the cleared browser session.
    """
    # The scraper logic is stable and correct.

    def __init__(self) -> None:
        # Accumulated log lines (also echoed to stdout) shown in the UI.
        self.logs: list = []

    def log(self, message: str) -> None:
        """Append a timestamped message to the log buffer and print it."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        self.logs.append(f"[{timestamp}] {message}")
        print(message)

    async def run_extraction_async(self):
        """Run the three-phase extraction.

        Returns:
            ``(report_list, all_outbreaks_data, additional_info_data, logs)``
            on success, or ``(None, None, None, logs)`` on any failure —
            callers detect failure by the first element being falsy.
        """
        self.log("🚀 Lancement de l'extraction en trois phases...")
        async with async_playwright() as p:
            browser = None
            try:
                self.log("🔧 Lancement d'un navigateur Chromium...")
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()
                self.log("🕵️ Application du camouflage 'stealth'...")
                # Patch the page so it is harder to fingerprint as automation.
                await stealth_async(page)
                self.log("🌍 Visite de la page principale pour passer le challenge Cloudflare...")
                # Loading the real SPA first lets Cloudflare set its cookies;
                # generous 90s timeout because the challenge can be slow.
                await page.goto("https://wahis.woah.org/#/event-management", wait_until="networkidle", timeout=90000)
                self.log("🍪 Challenge Cloudflare réussi.")
                # Headers mimicking the site's own API calls.
                headers = { 'Content-Type': 'application/json', 'Accept': 'application/json', 'clientid': 'OIEwebsite', 'env': 'PRD', 'security-token': 'token', 'type': 'REQUEST' }
                self.log("--- PHASE 1 : Récupération de la liste des rapports ---")
                # Phase 1: latest 100 event reports, newest first.
                list_api_url = "https://wahis.woah.org/api/v1/pi/event/filtered-list?language=fr"
                payload_list = { "pageNumber": 1, "pageSize": 100, "sortColName": "REP_LAST_UPDATE", "sortColOrder": "DESC", "reportFilters": {}, "languageChanged": False }
                # fetch() runs inside the page so the Cloudflare session applies.
                list_response_json = await page.evaluate("async (args) => (await fetch(args.url, { method: 'POST', headers: args.headers, body: JSON.stringify(args.payload) })).json()", {'url': list_api_url, 'headers': headers, 'payload': payload_list})
                report_list = list_response_json.get('list', [])
                if not report_list:
                    raise Exception("Échec de la phase 1.")
                self.log(f"✅ Phase 1 réussie : {len(report_list)} rapports de base récupérés.")
                # Deduplicated, sorted event ids feed phase 2 (guard against
                # reports missing the 'eventId' key).
                unique_event_ids = sorted(list(set(item['eventId'] for item in report_list if 'eventId' in item)))
                self.log(f"--- PHASE 2 : Récupération des données GPS pour {len(unique_event_ids)} événements...")
                # Phase 2: outbreak locations (GPS) for all listed events.
                outbreaks_api_url = "https://wahis.woah.org/api/v1/pi/map-data/outbreaks-from-event-ids?language=fr"
                all_outbreaks_data = await page.evaluate("async (args) => (await fetch(args.url, { method: 'POST', headers: args.headers, body: JSON.stringify(args.payload) })).json()", {'url': outbreaks_api_url, 'headers': headers, 'payload': unique_event_ids})
                self.log(f"✅ Phase 2 réussie : {len(all_outbreaks_data)} foyers récupérés.")
                unique_outbreak_ids = sorted(list(set(item['outbreakId'] for item in all_outbreaks_data if 'outbreakId' in item)))
                self.log(f"--- PHASE 3 : Récupération des détails épidémiologiques pour {len(unique_outbreak_ids)} foyers...")
                # Phase 3: per-outbreak epidemiological details (skipped when
                # phase 2 yielded no outbreak ids).
                additional_info_data = []
                if unique_outbreak_ids:
                    additional_info_api_url = "https://wahis.woah.org/api/v1/pi/outbreak/additional-information"
                    additional_info_data = await page.evaluate("async (args) => (await fetch(args.url, { method: 'POST', headers: args.headers, body: JSON.stringify(args.payload) })).json()", {'url': additional_info_api_url, 'headers': headers, 'payload': unique_outbreak_ids})
                self.log(f"✅ Phase 3 réussie : {len(additional_info_data)} fiches de détails récupérées.")
                return report_list, all_outbreaks_data, additional_info_data, "\n".join(self.logs)
            except Exception as e:
                # Best-effort: log the traceback and signal failure via Nones
                # so the UI can still display the logs.
                self.log(f"❌ Une erreur critique est survenue.")
                self.log(traceback.format_exc())
                return None, None, None, "\n".join(self.logs)
            finally:
                if browser and browser.is_connected():
                    await browser.close()
def process_data_and_create_zip(reports, outbreaks, additional_infos):
    """Merge the three extraction phases and package the results as a ZIP.

    Enriches each outbreak with its parent event's disease/country, merges in
    the per-outbreak additional details, derives the dropdown filter choices,
    and writes two Excel sheets into a timestamped ZIP archive.

    Args:
        reports: list of event-report dicts from phase 1.
        outbreaks: list of outbreak dicts from phase 2 (mutated in place).
        additional_infos: list of detail dicts from phase 3.

    Returns:
        ``(df_outbreaks, country_choices, disease_choices, species_choices,
        zip_path_str)``; empty placeholders and ``None`` path when *reports*
        is empty.
    """
    if not reports:
        return pd.DataFrame(), [], [], [], None
    # Enrich outbreaks with disease/country names from their parent event.
    # Guard every key access: phase 1 payloads are external JSON and a report
    # missing 'eventId' (phase 2 already filters on that key) would otherwise
    # raise KeyError here.
    report_map = {
        report['eventId']: {'disease': report.get('disease'), 'country': report.get('country')}
        for report in reports if 'eventId' in report
    }
    for outbreak in outbreaks:
        event_info = report_map.get(outbreak.get('eventId'), {})
        outbreak['disease'] = event_info.get('disease')
        outbreak['country'] = event_info.get('country')
    # Phase 3 can return non-dict entries (e.g. error strings); keep dicts only.
    valid_additional_infos = [info for info in additional_infos if isinstance(info, dict)]
    additional_info_map = {info.get('outbreakId'): info for info in valid_additional_infos}
    for outbreak in outbreaks:
        outbreak_id = outbreak.get('outbreakId')
        if outbreak_id in additional_info_map:
            outbreak.update(additional_info_map[outbreak_id])
    df_outbreaks = pd.DataFrame(outbreaks)

    def _choices(column):
        # Sorted unique non-null values of a column, or [] if it is absent.
        return sorted(df_outbreaks[column].dropna().unique()) if column in df_outbreaks else []

    all_countries = _choices('country')
    all_diseases = _choices('disease')
    all_species = _choices('species')
    # Build the downloadable ZIP package.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    zip_path = OUTPUT_DIR / f"wahis_package_{timestamp}.zip"
    summary_path = OUTPUT_DIR / "1_summary_events.xlsx"
    details_path = OUTPUT_DIR / "2_outbreaks_full_details.xlsx"
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        pd.DataFrame(reports).to_excel(summary_path, index=False)
        df_outbreaks.to_excel(details_path, index=False)
        zipf.write(summary_path, arcname=summary_path.name)
        zipf.write(details_path, arcname=details_path.name)
    # Only the ZIP is surfaced to the user; drop the intermediate workbooks.
    for tmp in (summary_path, details_path):
        tmp.unlink(missing_ok=True)
    return df_outbreaks, ["Tous"] + all_countries, ["Toutes"] + all_diseases, ["Toutes"] + all_species, str(zip_path)
# --- Fonctions de l'interface Gradio ---
async def run_and_update_ui():
    """Run a full extraction and map the results onto the UI components.

    Returns a Gradio component→value mapping. On failure only the logs are
    shown and the results group stays hidden.
    """
    extractor = WAHISScraper()
    reports, outbreaks, extra_infos, log_text = await extractor.run_extraction_async()
    if not reports:
        return {status_textbox: log_text, ui_visibility_group: gr.Group(visible=False)}
    table_df, country_choices, disease_choices, species_choices, archive_path = process_data_and_create_zip(reports, outbreaks, extra_infos)
    updates = {status_textbox: log_text, outbreak_data_state: table_df}
    updates[filter_country] = gr.Dropdown(choices=country_choices, value="Tous")
    updates[filter_disease] = gr.Dropdown(choices=disease_choices, value="Toutes")
    updates[filter_species] = gr.Dropdown(choices=species_choices, value="Toutes")
    updates[outbreaks_table] = table_df
    updates[download_section] = gr.File(value=archive_path, visible=True)
    updates[ui_visibility_group] = gr.Group(visible=True)
    return updates
def update_table(country, disease, species, df_outbreaks):
    """Return a copy of *df_outbreaks* filtered by the three dropdowns.

    A selection equal to its wildcard ("Tous" / "Toutes") leaves that column
    unfiltered. Returns an empty DataFrame when no data is loaded yet.
    """
    if df_outbreaks is None or df_outbreaks.empty:
        return pd.DataFrame()
    result = df_outbreaks.copy()
    selections = (
        ('country', country, "Tous"),
        ('disease', disease, "Toutes"),
        ('species', species, "Toutes"),
    )
    for column, choice, wildcard in selections:
        if choice != wildcard:
            result = result[result[column] == choice]
    return result
# --- UI assembly: component creation order defines the page layout. ---
with gr.Blocks(theme=gr.themes.Soft(), title="WAHIS Scraper") as demo:
    # Holds the full (unfiltered) outbreaks DataFrame between callbacks.
    outbreak_data_state = gr.State()
    gr.Markdown("# 🤖 Scraper pour WAHIS (WOAH) - Version Robuste")
    run_button = gr.Button("🚀 Lancer l'extraction des données", variant="primary")
    # Results area, revealed only after a successful extraction.
    with gr.Group(visible=False) as ui_visibility_group:
        gr.Markdown("### 🔍 Filtrez les données")
        with gr.Row():
            filter_country = gr.Dropdown(label="Pays")
            filter_disease = gr.Dropdown(label="Maladie")
            filter_species = gr.Dropdown(label="Espèce")
        gr.Markdown("### 📋 Tableau Détaillé des Foyers (avec Coordonnées GPS)")
        outbreaks_table = gr.DataFrame(wrap=True)
    with gr.Accordion("Journal d'exécution et Téléchargement", open=False):
        status_textbox = gr.Textbox(lines=15, label="📜 Logs", interactive=False)
        download_section = gr.File(label="💾 Télécharger le Package Complet (.zip)")
    # run_and_update_ui returns a component->value dict, so the outputs list
    # only declares which components MAY be updated (order is irrelevant).
    run_button.click(
        fn=run_and_update_ui,
        inputs=[],
        outputs=[status_textbox, ui_visibility_group, outbreak_data_state, filter_country, filter_disease, filter_species, outbreaks_table, download_section]
    )
    # Re-filter the table whenever any of the three dropdowns changes.
    filters = [filter_country, filter_disease, filter_species]
    for f in filters:
        f.change(
            fn=update_table,
            inputs=[filter_country, filter_disease, filter_species, outbreak_data_state],
            outputs=[outbreaks_table]
        )
if __name__ == "__main__":
    demo.launch()