Spaces:

MMOON
/

MLRSTREAMLIT

Sleeping

App Files Files Community

MLRSTREAMLIT / app.py

MMOON

Update app.py

6365d52 verified 9 months ago

raw

history blame contribute delete

21.8 kB

	import io
	import json
	import logging
	import os
	import tempfile
	import zipfile
	# import time # Non utilisé directement
	# import hashlib # Non utilisé directement
	from collections import defaultdict
	from datetime import datetime, timedelta
	from typing import Dict, List, Optional, Any # Tuple n'est pas explicitement utilisé mais bon à garder

	import pandas as pd
	import plotly.express as px
	import requests
	import streamlit as st
	from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential

	# Streamlit page configuration
	st.set_page_config(page_title="Pesticide Data Explorer - Optimized", page_icon="🌿", layout="wide")

	# Logging configuration.
	# logging.basicConfig() is a no-op once the root logger has handlers, so the
	# handlers are only built when they will actually be installed. This avoids
	# re-opening the log file every time the script is executed again.
	if not logging.getLogger().handlers:
	    _log_handlers = [logging.StreamHandler()]
	    _log_file_error = None
	    try:
	        _log_handlers.insert(
	            0,
	            logging.FileHandler("pesticide_app_optimized.log", mode='a', encoding="utf-8"),
	        )
	    except OSError as exc:
	        # The working directory may not be writable; keep console logging
	        # instead of aborting the whole application at import time.
	        _log_file_error = exc
	    logging.basicConfig(
	        level=logging.INFO,
	        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
	        handlers=_log_handlers,
	    )
	    if _log_file_error is not None:
	        logging.getLogger(__name__).warning(
	            "Log file unavailable, logging to console only: %s", _log_file_error
	        )
	logger = logging.getLogger(__name__)

	class PesticideDataFetcher:
	    """HTTP client for the pesticides endpoints rooted at ``BASE_URL``.

	    All requests go through one ``requests.Session`` carrying ``HEADERS``.
	    ``api_calls`` counts every HTTP attempt made through this instance,
	    retries included.
	    """

	    BASE_URL = "https://api.datalake.sante.service.ec.europa.eu/sante/pesticides"
	    HEADERS = {
	        "Content-Type": "application/json",
	        "Cache-Control": "no-cache",
	        "User-Agent": "StreamlitPesticideApp/1.2 (compatible; Mozilla/5.0)"
	    }
	    # Only connection-level failures are retried; HTTP error statuses and
	    # malformed payloads are reported immediately.
	    _TRANSIENT_ERRORS = (requests.exceptions.ConnectionError, requests.exceptions.Timeout)

	    def __init__(self):
	        self.session = requests.Session()
	        self.session.headers.update(self.HEADERS)
	        self.api_calls = 0

	    def download_data(self, endpoint: str, params: Dict, stream_large_json: bool = False) -> Optional[Any]:
	        """Download ``endpoint`` and return its decoded content.

	        Args:
	            endpoint: Path appended to ``BASE_URL``.
	            params: Query parameters; ``params['format']`` steers decoding.
	            stream_large_json: When true and the requested format is JSON,
	                the body is streamed to a temporary file before parsing.

	        Returns:
	            Parsed JSON, the response text (CSV or unrecognised content
	            types), the first member of a ZIP archive, or ``None`` when the
	            download or the decoding failed. Failures are logged, never
	            raised.
	        """
	        url = f"{self.BASE_URL}{endpoint}"
	        try:
	            # The retry policy lives on _fetch: it must see the exceptions,
	            # which is impossible if they are swallowed in the same function.
	            return self._fetch(url, params, stream_large_json)
	        except requests.exceptions.Timeout as e:
	            logger.error(f"Timeout: {url} - {e}")
	        except json.JSONDecodeError as e:
	            # Checked before RequestException: the decode error raised by
	            # recent requests versions derives from both classes.
	            logger.error(f"Erreur décodage JSON: {url} - {e}")
	        except requests.RequestException as e:
	            logger.error(f"Erreur requête: {url} - {e}")
	            if hasattr(e, 'response') and e.response is not None:
	                logger.error(f"Status: {e.response.status_code}, Réponse: {e.response.text[:200]}...")
	        except Exception as e:
	            logger.error(f"Erreur inattendue (download_data) pour {url}: {e}", exc_info=True)
	        return None

	    @retry(
	        stop=stop_after_attempt(3),
	        wait=wait_exponential(multiplier=2, min=4, max=30),
	        retry=retry_if_exception_type(_TRANSIENT_ERRORS),
	        reraise=True,
	    )
	    def _fetch(self, url: str, params: Dict, stream_large_json: bool) -> Optional[Any]:
	        """Perform one download attempt; raises on failure so it can be retried."""
	        self.api_calls += 1
	        logger.info(f"Téléchargement: {url} | Params: {params} | API Call #{self.api_calls}")

	        if stream_large_json and params.get('format') == 'json':
	            return self._fetch_streamed_json(url, params)

	        response = self.session.get(url, params=params, timeout=(10, 180))
	        response.raise_for_status()
	        content_type = response.headers.get('Content-Type', '')

	        if 'json' in content_type or params.get('format') == 'json':
	            return response.json()
	        if 'csv' in content_type or params.get('format') == 'csv':
	            return response.text
	        if 'zip' in content_type:
	            return self._read_first_zip_member(response.content, url)

	        logger.warning(f"Type contenu non géré: {content_type} pour {url}. Retour texte.")
	        return response.text

	    def _fetch_streamed_json(self, url: str, params: Dict) -> Any:
	        """Stream the response body to a temporary file, then parse it as JSON."""
	        logger.info(f"Mode streaming activé pour {url}")
	        temp_file_path = None
	        try:
	            with self.session.get(url, params=params, timeout=(10, 300), stream=True) as r:
	                r.raise_for_status()
	                # Raw bytes are written as-is: decoding each chunk on its own
	                # would corrupt multi-byte UTF-8 characters that straddle a
	                # chunk boundary.
	                with tempfile.NamedTemporaryFile(mode='wb', delete=False, suffix=".json") as tmp_file:
	                    temp_file_path = tmp_file.name
	                    logger.info(f"Sauvegarde streamée vers: {temp_file_path}")
	                    for chunk in r.iter_content(chunk_size=1024*1024):  # 1MB chunks
	                        if chunk:
	                            tmp_file.write(chunk)
	            logger.info(f"Sauvegarde streamée terminée: {temp_file_path}")

	            logger.info(f"Lecture JSON depuis: {temp_file_path}")
	            with open(temp_file_path, 'r', encoding='utf-8', errors='replace') as f:
	                return json.load(f)
	        except json.JSONDecodeError:
	            # Log the head of the payload while the file still exists, then
	            # let the caller report the decoding error itself.
	            logger.error(f"Contenu début fichier temp ({temp_file_path}):")
	            try:
	                with open(temp_file_path, 'r', encoding='utf-8', errors='replace') as f_err:
	                    logger.error(f_err.read(1000))
	            except Exception as read_err:
	                logger.error(f"Impossible lire fichier temp: {read_err}")
	            raise
	        finally:
	            if temp_file_path and os.path.exists(temp_file_path):
	                try:
	                    os.remove(temp_file_path)
	                    logger.info(f"Fichier temp supprimé: {temp_file_path}")
	                except OSError as e_os:
	                    logger.error(f"Impossible supprimer fichier temp {temp_file_path}: {e_os}")

	    @staticmethod
	    def _read_first_zip_member(payload: bytes, url: str) -> Optional[Any]:
	        """Decode the first member of a ZIP payload (parsed when it is ``.json``)."""
	        with zipfile.ZipFile(io.BytesIO(payload)) as zf:
	            members = zf.namelist()
	            if not members:
	                logger.error(f"Fichier ZIP vide: {url}")
	                return None
	            filename = members[0]
	            with zf.open(filename) as f_zip:
	                content = f_zip.read().decode('utf-8', errors='replace')
	        return json.loads(content) if filename.endswith('.json') else content

	    def get_products_paginated(self, language: str = 'FR', max_pages: int = 30) -> List[Dict]:
	        """Collect the product list by following ``nextLink`` page after page.

	        Args:
	            language: Value sent as the ``language`` query parameter.
	            max_pages: Upper bound on the number of pages requested.

	        Returns:
	            The concatenated page items. The list may be partial: a request
	            or decoding error stops the pagination, and so does reaching
	            ``max_pages``.
	        """
	        all_products = []
	        url_base = f"{self.BASE_URL}/pesticide_residues_products"
	        params_initial = {'format': 'json', 'language': language, 'api-version': 'v2.0'}
	        current_url = url_base
	        page_num = 0

	        while current_url and page_num < max_pages:
	            self.api_calls += 1
	            page_num += 1
	            # Query parameters are only sent with the first request; the
	            # nextLink URLs returned afterwards are requested as-is.
	            params_req = params_initial if current_url == url_base else None
	            logger.info(f"Produits page {page_num} depuis {current_url} (API global #{self.api_calls})")
	            try:
	                resp = self.session.get(current_url, params=params_req, timeout=(10, 45))
	                resp.raise_for_status()
	                data_page = resp.json()
	            except json.JSONDecodeError as e:
	                # Only resp.json() raises this, so resp is bound here. Checked
	                # first because the decode error of recent requests versions
	                # is also a RequestException.
	                logger.error(f"Erreur JSON produits page {current_url}: {e}. Reçu: {resp.text[:100]}")
	                break
	            except requests.RequestException as e:
	                logger.error(f"Erreur requête produits page {current_url}: {e}")
	                break

	            if isinstance(data_page, dict):
	                items = data_page.get('value', [])
	                current_url = data_page.get('nextLink')
	            else:
	                items = data_page if isinstance(data_page, list) else []
	                current_url = None
	            if isinstance(items, list):
	                all_products.extend(items)
	            if not current_url:
	                logger.info("Fin pagination produits.")

	        if page_num >= max_pages and current_url:
	            logger.warning(f"Limite pagination ({max_pages}) produits atteinte. Données potentiellement tronquées.")
	        logger.info(f"Récupéré {len(all_products)} produits ({page_num} pages).")
	        return all_products

	@st.cache_data(ttl=86400, show_spinner="Chargement initial des données de référence...")
	def download_all_data() -> Dict[str, Any]:
	    """Download substances, MRL records and products, and bundle them.

	    Returns:
	        A dict with the keys ``substances`` (id -> name), ``mrls`` (list of
	        record dicts), ``products`` (list of product dicts), ``product_dict``
	        (id -> name) and ``stats`` (counters plus the download timestamp).

	    NOTE(review): a failed download leaves its entry empty and the result is
	    still cached for the TTL set on the decorator.
	    """

	    def unwrap(payload):
	        """Return the ``value`` member of a dict payload, else the payload itself."""
	        if isinstance(payload, dict):
	            return payload.get('value', [])
	        return payload

	    def name_lookup(rows, id_key, name_key):
	        """Map id -> name over the dict rows that carry both fields."""
	        lookup = {}
	        for row in rows:
	            if isinstance(row, dict) and row.get(id_key) and row.get(name_key):
	                lookup[row[id_key]] = row[name_key]
	        return lookup

	    client = PesticideDataFetcher()
	    results = {'substances': {}, 'mrls': [], 'products': [], 'product_dict': {}, 'stats': {}}

	    with st.status("Initialisation du téléchargement...", expanded=True) as status_bar:
	        # Step 1: active substances.
	        status_bar.update(label="📥 Substances actives...")
	        substances_payload = client.download_data(
	            "/active_substances/download",
	            {"format": "json", "api-version": "v2.0"},
	        )
	        if substances_payload:
	            substance_rows = unwrap(substances_payload)
	            if isinstance(substance_rows, list):
	                results['substances'] = name_lookup(substance_rows, 'substance_id', 'substance_name')
	            logger.info(f"✓ {len(results['substances'])} substances.")

	        # Step 2: MRL records, downloaded in streaming mode.
	        status_bar.update(label="📥 Enregistrements LMR (volumineux)...")
	        mrl_payload = client.download_data(
	            "/pesticide_residues_mrls/download",
	            {"format": "json", "language": "FR", "api-version": "v2.0"},
	            stream_large_json=True,
	        )
	        if mrl_payload:
	            mrl_rows = unwrap(mrl_payload)
	            if isinstance(mrl_rows, list):
	                results['mrls'] = [row for row in mrl_rows if isinstance(row, dict)]
	            logger.info(f"✓ {len(results['mrls'])} LMRs.")

	        # Step 3: products, fetched page by page.
	        status_bar.update(label="📥 Produits alimentaires...")
	        product_rows = client.get_products_paginated(language='FR')
	        if isinstance(product_rows, list):
	            results['products'] = product_rows
	            results['product_dict'] = name_lookup(product_rows, 'product_id', 'product_name')
	            logger.info(f"✓ {len(results['products'])} produits.")

	        results['stats'] = {
	            'api_calls': client.api_calls,
	            'substances_count': len(results['substances']),
	            'mrls_count': len(results['mrls']),
	            'products_count': len(results['products']),
	            'download_time': datetime.now().strftime('%d/%m/%Y %H:%M:%S'),
	        }
	        status_bar.update(
	            label=f"✅ Données chargées! ({results['stats']['download_time']})",
	            state="complete",
	            expanded=False,
	        )
	    return results

	class PesticideInterface:
	    """Streamlit front end over the downloaded reference data.

	    Attributes set at construction:
	        data: the dict returned by ``download_all_data()``.
	        mrls_by_product: product id -> list of MRL record dicts.
	        product_choices: product name -> product id.
	    """

	    def __init__(self):
	        self.data = download_all_data()
	        self._create_indexes()

	    def _create_indexes(self):
	        """Build the per-product MRL index and the name -> id product mapping."""
	        self.mrls_by_product = defaultdict(list)
	        for mrl_item in self.data.get('mrls', []):
	            if isinstance(mrl_item, dict) and mrl_item.get('product_id'):
	                self.mrls_by_product[mrl_item['product_id']].append(mrl_item)
	        self.product_choices = {
	            p_item['product_name']: p_item['product_id']
	            for p_item in self.data.get('products', [])
	            if isinstance(p_item, dict) and p_item.get('product_name') and p_item.get('product_id')
	        }
	        logger.info(f"Index créés: {len(self.mrls_by_product)} produits avec LMR.")

	    def get_product_details(self, product_names_sel: List[str], future_only_flag: bool = False) -> pd.DataFrame:
	        """Return the MRL rows of the selected products as a display table.

	        Args:
	            product_names_sel: Product names; unknown names are ignored.
	            future_only_flag: When true, keep only rows whose application
	                date lies strictly after now and at most 180 days ahead.

	        Returns:
	            A DataFrame with the columns ``Produit``, ``Substance``,
	            ``Valeur LMR``, ``Date d'application`` and ``Lien Règlement``,
	            sorted by product then by date (most recent first); an empty
	            DataFrame when nothing matches.
	        """
	        sel_ids = [self.product_choices[name] for name in product_names_sel if name in self.product_choices]
	        if not sel_ids:
	            return pd.DataFrame()

	        mrls_list = [mrl for pid_sel in sel_ids for mrl in self.mrls_by_product.get(pid_sel, [])]
	        if not mrls_list:
	            return pd.DataFrame()

	        df_data = pd.DataFrame(mrls_list)
	        if df_data.empty:
	            return pd.DataFrame()

	        def column_or_missing(col_name):
	            # Records are plain dicts, so a field may be absent from every
	            # row; an all-missing column keeps the pipeline below uniform.
	            if col_name in df_data.columns:
	                return df_data[col_name]
	            return pd.Series([None] * len(df_data), index=df_data.index, dtype="object")

	        df_data["Substance"] = column_or_missing("pesticide_residue_id").map(self.data.get('substances', {})).fillna("Inconnue")
	        df_data["Produit"] = column_or_missing("product_id").map(self.data.get('product_dict', {})).fillna("Inconnu")

	        def format_reg_link(row_data):
	            url_val, num_val = row_data.get("regulation_url"), row_data.get("regulation_number", "N/A")
	            if not isinstance(num_val, str) and pd.isna(num_val):
	                num_val = "N/A"  # do not leak NaN/None into the link column
	            if pd.notna(url_val) and str(url_val).strip().lower().startswith('http'):
	                return f"[{num_val}]({url_val})"
	            return num_val
	        df_data["Lien Règlement"] = df_data.apply(format_reg_link, axis=1)

	        df_data["Date d'application"] = pd.to_datetime(column_or_missing("entry_into_force_date"), errors="coerce")

	        if future_only_flag:
	            ts_now_utc = pd.Timestamp.now(tz='UTC')
	            ts_future_utc = ts_now_utc + timedelta(days=180)
	            # utc=True localizes naive values as UTC and converts aware ones,
	            # and also copes with a column that is not datetime-typed.
	            dates_utc = pd.to_datetime(df_data["Date d'application"], errors="coerce", utc=True)
	            in_window = dates_utc.notna() & (dates_utc > ts_now_utc) & (dates_utc <= ts_future_utc)
	            # Copy so that the column assignment below does not write into a
	            # slice of the unfiltered frame.
	            df_data = df_data[in_window].copy()
	            if df_data.empty:
	                return pd.DataFrame()

	        df_data["Valeur LMR"] = pd.to_numeric(column_or_missing("mrl_value"), errors='coerce')

	        cols_final = [c for c in ["Produit", "Substance", "Valeur LMR", "Date d'application", "Lien Règlement"] if c in df_data.columns]
	        df_data = df_data[cols_final].copy()

	        sort_order_cols = ["Produit"]
	        sort_asc = [True]
	        if "Date d'application" in df_data.columns:
	            sort_order_cols.append("Date d'application")
	            sort_asc.append(False)
	        return df_data.sort_values(by=sort_order_cols, ascending=sort_asc, na_position='last')

	    def create_interface(self):
	        """Render the page: metrics, product selector, result table, charts, CSV export."""
	        st.title("🌿 EU Pesticides Database Explorer")
	        app_stats = self.data.get('stats', {})
	        m_col1, m_col2, m_col3, m_col4 = st.columns(4)
	        with m_col1:
	            st.metric("📦 Produits", f"{app_stats.get('products_count', 0):,}")
	        with m_col2:
	            st.metric("🧪 Substances", f"{app_stats.get('substances_count', 0):,}")
	        with m_col3:
	            st.metric("📊 Enregistrements LMR", f"{app_stats.get('mrls_count', 0):,}")
	        with m_col4:
	            st.metric("📞 Appels API", app_stats.get('api_calls', 0))
	        st.caption(f"Données de référence chargées ({app_stats.get('download_time', 'N/A')}).")
	        st.markdown("---")

	        ui_col1, ui_col2 = st.columns([3, 1])
	        with ui_col1:
	            opts_prods = sorted(self.product_choices)
	            sel_prods_names = st.multiselect("🔍 Sélectionnez produit(s)", options=opts_prods, placeholder="Commencez à taper...")
	        with ui_col2:
	            sel_future_only = st.checkbox("📅 Changements futurs (6 mois)", value=False, help="Nouveaux LMR ou modifications prévues.")

	        if not sel_prods_names:
	            st.info("👆 Sélectionnez un ou plusieurs produits pour afficher leurs LMR.")
	            return

	        df_res = self.get_product_details(sel_prods_names, sel_future_only)
	        if df_res.empty:
	            # No trailing period here: the sentence is completed below.
	            info_msg = "Aucun changement LMR prévu" if sel_future_only else "Aucune donnée LMR trouvée"
	            st.info(f"{info_msg} pour la sélection actuelle.")
	            return

	        st.markdown("### 📊 Résultats des LMR")
	        df_lmr_num = df_res[df_res["Valeur LMR"].notna()]
	        disp_col1, disp_col2 = st.columns(2)
	        with disp_col1:
	            st.metric("Lignes affichées", len(df_res))
	        with disp_col2:
	            st.metric("Substances uniques", df_lmr_num["Substance"].nunique() if not df_lmr_num.empty else 0)

	        opt_sort_by = None
	        opt_sort_dir = "Croissant"
	        with st.expander("⚙️ Options d'affichage", expanded=False):
	            opt_show_low_mrl = st.checkbox("Inclure LMR < 0.01 mg/kg", value=True)
	            opts_sort = [c for c in ["Produit", "Substance", "Valeur LMR", "Date d'application"] if c in df_res.columns]
	            if opts_sort:
	                def_sort_idx_opt = opts_sort.index("Date d'application") if "Date d'application" in opts_sort else 0
	                opt_sort_by = st.selectbox("Trier par", opts_sort, index=def_sort_idx_opt)
	                opt_sort_dir = st.radio("Ordre", ["Croissant", "Décroissant"], horizontal=True, index=1 if opt_sort_by=="Date d'application" else 0)

	        df_view = df_res.copy()
	        if not opt_show_low_mrl and "Valeur LMR" in df_view.columns:
	            df_view = df_view[df_view["Valeur LMR"] >= 0.01]
	        if opt_sort_by and opt_sort_by in df_view.columns:
	            df_view = df_view.sort_values(opt_sort_by, ascending=(opt_sort_dir == "Croissant"), na_position='last')

	        st.dataframe(df_view, use_container_width=True, hide_index=True,
	                     column_config={
	                         "Valeur LMR": st.column_config.NumberColumn("LMR (mg/kg)", format="%.4f", help="Limite Maximale de Résidus"),
	                         "Date d'application": st.column_config.DateColumn("Application", format="%d/%m/%Y"),  # Standard French date format
	                         "Lien Règlement": st.column_config.TextColumn("Règlement")
	                     })

	        # Charts are only drawn when the filtered view still has rows.
	        if not df_view.empty:
	            self.create_visualizations(df_view)

	        fname_prods = "_".join(sel_prods_names[:2]).replace(" ", "_").replace("/", "_") + ("_etc" if len(sel_prods_names)>2 else "")
	        fname_csv = f"lmr_{fname_prods}_{datetime.now().strftime('%Y%m%d%H%M')}.csv"
	        st.download_button("📥 Export CSV", df_view.to_csv(index=False).encode('utf-8'), fname_csv, "text/csv")

	    def create_visualizations(self, df: pd.DataFrame):
	        """Draw the three chart tabs from the rows of ``df`` that have a numeric MRL."""
	        st.markdown("### 🎨 Visualisations")
	        tabs_viz = st.tabs(["📈 Évolution Temporelle", "📊 Distribution", "🏆 Top Substances"])
	        df_plot = df[df["Valeur LMR"].notna()].copy()  # work on a copy for chart-specific changes
	        if df_plot.empty:
	            st.warning("Aucune donnée LMR numérique valide pour les graphiques.")
	            return

	        with tabs_viz[0]:
	            if "Date d'application" in df_plot.columns and df_plot["Date d'application"].notna().any():
	                df_plot_time = df_plot[df_plot["Date d'application"].notna()].sort_values("Date d'application")
	                if not df_plot_time.empty:
	                    fig_scatter = px.scatter(df_plot_time, x="Date d'application", y="Valeur LMR", color="Substance", size="Valeur LMR",
	                                             hover_data=["Produit", "Lien Règlement"], title="Évolution LMR (axe Y log)", log_y=True)
	                    fig_scatter.update_layout(legend_title_text='Substance', legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1))
	                    st.plotly_chart(fig_scatter, use_container_width=True)
	                else:
	                    st.info("Pas de données datées pour ce graphique.")
	            else:
	                st.info("'Date d'application' manquante ou vide.")

	        with tabs_viz[1]:
	            # .nunique() returns a plain int
	            n_products = df_plot["Produit"].nunique()
	            color_by_prod = "Produit" if 0 < n_products < 10 else None
	            fig_hist_dist = px.histogram(df_plot, x="Valeur LMR", nbins=30, title="Distribution des LMR (axe X log)", log_x=True,
	                                         labels={"Valeur LMR": "LMR (mg/kg)"}, color=color_by_prod)
	            st.plotly_chart(fig_hist_dist, use_container_width=True)

	            if n_products > 1:
	                fig_box_dist = px.box(df_plot, x="Produit", y="Valeur LMR", title="LMR par Produit (axe Y log)", log_y=True,
	                                      color="Produit", points="outliers")
	                st.plotly_chart(fig_box_dist, use_container_width=True)

	        with tabs_viz[2]:
	            if "Substance" in df_plot.columns:
	                df_top_subs = (df_plot.groupby("Substance")["Valeur LMR"]
	                               .agg(['max', 'count', 'mean']).rename(columns=str.capitalize)
	                               .sort_values('Max', ascending=False).head(15).reset_index())
	                if not df_top_subs.empty:
	                    fig_bar_top = px.bar(df_top_subs, y="Substance", x='Max', orientation='h', title="Top 15 Substances (LMR max)",
	                                         labels={'Max': 'LMR max (mg/kg)'}, hover_data={'Count': True, 'Mean': ':.4f'})
	                    fig_bar_top.update_layout(yaxis={'categoryorder':'total ascending'})
	                    st.plotly_chart(fig_bar_top, use_container_width=True)
	                else:
	                    st.info("Pas assez de données pour le Top Substances.")
	            else:
	                st.info("'Substance' manquante pour ce graphique.")

	def main():
	    """Entry point: draw the sidebar, build the interface once per session, render it."""
	    state = st.session_state
	    interface_key = 'pesticide_app_interface'

	    with st.sidebar:
	        st.header("EU Pesticides Explorer")
	        st.caption("Version Optimisée")
	        st.markdown("Analyse des LMR de pesticides dans l'UE. Données via API de la Commission Européenne.")
	        reload_clicked = st.button(
	            "🔄 Forcer MAJ Données",
	            key="sidebar_btn_reload",
	            help="Efface le cache et recharge tout.",
	        )
	        if reload_clicked:
	            # Drop both the cached download and the per-session interface
	            # built from it, then restart the script run.
	            st.cache_data.clear()
	            if interface_key in state:
	                del state[interface_key]
	            st.rerun()
	        st.markdown("---")

	    if interface_key not in state:
	        logger.info("Initialisation PesticideInterface (session_state)...")
	        with st.spinner("Préparation de l'application et chargement des données de référence..."):
	            state[interface_key] = PesticideInterface()
	        logger.info("PesticideInterface initialisée.")

	    interface = state[interface_key]
	    interface.create_interface()

	    download_time = interface.data.get('stats', {}).get('download_time')
	    if download_time:
	        st.sidebar.caption(f"Données chargées le: {download_time}")
	    else:
	        st.sidebar.caption("Statut chargement données non disponible.")


	if __name__ == "__main__":
	    main()