# codex_app_fixed.py
"""Streamlit monitor for Codex Alimentarius texts.

Scrapes the FAO/WHO Codex listing pages (codes of practice, standards,
guidelines), normalises each table row into a document record, and provides
helper widgets used by the main page below.
"""

import html
import re
import time
import urllib.parse
from datetime import datetime

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup

# --- Configuration ---
# One entry per Codex text family. `prefix` is the document-code prefix
# expected in the first table column of the corresponding listing page.
CODEX_CATEGORIES = {
    'codes': {
        'name': 'Codes de Pratique (CXC)',
        'url': 'https://www.fao.org/fao-who-codexalimentarius/codex-texts/codes-of-practice/fr/',
        'prefix': 'CXC',
        'icon': '📋',
        'color': '#FF6B6B'
    },
    'standards': {
        'name': 'Normes (CXS)',
        'url': 'https://www.fao.org/fao-who-codexalimentarius/codex-texts/list-standards/fr/',
        'prefix': 'CXS',
        'icon': '⚖️',
        'color': '#4ECDC4'
    },
    'guidelines': {
        'name': 'Directives (CXG)',
        'url': 'https://www.fao.org/fao-who-codexalimentarius/codex-texts/guidelines/fr/',
        'prefix': 'CXG',
        'icon': '📖',
        'color': '#45B7D1'
    }
}

# Browser-like User-Agent: the FAO site may reject default python-requests UAs.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'
}
TIMEOUT = 30  # seconds, per HTTP request
BASE_URL = "https://www.fao.org"  # base for resolving relative PDF links

# Page configuration (must be the first Streamlit call in the script)
st.set_page_config(
    page_title="Codex Alimentarius Monitor",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded"
)


def clean_text(text):
    """Return *text* with HTML entities decoded, tags stripped and whitespace collapsed.

    Accepts any value (coerced via ``str``); returns ``""`` for falsy input.
    Idempotent: cleaning already-clean text is a no-op.
    """
    if not text:
        return ""
    # Decode HTML entities (&amp;, &#233;, ...)
    text = html.unescape(str(text))
    # Strip any residual HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Normalise line breaks / tabs to spaces, then collapse runs of whitespace
    text = text.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
    text = re.sub(r'\s+', ' ', text).strip()
    return text


@st.cache_data(ttl=1800)
def extract_documents_from_url(url, category_key):
    """Scrape one Codex listing page and return its documents as a list of dicts.

    Args:
        url: listing-page URL (also part of the cache key).
        category_key: key into ``CODEX_CATEGORIES`` for this page.

    Returns:
        List of document dicts (code, title, committee, year, pdf_url, flags...).
        Returns ``[]`` on any network/parse error, after surfacing the error
        via ``st.error``. Results are cached for 30 minutes.
    """
    category_info = CODEX_CATEGORIES[category_key]
    documents = []
    seen_codes = set()
    try:
        response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        tables = soup.find_all('table')
        if not tables:
            return documents

        # Hoisted out of the row loop: compile the code pattern once and
        # capture "now" once (was recomputed for every matched row).
        prefix = category_info['prefix']
        code_pattern = re.compile(rf'^({prefix})\s+([\w\-R]*\d+(?:-\d+)?[R]?)$')
        year_pattern = re.compile(r'\d{4}')
        pdf_href_pattern = re.compile(r'.*\.pdf', re.IGNORECASE)
        current_year = datetime.now().year

        for table in tables:
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 5:
                    continue

                # First four columns: code, title, committee, year
                cell_texts = [clean_text(cell.get_text(strip=True)) for cell in cells[:4]]
                code_candidate = cell_texts[0] if cell_texts else ""

                code_match = code_pattern.match(code_candidate)
                if not code_match:
                    continue

                full_code = f"{code_match.group(1)} {code_match.group(2)}"
                if full_code in seen_codes:
                    continue
                seen_codes.add(full_code)

                # cell_texts entries are already cleaned above — no need to
                # run clean_text a second time (was redundant).
                title = cell_texts[1] if len(cell_texts) > 1 else "Titre non trouvé"
                committee = cell_texts[2] if len(cell_texts) > 2 else "COMITE"
                year_str = cell_texts[3] if len(cell_texts) > 3 else ""

                # Single regex search (was evaluated twice per row)
                year_match = year_pattern.search(year_str)
                year = int(year_match.group()) if year_match else 0

                # Fifth column: direct PDF link if present
                pdf_url = None
                link_tag = cells[4].find('a', href=pdf_href_pattern)
                if link_tag:
                    href = link_tag.get('href')
                    if href:
                        decoded_href = urllib.parse.unquote(href)
                        pdf_url = urllib.parse.urljoin(BASE_URL, decoded_href)
                if not pdf_url:
                    # Fall back to a Codex search query for this code
                    pdf_url = f"https://www.fao.org/fao-who-codexalimentarius/search/en/?q={full_code.replace(' ', '%20')}"

                documents.append({
                    'code': full_code,
                    'title': title,
                    'committee': committee,
                    'year': year,
                    'category': category_key,
                    'category_name': category_info['name'],
                    'pdf_url': pdf_url,
                    # "new" = this year or last; "updated" = this year exactly
                    'is_new': year >= current_year - 1,
                    'is_updated': year == current_year,
                    'icon': category_info['icon'],
                    'color': category_info['color']
                })
        return documents
    except Exception as e:
        # Broad catch is deliberate: any scrape failure is reported in the UI
        # and the category is simply skipped.
        st.error(f"Erreur lors de l'extraction de {category_info['name']} : {e}")
        return []


def display_document_card(doc):
    """Render one document record as a card (badges, title, metadata, PDF button)."""
    # Status badges: category always present, plus optional new/updated flags
    badges = [f"{doc['icon']} {doc['category_name']}"]
    if doc['is_new']:
        badges.append("✨ NOUVEAU")
    if doc['is_updated']:
        badges.append("🔄 MIS À JOUR")

    # Safe display: only Streamlit widgets, no raw HTML injection
    with st.container():
        cols = st.columns(len(badges))
        for i, badge in enumerate(badges):
            with cols[i]:
                if "NOUVEAU" in badge:
                    st.success(badge)
                elif "MIS À JOUR" in badge:
                    st.info(badge)
                else:
                    st.caption(badge)

        st.subheader(f"{doc['code']} - {doc['title']}")

        col1, col2 = st.columns([2, 1])
        with col1:
            st.text(f"🏢 Comité: {doc['committee']}")
            st.text(f"📅 Année: {doc['year']}")
        with col2:
            st.link_button(
                "📄 Télécharger PDF",
                doc['pdf_url'],
                type="primary",
                use_container_width=True
            )
        st.divider()


def display_metrics_safe(df):
    """Render the four headline metrics (total / new / updated / categories)."""
    if df.empty:
        return
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric(label="📊 Total Documents", value=len(df))
    with col2:
        st.metric(label="✨ Nouveaux", value=len(df[df['is_new']]))
    with col3:
        st.metric(label="🔄 Mis à jour", value=len(df[df['is_updated']]))
    with col4:
        st.metric(label="📂 Catégories", value=df['category_name'].nunique())


def display_category_stats(df):
    """Render per-category totals (count / new / updated) in expanders."""
    if df.empty:
        return
    st.subheader("📋 Répartition par Catégorie")
    category_stats = df.groupby(['category_name', 'category']).agg({
        'code': 'count',
        'is_new': 'sum',
        'is_updated': 'sum'
    }).reset_index()

    for _, row in category_stats.iterrows():
        category_info = CODEX_CATEGORIES[row['category']]
        with st.expander(f"{category_info['icon']} {row['category_name']}", expanded=True):
            col1, col2, col3 = st.columns(3)
            with col1:
                st.metric("Total", int(row['code']))
            with col2:
                st.metric("Nouveaux", int(row['is_new']))
            with col3:
                st.metric("Mis à jour", int(row['is_updated']))


# Minimal, safe CSS
# NOTE: intentionally empty markdown — no custom CSS/HTML is injected.
st.markdown(""" """, unsafe_allow_html=True)

# --- Session-state initialisation ---
if 'documents' not in st.session_state:
    st.session_state.documents = []
    st.session_state.last_update = None

# --- Main header ---
st.title("🔬 Codex Alimentarius Monitor")
st.caption("Exploration et suivi des normes alimentaires internationales")

# --- Sidebar: refresh control + filters ---
with st.sidebar:
    st.header("🎛️ Centre de Contrôle")

    if st.button("🔄 Actualiser les Documents", type="primary", use_container_width=True):
        with st.spinner("🔍 Extraction en cours..."):
            all_documents = []
            progress_bar = st.progress(0)
            status_text = st.empty()
            for i, (cat_key, cat_info) in enumerate(CODEX_CATEGORIES.items()):
                status_text.info(f"{cat_info['icon']} Extraction de {cat_info['name']}...")
                docs = extract_documents_from_url(cat_info['url'], cat_key)
                all_documents.extend(docs)
                progress_bar.progress((i + 1) / len(CODEX_CATEGORIES))
                time.sleep(0.5)  # small pause between category fetches
            st.session_state.documents = all_documents
            st.session_state.last_update = datetime.now()
            status_text.success("✅ Données actualisées!")
            time.sleep(1)
            status_text.empty()

    if st.session_state.last_update:
        st.success(f"🕒 Dernière MAJ: {st.session_state.last_update.strftime('%d/%m/%Y %H:%M')}")

    # Filters (only once documents have been fetched)
    if st.session_state.documents:
        st.divider()
        st.header("🔍 Filtres")
        df_all = pd.DataFrame(st.session_state.documents)

        # Category filter
        categories = ['Toutes'] + list(df_all['category_name'].unique())
        selected_category = st.selectbox("📂 Catégorie:", categories)

        # Committee filter
        committees = ['Tous'] + sorted(df_all['committee'].unique())
        selected_committee = st.selectbox("🏢 Comité:", committees)

        # Status filters
        col1, col2 = st.columns(2)
        with col1:
            filter_new = st.checkbox("✨ Nouveaux")
        with col2:
            filter_updated = st.checkbox("🔄 Mis à jour")

        # Year filter. Explicit initialisation replaces the previous fragile
        # `'selected_years' in locals()` check. year == 0 means "unparsed".
        selected_years = []
        years = sorted([y for y in df_all['year'].unique() if y > 0], reverse=True)
        if years:
            selected_years = st.multiselect("📅 Années:", years, default=years)  # all years by default

        # Free-text search
        search_term = st.text_input("🔍 Recherche:", placeholder="Code ou titre...")

        # Apply filters
        filtered_df = df_all.copy()
        if selected_category != 'Toutes':
            filtered_df = filtered_df[filtered_df['category_name'] == selected_category]
        if selected_committee != 'Tous':
            filtered_df = filtered_df[filtered_df['committee'] == selected_committee]
        if filter_new:
            filtered_df = filtered_df[filtered_df['is_new']]
        if filter_updated:
            filtered_df = filtered_df[filtered_df['is_updated']]
        if selected_years:
            filtered_df = filtered_df[filtered_df['year'].isin(selected_years)]
        if search_term:
            search_mask = (
                filtered_df['title'].str.contains(search_term, case=False, na=False) |
                filtered_df['code'].str.contains(search_term, case=False, na=False)
            )
            filtered_df = filtered_df[search_mask]

        st.session_state.filtered_df = filtered_df
        st.info(f"📊 {len(filtered_df)} documents trouvés")

# --- Main content ---
if st.session_state.documents:
    df_display = st.session_state.get('filtered_df', pd.DataFrame(st.session_state.documents))
    df_display = df_display.sort_values(by=['year', 'code'], ascending=[False, True]).reset_index(drop=True)

    # Headline metrics
    display_metrics_safe(df_display)

    # Tabs
    tab1, tab2, tab3 = st.tabs(["📋 Aperçu", "📊 Analyses", "📄 Documents"])

    with tab1:
        display_category_stats(df_display)
        if not df_display.empty:
            st.subheader("📈 Informations Générales")
            col1, col2 = st.columns(2)
            with col1:
                st.write("**📊 Statistiques**")
                st.write(f"• Période: {df_display['year'].min()} - {df_display['year'].max()}")
                st.write(f"• Comités: {df_display['committee'].nunique()}")
                # Was hard-coded `>= 2023`; now mirrors the is_new rule
                # (current year or the year before) so it stays fresh.
                recent_cutoff = datetime.now().year - 1
                st.write(f"• Documents récents: {len(df_display[df_display['year'] >= recent_cutoff])}")
            with col2:
                st.write("**🏆 Top 5 Comités**")
                top_committees = df_display['committee'].value_counts().head()
                for committee, count in top_committees.items():
                    st.write(f"• {committee}: {count}")

    with tab2:
        st.subheader("📊 Analyses Visuelles")
        col1, col2 = st.columns(2)
        with col1:
            st.write("##### 📊 Par Catégorie")
            category_counts = df_display['category_name'].value_counts()
            st.bar_chart(category_counts)
        with col2:
            st.write("##### 📈 Par Année")
            year_counts = df_display[df_display['year'] > 2000]['year'].value_counts().sort_index()
            st.line_chart(year_counts)

    with tab3:
        st.subheader(f"📄 Liste des Documents ({len(df_display)} résultats)")

        # Simple pagination
        items_per_page = 10
        if 'page_num' not in st.session_state:
            st.session_state.page_num = 0
        total_pages = (len(df_display) - 1) // items_per_page + 1
        # Bug fix: clamp the remembered page index — narrowing the filters can
        # shrink the result set below the previously viewed page, which used
        # to leave the user stranded on an empty page.
        st.session_state.page_num = max(0, min(st.session_state.page_num, total_pages - 1))

        if total_pages > 1:
            col1, col2, col3 = st.columns([1, 2, 1])
            with col1:
                if st.button("⬅️ Précédent", disabled=(st.session_state.page_num == 0)):
                    st.session_state.page_num -= 1
                    st.rerun()
            with col2:
                st.write(f"Page {st.session_state.page_num + 1} sur {total_pages}")
            with col3:
                if st.button("Suivant ➡️", disabled=(st.session_state.page_num >= total_pages - 1)):
                    st.session_state.page_num += 1
                    st.rerun()

        # Documents for the current page
        start_idx = st.session_state.page_num * items_per_page
        page_docs = df_display.iloc[start_idx:start_idx + items_per_page]

        for _, doc in page_docs.iterrows():
            display_document_card(doc)

    # Export
    if not df_display.empty:
        st.divider()
        st.subheader("💾 Export des Données")
        col1, col2 = st.columns(2)
        with col1:
            csv = df_display.to_csv(index=False, sep=';')
            st.download_button(
                "📊 Télécharger CSV",
                csv,
                f"codex_{datetime.now().strftime('%Y%m%d')}.csv",
                "text/csv",
                use_container_width=True
            )
        with col2:
            json_str = df_display.to_json(orient='records', indent=2)
            st.download_button(
                "📋 Télécharger JSON",
                json_str,
                f"codex_{datetime.now().strftime('%Y%m%d')}.json",
                "application/json",
                use_container_width=True
            )
else:
    # Landing page shown before the first refresh
    st.info("👈 Cliquez sur 'Actualiser les Documents' dans la barre latérale pour commencer")
    st.subheader("🎯 Fonctionnalités")
    st.write("• 📋 **Codes de Pratique (CXC)** - Procédures et bonnes pratiques")
    st.write("• ⚖️ **Normes (CXS)** - Standards alimentaires officiels")
    st.write("• 📖 **Directives (CXG)** - Lignes directrices et recommandations")
    st.write("• 🔍 **Recherche avancée** - Filtrage par catégorie, comité, année")
    st.write("• 📊 **Analyses visuelles** - Graphiques et statistiques")
    st.write("• 💾 **Export de données** - CSV et JSON")