Spaces:
Sleeping
Sleeping
| # codex_app_fixed.py | |
| import streamlit as st | |
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import pandas as pd | |
| from datetime import datetime | |
| import urllib.parse | |
| import time | |
| import html | |
# --- Configuration ---
# Site root; also used to resolve relative PDF links found while scraping.
BASE_URL = "https://www.fao.org"

# Common parent of the three French-language Codex listing pages.
_CODEX_TEXTS_URL = f"{BASE_URL}/fao-who-codexalimentarius/codex-texts"

# One entry per scraped category. Insertion order is the order in which the
# categories are fetched; 'prefix' is the code prefix expected in the first
# table column of the listing page.
CODEX_CATEGORIES = {
    "codes": {
        "name": "Codes de Pratique (CXC)",
        "url": f"{_CODEX_TEXTS_URL}/codes-of-practice/fr/",
        "prefix": "CXC",
        "icon": "📋",
        "color": "#FF6B6B",
    },
    "standards": {
        "name": "Normes (CXS)",
        "url": f"{_CODEX_TEXTS_URL}/list-standards/fr/",
        "prefix": "CXS",
        "icon": "⚖️",
        "color": "#4ECDC4",
    },
    "guidelines": {
        "name": "Directives (CXG)",
        "url": f"{_CODEX_TEXTS_URL}/guidelines/fr/",
        "prefix": "CXG",
        "icon": "📖",
        "color": "#45B7D1",
    },
}

# Request headers sent with every listing-page fetch (desktop browser UA).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

# HTTP timeout passed to requests.get, in seconds.
TIMEOUT = 30
# Page setup: browser tab title and icon, wide layout, sidebar open on load.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="Codex Alimentarius Monitor",
    page_icon="🔬",
)
def clean_text(text):
    """Return *text* as a single-line plain string.

    The value is converted with ``str``, HTML entities are decoded, anything
    that looks like an HTML tag is removed, and every run of whitespace
    (spaces, tabs, newlines) is collapsed to one space with the ends trimmed.
    Any falsy input (``None``, ``""``, ``0``) yields ``""``.
    """
    if text:
        decoded = html.unescape(str(text))
        # Tags are stripped after decoding, so entity-encoded tags go too.
        without_tags = re.sub(r'<[^>]+>', '', decoded)
        # \s+ already covers \n, \r and \t, so one pass normalises spacing.
        return re.sub(r'\s+', ' ', without_tags).strip()
    return ""
# Compiled once: these patterns are applied to every table row.
_PDF_HREF_RE = re.compile(r'.*\.pdf', re.IGNORECASE)
_YEAR_RE = re.compile(r'\d{4}')


def _parse_year(year_text):
    """Return the first four-digit run in *year_text* as an int, or 0 if none."""
    match = _YEAR_RE.search(year_text)
    return int(match.group()) if match else 0


def _resolve_pdf_url(pdf_cell, full_code):
    """Return the absolute PDF link in *pdf_cell*, else a Codex search URL for *full_code*."""
    link_tag = pdf_cell.find('a', href=_PDF_HREF_RE)
    href = link_tag.get('href') if link_tag else None
    if href:
        # The href is percent-decoded once before being made absolute.
        return urllib.parse.urljoin(BASE_URL, urllib.parse.unquote(href))
    # No PDF link in the row: fall back to a site search on the document code.
    return (
        "https://www.fao.org/fao-who-codexalimentarius/search/en/?q="
        + urllib.parse.quote(full_code)
    )


def extract_documents_from_url(url, category_key):
    """Scrape one Codex listing page and return its documents.

    Args:
        url: Address of the listing page to download.
        category_key: Key into ``CODEX_CATEGORIES``; selects the expected code
            prefix and the display metadata copied into each document.

    Returns:
        A list of dicts with the keys ``code``, ``title``, ``committee``,
        ``year``, ``category``, ``category_name``, ``pdf_url``, ``is_new``,
        ``is_updated``, ``icon`` and ``color``. Only table rows with at least
        five cells whose first cell is ``"<prefix> <number>"`` are kept, and
        each code is kept once. ``year`` is 0 when no four-digit year is found.
        On any download or parsing error the error is shown with ``st.error``
        and an empty list is returned.

    Raises:
        KeyError: If *category_key* is not in ``CODEX_CATEGORIES``.
    """
    category_info = CODEX_CATEGORIES[category_key]
    code_pattern = re.compile(
        rf"^({re.escape(category_info['prefix'])})\s+([\w\-]*\d+(?:-\d+)?R?)$"
    )
    # Read the clock once so every row is compared against the same year.
    current_year = datetime.now().year
    documents = []
    seen_codes = set()
    try:
        response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        for table in soup.find_all('table'):
            for row in table.find_all('tr'):
                cells = row.find_all(['td', 'th'])
                if len(cells) < 5:
                    continue
                # Columns: code, title, committee, year, then the PDF link cell.
                code_text, title, committee, year_text = (
                    clean_text(cell.get_text(strip=True)) for cell in cells[:4]
                )
                code_match = code_pattern.match(code_text)
                if not code_match:
                    continue
                full_code = f"{code_match.group(1)} {code_match.group(2)}"
                if full_code in seen_codes:
                    continue
                seen_codes.add(full_code)
                year = _parse_year(year_text)
                documents.append({
                    'code': full_code,
                    # Empty cells get the placeholders the original intended.
                    'title': title or "Titre non trouvé",
                    'committee': committee or "COMITE",
                    'year': year,
                    'category': category_key,
                    'category_name': category_info['name'],
                    'pdf_url': _resolve_pdf_url(cells[4], full_code),
                    'is_new': year >= current_year - 1,
                    'is_updated': year == current_year,
                    'icon': category_info['icon'],
                    'color': category_info['color'],
                })
        return documents
    except Exception as e:
        # Deliberate best-effort boundary: one failing category must not stop
        # the app, so the error is reported in the UI instead of raised.
        st.error(f"Erreur lors de l'extraction de {category_info['name']} : {e}")
        return []
def display_document_card(doc):
    """Render one document as a card built only from native Streamlit widgets.

    *doc* is indexed by the keys ``icon``, ``category_name``, ``is_new``,
    ``is_updated``, ``code``, ``title``, ``committee``, ``year`` and
    ``pdf_url``. Nothing is returned.
    """
    # The category badge is always present; status badges are optional.
    labels = [f"{doc['icon']} {doc['category_name']}"]
    if doc['is_new']:
        labels.append("✨ NOUVEAU")
    if doc['is_updated']:
        labels.append("🔄 MIS À JOUR")

    with st.container():
        # One column per badge; the widget is chosen from the badge text.
        for column, label in zip(st.columns(len(labels)), labels):
            if "NOUVEAU" in label:
                render = st.success
            elif "MIS À JOUR" in label:
                render = st.info
            else:
                render = st.caption
            with column:
                render(label)

        st.subheader(f"{doc['code']} - {doc['title']}")

        details_col, action_col = st.columns([2, 1])
        with details_col:
            st.text(f"🏢 Comité: {doc['committee']}")
            st.text(f"📅 Année: {doc['year']}")
        with action_col:
            st.link_button(
                "📄 Télécharger PDF",
                doc['pdf_url'],
                type="primary",
                use_container_width=True,
            )
        st.divider()
def display_metrics_safe(df):
    """Show four headline metrics for *df* in a single row.

    The metrics are the row count, the number of rows flagged ``is_new``, the
    number flagged ``is_updated``, and the number of distinct
    ``category_name`` values. Nothing is rendered for an empty frame.
    """
    if df.empty:
        return

    # Each value is computed lazily, inside its own column.
    metric_specs = (
        ("📊 Total Documents", lambda: len(df)),
        ("✨ Nouveaux", lambda: len(df[df['is_new']])),
        ("🔄 Mis à jour", lambda: len(df[df['is_updated']])),
        ("📂 Catégories", lambda: df['category_name'].nunique()),
    )
    for column, (label, compute) in zip(st.columns(4), metric_specs):
        with column:
            st.metric(label=label, value=compute())
def display_category_stats(df):
    """Show per-category totals, new counts and updated counts.

    Rows of *df* are grouped by ``category_name`` and ``category``; each group
    gets an expanded expander holding three metrics. Nothing is rendered for
    an empty frame.
    """
    if df.empty:
        return

    st.subheader("📋 Répartition par Catégorie")

    aggregations = {'code': 'count', 'is_new': 'sum', 'is_updated': 'sum'}
    per_category = (
        df.groupby(['category_name', 'category'])
        .agg(aggregations)
        .reset_index()
    )

    for record in per_category.to_dict('records'):
        # The icon comes from the static configuration, not from the frame.
        icon = CODEX_CATEGORIES[record['category']]['icon']
        with st.expander(f"{icon} {record['category_name']}", expanded=True):
            figures = (
                ("Total", record['code']),
                ("Nouveaux", record['is_new']),
                ("Mis à jour", record['is_updated']),
            )
            for column, (label, value) in zip(st.columns(3), figures):
                with column:
                    st.metric(label, int(value))
# Minimal static CSS: transparent app header, top padding for the main area,
# and a gradient banner for the page title. The string is a constant with no
# user data. The h1 rule used to declare `color` twice; only the later
# `white` ever applied, so the dead `#1f77b4` declaration is removed.
st.markdown(
    """
<style>
.stApp > header {
    background-color: transparent;
}
.main > div {
    padding-top: 2rem;
}
h1 {
    text-align: center;
    padding: 1rem;
    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
    color: white;
    border-radius: 10px;
    margin-bottom: 2rem;
}
</style>
""",
    unsafe_allow_html=True,
)
# Session bootstrap: when no document list is stored yet, start with an empty
# list and no refresh timestamp.
if "documents" not in st.session_state:
    st.session_state["documents"] = []
    st.session_state["last_update"] = None

# Page heading.
st.title("🔬 Codex Alimentarius Monitor")
st.caption("Exploration et suivi des normes alimentaires internationales")
# Sidebar: refresh button, last-refresh stamp, and the filters whose result is
# stored in st.session_state.filtered_df for the main area to display.
with st.sidebar:
    st.header("🎛️ Centre de Contrôle")

    if st.button("🔄 Actualiser les Documents", type="primary", use_container_width=True):
        with st.spinner("🔍 Extraction en cours..."):
            all_documents = []
            progress_bar = st.progress(0)
            status_text = st.empty()
            total_categories = len(CODEX_CATEGORIES)
            for position, (cat_key, cat_info) in enumerate(CODEX_CATEGORIES.items(), start=1):
                status_text.info(f"{cat_info['icon']} Extraction de {cat_info['name']}...")
                all_documents.extend(extract_documents_from_url(cat_info['url'], cat_key))
                progress_bar.progress(position / total_categories)
                # Short pause between two page fetches.
                time.sleep(0.5)
            st.session_state.documents = all_documents
            st.session_state.last_update = datetime.now()
            status_text.success("✅ Données actualisées!")
            # Leave the confirmation visible briefly before clearing it.
            time.sleep(1)
            status_text.empty()

    if st.session_state.last_update:
        st.success(f"🕒 Dernière MAJ: {st.session_state.last_update.strftime('%d/%m/%Y %H:%M')}")

    # Filters are only offered once documents have been loaded.
    if st.session_state.documents:
        st.divider()
        st.header("🔍 Filtres")
        df_all = pd.DataFrame(st.session_state.documents)

        # Category filter
        categories = ['Toutes'] + list(df_all['category_name'].unique())
        selected_category = st.selectbox("📂 Catégorie:", categories)

        # Committee filter
        committees = ['Tous'] + sorted(df_all['committee'].unique())
        selected_committee = st.selectbox("🏢 Comité:", committees)

        # Status filters
        col1, col2 = st.columns(2)
        with col1:
            filter_new = st.checkbox("✨ Nouveaux")
        with col2:
            filter_updated = st.checkbox("🔄 Mis à jour")

        # Year filter: only known years (> 0) are offered, all selected by default.
        years = sorted((int(y) for y in df_all['year'].unique() if y > 0), reverse=True)
        selected_years = (
            st.multiselect("📅 Années:", years, default=years) if years else []
        )

        # Free-text search
        search_term = st.text_input("🔍 Recherche:", placeholder="Code ou titre...")

        # Apply the filters in sequence.
        filtered_df = df_all.copy()
        if selected_category != 'Toutes':
            filtered_df = filtered_df[filtered_df['category_name'] == selected_category]
        if selected_committee != 'Tous':
            filtered_df = filtered_df[filtered_df['committee'] == selected_committee]
        if filter_new:
            filtered_df = filtered_df[filtered_df['is_new']]
        if filter_updated:
            filtered_df = filtered_df[filtered_df['is_updated']]
        # Filter by year only when the user narrowed the selection. With every
        # year still selected, filtering would silently drop documents whose
        # year is unknown (0), because 0 is never offered as an option.
        if selected_years and len(selected_years) < len(years):
            filtered_df = filtered_df[filtered_df['year'].isin(selected_years)]
        if search_term:
            # regex=False: the search box holds literal text, so characters
            # such as "(" or "[" must not be parsed as a regular expression.
            search_mask = (
                filtered_df['title'].str.contains(search_term, case=False, na=False, regex=False)
                | filtered_df['code'].str.contains(search_term, case=False, na=False, regex=False)
            )
            filtered_df = filtered_df[search_mask]

        st.session_state.filtered_df = filtered_df
        st.info(f"📊 {len(filtered_df)} documents trouvés")
# Main area: dashboard for the filtered documents, or the landing page when
# nothing has been loaded yet.
if st.session_state.documents:
    df_display = st.session_state.get('filtered_df')
    if df_display is None:
        df_display = pd.DataFrame(st.session_state.documents)
    # Newest first, then by code.
    df_display = df_display.sort_values(
        by=['year', 'code'], ascending=[False, True]
    ).reset_index(drop=True)

    # Headline metrics
    display_metrics_safe(df_display)

    # Tabs
    tab1, tab2, tab3 = st.tabs(["📋 Aperçu", "📊 Analyses", "📄 Documents"])

    with tab1:
        display_category_stats(df_display)
        if not df_display.empty:
            st.subheader("📈 Informations Générales")
            # Year 0 marks an unknown year and must not count as the period start.
            known_years = df_display.loc[df_display['year'] > 0, 'year']
            if known_years.empty:
                period = "N/A"
            else:
                period = f"{known_years.min()} - {known_years.max()}"
            # "Recent" is this year or last year, computed from the clock
            # instead of a hard-coded year that goes stale.
            recent_threshold = datetime.now().year - 1
            recent_count = int((df_display['year'] >= recent_threshold).sum())

            col1, col2 = st.columns(2)
            with col1:
                st.write("**📊 Statistiques**")
                st.write(f"• Période: {period}")
                st.write(f"• Comités: {df_display['committee'].nunique()}")
                st.write(f"• Documents récents: {recent_count}")
            with col2:
                st.write("**🏆 Top 5 Comités**")
                top_committees = df_display['committee'].value_counts().head()
                for committee, count in top_committees.items():
                    st.write(f"• {committee}: {count}")

    with tab2:
        st.subheader("📊 Analyses Visuelles")
        col1, col2 = st.columns(2)
        with col1:
            st.write("##### 📊 Par Catégorie")
            category_counts = df_display['category_name'].value_counts()
            st.bar_chart(category_counts)
        with col2:
            st.write("##### 📈 Par Année")
            # Only years after 2000 are plotted.
            year_counts = df_display[df_display['year'] > 2000]['year'].value_counts().sort_index()
            st.line_chart(year_counts)

    with tab3:
        st.subheader(f"📄 Liste des Documents ({len(df_display)} résultats)")

        # Simple pagination
        items_per_page = 10
        total_pages = max(1, (len(df_display) - 1) // items_per_page + 1)
        # Clamp the stored page: after the filters shrink the result set, the
        # previous page index can point past the end, which used to show an
        # empty page with no controls to leave it.
        current_page = min(max(st.session_state.get('page_num', 0), 0), total_pages - 1)
        st.session_state.page_num = current_page

        if total_pages > 1:
            col1, col2, col3 = st.columns([1, 2, 1])
            with col1:
                if st.button("⬅️ Précédent", disabled=(current_page == 0)):
                    st.session_state.page_num -= 1
                    st.rerun()
            with col2:
                st.write(f"Page {current_page + 1} sur {total_pages}")
            with col3:
                if st.button("Suivant ➡️", disabled=(current_page >= total_pages - 1)):
                    st.session_state.page_num += 1
                    st.rerun()

        # Documents on the current page
        start_idx = current_page * items_per_page
        page_docs = df_display.iloc[start_idx:start_idx + items_per_page]
        for _, doc in page_docs.iterrows():
            display_document_card(doc)

    # Export
    if not df_display.empty:
        st.divider()
        st.subheader("💾 Export des Données")
        export_stamp = datetime.now().strftime('%Y%m%d')
        col1, col2 = st.columns(2)
        with col1:
            # The ';' separator targets spreadsheet software; the UTF-8 BOM
            # lets Excel decode the accented characters correctly.
            csv_bytes = df_display.to_csv(index=False, sep=';').encode('utf-8-sig')
            st.download_button(
                "📊 Télécharger CSV",
                data=csv_bytes,
                file_name=f"codex_{export_stamp}.csv",
                mime="text/csv",
                use_container_width=True,
            )
        with col2:
            # force_ascii=False keeps accented characters readable in the
            # file instead of writing them as \uXXXX escapes.
            json_str = df_display.to_json(orient='records', indent=2, force_ascii=False)
            st.download_button(
                "📋 Télécharger JSON",
                data=json_str,
                file_name=f"codex_{export_stamp}.json",
                mime="application/json",
                use_container_width=True,
            )
else:
    # Landing page
    st.info("👈 Cliquez sur 'Actualiser les Documents' dans la barre latérale pour commencer")
    st.subheader("🎯 Fonctionnalités")
    st.write("• 📋 **Codes de Pratique (CXC)** - Procédures et bonnes pratiques")
    st.write("• ⚖️ **Normes (CXS)** - Standards alimentaires officiels")
    st.write("• 📖 **Directives (CXG)** - Lignes directrices et recommandations")
    st.write("• 🔍 **Recherche avancée** - Filtrage par catégorie, comité, année")
    st.write("• 📊 **Analyses visuelles** - Graphiques et statistiques")
    st.write("• 💾 **Export de données** - CSV et JSON")