CODEXMONITEUR / src /streamlit_app.py
MMOON's picture
Update src/streamlit_app.py
5b621c0 verified
# codex_app_fixed.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from datetime import datetime
import urllib.parse
import time
import html
# --- Configuration ---
# Root of the FAO web site; also used to resolve relative PDF links.
BASE_URL = "https://www.fao.org"

# Timeout (in seconds) handed to ``requests.get``.
TIMEOUT = 30

# Browser-like User-Agent sent with every HTTP request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/117.0.0.0 Safari/537.36'
    ),
}

# All category listing pages live under this common path.
_CODEX_TEXTS_ROOT = BASE_URL + "/fao-who-codexalimentarius/codex-texts"

# One entry per Codex document category: display name, listing URL,
# document-code prefix, icon and accent colour.
CODEX_CATEGORIES = {
    'codes': dict(
        name='Codes de Pratique (CXC)',
        url=f"{_CODEX_TEXTS_ROOT}/codes-of-practice/fr/",
        prefix='CXC',
        icon='📋',
        color='#FF6B6B',
    ),
    'standards': dict(
        name='Normes (CXS)',
        url=f"{_CODEX_TEXTS_ROOT}/list-standards/fr/",
        prefix='CXS',
        icon='⚖️',
        color='#4ECDC4',
    ),
    'guidelines': dict(
        name='Directives (CXG)',
        url=f"{_CODEX_TEXTS_ROOT}/guidelines/fr/",
        prefix='CXG',
        icon='📖',
        color='#45B7D1',
    ),
}
# Page configuration: title, icon, wide layout and an initially open sidebar.
_PAGE_OPTIONS = {
    "page_title": "Codex Alimentarius Monitor",
    "page_icon": "🔬",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_OPTIONS)
def clean_text(text):
    """Normalize a scraped text fragment into a single clean line.

    Falsy input (``None``, ``""``, ``0`` ...) yields an empty string.
    Otherwise the value is converted to ``str``, HTML entities are decoded,
    anything shaped like an HTML tag (``<...>``) is removed, and every run of
    whitespace is collapsed to one space with both ends trimmed.
    """
    if not text:
        return ""

    decoded = html.unescape(str(text))
    # Tags are stripped *after* decoding, so entity-encoded tags go too.
    without_tags = re.sub(r'<[^>]+>', '', decoded)
    # ``\s+`` already covers newlines, carriage returns and tabs, so a single
    # substitution both replaces them and squeezes repeated spaces.
    return re.sub(r'\s+', ' ', without_tags).strip()
# Matches an href that contains ".pdf" (case-insensitive).
_PDF_HREF_RE = re.compile(r'\.pdf', re.IGNORECASE)


def _parse_document_row(cells, category_key, category_info, current_year):
    """Build one document record from the cells of a table row.

    Returns ``None`` when the row has fewer than five cells or when its first
    cell is not a document code of the form ``"<prefix> <number>"``.
    """
    if len(cells) < 5:
        return None

    code_text, title, committee, year_text = (
        clean_text(cell.get_text(strip=True)) for cell in cells[:4]
    )

    prefix = category_info['prefix']
    code_match = re.match(rf'^({prefix})\s+([\w\-R]*\d+(?:-\d+)?[R]?)$', code_text)
    if not code_match:
        return None
    full_code = f"{code_match.group(1)} {code_match.group(2)}"

    # First four-digit run in the year cell; 0 means "unknown".
    year_match = re.search(r'\d{4}', year_text)
    year = int(year_match.group()) if year_match else 0

    # Prefer a direct PDF link from the fifth cell; otherwise fall back to a
    # search URL for the document code.
    pdf_url = None
    link_tag = cells[4].find('a', href=_PDF_HREF_RE)
    if link_tag:
        href = link_tag.get('href')
        if href:
            pdf_url = urllib.parse.urljoin(BASE_URL, urllib.parse.unquote(href))
    if not pdf_url:
        pdf_url = (
            "https://www.fao.org/fao-who-codexalimentarius/search/en/"
            f"?q={urllib.parse.quote(full_code)}"
        )

    return {
        'code': full_code,
        'title': title,
        'committee': committee,
        'year': year,
        'category': category_key,
        'category_name': category_info['name'],
        'pdf_url': pdf_url,
        'is_new': year >= current_year - 1,
        'is_updated': year == current_year,
        'icon': category_info['icon'],
        'color': category_info['color'],
    }


@st.cache_data(ttl=1800)
def _fetch_documents(url, category_key):
    """Download and parse one category page; raise on any failure.

    Only this function is cached. Because it raises instead of returning an
    empty list, a failed download is never stored in the cache.
    """
    category_info = CODEX_CATEGORIES[category_key]
    response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    current_year = datetime.now().year
    documents = []
    seen_codes = set()
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            doc = _parse_document_row(
                row.find_all(['td', 'th']), category_key, category_info, current_year
            )
            # Keep only the first occurrence of each document code.
            if doc is None or doc['code'] in seen_codes:
                continue
            seen_codes.add(doc['code'])
            documents.append(doc)
    return documents


def extract_documents_from_url(url, category_key):
    """Extract the documents listed on a Codex category page.

    Args:
        url: Address of the category listing page.
        category_key: Key into ``CODEX_CATEGORIES``; an unknown key raises
            ``KeyError``.

    Returns:
        A list of dicts with the keys ``code``, ``title``, ``committee``,
        ``year``, ``category``, ``category_name``, ``pdf_url``, ``is_new``,
        ``is_updated``, ``icon`` and ``color``. On any download or parsing
        error an error message is shown in the UI and ``[]`` is returned.

    Successful results are cached for 30 minutes by ``_fetch_documents``.
    Failures are handled here, outside the cache, so the next call retries
    instead of replaying a cached error.
    """
    category_info = CODEX_CATEGORIES[category_key]
    try:
        return _fetch_documents(url, category_key)
    except Exception as e:  # UI boundary: report the problem, keep the app running
        st.error(f"Erreur lors de l'extraction de {category_info['name']} : {e}")
        return []
def display_document_card(doc):
    """Render one document as a card built only from native Streamlit widgets.

    ``doc`` must provide the keys ``icon``, ``category_name``, ``is_new``,
    ``is_updated``, ``code``, ``title``, ``committee``, ``year`` and
    ``pdf_url``.
    """
    # The category badge is always present; status badges depend on the flags.
    labels = [f"{doc['icon']} {doc['category_name']}"]
    optional_badges = (
        (doc['is_new'], "✨ NOUVEAU"),
        (doc['is_updated'], "🔄 MIS À JOUR"),
    )
    labels.extend(text for enabled, text in optional_badges if enabled)

    with st.container():
        # One column per badge; the widget is chosen from the badge text.
        for column, label in zip(st.columns(len(labels)), labels):
            with column:
                if "NOUVEAU" in label:
                    st.success(label)
                elif "MIS À JOUR" in label:
                    st.info(label)
                else:
                    st.caption(label)

        # Heading, then details on the left and the download link on the right.
        st.subheader(f"{doc['code']} - {doc['title']}")
        details_col, action_col = st.columns([2, 1])
        with details_col:
            st.text(f"🏢 Comité: {doc['committee']}")
            st.text(f"📅 Année: {doc['year']}")
        with action_col:
            st.link_button(
                "📄 Télécharger PDF",
                doc['pdf_url'],
                type="primary",
                use_container_width=True
            )
        st.divider()
def display_metrics_safe(df):
    """Show four headline metrics for ``df`` in a single row.

    The metrics are the total row count, the number of rows flagged
    ``is_new``, the number flagged ``is_updated`` and the number of distinct
    ``category_name`` values. Nothing is rendered for an empty frame.
    """
    if df.empty:
        return

    columns = st.columns(4)
    figures = (
        ("📊 Total Documents", len(df)),
        ("✨ Nouveaux", len(df[df['is_new']])),
        ("🔄 Mis à jour", len(df[df['is_updated']])),
        ("📂 Catégories", df['category_name'].nunique()),
    )
    for column, (label, value) in zip(columns, figures):
        with column:
            st.metric(label=label, value=value)
def display_category_stats(df):
    """Show per-category totals, one expanded panel per category.

    Rows are grouped by ``category_name`` and ``category``. Each panel shows
    the document count and the sums of the ``is_new`` and ``is_updated``
    flags. Nothing is rendered for an empty frame.
    """
    if df.empty:
        return

    st.subheader("📋 Répartition par Catégorie")

    aggregations = {'code': 'count', 'is_new': 'sum', 'is_updated': 'sum'}
    per_category = (
        df.groupby(['category_name', 'category'])
        .agg(aggregations)
        .reset_index()
    )

    for stats in per_category.itertuples(index=False):
        icon = CODEX_CATEGORIES[stats.category]['icon']
        with st.expander(f"{icon} {stats.category_name}", expanded=True):
            counters = (
                ("Total", stats.code),
                ("Nouveaux", stats.is_new),
                ("Mis à jour", stats.is_updated),
            )
            for column, (label, value) in zip(st.columns(3), counters):
                with column:
                    st.metric(label, int(value))
# Minimal CSS overrides: transparent header, extra top padding and a
# gradient banner behind the page title.
# NOTE(review): the h1 rule declares ``color`` twice; the later
# ``color: white`` is the one that takes effect.
_PAGE_CSS = """
<style>
.stApp > header {
background-color: transparent;
}
.main > div {
padding-top: 2rem;
}
h1 {
color: #1f77b4;
text-align: center;
padding: 1rem;
background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 10px;
margin-bottom: 2rem;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)
# Session defaults: no documents loaded and no refresh timestamp yet.
if 'documents' not in st.session_state:
    st.session_state['documents'] = []
    st.session_state['last_update'] = None

# Main page heading
st.title("🔬 Codex Alimentarius Monitor")
st.caption("Exploration et suivi des normes alimentaires internationales")
# Sidebar: refresh control, last-update indicator and document filters.
with st.sidebar:
    st.header("🎛️ Centre de Contrôle")

    if st.button("🔄 Actualiser les Documents", type="primary", use_container_width=True):
        with st.spinner("🔍 Extraction en cours..."):
            all_documents = []
            progress_bar = st.progress(0)
            status_text = st.empty()
            for i, (cat_key, cat_info) in enumerate(CODEX_CATEGORIES.items()):
                status_text.info(f"{cat_info['icon']} Extraction de {cat_info['name']}...")
                docs = extract_documents_from_url(cat_info['url'], cat_key)
                all_documents.extend(docs)
                progress_bar.progress((i + 1) / len(CODEX_CATEGORIES))
                time.sleep(0.5)
            st.session_state.documents = all_documents
            st.session_state.last_update = datetime.now()
            status_text.success("✅ Données actualisées!")
            time.sleep(1)
            status_text.empty()

    if st.session_state.last_update:
        st.success(f"🕒 Dernière MAJ: {st.session_state.last_update.strftime('%d/%m/%Y %H:%M')}")

    # Filters are offered only once documents have been loaded.
    if st.session_state.documents:
        st.divider()
        st.header("🔍 Filtres")
        df_all = pd.DataFrame(st.session_state.documents)

        # Category filter
        categories = ['Toutes'] + list(df_all['category_name'].unique())
        selected_category = st.selectbox("📂 Catégorie:", categories)

        # Committee filter
        committees = ['Tous'] + sorted(df_all['committee'].unique())
        selected_committee = st.selectbox("🏢 Comité:", committees)

        # Status filters
        col1, col2 = st.columns(2)
        with col1:
            filter_new = st.checkbox("✨ Nouveaux")
        with col2:
            filter_updated = st.checkbox("🔄 Mis à jour")

        # Year filter: only known years (> 0) are offered, all selected by default.
        years = sorted([y for y in df_all['year'].unique() if y > 0], reverse=True)
        selected_years = []
        if years:
            selected_years = st.multiselect("📅 Années:", years, default=years)

        # Free-text search
        search_term = st.text_input("🔍 Recherche:", placeholder="Code ou titre...")

        # Apply the filters
        filtered_df = df_all.copy()
        if selected_category != 'Toutes':
            filtered_df = filtered_df[filtered_df['category_name'] == selected_category]
        if selected_committee != 'Tous':
            filtered_df = filtered_df[filtered_df['committee'] == selected_committee]
        if filter_new:
            filtered_df = filtered_df[filtered_df['is_new']]
        if filter_updated:
            filtered_df = filtered_df[filtered_df['is_updated']]
        # Narrow by year only when the user deselected at least one year.
        # Filtering on the default "all years" selection would silently drop
        # every document whose year is unknown (stored as 0).
        if selected_years and len(selected_years) < len(years):
            filtered_df = filtered_df[filtered_df['year'].isin(selected_years)]
        if search_term:
            # regex=False: the term is matched literally, so input such as
            # "(" cannot raise a regular-expression error.
            search_mask = (
                filtered_df['title'].str.contains(search_term, case=False, na=False, regex=False) |
                filtered_df['code'].str.contains(search_term, case=False, na=False, regex=False)
            )
            filtered_df = filtered_df[search_mask]

        st.session_state.filtered_df = filtered_df
        st.info(f"📊 {len(filtered_df)} documents trouvés")
# Main content: metrics, tabs (overview / charts / paginated list) and export
# when documents are loaded; otherwise a short welcome page.
if st.session_state.documents:
    df_display = st.session_state.get('filtered_df', pd.DataFrame(st.session_state.documents))
    df_display = df_display.sort_values(by=['year', 'code'], ascending=[False, True]).reset_index(drop=True)

    # Headline metrics
    display_metrics_safe(df_display)

    tab1, tab2, tab3 = st.tabs(["📋 Aperçu", "📊 Analyses", "📄 Documents"])

    with tab1:
        display_category_stats(df_display)
        if not df_display.empty:
            st.subheader("📈 Informations Générales")
            col1, col2 = st.columns(2)
            with col1:
                # A year of 0 means the year is unknown; leave it out of the
                # period unless no document has a known year.
                known_years = df_display.loc[df_display['year'] > 0, 'year']
                period_years = df_display['year'] if known_years.empty else known_years
                # NOTE(review): hard-coded cut-off for "recent" documents;
                # it will go stale as time passes. Confirm the intended window.
                recent_since = 2023
                st.write("**📊 Statistiques**")
                st.write(f"• Période: {period_years.min()} - {period_years.max()}")
                st.write(f"• Comités: {df_display['committee'].nunique()}")
                st.write(f"• Documents récents: {len(df_display[df_display['year'] >= recent_since])}")
            with col2:
                st.write("**🏆 Top 5 Comités**")
                top_committees = df_display['committee'].value_counts().head()
                for committee, count in top_committees.items():
                    st.write(f"• {committee}: {count}")

    with tab2:
        st.subheader("📊 Analyses Visuelles")
        col1, col2 = st.columns(2)
        with col1:
            st.write("##### 📊 Par Catégorie")
            category_counts = df_display['category_name'].value_counts()
            st.bar_chart(category_counts)
        with col2:
            st.write("##### 📈 Par Année")
            year_counts = df_display[df_display['year'] > 2000]['year'].value_counts().sort_index()
            st.line_chart(year_counts)

    with tab3:
        st.subheader(f"📄 Liste des Documents ({len(df_display)} résultats)")

        # Simple pagination
        items_per_page = 10
        if 'page_num' not in st.session_state:
            st.session_state.page_num = 0
        # Ceiling division, with at least one page so the clamp below is valid.
        total_pages = max(1, -(-len(df_display) // items_per_page))
        # Filters can shrink the result set between reruns; clamp the stored
        # page so the user is never left on an empty page past the end.
        st.session_state.page_num = max(0, min(st.session_state.page_num, total_pages - 1))

        if total_pages > 1:
            col1, col2, col3 = st.columns([1, 2, 1])
            with col1:
                if st.button("⬅️ Précédent", disabled=(st.session_state.page_num == 0)):
                    st.session_state.page_num -= 1
                    st.rerun()
            with col2:
                st.write(f"Page {st.session_state.page_num + 1} sur {total_pages}")
            with col3:
                if st.button("Suivant ➡️", disabled=(st.session_state.page_num >= total_pages - 1)):
                    st.session_state.page_num += 1
                    st.rerun()

        # Documents of the current page
        start_idx = st.session_state.page_num * items_per_page
        end_idx = start_idx + items_per_page
        page_docs = df_display.iloc[start_idx:end_idx]
        for _, doc in page_docs.iterrows():
            display_document_card(doc)

    # Export of the currently displayed (filtered and sorted) documents
    if not df_display.empty:
        st.divider()
        st.subheader("💾 Export des Données")
        col1, col2 = st.columns(2)
        with col1:
            csv = df_display.to_csv(index=False, sep=';')
            st.download_button(
                "📊 Télécharger CSV",
                csv,
                f"codex_{datetime.now().strftime('%Y%m%d')}.csv",
                "text/csv",
                use_container_width=True
            )
        with col2:
            json_str = df_display.to_json(orient='records', indent=2)
            st.download_button(
                "📋 Télécharger JSON",
                json_str,
                f"codex_{datetime.now().strftime('%Y%m%d')}.json",
                "application/json",
                use_container_width=True
            )
else:
    # Welcome page shown until the first refresh
    st.info("👈 Cliquez sur 'Actualiser les Documents' dans la barre latérale pour commencer")
    st.subheader("🎯 Fonctionnalités")
    st.write("• 📋 **Codes de Pratique (CXC)** - Procédures et bonnes pratiques")
    st.write("• ⚖️ **Normes (CXS)** - Standards alimentaires officiels")
    st.write("• 📖 **Directives (CXG)** - Lignes directrices et recommandations")
    st.write("• 🔍 **Recherche avancée** - Filtrage par catégorie, comité, année")
    st.write("• 📊 **Analyses visuelles** - Graphiques et statistiques")
    st.write("• 💾 **Export de données** - CSV et JSON")