import streamlit as st
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
import plotly.express as px
import plotly.graph_objects as go
import umap
import shap
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
from reportlab.lib import colors
import io
import logging
from datetime import datetime
import warnings
import os
# Silence library warnings and configure the root logger for the application.
warnings.filterwarnings('ignore')
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Path of the trained model weights inside the Docker image.
model_path = 'src/omics_vae_best_hyperparams.pth'

# IRC biomarkers looked up in the uploaded omics tables.
irc_biomarkers = [
    'UMOD_rs12917707',
    'APOL1_rs73885319',
    'MYH9_rs4821480',
    'HAVCR1',
    'TGFB1',
    'IL6',
    'HNF4A',
    'NPHS1',
    'AQP2',
    'B2MG',
    'Albumin',
    'NGAL',
    'Cystatin_C',
    'Uromodulin',
    'KLOTHO',
    'Kynurenine',
    'Indoxyl_Sulfate',
    'Creatinine',
    '5-MTP',
]
# OmicsVAE model definition
class OmicsVAE(nn.Module):
    """Transformer-based variational auto-encoder over several omics inputs.

    Each omics matrix is projected to ``hidden_dim``, tagged with a sinusoidal
    positional encoding (one position per omics type), passed through a
    transformer encoder and compressed into a latent Gaussian. The latent
    sample is decoded back into one reconstruction per omics type and is also
    fed to a linear classification head.
    """

    def __init__(self, input_dims, hidden_dim=256, latent_dim=64, num_heads=8,
                 num_layers=3, dropout=0.4, num_classes=2):
        """Build the encoder, latent heads, decoder and classifier.

        Args:
            input_dims: Feature count of each omics input, in input order.
            hidden_dim: Width of the per-omics token fed to the transformer.
            latent_dim: Size of the latent vector ``z``.
            num_heads: Attention heads of each transformer layer.
            num_layers: Number of stacked transformer encoder layers.
            dropout: Dropout rate inside the transformer layers.
            num_classes: Number of logits produced by the classifier head.
        """
        super().__init__()
        self.input_dims = input_dims
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim
        self.num_omics = len(input_dims)
        self.num_classes = num_classes
        self.input_projections = nn.ModuleList(
            [nn.Linear(dim, hidden_dim) for dim in input_dims]
        )
        # Non-persistent buffer: it follows the module across devices but is
        # NOT written to the state_dict, so existing checkpoints (saved when
        # this was a plain attribute) still load with strict key matching.
        self.register_buffer(
            'positional_encoding',
            self.create_positional_encoding(hidden_dim, max_len=self.num_omics),
            persistent=False,
        )
        transformer_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim, nhead=num_heads,
            dim_feedforward=hidden_dim * 4, dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        self.fc_mu = nn.Linear(hidden_dim * self.num_omics, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim * self.num_omics, latent_dim)
        self.fc_decode = nn.Linear(latent_dim, hidden_dim * self.num_omics)
        self.decoder_projections = nn.ModuleList(
            [nn.Linear(hidden_dim, dim) for dim in input_dims]
        )
        self.fc_classify = nn.Linear(latent_dim, num_classes)

    def create_positional_encoding(self, d_model, max_len):
        """Return a ``(1, max_len, d_model)`` sinusoidal positional encoding.

        Even columns hold sines and odd columns hold cosines of
        ``position * div_term``.
        """
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns,
        # so the cosine block must be trimmed to avoid a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        return pe.unsqueeze(0)

    def reparameterize(self, mu, log_var):
        """Sample ``z = mu + eps * exp(0.5 * log_var)`` with Gaussian ``eps``.

        NOTE(review): the sample is drawn in eval mode as well, so anything
        derived from ``z`` varies between calls; confirm whether inference
        should use ``mu`` instead.
        """
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x_list):
        """Encode, sample, decode and classify a batch.

        Args:
            x_list: One ``(batch, input_dims[i])`` tensor per omics type, in
                the same order as ``input_dims``.

        Returns:
            Tuple ``(outputs, z, mu, log_var, class_logits)`` where
            ``outputs`` is the list of per-omics reconstructions.

        Raises:
            ValueError: If ``x_list`` does not hold one tensor per omics type.
        """
        if len(x_list) != self.num_omics:
            raise ValueError(
                f"Expected {self.num_omics} omics inputs, got {len(x_list)}."
            )
        tokens = []
        for i, x in enumerate(x_list):
            proj = self.input_projections[i](x)
            # (1, hidden_dim) slice broadcasts over the batch dimension.
            proj = proj + self.positional_encoding[:, i, :]
            tokens.append(proj.unsqueeze(1))
        encoded = torch.cat(tokens, dim=1)
        transformer_out = self.transformer_encoder(encoded)
        # Flatten the omics tokens into one vector per sample.
        transformer_out = transformer_out.contiguous().view(transformer_out.size(0), -1)
        mu = self.fc_mu(transformer_out)
        log_var = self.fc_log_var(transformer_out)
        z = self.reparameterize(mu, log_var)
        decoded = self.fc_decode(z).view(z.size(0), self.num_omics, self.hidden_dim)
        outputs = [
            self.decoder_projections[i](decoded[:, i, :])
            for i in range(self.num_omics)
        ]
        class_logits = self.fc_classify(z)
        return outputs, z, mu, log_var, class_logits
# Build patient recommendations from a BioBERT classification
def generate_recommendation_with_biobert(patient_data, patient_id, biomarkers, tokenizer, model, data_dict):
    """Return markdown-formatted advice for one patient.

    A short clinical summary is built from ``patient_data`` and the first
    three ``biomarkers`` found in the omics tables, classified with
    ``model``, and the predicted class (0, 1 or 2) selects a base advice
    template that is then refined with comorbidity and creatinine notes.

    Args:
        patient_data: Mapping read for the keys ``sex``, ``age``,
            ``risk_score``, ``family_history_irc``,
            ``family_history_diabetes``, ``family_history_hypertension``,
            ``diabetes`` and ``hypertension``.
        patient_id: Row label of the patient in every table of ``data_dict``.
        biomarkers: Biomarker column names; only the first three are used.
        tokenizer: Callable turning the summary text into model inputs.
        model: Callable returning an object with a ``logits`` tensor.
        data_dict: Mapping of omics name to a DataFrame indexed by patient.

    Returns:
        The advice as a markdown string.

    Raises:
        KeyError: If the predicted class has no advice template or the
            patient is missing from a table that holds a requested biomarker.
    """
    # Keep the numeric value next to the name so that thresholds are tested
    # on the real measurement instead of re-parsing a formatted string.
    measured = []
    for omic_df in data_dict.values():
        for biomarker in biomarkers[:3]:
            if biomarker in omic_df.columns:
                measured.append((biomarker, omic_df.loc[patient_id, biomarker]))
    biomarker_values = [f"{name}: {value:.2f}" for name, value in measured]

    # Structured input text for the classifier.
    text = f"""
Patient: {patient_id}, {patient_data['sex']}, {patient_data['age']} ans.
Score de risque IRC: {patient_data['risk_score']:.1f}%.
Antécédents familiaux: IRC ({'Oui' if patient_data['family_history_irc'] else 'Non'}),
Diabète ({'Oui' if patient_data['family_history_diabetes'] else 'Non'}),
Hypertension ({'Oui' if patient_data['family_history_hypertension'] else 'Non'}).
Comorbidités: Diabète ({'Oui' if patient_data['diabetes'] else 'Non'}),
Hypertension ({'Oui' if patient_data['hypertension'] else 'Non'}).
Biomarqueurs: {', '.join(biomarker_values)}.
"""

    # Tokenise and classify without tracking gradients.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()

    # Base advice per predicted class; rebuilt on every call, so the in-place
    # additions below never leak between patients.
    base_advice = {
        0: {
            'title': f"Patient {patient_id} : Risque Faible ({patient_data['risk_score']:.1f}%)",
            'state': "Faible probabilité de progression vers l'IRC.",
            'lifestyle': "Adopter une alimentation faible en sel (<2g/jour), riche en fruits et légumes. Activité physique modérée (30 min/jour, 5 jours/semaine).",
            'monitoring': f"Bilan rénal annuel, surveiller {', '.join(biomarker_values[:2])}. Hydratation adéquate (1,5-2L/jour).",
            'therapy': "Aucune thérapie spécifique requise."
        },
        1: {
            'title': f"Patient {patient_id} : Risque Modéré ({patient_data['risk_score']:.1f}%)",
            'state': "Risque intermédiaire de progression vers l'IRC.",
            'lifestyle': "Régime strict : réduire protéines animales, sodium (<1,5g/jour). Contrôler pression artérielle (<130/80 mmHg).",
            'monitoring': f"Consultation néphrologue trimestrielle, évaluer {', '.join(biomarker_values[:2])}. Éviter AINS sauf prescription.",
            'therapy': "Considérer un contrôle glycémique strict si diabétique."
        },
        2: {
            'title': f"Patient {patient_id} : Risque Élevé ({patient_data['risk_score']:.1f}%)",
            'state': "Forte probabilité de progression vers l'IRC.",
            'lifestyle': "Régime rénal strict : faible en potassium, phosphore, sodium.",
            'monitoring': f"Consultation néphrologue urgente, surveillance hebdomadaire (créatinine, DFG). Analyser {', '.join(biomarker_values[:3])}. ",
            'therapy': "Envisager inhibiteurs de l’ECA ou diurétiques, après évaluation."
        }
    }
    advice = base_advice[prediction]

    # Refinements driven by comorbidities and family history.
    if patient_data['diabetes']:
        advice['therapy'] += " Contrôle strict de la glycémie (HbA1c <7%)."
    if patient_data['hypertension']:
        advice['therapy'] += " Médicaments antihypertenseurs (ex. : losartan) sous supervision."
    if patient_data['family_history_irc']:
        advice['monitoring'] += " Surveillance accrue des antécédents familiaux."

    # Flag elevated creatinine; the value is formatted directly so the
    # message no longer carries a stray space after the parenthesis.
    for name, value in measured:
        if "Creatinine" in name and value > 1.5:
            advice['monitoring'] += f" Attention : Créatinine élevée ({value:.2f} mg/dL), suivi rapproché recommandé."

    formatted_advice = f"""
**{advice['title']}**
- **État** : {advice['state']}
- **Mode de vie** : {advice['lifestyle']}
- **Suivi** : {advice['monitoring']}
- **Thérapie** : {advice['therapy']}
**Avertissement** : Ces recommandations doivent être validées par un médecin.
"""
    return formatted_advice
# Build a PDF report in memory
def generate_pdf_report(patient_id, patient_data, advice, umap_df, shap_importance=None):
    """Render a patient report as an in-memory PDF.

    The first page holds the patient summary, the recommendations and, when
    ``umap_df`` is given, a UMAP scatter plot placed below the text. When
    ``shap_importance`` is given, a second page shows its first ten rows as a
    bar chart.

    Args:
        patient_id: Identifier printed in the report title.
        patient_data: Mapping read for the keys ``age``, ``sex``,
            ``risk_score``, ``family_history_irc``,
            ``family_history_diabetes``, ``family_history_hypertension``,
            ``diabetes`` and ``hypertension``.
        advice: Recommendation text; split on newlines and wrapped to the
            page width.
        umap_df: DataFrame with the columns ``UMAP1``, ``UMAP2``,
            ``Score de Risque (%)`` and ``Status``, or ``None`` to skip the
            plot.
        shap_importance: Optional DataFrame with the columns
            ``Importance SHAP`` and ``Biomarqueur``.

    Returns:
        An ``io.BytesIO`` holding the PDF, rewound to the start.
    """
    # drawImage needs an ImageReader (or a file name) for in-memory images;
    # simpleSplit wraps text using the real font metrics.
    from reportlab.lib.utils import ImageReader, simpleSplit

    page_width, page_height = A4
    left_margin = 50
    bottom_margin = 40
    image_width, image_height = 500, 300

    buffer = io.BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    c.setFont("Helvetica-Bold", 16)
    c.drawString(left_margin, 800, f"Rapport IRC - Patient {patient_id}")
    c.setFont("Helvetica", 12)
    c.drawString(left_margin, 770, f"Date: {datetime.now().strftime('%Y-%m-%d')}")

    # Patient information
    c.setFont("Helvetica-Bold", 14)
    c.drawString(left_margin, 740, "Informations du Patient")
    c.setFont("Helvetica", 12)
    y = 720
    c.drawString(left_margin, y, f"Âge: {patient_data['age']} ans")
    c.drawString(left_margin, y-20, f"Sexe: {patient_data['sex']}")
    c.drawString(left_margin, y-40, f"Score de risque: {patient_data['risk_score']:.1f}%")
    c.drawString(left_margin, y-60, f"Antécédents familiaux: IRC ({'Oui' if patient_data['family_history_irc'] else 'Non'}), "
                 f"Diabète ({'Oui' if patient_data['family_history_diabetes'] else 'Non'}), "
                 f"Hypertension ({'Oui' if patient_data['family_history_hypertension'] else 'Non'})")
    c.drawString(left_margin, y-80, f"Comorbidités: Diabète ({'Oui' if patient_data['diabetes'] else 'Non'}), "
                 f"Hypertension ({'Oui' if patient_data['hypertension'] else 'Non'})")

    # Recommendations, wrapped so long lines do not run off the page.
    c.setFont("Helvetica-Bold", 14)
    c.drawString(left_margin, y-110, "Recommandations")
    c.setFont("Helvetica", 12)
    text_object = c.beginText(left_margin, y-130)
    text_object.setLeading(14)
    max_text_width = page_width - 2 * left_margin
    for line in advice.split('\n'):
        # An empty source line must still advance the cursor by one line.
        for piece in simpleSplit(line, "Helvetica", 12, max_text_width) or ['']:
            text_object.textLine(piece)
    c.drawText(text_object)

    # UMAP plot: driven by the argument itself, not by the session state.
    if umap_df is not None:
        fig = px.scatter(
            umap_df, x='UMAP1', y='UMAP2', color='Score de Risque (%)', symbol='Status',
            title='Projection UMAP',
            color_continuous_scale='RdYlGn_r',
            template='plotly_dark'
        )
        fig.update_traces(marker=dict(size=12))
        img_buffer = io.BytesIO()
        fig.write_image(img_buffer, format='png', width=image_width, height=image_height)
        img_buffer.seek(0)
        # Place the image under the text that was just written; start a new
        # page when it would not fit above the bottom margin.
        image_bottom = text_object.getY() - 10 - image_height
        if image_bottom < bottom_margin:
            c.showPage()
            image_bottom = page_height - bottom_margin - image_height
        c.drawImage(ImageReader(img_buffer), left_margin, image_bottom,
                    width=image_width, height=image_height)

    # SHAP bar chart on its own page.
    if shap_importance is not None:
        c.showPage()
        c.setFont("Helvetica-Bold", 14)
        c.drawString(left_margin, 800, "Analyse SHAP")
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.barplot(data=shap_importance.head(10), x='Importance SHAP', y='Biomarqueur',
                    palette='Set2', ax=ax)
        ax.set_title('Top 10 Biomarqueurs')
        img_buffer = io.BytesIO()
        fig.savefig(img_buffer, format='png', bbox_inches='tight')
        plt.close(fig)  # close exactly the figure created here
        img_buffer.seek(0)
        # Bottom edge chosen so the whole image sits below the page title
        # and inside the A4 page.
        c.drawImage(ImageReader(img_buffer), left_margin, 800 - 20 - image_height,
                    width=image_width, height=image_height,
                    preserveAspectRatio=True)

    c.save()
    buffer.seek(0)
    return buffer
# Streamlit page configuration and header
st.set_page_config(page_title="Analyse Multi-Omique IRC", layout="wide")
st.markdown("\n", unsafe_allow_html=True)
st.title("Plateforme d’Analyse Multi-Omique pour l’IRC")
st.markdown("**Ngoue David, M2 Intelligence Artificielle et Big Data** | Hôpital Général de Yaoundé")

# Sidebar menu: the selected entry drives which page is rendered below.
_page_choices = [
    "Présentation",
    "Chargement des Données",
    "Analyse Exploratoire",
    "Clustering",
    "Scores de Risque",
    "Analyse SHAP",
    "Conseiller Médical",
    "Résumé",
]
st.sidebar.header("Navigation")
page = st.sidebar.radio("Étapes", _page_choices)
# BioBERT loading
@st.cache_resource
def load_biobert():
    """Load the BioBERT tokenizer and a 3-label sequence classifier.

    Returns:
        Tuple ``(tokenizer, model)``.

    NOTE(review): "dmis-lab/biobert-v1.1" looks like a base checkpoint; with
    ``num_labels=3`` the classification head would be newly initialised.
    Confirm whether a fine-tuned checkpoint is intended.
    """
    checkpoint = "dmis-lab/biobert-v1.1"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3),
    )


biobert_tokenizer, biobert_model = load_biobert()
# VAE model loading
@st.cache_resource
def load_model(input_dims, num_classes):
    """Build an ``OmicsVAE`` and load its trained weights from ``model_path``.

    Args:
        input_dims: Feature count of each omics input, in input order.
        num_classes: Number of classes of the classifier head.

    Returns:
        The model in evaluation mode.

    Raises:
        FileNotFoundError: If the weights file does not exist.
    """
    model = OmicsVAE(input_dims=input_dims, num_classes=num_classes)
    try:
        # map_location='cpu' lets a checkpoint saved on a GPU be loaded on a
        # CPU-only host; the model above is built on the CPU as well.
        state_dict = torch.load(model_path, map_location='cpu')
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Modèle {model_path} non trouvé dans l’image Docker.") from err
    model.load_state_dict(state_dict)
    model.eval()
    return model
# Overview page
if page == "Présentation":
    st.header("Contexte et Innovation")
    st.markdown("""
**Projet : Thérapie Personnalisée de l’IRC**
Réalisé par Ngoue David, ce projet révolutionne la prise en charge de l’IRC à l’Hôpital Général de Yaoundé via une approche multi-omique. Une architecture de transformers hybrides (OmicsVAE) permet :
- **Prédiction** des risques de progression de l’IRC.
- **Thérapies sur mesure** basées sur les profils moléculaires.
- **Suivi intelligent** avec un conseiller BioBERT.
**Impact** : Médecine de précision pour le Cameroun, meilleurs résultats, coûts réduits.
**Explorez** via le menu latéral.
""")
# Data-loading page: one CSV per omics type, then model initialisation.
elif page == "Chargement des Données":
    st.header("Chargement des Données")
    st.markdown("Uploadez les fichiers omiques (CSV) pour l’analyse.")
    uploaded_files = {}
    omics_types = ['génomique', 'transcriptomique', 'protéomique', 'métabolomique']
    for omic in omics_types:
        uploaded_file = st.file_uploader(f"Données {omic} (CSV)", type="csv", key=omic)
        if uploaded_file:
            uploaded_files[omic] = uploaded_file
    if st.button("Initialiser l’Analyse"):
        missing_omics = [omic for omic in omics_types if omic not in uploaded_files]
        if missing_omics:
            # Tell the user why nothing happens instead of silently ignoring
            # the click.
            st.warning(f"Fichiers manquants : {', '.join(missing_omics)}.")
        else:
            try:
                data_dict = {}
                status_by_omic = {}
                for omic, file in uploaded_files.items():
                    df = pd.read_csv(file, index_col='Patient_ID')
                    if 'Status' not in df.columns:
                        raise ValueError(f"Le fichier {omic} doit contenir une colonne 'Status'.")
                    status_by_omic[omic] = df['Status']
                    data_dict[omic] = df.drop(columns=['Status'])
                # Reuse the Status column parsed above: the upload stream is
                # already consumed, so reading the same file again would fail.
                labels = status_by_omic['génomique']
                le = LabelEncoder()
                encoded_labels = pd.Series(le.fit_transform(labels), index=labels.index, name='Status')
                # Keep only patients present in every omics table, in the
                # order of the genomic file.
                common_samples = data_dict['génomique'].index
                for df in data_dict.values():
                    common_samples = common_samples[common_samples.isin(df.index)]
                if common_samples.empty:
                    raise ValueError("Aucun patient commun entre les fichiers omiques.")
                for omic in data_dict:
                    data_dict[omic] = data_dict[omic].loc[common_samples]
                labels = encoded_labels.loc[common_samples]
                input_dims = [data_dict[omic].shape[1] for omic in data_dict]
                model = load_model(input_dims, len(np.unique(encoded_labels)))
                st.session_state['data_dict'] = data_dict
                st.session_state['labels'] = labels
                st.session_state['label_encoder'] = le
                st.session_state['common_samples'] = common_samples
                st.session_state['model'] = model
                st.session_state['input_dims'] = input_dims
                st.success("Données et modèle chargés avec succès !")
            except Exception as e:
                # UI boundary: report the problem to the user and keep the
                # traceback in the logs.
                logging.exception("Data loading failed")
                st.error(f"Erreur : {str(e)}")
# Analyse Exploratoire
elif page == "Analyse Exploratoire":
st.header("Analyse Exploratoire")
if 'data_dict' not in st.session_state:
st.warning("Chargez les données d'abord.")
else:
data_dict = st.session_state['data_dict']
labels = st.session_state['labels']
omic = st.selectbox("Type omique", list(data_dict.keys()))
biomarkers = [col for col in data_dict[omic].columns if col in irc_biomarkers]
if biomarkers:
st.subheader(f"Matrice de Corrélation ({omic})")
corr_matrix = data_dict[omic][biomarkers].corr()
fig = go.Figure(data=go.Heatmap(
z=corr_matrix.values,
x=corr_matrix.columns,
y=corr_matrix.columns,
colorscale='Magma',
zmin=-1, zmax=1,
text=np.round(corr_matrix.values, 2),
texttemplate="%{text}",
hovertemplate='Biomarqueur 1: %{x}
Biomarqueur 2: %{y}
Corrélation: %{z:.2f}