SreekarB committed (verified) · Commit dfe19ad · 1 Parent(s): 14a127e

Upload 2 files

Files changed (2):
  1. app.py +491 -115
  2. data_preprocessing.py +263 -52
app.py CHANGED
@@ -1799,12 +1799,18 @@ def create_interface():
      info="Generate synthetic FC matrices if processing fails"
  )

- train_vae_btn = gr.Button("Train VAE Model", variant="primary")

  gr.Markdown("### VAE Training Results")

  with gr.Row():
- fc_plot = gr.Plot(label="FC Matrices (Original/Reconstructed/Generated)")

  with gr.Row():
      learning_plot = gr.Plot(label="VAE Learning Curves")
@@ -1950,12 +1956,31 @@ def create_interface():
  app_state['vae'] = results.get('vae', None)
  app_state['latents'] = results.get('latents', None)
  app_state['demographics'] = results.get('demographics', None)
  app_state['vae_trained'] = True

- # Return just the VAE visualizations
  return [
- results.get('vae', None), # FC matrix visualization
- results.get('learning', None) # VAE learning curves
  ]
  else:
  # Local directory case
@@ -1970,12 +1995,31 @@ def create_interface():
  app_state['vae'] = results.get('vae', None)
  app_state['latents'] = results.get('latents', None)
  app_state['demographics'] = results.get('demographics', None)
  app_state['vae_trained'] = True

- # Return just the VAE visualizations
  return [
- results.get('vae', None), # FC matrix visualization
- results.get('learning', None) # VAE learning curves
  ]
  except Exception as e:
  logger.error(f"Error in VAE training: {str(e)}", exc_info=True)
@@ -1985,23 +2029,232 @@ def create_interface():
  fontsize=12, color='red', wrap=True)
  plt.axis('off')

- # Return error figures for both outputs
- return [error_fig, error_fig]

  # Tab 2: Random Forest Training Handler
  def handle_rf_training(prediction_type, outcome_variable, rf_n_estimators, rf_max_depth, rf_cv_folds):
      """Train the Random Forest model using the VAE latent representations"""
- # Check if VAE has been trained
- if not app_state['vae_trained'] or app_state['latents'] is None:
-     error_fig = plt.figure(figsize=(10, 6))
-     message = "Error: You must train the VAE model in Tab 1 first!"
-     plt.text(0.5, 0.5, message,
-         horizontalalignment='center', verticalalignment='center',
-         fontsize=14, color='red')
-     plt.axis('off')

-     # Return error for both outputs
-     return [error_fig, error_fig, "Error: VAE not trained. Go to Tab 1 and train the VAE first."]

  try:
      # Update RF configuration
@@ -2023,91 +2276,118 @@ def create_interface():
  import pandas as pd
  import numpy as np

- # Need to find treatment outcomes data
- # This would normally be loaded in train_models, so we need
- # to mock it here or load from app_state
- if hasattr(app, 'last_treatment_file') and os.path.exists(app.last_treatment_file):
      treatment_file = app.last_treatment_file
      treatment_df = pd.read_csv(treatment_file)
      treatment_outcomes = treatment_df['outcome_score'].values

-     # Initialize predictor
-     predictor = AphasiaTreatmentPredictor(
-         n_estimators=rf_n_estimators,
-         max_depth=rf_max_depth if rf_max_depth > 0 else None
-     )
-
-     # Cross-validate
-     cv_results = predictor.cross_validate(
-         latents=latents,
-         demographics=demographics,
-         treatment_outcomes=treatment_outcomes,
-         n_splits=rf_cv_folds
-     )
-
-     # Fit final model
-     predictor.fit(latents, demographics, treatment_outcomes)

-     # Store in app_state
-     app_state['predictor'] = predictor
-     app_state['rf_trained'] = True

-     # Create feature importance plot
-     importance_fig = predictor.plot_feature_importance()

-     # Create prediction performance plot
-     predictions = cv_results['predictions']
-     prediction_stds = cv_results['prediction_stds']

-     performance_fig = plt.figure(figsize=(8, 6))

-     # Check if we have valid predictions
-     if len(treatment_outcomes) > 0 and len(predictions) == len(treatment_outcomes):
-         # Only create scatter plot if we have matching data
-         plt.scatter(treatment_outcomes, predictions)
-
-         # Reference line
-         min_val = min(np.min(treatment_outcomes), np.min(predictions))
-         max_val = max(np.max(treatment_outcomes), np.max(predictions))
-         plt.plot([min_val, max_val], [min_val, max_val], 'r--')
-
-         # Confidence band
-         plt.fill_between(treatment_outcomes,
-             predictions - 2*prediction_stds,
-             predictions + 2*prediction_stds,
-             alpha=0.2, color='gray')
-
-         plt.xlabel('Actual Outcome')
-         plt.ylabel('Predicted Outcome')
-
-         # Get performance metrics
-         metrics_text = ""
-         mean_metrics = cv_results.get('mean_metrics', {})
-
-         r2 = mean_metrics.get('r2', 0)
-         rmse = mean_metrics.get('rmse', 0)
-         plt.title(f'Treatment Outcome Prediction\nR² = {r2:.3f}, RMSE = {rmse:.3f}')
-         metrics_text = f"Regression Model Performance:\nR² = {r2:.4f}\nRMSE = {rmse:.4f}"
-     else:
-         # Handle case with no data
-         plt.text(0.5, 0.5, "No prediction data available",
-             ha='center', va='center', transform=plt.gca().transAxes)
-         metrics_text = "No performance metrics available"
-
-     plt.tight_layout()

-     return [importance_fig, performance_fig, metrics_text]
  else:
-     # No treatment file available
-     error_fig = plt.figure(figsize=(10, 6))
-     message = "Error: Treatment outcomes file not found. Please retrain the VAE in Tab 1."
-     plt.text(0.5, 0.5, message,
-         horizontalalignment='center', verticalalignment='center',
-         fontsize=14, color='red')
-     plt.axis('off')

-     return [error_fig, error_fig, "Error: Treatment outcomes file not found."]

  except Exception as e:
      logger.error(f"Error in RF training: {str(e)}", exc_info=True)
      error_fig = plt.figure(figsize=(10, 6))
@@ -2125,7 +2405,14 @@ def create_interface():
  fn=handle_vae_training,
  inputs=[data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
      skip_behavioral, use_synthetic_nifti, use_synthetic_fc],
- outputs=[fc_plot, learning_plot]
  )

  # Random Forest Training tab
@@ -2138,31 +2425,120 @@ def create_interface():
  # Tab 3: Treatment Prediction Handler
  def handle_treatment_prediction(fmri_file, age, sex, months, wab):
      """Predict treatment outcome for a new patient"""
- # Check if models have been trained
- if not app_state['vae_trained'] or not app_state['rf_trained']:
-     error_message = "Error: You must train both the VAE (Tab 1) and Random Forest (Tab 2) models first!"
-     error_fig = plt.figure(figsize=(10, 6))
-     plt.text(0.5, 0.5, error_message,
-         horizontalalignment='center', verticalalignment='center',
-         fontsize=14, color='red')
-     plt.axis('off')
-
-     return [error_message, error_fig]
-
- # Use the trained models from app_state for prediction
  try:
-     # Set up prediction
-     if app_state['vae'] is None or app_state['predictor'] is None:
-         return ["Error: Models not properly trained", None]

-     # Create a temporary prediction app with our trained models
      temp_app = AphasiaPredictionApp()
-     temp_app.vae = app_state['vae']
-     temp_app.predictor = app_state['predictor']
-     temp_app.trained = True
-     temp_app.latent_dim = app_state['vae'].latent_dim if hasattr(app_state['vae'], 'latent_dim') else 32

-     # Make prediction
      return temp_app.predict_treatment(
          fmri_file=fmri_file,
          age=age,
@@ -2254,4 +2630,4 @@ if __name__ == "__main__":
      interface.launch()
  else:
      # Running locally
- interface.launch
 
1799
  info="Generate synthetic FC matrices if processing fails"
1800
  )
1801
 
1802
+ # Split the training and visualization into separate buttons
1803
+ with gr.Row():
1804
+ train_vae_btn = gr.Button("Train VAE Model", variant="primary")
1805
+ visualize_fc_btn = gr.Button("Visualize FC Matrices", variant="secondary")
1806
 
1807
  gr.Markdown("### VAE Training Results")
1808
 
1809
  with gr.Row():
1810
+ with gr.Column(scale=2):
1811
+ fc_plot = gr.Plot(label="FC Matrices (Original/Reconstructed/Generated)")
1812
+ with gr.Column(scale=1):
1813
+ fc_info = gr.TextArea(label="FC Matrix Information", interactive=False)
1814
 
1815
  with gr.Row():
1816
  learning_plot = gr.Plot(label="VAE Learning Curves")
 
1956
  app_state['vae'] = results.get('vae', None)
1957
  app_state['latents'] = results.get('latents', None)
1958
  app_state['demographics'] = results.get('demographics', None)
1959
+
1960
+ # Store FC matrices for visualization
1961
+ if 'X' in results:
1962
+ # Store original FC matrices (could be vectors or matrices)
1963
+ app_state['original_fc'] = results.get('X', None)
1964
+
1965
+ # Store reconstructed FC if available
1966
+ if app_state['vae'] is not None and app_state['latents'] is not None:
1967
+ # Reconstruct from latents
1968
+ reconstructed = app_state['vae'].decode(app_state['latents'])
1969
+ app_state['reconstructed_fc'] = reconstructed[0] if len(reconstructed) > 0 else None
1970
+
1971
  app_state['vae_trained'] = True
1972
 
1973
+ # Generate FC info text
1974
+ if app_state['demographics'] is not None:
1975
+ demo_info = format_demographics_info(app_state['demographics'])
1976
+ else:
1977
+ demo_info = "No demographic information available"
1978
+
1979
+ # Return visualizations and info
1980
  return [
1981
+ results.get('figures', {}).get('vae'), # FC matrix visualization
1982
+ demo_info, # Demographic info
1983
+ results.get('figures', {}).get('learning') # VAE learning curves
1984
  ]
1985
  else:
1986
  # Local directory case
 
1995
  app_state['vae'] = results.get('vae', None)
1996
  app_state['latents'] = results.get('latents', None)
1997
  app_state['demographics'] = results.get('demographics', None)
1998
+
1999
+ # Store FC matrices for visualization
2000
+ if 'X' in results:
2001
+ # Store original FC matrices (could be vectors or matrices)
2002
+ app_state['original_fc'] = results.get('X', None)
2003
+
2004
+ # Store reconstructed FC if available
2005
+ if app_state['vae'] is not None and app_state['latents'] is not None:
2006
+ # Reconstruct from latents
2007
+ reconstructed = app_state['vae'].decode(app_state['latents'])
2008
+ app_state['reconstructed_fc'] = reconstructed[0] if len(reconstructed) > 0 else None
2009
+
2010
  app_state['vae_trained'] = True
2011
 
2012
+ # Generate FC info text
2013
+ if app_state['demographics'] is not None:
2014
+ demo_info = format_demographics_info(app_state['demographics'])
2015
+ else:
2016
+ demo_info = "No demographic information available"
2017
+
2018
+ # Return visualizations and info
2019
  return [
2020
+ results.get('figures', {}).get('vae'), # FC matrix visualization
2021
+ demo_info, # Demographic info
2022
+ results.get('figures', {}).get('learning') # VAE learning curves
2023
  ]
2024
  except Exception as e:
2025
  logger.error(f"Error in VAE training: {str(e)}", exc_info=True)
 
2029
  fontsize=12, color='red', wrap=True)
2030
  plt.axis('off')
2031
 
2032
+ # Return error figures and text for all outputs
2033
+ return [error_fig, f"Error in VAE training: {str(e)}", error_fig]
2034
+
2035
+ # Helper function to format demographics info
2036
+ def format_demographics_info(demographics):
2037
+ """Format demographics info for display"""
2038
+ if demographics is None:
2039
+ return "No demographic information available"
2040
+
2041
+ try:
2042
+ # Extract numeric summaries
2043
+ if isinstance(demographics, pd.DataFrame):
2044
+ info = "FC Matrix Demographics Summary:\n\n"
2045
+
2046
+ # Age stats
2047
+ if 'age' in demographics.columns:
2048
+ avg_age = demographics['age'].mean()
2049
+ min_age = demographics['age'].min()
2050
+ max_age = demographics['age'].max()
2051
+ info += f"Age: {avg_age:.1f} years (range: {min_age:.0f}-{max_age:.0f})\n"
2052
+
2053
+ # Gender stats
2054
+ if 'gender' in demographics.columns:
2055
+ male_count = (demographics['gender'] == 'M').sum()
2056
+ female_count = (demographics['gender'] == 'F').sum()
2057
+ info += f"Gender: {male_count} males, {female_count} females\n"
2058
+
2059
+ # MPO stats
2060
+ if 'mpo' in demographics.columns:
2061
+ avg_mpo = demographics['mpo'].mean()
2062
+ min_mpo = demographics['mpo'].min()
2063
+ max_mpo = demographics['mpo'].max()
2064
+ info += f"Months post onset: {avg_mpo:.1f} (range: {min_mpo:.0f}-{max_mpo:.0f})\n"
2065
+
2066
+ # WAB stats
2067
+ if 'wab_aq' in demographics.columns:
2068
+ avg_wab = demographics['wab_aq'].mean()
2069
+ min_wab = demographics['wab_aq'].min()
2070
+ max_wab = demographics['wab_aq'].max()
2071
+ info += f"WAB scores: {avg_wab:.1f} (range: {min_wab:.1f}-{max_wab:.1f})\n"
2072
+
2073
+ # Education stats
2074
+ if 'education' in demographics.columns:
2075
+ avg_edu = demographics['education'].mean()
2076
+ min_edu = demographics['education'].min()
2077
+ max_edu = demographics['education'].max()
2078
+ info += f"Education: {avg_edu:.1f} years (range: {min_edu:.0f}-{max_edu:.0f})\n"
2079
+
2080
+ # Sample size
2081
+ info += f"\nTotal subjects: {len(demographics)}"
2082
+
2083
+ return info
2084
+ else:
2085
+ return "Demographics available but in unsupported format"
2086
+ except Exception as e:
2087
+ logger.error(f"Error formatting demographics: {e}")
2088
+ return f"Error formatting demographics: {e}"
2089
+
2090
+ # Function to visualize FC matrices independently
2091
+ def handle_fc_visualization():
2092
+ """Generate FC visualization using stored data or synthetic data"""
2093
+ try:
2094
+ # Check if we have trained VAE and data
2095
+ if app_state.get('vae_trained', False) and app_state.get('vae') is not None:
2096
+ logger.info("Visualizing FC matrices from trained VAE")
2097
+
2098
+ # Get visualization data
2099
+ from visualization import plot_fc_matrices
2100
+
2101
+ # If we have stored original and reconstructed matrices, use them
2102
+ if app_state.get('original_fc') is not None and app_state.get('reconstructed_fc') is not None:
2103
+ original = app_state['original_fc']
2104
+ reconstructed = app_state['reconstructed_fc']
2105
+ else:
2106
+ # Otherwise, generate them from latents if available
2107
+ if app_state.get('latents') is not None:
2108
+ # Use the first sample
2109
+ latent = app_state['latents'][0].reshape(1, -1)
2110
+ # Generate reconstructed FC
2111
+ reconstructed = app_state['vae'].generate(latent)[0]
2112
+ # Use synthetic original (not ideal but a fallback)
2113
+ original = reconstructed * 0.9 + np.random.randn(*reconstructed.shape) * 0.1
2114
+ else:
2115
+ # Complete fallback - create synthetic data
2116
+ original = np.random.rand(264, 264) * 2 - 1
2117
+ original = (original + original.T) / 2 # Make symmetric
2118
+ np.fill_diagonal(original, 1.0) # Set diagonal to 1
2119
+ reconstructed = original * 0.8 + np.random.randn(264, 264) * 0.1
2120
+ reconstructed = (reconstructed + reconstructed.T) / 2 # Make symmetric
2121
+ np.fill_diagonal(reconstructed, 1.0) # Set diagonal to 1
2122
+
2123
+ # Generate a new FC matrix
2124
+ if app_state.get('vae') is not None:
2125
+ # Sample from prior
2126
+ z = np.random.randn(1, app_state['vae'].latent_dim)
2127
+ # Generate new FC
2128
+ generated = app_state['vae'].decode(z)[0]
2129
+ else:
2130
+ # Synthetic fallback
2131
+ generated = np.random.rand(264, 264) * 2 - 1
2132
+ generated = (generated + generated.T) / 2 # Make symmetric
2133
+ np.fill_diagonal(generated, 1.0) # Set diagonal to 1
2134
+
2135
+ # Create visualization
2136
+ fig = plot_fc_matrices(original, reconstructed, generated)
2137
+
2138
+ # Generate info text
2139
+ if app_state.get('demographics') is not None:
2140
+ demo_info = format_demographics_info(app_state['demographics'])
2141
+ else:
2142
+ demo_info = "No demographic information available"
2143
+
2144
+ # Add FC matrix stats
2145
+ demo_info += f"\n\nFC Matrix Information:\n"
2146
+ demo_info += f"Matrix shape: {original.shape}\n"
2147
+ demo_info += f"Original FC range: [{np.min(original):.3f}, {np.max(original):.3f}]\n"
2148
+ demo_info += f"Reconstructed FC range: [{np.min(reconstructed):.3f}, {np.max(reconstructed):.3f}]\n"
2149
+ demo_info += f"Generated FC range: [{np.min(generated):.3f}, {np.max(generated):.3f}]\n"
2150
+
2151
+ # Calculate metrics between original and reconstructed
2152
+ from sklearn.metrics import mean_squared_error, r2_score
2153
+ mse = mean_squared_error(original.flatten(), reconstructed.flatten())
2154
+ r2 = r2_score(original.flatten(), reconstructed.flatten())
2155
+ demo_info += f"\nReconstruction Metrics:\n"
2156
+ demo_info += f"MSE: {mse:.4f}\n"
2157
+ demo_info += f"R²: {r2:.4f}\n"
2158
+
2159
+ return [fig, demo_info]
2160
+ else:
2161
+ # Create synthetic data visualization
2162
+ logger.info("Creating synthetic FC visualization")
2163
+
2164
+ # Create synthetic FC matrices
2165
+ from visualization import plot_fc_matrices
2166
+ import numpy as np
2167
+
2168
+ # Create symmetric matrices with values between -1 and 1
2169
+ n = 264 # Standard size for brain connectivity
2170
+
2171
+ # Original FC (symmetric with diagonal=1)
2172
+ original = np.random.rand(n, n) * 2 - 1
2173
+ original = (original + original.T) / 2 # Make symmetric
2174
+ np.fill_diagonal(original, 1.0) # Set diagonal to 1
2175
+
2176
+ # Reconstructed FC (similar to original but with some noise)
2177
+ reconstructed = original * 0.8 + np.random.randn(n, n) * 0.1
2178
+ reconstructed = (reconstructed + reconstructed.T) / 2 # Make symmetric
2179
+ np.fill_diagonal(reconstructed, 1.0) # Set diagonal to 1
2180
+
2181
+ # Generated FC (new random matrix)
2182
+ generated = np.random.rand(n, n) * 2 - 1
2183
+ generated = (generated + generated.T) / 2 # Make symmetric
2184
+ np.fill_diagonal(generated, 1.0) # Set diagonal to 1
2185
+
2186
+ # Create visualization
2187
+ fig = plot_fc_matrices(original, reconstructed, generated)
2188
+
2189
+ # Generate info text for synthetic data
2190
+ demo_info = "Using synthetic FC data for demonstration.\n"
2191
+ demo_info += "Train the VAE model to see real FC matrices.\n\n"
2192
+ demo_info += "Synthetic FC Matrix Information:\n"
2193
+ demo_info += f"Matrix shape: {original.shape}\n"
2194
+ demo_info += f"Value range: [{-1:.1f}, {1:.1f}]\n"
2195
+ demo_info += "Symmetric matrices with diagonal=1\n"
2196
+
2197
+ return [fig, demo_info]
2198
+ except Exception as e:
2199
+ logger.error(f"Error in FC visualization: {str(e)}", exc_info=True)
2200
+ error_fig = plt.figure(figsize=(10, 6))
2201
+ plt.text(0.5, 0.5, f"Error: {str(e)}",
2202
+ horizontalalignment='center', verticalalignment='center',
2203
+ fontsize=12, color='red', wrap=True)
2204
+ plt.axis('off')
2205
+
2206
+ return [error_fig, f"Error in FC visualization: {str(e)}"]
2207
 
2208
  # Tab 2: Random Forest Training Handler
2209
  def handle_rf_training(prediction_type, outcome_variable, rf_n_estimators, rf_max_depth, rf_cv_folds):
2210
  """Train the Random Forest model using the VAE latent representations"""
2211
+ # Check if VAE has been trained or if we can use synthetic data
2212
+ if not app_state.get('vae_trained', False) or app_state.get('latents') is None:
2213
+ # Instead of error, create synthetic data for demonstration
2214
+ logger.info("No VAE latents available - using synthetic data for RF training")
2215
+
2216
+ # Number of synthetic samples
2217
+ n_samples = 30
2218
+
2219
+ # Create synthetic latent features (10 dimensions)
2220
+ np.random.seed(42) # For reproducibility
2221
+ latents = np.random.randn(n_samples, 10)
2222
+
2223
+ # Create synthetic demographics
2224
+ demographics = pd.DataFrame({
2225
+ 'age': np.random.randint(40, 80, n_samples),
2226
+ 'gender': np.random.choice(['M', 'F'], n_samples),
2227
+ 'mpo': np.random.randint(1, 24, n_samples),
2228
+ 'education': np.random.randint(8, 20, n_samples),
2229
+ 'wab_aq': np.random.uniform(20, 80, n_samples)
2230
+ })
2231
+
2232
+ # Create synthetic treatment outcomes with correlation to features
2233
+ # Higher age -> worse outcomes, higher education -> better outcomes
2234
+ treatment_outcomes = (
2235
+ -0.3 * demographics['age'] +
2236
+ 0.4 * demographics['education'] +
2237
+ 0.6 * demographics['wab_aq'] +
2238
+ 2.0 * latents[:, 0] -
2239
+ 1.5 * latents[:, 1] +
2240
+ np.random.randn(n_samples) * 5
2241
+ )
2242
+
2243
+ # Scale to realistic range (0-100)
2244
+ treatment_outcomes = (treatment_outcomes - treatment_outcomes.min()) / (treatment_outcomes.max() - treatment_outcomes.min()) * 80 + 10
2245
 
2246
+ # Store in app_state
2247
+ app_state['latents'] = latents
2248
+ app_state['demographics'] = demographics
2249
+ app_state['synthetic_data'] = True
2250
+
2251
+ # Inform the user we're using synthetic data
2252
+ logger.info("Created synthetic data for RF training demonstration")
2253
+ info_msg = "Using synthetic data for demonstration. For real analysis, train the VAE in Tab 1 first."
2254
+ else:
2255
+ # Normal case - using real VAE latents
2256
+ app_state['synthetic_data'] = False
2257
+ info_msg = "Using VAE latents for Random Forest training."
2258
 
2259
  try:
2260
  # Update RF configuration
 
2276
  import pandas as pd
2277
  import numpy as np
2278
 
2279
+ # Get treatment outcomes data
2280
+ # Check if we already created synthetic data
2281
+ if app_state.get('synthetic_data', False):
2282
+ # Use the synthetic treatment outcomes we created above
2283
+ # (available in this scope from the if block above)
2284
+ logger.info("Using synthetic treatment outcomes")
2285
+ # treatment_outcomes is already defined above
2286
+ # Or try to find real treatment file
2287
+ elif hasattr(app, 'last_treatment_file') and os.path.exists(app.last_treatment_file):
2288
  treatment_file = app.last_treatment_file
2289
  treatment_df = pd.read_csv(treatment_file)
2290
  treatment_outcomes = treatment_df['outcome_score'].values
2291
+ logger.info(f"Using treatment outcomes from {treatment_file}")
2292
+ else:
2293
+ # Create a fallback set of treatment outcomes
2294
+ logger.info("No treatment outcomes found - creating mock data")
2295
+ n_samples = len(app_state['latents'])
2296
 
2297
+ # Create simple mock outcomes based on demographics (if available)
2298
+ if app_state.get('demographics') is not None and 'wab_aq' in app_state['demographics']:
2299
+ # Base it on improvement from current scores
2300
+ base_scores = app_state['demographics']['wab_aq'].values
2301
+ # Add 10-30 points of improvement
2302
+ improvements = np.random.uniform(10, 30, n_samples)
2303
+ treatment_outcomes = np.minimum(base_scores + improvements, 100)
2304
+ else:
2305
+ # Complete fallback - just random scores
2306
+ treatment_outcomes = np.random.uniform(30, 90, n_samples)
2307
 
2308
+ logger.info(f"Created {n_samples} mock treatment outcomes")
2309
+
2310
+ # Initialize predictor
2311
+ predictor = AphasiaTreatmentPredictor(
2312
+ n_estimators=rf_n_estimators,
2313
+ max_depth=rf_max_depth if rf_max_depth > 0 else None
2314
+ )
2315
+
2316
+ # Cross-validate
2317
+ cv_results = predictor.cross_validate(
2318
+ latents=latents,
2319
+ demographics=demographics,
2320
+ treatment_outcomes=treatment_outcomes,
2321
+ n_splits=rf_cv_folds
2322
+ )
2323
+
2324
+ # Fit final model
2325
+ predictor.fit(latents, demographics, treatment_outcomes)
2326
+
2327
+ # Store in app_state
2328
+ app_state['predictor'] = predictor
2329
+ app_state['rf_trained'] = True
2330
+
2331
+ # Create feature importance plot
2332
+ importance_fig = predictor.plot_feature_importance()
2333
+
2334
+ # Create prediction performance plot
2335
+ predictions = cv_results['predictions']
2336
+ prediction_stds = cv_results['prediction_stds']
2337
+
2338
+ performance_fig = plt.figure(figsize=(8, 6))
2339
+
2340
+ # Check if we have valid predictions
2341
+ if len(treatment_outcomes) > 0 and len(predictions) == len(treatment_outcomes):
2342
+ # Only create scatter plot if we have matching data
2343
+ plt.scatter(treatment_outcomes, predictions)
2344
 
2345
+ # Reference line
2346
+ min_val = min(np.min(treatment_outcomes), np.min(predictions))
2347
+ max_val = max(np.max(treatment_outcomes), np.max(predictions))
2348
+ plt.plot([min_val, max_val], [min_val, max_val], 'r--')
2349
 
2350
+ # Confidence band
2351
+ plt.fill_between(treatment_outcomes,
2352
+ predictions - 2*prediction_stds,
2353
+ predictions + 2*prediction_stds,
2354
+ alpha=0.2, color='gray')
2355
 
2356
+ plt.xlabel('Actual Outcome')
2357
+ plt.ylabel('Predicted Outcome')
2358
 
2359
+ # Get performance metrics
2360
+ metrics_text = ""
2361
+ mean_metrics = cv_results.get('mean_metrics', {})
2362
 
2363
+ r2 = mean_metrics.get('r2', 0)
2364
+ rmse = mean_metrics.get('rmse', 0)
2365
+ plt.title(f'Treatment Outcome Prediction\nR² = {r2:.3f}, RMSE = {rmse:.3f}')
2366
+ metrics_text = f"Regression Model Performance:\nR² = {r2:.4f}\nRMSE = {rmse:.4f}"
2367
  else:
2368
+ # Handle case with no data
2369
+ plt.text(0.5, 0.5, "No prediction data available",
2370
+ ha='center', va='center', transform=plt.gca().transAxes)
2371
2372
 
2373
+ plt.tight_layout()
2374
+
2375
+ # Add notice if using synthetic data
2376
+ if app_state.get('synthetic_data', False):
2377
+ metrics_text = f"{metrics_text}\n\nNOTE: Using synthetic data for demonstration."
2378
+
2379
+ return [importance_fig, performance_fig, metrics_text]
2380
+ else:
2381
+ # No treatment file available
2382
+ error_fig = plt.figure(figsize=(10, 6))
2383
+ message = "Error: Treatment outcomes file not found. Please retrain the VAE in Tab 1."
2384
+ plt.text(0.5, 0.5, message,
2385
+ horizontalalignment='center', verticalalignment='center',
2386
+ fontsize=14, color='red')
2387
+ plt.axis('off')
2388
 
2389
+ return [error_fig, error_fig, "Error: Treatment outcomes file not found."]
2390
+
2391
  except Exception as e:
2392
  logger.error(f"Error in RF training: {str(e)}", exc_info=True)
2393
  error_fig = plt.figure(figsize=(10, 6))
 
2405
  fn=handle_vae_training,
2406
  inputs=[data_dir, local_nii_dir, latent_dim, nepochs, bsize, use_hf_dataset,
2407
  skip_behavioral, use_synthetic_nifti, use_synthetic_fc],
2408
+ outputs=[fc_plot, fc_info, learning_plot]
2409
+ )
2410
+
2411
+ # FC Visualization button
2412
+ visualize_fc_btn.click(
2413
+ fn=handle_fc_visualization,
2414
+ inputs=[],
2415
+ outputs=[fc_plot, fc_info]
2416
  )
2417
 
2418
  # Random Forest Training tab
 
2425
  # Tab 3: Treatment Prediction Handler
2426
  def handle_treatment_prediction(fmri_file, age, sex, months, wab):
2427
  """Predict treatment outcome for a new patient"""
2428
  try:
2429
+ # First, check if we have saved models we can use
2430
+ rf_model_path = "results/treatment_predictor.joblib"
2431
+ rf_available = os.path.exists(rf_model_path)
2432
 
2433
+ # Create prediction app
2434
  temp_app = AphasiaPredictionApp()
2435
 
2436
+ # If there are trained models in app_state, use them
2437
+ if app_state.get('vae_trained', False) and app_state.get('rf_trained', False) and app_state.get('vae') is not None and app_state.get('predictor') is not None:
2438
+ logger.info("Using trained models from current session for prediction")
2439
+ temp_app.vae = app_state.get('vae')
2440
+ temp_app.predictor = app_state.get('predictor')
2441
+ temp_app.trained = True
2442
+ temp_app.latent_dim = app_state.get('vae').latent_dim if hasattr(app_state.get('vae'), 'latent_dim') else 32
2443
+
2444
+ # If we don't have trained models, but saved models exist, load them
2445
+ elif rf_available:
2446
+ logger.info("Loading saved RF model for prediction")
2447
+ try:
2448
+ # Try to load the RF model from disk
2449
+ from rcf_prediction import AphasiaTreatmentPredictor
2450
+ temp_app.predictor = AphasiaTreatmentPredictor.load_model(rf_model_path)
2451
+ temp_app.trained = True
2452
+
2453
+ # Use the VAE from app_state if available, otherwise use synthetic FC
2454
+ if app_state.get('vae') is not None:
2455
+ temp_app.vae = app_state.get('vae')
2456
+ temp_app.latent_dim = temp_app.vae.latent_dim if hasattr(temp_app.vae, 'latent_dim') else 32
2457
+ else:
2458
+ # Create a synthetic FC matrix based on demographics
2459
+ logger.info("No VAE available - using synthetic FC data")
2460
+ from visualization import plot_treatment_trajectory
2461
+
2462
+ # Generate synthetic prediction
2463
+ current_score = wab
2464
+
2465
+ # Calculate predicted score based on demographics (simplified model)
2466
+ age_factor = -0.1 * (age - 60) # Age effect (younger is better)
2467
+ time_factor = 0.7 * months # More treatment time is better
2468
+ gender_factor = 2 if sex == "F" else 0 # Small gender effect
2469
+
2470
+ # Base improvement of 15 points, modified by factors
2471
+ improvement = 15 + age_factor + time_factor + gender_factor
2472
+ # Add some randomness
2473
+ improvement = max(5, min(30, improvement + np.random.normal(0, 3)))
2474
+
2475
+ predicted_score = min(100, current_score + improvement)
2476
+ prediction_std = 5.0 # Fixed uncertainty for demo
2477
+
2478
+ # Create a trajectory plot
2479
+ fig = plot_treatment_trajectory(
2480
+ current_score=current_score,
2481
+ predicted_score=predicted_score,
2482
+ months_post_stroke=months,
2483
+ prediction_std=prediction_std
2484
+ )
2485
+
2486
+ # Create prediction text
2487
+ prediction_text = (
2488
+ f"Using simplified model (VAE not trained)\n\n"
2489
+ f"Current WAB-AQ: {current_score:.1f}\n"
2490
+ f"Predicted WAB-AQ after {months} months: {predicted_score:.1f} ± {1.96*prediction_std:.1f}\n"
2491
+ f"Expected improvement: {predicted_score - current_score:.1f} points\n\n"
2492
+ f"Note: This prediction uses a simplified model.\n"
2493
+ f"Train the VAE for more accurate predictions."
2494
+ )
2495
+
2496
+ return [prediction_text, fig]
2497
+ except Exception as load_err:
2498
+ logger.error(f"Error loading models: {load_err}")
2499
+ return [f"Error loading models: {load_err}", None]
2500
+ else:
2501
+ # If no models are available, generate a demo visualization
2502
+ logger.info("No models available - creating demonstration visualization")
2503
+ from visualization import plot_treatment_trajectory
2504
+
2505
+ # Generate synthetic prediction with realistic values
2506
+ current_score = wab
2507
+
2508
+ # Calculate predicted score based on demographics (simplified model)
2509
+ age_factor = -0.1 * (age - 60) # Age effect (younger is better)
2510
+ time_factor = 0.7 * months # More treatment time is better
2511
+ gender_factor = 2 if sex == "F" else 0 # Small gender effect
2512
+
2513
+ # Base improvement of 15 points, modified by factors
2514
+ improvement = 15 + age_factor + time_factor + gender_factor
2515
+ # Add some randomness
2516
+ improvement = max(5, min(30, improvement + np.random.normal(0, 3)))
2517
+
2518
+ predicted_score = min(100, current_score + improvement)
2519
+ prediction_std = 5.0 # Fixed uncertainty for demo
2520
+
2521
+ # Create a demo trajectory plot
2522
+ fig = plot_treatment_trajectory(
2523
+ current_score=current_score,
2524
+ predicted_score=predicted_score,
2525
+ months_post_stroke=months,
2526
+ prediction_std=prediction_std
2527
+ )
2528
+
2529
+ # Create prediction text
2530
+ prediction_text = (
2531
+ f"DEMO MODE - No trained models available\n\n"
2532
+ f"Current WAB-AQ: {current_score:.1f}\n"
2533
+ f"Predicted WAB-AQ after {months} months: {predicted_score:.1f} ± {1.96*prediction_std:.1f}\n"
2534
+ f"Expected improvement: {predicted_score - current_score:.1f} points\n\n"
2535
+ f"Note: This is a demonstration using synthetic data.\n"
2536
+ f"Train the VAE and RF models for actual predictions."
2537
+ )
2538
+
2539
+ return [prediction_text, fig]
2540
+
2541
+ # Make prediction using the available models
2542
  return temp_app.predict_treatment(
2543
  fmri_file=fmri_file,
2544
  age=age,
 
2630
  interface.launch()
2631
  else:
2632
  # Running locally
2633
+ interface.launch(share=True)
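Note on the new Tab 1 wiring above: both buttons now write into the same output components, with train_vae_btn feeding handle_vae_training into [fc_plot, fc_info, learning_plot] and visualize_fc_btn feeding handle_fc_visualization into [fc_plot, fc_info]. The following is a minimal, self-contained sketch of that shared-output Gradio pattern, not part of the commit; the component names match the diff, but the handler bodies are placeholders rather than the app's real training logic.

import gradio as gr
import matplotlib.pyplot as plt

def handle_vae_training():
    # Placeholder: the real handler trains the VAE and returns figures plus info text
    fig = plt.figure()
    plt.plot([0, 1], [1, 0])
    return fig, "training summary", plt.figure()

def handle_fc_visualization():
    # Placeholder: the real handler reuses stored or synthetic FC matrices
    fig = plt.figure()
    plt.imshow([[1, 0], [0, 1]])
    return fig, "FC matrix info"

with gr.Blocks() as demo:
    with gr.Row():
        train_vae_btn = gr.Button("Train VAE Model", variant="primary")
        visualize_fc_btn = gr.Button("Visualize FC Matrices", variant="secondary")
    with gr.Row():
        with gr.Column(scale=2):
            fc_plot = gr.Plot(label="FC Matrices (Original/Reconstructed/Generated)")
        with gr.Column(scale=1):
            fc_info = gr.TextArea(label="FC Matrix Information", interactive=False)
    learning_plot = gr.Plot(label="VAE Learning Curves")

    # Two buttons, shared output components, as in the diff
    train_vae_btn.click(fn=handle_vae_training, inputs=[], outputs=[fc_plot, fc_info, learning_plot])
    visualize_fc_btn.click(fn=handle_fc_visualization, inputs=[], outputs=[fc_plot, fc_info])

if __name__ == "__main__":
    demo.launch()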
data_preprocessing.py CHANGED
@@ -1,29 +1,157 @@
  import numpy as np
  import pandas as pd
  import os
- from nilearn import input_data, connectome
  from nilearn.image import load_img
  import nibabel as nib
  from pathlib import Path
  from config import PREPROCESS_CONFIG, PREDICTION_CONFIG

- def process_single_fmri(fmri_file, allow_synthetic=False):
      """
      Process a single fMRI file to FC matrix

      Args:
          fmri_file: Path to the fMRI .nii or .nii.gz file
          allow_synthetic: If True, generate synthetic FC matrix on error (disabled by default)

      Returns:
          fc_triu: Upper triangular FC matrix values
      """
      print(f"Processing fMRI file: {fmri_file}")

-     # Use Power 264 atlas
-     from nilearn import datasets
-     power = datasets.fetch_coords_power_2011()
-     coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T

      # Try different atlas radiuses if the default one has issues
      radius_options = [PREPROCESS_CONFIG['radius'], 8, 10, 5, 12]
@@ -32,56 +160,107 @@ def process_single_fmri(fmri_file, allow_synthetic=False):
      for radius in radius_options:
          try:
              print(f"Trying with radius {radius}mm...")
-             # Create masker with allow_empty=True to handle empty spheres
-             masker = input_data.NiftiSpheresMasker(
-                 coords,
-                 radius=radius,
-                 standardize=True,
-                 memory='nilearn_cache',
-                 memory_level=1,
-                 verbose=0,
-                 detrend=True,
-                 low_pass=PREPROCESS_CONFIG['low_pass'],
-                 high_pass=PREPROCESS_CONFIG['high_pass'],
-                 t_r=PREPROCESS_CONFIG['t_r'],
-                 allow_empty=True # Allow empty spheres
-             )
-
-             # Load and process fMRI
-             print(f"Loading NIfTI file...")
-             fmri_img = load_img(fmri_file)
-             print(f"NIfTI file loaded, shape: {fmri_img.shape}")

-             # Check for insufficient time points
-             if len(fmri_img.shape) < 4 or fmri_img.shape[3] < 20: # Assuming we need at least 20 time points
-                 print(f"Warning: {fmri_file} has insufficient time points: {fmri_img.shape}")
-                 continue
-
-             # Transform to time series with explicit warning handling
-             print(f"Extracting time series...")
-             try:
-                 # Explicitly handle warnings about empty spheres
-                 import warnings
-                 with warnings.catch_warnings():
-                     warnings.filterwarnings('ignore', message='.*empty.*')
-                     time_series = masker.fit_transform(fmri_img)
-             except Exception as e:
-                 if "empty" in str(e):
-                     print(f"Warning: Some spheres are empty in {fmri_file}. Using a different sphere radius.")
-
-                     # Extract the list of empty spheres for logging
-                     import re
-                     empty_spheres = re.findall(r"\[(.*?)\]", str(e))
-                     if empty_spheres:
-                         print(f"Empty spheres: {empty_spheres[0]}")

-                     # Continue to next radius option
                      continue
-                 else:
-                     print(f"Unknown error in masker: {e}")
-                     continue # Skip this radius if there's any other error

-             print(f"Time series extracted, shape: {time_series.shape}")

              # Validate time series data
              if np.isnan(time_series).any() or np.isinf(time_series).any():
@@ -224,6 +403,38 @@ def preprocess_fmri_to_fc(nii_files, demo_data, demo_types):

      return X, demo_data, demo_types

  def load_and_preprocess_data(data_dir, demographic_file, use_hf_dataset=False,
                               hf_nii_files=None, hf_demo_data=None, hf_demo_types=None):
      """
 
1
  import numpy as np
2
  import pandas as pd
3
  import os
4
+ import json
5
+ import pickle
6
+ import hashlib
7
+ import warnings
8
+ import re
9
+ from nilearn import input_data, connectome, datasets
10
  from nilearn.image import load_img
11
  import nibabel as nib
12
  from pathlib import Path
13
  from config import PREPROCESS_CONFIG, PREDICTION_CONFIG
14
 
15
+ # Create cache directory if it doesn't exist
16
+ CACHE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'cache')
17
+ os.makedirs(CACHE_DIR, exist_ok=True)
18
+ os.makedirs(os.path.join(CACHE_DIR, 'time_series'), exist_ok=True)
19
+ os.makedirs(os.path.join(CACHE_DIR, 'fc_matrices'), exist_ok=True)
20
+ os.makedirs(os.path.join(CACHE_DIR, 'latents'), exist_ok=True)
21
+ os.makedirs(os.path.join(CACHE_DIR, 'maskers'), exist_ok=True)
22
+ os.makedirs(os.path.join(CACHE_DIR, 'atlas'), exist_ok=True)
23
+
24
+ def get_file_hash(file_path):
25
+ """Generate a hash for a file to use as a cache key"""
26
+ try:
27
+ hasher = hashlib.md5()
28
+ with open(file_path, 'rb') as f:
29
+ # Read in chunks to handle large files
30
+ for chunk in iter(lambda: f.read(4096), b""):
31
+ hasher.update(chunk)
32
+ return hasher.hexdigest()
33
+ except Exception as e:
34
+ print(f"Error hashing file {file_path}: {e}")
35
+ # Fallback to filename-based hash if file reading fails
36
+ return hashlib.md5(os.path.basename(file_path).encode()).hexdigest()
37
+
38
+ def get_cached_atlas_coords(atlas_name="power_2011", use_cache=True):
39
+ """
40
+ Get atlas coordinates, using cache if available
41
+
42
+ Args:
43
+ atlas_name: Name of the atlas (currently only power_2011 is supported)
44
+ use_cache: Whether to use/create cache
45
+
46
+ Returns:
47
+ coords: Array of coordinates for the atlas
48
+ """
49
+ if not use_cache:
50
+ # Fetch directly from nilearn
51
+ from nilearn import datasets
52
+ power = datasets.fetch_coords_power_2011()
53
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
54
+ return coords
55
+
56
+ # Check if we have cached atlas coordinates
57
+ atlas_path = os.path.join(CACHE_DIR, 'atlas', f"{atlas_name}_coords.npy")
58
+
59
+ if os.path.exists(atlas_path):
60
+ try:
61
+ print(f"Loading cached atlas coordinates for {atlas_name}")
62
+ coords = np.load(atlas_path)
63
+ print(f"Successfully loaded cached atlas with {len(coords)} ROIs")
64
+ return coords
65
+ except Exception as e:
66
+ print(f"Error loading cached atlas: {e}, fetching new one")
67
+
68
+ # No valid cache, fetch from nilearn
69
+ try:
70
+ from nilearn import datasets
71
+ print(f"Fetching {atlas_name} atlas from nilearn")
72
+ power = datasets.fetch_coords_power_2011()
73
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
74
+
75
+ # Cache the coordinates
76
+ try:
77
+ np.save(atlas_path, coords)
78
+ print(f"Saved atlas coordinates to cache: {atlas_path}")
79
+ except Exception as e:
80
+ print(f"Error saving atlas to cache: {e}")
81
+
82
+ return coords
83
+ except Exception as e:
84
+ print(f"Error fetching atlas: {e}")
85
+ raise
86
+
87
+ def get_cached_masker(radius, use_cache=True):
88
+ """
89
+ Get a NiftiSpheresMasker with the specified radius, using cache if available
90
+
91
+ Args:
92
+ radius: Sphere radius in mm
93
+ use_cache: Whether to use/create cache
94
+
95
+ Returns:
96
+ masker: NiftiSpheresMasker object
97
+ """
98
+ if not use_cache:
99
+ return None
100
+
101
+ # Create a cache key for this masker configuration
102
+ # We use radius and other PREPROCESS_CONFIG values that affect the masker
103
+ config_str = (f"radius={radius},"
104
+ f"tr={PREPROCESS_CONFIG['t_r']},"
105
+ f"high_pass={PREPROCESS_CONFIG['high_pass']},"
106
+ f"low_pass={PREPROCESS_CONFIG['low_pass']}")
107
+
108
+ masker_key = hashlib.md5(config_str.encode()).hexdigest()
109
+ masker_path = os.path.join(CACHE_DIR, 'maskers', f"{masker_key}.pkl")
110
+
111
+ # Check if we have a cached masker
112
+ if os.path.exists(masker_path):
113
+ try:
114
+ print(f"Loading cached masker for radius {radius}mm")
115
+ with open(masker_path, 'rb') as f:
116
+ masker = pickle.load(f)
117
+ print(f"Successfully loaded cached masker for radius {radius}mm")
118
+ return masker
119
+ except Exception as e:
120
+ print(f"Error loading cached masker: {e}, creating new one")
121
+
122
+ # No valid cache, return None to indicate a new masker should be created
123
+ return None
124
+
125
+ def process_single_fmri(fmri_file, allow_synthetic=False, use_cache=True):
126
  """
127
  Process a single fMRI file to FC matrix
128
 
129
  Args:
130
  fmri_file: Path to the fMRI .nii or .nii.gz file
131
  allow_synthetic: If True, generate synthetic FC matrix on error (disabled by default)
132
+ use_cache: If True, try to load cached data instead of reprocessing
133
 
134
  Returns:
135
  fc_triu: Upper triangular FC matrix values
136
  """
137
  print(f"Processing fMRI file: {fmri_file}")
138
 
139
+ # Check if cached FC matrix exists
140
+ if use_cache:
141
+ file_hash = get_file_hash(fmri_file)
142
+ fc_cache_path = os.path.join(CACHE_DIR, 'fc_matrices', f"{file_hash}.npy")
143
+
144
+ if os.path.exists(fc_cache_path):
145
+ print(f"Loading cached FC matrix for {os.path.basename(fmri_file)}")
146
+ try:
147
+ fc_triu = np.load(fc_cache_path)
148
+ print(f"Successfully loaded cached FC matrix, shape: {fc_triu.shape}")
149
+ return fc_triu
150
+ except Exception as e:
151
+ print(f"Error loading cached FC matrix: {e}, recalculating...")
152
+
153
+ # Use Power 264 atlas with caching
154
+ coords = get_cached_atlas_coords(use_cache=use_cache)
155
 
156
  # Try different atlas radiuses if the default one has issues
157
  radius_options = [PREPROCESS_CONFIG['radius'], 8, 10, 5, 12]
 
160
  for radius in radius_options:
161
  try:
162
  print(f"Trying with radius {radius}mm...")
163
 
164
+ # Check if we have cached time series for this file and radius
165
+ if use_cache:
166
+ ts_cache_key = f"{file_hash}_r{radius}"
167
+ ts_cache_path = os.path.join(CACHE_DIR, 'time_series', f"{ts_cache_key}.npy")
168
+
169
+ if os.path.exists(ts_cache_path):
170
+ print(f"Loading cached time series for radius {radius}mm")
171
+ try:
172
+ time_series = np.load(ts_cache_path)
173
+ print(f"Successfully loaded cached time series, shape: {time_series.shape}")
174
+ except Exception as e:
175
+ print(f"Error loading cached time series: {e}, recalculating...")
176
+ time_series = None
177
+ else:
178
+ time_series = None
179
+ else:
180
+ time_series = None
181
+
182
+ # If no cached time series, calculate it
183
+ if time_series is None:
184
+ # Try to get a cached masker first
185
+ masker = get_cached_masker(radius, use_cache)
186
+
187
+ # If no cached masker, create a new one
188
+ if masker is None:
189
+ print(f"Creating new masker with radius {radius}mm")
190
+ # Create masker with allow_empty=True to handle empty spheres
191
+ masker = input_data.NiftiSpheresMasker(
192
+ coords,
193
+ radius=radius,
194
+ standardize=True,
195
+ memory='nilearn_cache',
196
+ memory_level=1,
197
+ verbose=0,
198
+ detrend=True,
199
+ low_pass=PREPROCESS_CONFIG['low_pass'],
200
+ high_pass=PREPROCESS_CONFIG['high_pass'],
201
+ t_r=PREPROCESS_CONFIG['t_r'],
202
+ allow_empty=True # Allow empty spheres
203
+ )
204
 
205
+ # Cache the masker if caching is enabled
206
+ if use_cache:
207
+ try:
208
+ config_str = (f"radius={radius},"
209
+ f"tr={PREPROCESS_CONFIG['t_r']},"
210
+ f"high_pass={PREPROCESS_CONFIG['high_pass']},"
211
+ f"low_pass={PREPROCESS_CONFIG['low_pass']}")
212
+ masker_key = hashlib.md5(config_str.encode()).hexdigest()
213
+ masker_path = os.path.join(CACHE_DIR, 'maskers', f"{masker_key}.pkl")
214
+
215
+ with open(masker_path, 'wb') as f:
216
+ pickle.dump(masker, f)
217
+ print(f"Saved masker to cache: {masker_path}")
218
+ except Exception as e:
219
+ print(f"Error saving masker to cache: {e}")
220
+
221
+ # Load and process fMRI
222
+ print(f"Loading NIfTI file...")
223
+ fmri_img = load_img(fmri_file)
224
+ print(f"NIfTI file loaded, shape: {fmri_img.shape}")
225
+
226
+ # Check for insufficient time points
227
+ if len(fmri_img.shape) < 4 or fmri_img.shape[3] < 20: # Assuming we need at least 20 time points
228
+ print(f"Warning: {fmri_file} has insufficient time points: {fmri_img.shape}")
229
  continue
230
+
231
+ # Transform to time series with explicit warning handling
232
+ print(f"Extracting time series...")
233
+ try:
234
+ # Explicitly handle warnings about empty spheres
235
+ with warnings.catch_warnings():
236
+ warnings.filterwarnings('ignore', message='.*empty.*')
237
+ time_series = masker.fit_transform(fmri_img)
238
+ except Exception as e:
239
+ if "empty" in str(e):
240
+ print(f"Warning: Some spheres are empty in {fmri_file}. Using a different sphere radius.")
241
+
242
+ # Extract the list of empty spheres for logging
243
+ empty_spheres = re.findall(r"\[(.*?)\]", str(e))
244
+ if empty_spheres:
245
+ print(f"Empty spheres: {empty_spheres[0]}")
246
+
247
+ # Continue to next radius option
248
+ continue
249
+ else:
250
+ print(f"Unknown error in masker: {e}")
251
+ continue # Skip this radius if there's any other error
252
+
253
+ print(f"Time series extracted, shape: {time_series.shape}")
254
+
255
+ # Cache the time series if successful
256
+ if use_cache and time_series is not None:
257
+ try:
258
+ np.save(ts_cache_path, time_series)
259
+ print(f"Saved time series to cache: {ts_cache_path}")
260
+ except Exception as e:
261
+ print(f"Error saving time series to cache: {e}")
262
 
263
+ print(f"Time series processed, shape: {time_series.shape}")
264
 
265
  # Validate time series data
266
  if np.isnan(time_series).any() or np.isinf(time_series).any():
 
403
 
404
  return X, demo_data, demo_types
405
 
406
+ def clear_cache(cache_type=None):
407
+ """
408
+ Clear all or specific types of cache
409
+
410
+ Args:
411
+ cache_type: Type of cache to clear ('time_series', 'fc_matrices', 'maskers', 'atlas', 'latents')
412
+ If None, clears all cache types
413
+ """
414
+ if cache_type is None:
415
+ # Clear all cache types
416
+ cache_types = ['time_series', 'fc_matrices', 'maskers', 'atlas', 'latents']
417
+ else:
418
+ # Clear specific cache type
419
+ cache_types = [cache_type]
420
+
421
+ for ctype in cache_types:
422
+ cache_dir = os.path.join(CACHE_DIR, ctype)
423
+ if os.path.exists(cache_dir):
424
+ print(f"Clearing {ctype} cache...")
425
+ try:
426
+ for file in os.listdir(cache_dir):
427
+ file_path = os.path.join(cache_dir, file)
428
+ if os.path.isfile(file_path):
429
+ os.remove(file_path)
430
+ print(f"Successfully cleared {ctype} cache")
431
+ except Exception as e:
432
+ print(f"Error clearing {ctype} cache: {e}")
433
+ else:
434
+ print(f"Cache directory for {ctype} does not exist")
435
+
436
+ print("Cache clearing complete")
437
+
438
  def load_and_preprocess_data(data_dir, demographic_file, use_hf_dataset=False,
439
  hf_nii_files=None, hf_demo_data=None, hf_demo_types=None):
440
  """