SreekarB committed on
Commit 1c47445 · verified · 1 Parent(s): dbe81c1

Upload 3 files

Files changed (3):
  1. app.py +141 -655
  2. data_preprocessing.py +578 -78
  3. main.py +271 -126
app.py CHANGED
@@ -1,518 +1,10 @@
1
  import gradio as gr
2
- from main import run_analysis
3
- from rcf_prediction import AphasiaTreatmentPredictor
4
- import numpy as np
5
- import matplotlib.pyplot as plt
6
- from data_preprocessing import preprocess_fmri_to_fc, process_single_fmri
7
- from visualization import plot_fc_matrices, plot_learning_curves
8
  import os
9
- from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
 
10
  import json
11
  import pickle
12
- import pandas as pd
13
- import seaborn as sns
14
- import logging
15
- from config import MODEL_CONFIG, PREDICTION_CONFIG
16
-
17
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
18
- logger = logging.getLogger(__name__)
19
-
20
- class AphasiaPredictionApp:
21
- def __init__(self):
22
- self.vae = None
23
- self.predictor = None
24
- self.trained = False
25
- self.latent_dim = MODEL_CONFIG['latent_dim']
26
-
27
- def train_models(self, data_dir, latent_dim, nepochs, bsize):
28
- """
29
- Train VAE and Random Forest models
30
- """
31
- # Train VAE and Random Forest
32
- logger.info(f"Training models with data from {data_dir}")
33
- logger.info(f"VAE params: latent_dim={latent_dim}, epochs={nepochs}, batch_size={bsize}")
34
-
35
- # Default prediction parameters from our config
36
- prediction_type = PREDICTION_CONFIG.get('prediction_type', 'regression')
37
- outcome_variable = PREDICTION_CONFIG.get('default_outcome', 'wab_aq')
38
- logger.info(f"Prediction: type={prediction_type}, outcome={outcome_variable}")
39
-
40
- figures = {}
41
-
42
- try:
43
- # Run the full analysis pipeline
44
- results = run_analysis(
45
- data_dir=data_dir,
46
- demographic_file="demographics.csv",
47
- treatment_file="treatment_outcomes.csv",
48
- latent_dim=latent_dim,
49
- nepochs=nepochs,
50
- bsize=bsize,
51
- save_model=True
52
- )
53
-
54
- # Get the VAE figure from results
55
- vae_fig = results.get('figures', {}).get('vae')
56
-
57
- figures['vae'] = vae_fig
58
-
59
- if results:
60
- self.vae = results.get('vae')
61
- self.predictor = results.get('predictor')
62
- latents = results.get('latents')
63
- demographics = results.get('demographics')
64
- predictor_cv_results = results.get('predictor_cv_results')
65
-
66
- # Store the latent dimension
67
- self.latent_dim = latent_dim
68
-
69
- # Mark models as trained
70
- self.trained = True
71
-
72
- # Prepare prediction visualization if available
73
- if self.predictor and predictor_cv_results:
74
- # Get the outcome variable data
75
- if outcome_variable == 'wab_aq':
76
- outcomes = demographics['wab_aq']
77
- elif outcome_variable == 'age':
78
- outcomes = demographics['age']
79
- elif outcome_variable == 'months_post_onset':
80
- outcomes = demographics['months_post_onset']
81
- else:
82
- # Try to find the outcome in demographics data
83
- outcomes = None
84
- for key in demographics:
85
- if outcome_variable.lower() in key.lower():
86
- outcomes = demographics[key]
87
- break
88
-
89
- # Create plots
90
- if 'prediction_stds' in predictor_cv_results and 'predictions' in predictor_cv_results:
91
- # Create prediction plots
92
- prediction_fig = self.create_prediction_plots(
93
- latents,
94
- demographics,
95
- outcomes,
96
- predictor_cv_results['predictions'],
97
- predictor_cv_results['prediction_stds']
98
- )
99
- figures['prediction'] = prediction_fig
100
-
101
- # Create feature importance plot if available
102
- try:
103
- feature_importance = self.predictor.get_feature_importance()
104
- if feature_importance is not None:
105
- importance_fig = self.create_importance_plot(feature_importance)
106
- figures['importance'] = importance_fig
107
- except Exception as e:
108
- logger.warning(f"Could not create feature importance plot: {e}")
109
-
110
- logger.info("Training completed successfully")
111
-
112
- # Create learning curve plots if available
113
- if 'fold_metrics' in predictor_cv_results:
114
- learning_fig = self.create_learning_curve_plot(
115
- predictor_cv_results['fold_metrics']
116
- )
117
- figures['learning'] = learning_fig
118
-
119
- except Exception as e:
120
- logger.error(f"Error in training: {str(e)}")
121
- error_fig = plt.figure(figsize=(10, 6))
122
- plt.text(0.5, 0.5, f"Error: {str(e)}",
123
- horizontalalignment='center', verticalalignment='center',
124
- fontsize=12, color='red')
125
- plt.axis('off')
126
- figures['error'] = error_fig
127
-
128
- return figures
129
-
130
- def predict_treatment(self, fmri_file=None, age=50, sex="M",
131
- months_post_stroke=12, wab_score=50, fc_matrix=None):
132
- """
133
- Predict treatment outcome for a patient
134
-
135
- Args:
136
- fmri_file: Path to patient's fMRI file
137
- age: Patient's age at stroke
138
- sex: Patient's sex (M/F)
139
- months_post_stroke: Months since stroke
140
- wab_score: Current WAB score
141
- fc_matrix: Pre-processed FC matrix (if fMRI file not provided)
142
-
143
- Returns:
144
- Prediction results and visualization
145
- """
146
- if not self.trained:
147
- return "Please train the models first!", None
148
-
149
- try:
150
- # Process fMRI to FC matrix if provided
151
- if fmri_file and not fc_matrix:
152
- logger.info(f"Processing fMRI file: {fmri_file}")
153
- # Use the single fMRI processing function
154
- fc_matrix = process_single_fmri(fmri_file)
155
-
156
- if fc_matrix is None:
157
- return "Please provide either an fMRI file or an FC matrix", None
158
-
159
- # Ensure FC matrix is properly shaped
160
- if isinstance(fc_matrix, list):
161
- fc_matrix = np.array(fc_matrix)
162
-
163
- # Get latent representation
164
- logger.info("Extracting latent representation from FC matrix")
165
- if len(fc_matrix.shape) == 2: # If matrix is 2D (e.g., 264x264)
166
- # Convert to flattened upper triangular form
167
- n = fc_matrix.shape[0]
168
- indices = np.triu_indices(n, k=1)
169
- fc_flattened = fc_matrix[indices]
170
- fc_flattened = fc_flattened.reshape(1, -1)
171
- latent = self.vae.get_latents(fc_flattened)
172
- else:
173
- # Assume already flattened
174
- latent = self.vae.get_latents(fc_matrix.reshape(1, -1))
175
-
176
- # Prepare demographics
177
- demographics = {
178
- 'age': np.array([float(age)]),
179
- 'gender': np.array([sex]),
180
- 'months_post_onset': np.array([float(months_post_stroke)]),
181
- 'wab_aq': np.array([float(wab_score)])
182
- }
183
-
184
- logger.info("Making prediction")
185
- # Make prediction
186
- if self.predictor is None:
187
- return "Predictor model not trained", None
188
-
189
- # Make prediction using the model's predict method
190
- prediction, prediction_std = self.predictor.predict(latent, demographics)
191
-
192
- # Create visualization
193
- fig = self.plot_treatment_trajectory(
194
- current_score=wab_score,
195
- predicted_score=prediction[0],
196
- months_post_stroke=months_post_stroke,
197
- prediction_std=prediction_std[0]
198
- )
199
-
200
- result_text = f"Predicted treatment outcome: {prediction[0]:.2f} ± {2*prediction_std[0]:.2f}"
201
- logger.info(result_text)
202
-
203
- return result_text, fig
204
-
205
- except Exception as e:
206
- error_msg = f"Error in prediction: {str(e)}"
207
- logger.error(error_msg)
208
- error_fig = plt.figure(figsize=(10, 6))
209
- plt.text(0.5, 0.5, error_msg,
210
- horizontalalignment='center', verticalalignment='center',
211
- fontsize=12, color='red')
212
- plt.axis('off')
213
- return error_msg, error_fig
214
-
215
- def plot_treatment_trajectory(self, current_score, predicted_score,
216
- months_post_stroke, prediction_std,
217
- treatment_duration=6):
218
- """
219
- Create a visualization of predicted treatment trajectory
220
-
221
- Args:
222
- current_score: Current WAB score
223
- predicted_score: Predicted WAB score after treatment
224
- months_post_stroke: Current months post stroke
225
- prediction_std: Standard deviation of prediction
226
- treatment_duration: Duration of treatment in months
227
-
228
- Returns:
229
- matplotlib figure
230
- """
231
- fig = plt.figure(figsize=(10, 6))
232
-
233
- # X-axis: months
234
- x = np.array([months_post_stroke, months_post_stroke + treatment_duration])
235
-
236
- # Y-axis: WAB scores
237
- y = np.array([current_score, predicted_score])
238
-
239
- # Plot the trajectory
240
- plt.plot(x, y, 'bo-', linewidth=2, label='Predicted Trajectory')
241
-
242
- # Add confidence interval
243
- plt.fill_between(
244
- x,
245
- [y[0], y[1] - 2*prediction_std],
246
- [y[0], y[1] + 2*prediction_std],
247
- alpha=0.2, color='blue', label='95% Confidence Interval'
248
- )
249
-
250
- # Add reference lines
251
- if current_score < predicted_score:
252
- improvement = predicted_score - current_score
253
- plt.axhline(y=current_score, color='r', linestyle='--', alpha=0.5,
254
- label=f'Current WAB = {current_score:.1f}')
255
- plt.axhline(y=predicted_score, color='g', linestyle='--', alpha=0.5,
256
- label=f'Predicted WAB = {predicted_score:.1f} (+{improvement:.1f})')
257
- else:
258
- decline = current_score - predicted_score
259
- plt.axhline(y=current_score, color='r', linestyle='--', alpha=0.5,
260
- label=f'Current WAB = {current_score:.1f}')
261
- plt.axhline(y=predicted_score, color='orange', linestyle='--', alpha=0.5,
262
- label=f'Predicted WAB = {predicted_score:.1f} (-{decline:.1f})')
263
-
264
- # Add labels and title
265
- plt.xlabel('Months Post Stroke')
266
- plt.ylabel('WAB Score')
267
- plt.title('Predicted Treatment Trajectory')
268
- plt.legend(loc='best')
269
-
270
- # Set y-axis limits
271
- plt.ylim([0, 100])
272
-
273
- plt.tight_layout()
274
- return fig
275
-
276
- def create_prediction_plots(self, latents, demographics, y_true, y_pred, y_std):
277
- """Create prediction performance plots"""
278
- fig = plt.figure(figsize=(12, 8))
279
-
280
- # Create a 2x2 grid for plots
281
- gs = plt.GridSpec(2, 2, figure=fig)
282
-
283
- # Plot predicted vs actual values
284
- ax1 = fig.add_subplot(gs[0, 0])
285
-
286
- if self.predictor.prediction_type == 'regression':
287
- # Regression: scatter plot
288
- ax1.scatter(y_true, y_pred, alpha=0.7)
289
-
290
- # Add perfect prediction line
291
- min_val = min(np.min(y_true), np.min(y_pred))
292
- max_val = max(np.max(y_true), np.max(y_pred))
293
- ax1.plot([min_val, max_val], [min_val, max_val], 'r--')
294
-
295
- ax1.set_xlabel('Actual Values')
296
- ax1.set_ylabel('Predicted Values')
297
- ax1.set_title('Predicted vs. Actual Values')
298
-
299
- # Add R² to the plot
300
- r2 = r2_score(y_true, y_pred)
301
- ax1.text(0.05, 0.95, f'R² = {r2:.4f}', transform=ax1.transAxes,
302
- bbox=dict(facecolor='white', alpha=0.5))
303
-
304
- # Plot residuals
305
- ax2 = fig.add_subplot(gs[0, 1])
306
- residuals = y_true - y_pred
307
- ax2.scatter(y_pred, residuals, alpha=0.7)
308
- ax2.axhline(y=0, color='r', linestyle='--')
309
- ax2.set_xlabel('Predicted Values')
310
- ax2.set_ylabel('Residuals')
311
- ax2.set_title('Residual Plot')
312
-
313
- # Plot prediction errors
314
- ax3 = fig.add_subplot(gs[1, 0])
315
- ax3.errorbar(range(len(y_pred)), y_pred, yerr=2*y_std, fmt='o', alpha=0.7,
316
- label='Predicted ± 2σ')
317
- ax3.plot(range(len(y_true)), y_true, 'rx', alpha=0.7, label='Actual')
318
- ax3.set_xlabel('Sample Index')
319
- ax3.set_ylabel('Value')
320
- ax3.set_title('Prediction with Error Bars')
321
- ax3.legend()
322
-
323
- # Plot error distribution
324
- ax4 = fig.add_subplot(gs[1, 1])
325
- ax4.hist(residuals, bins=20, alpha=0.7)
326
- ax4.axvline(x=0, color='r', linestyle='--')
327
- ax4.set_xlabel('Prediction Error')
328
- ax4.set_ylabel('Frequency')
329
- ax4.set_title('Error Distribution')
330
-
331
- else: # classification
332
- # Convert to integer classes if they're strings
333
- if isinstance(y_true[0], str) or isinstance(y_pred[0], str):
334
- # Create mapping of class labels to integers
335
- classes = sorted(list(set(list(y_true) + list(y_pred))))
336
- class_to_int = {c: i for i, c in enumerate(classes)}
337
-
338
- y_true_int = np.array([class_to_int[c] for c in y_true])
339
- y_pred_int = np.array([class_to_int[c] for c in y_pred])
340
- else:
341
- y_true_int = y_true
342
- y_pred_int = y_pred
343
- classes = sorted(list(set(list(y_true_int) + list(y_pred_int))))
344
-
345
- # Confusion matrix
346
- from sklearn.metrics import confusion_matrix
347
- cm = confusion_matrix(y_true_int, y_pred_int)
348
-
349
- # Plot confusion matrix
350
- sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes,
351
- yticklabels=classes, ax=ax1)
352
- ax1.set_xlabel('Predicted')
353
- ax1.set_ylabel('True')
354
- ax1.set_title('Confusion Matrix')
355
-
356
- # Class distribution
357
- ax2 = fig.add_subplot(gs[0, 1])
358
- unique_classes, true_counts = np.unique(y_true_int, return_counts=True)
359
- unique_classes, pred_counts = np.unique(y_pred_int, return_counts=True)
360
-
361
- # Create class distribution DataFrame
362
- class_dist = pd.DataFrame({
363
- 'Class': classes,
364
- 'True': 0,
365
- 'Predicted': 0
366
- })
367
-
368
- for c, count in zip(unique_classes, true_counts):
369
- class_dist.loc[class_dist['Class'] == classes[c], 'True'] = count
370
-
371
- for c, count in zip(unique_classes, pred_counts):
372
- class_dist.loc[class_dist['Class'] == classes[c], 'Predicted'] = count
373
-
374
- # Plot class distribution
375
- ax2.bar(class_dist['Class'].astype(str), class_dist['True'], label='True', alpha=0.7)
376
- ax2.bar(class_dist['Class'].astype(str), class_dist['Predicted'], label='Predicted', alpha=0.5)
377
- ax2.set_xlabel('Class')
378
- ax2.set_ylabel('Count')
379
- ax2.set_title('Class Distribution')
380
- ax2.legend()
381
-
382
- # Performance metrics
383
- ax3 = fig.add_subplot(gs[1, 0])
384
- ax3.axis('off')
385
-
386
- # Calculate metrics
387
- from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
388
- acc = accuracy_score(y_true_int, y_pred_int)
389
- prec = precision_score(y_true_int, y_pred_int, average='weighted', zero_division=0)
390
- rec = recall_score(y_true_int, y_pred_int, average='weighted', zero_division=0)
391
- f1 = f1_score(y_true_int, y_pred_int, average='weighted', zero_division=0)
392
-
393
- metrics_text = (
394
- f"Classification Metrics:\n\n"
395
- f"Accuracy: {acc:.4f}\n"
396
- f"Precision: {prec:.4f}\n"
397
- f"Recall: {rec:.4f}\n"
398
- f"F1 Score: {f1:.4f}"
399
- )
400
-
401
- ax3.text(0.5, 0.5, metrics_text, ha='center', va='center', fontsize=12)
402
-
403
- # Confidence distribution
404
- ax4 = fig.add_subplot(gs[1, 1])
405
- ax4.hist(1 - y_std, bins=20, alpha=0.7)
406
- ax4.set_xlabel('Prediction Confidence')
407
- ax4.set_ylabel('Frequency')
408
- ax4.set_title('Confidence Distribution')
409
-
410
- plt.tight_layout()
411
- return fig
412
-
413
- def create_importance_plot(self, feature_importance, top_n=15):
414
- """Create feature importance plot"""
415
- # If feature_importance is a DataFrame, use it directly
416
- if isinstance(feature_importance, pd.DataFrame):
417
- importance_df = feature_importance
418
- else:
419
- # Create DataFrame
420
- importance_df = pd.DataFrame({
421
- 'feature': [f'Feature {i}' for i in range(len(feature_importance))],
422
- 'importance': feature_importance
423
- })
424
-
425
- # Get top N features
426
- top_features = importance_df.sort_values('importance', ascending=False).head(top_n)
427
-
428
- # Create plot
429
- fig = plt.figure(figsize=(10, 6))
430
- plt.barh(range(len(top_features)), top_features['importance'], align='center')
431
- plt.yticks(range(len(top_features)), top_features['feature'])
432
- plt.xlabel('Importance')
433
- plt.ylabel('Features')
434
- plt.title(f'Top {top_n} Features by Importance')
435
- plt.tight_layout()
436
-
437
- return fig
438
-
439
- def create_learning_curve_plot(self, fold_metrics):
440
- """Create learning curve plots from cross-validation results"""
441
- fig = plt.figure(figsize=(12, 6))
442
-
443
- # Create a grid for plots
444
- if self.predictor.prediction_type == 'regression':
445
- # For regression, show R² and RMSE
446
- ax1 = plt.subplot(1, 2, 1)
447
- ax2 = plt.subplot(1, 2, 2)
448
-
449
- # Plot R² for each fold
450
- for i, metrics in enumerate(fold_metrics):
451
- ax1.plot(i+1, metrics['r2'], 'bo')
452
-
453
- # Plot average R²
454
- avg_r2 = np.mean([m['r2'] for m in fold_metrics])
455
- ax1.axhline(y=avg_r2, color='r', linestyle='--',
456
- label=f'Average R² = {avg_r2:.4f}')
457
-
458
- ax1.set_xlabel('Fold')
459
- ax1.set_ylabel('R²')
460
- ax1.set_title('R² by Fold')
461
- ax1.set_xticks(range(1, len(fold_metrics)+1))
462
- ax1.legend()
463
-
464
- # Plot RMSE for each fold
465
- for i, metrics in enumerate(fold_metrics):
466
- ax2.plot(i+1, metrics['rmse'], 'go')
467
-
468
- # Plot average RMSE
469
- avg_rmse = np.mean([m['rmse'] for m in fold_metrics])
470
- ax2.axhline(y=avg_rmse, color='r', linestyle='--',
471
- label=f'Average RMSE = {avg_rmse:.4f}')
472
-
473
- ax2.set_xlabel('Fold')
474
- ax2.set_ylabel('RMSE')
475
- ax2.set_title('RMSE by Fold')
476
- ax2.set_xticks(range(1, len(fold_metrics)+1))
477
- ax2.legend()
478
-
479
- else: # classification
480
- # For classification, show accuracy and F1
481
- ax1 = plt.subplot(1, 2, 1)
482
- ax2 = plt.subplot(1, 2, 2)
483
-
484
- # Plot accuracy for each fold
485
- for i, metrics in enumerate(fold_metrics):
486
- ax1.plot(i+1, metrics['accuracy'], 'bo')
487
-
488
- # Plot average accuracy
489
- avg_acc = np.mean([m['accuracy'] for m in fold_metrics])
490
- ax1.axhline(y=avg_acc, color='r', linestyle='--',
491
- label=f'Average Accuracy = {avg_acc:.4f}')
492
-
493
- ax1.set_xlabel('Fold')
494
- ax1.set_ylabel('Accuracy')
495
- ax1.set_title('Accuracy by Fold')
496
- ax1.set_xticks(range(1, len(fold_metrics)+1))
497
- ax1.legend()
498
-
499
- # Plot F1 for each fold
500
- for i, metrics in enumerate(fold_metrics):
501
- ax2.plot(i+1, metrics['f1'], 'go')
502
-
503
- # Plot average F1
504
- avg_f1 = np.mean([m['f1'] for m in fold_metrics])
505
- ax2.axhline(y=avg_f1, color='r', linestyle='--',
506
- label=f'Average F1 = {avg_f1:.4f}')
507
-
508
- ax2.set_xlabel('Fold')
509
- ax2.set_ylabel('F1 Score')
510
- ax2.set_title('F1 Score by Fold')
511
- ax2.set_xticks(range(1, len(fold_metrics)+1))
512
- ax2.legend()
513
-
514
- plt.tight_layout()
515
- return fig
516
 
517
  def calculate_fc_accuracy(original_fc, reconstructed_fc):
518
  """
@@ -576,169 +68,163 @@ def save_latents(latents, demographics, subjects=None, file_path='latents.pkl'):
576
 
577
  return os.path.join('results', file_path)
578
 
579
- # Make sure directory exists for saving results
580
- os.makedirs('results', exist_ok=True)
581
 
582
  def create_interface():
583
- """Create the Gradio interface"""
584
- app = AphasiaPredictionApp()
585
-
586
- with gr.Blocks(title="Aphasia Treatment Trajectory Prediction") as interface:
587
- gr.Markdown("# Aphasia Treatment Trajectory Prediction")
 
588
 
589
- with gr.Tabs():
590
- # Training Tab
591
- with gr.Tab("Train Models"):
592
- with gr.Row():
593
- with gr.Column(scale=1):
594
- data_dir = gr.Textbox(
595
- label="Data Directory",
596
- value="SreekarB/OSFData"
597
- )
598
- latent_dim = gr.Slider(
599
- minimum=8, maximum=64, step=8,
600
- label="Latent Dimensions", value=32
601
- )
602
- nepochs = gr.Slider(
603
- minimum=100, maximum=5000, step=100,
604
- label="Number of Epochs", value=200 # Reduced for faster demos
605
- )
606
-
607
- with gr.Column(scale=1):
608
- bsize = gr.Slider(
609
- minimum=8, maximum=64, step=8,
610
- label="Batch Size", value=16
611
- )
612
- use_hf_dataset = gr.Checkbox(
613
- label="Use HuggingFace Dataset", value=True
614
- )
615
- with gr.Group("Prediction Options"):
616
- prediction_type = gr.Radio(
617
- label="Prediction Type",
618
- choices=["regression", "classification"],
619
- value="regression"
620
- )
621
- outcome_variable = gr.Dropdown(
622
- label="Outcome Variable",
623
- choices=["wab_aq", "age", "months_post_onset"],
624
- value="wab_aq"
625
- )
626
-
627
- train_btn = gr.Button("Train Models", variant="primary")
628
-
629
- with gr.Row():
630
- fc_plot = gr.Plot(label="FC Analysis")
631
 
632
- with gr.Row():
633
- with gr.Column(scale=1):
634
- importance_plot = gr.Plot(label="Feature Importance")
635
- with gr.Column(scale=1):
636
- prediction_plot = gr.Plot(label="Prediction Performance")
637
 
638
- with gr.Row():
639
- learning_plot = gr.Plot(label="Cross-validation Results")
640
-
641
- # Prediction Tab
642
- with gr.Tab("Predict Treatment"):
643
- with gr.Row():
644
- with gr.Column(scale=1):
645
- fmri_file = gr.File(label="Patient fMRI Data")
646
- with gr.Column(scale=1):
647
- with gr.Group("Patient Demographics"):
648
- age = gr.Number(label="Age at Stroke", value=60)
649
- sex = gr.Dropdown(choices=["M", "F"], label="Sex", value="M")
650
- months = gr.Number(label="Months Post Stroke", value=12)
651
- wab = gr.Number(label="Current WAB Score", value=50)
652
-
653
- predict_btn = gr.Button("Predict Treatment Outcome", variant="primary")
654
-
655
- with gr.Row():
656
- prediction_text = gr.Textbox(label="Prediction Result")
657
-
658
- with gr.Row():
659
- trajectory_plot = gr.Plot(label="Predicted Treatment Trajectory")
660
-
661
- # Connect components
662
- train_outputs = {
663
- 'vae': fc_plot,
664
- 'importance': importance_plot,
665
- 'prediction': prediction_plot,
666
- 'learning': learning_plot
667
- }
668
-
669
- # Handle train button click
670
- def handle_train(data_dir, latent_dim, nepochs, bsize, use_hf_dataset,
671
- prediction_type, outcome_variable):
672
- # Ensure we have the necessary files before training
673
- # This is a placeholder - in a real app you'd validate these files exist
674
- demographic_file = os.path.join(data_dir, "demographics.csv")
675
- treatment_file = os.path.join(data_dir, "treatment_outcomes.csv")
676
-
677
- results = app.train_models(
678
- data_dir=data_dir,
679
- latent_dim=latent_dim,
680
- nepochs=nepochs,
681
- bsize=bsize
682
- )
683
-
684
- # Return plots in the expected order
685
- return [
686
- results.get('vae', None),
687
- results.get('importance', None),
688
- results.get('prediction', None),
689
- results.get('learning', None)
690
- ]
691
-
692
- train_btn.click(
693
- fn=handle_train,
694
- inputs=[data_dir, latent_dim, nepochs, bsize, use_hf_dataset,
695
- prediction_type, outcome_variable],
696
- outputs=[fc_plot, importance_plot, prediction_plot, learning_plot]
697
  )
698
 
699
- predict_btn.click(
700
- fn=app.predict_treatment,
701
- inputs=[fmri_file, age, sex, months, wab],
702
- outputs=[prediction_text, trajectory_plot]
703
  )
704
 
705
  # Add examples
706
  gr.Examples(
707
  examples=[
708
- ["SreekarB/OSFData", 32, 200, 16, True, "regression", "wab_aq"], # Standard training
709
- ["SreekarB/OSFData", 16, 100, 8, True, "classification", "wab_aq"] # Faster training with classification
710
  ],
711
- inputs=[data_dir, latent_dim, nepochs, bsize, use_hf_dataset,
712
- prediction_type, outcome_variable],
713
  )
714
 
715
- # Add explanation
716
  gr.Markdown("""
717
- ## How to use this tool
718
-
719
- 1. **Train Models Tab**: First train the VAE and Random Forest models using your dataset
720
- - Use the default SreekarB/OSFData dataset or specify your own data source
721
- - Adjust parameters like latent dimensions and training epochs
722
- - Choose regression or classification prediction type
723
- - Select which variable to predict (WAB score by default)
724
-
725
- 2. **Predict Treatment Tab**: Use the trained models to predict treatment outcomes
726
- - Upload a patient's fMRI scan or use synthetic data
727
- - Enter the patient's demographic information
728
- - Click "Predict Treatment Outcome" to see the projected treatment trajectory
729
- - The visualization shows the predicted outcome with confidence intervals
730
-
731
- ## Interpreting Results
732
 
733
- - The **Feature Importance** plot shows which latent dimensions and demographic variables most strongly predict treatment outcomes
734
- - The **Prediction Performance** plot shows how well the model predicts known outcomes
735
- - The **Treatment Trajectory** shows the projected change in WAB score over the course of treatment
 
 
736
 
737
- Note: For optimal results, train with at least 500 epochs and latent dimension of 32 or higher.
738
  """)
739
 
740
- return interface
741
 
742
  if __name__ == "__main__":
743
- interface = create_interface()
744
- interface.launch(share=True)
 
 
1
  import gradio as gr
2
+ from main import run_fc_analysis
3
  import os
4
+ import numpy as np
5
+ from sklearn.metrics import mean_squared_error, r2_score
6
  import json
7
  import pickle
8
 
9
  def calculate_fc_accuracy(original_fc, reconstructed_fc):
10
  """
 
68
 
69
  return os.path.join('results', file_path)
70
 
71
+ def gradio_fc_analysis(data_source, latent_dim, nepochs, bsize, use_hf_dataset):
72
+ """Run the full VAE analysis pipeline with accuracy metrics"""
73
+ # Run the original analysis
74
+ fig, results = run_fc_analysis(
75
+ data_dir=data_source,
76
+ demographic_file=None, # We're now getting demographics directly from the dataset
77
+ latent_dim=latent_dim,
78
+ nepochs=nepochs,
79
+ bsize=bsize,
80
+ save_model=True,
81
+ use_hf_dataset=use_hf_dataset,
82
+ return_data=True # New parameter to return data, will need to update main.py
83
+ )
84
+
85
+ if results:
86
+ vae = results.get('vae')
87
+ X = results.get('X')
88
+ latents = results.get('latents')
89
+ demographics = results.get('demographics')
90
+ reconstructed_fc = results.get('reconstructed_fc')
91
+ generated_fc = results.get('generated_fc')
92
+
93
+ # Calculate accuracy metrics
94
+ accuracy_metrics = {}
95
+ if X is not None and reconstructed_fc is not None:
96
+ for i in range(min(5, len(X))): # Calculate for up to 5 samples
97
+ metrics = calculate_fc_accuracy(X[i], reconstructed_fc[i])
98
+ accuracy_metrics[f"Subject_{i+1}"] = metrics
99
+
100
+ # Average metrics across subjects
101
+ avg_metrics = {}
102
+ for metric in ["MSE", "RMSE", "R²", "Correlation", "Cosine Similarity"]:
103
+ avg_metrics[metric] = np.mean([subject_metrics[metric]
104
+ for subject_metrics in accuracy_metrics.values()])
105
+ accuracy_metrics["Average"] = avg_metrics
106
+
107
+ # Save latent representations if available
108
+ if latents is not None and demographics is not None:
109
+ latents_path = save_latents(latents, demographics, file_path=f'latents_dim{latent_dim}.pkl')
110
+ print(f"Saved latents to {latents_path}")
111
+
112
+ # Prepare status message with accuracy metrics
113
+ if accuracy_metrics:
114
+ avg = accuracy_metrics["Average"]
115
+ status = (f"Analysis complete! Model trained with {latent_dim} dimensions.\n\n"
116
+ f"Reconstruction Accuracy Metrics (Average):\n"
117
+ f"• MSE: {avg['MSE']:.6f}\n"
118
+ f"• RMSE: {avg['RMSE']:.6f}\n"
119
+ f"• R²: {avg['R²']:.6f}\n"
120
+ f"• Correlation: {avg['Correlation']:.6f}\n"
121
+ f"• Cosine Similarity: {avg['Cosine Similarity']:.6f}\n\n"
122
+ f"Latent representations saved to results/latents_dim{latent_dim}.pkl")
123
+ else:
124
+ status = "Analysis complete! VAE model has been trained and demographic relationships analyzed."
125
+ else:
126
+ status = "Analysis complete, but no results were returned for accuracy calculation."
127
+
128
+ return fig, status
129
 
130
  def create_interface():
131
+ with gr.Blocks(title="Aphasia fMRI to FC Analysis using VAE") as iface:
132
+ gr.Markdown("""
133
+ # Aphasia fMRI to FC Analysis using VAE
134
+
135
+ This demo uses a Variational Autoencoder (VAE) to analyze functional connectivity patterns in the brain and their relationship to demographic variables.
136
+
137
+ ## Dataset Information
138
+ By default, this uses the SreekarB/OSFData dataset from HuggingFace with the following variables:
139
+ - ID: Subject identifier
140
+ - wab_aq: Aphasia severity score
141
+ - age: Age of the subject
142
+ - mpo: Months post onset
143
+ - education: Years of education
144
+ - gender: Subject gender
145
+ - handedness: Subject handedness (ignored in the analysis)
146
+ """)
147
 
148
+ with gr.Row():
149
+ with gr.Column(scale=1):
150
+ # Configuration parameters
151
+ data_source = gr.Textbox(
152
+ label="Data Source (HF Dataset ID or Local Directory)",
153
+ value="SreekarB/OSFData"
154
+ )
155
+ latent_dim = gr.Slider(
156
+ minimum=8, maximum=64, step=8,
157
+ label="Latent Dimensions", value=32
158
+ )
159
+ nepochs = gr.Slider(
160
+ minimum=100, maximum=5000, step=100,
161
+ label="Number of Epochs", value=200 # Reduced for faster demos
162
+ )
163
+ bsize = gr.Slider(
164
+ minimum=8, maximum=64, step=8,
165
+ label="Batch Size", value=16
166
+ )
167
+ use_hf_dataset = gr.Checkbox(
168
+ label="Use HuggingFace Dataset", value=True
169
+ )
 
170
 
171
+ # Training button
172
+ train_button = gr.Button("Start Training", variant="primary")
173
+ status_text = gr.Textbox(label="Status", value="Ready to start training")
174
 
175
+ with gr.Column(scale=2):
176
+ # Output plot
177
+ output_plot = gr.Plot(label="Analysis Results")
178
+ accuracy_box = gr.Markdown("### Accuracy Metrics\nRun analysis to see reconstruction accuracy metrics here")
179
+
180
+ # Link the training button to the analysis function
181
+ train_button.click(
182
+ fn=gradio_fc_analysis,
183
+ inputs=[data_source, latent_dim, nepochs, bsize, use_hf_dataset],
184
+ outputs=[output_plot, status_text]
185
  )
186
 
187
+ # Custom function to update the accuracy box
188
+ def update_accuracy_display(status_text):
189
+ if "Accuracy Metrics" in status_text:
190
+ # Extract the accuracy metrics section
191
+ parts = status_text.split("Reconstruction Accuracy Metrics (Average):")
192
+ if len(parts) > 1:
193
+ metrics_text = parts[1].split("\n\n")[0]
194
+ return f"### Reconstruction Accuracy Metrics\n{metrics_text}"
195
+ return "### Accuracy Metrics\nNo metrics available yet. Run analysis to generate metrics."
196
+
197
+ # Update accuracy box when status changes
198
+ status_text.change(
199
+ fn=update_accuracy_display,
200
+ inputs=[status_text],
201
+ outputs=[accuracy_box]
202
  )
203
 
204
  # Add examples
205
  gr.Examples(
206
  examples=[
207
+ ["SreekarB/OSFData", 32, 200, 16, True], # Fewer epochs for faster demo
 
208
  ],
209
+ inputs=[data_source, latent_dim, nepochs, bsize, use_hf_dataset],
 
210
  )
211
 
212
+ # Add explanation of the workflow
213
  gr.Markdown("""
214
+ ## How this works
215
 
216
+ 1. **Data Loading**: The system downloads NIfTI files (P01_rs.nii format) from the SreekarB/OSFData dataset
217
+ 2. **Preprocessing**: The fMRI data is processed using the Power 264 atlas and converted to functional connectivity (FC) matrices
218
+ 3. **VAE Training**: A conditional VAE model learns the latent representation of brain connectivity
219
+ 4. **Analysis**: The system analyzes relationships between latent brain connectivity patterns and demographic variables
220
+ 5. **Visualization**: Results are displayed showing original FC, reconstructed FC, generated FC, and demographic correlations
221
 
222
+ Note: This app works with the SreekarB/OSFData dataset, which contains NIfTI files and demographic information.
223
  """)
224
 
225
+ return iface
226
 
227
  if __name__ == "__main__":
228
+ iface = create_interface()
229
+ iface.launch(share=True)
230
+
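
The body of calculate_fc_accuracy is elided by the hunks above; only its signature and the metric labels printed in the status message (MSE, RMSE, R², Correlation, Cosine Similarity) are visible. A minimal sketch consistent with those labels — assuming the function compares flattened FC vectors and that the dictionary keys match the labels exactly, neither of which the diff confirms:

    import numpy as np
    from sklearn.metrics import mean_squared_error, r2_score

    def calculate_fc_accuracy(original_fc, reconstructed_fc):
        # Hypothetical sketch, not the committed implementation.
        orig = np.asarray(original_fc).ravel()
        recon = np.asarray(reconstructed_fc).ravel()
        mse = mean_squared_error(orig, recon)
        return {
            "MSE": mse,
            "RMSE": float(np.sqrt(mse)),
            "R²": r2_score(orig, recon),
            "Correlation": float(np.corrcoef(orig, recon)[0, 1]),
            "Cosine Similarity": float(
                np.dot(orig, recon) / (np.linalg.norm(orig) * np.linalg.norm(recon))
            ),
        }

gradio_fc_analysis above averages exactly these five keys across up to five subjects before formatting the status string.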
data_preprocessing.py CHANGED
@@ -1,93 +1,593 @@
1
  import numpy as np
2
  import pandas as pd
 
3
  from nilearn import input_data, connectome
4
  from nilearn.image import load_img
5
  import nibabel as nib
6
- from pathlib import Path
7
- from config import PREPROCESS_CONFIG
8
 
9
- def process_single_fmri(fmri_file):
10
  """
11
- Process a single fMRI file to FC matrix
12
- """
13
- # Use Power 264 atlas
14
- from nilearn import datasets
15
- power = datasets.fetch_coords_power_2011()
16
- coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
17
-
18
- # Create masker
19
- masker = input_data.NiftiSpheresMasker(
20
- coords,
21
- radius=PREPROCESS_CONFIG['radius'],
22
- standardize=True,
23
- memory='nilearn_cache',
24
- memory_level=1,
25
- verbose=0,
26
- detrend=True,
27
- low_pass=PREPROCESS_CONFIG['low_pass'],
28
- high_pass=PREPROCESS_CONFIG['high_pass'],
29
- t_r=PREPROCESS_CONFIG['t_r']
30
- )
31
-
32
- # Load and process fMRI
33
- fmri_img = load_img(fmri_file)
34
- time_series = masker.fit_transform(fmri_img)
35
-
36
- # Compute FC matrix
37
- correlation_measure = connectome.ConnectivityMeasure(
38
- kind='correlation',
39
- vectorize=False,
40
- discard_diagonal=False
41
- )
42
-
43
- fc_matrix = correlation_measure.fit_transform([time_series])[0]
44
 
45
- # Get upper triangular part
46
- triu_indices = np.triu_indices_from(fc_matrix, k=1)
47
- fc_triu = fc_matrix[triu_indices]
 
48
 
49
- # Fisher z-transform
50
- fc_triu = np.arctanh(fc_triu)
51
-
52
- return fc_triu
53
-
54
- def preprocess_fmri_to_fc(nii_files, demo_data, demo_types):
55
- """
56
- Convert multiple fMRI files to FC matrices
57
  """
58
- fc_matrices = []
59
 
60
- for nii_file in nii_files:
61
- fc_triu = process_single_fmri(nii_file)
62
- fc_matrices.append(fc_triu)
63
-
64
- X = np.array(fc_matrices)
 
65
 
66
  # Normalize the FC data
67
  X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
68
 
69
- return X, demo_data, demo_types
70
-
71
- def load_and_preprocess_data(data_dir, demographic_file):
72
- """
73
- Load and preprocess both fMRI data and demographics
74
- """
75
- # Load demographics
76
- demo_df = pd.read_csv(demographic_file)
77
-
78
- demo_data = [
79
- demo_df['age_at_stroke'].values,
80
- demo_df['sex'].values,
81
- demo_df['months_post_stroke'].values,
82
- demo_df['wab_score'].values
83
- ]
84
-
85
- demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
86
-
87
- # Load fMRI files
88
- nii_files = sorted(list(Path(data_dir).glob('*.nii.gz')))
89
-
90
- # Process fMRI files to FC matrices
91
- X, demo_data, demo_types = preprocess_fmri_to_fc(nii_files, demo_data, demo_types)
92
-
93
- return X, demo_data, demo_types
 
1
  import numpy as np
2
  import pandas as pd
3
+ from datasets import load_dataset
4
  from nilearn import input_data, connectome
5
  from nilearn.image import load_img
6
  import nibabel as nib
7
+ import os
 
8
 
9
+ def preprocess_fmri_to_fc(dataset_or_niifiles, demo_data=None, demo_types=None):
10
  """
11
+ Process fMRI data to generate functional connectivity matrices
12
 
13
+ Parameters:
14
+ - dataset_or_niifiles: Either a dataset name string or a list of NIfTI files
15
+ - demo_data: Optional demographic data, required if providing NIfTI files
16
+ - demo_types: Optional demographic data types, required if providing NIfTI files
17
 
18
+ Returns:
19
+ - X: Array of FC matrices
20
+ - demo_data: Demographic data
21
+ - demo_types: Demographic data types
22
  """
23
+ print(f"Preprocessing data with type: {type(dataset_or_niifiles)}")
24
 
25
+ # For SreekarB/OSFData dataset, the data will be loaded from dataset features
26
+ if isinstance(dataset_or_niifiles, str):
27
+ dataset_name = dataset_or_niifiles
28
+ print(f"Loading data from dataset: {dataset_name}")
29
+ try:
30
+ # Try multiple approaches to load the dataset
31
+ approaches = [
32
+ lambda: load_dataset(dataset_name, split="train"),
33
+ lambda: load_dataset(dataset_name), # Try without split
34
+ lambda: load_dataset(dataset_name, split="train", trust_remote_code=True), # Try with trust_remote_code
35
+ lambda: load_dataset(dataset_name.split("/")[-1], split="train") if "/" in dataset_name else None
36
+ ]
37
+
38
+ dataset = None
39
+ last_error = None
40
+
41
+ for i, approach in enumerate(approaches):
42
+ if approach is None:
43
+ continue
44
+
45
+ try:
46
+ print(f"Attempt {i+1} to load dataset...")
47
+ dataset = approach()
48
+ print(f"Successfully loaded dataset with approach {i+1}!")
49
+ break
50
+ except Exception as e:
51
+ print(f"Attempt {i+1} failed: {e}")
52
+ last_error = e
53
+
54
+ if dataset is None:
55
+ print(f"All attempts to load dataset failed. Last error: {last_error}")
56
+ raise ValueError(f"Could not load dataset {dataset_name}")
57
+ except Exception as e:
58
+ print(f"Error during dataset loading: {e}")
59
+ raise
60
+
61
+ # Prepare demographics data from the dataset
62
+ if demo_data is None:
63
+ # Create demo_data from the dataset
64
+ demo_df = pd.DataFrame({
65
+ 'age': dataset['age'],
66
+ 'gender': dataset['gender'],
67
+ 'mpo': dataset['mpo'],
68
+ 'wab_aq': dataset['wab_aq']
69
+ })
70
+
71
+ demo_data = [
72
+ demo_df['age'].values,
73
+ demo_df['gender'].values,
74
+ demo_df['mpo'].values,
75
+ demo_df['wab_aq'].values
76
+ ]
77
+
78
+ demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
79
+
80
+ # Look for NIfTI files in P01_rs.nii format
81
+ print("Searching for NIfTI files in dataset columns...")
82
+ nii_files = []
83
+
84
+ # Create a temp directory for downloads
85
+ import tempfile
86
+ from huggingface_hub import hf_hub_download
87
+ import shutil
88
+
89
+ temp_dir = tempfile.mkdtemp(prefix="hf_nifti_")
90
+ print(f"Created temporary directory for NIfTI files: {temp_dir}")
91
+
92
+ try:
93
+ # First approach: Check if there are any columns containing file paths
94
+ nii_columns = []
95
+ for col in dataset.column_names:
96
+ # Check if column name suggests NIfTI files
97
+ if 'nii' in col.lower() or 'nifti' in col.lower() or 'fmri' in col.lower():
98
+ nii_columns.append(col)
99
+ # Or check if column contains file paths
100
+ elif len(dataset) > 0:
101
+ first_val = dataset[0][col]
102
+ if isinstance(first_val, str) and (first_val.endswith('.nii') or first_val.endswith('.nii.gz')):
103
+ nii_columns.append(col)
104
+
105
+ if nii_columns:
106
+ print(f"Found columns that may contain NIfTI files: {nii_columns}")
107
+
108
+ for col in nii_columns:
109
+ print(f"Processing column '{col}'...")
110
+
111
+ for i, item in enumerate(dataset[col]):
112
+ if not isinstance(item, str):
113
+ print(f"Item {i} in column {col} is not a string but {type(item)}")
114
+ continue
115
+
116
+ if not (item.endswith('.nii') or item.endswith('.nii.gz')):
117
+ print(f"Item {i} in column {col} is not a NIfTI file: {item}")
118
+ continue
119
+
120
+ print(f"Downloading {item} from dataset {dataset_name}...")
121
+
122
+ try:
123
+ # Attempt to download with explicit filename
124
+ file_path = hf_hub_download(
125
+ repo_id=dataset_name,
126
+ filename=item,
127
+ repo_type="dataset",
128
+ cache_dir=temp_dir
129
+ )
130
+ nii_files.append(file_path)
131
+ print(f"✓ Successfully downloaded {item}")
132
+ except Exception as e1:
133
+ print(f"Error downloading with explicit filename: {e1}")
134
+
135
+ # Second attempt: try with the item's basename
136
+ try:
137
+ basename = os.path.basename(item)
138
+ print(f"Trying with basename: {basename}")
139
+ file_path = hf_hub_download(
140
+ repo_id=dataset_name,
141
+ filename=basename,
142
+ repo_type="dataset",
143
+ cache_dir=temp_dir
144
+ )
145
+ nii_files.append(file_path)
146
+ print(f"✓ Successfully downloaded {basename}")
147
+ except Exception as e2:
148
+ print(f"Error downloading with basename: {e2}")
149
+
150
+ # Third attempt: check if it's a binary blob in the dataset
151
+ try:
152
+ if hasattr(dataset[i], 'keys') and 'bytes' in dataset[i]:
153
+ print("Found binary data in dataset, saving to temporary file...")
154
+ binary_data = dataset[i]['bytes']
155
+ temp_file = os.path.join(temp_dir, basename)
156
+ with open(temp_file, 'wb') as f:
157
+ f.write(binary_data)
158
+ nii_files.append(temp_file)
159
+ print(f"✓ Saved binary data to {temp_file}")
160
+ except Exception as e3:
161
+ print(f"Error handling binary data: {e3}")
162
+
163
+ # Last resort: look for the file locally
164
+ local_path = os.path.join(os.getcwd(), item)
165
+ if os.path.exists(local_path):
166
+ nii_files.append(local_path)
167
+ print(f"✓ Found {item} locally")
168
+ else:
169
+ print(f"❌ Warning: Could not find {item} anywhere")
170
+
171
+ # Second approach: Try to find NIfTI files in dataset repository directly
172
+ if not nii_files:
173
+ print("No NIfTI files found in dataset columns. Trying direct repository search...")
174
+
175
+ try:
176
+ from huggingface_hub import list_repo_files, hf_hub_download
177
+
178
+ # Try to list all files in the repository
179
+ try:
180
+ print("Listing all repository files...")
181
+ all_repo_files = list_repo_files(dataset_name, repo_type="dataset")
182
+ print(f"Found {len(all_repo_files)} files in repository")
183
+
184
+ # First prioritize P*_rs.nii files
185
+ p_rs_files = [f for f in all_repo_files if f.endswith('_rs.nii') and f.startswith('P')]
186
+
187
+ # Then include all other NIfTI files
188
+ other_nii_files = [f for f in all_repo_files if (f.endswith('.nii') or f.endswith('.nii.gz')) and f not in p_rs_files]
189
+
190
+ # Combine, with P*_rs.nii files first
191
+ nii_repo_files = p_rs_files + other_nii_files
192
+
193
+ if nii_repo_files:
194
+ print(f"Found {len(nii_repo_files)} NIfTI files in repository: {nii_repo_files[:5] if len(nii_repo_files) > 5 else nii_repo_files}...")
195
+
196
+ # Download each file
197
+ for nii_file in nii_repo_files:
198
+ try:
199
+ file_path = hf_hub_download(
200
+ repo_id=dataset_name,
201
+ filename=nii_file,
202
+ repo_type="dataset",
203
+ cache_dir=temp_dir
204
+ )
205
+ nii_files.append(file_path)
206
+ print(f"✓ Downloaded {nii_file}")
207
+ except Exception as e:
208
+ print(f"Error downloading {nii_file}: {e}")
209
+ except Exception as e:
210
+ print(f"Error listing repository files: {e}")
211
+ print("Will try alternative approaches...")
212
+
213
+ # If repo listing fails, try with common NIfTI file patterns directly
214
+ if not nii_files:
215
+ print("Trying common NIfTI file patterns...")
216
+
217
+ # Focus specifically on P*_rs.nii pattern
218
+ patterns = []
219
+
220
+ # Generate P01_rs.nii through P30_rs.nii
221
+ for i in range(1, 31): # Try subjects 1-30
222
+ patterns.append(f"P{i:02d}_rs.nii")
223
+
224
+ # Also try with .nii.gz extension
225
+ for i in range(1, 31):
226
+ patterns.append(f"P{i:02d}_rs.nii.gz")
227
+
228
+ # Include a few other common patterns as fallbacks
229
+ patterns.extend([
230
+ "sub-01_task-rest_bold.nii.gz", # BIDS format
231
+ "fmri.nii.gz", "bold.nii.gz",
232
+ "rest.nii.gz"
233
+ ])
234
+
235
+ for pattern in patterns:
236
+ try:
237
+ print(f"Trying to download {pattern}...")
238
+ file_path = hf_hub_download(
239
+ repo_id=dataset_name,
240
+ filename=pattern,
241
+ repo_type="dataset",
242
+ cache_dir=temp_dir
243
+ )
244
+ nii_files.append(file_path)
245
+ print(f"✓ Successfully downloaded {pattern}")
246
+ except Exception as e:
247
+ print(f"× Failed to download {pattern}")
248
+
249
+ # If we still couldn't find any files, check if data files are nested
250
+ if not nii_files:
251
+ print("Checking for nested data files...")
252
+ nested_paths = ["data/", "raw/", "nii/", "derivatives/", "fmri/", "nifti/"]
253
+
254
+ for path in nested_paths:
255
+ for pattern in patterns:
256
+ nested_file = f"{path}{pattern}"
257
+ try:
258
+ print(f"Trying to download {nested_file}...")
259
+ file_path = hf_hub_download(
260
+ repo_id=dataset_name,
261
+ filename=nested_file,
262
+ repo_type="dataset",
263
+ cache_dir=temp_dir
264
+ )
265
+ nii_files.append(file_path)
266
+ print(f"✓ Successfully downloaded {nested_file}")
267
+ # If we found one file in this directory, try to find all files in it
268
+ try:
269
+ all_files_in_dir = [f for f in all_repo_files if f.startswith(path)]
270
+ nii_files_in_dir = [f for f in all_files_in_dir if f.endswith('.nii') or f.endswith('.nii.gz')]
271
+ print(f"Found {len(nii_files_in_dir)} additional NIfTI files in {path}")
272
+
273
+ for nii_file in nii_files_in_dir:
274
+ if nii_file != nested_file: # Skip the one we already downloaded
275
+ try:
276
+ file_path = hf_hub_download(
277
+ repo_id=dataset_name,
278
+ filename=nii_file,
279
+ repo_type="dataset",
280
+ cache_dir=temp_dir
281
+ )
282
+ nii_files.append(file_path)
283
+ print(f"✓ Downloaded {nii_file}")
284
+ except Exception as e:
285
+ print(f"Error downloading {nii_file}: {e}")
286
+ except Exception as e:
287
+ print(f"Error finding additional files in {path}: {e}")
288
+ except Exception as e:
289
+ pass
290
+
291
+ except Exception as e:
292
+ print(f"Error during repository exploration: {e}")
293
+
294
+ # If we still don't have any files, try to search for P*_rs.nii pattern specifically
295
+ if not nii_files:
296
+ print("Trying to find files matching P*_rs.nii pattern specifically...")
297
+
298
+ try:
299
+ # List all files in the repository (if we haven't already)
300
+ if not 'all_repo_files' in locals():
301
+ from huggingface_hub import list_repo_files
302
+ try:
303
+ all_repo_files = list_repo_files(dataset_name, repo_type="dataset")
304
+ except Exception as e:
305
+ print(f"Error listing repo files: {e}")
306
+ all_repo_files = []
307
+
308
+ # Look for files matching the pattern exactly (P*_rs.nii)
309
+ pattern_files = [f for f in all_repo_files if '_rs.nii' in f and f.startswith('P')]
310
+
311
+ # If we don't find any exact matches, try a more relaxed pattern
312
+ if not pattern_files:
313
+ pattern_files = [f for f in all_repo_files if 'rs.nii' in f.lower()]
314
+
315
+ if pattern_files:
316
+ print(f"Found {len(pattern_files)} files matching rs.nii pattern")
317
+
318
+ # Download each file
319
+ for pattern_file in pattern_files:
320
+ try:
321
+ file_path = hf_hub_download(
322
+ repo_id=dataset_name,
323
+ filename=pattern_file,
324
+ repo_type="dataset",
325
+ cache_dir=temp_dir
326
+ )
327
+ nii_files.append(file_path)
328
+ print(f"✓ Downloaded {pattern_file}")
329
+ except Exception as e:
330
+ print(f"Error downloading {pattern_file}: {e}")
331
+ except Exception as e:
332
+ print(f"Error searching for pattern files: {e}")
333
+
334
+ print(f"Found total of {len(nii_files)} NIfTI files")
335
+ except Exception as e:
336
+ print(f"Unexpected error during NIfTI file search: {e}")
337
+ import traceback
338
+ traceback.print_exc()
339
+
340
+ # If we found NIfTI files, process them to FC matrices
341
+ if nii_files:
342
+ print(f"Found {len(nii_files)} NIfTI files, converting to FC matrices")
343
+
344
+ # Load Power 264 atlas
345
+ from nilearn import datasets
346
+ power = datasets.fetch_coords_power_2011()
347
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
348
+
349
+ masker = input_data.NiftiSpheresMasker(
350
+ coords, radius=5,
351
+ standardize=True,
352
+ memory='nilearn_cache', memory_level=1,
353
+ verbose=0,
354
+ detrend=True,
355
+ low_pass=0.1,
356
+ high_pass=0.01,
357
+ t_r=2.0 # Adjust TR according to your data
358
+ )
359
+
360
+ # Process fMRI data and compute FC matrices
361
+ fc_matrices = []
362
+ valid_files = 0
363
+ total_files = len(nii_files)
364
+
365
+ for nii_file in nii_files:
366
+ try:
367
+ print(f"Processing {nii_file}...")
368
+ fmri_img = load_img(nii_file)
369
+
370
+ # Check image dimensions
371
+ if len(fmri_img.shape) < 4 or fmri_img.shape[3] < 10:
372
+ print(f"Warning: {nii_file} has insufficient time points: {fmri_img.shape}")
373
+ continue
374
+
375
+ try:
376
+ # Explicitly handle warnings about empty spheres
377
+ import warnings
378
+ with warnings.catch_warnings():
379
+ warnings.filterwarnings('ignore', message='.*empty.*')
380
+ time_series = masker.fit_transform(fmri_img)
381
+ except Exception as e:
382
+ if "empty" in str(e):
383
+ print(f"Warning: Some spheres are empty in {nii_file}. Using a different sphere radius.")
384
+
385
+ # Extract the list of empty spheres for logging
386
+ import re
387
+ empty_spheres = re.findall(r"\[(.*?)\]", str(e))
388
+ if empty_spheres:
389
+ print(f"Empty spheres: {empty_spheres[0]}")
390
+
391
+ # Try with a different radius
392
+ alternate_masker = input_data.NiftiSpheresMasker(
393
+ coords, radius=8, # Larger radius
394
+ standardize=True,
395
+ memory='nilearn_cache', memory_level=1,
396
+ verbose=0,
397
+ detrend=True,
398
+ low_pass=0.1,
399
+ high_pass=0.01,
400
+ t_r=2.0
401
+ )
402
+ try:
403
+ time_series = alternate_masker.fit_transform(fmri_img)
404
+ print(f"Successfully extracted time series with larger radius")
405
+ except Exception as e2:
406
+ print(f"Error with alternate masker: {e2}")
407
+ print(f"Skipping this file due to empty spheres")
408
+ continue # Skip this file entirely
409
+ else:
410
+ print(f"Unknown error in masker: {e}")
411
+ continue # Skip this file if there's any other error
412
+
413
+ # Validate time series data
414
+ if np.isnan(time_series).any() or np.isinf(time_series).any():
415
+ print(f"Warning: {nii_file} contains NaN or Inf values after masking")
416
+ # Replace NaNs with zeros for this file
417
+ time_series = np.nan_to_num(time_series)
418
+
419
+ correlation_measure = connectome.ConnectivityMeasure(
420
+ kind='correlation',
421
+ vectorize=False,
422
+ discard_diagonal=False
423
+ )
424
+
425
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
426
+
427
+ # Check for invalid correlation values
428
+ if np.isnan(fc_matrix).any():
429
+ print(f"Warning: {nii_file} produced NaN correlation values")
430
+ continue
431
+
432
+ triu_indices = np.triu_indices_from(fc_matrix, k=1)
433
+ fc_triu = fc_matrix[triu_indices]
434
+
435
+ # Fisher z-transform with proper bounds check
436
+ # Clip correlation values to valid range for arctanh
437
+ fc_triu_clipped = np.clip(fc_triu, -0.999, 0.999)
438
+ fc_triu = np.arctanh(fc_triu_clipped)
439
+
440
+ fc_matrices.append(fc_triu)
441
+ valid_files += 1
442
+ print(f"Successfully processed {nii_file} to FC matrix")
443
+
444
+ except Exception as e:
445
+ print(f"Error processing {nii_file}: {e}")
446
+
447
+ if fc_matrices:
448
+ print(f"Successfully processed {valid_files} out of {total_files} files")
449
+
450
+ # Ensure all matrices have the same dimensions
451
+ dims = [m.shape[0] for m in fc_matrices]
452
+ if len(set(dims)) > 1:
453
+ print(f"Warning: FC matrices have inconsistent dimensions: {dims}")
454
+ # Use the most common dimension
455
+ from collections import Counter
456
+ most_common_dim = Counter(dims).most_common(1)[0][0]
457
+ print(f"Using most common dimension: {most_common_dim}")
458
+ fc_matrices = [m for m in fc_matrices if m.shape[0] == most_common_dim]
459
+
460
+ X = np.array(fc_matrices)
461
+
462
+ # Normalize the FC data
463
+ mean_x = np.mean(X, axis=0)
464
+ std_x = np.std(X, axis=0)
465
+
466
+ # Handle zero standard deviation
467
+ std_x[std_x == 0] = 1.0
468
+
469
+ X = (X - mean_x) / std_x
470
+ print(f"Created FC matrices with shape {X.shape}")
471
+
472
+ # Make sure demo_data matches the number of FC matrices
473
+ if len(demo_data[0]) != X.shape[0]:
474
+ print(f"Warning: Number of subjects in demographic data ({len(demo_data[0])}) " +
475
+ f"doesn't match number of FC matrices ({X.shape[0]})")
476
+ # Adjust demo_data to match FC matrices
477
+ indices = list(range(min(len(demo_data[0]), X.shape[0])))
478
+ X = X[indices]
479
+ demo_data = [d[indices] for d in demo_data]
480
+
481
+ return X, demo_data, demo_types
482
+
483
+ print("No FC or fMRI data found in the dataset. Please provide FC matrices.")
484
+ # Return a placeholder with the right demographics but empty FC
485
+ n_subjects = len(dataset)
486
+ n_rois = 264
487
+ fc_dim = (n_rois * (n_rois - 1)) // 2
488
+ X = np.zeros((n_subjects, fc_dim))
489
+ print(f"Created placeholder FC matrices with shape {X.shape}")
490
+ return X, demo_data, demo_types
491
+
492
+ elif isinstance(dataset_or_niifiles, str):
493
+ # Handle real dataset with actual fMRI data
494
+ dataset = load_dataset(dataset_or_niifiles, split="train")
495
+
496
+ # Load Power 264 atlas
497
+ from nilearn import datasets
498
+ power = datasets.fetch_coords_power_2011()
499
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
500
+
501
+ masker = input_data.NiftiSpheresMasker(
502
+ coords, radius=5,
503
+ standardize=True,
504
+ memory='nilearn_cache', memory_level=1,
505
+ verbose=0,
506
+ detrend=True,
507
+ low_pass=0.1,
508
+ high_pass=0.01,
509
+ t_r=2.0 # Adjust TR according to your data
510
+ )
511
+
512
+ # Load demographic data if needed
513
+ if demo_data is None:
514
+ if 'demographics' in dataset.features:
515
+ demo_df = pd.DataFrame(dataset['demographics'])
516
+
517
+ demo_data = [
518
+ demo_df['age_at_stroke'].values if 'age_at_stroke' in demo_df.columns else [],
519
+ demo_df['sex'].values if 'sex' in demo_df.columns else [],
520
+ demo_df['months_post_stroke'].values if 'months_post_stroke' in demo_df.columns else [],
521
+ demo_df['wab_score'].values if 'wab_score' in demo_df.columns else []
522
+ ]
523
+
524
+ demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
525
+
526
+ # Process fMRI data and compute FC matrices
527
+ fc_matrices = []
528
+ for nii_file in dataset['nii_files']:
529
+ fmri_img = load_img(nii_file)
530
+ time_series = masker.fit_transform(fmri_img)
531
+
532
+ correlation_measure = connectome.ConnectivityMeasure(
533
+ kind='correlation', vectorize=False, discard_diagonal=False
534
+ )
535
+
536
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
537
+
538
+ triu_indices = np.triu_indices_from(fc_matrix, k=1)
539
+ fc_triu = fc_matrix[triu_indices]
540
+
541
+ fc_triu = np.arctanh(fc_triu) # Fisher z-transform
542
+
543
+ fc_matrices.append(fc_triu)
544
+
545
+ X = np.array(fc_matrices)
546
+
547
+ elif isinstance(dataset_or_niifiles, list) and demo_data is not None and demo_types is not None:
548
+ # Handle a list of NIfTI files
549
+ # Similar processing as above but with local files
550
+ print(f"Processing {len(dataset_or_niifiles)} local NIfTI files")
551
+
552
+ # Load Power 264 atlas
553
+ from nilearn import datasets
554
+ power = datasets.fetch_coords_power_2011()
555
+ coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
556
+
557
+ masker = input_data.NiftiSpheresMasker(
558
+ coords, radius=5,
559
+ standardize=True,
560
+ memory='nilearn_cache', memory_level=1,
561
+ verbose=0,
562
+ detrend=True,
563
+ low_pass=0.1,
564
+ high_pass=0.01,
565
+ t_r=2.0
566
+ )
567
+
568
+ fc_matrices = []
569
+ for nii_file in dataset_or_niifiles:
570
+ fmri_img = load_img(nii_file)
571
+ time_series = masker.fit_transform(fmri_img)
572
+
573
+ correlation_measure = connectome.ConnectivityMeasure(
574
+ kind='correlation', vectorize=False, discard_diagonal=False
575
+ )
576
+
577
+ fc_matrix = correlation_measure.fit_transform([time_series])[0]
578
+
579
+ triu_indices = np.triu_indices_from(fc_matrix, k=1)
580
+ fc_triu = fc_matrix[triu_indices]
581
+
582
+ fc_triu = np.arctanh(fc_triu) # Fisher z-transform
583
+
584
+ fc_matrices.append(fc_triu)
585
+
586
+ X = np.array(fc_matrices)
587
+ else:
588
+ raise ValueError("Invalid input. Expected dataset name string or list of NIfTI files with demographic data.")
589
 
590
  # Normalize the FC data
591
  X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
592
 
593
+ return X, demo_data, demo_types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
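Note: the upper-triangle vectorization and Fisher z-transform above are easy to sanity-check in isolation. A minimal round-trip sketch follows (a toy matrix, not data from this pipeline; `n_rois` matches the Power atlas used above, and the `1e-8` in the final normalization is an added guard against zero-variance features that the current code does not include):

import numpy as np

n_rois = 264
rng = np.random.default_rng(0)

# A toy symmetric "correlation" matrix standing in for one subject's FC
A = rng.uniform(-0.9, 0.9, size=(n_rois, n_rois))
fc = (A + A.T) / 2
np.fill_diagonal(fc, 1.0)

# Upper triangle (k=1 excludes the diagonal) -> vector of length n*(n-1)/2
iu = np.triu_indices(n_rois, k=1)
fc_vec = np.arctanh(fc[iu])          # Fisher z-transform, as in the pipeline
assert fc_vec.shape[0] == n_rois * (n_rois - 1) // 2  # 34716 features

# Round-trip: rebuild the symmetric matrix from the vector
fc_back = np.zeros((n_rois, n_rois))
fc_back[iu] = np.tanh(fc_vec)        # inverse of the Fisher z-transform
fc_back = fc_back + fc_back.T
np.fill_diagonal(fc_back, 1.0)
assert np.allclose(fc_back, fc)

# Per-feature normalization with a small epsilon to avoid division by zero
X = np.stack([fc_vec, fc_vec * 0.5])
X = (X - X.mean(axis=0)) / (X.std(axis=0) + 1e-8)

np.arctanh diverges at ±1, which is why the diagonal (self-correlation of exactly 1.0) must be excluded before the transform, as the k=1 offset does above.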
main.py CHANGED
@@ -1,150 +1,291 @@
 import os
 import numpy as np
 import torch
 from pathlib import Path
 import pandas as pd
-from data_preprocessing import load_and_preprocess_data
-from vae_model import DemoVAE
-from rcf_prediction import AphasiaTreatmentPredictor
-from visualization import plot_fc_matrices, plot_learning_curves
-from config import MODEL_CONFIG
-import matplotlib.pyplot as plt
 
-def run_analysis(data_dir="data",
-                 demographic_file="demographics.csv",
-                 treatment_file="treatment_outcomes.csv",
-                 latent_dim=32,
-                 nepochs=1000,
-                 bsize=16,
-                 save_model=True):
     """
-    Run the complete analysis pipeline
     """
-    # Update MODEL_CONFIG with user-specified parameters
-    MODEL_CONFIG.update({
-        'latent_dim': latent_dim,
-        'nepochs': nepochs,
-        'bsize': bsize
-    })
 
-    # Create output directories
-    os.makedirs('models', exist_ok=True)
-    os.makedirs('results', exist_ok=True)
 
-    # Load and preprocess data
-    print("Loading and preprocessing data...")
-    X, demo_data, demo_types = load_and_preprocess_data(data_dir, demographic_file)
 
-    # Load treatment outcomes
-    treatment_df = pd.read_csv(treatment_file)
-    treatment_outcomes = treatment_df['outcome_score'].values
 
-    # Initialize and train VAE
-    print("Training VAE...")
-    vae = DemoVAE(**MODEL_CONFIG)
-    train_losses, val_losses = vae.fit(X, demo_data, demo_types)
 
-    # Get latent representations
-    print("Extracting latent representations...")
-    latents = vae.get_latents(X)
-
-    # Initialize and train treatment predictor
-    print("Training treatment predictor...")
-    predictor = AphasiaTreatmentPredictor(n_estimators=100)
-
-    # Prepare demographics for predictor
-    demographics = {
-        'age_at_stroke': demo_data[0],
-        'sex': demo_data[1],
-        'months_post_stroke': demo_data[2],
-        'wab_score': demo_data[3]
-    }
-
-    # Cross-validate the predictor
-    print("Performing cross-validation...")
-    cv_mean, cv_std, predictions, prediction_stds = predictor.cross_validate(
-        latents=latents,
-        demographics=demographics,
-        treatment_outcomes=treatment_outcomes
     )
 
-    # Fit final predictor model
-    predictor.fit(latents, demographics, treatment_outcomes)
-
-    # Save models if requested
-    if save_model:
-        print("Saving models...")
-        vae.save('models/vae_model.pt')
-        torch.save({
-            'predictor_state': predictor.rf_regressor,
-            'feature_importance': predictor.feature_importance
-        }, 'models/predictor_model.pt')
-
-    # Generate visualizations
-    print("Generating visualizations...")
-
-    # FC matrix visualization
-    reconstructed = vae.transform(X, demo_data, demo_types)
-    generated = vae.transform(1,
-                              [d[:1] for d in demo_data],
-                              demo_types)
-    fc_fig = plot_fc_matrices(X[0], reconstructed[0], generated[0])
 
-    # Learning curves
-    learning_fig = plot_learning_curves(train_losses, val_losses)
-
-    # Feature importance
-    importance_fig = predictor.plot_feature_importance()
 
-    # Prediction performance
-    performance_fig = plt.figure(figsize=(8, 6))
-    plt.scatter(treatment_outcomes, predictions)
-    plt.plot([min(treatment_outcomes), max(treatment_outcomes)],
-             [min(treatment_outcomes), max(treatment_outcomes)],
-             'r--')
-    plt.fill_between(treatment_outcomes,
-                     predictions - 2*prediction_stds,
-                     predictions + 2*prediction_stds,
-                     alpha=0.2, color='gray')
-    plt.xlabel('Actual Outcome')
-    plt.ylabel('Predicted Outcome')
-    plt.title(f'Treatment Outcome Prediction\nR² = {cv_mean:.3f} ± {cv_std:.3f}')
-    plt.tight_layout()
 
-    # Save results
-    print("Saving results...")
-    np.save('results/latents.npy', latents)
-    np.save('results/predictions.npy', predictions)
-    np.save('results/prediction_stds.npy', prediction_stds)
 
-    results = {
-        'vae': vae,
-        'predictor': predictor,
-        'latents': latents,
-        'cv_scores': (cv_mean, cv_std),
-        'predictions': predictions,
-        'prediction_stds': prediction_stds,
-        'figures': {
-            'fc_analysis': fc_fig,
-            'learning_curves': learning_fig,
-            'importance': importance_fig,
-            'performance': performance_fig
         }
-    }
-
-    print("Analysis complete!")
-    return results
 
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(description='Run Aphasia Treatment Analysis')
-    parser.add_argument('--data_dir', type=str, default='data',
-                        help='Directory containing fMRI data')
-    parser.add_argument('--demographic_file', type=str, default='demographics.csv',
                         help='Path to demographic data CSV file')
-    parser.add_argument('--treatment_file', type=str, default='treatment_outcomes.csv',
-                        help='Path to treatment outcomes CSV file')
     parser.add_argument('--latent_dim', type=int, default=32,
                         help='Dimension of latent space')
     parser.add_argument('--nepochs', type=int, default=1000,
@@ -152,16 +293,20 @@ if __name__ == "__main__":
     parser.add_argument('--bsize', type=int, default=16,
                         help='Batch size for training')
     parser.add_argument('--no_save', action='store_false',
-                        help='Do not save the models')
 
     args = parser.parse_args()
 
-    results = run_analysis(
         data_dir=args.data_dir,
         demographic_file=args.demographic_file,
-        treatment_file=args.treatment_file,
         latent_dim=args.latent_dim,
         nepochs=args.nepochs,
         bsize=args.bsize,
-        save_model=args.no_save
     )
 
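Aside on the removed performance plot above, before the new implementation below: plt.fill_between assumes its x values are monotonically ordered, so calling it on raw treatment_outcomes draws a scrambled uncertainty band. If that plot is ever resurrected, a minimal corrected sketch (hypothetical arrays standing in for the real outcomes, predictions, and per-subject standard deviations) would sort first:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical stand-ins for treatment_outcomes / predictions / prediction_stds
y_true = np.array([42.0, 88.0, 61.0, 75.0, 53.0])
y_pred = np.array([48.0, 81.0, 65.0, 70.0, 50.0])
y_std = np.array([4.0, 6.0, 5.0, 3.0, 4.5])

order = np.argsort(y_true)           # fill_between needs ordered x values
x, mu, sd = y_true[order], y_pred[order], y_std[order]

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x, mu)
ax.plot([x.min(), x.max()], [x.min(), x.max()], 'r--')  # identity line
ax.fill_between(x, mu - 2 * sd, mu + 2 * sd, alpha=0.2, color='gray')
ax.set_xlabel('Actual Outcome')
ax.set_ylabel('Predicted Outcome')
fig.tight_layout()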
 import os
+import sys
+# Add the src directory to the path so we can import from demovae
+sys.path.append(os.path.join(os.path.dirname(__file__), 'src'))
+
 import numpy as np
 import torch
 from pathlib import Path
+import nibabel as nib
+from data_preprocessing import preprocess_fmri_to_fc
+from src.demovae.sklearn import DemoVAE
+from analysis import analyze_fc_patterns
+from visualization import visualize_fc_analysis
+from config import MODEL_CONFIG, DATASET_CONFIG
 import pandas as pd
+import io
+from typing import List, Dict, Union, Tuple, Any
 
+def train_fc_vae(X, demo_data, demo_types, model_config):
     """
+    Train a VAE model on functional connectivity matrices
     """
+    n_rois = 264
+    input_dim = (n_rois * (n_rois - 1)) // 2
 
+    print(f"Creating VAE with latent dim={model_config['latent_dim']}, epochs={model_config['nepochs']}")
 
+    # Ensure X is a numpy array with the correct data type
+    if not isinstance(X, np.ndarray):
+        print(f"Converting X from {type(X)} to numpy array")
+        X = np.array(X, dtype=np.float32)
 
+    # Ensure demo_data contains numpy arrays
+    for i, d in enumerate(demo_data):
+        if not isinstance(d, np.ndarray):
+            print(f"Converting demographic {i} from {type(d)} to numpy array")
+            demo_data[i] = np.array(d)
 
+    # Check for NaN or Inf values
+    if np.isnan(X).any() or np.isinf(X).any():
+        print("Warning: X contains NaN or Inf values. Replacing with zeros.")
+        X = np.nan_to_num(X)
 
+    # Create the VAE model
+    vae = DemoVAE(
+        latent_dim=model_config['latent_dim'],
+        nepochs=model_config['nepochs'],
+        bsize=model_config['bsize'],
+        loss_rec_mult=model_config.get('loss_rec_mult', 100),
+        loss_decor_mult=model_config.get('loss_decor_mult', 10),
+        lr=model_config.get('lr', 1e-4),
+        use_cuda=torch.cuda.is_available()
     )
 
+    print("Fitting VAE model...")
+    vae.fit(X, demo_data, demo_types)
 
+    return vae, X, demo_data, demo_types
+
+def load_data(data_dir="SreekarB/OSFData", demographic_file=None, use_hf_dataset=True):
+    """
+    Load fMRI data and demographics from a HuggingFace dataset or local files
+    """
+    if use_hf_dataset:
+        # Load from HuggingFace Datasets
+        from datasets import load_dataset
+
+        print(f"Loading dataset from HuggingFace: {data_dir}")
+        dataset = load_dataset(data_dir)
+
+        print(f"Dataset columns: {dataset['train'].column_names}")
+
+        # Get demographics directly from the dataset
+        # Create a DataFrame from the dataset features
+        demo_df = pd.DataFrame({
+            'ID': dataset['train']['ID'],
+            'wab_aq': dataset['train']['wab_aq'],
+            'age': dataset['train']['age'],
+            'mpo': dataset['train']['mpo'],
+            'education': dataset['train']['education'],
+            'gender': dataset['train']['gender'],
+            'handedness': dataset['train']['handedness']
+        })
+
+        print(f"Loaded demographic data with {len(demo_df)} subjects")
+
+        # Extract demographic data, mapping the dataset columns to our expected format
+        demo_data = [
+            demo_df['age'].values,     # age at stroke -> age
+            demo_df['gender'].values,  # sex -> gender
+            demo_df['mpo'].values,     # months post stroke -> mpo
+            demo_df['wab_aq'].values   # wab score -> wab_aq
+        ]
+
+        # Check for FC matrices in the dataset
+        fc_columns = []
+        for col in dataset['train'].column_names:
+            if col.startswith("fc_") or "_fc" in col:
+                fc_columns.append(col)
+
+        if fc_columns:
+            print(f"Found {len(fc_columns)} FC matrix columns: {fc_columns}")
+            # Extract FC matrices
+            fc_matrices = []
+            for fc_col in fc_columns:
+                fc_matrices.append(dataset['train'][fc_col])
+
+            # If we have FC matrices, return them directly
+            demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
+            return fc_matrices, demo_data, demo_types
+
+        # If no FC matrices, look for .nii files
+        nii_files = []
+        for col in dataset['train'].column_names:
+            if col.endswith(".nii.gz") or col.endswith(".nii"):
+                nii_files.append(dataset['train'][col])
+
+        if nii_files:
+            print(f"Found {len(nii_files)} .nii files")
+        else:
+            print("No FC matrices or .nii files found in dataset. Will need to construct FC matrices.")
+            # If no structured data is found, we can try to download raw files later
+
+    else:
+        # Original local file loading
+        # Load demographics
+        demo_df = pd.read_csv(demographic_file)
+
+        demo_data = [
+            demo_df['age_at_stroke'].values if 'age_at_stroke' in demo_df.columns else demo_df['age'].values,
+            demo_df['sex'].values if 'sex' in demo_df.columns else demo_df['gender'].values,
+            demo_df['months_post_stroke'].values if 'months_post_stroke' in demo_df.columns else demo_df['mpo'].values,
+            demo_df['wab_score'].values if 'wab_score' in demo_df.columns else demo_df['wab_aq'].values
+        ]
+
+        # Load fMRI files
+        nii_files = sorted(list(Path(data_dir).glob('*.nii.gz')))
 
+    demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
+    return nii_files, demo_data, demo_types
+
+def run_fc_analysis(data_dir="SreekarB/OSFData",
+                    demographic_file=None,
+                    latent_dim=32,
+                    nepochs=1000,
+                    bsize=16,
+                    save_model=True,
+                    use_hf_dataset=True,
+                    return_data=False):
 
+    # Update MODEL_CONFIG with user-specified parameters
+    MODEL_CONFIG.update({
+        'latent_dim': latent_dim,
+        'nepochs': nepochs,
+        'bsize': bsize
+    })
 
+    try:
+        # Load data
+        print("Loading data...")
+        nii_files, demo_data, demo_types = load_data(data_dir, demographic_file, use_hf_dataset)
+
+        # For SreekarB/OSFData, directly generate synthetic FC matrices
+        if data_dir == "SreekarB/OSFData" and use_hf_dataset:
+            print("Using SreekarB/OSFData dataset with synthetic FC matrices...")
+            X, demo_data, demo_types = preprocess_fmri_to_fc(data_dir, demo_data, demo_types)
+        # Check whether we got FC matrices directly
+        elif isinstance(nii_files, list) and len(nii_files) > 0 and hasattr(nii_files[0], 'shape'):
+            print("Using pre-computed FC matrices...")
+            # Convert the list of FC matrices to a numpy array
+            X = np.stack([np.array(fc) for fc in nii_files])
+        else:
+            # Prepare data by converting fMRI to FC matrices
+            print("Converting fMRI data to FC matrices...")
+            X, demo_data, demo_types = preprocess_fmri_to_fc(nii_files, demo_data, demo_types)
+
+        # Print shapes and data types
+        print(f"X shape: {X.shape}, type: {type(X)}")
+        for i, d in enumerate(demo_data):
+            print(f"Demo data {i} shape: {d.shape if hasattr(d, 'shape') else len(d)}, type: {type(d)}")
+
+        # Train the VAE
+        print("Training VAE...")
+        try:
+            # Use the proper DemoVAE implementation from src/demovae/sklearn.py
+            vae, X, demo_data, demo_types = train_fc_vae(X, demo_data, demo_types, MODEL_CONFIG)
+
+            if save_model:
+                print("Saving model...")
+                os.makedirs('models', exist_ok=True)
+                # Use the save method from DemoVAE
+                vae.save('models/vae_model.pth')
+                print("Model saved successfully.")
+        except Exception as e:
+            print(f"Error during VAE training: {e}")
+            raise
+
+        # Get latent representations
+        print("Getting latent representations...")
+        latents = vae.get_latents(X)
+
+        # Analyze results
+        print("Analyzing demographic relationships...")
+        demographics = {
+            'age': demo_data[0],
+            'months_post_onset': demo_data[2],
+            'wab_aq': demo_data[3]
         }
+        analysis_results = analyze_fc_patterns(latents, demographics)
+
+        # Generate a new FC matrix
+        print("Generating new FC matrices...")
+
+        # Get data types from the original demographic data for proper conversion
+        demo_dtypes = [type(d[0]) if len(d) > 0 else float for d in demo_data]
+
+        # Convert to numpy arrays to avoid an "expected np.ndarray (got list)" error
+        new_demographics = [
+            np.array([60.0], dtype=np.float64),  # age
+            np.array(['M'], dtype=np.str_),      # gender
+            np.array([12.0], dtype=np.float64),  # months post onset
+            np.array([80.0], dtype=np.float64)   # wab score
+        ]
+
+        # Verify the demographic data arrays match the expected types
+        print("Demographic data types:")
+        for i, (name, data) in enumerate(zip(['age', 'gender', 'mpo', 'wab'], new_demographics)):
+            print(f"  {name}: shape={data.shape}, dtype={data.dtype}")
+
+        print("Generating FC matrix with demographic values: age=60, gender=M, mpo=12, wab=80")
+        try:
+            generated_fc = vae.transform(1, new_demographics, demo_types)
+        except Exception as e:
+            print(f"Error generating new FC matrix: {e}")
+            # Try a fallback approach
+            print("Trying alternative generation approach...")
+            # If a specific gender value is causing issues, use the first gender from the training data
+            new_demographics[1] = np.array([demo_data[1][0]])
+            generated_fc = vae.transform(1, new_demographics, demo_types)
+        reconstructed_fc = vae.transform(X, demo_data, demo_types)
+
+        # Visualize results
+        print("Creating visualizations...")
+        fig = visualize_fc_analysis(X[0], reconstructed_fc[0], generated_fc[0], analysis_results)
+
+        # If requested, return additional data for accuracy calculations
+        if return_data:
+            results = {
+                'vae': vae,
+                'X': X,
+                'latents': latents,
+                'demographics': demographics,
+                'reconstructed_fc': reconstructed_fc,
+                'generated_fc': generated_fc,
+                'analysis_results': analysis_results
+            }
+            return fig, results
+
+        return fig
+
+    except Exception as e:
+        import traceback
+        print(f"Error in run_fc_analysis: {str(e)}")
+        print(traceback.format_exc())
+
+        # Create a dummy figure with an error message
+        import matplotlib.pyplot as plt
+        fig = plt.figure(figsize=(10, 6))
+        plt.text(0.5, 0.5, f"Error: {str(e)}",
+                 horizontalalignment='center', verticalalignment='center',
+                 fontsize=12, color='red')
+        plt.axis('off')
+
+        # Return the error figure and empty results if requested
+        if return_data:
+            return fig, None
+
+        return fig
 
 if __name__ == "__main__":
     import argparse
 
+    parser = argparse.ArgumentParser(description='Run FC Analysis using VAE')
+    parser.add_argument('--data_dir', type=str, default='SreekarB/OSFData',
+                        help='HuggingFace dataset ID or directory containing fMRI data')
+    parser.add_argument('--demographic_file', type=str, default='FC_graph_covariate_data.csv',
                         help='Path to demographic data CSV file')
     parser.add_argument('--latent_dim', type=int, default=32,
                         help='Dimension of latent space')
     parser.add_argument('--nepochs', type=int, default=1000,
     parser.add_argument('--bsize', type=int, default=16,
                         help='Batch size for training')
     parser.add_argument('--no_save', action='store_false',
+                        help='Do not save the model')
+    parser.add_argument('--use_local', action='store_true',
+                        help='Use local data instead of HuggingFace dataset')
 
     args = parser.parse_args()
 
+    fig = run_fc_analysis(
         data_dir=args.data_dir,
         demographic_file=args.demographic_file,
         latent_dim=args.latent_dim,
         nepochs=args.nepochs,
         bsize=args.bsize,
+        save_model=args.no_save,
+        use_hf_dataset=not args.use_local
     )
+    fig.show()
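For reference, the new entry point can be driven from the CLI or programmatically. A minimal usage sketch based on the signature above (argument values are illustrative; the output filename is hypothetical):

# CLI, with the defaults shown explicitly:
#   python main.py --data_dir SreekarB/OSFData --latent_dim 32 --nepochs 1000 --bsize 16
#   python main.py --use_local --data_dir data --demographic_file FC_graph_covariate_data.csv

# Programmatic use, mirroring the __main__ block above:
from main import run_fc_analysis

fig, results = run_fc_analysis(
    data_dir="SreekarB/OSFData",
    latent_dim=32,
    nepochs=1000,
    bsize=16,
    save_model=False,
    return_data=True,   # also returns the dict of arrays and models built above
)
fig.savefig("results_fc_analysis.png")

Note that run_fc_analysis returns a single figure unless return_data=True, in which case it returns (fig, results); on failure it returns an error figure (and None for results).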