import numpy as np
import pandas as pd
from datasets import load_dataset
from nilearn import connectome
from nilearn.maskers import NiftiSpheresMasker  # nilearn.input_data is deprecated; nilearn.maskers is its replacement
from nilearn.image import load_img
import nibabel as nib
import os
def preprocess_fmri_to_fc(dataset_or_niifiles, demo_data=None, demo_types=None):
"""
Process fMRI data to generate functional connectivity matrices
Parameters:
- dataset_or_niifiles: Either a dataset name string or a list of NIfTI files
- demo_data: Optional demographic data, required if providing NIfTI files
- demo_types: Optional demographic data types, required if providing NIfTI files
Returns:
- X: Array of FC matrices
- demo_data: Demographic data
- demo_types: Demographic data types
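
    Example (illustrative sketch; "user/SomeFMRIDataset" is a hypothetical repo id):
        >>> X, demo, types = preprocess_fmri_to_fc("user/SomeFMRIDataset")
        >>> X.shape  # (n_subjects, n_roi_pairs); 264 ROIs give 34,716 pairs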
"""
print(f"Preprocessing data with type: {type(dataset_or_niifiles)}")
# For SreekarB/OSFData dataset, the data will be loaded from dataset features
if isinstance(dataset_or_niifiles, str):
dataset_name = dataset_or_niifiles
print(f"Loading data from dataset: {dataset_name}")
try:
# Try multiple approaches to load the dataset
approaches = [
lambda: load_dataset(dataset_name, split="train"),
lambda: load_dataset(dataset_name), # Try without split
lambda: load_dataset(dataset_name, split="train", trust_remote_code=True), # Try with trust_remote_code
                (lambda: load_dataset(dataset_name.split("/")[-1], split="train")) if "/" in dataset_name else None  # condition outside the lambda so a skipped approach stays None
]
dataset = None
last_error = None
for i, approach in enumerate(approaches):
if approach is None:
continue
try:
print(f"Attempt {i+1} to load dataset...")
dataset = approach()
print(f"Successfully loaded dataset with approach {i+1}!")
break
except Exception as e:
print(f"Attempt {i+1} failed: {e}")
last_error = e
if dataset is None:
print(f"All attempts to load dataset failed. Last error: {last_error}")
raise ValueError(f"Could not load dataset {dataset_name}")
except Exception as e:
print(f"Error during dataset loading: {e}")
raise
# Prepare demographics data from the dataset
if demo_data is None:
# Create demo_data from the dataset
demo_df = pd.DataFrame({
'age': dataset['age'],
'gender': dataset['gender'],
'mpo': dataset['mpo'],
'wab_aq': dataset['wab_aq']
})
demo_data = [
demo_df['age'].values,
demo_df['gender'].values,
demo_df['mpo'].values,
demo_df['wab_aq'].values
]
demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
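            # demo_types tags each column so downstream code can treat the
            # categorical variable (gender) differently from the continuous
            # ones (age, months post-onset, WAB aphasia quotient).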
# Look for NIfTI files in P01_rs.nii format
print("Searching for NIfTI files in dataset columns...")
nii_files = []
# Create a temp directory for downloads
import tempfile
from huggingface_hub import hf_hub_download
import shutil
temp_dir = tempfile.mkdtemp(prefix="hf_nifti_")
print(f"Created temporary directory for NIfTI files: {temp_dir}")
try:
# First approach: Check if there are any columns containing file paths
nii_columns = []
for col in dataset.column_names:
# Check if column name suggests NIfTI files
if 'nii' in col.lower() or 'nifti' in col.lower() or 'fmri' in col.lower():
nii_columns.append(col)
# Or check if column contains file paths
elif len(dataset) > 0:
first_val = dataset[0][col]
if isinstance(first_val, str) and (first_val.endswith('.nii') or first_val.endswith('.nii.gz')):
nii_columns.append(col)
if nii_columns:
print(f"Found columns that may contain NIfTI files: {nii_columns}")
for col in nii_columns:
print(f"Processing column '{col}'...")
for i, item in enumerate(dataset[col]):
if not isinstance(item, str):
print(f"Item {i} in column {col} is not a string but {type(item)}")
continue
if not (item.endswith('.nii') or item.endswith('.nii.gz')):
print(f"Item {i} in column {col} is not a NIfTI file: {item}")
continue
print(f"Downloading {item} from dataset {dataset_name}...")
try:
# Attempt to download with explicit filename
file_path = hf_hub_download(
repo_id=dataset_name,
filename=item,
repo_type="dataset",
cache_dir=temp_dir
)
nii_files.append(file_path)
print(f"β Successfully downloaded {item}")
except Exception as e1:
print(f"Error downloading with explicit filename: {e1}")
# Second attempt: try with the item's basename
try:
basename = os.path.basename(item)
print(f"Trying with basename: {basename}")
file_path = hf_hub_download(
repo_id=dataset_name,
filename=basename,
repo_type="dataset",
cache_dir=temp_dir
)
nii_files.append(file_path)
print(f"β Successfully downloaded {basename}")
except Exception as e2:
print(f"Error downloading with basename: {e2}")
# Third attempt: check if it's a binary blob in the dataset
try:
if hasattr(dataset[i], 'keys') and 'bytes' in dataset[i]:
print("Found binary data in dataset, saving to temporary file...")
binary_data = dataset[i]['bytes']
temp_file = os.path.join(temp_dir, basename)
with open(temp_file, 'wb') as f:
f.write(binary_data)
nii_files.append(temp_file)
print(f"β Saved binary data to {temp_file}")
except Exception as e3:
print(f"Error handling binary data: {e3}")
# Last resort: look for the file locally
local_path = os.path.join(os.getcwd(), item)
                                    if os.path.exists(local_path):
                                        nii_files.append(local_path)
                                        print(f"✓ Found {item} locally")
                                    else:
                                        print(f"✗ Warning: Could not find {item} anywhere")
# Second approach: Try to find NIfTI files in dataset repository directly
if not nii_files:
print("No NIfTI files found in dataset columns. Trying direct repository search...")
try:
from huggingface_hub import list_repo_files, hf_hub_download
# Try to list all files in the repository
try:
print("Listing all repository files...")
all_repo_files = list_repo_files(dataset_name, repo_type="dataset")
print(f"Found {len(all_repo_files)} files in repository")
# First prioritize P*_rs.nii files
p_rs_files = [f for f in all_repo_files if f.endswith('_rs.nii') and f.startswith('P')]
# Then include all other NIfTI files
other_nii_files = [f for f in all_repo_files if (f.endswith('.nii') or f.endswith('.nii.gz')) and f not in p_rs_files]
# Combine, with P*_rs.nii files first
nii_repo_files = p_rs_files + other_nii_files
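                        # P*_rs.nii files (per-subject resting-state scans) come first,
                        # since that is the naming scheme this dataset is expected to use.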
if nii_repo_files:
print(f"Found {len(nii_repo_files)} NIfTI files in repository: {nii_repo_files[:5] if len(nii_repo_files) > 5 else nii_repo_files}...")
# Download each file
for nii_file in nii_repo_files:
try:
file_path = hf_hub_download(
repo_id=dataset_name,
filename=nii_file,
repo_type="dataset",
cache_dir=temp_dir
)
nii_files.append(file_path)
print(f"β Downloaded {nii_file}")
except Exception as e:
print(f"Error downloading {nii_file}: {e}")
except Exception as e:
print(f"Error listing repository files: {e}")
print("Will try alternative approaches...")
# If repo listing fails, try with common NIfTI file patterns directly
if not nii_files:
print("Trying common NIfTI file patterns...")
# Focus specifically on P*_rs.nii pattern
patterns = []
# Generate P01_rs.nii through P30_rs.nii
for i in range(1, 31): # Try subjects 1-30
patterns.append(f"P{i:02d}_rs.nii")
# Also try with .nii.gz extension
for i in range(1, 31):
patterns.append(f"P{i:02d}_rs.nii.gz")
# Include a few other common patterns as fallbacks
patterns.extend([
"sub-01_task-rest_bold.nii.gz", # BIDS format
"fmri.nii.gz", "bold.nii.gz",
"rest.nii.gz"
])
for pattern in patterns:
try:
print(f"Trying to download {pattern}...")
file_path = hf_hub_download(
repo_id=dataset_name,
filename=pattern,
repo_type="dataset",
cache_dir=temp_dir
)
nii_files.append(file_path)
print(f"β Successfully downloaded {pattern}")
except Exception as e:
print(f"Γ Failed to download {pattern}")
# If we still couldn't find any files, check if data files are nested
if not nii_files:
print("Checking for nested data files...")
nested_paths = ["data/", "raw/", "nii/", "derivatives/", "fmri/", "nifti/"]
for path in nested_paths:
for pattern in patterns:
nested_file = f"{path}{pattern}"
try:
print(f"Trying to download {nested_file}...")
file_path = hf_hub_download(
repo_id=dataset_name,
filename=nested_file,
repo_type="dataset",
cache_dir=temp_dir
)
nii_files.append(file_path)
print(f"β Successfully downloaded {nested_file}")
# If we found one file in this directory, try to find all files in it
try:
all_files_in_dir = [f for f in all_repo_files if f.startswith(path)]
nii_files_in_dir = [f for f in all_files_in_dir if f.endswith('.nii') or f.endswith('.nii.gz')]
print(f"Found {len(nii_files_in_dir)} additional NIfTI files in {path}")
for nii_file in nii_files_in_dir:
if nii_file != nested_file: # Skip the one we already downloaded
try:
file_path = hf_hub_download(
repo_id=dataset_name,
filename=nii_file,
repo_type="dataset",
cache_dir=temp_dir
)
nii_files.append(file_path)
print(f"β Downloaded {nii_file}")
except Exception as e:
print(f"Error downloading {nii_file}: {e}")
except Exception as e:
print(f"Error finding additional files in {path}: {e}")
except Exception as e:
pass
except Exception as e:
print(f"Error during repository exploration: {e}")
# If we still don't have any files, try to search for P*_rs.nii pattern specifically
if not nii_files:
print("Trying to find files matching P*_rs.nii pattern specifically...")
try:
# List all files in the repository (if we haven't already)
                    if 'all_repo_files' not in locals():
from huggingface_hub import list_repo_files
try:
all_repo_files = list_repo_files(dataset_name, repo_type="dataset")
except Exception as e:
print(f"Error listing repo files: {e}")
all_repo_files = []
# Look for files matching the pattern exactly (P*_rs.nii)
pattern_files = [f for f in all_repo_files if '_rs.nii' in f and f.startswith('P')]
# If we don't find any exact matches, try a more relaxed pattern
if not pattern_files:
pattern_files = [f for f in all_repo_files if 'rs.nii' in f.lower()]
if pattern_files:
print(f"Found {len(pattern_files)} files matching rs.nii pattern")
# Download each file
for pattern_file in pattern_files:
try:
file_path = hf_hub_download(
repo_id=dataset_name,
filename=pattern_file,
repo_type="dataset",
cache_dir=temp_dir
)
nii_files.append(file_path)
print(f"β Downloaded {pattern_file}")
except Exception as e:
print(f"Error downloading {pattern_file}: {e}")
except Exception as e:
print(f"Error searching for pattern files: {e}")
print(f"Found total of {len(nii_files)} NIfTI files")
except Exception as e:
print(f"Unexpected error during NIfTI file search: {e}")
import traceback
traceback.print_exc()
# If we found NIfTI files, process them to FC matrices
if nii_files:
print(f"Found {len(nii_files)} NIfTI files, converting to FC matrices")
# Load Power 264 atlas
from nilearn import datasets
power = datasets.fetch_coords_power_2011()
coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
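        # coords is a (264, 3) array of MNI coordinates, one per Power-atlas ROI;
        # the spheres masker below averages the BOLD signal inside a sphere
        # centered on each coordinate, giving one time series per ROI.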
        masker = NiftiSpheresMasker(
coords, radius=5,
standardize=True,
memory='nilearn_cache', memory_level=1,
verbose=0,
detrend=True,
low_pass=0.1,
high_pass=0.01,
t_r=2.0 # Adjust TR according to your data
)
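        # Detrending plus a 0.01-0.1 Hz band-pass is a standard resting-state
        # choice; t_r=2.0 s is an assumption and must match the acquisition TR.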
# Process fMRI data and compute FC matrices
fc_matrices = []
valid_files = 0
total_files = len(nii_files)
for nii_file in nii_files:
try:
print(f"Processing {nii_file}...")
fmri_img = load_img(nii_file)
# Check image dimensions
if len(fmri_img.shape) < 4 or fmri_img.shape[3] < 10:
print(f"Warning: {nii_file} has insufficient time points: {fmri_img.shape}")
continue
try:
# Explicitly handle warnings about empty spheres
import warnings
with warnings.catch_warnings():
warnings.filterwarnings('ignore', message='.*empty.*')
time_series = masker.fit_transform(fmri_img)
except Exception as e:
if "empty" in str(e):
print(f"Warning: Some spheres are empty in {nii_file}. Using a different sphere radius.")
# Extract the list of empty spheres for logging
import re
empty_spheres = re.findall(r"\[(.*?)\]", str(e))
if empty_spheres:
print(f"Empty spheres: {empty_spheres[0]}")
# Try with a different radius
                            alternate_masker = NiftiSpheresMasker(
coords, radius=8, # Larger radius
standardize=True,
memory='nilearn_cache', memory_level=1,
verbose=0,
detrend=True,
low_pass=0.1,
high_pass=0.01,
t_r=2.0
)
try:
time_series = alternate_masker.fit_transform(fmri_img)
print(f"Successfully extracted time series with larger radius")
except Exception as e2:
print(f"Error with alternate masker: {e2}")
print(f"Skipping this file due to empty spheres")
continue # Skip this file entirely
else:
print(f"Unknown error in masker: {e}")
continue # Skip this file if there's any other error
# Validate time series data
if np.isnan(time_series).any() or np.isinf(time_series).any():
print(f"Warning: {nii_file} contains NaN or Inf values after masking")
# Replace NaNs with zeros for this file
time_series = np.nan_to_num(time_series)
correlation_measure = connectome.ConnectivityMeasure(
kind='correlation',
vectorize=False,
discard_diagonal=False
)
fc_matrix = correlation_measure.fit_transform([time_series])[0]
# Check for invalid correlation values
if np.isnan(fc_matrix).any():
print(f"Warning: {nii_file} produced NaN correlation values")
continue
triu_indices = np.triu_indices_from(fc_matrix, k=1)
fc_triu = fc_matrix[triu_indices]
# Fisher z-transform with proper bounds check
# Clip correlation values to valid range for arctanh
fc_triu_clipped = np.clip(fc_triu, -0.999, 0.999)
fc_triu = np.arctanh(fc_triu_clipped)
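                # Fisher z: z = arctanh(r) = 0.5 * ln((1 + r) / (1 - r)). Clipping keeps
                # |r| < 1 so arctanh stays finite; the transform makes correlation values
                # approximately normally distributed, which helps downstream modeling.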
fc_matrices.append(fc_triu)
valid_files += 1
print(f"Successfully processed {nii_file} to FC matrix")
except Exception as e:
print(f"Error processing {nii_file}: {e}")
if fc_matrices:
print(f"Successfully processed {valid_files} out of {total_files} files")
                # Ensure all FC vectors have the same length (each entry is a
                # vectorized upper triangle, so shape[0] is the edge count)
                dims = [m.shape[0] for m in fc_matrices]
                if len(set(dims)) > 1:
                    print(f"Warning: FC vectors have inconsistent lengths: {dims}")
# Use the most common dimension
from collections import Counter
most_common_dim = Counter(dims).most_common(1)[0][0]
print(f"Using most common dimension: {most_common_dim}")
fc_matrices = [m for m in fc_matrices if m.shape[0] == most_common_dim]
X = np.array(fc_matrices)
# Normalize the FC data
mean_x = np.mean(X, axis=0)
std_x = np.std(X, axis=0)
# Handle zero standard deviation
std_x[std_x == 0] = 1.0
X = (X - mean_x) / std_x
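                # Each FC edge is now z-scored across subjects (zero mean, unit
                # variance per feature), matching the normalization in the other branches.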
print(f"Created FC matrices with shape {X.shape}")
# Make sure demo_data matches the number of FC matrices
if len(demo_data[0]) != X.shape[0]:
print(f"Warning: Number of subjects in demographic data ({len(demo_data[0])}) " +
f"doesn't match number of FC matrices ({X.shape[0]})")
# Adjust demo_data to match FC matrices
indices = list(range(min(len(demo_data[0]), X.shape[0])))
X = X[indices]
demo_data = [d[indices] for d in demo_data]
return X, demo_data, demo_types
print("No FC or fMRI data found in the dataset. Please provide FC matrices.")
# Return a placeholder with the right demographics but empty FC
n_subjects = len(dataset)
n_rois = 264
fc_dim = (n_rois * (n_rois - 1)) // 2
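        # With n_rois = 264, fc_dim = 264 * 263 / 2 = 34,716 upper-triangle entries.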
X = np.zeros((n_subjects, fc_dim))
print(f"Created placeholder FC matrices with shape {X.shape}")
return X, demo_data, demo_types
    elif isinstance(dataset_or_niifiles, str):
        # Handle a dataset that exposes a 'nii_files' column directly.
        # NOTE: unreachable in practice, since every path in the string branch
        # above returns; kept as a simpler reference implementation.
dataset = load_dataset(dataset_or_niifiles, split="train")
# Load Power 264 atlas
from nilearn import datasets
power = datasets.fetch_coords_power_2011()
coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
        masker = NiftiSpheresMasker(
coords, radius=5,
standardize=True,
memory='nilearn_cache', memory_level=1,
verbose=0,
detrend=True,
low_pass=0.1,
high_pass=0.01,
t_r=2.0 # Adjust TR according to your data
)
# Load demographic data if needed
if demo_data is None:
if 'demographics' in dataset.features:
demo_df = pd.DataFrame(dataset['demographics'])
demo_data = [
demo_df['age_at_stroke'].values if 'age_at_stroke' in demo_df.columns else [],
demo_df['sex'].values if 'sex' in demo_df.columns else [],
demo_df['months_post_stroke'].values if 'months_post_stroke' in demo_df.columns else [],
demo_df['wab_score'].values if 'wab_score' in demo_df.columns else []
]
demo_types = ['continuous', 'categorical', 'continuous', 'continuous']
# Process fMRI data and compute FC matrices
fc_matrices = []
for nii_file in dataset['nii_files']:
fmri_img = load_img(nii_file)
time_series = masker.fit_transform(fmri_img)
correlation_measure = connectome.ConnectivityMeasure(
kind='correlation', vectorize=False, discard_diagonal=False
)
fc_matrix = correlation_measure.fit_transform([time_series])[0]
triu_indices = np.triu_indices_from(fc_matrix, k=1)
fc_triu = fc_matrix[triu_indices]
            fc_triu = np.arctanh(np.clip(fc_triu, -0.999, 0.999))  # Fisher z-transform (clip so arctanh stays finite)
fc_matrices.append(fc_triu)
X = np.array(fc_matrices)
elif isinstance(dataset_or_niifiles, list) and demo_data is not None and demo_types is not None:
# Handle a list of NIfTI files
# Similar processing as above but with local files
print(f"Processing {len(dataset_or_niifiles)} local NIfTI files")
# Load Power 264 atlas
from nilearn import datasets
power = datasets.fetch_coords_power_2011()
coords = np.vstack((power.rois['x'], power.rois['y'], power.rois['z'])).T
        masker = NiftiSpheresMasker(
coords, radius=5,
standardize=True,
memory='nilearn_cache', memory_level=1,
verbose=0,
detrend=True,
low_pass=0.1,
high_pass=0.01,
t_r=2.0
)
fc_matrices = []
for nii_file in dataset_or_niifiles:
fmri_img = load_img(nii_file)
time_series = masker.fit_transform(fmri_img)
correlation_measure = connectome.ConnectivityMeasure(
kind='correlation', vectorize=False, discard_diagonal=False
)
fc_matrix = correlation_measure.fit_transform([time_series])[0]
triu_indices = np.triu_indices_from(fc_matrix, k=1)
fc_triu = fc_matrix[triu_indices]
            fc_triu = np.arctanh(np.clip(fc_triu, -0.999, 0.999))  # Fisher z-transform (clip so arctanh stays finite)
fc_matrices.append(fc_triu)
X = np.array(fc_matrices)
else:
raise ValueError("Invalid input. Expected dataset name string or list of NIfTI files with demographic data.")
    # Normalize the FC data (guard zero-variance features to avoid division by zero)
    std_x = np.std(X, axis=0)
    std_x[std_x == 0] = 1.0
    X = (X - np.mean(X, axis=0)) / std_x
    return X, demo_data, demo_types
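

# Minimal usage sketch, not part of the processing pipeline: "SreekarB/OSFData"
# is the dataset this script targets (see above); calling with a list of local
# NIfTI paths instead would also require demo_data and demo_types.
if __name__ == "__main__":
    X, demo_data, demo_types = preprocess_fmri_to_fc("SreekarB/OSFData")
    print(f"FC feature matrix: {X.shape}; demographic types: {demo_types}")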