ivangzf commited on
Commit
b78c3b8
·
1 Parent(s): 33242c6

add multitap files

Browse files
app.py ADDED
@@ -0,0 +1,485 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # give some time reference to the user
2
+ print('Importing Gradio app packages... (first launch takes about 3-5 minutes)')
3
+
4
+ import gradio as gr
5
+ import yaml
6
+ import skimage
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ from matplotlib.pyplot import cm
10
+ import plotly.express as px
11
+ import plotly.graph_objs as go
12
+ from plotly.subplots import make_subplots
13
+ import os
14
+ import seaborn as sns
15
+
16
+ from cytof import classes
17
+ from classes import CytofImage, CytofCohort, CytofImageTiff
18
+ from cytof.hyperion_preprocess import cytof_read_data_roi
19
+ from cytof.utils import show_color_table
20
+
21
+ OUTDIR = './output'
22
+
23
def cytof_tiff_eval(file_path, marker_path, cytof_state):
    """Load an uploaded ROI file, resolve its markers, and render per-channel previews.

    Args:
        file_path: path of the uploaded TXT/CSV/TIFF file containing ROI data.
        marker_path: optional marker file (None when markers are embedded in
            the TXT/CSV upload itself).
        cytof_state: Gradio state slot for the current CytofImage (replaced here).

    Returns:
        (info message, channel-check figure, updated CytofImage state).
    """
    # Generic slide/ROI names: uploaded filenames are unpredictable.
    slide = 'slide0'
    roi = 'roi1'

    # read in the data
    cytof_img, _ = cytof_read_data_roi(file_path, slide, roi)

    if marker_path is None:
        # Case 1: TXT/CSV upload -- markers are embedded in the data itself.
        cytof_img.get_markers()

        # preprocess and derive the image from the dataframe
        cytof_img.preprocess()
        cytof_img.get_image()
    else:
        # Case 2: TIFF upload -- markers come from the separate marker file.
        # BUG FIX: use a context manager so the file handle is closed; the
        # original `yaml.load(open(marker_path, "rb"), ...)` leaked it.
        with open(marker_path, "rb") as f:
            labels_markers = yaml.load(f, Loader=yaml.Loader)
        cytof_img.set_markers(**labels_markers)

    viz = cytof_img.check_channels(ncols=3, savedir='.')

    msg = f'Your uploaded TIFF has {len(cytof_img.markers)} markers'
    cytof_state = cytof_img

    return msg, viz, cytof_state
51
+
52
+
53
def channel_select(cytof_img):
    """Build the three channel dropdowns (unwanted / nuclei / membrane).

    Each dropdown offers every channel of *cytof_img* and allows multiple
    selections.
    """
    dropdowns = [
        gr.Dropdown(choices=cytof_img.channels, multiselect=True)
        for _ in range(3)
    ]
    return tuple(dropdowns)
56
+
57
def nuclei_select(cytof_img):
    """Build the nuclei and membrane dropdowns from the image's channels.

    Both dropdowns offer every channel and allow multiple selections.
    """
    nuclei_dd, membrane_dd = (
        gr.Dropdown(choices=cytof_img.channels, multiselect=True)
        for _ in range(2)
    )
    return nuclei_dd, membrane_dd
60
+
61
def modify_channels(cytof_img, unwanted_channels, nuc_channels, mem_channels):
    """Apply the user's channel edits in three steps.

    1) remove unwanted channels, 2) define the combined nuclei channel,
    3) define the combined membrane channel. Works on a copy so the original
    (pre-modification) state stays reusable. Returns a feedback message and
    the updated image.
    """
    updated = cytof_img.copy()
    updated.remove_special_channels(unwanted_channels)

    # Nuclei: define the combined channel, then drop its source channels.
    channels_rm = updated.define_special_channels({'nuclei': nuc_channels})
    updated.remove_special_channels(channels_rm)

    # Membrane: define the combined channel but keep the source channels.
    updated.define_special_channels({'membrane': mem_channels})

    # CytofImageTiff already carries an image attribute; only rebuild from the
    # dataframe for the plain CytofImage type (exact type check is deliberate).
    if type(updated) is CytofImage:
        updated.get_image()

    nuclei_channel_str = ', '.join(channels_rm)
    membrane_channel_str = ', '.join(mem_channels)
    msg = (
        f"Your remaining channels are: {', '.join(updated.channels)}."
        f"\n\n Nuclei channels: {nuclei_channel_str}."
        f"\n\n Membrane channels: {membrane_channel_str}"
    )
    return msg, updated
86
+
87
def update_dropdown_options(cytof_img, selected_self, selected_other1, selected_other2):
    """Hide options already selected in any dropdown from the other two dropdowns.

    Args:
        cytof_img: the CytofImage whose `channels` list is the option universe.
        selected_self: current selection of the dropdown that fired the event.
        selected_other1 / selected_other2: current selections of the other two.

    Returns:
        Updated Dropdown components for the two *other* dropdowns; each keeps
        its own selection both available and selected.
    """
    # Everything taken by any of the three dropdowns.
    taken = set(selected_self) | set(selected_other1) | set(selected_other2)

    # Preserve original channel order while filtering. Using membership tests
    # (instead of list.remove) also avoids the ValueError the original raised
    # when a selected value was no longer among the channels.
    available = [ch for ch in cytof_img.channels if ch not in taken]

    return (
        gr.Dropdown(choices=available + selected_other1, value=selected_other1, multiselect=True),
        gr.Dropdown(choices=available + selected_other2, value=selected_other2, multiselect=True),
    )
97
+
98
+
99
def cell_seg(cytof_img, radius):
    """Run nuclei/cell segmentation and return an interactive overlay figure.

    Args:
        cytof_img: CytofImage with channels (and optionally 'membrane') defined.
        radius: cell radius in pixels used by the segmentation.

    Returns:
        (plotly figure of the cell-segmentation overlay, updated image).
    """
    # Membrane-aware segmentation only when a 'membrane' channel was defined.
    use_membrane = 'membrane' in cytof_img.channels
    nuclei_seg, cells_seg = cytof_img.get_seg(use_membrane=use_membrane, radius=radius, show_process=False)

    # Render boundary overlays for both segmentation types.
    marked_image_nuclei = cytof_img.visualize_seg(segtype="nuclei", show=False)
    marked_image_cell = cytof_img.visualize_seg(segtype="cell", show=False)

    # The overlay also shows nuclei/membrane plus the first marker channel.
    marker_visualized = cytof_img.channels[0]

    # Equivalent of plt.imshow(), but interactive.
    fig = px.imshow(marked_image_cell)

    # Dummy single-point scatter traces act as a color legend for the overlay.
    legend_entries = (
        ('white', 'membrane boundaries'),
        ('yellow', 'nucleus boundaries'),
        ('red', 'nucleus'),
        ('green', marker_visualized),
    )
    for color, label in legend_entries:
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers',
                                 marker=dict(color=color), name=label))
    fig.update_layout(legend=dict(orientation="v", bgcolor='lightgray'))

    return fig, cytof_img
123
+
124
def feature_extraction(cytof_img, cohort_state, percentile_threshold):
    """Extract per-cell features, normalize them, and wrap the ROI in a cohort.

    Args:
        cytof_img: segmented CytofImage to extract features from.
        cohort_state: Gradio state slot for the cohort (replaced by the return).
        percentile_threshold: quantile used for feature normalization.

    Returns:
        (updated image, CytofCohort built from this single ROI, normalized
        feature DataFrame for display).
    """
    # Extract all features, then quantile-normalize at the requested percentile.
    cytof_img.extract_features(filename=cytof_img.filename)
    cytof_img.feature_quantile_normalization(qs=[percentile_threshold])

    # Persist the normalized features (exist_ok avoids the racy isdir check).
    os.makedirs(OUTDIR, exist_ok=True)
    attr_name = f"df_feature_{percentile_threshold}normed"
    cytof_img.export_feature(attr_name, os.path.join(OUTDIR, f"feature_{percentile_threshold}normed.csv"))
    df_feature = getattr(cytof_img, attr_name)

    # Drop the filename column: every Gradio upload shares the same temp name,
    # and the temp path is too long to display nicely.
    df_feature = df_feature.loc[:, df_feature.columns != 'filename']

    # Quantiles between each marker and cell (used by later analyses).
    cytof_img.calculate_quantiles(qs=[75])

    # Wrap the single ROI as a cohort to enable the cohort-level downstream steps.
    dict_cytof_img = {f"{cytof_img.slide}_{cytof_img.roi}": cytof_img}
    cytof_cohort = CytofCohort(cytof_images=dict_cytof_img, dir_out=OUTDIR)
    cytof_cohort.batch_process_feature()
    cytof_cohort.generate_summary()

    # (The original also built an unused status message and rebound
    # cohort_state locally; both were dead code and are removed.)
    return cytof_img, cytof_cohort, df_feature
154
+
155
def co_expression(cytof_img, percentile_threshold):
    """Cluster-map of log10(observed/expected) marker co-expression for one ROI.

    Returns the matplotlib Figure of the clustermap and the (unchanged) image.
    """
    feat_name = f"{percentile_threshold}normed"
    df_co_pos_prob, df_expected_prob = cytof_img.roi_co_expression(
        feature_name=feat_name, accumul_type='sum', return_components=False)

    eps = 1e-6  # guards against division by zero and log(0)

    # Log odds ratio of observed vs expected co-positive probability.
    log_ratio = np.log10(df_co_pos_prob.values / (df_expected_prob.values + eps) + eps)

    # observed == 0 produces exactly log10(eps); that means the co-expression
    # is undetermined, not strongly negative, so neutralize those entries.
    log_ratio[log_ratio == np.log10(eps)] = 0

    # Strip the feature suffix for readable axis labels.
    marker_labels = [m.replace('_cell_sum', '') for m in df_expected_prob.columns]

    grid = sns.clustermap(log_ratio,
                          center=np.log10(1 + eps), cmap='RdBu_r', vmin=-1, vmax=3,
                          xticklabels=marker_labels, yticklabels=marker_labels)

    # clustermap returns a ClusterGrid; pull out the underlying Figure.
    fig = grid.ax_heatmap.get_figure()

    return fig, cytof_img
180
+
181
def spatial_interaction(cytof_img, percentile_threshold, method, cluster_threshold):
    """Cluster-map of log10(observed/expected) spatial interaction between markers.

    Args:
        cytof_img: CytofImage with extracted features.
        percentile_threshold: quantile used for the normalized feature set.
        method: neighborhood definition ('k-neighbor' or 'distance').
        cluster_threshold: neighbor count or distance cutoff for *method*.

    Returns the matplotlib Figure of the clustermap and the (unchanged) image.
    """
    feat_name = f"{percentile_threshold}normed"

    df_expected_prob, df_cell_interaction_prob = cytof_img.roi_interaction_graphs(
        feature_name=feat_name, accumul_type='sum', method=method, threshold=cluster_threshold)

    eps = 1e-6  # guards against division by zero and log(0)

    # Log odds ratio of observed vs expected neighbor-pair probability.
    log_ratio = np.log10(df_cell_interaction_prob.values / (df_expected_prob.values + eps) + eps)

    # observed == 0 produces exactly log10(eps); that means the interaction is
    # undetermined, not strongly negative, so neutralize those entries.
    log_ratio[log_ratio == np.log10(eps)] = 0

    # Strip the feature suffix for readable axis labels.
    marker_labels = [m.replace('_cell_sum', '') for m in df_expected_prob.columns]

    grid = sns.clustermap(log_ratio,
                          center=np.log10(1 + eps), cmap='bwr', vmin=-2, vmax=2,
                          xticklabels=marker_labels, yticklabels=marker_labels)

    # clustermap returns a ClusterGrid; pull out the underlying Figure.
    fig = grid.ax_heatmap.get_figure()

    return fig, cytof_img
207
+
208
def get_marker_pos_options(cytof_img):
    """Marker choices for positive-cell visualization.

    Offers every channel except the synthetic 'nuclei' and (optional)
    'membrane' channels; returns the two marker dropdowns.
    """
    options = cytof_img.channels.copy()

    # 'nuclei' always exists once channels have been defined.
    options.remove('nuclei')

    # 'membrane' is optional; ignore it if absent.
    if 'membrane' in options:
        options.remove('membrane')

    return gr.Dropdown(choices=options, interactive=True), gr.Dropdown(choices=options, interactive=True)
221
+
222
def viz_pos_marker_pair(cytof_img, marker1, marker2, percentile_threshold):
    """Side-by-side positive-cell maps for two markers with synced pan/zoom.

    Returns a plotly figure with one subplot per marker; cells positive for
    the marker are drawn in green, negative in blue, boundaries in black.
    """
    stained_cells = []
    for marker in (marker1, marker2):
        _, stain_cell, _ = cytof_img.visualize_marker_positive(
            marker=marker,
            feature_type="normed",
            accumul_type="sum",
            normq=percentile_threshold,
            show_boundary=True,
            color_list=[(0, 0, 1), (0, 1, 0)],  # negative, positive
            color_bound=(0, 0, 0),
            show_colortable=False)
        stained_cells.append(stain_cell)

    fig = make_subplots(rows=1, cols=2, shared_xaxes=True, shared_yaxes=True,
                        subplot_titles=(f"positive {marker1} cells", f"positive {marker2} cells"))
    for col, stained in enumerate(stained_cells, start=1):
        fig.add_trace(px.imshow(stained).data[0], row=1, col=col)

    # Keep the two panels locked to the same pan/zoom state.
    fig.update_xaxes(matches='x')
    fig.update_yaxes(matches='y')
    fig.update_layout(title_text=" ")

    return fig
255
+
256
def phenograph(cytof_cohort):
    """Run PhenoGraph clustering on the cohort and return its UMAP figure."""
    key_pheno = cytof_cohort.clustering_phenograph()

    df_feats, commus, cluster_protein_exps, figs, figs_scatter, figs_exps = cytof_cohort.vis_phenograph(
        key_pheno=key_pheno,
        level="cohort",
        save_vis=False,
        show_plots=False,
        plot_together=False)

    umap = figs_scatter['cohort']
    # NOTE(review): looked up but never returned; kept for parity with the
    # original behavior (the lookup raises if the key is missing).
    expression = figs_exps['cohort']['cell_sum']

    return umap, cytof_cohort
270
+
271
def cluster_interaction_fn(cytof_img, cytof_cohort):
    """Heatmap of spatial interaction between PhenoGraph clusters for this slide.

    Returns:
        (matplotlib Figure of the per-slide interaction clustermap,
        unchanged image, unchanged cohort).
    """
    # Reuse the already-computed PhenoGraph run instead of re-clustering;
    # the cohort is guaranteed to hold exactly one.
    key_pheno = list(cytof_cohort.phenograph.keys())[0]

    epsilon = 1e-6
    interacts, _ = cytof_cohort.cluster_interaction_analysis(key_pheno)
    interact = interacts[cytof_img.slide]
    clustergrid_interaction = sns.clustermap(interact, center=np.log10(1 + epsilon),
                                             cmap='RdBu_r', vmin=-1, vmax=1,
                                             xticklabels=np.arange(interact.shape[0]),
                                             yticklabels=np.arange(interact.shape[0]))

    # BUG FIX: return the figure of the clustermap drawn above. The original
    # pulled the figure from the grid returned by cluster_interaction_analysis,
    # so the per-slide plot built here was never the one displayed.
    fig = clustergrid_interaction.ax_heatmap.get_figure()

    return fig, cytof_img, cytof_cohort
287
+
288
def get_cluster_pos_options(cytof_img):
    """Marker choices for the cluster-vs-positive comparison dropdown.

    Offers every channel except the synthetic 'nuclei' and (optional)
    'membrane' channels.
    """
    options = cytof_img.channels.copy()

    # 'nuclei' always exists once channels have been defined.
    options.remove('nuclei')

    # 'membrane' is optional; ignore it if absent.
    if 'membrane' in options:
        options.remove('membrane')

    return gr.Dropdown(choices=options, interactive=True)
301
+
302
def viz_cluster_positive(marker, percentile_threshold, cytof_img, cytof_cohort):
    """Compare marker-positive cells with PhenoGraph clusters side by side.

    Returns a two-panel plotly figure (positive cells for *marker* on the
    left, cluster assignments on the right), plus the image and cohort.
    """
    # Reuse the single existing PhenoGraph run rather than clustering again.
    key_pheno = list(cytof_cohort.phenograph.keys())[0]

    # Left panel: cells positive for the selected marker.
    _, stain_cell_marker, _ = cytof_img.visualize_marker_positive(
        marker=marker,
        feature_type="normed",
        accumul_type="sum",
        normq=percentile_threshold,
        show_boundary=True,
        color_list=[(0, 0, 1), (0, 1, 0)],  # negative, positive
        color_bound=(0, 0, 0),
        show_colortable=False)

    # Attach cohort-level PhenoGraph labels back onto the individual ROI.
    cytof_cohort.attach_individual_roi_pheno(key_pheno, override=True)

    # Right panel: per-cell cluster assignment.
    _, pheno_stain_cell, _ = cytof_img.visualize_pheno(key_pheno=key_pheno)

    fig = make_subplots(rows=1, cols=2, shared_xaxes=True, shared_yaxes=True,
                        subplot_titles=(f"positive {marker} cells", "PhenoGraph clusters on cells"))
    fig.add_trace(px.imshow(stain_cell_marker).data[0], row=1, col=1)
    fig.add_trace(px.imshow(pheno_stain_cell).data[0], row=1, col=2)

    # Keep both panels locked to the same pan/zoom state.
    fig.update_xaxes(matches='x')
    fig.update_yaxes(matches='y')
    fig.update_layout(title_text=" ")

    return fig, cytof_img, cytof_cohort
335
+
336
# Gradio App template: wires the processing functions above into a step-by-step UI.
# Two gr.State slots hold the working image: `cytof_original_state` keeps the
# as-uploaded channels so channel definition can be redone; `cytof_state` holds
# the modified image used by every later step.
with gr.Blocks() as demo:
    cytof_state = gr.State(CytofImage())

    # used in scenarios where users define/remove channels multiple times
    cytof_original_state = gr.State(CytofImage())

    gr.Markdown("# Step 1. Upload images")
    gr.Markdown('You may upload one or two files depending on your use case.')
    gr.Markdown('Case 1: A single TXT or CSV file that contains information about antibodies, rare heavy metal isotopes, and image channel names. Make sure files are following the CyTOF, IMC, or multiplex data convention. Leave the `Marker File` upload section blank.')
    gr.Markdown('Case 2: Multiple file uploads required. First, a TIFF file containing Regions of Interest (ROIs) stored as multiplexed images. Then, upload a `Marker File` listing the channels to identify the antibodies.')

    with gr.Row():  # first row: 1) asks for TIFF upload and 2) displays marker info
        img_path = gr.File(file_types=[".tiff", '.tif', '.txt', '.csv'], label='(Required) A file containing Regions of Interest (ROIs) of multiplexed imaging slides.')
        img_info = gr.Textbox(label='Marker information', info='Ensure the number of markers displayed below matches the expected number.')

    with gr.Row(equal_height=True):  # second row: 1) asks for marker file upload and 2) displays the per-channel visualization
        with gr.Column():
            marker_path = gr.File(file_types=['.txt'], label='(Optional) Marker File. A list used to identify the antibodies in each TIFF layer. Upload one TXT file.')
            with gr.Row():
                clear_btn = gr.Button("Clear")
                submit_btn = gr.Button("Upload")
        img_viz = gr.Plot(label="Visualization of individual channels")

    gr.Markdown("# Step 2. Modify existing channels")
    gr.Markdown("After visualizing the individual channels, did you notice any that should not be included in the next steps? Remove those if so.")
    gr.Markdown("Define channels designed to visualize nuclei. Optionally, define channels degisned to visualize membranes.")

    with gr.Row(equal_height=True):  # third row selects unwanted/nuclei/membrane channels
        with gr.Column():
            selected_unwanted_channel = gr.Dropdown(label='(Optional) Select the unwanted channel', interactive=True)
            selected_nuclei = gr.Dropdown(label='(Required) Select the nuclei channel', interactive=True)
            selected_membrane = gr.Dropdown(label='(Optional) Select the membrane channel', interactive=True)

        define_btn = gr.Button('Modify channels')

        channel_feedback = gr.Textbox(label='Channels info update')

    # Upload the file and gather channel info, then (on success) populate the
    # unwanted/nuclei/membrane dropdowns with the discovered channels.
    submit_btn.click(
        fn=cytof_tiff_eval, inputs=[img_path, marker_path, cytof_original_state], outputs=[img_info, img_viz, cytof_original_state],
        api_name='upload'
    ).success(
        fn=channel_select, inputs=cytof_original_state, outputs=[selected_unwanted_channel, selected_nuclei, selected_membrane]
    )

    # Keep the three dropdowns mutually exclusive: selecting a channel in one
    # removes it from the other two. api_name identifies each handler endpoint.
    selected_unwanted_channel.change(fn=update_dropdown_options, inputs=[cytof_original_state, selected_unwanted_channel, selected_nuclei, selected_membrane], outputs=[selected_nuclei, selected_membrane], api_name='dropdown_monitor1')
    selected_nuclei.change(fn=update_dropdown_options, inputs=[cytof_original_state, selected_nuclei, selected_membrane, selected_unwanted_channel], outputs=[selected_membrane, selected_unwanted_channel], api_name='dropdown_monitor2')
    selected_membrane.change(fn=update_dropdown_options, inputs=[cytof_original_state, selected_membrane, selected_nuclei, selected_unwanted_channel], outputs=[selected_nuclei, selected_unwanted_channel], api_name='dropdown_monitor3')

    # Apply the channel modifications; the result goes into cytof_state.
    define_btn.click(fn=modify_channels, inputs=[cytof_original_state, selected_unwanted_channel, selected_nuclei, selected_membrane], outputs=[channel_feedback, cytof_state])

    gr.Markdown('# Step 3. Perform cell segmentation based on the defined nuclei and membrane channels')

    with gr.Row():  # this row defines the cell radius and performs segmentation
        with gr.Column():
            cell_radius = gr.Number(value=5, precision=0, label='Cell size', info='Please enter the desired radius for cell segmentation (in pixels; default value: 5)')
            seg_btn = gr.Button("Segment")
        seg_viz = gr.Plot(label="Visualization of the segmentation. Hover over graph to zoom, pan, save, etc.")
    seg_btn.click(fn=cell_seg, inputs=[cytof_state, cell_radius], outputs=[seg_viz, cytof_state])

    gr.Markdown('# Step 4. Extract cell features')

    cohort_state = gr.State(CytofCohort())
    with gr.Row():  # feature extraction related functions
        with gr.Column():
            # NOTE(review): the duplicated 'Yes' choices look like a deliberate
            # attention check for the long-runtime warning -- confirm.
            gr.CheckboxGroup(choices=['Yes', 'Yes', 'Yes'], label='Note: This step will take significantly longer than the previous ones. A 300MB IMC file takes about 7 minutes to compute. Did you read this note?')
            norm_percentile = gr.Slider(minimum=50, maximum=99, step=1, value=75, interactive=True, label='Normalized quantification percentile')
            extract_btn = gr.Button('Extract')
        feat_df = gr.DataFrame(headers=['id','coordinate_x','coordinate_y','area_nuclei'], label='Feature extraction summary')

    extract_btn.click(fn=feature_extraction, inputs=[cytof_state, cohort_state, norm_percentile],
                      outputs=[cytof_state, cohort_state, feat_df])

    gr.Markdown('# Step 5. Downstream analysis')

    with gr.Row():  # show co-expression and spatial analysis
        with gr.Column():
            co_exp_viz = gr.Plot(label="Visualization of cell coexpression of markers")
            co_exp_btn = gr.Button('Run co-expression analysis')

        with gr.Column():
            spatial_viz = gr.Plot(label="Visualization of cell spatial interaction of markers")
            cluster_method = gr.Radio(label='Select the clustering method', value='k-neighbor', choices=['k-neighbor', 'distance'], info='K-neighbor: classifies the threshold number of surrounding cells as neighborhood pairs. Distance: classifies cells within threshold distance as neighborhood pairs.')
            cluster_threshold = gr.Slider(minimum=1, maximum=100, step=1, value=30, interactive=True, label='Clustering threshold')

            spatial_btn = gr.Button('Run spatial interaction analysis')

    co_exp_btn.click(fn=co_expression, inputs=[cytof_state, norm_percentile], outputs=[co_exp_viz, cytof_state])
    # spatial_btn is wired in step 6 so its success handler can populate the
    # marker-positive dropdown options defined there.

    gr.Markdown('# Step 6. Visualize positive markers')
    gr.Markdown('Select two markers for side-by-side comparison to visualize their positive states in cells. This serves two purposes. 1) Validate the co-expression analysis results. High expression level should mean a similar number of positive markers within the two slides, whereas low expression level mean a large difference of in the number of positive markers. 2) Validate teh spatial interaction analysis results. High interaction means the two positive markers are in close proximity of each other (proximity is previously defined in `clustering threshold`), and vice versa.')

    with gr.Row():  # two-marker positive visualization - dropdown options
        selected_marker1 = gr.Dropdown(label='Select one marker', info='Select a marker to visualize', interactive=True)
        selected_marker2 = gr.Dropdown(label='Select another marker', info='Selecting the same marker as the previous one is allowed', interactive=True)
        pos_viz_btn = gr.Button('Visualize these two markers')


    with gr.Row():  # two-marker positive visualization - plot area
        marker_pos_viz = gr.Plot(label="Visualization of the two markers. Hover over graph to zoom, pan, save, etc.")

    # Run the spatial analysis; on success, fill the marker dropdowns above.
    spatial_btn.click(
        fn=spatial_interaction, inputs=[cytof_state, norm_percentile, cluster_method, cluster_threshold], outputs=[spatial_viz, cytof_state]
    ).success(
        fn=get_marker_pos_options, inputs=[cytof_state], outputs=[selected_marker1, selected_marker2]
    )
    pos_viz_btn.click(fn=viz_pos_marker_pair, inputs=[cytof_state, selected_marker1, selected_marker2, norm_percentile], outputs=[marker_pos_viz])


    gr.Markdown('# Step 7. Phenogrpah Clustering')
    gr.Markdown('Cells can be clustered into sub-groups based on the extracted single-cell data. Time reference: a 300MB IMC file takes about 2 minutes to compute.')

    with gr.Row():  # two plots to visualize phenograph results
        phenograph_umap = gr.Plot(label="UMAP results")
        cluster_interaction = gr.Plot(label="Spatial interaction of clusters")


    with gr.Row(equal_height=False):  # action components
        umap_btn = gr.Button('Run Phenograph clustering')
        cluster_interact_btn = gr.Button('Run clustering interaction')
    cluster_interact_btn.click(cluster_interaction_fn, inputs=[cytof_state, cohort_state], outputs=[cluster_interaction, cytof_state, cohort_state])

    with gr.Row():
        with gr.Column():
            selected_cluster_marker = gr.Dropdown(label='Select one marker', info='Select a marker to visualize', interactive=True)
            cluster_positive_btn = gr.Button('Compare clusters and positive markers')

        with gr.Column():
            cluster_v_positive = gr.Plot(label="Cluster assignment vs. positive cells. Hover over graph to zoom, pan, save, etc.")


    # Run PhenoGraph; on success, fill the cluster-comparison marker dropdown.
    umap_btn.click(
        fn=phenograph, inputs=[cohort_state], outputs=[phenograph_umap, cohort_state]
    ).success(
        fn=get_cluster_pos_options, inputs=[cytof_state], outputs=[selected_cluster_marker], api_name='selectClusterMarker'
    )
    cluster_positive_btn.click(fn=viz_cluster_positive, inputs=[selected_cluster_marker, norm_percentile, cytof_state, cohort_state], outputs=[cluster_v_positive, cytof_state, cohort_state])


    # Reset every input and output component when Clear is clicked.
    clear_components = [img_path, marker_path, img_info, img_viz, channel_feedback, seg_viz, feat_df, co_exp_viz, spatial_viz, marker_pos_viz, phenograph_umap, cluster_interaction, cluster_v_positive]
    clear_btn.click(lambda: [None]*len(clear_components), outputs=clear_components)
481
+
482
+
483
+ if __name__ == "__main__":
484
+ demo.launch(server_name='0.0.0.0', server_port=5323)
485
+
cytof/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # from .hyperion_analysis import *
2
+ from .hyperion_preprocess import *
3
+ from .utils import *
4
+ from .segmentation_functions import *
cytof/batch_preprocess.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ import os
4
+ import glob
5
+ import matplotlib.pyplot as plt
6
+ import pickle as pkl
7
+ import numpy as np
8
+ import argparse
9
+ import yaml
10
+ import pandas as pd
11
+ import skimage
12
+
13
+ import sys
14
+ import platform
15
+ from pathlib import Path
16
+ FILE = Path(__file__).resolve()
17
+ ROOT = FILE.parents[0] # cytof root directory
18
+ if str(ROOT) not in sys.path:
19
+ sys.path.append(str(ROOT)) # add ROOT to PATH
20
+ if platform.system() != 'Windows':
21
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
22
+ from classes import CytofImage, CytofImageTiff
23
+
24
+
25
+ # import sys
26
+ # sys.path.append('../cytof')
27
+ from hyperion_preprocess import cytof_read_data_roi
28
+ from hyperion_analysis import batch_scale_feature
29
+ from utils import save_multi_channel_img
30
+
31
def makelist(string, delim=','):
    """Split *string* on *delim* (default ``','``) into a list of substrings.

    Used as an argparse ``type`` callable to accept comma-separated list
    arguments. The delimiter is now a parameter (default keeps the old
    behavior); the redundant identity comprehension over ``str.split`` and
    the leftover commented-out float conversion were removed.
    """
    return string.split(delim)
35
+
36
+
37
+ def parse_opt():
38
+ parser = argparse.ArgumentParser('Cytof batch process', add_help=False)
39
+ parser.add_argument('--cohort_file', type=str,
40
+ help='a txt file with information of all file paths in the cohort')
41
+ parser.add_argument('--params_ROI', type=str,
42
+ help='a txt file with parameters used to process single ROI previously')
43
+ parser.add_argument('--outdir', type=str, help='directory to save outputs')
44
+ parser.add_argument('--save_channel_images', action='store_true',
45
+ help='an indicator of whether save channel images')
46
+ parser.add_argument('--save_seg_vis', action='store_true',
47
+ help='an indicator of whether save sample visualization of segmentation')
48
+ parser.add_argument('--show_seg_process', action='store_true',
49
+ help='an indicator of whether show segmentation process')
50
+ parser.add_argument('--quality_control_thres', type=int, default=50,
51
+ help='the smallest image size for an image to be kept')
52
+ return parser
53
+
54
+
55
+ def main(args):
56
+ # if args.save_channel_images:
57
+ # print("saving channel images")
58
+ # else:
59
+ # print("NOT saving channel images")
60
+ # if args.save_seg_vis:
61
+ # print("saving segmentation visualization")
62
+ # else:
63
+ # print("NOT saving segmentation visualization")
64
+ # if args.show_seg_process:
65
+ # print("showing segmentation process")
66
+ # else:
67
+ # print("NOT showing segmentation process")
68
+ # parameters used when processing single ROI
69
+
70
+ params_ROI = yaml.load(open(args.params_ROI, "rb"), Loader=yaml.Loader)
71
+ channel_dict = params_ROI["channel_dict"]
72
+ channels_remove = params_ROI["channels_remove"]
73
+ quality_control_thres = params_ROI["quality_control_thres"]
74
+
75
+ # name of the batch and saving directory
76
+ cohort_name = os.path.basename(args.cohort_file).split('.csv')[0]
77
+ print(cohort_name)
78
+
79
+ outdir = os.path.join(args.outdir, cohort_name)
80
+ if not os.path.exists(outdir):
81
+ os.makedirs(outdir)
82
+
83
+ feat_dirs = {}
84
+ feat_dirs['orig'] = os.path.join(outdir, "feature")
85
+ if not os.path.exists(feat_dirs['orig']):
86
+ os.makedirs(feat_dirs['orig'])
87
+
88
+ for q in params_ROI["normalize_qs"]:
89
+ dir_qnorm = os.path.join(outdir, f"feature_{q}normed")
90
+ feat_dirs[f"{q}normed"] = dir_qnorm
91
+ if not os.path.exists(dir_qnorm):
92
+ os.makedirs(dir_qnorm)
93
+
94
+ dir_img_cytof = os.path.join(outdir, "cytof_images")
95
+ if not os.path.exists(dir_img_cytof):
96
+ os.makedirs(dir_img_cytof)
97
+
98
+ if args.save_seg_vis:
99
+ dir_seg_vis = os.path.join(outdir, "segmentation_visualization")
100
+ if not os.path.exists(dir_seg_vis):
101
+ os.makedirs(dir_seg_vis)
102
+
103
+ # process batch files
104
+ cohort_files_ = pd.read_csv(args.cohort_file)
105
+ # cohort_files = [os.path.join(cohort_files_.loc[i, "path"], "{}".format(cohort_files_.loc[i, "ROI"])) \
106
+ # for i in range(cohort_files_.shape[0])]
107
+ print("Start processing {} files".format(cohort_files_.shape[0]))
108
+
109
+ cytof_imgs = {} # a dictionary contain the full file path of all results
110
+ seen = 0
111
+ dfs_scale_params = {} # key: quantile q; item: features to be scaled
112
+ df_io = pd.DataFrame(columns=["Slide", "ROI", "path", "output_file"])
113
+ df_bad_rois = pd.DataFrame(columns=["Slide", "ROI", "path", "size (W*H)"])
114
+
115
+ # for f_roi in cohort_files:
116
+ for i in range(cohort_files_.shape[0]):
117
+ slide, pth_i, f_roi_ = cohort_files_.loc[i, "Slide"], cohort_files_.loc[i, "path"], cohort_files_.loc[i, "ROI"]
118
+ f_roi = os.path.join(pth_i, f_roi_)
119
+ print("\nNow analyzing {}".format(f_roi))
120
+ roi = f_roi_.split('.txt')[0]
121
+ print("{}-{}".format(slide, roi))
122
+
123
+
124
+ ## 1) Read and preprocess data
125
+ # read data: file name -> dataframe
126
+ cytof_img = cytof_read_data_roi(f_roi, slide, roi)
127
+
128
+ # quality control section
129
+ cytof_img.quality_control(thres=quality_control_thres)
130
+ if not cytof_img.keep:
131
+ H = max(cytof_img.df['Y'].values) + 1
132
+ W = max(cytof_img.df['X'].values) + 1
133
+ # if (H < args.quality_control_thres) or (W < quality_control_thres):
134
+ # print("At least one dimension of the image {}-{} is smaller than {}, skipping" \
135
+ # .format(cytof_img.slide, cytof_img.roi, quality_control_thres))
136
+
137
+ df_bad_rois = pd.concat([df_bad_rois,
138
+ pd.DataFrame.from_dict([{"Slide": slide,
139
+ "ROI": roi,
140
+ "path": pth_i,
141
+ "size (W*H)": (W,H)}])])
142
+ continue
143
+
144
+ if args.save_channel_images:
145
+ dir_roi_channel_img = os.path.join(outdir, "channel_images", f_roi_)
146
+ if not os.path.exists(dir_roi_channel_img):
147
+ os.makedirs(dir_roi_channel_img)
148
+
149
+ # markers used when capturing the image
150
+ cytof_img.get_markers()
151
+
152
+ # preprocess: fill missing values with 0.
153
+ cytof_img.preprocess()
154
+
155
+ # save info
156
+ if seen == 0:
157
+ f_info = open(os.path.join(outdir, 'readme.txt'), 'w')
158
+ f_info.write("Original markers: ")
159
+ f_info.write('\n{}'.format(", ".join(cytof_img.markers)))
160
+ f_info.write("\nOriginal channels: ")
161
+ f_info.write('\n{}'.format(", ".join(cytof_img.channels)))
162
+
163
+ ## (optional): save channel images
164
+ if args.save_channel_images:
165
+ cytof_img.get_image()
166
+ cytof_img.save_channel_images(dir_roi_channel_img)
167
+
168
+ ## remove special channels if defined
169
+ if len(channels_remove) > 0:
170
+ cytof_img.remove_special_channels(channels_remove)
171
+ cytof_img.get_image()
172
+
173
+ ## 2) nuclei & membrane channels and visualization
174
+ cytof_img.define_special_channels(channel_dict)
175
+ assert len(cytof_img.channels) == cytof_img.image.shape[-1]
176
+ # #### Dataframe -> raw image
177
+ # cytof_img.get_image()
178
+
179
+ ## (optional): save channel images
180
+ if args.save_channel_images:
181
+ cytof_img.get_image()
182
+ vis_channels = [k for (k, itm) in params_ROI["channel_dict"].items() if len(itm)>0]
183
+ cytof_img.save_channel_images(dir_roi_channel_img, channels=vis_channels)
184
+
185
+ ## 3) Nuclei and cell segmentation
186
+ nuclei_seg, cell_seg = cytof_img.get_seg(use_membrane=params_ROI["use_membrane"],
187
+ radius=params_ROI["cell_radius"],
188
+ show_process=args.show_seg_process)
189
+ if args.save_seg_vis:
190
+ marked_image_nuclei = cytof_img.visualize_seg(segtype="nuclei", show=False)
191
+ save_multi_channel_img(skimage.img_as_ubyte(marked_image_nuclei[0:100, 0:100, :]),
192
+ os.path.join(dir_seg_vis, "{}_{}_nuclei_seg.png".format(slide, roi)))
193
+
194
+ marked_image_cell = cytof_img.visualize_seg(segtype="cell", show=False)
195
+ save_multi_channel_img(skimage.img_as_ubyte(marked_image_cell[0:100, 0:100, :]),
196
+ os.path.join(dir_seg_vis, "{}_{}_cell_seg.png".format(slide, roi)))
197
+
198
+ ## 4) Feature extraction
199
+ cytof_img.extract_features(f_roi)
200
+
201
+ # save the original extracted feature
202
+ cytof_img.df_feature.to_csv(os.path.join(feat_dirs['orig'], "{}_{}_feature_summary.csv".format(slide, roi)),
203
+ index=False)
204
+
205
+ ### 4.1) Log transform and quantile normalization
206
+ cytof_img.feature_quantile_normalization(qs=params_ROI["normalize_qs"], savedir=feat_dirs['orig'])
207
+
208
+ # calculate scaling parameters
209
+ ## features to be scaled
210
+ if seen == 0:
211
+ s_features = [col for key, features in cytof_img.features.items() \
212
+ for f in features \
213
+ for col in cytof_img.df_feature.columns if col.startswith(f)]
214
+
215
+ f_info.write("\nChannels removed: ")
216
+ f_info.write("\n{}".format(", ".join(channels_remove)))
217
+ f_info.write("\nFinal markers: ")
218
+ f_info.write("\n{}".format(', '.join(cytof_img.markers)))
219
+ f_info.write("\nFinal channels: ")
220
+ f_info.write("\n{}".format(', '.join(cytof_img.channels)))
221
+ f_info.close()
222
+ ## loop over quantiles
223
+ for q, quantile in cytof_img.dict_quantiles.items():
224
+ n_attr = f"df_feature_{q}normed"
225
+ df_normed = getattr(cytof_img, n_attr)
226
+ # save the normalized features to csv
227
+ df_normed.to_csv(os.path.join(feat_dirs[f"{q}normed"],
228
+ "{}_{}_feature_summary.csv".format(slide, roi)),
229
+ index=False)
230
+ if seen == 0:
231
+ dfs_scale_params[q] = df_normed[s_features]
232
+ dict_quantiles = cytof_img.dict_quantiles
233
+ else:
234
+ # dfs_scale_params[q] = dfs_scale_params[q].append(df_normed[s_features], ignore_index=True)
235
+ dfs_scale_params[q] = pd.concat([dfs_scale_params[q], df_normed[s_features]])
236
+
237
+ seen += 1
238
+
239
+ # save the class instance
240
+ out_file = os.path.join(dir_img_cytof, "{}_{}.pkl".format(slide, roi))
241
+ cytof_img.save_cytof(out_file)
242
+ cytof_imgs[roi] = out_file
243
+ # df_io = df_io.append({"Slide": slide,
244
+ # "ROI": roi,
245
+ # "path": pth_i,
246
+ # "output_file": out_file}, ignore_index=True)
247
+ df_io = pd.concat([df_io,
248
+ pd.DataFrame.from_dict([{"Slide": slide,
249
+ "ROI": roi,
250
+ "path": pth_i,
251
+ "output_file": os.path.abspath(out_file) # use absolute path
252
+ }])
253
+ ])
254
+
255
+
256
+ for q in dict_quantiles.keys():
257
+ df_scale_params = dfs_scale_params[q].mean().to_frame(name="mean").transpose()
258
+ # df_scale_params = df_scale_params.append(dfs_scale_params[q].std().to_frame(name="std").transpose(),
259
+ # ignore_index=True)
260
+ df_scale_params = pd.concat([df_scale_params, dfs_scale_params[q].std().to_frame(name="std").transpose()])
261
+ df_scale_params.to_csv(os.path.join(outdir, f"{q}normed_scale_params.csv"), index=False)
262
+
263
+
264
+ # df_io = pd.DataFrame.from_dict(cytof_imgs, orient="index", columns=['output_file'])
265
+ # df_io.reset_index(inplace=True)
266
+ # df_io.rename(columns={'index': 'input_file'}, inplace=True)
267
+ df_io.to_csv(os.path.join(outdir, "input_output.csv"), index=False)
268
+ if len(df_bad_rois) > 0:
269
+ df_bad_rois.to_csv(os.path.join(outdir, "skipped_rois.csv"), index=False)
270
+
271
+ # scale feature
272
+ batch_scale_feature(outdir, normqs=params_ROI["normalize_qs"], df_io=df_io)
273
+ # return cytof_imgs, feat_dirs
274
+
275
+
276
if __name__ == "__main__":
    # CLI entry point: reuse the shared option definitions from parse_opt()
    # as a parent parser, then run the whole batch pipeline on the parsed args.
    parser = argparse.ArgumentParser('Cytof batch process', parents=[parse_opt()])
    args = parser.parse_args()
    main(args)
cytof/classes.py ADDED
The diff for this file is too large to render. See raw diff
 
cytof/hyperion_analysis.py ADDED
@@ -0,0 +1,1477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import glob
4
+ import pickle as pkl
5
+
6
+ import copy
7
+ import numpy as np
8
+ import pandas as pd
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.pyplot import cm
11
+ import warnings
12
+ from tqdm import tqdm
13
+ import skimage
14
+
15
+ import phenograph
16
+ import umap
17
+ import seaborn as sns
18
+ from scipy.stats import spearmanr
19
+
20
+ import sys
21
+ import platform
22
+ from pathlib import Path
23
+ FILE = Path(__file__).resolve()
24
+ ROOT = FILE.parents[0] # cytof root directory
25
+ if str(ROOT) not in sys.path:
26
+ sys.path.append(str(ROOT)) # add ROOT to PATH
27
+ if platform.system() != 'Windows':
28
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
29
+ from classes import CytofImage, CytofImageTiff
30
+
31
+ import hyperion_preprocess as pre
32
+ import hyperion_segmentation as seg
33
+ from utils import load_CytofImage
34
+
35
+ # from cytof import hyperion_preprocess as pre
36
+ # from cytof import hyperion_segmentation as seg
37
+ # from cytof.utils import load_CytofImage
38
+
39
+
40
+
41
+
42
+
43
def _longest_substring(str1, str2):
    """Return the longest common contiguous substring of str1 and str2.

    Uses dynamic programming over common-suffix lengths, O(len1*len2) time
    and O(len2) extra space, instead of the original brute-force scan that
    re-extended a match at every (i, j) pair (O(len1*len2*min(len1,len2))).
    Ties are broken exactly as before: the earliest start in str1, then the
    earliest start in str2, wins. Returns "" when nothing is shared.

    :param str1: string
    :param str2: string
    :return: string
    """
    best = ""
    len1, len2 = len(str1), len(str2)
    # prev[j] = length of the common suffix of str1[:i-1] and str2[:j]
    prev = [0] * (len2 + 1)
    for i in range(1, len1 + 1):
        curr = [0] * (len2 + 1)
        for j in range(1, len2 + 1):
            if str1[i - 1] == str2[j - 1]:
                curr[j] = prev[j - 1] + 1
                # strict '>' keeps the first (leftmost) match among equals,
                # matching the original function's tie-breaking
                if curr[j] > len(best):
                    best = str1[i - curr[j]:i]
        prev = curr
    return best
56
+
57
def extract_feature(channels, raw_image, nuclei_seg, cell_seg, filename, show_head=False):
    """ Extract nuclei and cell level feature from cytof image based on nuclei segmentation and cell segmentation
    results
    Inputs:
        channels   = channels to extract feature from
        raw_image  = raw cytof image (H x W x n_channels)
        nuclei_seg = nuclei segmentation result (integer label image)
        cell_seg   = cell segmentation result (integer label image)
        filename   = filename of current cytof image
        show_head  = if True, print the head of the resulting dataframe
    Returns:
        feature_summary_df = a dataframe containing summary of extracted features

    :param channels: list
    :param raw_image: numpy.ndarray
    :param nuclei_seg: numpy.ndarray
    :param cell_seg: numpy.ndarray
    :param filename: string
    :param show_head: bool
    :return feature_summary_df: pandas.core.frame.DataFrame
    """
    assert (len(channels) == raw_image.shape[-1])

    # morphology features to be extracted; pa_ratio (perimeter^2 / filled_area)
    # is derived below rather than read off the regionprops object
    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]

    ## morphology features
    nuclei_morphology = [_ + '_nuclei' for _ in morphology]  # morphology - nuclei level
    cell_morphology = [_ + '_cell' for _ in morphology]      # morphology - cell level

    ## single cell features
    # nuclei level
    sum_exp_nuclei = [_ + '_nuclei_sum' for _ in channels]  # sum expression over nuclei
    ave_exp_nuclei = [_ + '_nuclei_ave' for _ in channels]  # average expression over nuclei

    # cell level
    sum_exp_cell = [_ + '_cell_sum' for _ in channels]  # sum expression over cell
    ave_exp_cell = [_ + '_cell_ave' for _ in channels]  # average expression over cell

    # column names of final result dataframe
    column_names = ["filename", "id", "coordinate_x", "coordinate_y"] + \
                   sum_exp_nuclei + ave_exp_nuclei + nuclei_morphology + \
                   sum_exp_cell + ave_exp_cell + cell_morphology

    # Initiate one list per output column
    res = dict()
    for column_name in column_names:
        res[column_name] = []

    # labels start at 2: label 1 is presumably the watershed background — confirm
    n_nuclei = np.max(nuclei_seg)
    for nuclei_id in tqdm(range(2, n_nuclei + 1), position=0, leave=True):
        regions = skimage.measure.regionprops((nuclei_seg == nuclei_id) * 1)
        if len(regions) >= 1:
            this_nucleus = regions[0]
        else:
            continue
        regions = skimage.measure.regionprops((cell_seg == nuclei_id) * 1)
        if len(regions) >= 1:
            this_cell = regions[0]
        else:
            continue

        # BUGFIX: record the row identifiers only once BOTH regionprops calls
        # succeeded. The original appended "filename"/"id" before the checks,
        # so a label missing from either mask left the columns with unequal
        # lengths and pd.DataFrame(res) raised ValueError at the end.
        res["filename"].append(filename)
        res["id"].append(nuclei_id)

        centroid_y, centroid_x = this_nucleus.centroid  # y: rows; x: columns
        res['coordinate_x'].append(centroid_x)
        res['coordinate_y'].append(centroid_y)

        # morphology (all but the derived pa_ratio come from regionprops)
        for i, feature in enumerate(morphology[:-1]):
            res[nuclei_morphology[i]].append(getattr(this_nucleus, feature))
            res[cell_morphology[i]].append(getattr(this_cell, feature))
        res[nuclei_morphology[-1]].append(1.0 * this_nucleus.perimeter ** 2 / this_nucleus.filled_area)
        res[cell_morphology[-1]].append(1.0 * this_cell.perimeter ** 2 / this_cell.filled_area)

        # per-channel expression: sum and average over the nuclei / cell masks
        for i, marker in enumerate(channels):
            res[sum_exp_nuclei[i]].append(np.sum(raw_image[nuclei_seg == nuclei_id, i]))
            res[ave_exp_nuclei[i]].append(np.average(raw_image[nuclei_seg == nuclei_id, i]))
            res[sum_exp_cell[i]].append(np.sum(raw_image[cell_seg == nuclei_id, i]))
            res[ave_exp_cell[i]].append(np.average(raw_image[cell_seg == nuclei_id, i]))

    feature_summary_df = pd.DataFrame(res)
    if show_head:
        print(feature_summary_df.head())
    return feature_summary_df
145
+
146
+
147
+ ###############################################################################
148
+ # def check_feature_distribution(feature_summary_df, features):
149
+ # """ Visualize feature distribution for each feature
150
+ # Inputs:
151
+ # feature_summary_df = dataframe of extracted feature summary
152
+ # features = features to check distribution
153
+ # Returns:
154
+ # None
155
+
156
+ # :param feature_summary_df: pandas.core.frame.DataFrame
157
+ # :param features: list
158
+ # """
159
+
160
+ # for feature in features:
161
+ # print(feature)
162
+ # fig, ax = plt.subplots(1, 1, figsize=(3, 2))
163
+ # ax.hist(np.log2(feature_summary_df[feature] + 0.0001), 100)
164
+ # ax.set_xlim(-15, 15)
165
+ # plt.show()
166
+
167
+
168
+
169
def feature_quantile_normalization(feature_summary_df, features, qs=[75,99]):
    """ Calculate the q-quantiles of selected features given quantile q values. Then perform q-quantile normalization
    on these features using calculated quantile values. The feature_summary_df will be updated in-place with new
    columns "feature_qnormed" generated and added. Meanwhile, visualize distribution of log2 features before and after
    q-normalization
    Inputs:
        feature_summary_df = dataframe of extracted feature summary
        features = features to be normalized
        qs = quantile q values (default=[75,99])
    Returns:
        quantiles = quantile values for each q
    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list
    :param qs: list
    :return quantiles: dict  (quantiles[feature][q] -> quantile value used)
    """
    expressions = []
    expressions_normed = dict((key, []) for key in qs)
    quantiles = {}
    # one distinct color per q value for the vertical quantile markers
    colors = cm.rainbow(np.linspace(0, 1, len(qs)))
    for feat in features:
        quantiles[feat] = {}
        # NOTE(review): `expressions` accumulates ACROSS features, so the
        # quantile used for each feature is computed over this feature plus
        # all previously processed ones — confirm this pooling is intentional
        # (a per-feature quantile would reset this list every iteration).
        expressions.extend(feature_summary_df[feat])

        plt.hist(np.log2(np.array(expressions) + 0.0001), 100, density=True)
        for q, c in zip(qs, colors):
            quantile_val = np.quantile(expressions, q/100)
            quantiles[feat][q] = quantile_val
            plt.axvline(np.log2(quantile_val), label=f"{q}th percentile", c=c)
            print(f"{q}th percentile: {quantile_val}")

            # log-quantile normalization: divide by the quantile then log2;
            # the 0.0001 offset avoids log2(0). New column added in place.
            normed = np.log2(feature_summary_df.loc[:, feat] / quantile_val + 0.0001)
            feature_summary_df.loc[:, f"{feat}_{q}normed"] = normed
            expressions_normed[q].extend(normed)
        plt.xlim(-15, 15)
        plt.xlabel("log2(expression of all markers)")
        plt.legend()
        plt.show()

    # visualize before & after quantile normalization
    '''N = len(qs)+1 # (len(qs)+1) // 2 + (len(qs)+1) %2'''
    log_expressions = tuple([np.log2(np.array(expressions) + 0.0001)] + [expressions_normed[q] for q in qs])
    labels = ["before normalization"] + [f"after {q} normalization" for q in qs]
    fig, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.hist(log_expressions, 100, density=True, label=labels)
    ax.set_xlabel("log2(expressions for all markers)")
    plt.legend()
    plt.show()
    return quantiles
219
+
220
+
221
def feature_scaling(feature_summary_df, features, inplace=False):
    """Mean/std (z-score) scale the selected feature columns.

    Columns listed in *features* but missing from the dataframe are skipped
    with a printed warning. Scaling uses numpy's population standard
    deviation (ddof=0). Normally, do not scale nuclei sum features.

    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list of column names to scale
    :param inplace: when True, modify feature_summary_df directly and return
        None; otherwise scale a copy and return it
    :return: the scaled dataframe (only when inplace is False)
    """
    target_df = feature_summary_df.copy() if not inplace else feature_summary_df

    for column in features:
        if column not in feature_summary_df.columns:
            print(f"Warning: {column} not available!")
            continue
        values = target_df[column]
        target_df[column] = (values - np.average(values)) / np.std(values)

    if not inplace:
        return target_df
245
+
246
+
247
+
248
+
249
+
250
+
251
def generate_summary(feature_summary_df, features, thresholds):
    """Build a per-feature (cell level) positivity summary table.

    For each requested feature, look up its (GMM-derived) threshold, count
    the rows whose value exceeds it, and record: feature name, total number
    of cells, the threshold, the positive count, and the positive ratio.

    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list of feature column names to summarize
    :param thresholds: dict mapping feature name -> threshold value
    :return df_info: pandas.core.frame.DataFrame, one row per feature
    """
    columns = ['feature', 'total number', 'threshold', 'positive counts', 'positive ratio']
    df_info = pd.DataFrame(columns=columns)

    summary_rows = []
    for feature in features:
        threshold = thresholds[feature]
        values = feature_summary_df[feature].values
        total = len(values)
        positives = sum(values > threshold)
        summary_rows.append(pd.DataFrame({'feature': feature,
                                          'total number': total,
                                          'threshold': threshold,
                                          'positive counts': positives,
                                          'positive ratio': positives / total}, index=[0]))
    return pd.concat([df_info] + summary_rows)
281
+
282
+
283
+ # def visualize_thresholding_outcome(feat,
284
+ # feature_summary_df,
285
+ # raw_image,
286
+ # channel_names,
287
+ # thres,
288
+ # nuclei_seg,
289
+ # cell_seg,
290
+ # vis_quantile_q=0.9, savepath=None):
291
+ # """ Visualize calculated threshold for a feature by mapping back to nuclei and cell segmentation outputs - showing
292
+ # greater than threshold pixels in red color, others with blue color.
293
+ # Meanwhile, visualize the original image with red color indicating the channel correspond to the feature.
294
+ # Inputs:
295
+ # feat = name of the feature to visualize
296
+ # feature_summary_df = dataframe of extracted feature summary
297
+ # raw_image = raw cytof image
298
+ # channel_names = a list of marker names, which is consistent with each channel in the raw_image
299
+ # thres = threshold value for feature "feat"
300
+ # nuclei_seg = nuclei segmentation output
301
+ # cell_seg = cell segmentation output
302
+ # Outputs:
303
+ # stain_nuclei = nuclei segmentation output stained with threshold information
304
+ # stain_cell = cell segmentation output stained with threshold information
305
+ # :param feat: string
306
+ # :param feature_summary_df: pandas.core.frame.DataFrame
307
+ # :param raw_image: numpy.ndarray
308
+ # :param channel_names: list
309
+ # :param thres: float
310
+ # :param nuclei_seg: numpy.ndarray
311
+ # :param cell_seg: numpy.ndarray
312
+ # :return stain_nuclei: numpy.ndarray
313
+ # :return stain_cell: numpy.ndarray
314
+ # """
315
+ # col_name = channel_names[np.argmax([len(_longest_substring(feat, x)) for x in channel_names])]
316
+ # col_id = channel_names.index(col_name)
317
+ # df_temp = pd.DataFrame(columns=[f"{feat}_overthres"], data=np.zeros(len(feature_summary_df), dtype=np.int32))
318
+ # df_temp.loc[feature_summary_df[feat] > thres, f"{feat}_overthres"] = 1
319
+ # feature_summary_df = pd.concat([feature_summary_df, df_temp], axis=1)
320
+ # # feature_summary_df.loc[:, f"{feat}_overthres"] = 0
321
+ # # feature_summary_df.loc[feature_summary_df[feat] > thres, f"{feat}_overthres"] = 1
322
+ #
323
+ # '''rgba_color = [plt.cm.get_cmap('tab20').colors[_ % 20] for _ in feature_summary_df.loc[:, f"{feat}_overthres"]]'''
324
+ # color_ids = []
325
+ #
326
+ # # stained Nuclei image
327
+ # stain_nuclei = np.zeros((nuclei_seg.shape[0], nuclei_seg.shape[1], 3)) + 1
328
+ # for i in range(2, np.max(nuclei_seg) + 1):
329
+ # color_id = feature_summary_df[f"{feat}_overthres"][feature_summary_df['id'] == i].values[0] * 2
330
+ # if color_id not in color_ids:
331
+ # color_ids.append(color_id)
332
+ # stain_nuclei[nuclei_seg == i] = plt.cm.get_cmap('tab20').colors[color_id][:3]
333
+ #
334
+ # # stained Cell image
335
+ # stain_cell = np.zeros((cell_seg.shape[0], cell_seg.shape[1], 3)) + 1
336
+ # for i in range(2, np.max(cell_seg) + 1):
337
+ # color_id = feature_summary_df[f"{feat}_overthres"][feature_summary_df['id'] == i].values[0] * 2
338
+ # stain_cell[cell_seg == i] = plt.cm.get_cmap('tab20').colors[color_id][:3]
339
+ #
340
+ # fig, axs = plt.subplots(1,3,figsize=(16, 8))
341
+ # if col_id != 0:
342
+ # channel_ids = (col_id, 0)
343
+ # else:
344
+ # channel_ids = (col_id, -1)
345
+ # '''print(channel_ids)'''
346
+ # quantiles = [np.quantile(raw_image[..., _], vis_quantile_q) for _ in channel_ids]
347
+ # vis_img, _ = pre.cytof_merge_channels(raw_image, channel_names=channel_names,
348
+ # channel_ids=channel_ids, quantiles=quantiles)
349
+ # marker = feat.split("(")[0]
350
+ # print(f"Nuclei and cell with high {marker} expression shown in orange, low in blue.")
351
+ #
352
+ # axs[0].imshow(vis_img)
353
+ # axs[1].imshow(stain_nuclei)
354
+ # axs[2].imshow(stain_cell)
355
+ # axs[0].set_title("pseudo-colored original image")
356
+ # axs[1].set_title(f"{marker} expression shown in nuclei")
357
+ # axs[2].set_title(f"{marker} expression shown in cell")
358
+ # if savepath is not None:
359
+ # plt.savefig(savepath)
360
+ # plt.show()
361
+ # return stain_nuclei, stain_cell, vis_img
362
+
363
+
364
+ ########################################################################################################################
365
+ ############################################### batch functions ########################################################
366
+ ########################################################################################################################
367
def batch_extract_feature(files, markers, nuclei_markers, membrane_markers=None, show_vis=False):
    """Extract features for cytof images from a list of files. Normally this list contains ROIs of the same slide
    Inputs:
        files = a list of files to be processed
        markers = a list of marker names used when generating the image
        nuclei_markers = a list of markers define the nuclei channel (used for nuclei segmentation)
        membrane_markers = a list of markers define the membrane channel (used for cell segmentation) (Default=None)
        show_vis = an indicator of showing visualization during process
    Outputs:
        file_features = a dictionary contains extracted features for each file

    :param files: list
    :param markers: list
    :param nuclei_markers: list
    :param membrane_markers: list
    :param show_vis: bool
    :return file_features: dict
    """
    file_features = {}
    for f in tqdm(files):
        # read data: file -> dataframe
        df = pre.cytof_read_data(f)
        # preprocess the dataframe (see pre.cytof_preprocess for exact behavior)
        df_ = pre.cytof_preprocess(df)
        column_names = markers[:]
        # synthesize the 'nuclei' channel from its defining markers; inserted
        # at index 0 — the channel_ids=(0, -1) calls below rely on 'nuclei'
        # being first and 'membrane' (if defined) being last
        df_output = pre.define_special_channel(df_, 'nuclei', markers=nuclei_markers)
        column_names.insert(0, 'nuclei')
        if membrane_markers is not None:
            df_output = pre.define_special_channel(df_output, 'membrane', markers=membrane_markers)
            column_names.append('membrane')
        # dataframe -> (H, W, n_channels) image, channel order = column_names
        raw_image = pre.cytof_txt2img(df_output, marker_names=column_names)

        if show_vis:
            merged_im, _ = pre.cytof_merge_channels(raw_image, channel_ids=[0, -1], quantiles=None, visualize=False)
            # hard-coded crop purely for display purposes
            plt.imshow(merged_im[0:200, 200:400, ...])
            plt.title('Selected region of raw cytof image')
            plt.show()

        # nuclei and cell segmentation (cell seg uses the membrane channel when given)
        nuclei_img = raw_image[..., column_names.index('nuclei')]
        nuclei_seg, color_dict = seg.cytof_nuclei_segmentation(nuclei_img, show_process=False)
        if membrane_markers is not None:
            membrane_img = raw_image[..., column_names.index('membrane')]
            cell_seg, _ = seg.cytof_cell_segmentation(nuclei_seg, membrane_channel=membrane_img, show_process=False)
        else:
            cell_seg, _ = seg.cytof_cell_segmentation(nuclei_seg, show_process=False)
        if show_vis:
            marked_image_nuclei = seg.visualize_segmentation(raw_image, nuclei_seg, channel_ids=(0, -1), show=False)
            marked_image_cell = seg.visualize_segmentation(raw_image, cell_seg, channel_ids=(-1, 0), show=False)
            fig, axs = plt.subplots(1,2,figsize=(10,6))
            axs[0].imshow(marked_image_nuclei[0:200, 200:400, :]), axs[0].set_title('nuclei segmentation')
            axs[1].imshow(marked_image_cell[0:200, 200:400, :]), axs[1].set_title('cell segmentation')
            plt.show()

        # feature extraction
        # NOTE(review): when membrane_markers is given, raw_image carries a
        # trailing 'membrane' channel that feat_names omits; extract_feature
        # asserts len(channels) == raw_image.shape[-1], so this path likely
        # fails — confirm whether 'membrane' should be appended here too.
        feat_names = markers[:]
        feat_names.insert(0, 'nuclei')
        df_feat_sum = extract_feature(feat_names, raw_image, nuclei_seg, cell_seg, filename=f)
        file_features[f] = df_feat_sum
    return file_features
427
+
428
+
429
+
430
def batch_norm_scale(file_features, column_names, qs=[75,99]):
    """Perform feature log transform, quantile normalization and scaling in a batch.

    Inputs:
        file_features = A dictionary of dataframes containing extracted features. key - file name, item - feature table
        column_names = A list of markers. Should be consistent with column names in dataframe of features
        qs = quantile q values (Default=[75,99])
    Outputs:
        file_features_out = log transformed, quantile normalized and scaled features for each file in the batch
        quantiles = a dictionary of quantile values for each file in the batch

    :param file_features: dict
    :param column_names: list
    :param qs: list
    :return file_features_out: dict
    :return quantiles: dict
    """
    # work on a deep copy so the caller's original feature tables are untouched
    file_features_out = copy.deepcopy(file_features)

    def _suffixed(suffix):
        # per-marker expression feature names for one level/aggregation
        return [name + suffix for name in column_names]

    cell_markers_sum = _suffixed('_cell_sum')
    cell_markers_ave = _suffixed('_cell_ave')
    nuclei_markers_sum = _suffixed('_nuclei_sum')
    nuclei_markers_ave = _suffixed('_nuclei_ave')

    # morphology feature names at nuclei and cell level
    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]
    nuclei_morphology = [m + '_nuclei' for m in morphology]
    cell_morphology = [m + '_cell' for m in morphology]

    marker_features = nuclei_markers_sum + nuclei_markers_ave + cell_markers_sum + cell_markers_ave

    # normalize every marker expression feature except the synthetic 'nuclei' channel
    features_to_norm = [name for name in marker_features if not name.startswith('nuclei')]

    # features to be scaled: all morphology + marker features, plus the
    # q-normalized variants of each marker feature that gets normalized above
    scale_features = []
    for feature_name in nuclei_morphology + cell_morphology + marker_features:
        scale_features.append(feature_name)
        if feature_name not in nuclei_morphology + cell_morphology and not feature_name.startswith('nuclei'):
            scale_features.extend(f"{feature_name}_{q}normed" for q in qs)

    quantiles = {}
    for fname, df in file_features_out.items():
        print(fname)
        quantiles[fname] = feature_quantile_normalization(df, features=features_to_norm, qs=qs)
        feature_scaling(df, features=scale_features, inplace=True)
    return file_features_out, quantiles
485
+
486
+
487
def batch_scale_feature(outdir, normqs, df_io=None, files_scale=None):
    """Scale the q-normalized features of every ROI with batch-level mean/std.

    Inputs:
        outdir = output saving directory, which contains the scale files generated
                 previously ("{q}normed_scale_params.csv"), the input_output.csv
                 file listing the saved cytof_img class instances in the batch,
                 as well as the previously saved .pkl class instances
        normqs = a list of q values of percentile normalization
        df_io = optional dataframe with an 'output_file' column of .pkl paths;
                loaded from outdir/input_output.csv when None
        files_scale = optional list of scale-parameter csv paths, parallel to
                      normqs; defaults to the per-q files in outdir

    Outputs: None
        Scaled features are saved as .csv files in subfolder "feature_{q}normed_scaled"
        in outdir. A new attribute "df_feature_{q}normed_scaled" is added to each
        cytof_img class instance, which is re-saved in place.
    """
    if df_io is None:
        df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))

    for _i, normq in enumerate(normqs):
        n_attr = f"df_feature_{normq}normed"
        n_attr_scaled = f"{n_attr}_scaled"
        file_scale = files_scale[_i] if files_scale is not None \
            else os.path.join(outdir, "{}normed_scale_params.csv".format(normq))

        # saving directory of scaled normed feature
        dirq = os.path.join(outdir, f"feature_{normq}normed_scaled")
        if not os.path.exists(dirq):
            os.makedirs(dirq)

        # load scaling parameters: row 0 holds the means, row 1 the std devs
        df_scale = pd.read_csv(file_scale, index_col=False)
        m = df_scale.iloc[0]  # mean
        s = df_scale.iloc[1]  # std.dev

        # scale and save each ROI's feature table
        for f_cytof in df_io['output_file']:
            # context managers so the pickle handles are closed promptly
            # (the original leaked the file objects returned by open())
            with open(f_cytof, "rb") as fh:
                cytof_img = pkl.load(fh)
            assert hasattr(cytof_img, n_attr), f"attribute {n_attr} not exist"
            df_feat = copy.deepcopy(getattr(cytof_img, n_attr))

            # every scaling column must already exist in the feature table
            assert len([x for x in df_scale.columns if x not in df_feat.columns]) == 0

            # z-score with the batch-level statistics (aligned on column names)
            df_feat[df_scale.columns] = (df_feat[df_scale.columns] - m) / s

            # save scaled feature to csv
            df_feat.to_csv(os.path.join(dirq, os.path.basename(f_cytof).replace('.pkl', '.csv')),
                           index=False)

            # add attribute "df_feature_{q}normed_scaled"
            setattr(cytof_img, n_attr_scaled, df_feat)

            # save updated cytof_img class instance
            with open(f_cytof, "wb") as fh:
                pkl.dump(cytof_img, fh)
539
+
540
+
541
def batch_generate_summary(outdir, feature_type="normed", normq=75, scaled=True, vis_thres=False):
    """
    Generate per-ROI marker-positive summaries using thresholds computed on the whole cohort.
    Inputs:
    outdir = output saving directory, which contains the scale file generated previously, as well as previously saved
    cytof_img class instances in .pkl files (listed in outdir/input_output.csv)
    feature_type = type of feature to be used, available choices: "original", "normed", "scaled"
    normq = q value of quantile normalization
    scaled = a flag indicating whether or not use the scaled version of features (Default=False)
    vis_thres = a flag indicating whether or not visualize the process of calculating thresholds (Default=False)
    Outputs: dir_sum = path of the folder the summaries were written to
    Two .csv files, one for cell sum and the other for cell average features, are saved for each ROI, containing the
    threshold and cell count information of each feature, in the subfolder "marker_summary" under outdir.
    Each cytof_img pickle is also updated in place with new
    "cell_count_<feat_name>_sum" / "cell_count_<feat_name>_ave" attributes.
    """
    # NOTE(review): the `scaled` parameter is currently unused -- the scaled
    # variant is selected via feature_type="scaled" instead; confirm intent.
    assert feature_type in ["original", "normed", "scaled"], 'accepted feature types are "original", "normed", "scaled"'
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    # attribute name of the selected feature dataframe on each CytofImage
    n_attr = f"df_feature_{feat_name}"

    # summaries go to outdir/marker_summary/<feat_name>
    dir_sum = os.path.join(outdir, "marker_summary", feat_name)
    print(dir_sum)
    if not os.path.exists(dir_sum):
        os.makedirs(dir_sum)

    seen = 0
    dfs = {}
    cytofs = {}
    # one row per ROI: input file and path to the pickled CytofImage instance
    df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))
    for f in df_io['output_file'].tolist():
        f_roi = os.path.basename(f).split(".pkl")[0]
        cytof_img = pkl.load(open(f, "rb"))

        ##### updated #####
        # NOTE(review): df_feat is unused -- dfs[f] re-reads the same attribute below.
        df_feat = getattr(cytof_img, n_attr)
        dfs[f] = getattr(cytof_img, n_attr)
        cytofs[f] = cytof_img
        ##### end updated #####

        # feature-name lists are read once from the first ROI; assumed
        # identical across all ROIs of the cohort
        if seen == 0:
            feat_cell_sum = cytof_img.features['cell_sum']
            feat_cell_ave = cytof_img.features['cell_ave']
            seen += 1

    ##### updated #####
    # Thresholds are computed once on the pooled, cohort-level feature table...
    all_df = pd.concat(dfs.values(), ignore_index=True)
    print("Getting thresholds for marker sum")
    thres_sum = _get_thresholds(all_df, feat_cell_sum, visualize=vis_thres)
    print("Getting thresholds for marker average")
    thres_ave = _get_thresholds(all_df, feat_cell_ave, visualize=vis_thres)
    # ...then applied per ROI so counts are comparable across the cohort.
    for f, cytof_img in cytofs.items():
        f_roi = os.path.basename(f).split(".pkl")[0]
        df_info_cell_sum_f = generate_summary(dfs[f], features=feat_cell_sum, thresholds=thres_sum)
        df_info_cell_ave_f = generate_summary(dfs[f], features=feat_cell_ave, thresholds=thres_ave)
        # cache the summaries on the instance for downstream analyses
        setattr(cytof_img, f"cell_count_{feat_name}_sum", df_info_cell_sum_f)
        setattr(cytof_img, f"cell_count_{feat_name}_ave", df_info_cell_ave_f)
        df_info_cell_sum_f.to_csv(os.path.join(dir_sum, f"{f_roi}_cell_count_sum.csv"), index=False)
        df_info_cell_ave_f.to_csv(os.path.join(dir_sum, f"{f_roi}_cell_count_ave.csv"), index=False)
        # persist the updated instance back to its pickle
        pkl.dump(cytof_img, open(f, "wb"))
    return dir_sum
604
+
605
+
606
+
607
def _gather_roi_expressions(df_io, normqs=None):
    """Collect cell-level "sum" marker expressions for every ROI in the cohort.

    Args:
        df_io: dataframe with one row per ROI; must contain columns "ROI" and
            "output_file" (path to the ROI's pickled CytofImage instance).
        normqs: list of q values of the percentile normalizations to gather.
            Defaults to [75], matching the previous behavior.

    Returns:
        expressions: dict mapping ROI -> flat list of raw expression values of
            all "cell_sum" features, pooled over markers.
        expressions_normed: dict mapping ROI -> {q: flat list of the
            corresponding q-th percentile-normalized values}.
    """
    # BUGFIX: avoid the mutable default argument `normqs=[75]`.
    if normqs is None:
        normqs = [75]
    expressions = {}
    expressions_normed = {}
    for roi in df_io["ROI"].unique():
        # Load the pickled CytofImage for this ROI (first match wins if the
        # ROI appears more than once in df_io).
        f_cytof_im = df_io.loc[df_io["ROI"] == roi, "output_file"].values[0]
        cytof_im = load_CytofImage(f_cytof_im)

        # Pool the raw values of every cell-sum feature into one flat list.
        expressions[roi] = []
        for feature_name in cytof_im.features['cell_sum']:
            expressions[roi].extend(cytof_im.df_feature[feature_name])

        # Same pooling for each requested percentile normalization.
        # (The old code first built {q: {}} and immediately overwrote each
        # value with a list; the dict is now built directly.)
        expressions_normed[roi] = {}
        for q in normqs:
            expressions_normed[roi][q] = []
            normed_feat = getattr(cytof_im, "df_feature_{}normed".format(q))
            for feature_name in cytof_im.features['cell_sum']:
                expressions_normed[roi][q].extend(normed_feat[feature_name])
    return expressions, expressions_normed
624
+
625
+
626
def visualize_normalization(df_slide_roi, normqs=None, level="slide"):
    """Plot histograms of pooled marker expressions before and after
    percentile normalization, one figure per slide (or per ROI).

    Args:
        df_slide_roi: dataframe with "Slide", "ROI" and "output_file" columns.
        normqs: list of q values of the percentile normalizations to show.
            Defaults to [75], matching the previous behavior.
        level: "slide" pools all ROIs of a slide into one plot; anything else
            plots per ROI.

    Returns:
        (expressions, expressions_normed) keyed by slide (or ROI).
    """
    # BUGFIX: avoid the mutable default argument `normqs=[75]`.
    if normqs is None:
        normqs = [75]
    expressions_, expressions_normed_ = _gather_roi_expressions(df_slide_roi, normqs=normqs)
    if level == "slide":
        prefix = "Slide"
        # Pool every ROI's values into its parent slide.
        expressions, expressions_normed = {}, {}
        for slide in df_slide_roi["Slide"].unique():
            f_rois = df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"].values
            rois = [x.replace('.txt', '') for x in f_rois]
            expressions[slide] = []
            expressions_normed[slide] = dict((q, []) for q in normqs)
            for roi in rois:
                expressions[slide].extend(expressions_[roi])
                for q in expressions_normed[slide].keys():
                    expressions_normed[slide][q].extend(expressions_normed_[roi][q])
    else:
        expressions, expressions_normed = expressions_, expressions_normed_
        prefix = "ROI"
    num_q = len(normqs)
    for key, key_exp in expressions.items():  # one figure per slide (or ROI)
        print("Showing {} {}".format(prefix, key))
        fig, ax = plt.subplots(1, num_q + 1, figsize=(4 * (num_q + 1), 4))
        # Raw values are log2-transformed for display; +0.0001 avoids log2(0).
        ax[0].hist((np.log2(np.array(key_exp) + 0.0001),), 100, density=True)
        ax[0].set_title("Before normalization")
        ax[0].set_xlabel("log2(cellular expression of all markers)")
        for i, q in enumerate(normqs):
            # NOTE(review): normalized values are plotted without log2 even
            # though the x-label says log2 -- presumably the normalization is
            # already log-like; confirm against the normalization code.
            ax[i + 1].hist((np.array(expressions_normed[key][q]) + 0.0001,), 100, density=True)
            ax[i + 1].set_title("After {}-th percentile normalization".format(q))
            ax[i + 1].set_xlabel("log2(cellular expression of all markers)")
        plt.show()
    return expressions, expressions_normed
658
+
659
+
660
+ ###########################################################
661
+ ############# marker level analysis functions #############
662
+ ###########################################################
663
+
664
+ ############# marker co-expression analysis #############
665
def _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type):
    """ROI-level marker co-expression statistics.

    For every ROI whose CytofImage pickle exists under
    ``outdir/cytof_images``, computes:
      * expected_percentages[roi]: n_marker x n_marker matrix of
        pos_count_i * pos_count_j (the caller divides by n_cell**2 to obtain
        the expected co-positive fraction under independence),
      * edge_percentages[roi]: n_marker x n_marker matrix counting cells that
        are positive for both marker i and marker j,
      * num_cells[roi]: number of cells in the ROI.

    Returns:
        (expected_percentages, edge_percentages, num_cells,
         marker_all, marker_col_all)
    """
    n_attr = f"df_feature_{feat_name}"
    expected_percentages = {}
    edge_percentages = {}
    num_cells = {}

    first_roi = True  # True until the first pickle is successfully loaded
    for f_roi in df_slide_roi["ROI"].unique():
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if not f_cytof_im in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)

        if first_roi:
            # Marker (gene) columns are shared by all ROIs: read them once.
            # BUGFIX: this used to key off the loop index (seen_roi == 0), so
            # a missing FIRST pickle left these names undefined for every
            # later ROI (NameError).
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            n_marker = len(marker_col_all)
            first_roi = False

        # BUGFIX: the cell count, positive counts and thresholds are per-ROI
        # quantities; they were previously read inside the first-ROI block and
        # therefore frozen at the first ROI's values for the whole cohort.
        # (Thresholds are cohort-level and identical across ROIs, so reading
        # them per ROI is behavior-neutral for them.)
        n_cell = len(df_feat)
        df_info_cell = getattr(cytof_im, "cell_count_{}_{}".format(feat_name, accumul_type))
        pos_nums = df_info_cell["positive counts"].values
        thresholds = df_info_cell["threshold"].values

        # expected_percentage[i, j] = pos_i * pos_j; an N x N matrix where N
        # is the number of markers, normalized by the caller to the fraction
        # expected if markers i and j were positive independently.
        expected_percentage = np.zeros((n_marker, n_marker))
        for ii in range(n_marker):
            for jj in range(n_marker):
                expected_percentage[ii, jj] = pos_nums[ii] * pos_nums[jj]
        expected_percentages[roi] = expected_percentage

        # edge_nums[i, j] = number of cells positive for BOTH marker i and j
        # (positivity defined by the previously computed thresholds).
        edge_nums = np.zeros_like(expected_percentage)
        for ii in range(n_marker):
            _x = df_feat[marker_col_all[ii]].values > thresholds[ii]
            for jj in range(n_marker):
                _y = df_feat[marker_col_all[jj]].values > thresholds[jj]
                edge_nums[ii, jj] = np.sum(np.all([_x, _y], axis=0))
        edge_percentages[roi] = edge_nums
        num_cells[roi] = n_cell
    return expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all
718
+
719
+
720
def co_expression_analysis(df_slide_roi, outdir, feature_type, accumul_type, co_exp_markers="all", normq=75,
                           level="slide", clustergrid=None):
    """Marker co-expression analysis at slide or ROI level.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: output directory holding "cytof_images/<slide>_<roi>.pkl".
        feature_type: one of "original", "normed", "scaled".
        accumul_type: feature accumulation type, e.g. "sum" or "ave".
        co_exp_markers: "all" or a list of marker names to analyze.
        normq: q value of the percentile normalization (default 75).
        level: aggregation level, "slide" or "roi".
        clustergrid: a previously fitted seaborn ClusterGrid whose row order
            should be reused; if None, a new clustermap is fitted.

    Returns:
        (co_exps, marker_idx, clustergrid) where co_exps maps slide/ROI ->
        log10 observed/expected co-expression matrix.
    """
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)

    expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all = \
        _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type)

    # Optionally restrict to a subset of markers (affects display only; the
    # gathered matrices stay full-size and are indexed by marker_idx below).
    if co_exp_markers != "all":
        assert (isinstance(co_exp_markers, list) and all([x in marker_all for x in co_exp_markers]))
        marker_idx = np.array([marker_all.index(x) for x in co_exp_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))

    if level == "slide":
        # Pool the ROI-level matrices into their parent slide.
        for slide in df_slide_roi["Slide"].unique():
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in expected_percentages:
                    continue  # pickle was missing; ROI skipped upstream
                # BUGFIX: initialize on the first *available* ROI. Keying off
                # the loop index (seen_roi == 0) raised KeyError whenever a
                # slide's first ROI had no pickle.
                if slide not in expected_percentages:
                    expected_percentages[slide] = expected_percentages[roi]
                    edge_percentages[slide] = edge_percentages[roi]
                    num_cells[slide] = num_cells[roi]
                else:
                    expected_percentages[slide] += expected_percentages[roi]
                    edge_percentages[slide] += edge_percentages[roi]
                    num_cells[slide] += num_cells[roi]
                expected_percentages.pop(roi)
                edge_percentages.pop(roi)
                num_cells.pop(roi)

    co_exps = {}
    for key, expected_percentage in expected_percentages.items():
        expected_percentage = expected_percentage / num_cells[key] ** 2
        edge_percentage = edge_percentages[key] / num_cells[key]

        # Observed / expected co-positive rate on a log10 scale; the +0.1
        # keeps the logarithm finite when the observed rate is zero.
        edge_percentage_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # 0/0 pairs (no positive cells for either marker) -> neutral value
        edge_percentage_norm[np.isnan(edge_percentage_norm)] = np.log10(1 + 0.1)

        co_exps[key] = edge_percentage_norm

    # Plot: heatmap plus clustermap per slide/ROI; the clustering row order is
    # fitted once (on the first matrix, unless a clustergrid was supplied) and
    # reused so all plots are comparable.
    for f_key, edge_percentage_norm in co_exps.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(edge_percentage_norm[marker_idx, :][:, marker_idx], center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=3,
                         xticklabels=marker_all, yticklabels=marker_all)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx],
                                         center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
                                         xticklabels=marker_all, yticklabels=marker_all, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        plt.figure()
        sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx]
                       [clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
                       xticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       yticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return co_exps, marker_idx, clustergrid
812
+
813
+ ############# marker correlation #############
814
+ from scipy.stats import spearmanr
815
+
816
def _gather_roi_corr(df_slide_roi, outdir, feat_name, accumul_type):
    """Collect per-ROI feature dataframes for marker correlation analysis.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: output directory holding "cytof_images/<slide>_<roi>.pkl".
        feat_name: feature variant suffix, e.g. "75normed_scaled".
        accumul_type: feature accumulation type, e.g. "sum" or "ave".

    Returns:
        (feats, marker_all, marker_col_all) where feats maps
        roi -> feature dataframe, and the marker name/column lists are read
        from the first successfully loaded ROI (assumed identical across ROIs).
    """
    n_attr = f"df_feature_{feat_name}"
    feats = {}

    first_roi = True  # True until the first pickle is successfully loaded
    for f_roi in df_slide_roi["ROI"].unique():  # for each ROI
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if not f_cytof_im in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)
        feats[roi] = df_feat

        if first_roi:
            # BUGFIX: previously keyed off the loop index (seen_roi == 0), so
            # the marker lists stayed undefined whenever the first ROI's
            # pickle was missing, crashing the return statement.
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            first_roi = False
    return feats, marker_all, marker_col_all
838
+
839
+
840
def correlation_analysis(df_slide_roi, outdir, feature_type, accumul_type, corr_markers="all", normq=75, level="slide",
                         clustergrid=None):
    """Pairwise Spearman correlation between markers, at slide or ROI level.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: output directory holding "cytof_images/<slide>_<roi>.pkl".
        feature_type: one of "original", "normed", "scaled".
        accumul_type: feature accumulation type, e.g. "sum" or "ave".
        corr_markers: "all" or a list of marker names to display.
        normq: q value of the percentile normalization (default 75).
        level: aggregation level, "slide" or "roi".
        clustergrid: previously fitted seaborn ClusterGrid whose row order is
            reused; if None, a new clustermap is fitted on the first matrix.

    Returns:
        (corrs, marker_idx, clustergrid) where corrs maps slide/ROI -> full
        n_marker x n_marker Spearman correlation matrix.
    """
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)

    feats, marker_all, marker_col_all = _gather_roi_corr(df_slide_roi, outdir, feat_name, accumul_type)
    n_marker = len(marker_all)

    corrs = {}
    if level == "slide":
        # Pool per-ROI feature tables into their parent slide.
        for slide in df_slide_roi["Slide"].unique():
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in feats:
                    continue  # pickle was missing; ROI skipped upstream
                # BUGFIX: initialize on the first *available* ROI. Keying off
                # the loop index (seen_roi == 0) raised KeyError whenever a
                # slide's first ROI had no pickle.
                if slide not in feats:
                    feats[slide] = feats[roi]
                else:
                    feats[slide] = pd.concat([feats[slide], feats[roi]])
                feats.pop(roi)

    # Correlations are computed on the FULL marker set; marker subsetting
    # below only affects the visualization.
    for key, feat in feats.items():
        correlation = np.zeros((n_marker, n_marker))
        for i, feature_i in enumerate(marker_col_all):
            for j, feature_j in enumerate(marker_col_all):
                correlation[i, j] = spearmanr(feat[feature_i].values, feat[feature_j].values).correlation
        corrs[key] = correlation

    if corr_markers != "all":
        assert (isinstance(corr_markers, list) and all([x in marker_all for x in corr_markers]))
        marker_idx = np.array([marker_all.index(x) for x in corr_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))

    # plot
    for f_key, corr in corrs.items():
        plt.figure(figsize=(6, 6))
        # BUGFIX: tick labels now come from marker_all; the old code passed
        # corr_markers, which put the literal string "all" on the axes in the
        # default case. (When a marker list is supplied, marker_all equals
        # that list, so labeled plots are unchanged.)
        # NOTE(review): center=log10(1.1) is inherited from the co-expression
        # plots; 0 may be the more natural center for correlations -- confirm.
        ax = sns.heatmap(corr[marker_idx, :][:, marker_idx], center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1,
                         xticklabels=marker_all, yticklabels=marker_all)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            # Fit the row order once and reuse it for every later matrix.
            plt.figure()
            clustergrid = sns.clustermap(corr[marker_idx, :][:, marker_idx],
                                         center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=marker_all, yticklabels=marker_all, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        plt.figure()
        sns.clustermap(corr[marker_idx, :][:, marker_idx]
                       [clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                       xticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       yticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return corrs, marker_idx, clustergrid
917
+
918
+ ############# marker interaction #############
919
+
920
+ from sklearn.neighbors import DistanceMetric
921
+ from tqdm import tqdm
922
+
923
def _gather_roi_interact(df_slide_roi, outdir, feat_name, accumul_type, interact_markers="all", thres_dist=50):
    """ROI-level marker-interaction (spatial neighborhood) statistics.

    For each ROI, counts edges between cells whose Euclidean distance is
    strictly between 0 and ``thres_dist`` (coordinate units), and accumulates,
    per marker pair (m, n), how many neighboring cell pairs are positive for m
    and n respectively.

    Note: ``interact_markers`` is accepted for API compatibility, but the full
    marker set is always gathered; callers subset the matrices afterwards.

    Returns:
        (edge_percentages, num_edges, marker_all, marker_col_all) where
        edge_percentages maps roi -> raw edge-count matrix (NOT yet divided by
        num_edges[roi]; the caller normalizes).
    """
    dist = DistanceMetric.get_metric('euclidean')
    n_attr = f"df_feature_{feat_name}"
    edge_percentages = {}
    num_edges = {}

    first_roi = True  # True until the first pickle is successfully loaded
    for f_roi in df_slide_roi["ROI"].unique():  # for each ROI
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if not f_cytof_im in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)
        n_cell = len(df_feat)
        # Pairwise cell-cell distance matrix (n_cell x n_cell).
        dist_matrix = dist.pairwise(df_feat.loc[:, ['coordinate_x', 'coordinate_y']].values)

        if first_roi:
            # BUGFIX: previously keyed off the loop index (seen_roi == 0), so
            # a missing first pickle left the marker lists and thresholds
            # undefined for every later ROI.
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            n_marker = len(marker_col_all)

            # Thresholds are cohort-level (identical on every instance), so
            # reading them once is sufficient.
            df_info_cell = getattr(cytof_im, "cell_count_{}_{}".format(feat_name, accumul_type))
            thresholds = df_info_cell["threshold"].values
            first_roi = False

        n_edges = 0
        edge_nums = np.zeros((n_marker, n_marker))

        # Set of positive markers for each cell.
        cluster_sub = []
        for i_cell in range(n_cell):
            _temp = set()
            for k in range(n_marker):
                if df_feat[marker_col_all[k]].values[i_cell] > thresholds[k]:
                    _temp = _temp | {k}
            cluster_sub.append(_temp)

        # O(n_cell^2) neighbor scan. Each unordered pair is visited twice
        # (i->j and j->i) -- consistently for both the edge count and the
        # per-marker-pair tallies, so the normalized ratio is unaffected.
        for i in tqdm(range(n_cell)):
            for j in range(n_cell):
                if dist_matrix[i, j] > 0 and dist_matrix[i, j] < thres_dist:
                    n_edges += 1
                    for m in cluster_sub[i]:
                        for n in cluster_sub[j]:
                            edge_nums[m, n] += 1

        edge_percentages[roi] = edge_nums
        num_edges[roi] = n_edges
    return edge_percentages, num_edges, marker_all, marker_col_all
974
+
975
+
976
def interaction_analysis(df_slide_roi,
                         outdir,
                         feature_type,
                         accumul_type,
                         interact_markers="all",
                         normq=75,
                         level="slide",
                         thres_dist=50,
                         clustergrid=None):
    """Spatial marker-interaction analysis at slide or ROI level.

    For every marker pair (i, j), compares the observed fraction of
    neighboring cell pairs (closer than thres_dist) positive for i and j
    against the fraction expected under independence, on a log10 scale.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns
        outdir: output directory holding "cytof_images/<slide>_<roi>.pkl"
        feature_type: one of "original", "normed", "scaled"
        accumul_type: feature accumulation type, e.g. "sum" or "ave"
        interact_markers: "all" or a list of marker names to display
        normq: q value of the percentile normalization (default 75)
        level: aggregation level, "slide" or "roi"
        thres_dist: distance threshold (coordinate units) defining neighbors
        clustergrid: previously fitted seaborn ClusterGrid whose row order is
            reused; if None, a new clustermap is fitted
    Returns:
        (interacts, clustergrid) where interacts maps slide/ROI -> full
        (unsubset) normalized interaction matrix
    """
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)
    dir_cytof_img = os.path.join(outdir, "cytof_images")

    # Expected co-positive fractions come from the co-expression gatherer;
    # observed neighbor counts come from the spatial gatherer.
    # NOTE(review): the gather call below hard-codes interact_markers="all";
    # subsetting happens only at plot time via marker_idx.
    expected_percentages, _, num_cells, marker_all_, marker_col_all_ = \
        _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type)
    edge_percentages, num_edges, marker_all, marker_col_all = \
        _gather_roi_interact(df_slide_roi, outdir, feat_name, accumul_type, interact_markers="all",
                             thres_dist=thres_dist)

    if level == "slide":
        # Pool ROI-level matrices into their parent slide.
        # NOTE(review): if a slide's FIRST ROI pickle is missing, seen_roi == 0
        # never fires for a loaded ROI and the `+=` branch raises KeyError --
        # same pattern as co_expression_analysis; confirm and fix.
        for slide in df_slide_roi["Slide"].unique(): ## for each slide
            for seen_roi, f_roi in enumerate(df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]): ## for each ROI
                roi = f_roi.replace(".txt", "")
                if roi not in expected_percentages:
                    continue
                if seen_roi == 0:
                    expected_percentages[slide] = expected_percentages[roi]
                    edge_percentages[slide] = edge_percentages[roi]
                    num_edges[slide] = num_edges[roi]
                    num_cells[slide] = num_cells[roi]
                else:
                    expected_percentages[slide] += expected_percentages[roi]
                    edge_percentages[slide] += edge_percentages[roi]
                    num_edges[slide] += num_edges[roi]
                    num_cells[slide] += num_cells[roi]
                expected_percentages.pop(roi)
                edge_percentages.pop(roi)
                num_edges.pop(roi)
                num_cells.pop(roi)

    # Optional marker subset: display only; matrices stay full-size.
    if interact_markers != "all":
        assert (isinstance(interact_markers, list) and all([x in marker_all for x in interact_markers]))
        marker_idx = np.array([marker_all.index(x) for x in interact_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))

    interacts = {}
    for key, edge_percentage in edge_percentages.items():
        # expected fraction under independence; observed neighbor fraction
        expected_percentage = expected_percentages[key] / num_cells[key] ** 2
        edge_percentage = edge_percentage / num_edges[key]

        # Normalize: log10 of observed/expected; +0.1 keeps log10 finite at 0
        edge_percentage_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # Fix Nan (0/0 pairs) -> neutral value log10(1.1)
        edge_percentage_norm[np.isnan(edge_percentage_norm)] = np.log10(1 + 0.1)
        interacts[key] = edge_percentage_norm

    # plot: heatmap + clustermap per slide/ROI; row order fitted once
    # NOTE(review): when interact_markers == "all", the tick labels below
    # receive the literal string "all" instead of marker names -- consider
    # using marker_all for the labels.
    for f_key, interact_ in interacts.items():
        interact = interact_[marker_idx, :][:, marker_idx]
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(interact, center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1,
                         xticklabels=interact_markers, yticklabels=interact_markers)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(interact, center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=interact_markers, yticklabels=interact_markers, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        plt.figure()
        sns.clustermap(
            interact[clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
            center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
            xticklabels=np.array(interact_markers)[clustergrid.dendrogram_row.reordered_ind],
            yticklabels=np.array(interact_markers)[clustergrid.dendrogram_row.reordered_ind],
            figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return interacts, clustergrid
1074
+
1075
+ ###########################################################
1076
+ ######## Pheno-Graph clustering analysis functions ########
1077
+ ###########################################################
1078
+
1079
def clustering_phenograph(cohort_file, outdir, normq=75, feat_comb="all", k=None, save_vis=False, pheno_markers="all"):
    """Perform Pheno-graph clustering for the cohort
    Inputs:
    cohort_file = a .csv file include the whole cohort
    outdir = output saving directory, previously saved cytof_img class instances in .pkl files
    normq = q value for quantile normalization
    feat_comb = desired feature combination to be used for phenograph clustering, acceptable choices: "all",
    "cell_sum", "cell_ave", "cell_sum_only", "cell_ave_only" (Default="all")
    k = number of initial neighbors to run Pheno-graph (Default=None)
    If k is not provided, k is set to N / 100, where N is the total number of single cells
    save_vis = a flag indicating whether to save the visualization output (Default=False)
    pheno_markers = a list of markers used in phenograph clustering (must be a subset of cytof_img.markers)
    Outputs:
    df_all = a dataframe of features for all cells in the cohort, with the clustering output saved in the column
    'phenotype_total{n_community}', where n_community stands for the total number of communities defined by the cohort
    feat_names = feature names (columns) used to generate PhenoGraph output
    k = the initial number of k used to run PhenoGraph
    pheno_name = the column name of the added column indicating phenograph cluster
    vis_savedir = the directory to save the visualization output ("" when save_vis is False)
    markers = the list of markers used (minimal, for visualization purposes)
    """

    vis_savedir = ""
    # Which feature groups feed the clustering, per feat_comb choice.
    feat_groups = {
        "all": ["cell_sum", "cell_ave", "cell_morphology"],
        "cell_sum": ["cell_sum", "cell_morphology"],
        "cell_ave": ["cell_ave", "cell_morphology"],
        "cell_sum_only": ["cell_sum"],
        "cell_ave_only": ["cell_ave"]
    }
    assert feat_comb in feat_groups.keys(), f"{feat_comb} not supported!"

    # Clustering always runs on the scaled, normalized feature variant.
    feat_name = f"_{normq}normed_scaled"
    n_attr = f"df_feature{feat_name}"

    dfs = {}
    cytof_ims = {}

    df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))
    # NOTE(review): df_slide_roi is read but never used in this function.
    df_slide_roi = pd.read_csv(cohort_file)

    # load all scaled feature in the cohort
    for i in df_io.index:
        f_out = df_io.loc[i, "output_file"]
        f_roi = f_out.split('/')[-1].split('.pkl')[0]
        if not os.path.isfile(f_out):
            print("{} not found, skip".format(f_out))
            continue

        cytof_img = load_CytofImage(f_out)
        # NOTE(review): keyed on the row index -- if row 0's file is missing,
        # dict_feat/markers stay undefined and the loop below raises
        # NameError; consider a "first loaded" flag instead.
        if i == 0:
            dict_feat = cytof_img.features
            markers = cytof_img.markers
        cytof_ims[f_roi] = cytof_img
        dfs[f_roi] = getattr(cytof_img, n_attr)

    # Build the list of feature columns to cluster on.
    feat_names = []
    for y in feat_groups[feat_comb]:
        if "morphology" in y:
            # Morphology features are marker-independent: always include all.
            feat_names += dict_feat[y]
        else:
            if pheno_markers == "all":
                feat_names += dict_feat[y]
                # After this reassignment, later groups take the else branch;
                # ids then covers every marker, so the result is equivalent.
                pheno_markers = markers
            else:
                assert isinstance(pheno_markers, list)
                ids = [markers.index(x) for x in pheno_markers]
                feat_names += [dict_feat[y][x] for x in ids]
    # concatenate feature dataframes of all rois in the cohort
    df_all = pd.concat([_ for key, _ in dfs.items()])

    # set number of nearest neighbors k and run PhenoGraph for phenotype clustering
    # (phenograph and umap are module-level dependencies of this file)
    k = k if k else int(df_all.shape[0] / 100)  # default: N cells / 100
    communities, graph, Q = phenograph.cluster(df_all[feat_names], k=k, n_jobs=-1)  # run PhenoGraph
    n_community = len(np.unique(communities))

    # Visualize
    ## project to 2D (fixed random_state for reproducibility)
    umap_2d = umap.UMAP(n_components=2, init='random', random_state=0)
    proj_2d = umap_2d.fit_transform(df_all[feat_names])

    # plot the whole cohort in one scatter, colored by community
    print("Visualization in 2d - cohort")
    plt.figure(figsize=(4, 4))
    plt.title("cohort")
    sns.scatterplot(x=proj_2d[:, 0], y=proj_2d[:, 1], hue=communities, palette='tab20',
                    hue_order=np.arange(n_community))
    plt.axis('tight')
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    if save_vis:
        vis_savedir = os.path.join(outdir, "phenograph_{}_{}normed_{}".format(feat_comb, normq, k))
        if not os.path.exists(vis_savedir):
            os.makedirs(vis_savedir)
        plt.savefig(os.path.join(vis_savedir, "cluster_scatter.png"))
    plt.show()

    # attach clustering output and the 2D projection to df_all
    pheno_name = f'phenotype_total{n_community}'
    df_all[pheno_name] = communities
    df_all['{}_projx'.format(pheno_name)] = proj_2d[:,0]
    df_all['{}_projy'.format(pheno_name)] = proj_2d[:,1]
    return df_all, feat_names, k, pheno_name, vis_savedir, markers
1185
+
1186
+
1187
+ def _gather_roi_pheno(df_slide_roi, df_all):
1188
+ """Split whole df into df for each ROI"""
1189
+ pheno_roi = {}
1190
+
1191
+ for i in df_slide_roi.index:
1192
+ path_i = df_slide_roi.loc[i, "path"]
1193
+ roi_i = df_slide_roi.loc[i, "ROI"]
1194
+ f_in = os.path.join(path_i, roi_i)
1195
+ cond = df_all["filename"] == f_in
1196
+ pheno_roi[roi_i.replace(".txt", "")] = df_all.loc[cond, :]
1197
+ return pheno_roi
1198
+
1199
+
1200
def _vis_cell_phenotypes(df_feat, communities, n_community, markers, list_features, accumul_type="sum", savedir=None, savename=""):
    """ Visualize cell phenotypes for a given dataframe of feature
    Args:
        df_feat: a dataframe of features
        communities: a list of communities (can be a subset of the cohort communities, but should be consistent with df_feat)
        n_community: number of communities in the cohort (n_community >= number of unique values in communities)
        markers: a list of markers used in CyTOF image (to be present in the heatmap visualization);
            must align index-wise with list_features, since markers supply the x-tick labels
        list_features: a list of feature names (consistent with columns in df_feat)
        accumul_type: feature aggregation type, choose from "sum" and "ave" (default="sum")
        savedir: results saving directory. If not None, visualization plots will be saved in the desired directory (default=None)
        savename: prefix of the saved figure file name; "_cell_<accumul_type>.png" is appended (default="")
    Returns:
        cell_cluster: a (N, M) matrix, where N = # of clustered communities, and M = # of markers

        cell_cluster_norm: the normalized form of cell_cluster (normalized by subtracting the median value)
    """
    assert accumul_type in ["sum", "ave"], "Wrong accumulation type! Choose from 'sum' and 'ave'!"
    # NOTE(review): assumes community labels are contiguous integers starting
    # at 0; labels absent from `communities` keep an all-zero row, and labels
    # >= len(unique) would be skipped -- confirm against PhenoGraph output.
    cell_cluster = np.zeros((n_community, len(markers)))
    for cluster in range(len(np.unique(communities))):
        df_sub = df_feat[communities == cluster]
        if df_sub.shape[0] == 0:
            continue

        # mean feature value over the cluster's cells, one column per feature
        for i, feat in enumerate(list_features):  # for each feature in the list of features
            cell_cluster[cluster, i] = np.average(df_sub[feat])
    # center each marker column at its median across clusters
    cell_cluster_norm = cell_cluster - np.median(cell_cluster, axis=0)
    # drawn on the current axes: the caller controls figure creation/size
    sns.heatmap(cell_cluster_norm,
                cmap='magma',
                xticklabels=markers,
                yticklabels=np.arange(len(np.unique(communities)))
                )
    plt.xlabel("Markers - {}".format(accumul_type))
    plt.ylabel("Phenograph clusters")
    plt.title("normalized expression - cell {}".format(accumul_type))
    savename += "_cell_{}.png".format(accumul_type)
    if savedir is not None:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        plt.savefig(os.path.join(savedir, savename))
    plt.show()
    return cell_cluster, cell_cluster_norm
1240
+
1241
def vis_phenograph(df_slide_roi, df_all, pheno_name, markers, used_feat, level="cohort", accumul_type="sum",
                   to_save=False, savepath="./", vis_scatter=False):
    """Visualize Pheno-Graph clustering results at cohort, slide or ROI level.

    Args:
        df_slide_roi = a dataframe with slide-roi correspondence information included
        df_all = dataframe with feature and clustering results included
        pheno_name = name (key) of the phenograph output
        markers = a (minimal) list of markers used in Pheno-Graph (to visualize)
        used_feat = a list of features used (should be consistent with columns available in df_all)
        level = level to visualize, choose from "cohort", "slide", or "roi" (default="cohort")
        accumul_type = type of feature accumulation used (default="sum")
        to_save = a flag indicating whether or not save output (default=False)
        savepath = visualization saving directory (default="./")
        vis_scatter = a flag indicating whether to plot the 2-d scatter projection (default=False)

    Returns:
        phenos = dict mapping the level key (cohort / slide name / roi name) to its sub-dataframe
    """
    assert level in ["cohort", "slide", "roi"], "Only 'cohort', 'slide' or 'roi' levels are accepted!"
    if to_save and not os.path.exists(savepath):
        # BUG FIX: the original referenced `os.makedirs` without calling it,
        # so the directory was never created.
        os.makedirs(savepath)

    # features matching the requested accumulation type (e.g. "*_sum" or "*_ave")
    ids = [i for (i, x) in enumerate(used_feat) if re.search(".{}".format(accumul_type), x)]
    list_feat = [used_feat[i] for i in ids]

    n_community = len(df_all[pheno_name].unique())
    if level == "cohort":
        phenos = {level: df_all}
    else:
        phenos = _gather_roi_pheno(df_slide_roi, df_all)
        if level == "slide":
            # merge per-ROI dataframes into their parent slide entry
            # BUG FIX: the original iterated over undefined `df_io`; the
            # slide-ROI correspondence lives in `df_slide_roi`.
            for slide in df_slide_roi["Slide"].unique():
                for seen_roi, roi_i in enumerate(df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]):
                    f_roi = roi_i.replace(".txt", "")
                    if seen_roi == 0:
                        phenos[slide] = phenos[f_roi]
                    else:
                        phenos[slide] = pd.concat([phenos[slide], phenos[f_roi]])
                    phenos.pop(f_roi)

    savename = ""
    # BUG FIX: `savepath_` was unbound when to_save=False but still passed to
    # _vis_cell_phenotypes; None makes the callee skip saving.
    savepath_ = None
    for key, df_pheno in phenos.items():
        if to_save:
            savepath_ = os.path.join(savepath, level)
            savename = key
        communities = df_pheno[pheno_name]

        _vis_cell_phenotypes(df_pheno, communities, n_community, markers, list_feat, accumul_type,
                             savedir=savepath_, savename=savename)

        # visualize scatter (2-d projection)
        if vis_scatter:
            proj_2d = df_pheno[['{}_projx'.format(pheno_name), '{}_projy'.format(pheno_name)]].to_numpy()
            plt.figure(figsize=(4, 4))
            plt.title("cohort")  # NOTE(review): title is hard-coded; possibly should be `key` — confirm
            sns.scatterplot(x=proj_2d[:, 0], y=proj_2d[:, 1], hue=communities, palette='tab20',
                            hue_order=np.arange(n_community))
            plt.axis('tight')
            plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
            if to_save:
                plt.savefig(os.path.join(savepath_, "scatter_{}.png".format(savename)))
            plt.show()
    return phenos
1320
+
1321
+
1322
+ import sklearn.neighbors
1323
+ from sklearn.neighbors import kneighbors_graph as skgraph
1324
+ from sklearn.metrics import DistanceMetric# from sklearn.neighbors import DistanceMetric
1325
+ from scipy import sparse as sp
1326
+ import networkx as nx
1327
+
1328
def _gather_roi_distances(df_slide_roi, outdir, name_pheno, thres_dist=50):
    """Compute pairwise cell distances and cluster-to-cluster edge counts for each ROI.

    Args:
        df_slide_roi = dataframe with slide-ROI correspondence ("Slide" / "ROI" columns)
        outdir = output directory containing a "cytof_images" subfolder of pickled CytofImage objects
        name_pheno = key of the phenograph result stored on each CytofImage
        thres_dist = distance under which two cells count as interacting (default=50)

    Returns:
        dist_matrices = dict keyed by ROI name, each entry holding 'dist',
                        'expected_percentage', 'num_cell' and 'edge_nums'
    """
    dist = DistanceMetric.get_metric('euclidean')
    dist_matrices = {}
    # BUG FIX: the original assigned n_cluster only when the loop index was 0;
    # if the first ROI's pickle was missing (continue), every later iteration
    # raised NameError. Use a None sentinel instead.
    n_cluster = None
    for f_roi in df_slide_roi['ROI'].unique():
        roi = f_roi.replace('.txt', '')
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_sub = cytof_im.df_feature
        dist_matrices[roi] = {}
        # full pairwise Euclidean distance matrix over cell centroids
        dist_matrices[roi]['dist'] = dist.pairwise(df_sub.loc[:, ['coordinate_x', 'coordinate_y']].values)

        phenograph = getattr(cytof_im, 'phenograph')[name_pheno]
        cluster = phenograph['clusters'].values

        if n_cluster is None:  # community count taken from the first ROI successfully loaded
            n_cluster = phenograph['num_community']

        # expected cluster-pair co-occurrence counts (normalized later by num_cell**2)
        expected_percentage = np.zeros((n_cluster, n_cluster))
        for _i in range(n_cluster):
            for _j in range(n_cluster):
                expected_percentage[_i, _j] = sum(cluster == _i) * sum(cluster == _j)
        dist_matrices[roi]['expected_percentage'] = expected_percentage
        dist_matrices[roi]['num_cell'] = len(df_sub)

        # count edges between clusters: two distinct cells are connected when
        # 0 < distance < thres_dist
        edge_nums = np.zeros_like(expected_percentage)
        dist_matrix = dist_matrices[roi]['dist']
        n_cells = dist_matrix.shape[0]
        for _i in range(n_cells):
            for _j in range(n_cells):
                if 0 < dist_matrix[_i, _j] < thres_dist:
                    edge_nums[cluster[_i], cluster[_j]] += 1
        dist_matrices[roi]['edge_nums'] = edge_nums
    return dist_matrices
1368
+
1369
+
1370
def _gather_roi_kneighbor_graphs(df_slide_roi, outdir, name_pheno, k=8):
    """Build a k-nearest-neighbor graph per ROI and count cluster-to-cluster edges.

    Args:
        df_slide_roi = dataframe with slide-ROI correspondence ("ROI" column)
        outdir = output directory containing a "cytof_images" subfolder of pickled CytofImage objects
        name_pheno = key of the phenograph result stored on each CytofImage
        k = number of nearest neighbors per cell (default=8)

    Returns:
        graphs = dict keyed by ROI name, each entry holding 'I', 'J', 'V', 'graph',
                 'edge_nums', 'expected_percentage' and 'num_cell'
    """
    graphs = {}
    # BUG FIX: the original assigned n_cluster only when the loop index was 0;
    # if the first ROI's pickle was missing (continue), every later iteration
    # raised NameError. Use a None sentinel instead.
    n_cluster = None
    for f_roi in df_slide_roi['ROI'].unique():
        roi = f_roi.replace('.txt', '')
        # NOTE(review): filename pattern here is "<roi>.pkl" while
        # _gather_roi_distances uses "<slide>_<roi>.pkl" — confirm which naming
        # the pipeline actually writes.
        f_cytof_im = "{}.pkl".format(roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_sub = cytof_im.df_feature
        # sparse kNN graph over cell centroid coordinates, weighted by distance
        graph = skgraph(np.array(df_sub.loc[:, ['coordinate_x', 'coordinate_y']]), n_neighbors=k, mode='distance')
        I, J, V = sp.find(graph)

        graphs[roi] = {}
        graphs[roi]['I'] = I  # edge start (center cell)
        graphs[roi]['J'] = J  # edge end (neighbor cell)
        graphs[roi]['V'] = V  # edge weight (distance)
        graphs[roi]['graph'] = graph

        phenograph = getattr(cytof_im, 'phenograph')[name_pheno]
        cluster = phenograph['clusters'].values

        if n_cluster is None:  # community count taken from the first ROI successfully loaded
            n_cluster = phenograph['num_community']

        # count kNN edges between each pair of clusters
        edge_nums = np.zeros((n_cluster, n_cluster))
        for _i, _j in zip(I, J):
            edge_nums[cluster[_i], cluster[_j]] += 1
        graphs[roi]['edge_nums'] = edge_nums

        # expected cluster-pair co-occurrence counts (normalized later by num_cell**2)
        expected_percentage = np.zeros((n_cluster, n_cluster))
        for _i in range(n_cluster):
            for _j in range(n_cluster):
                expected_percentage[_i, _j] = sum(cluster == _i) * sum(cluster == _j)
        graphs[roi]['expected_percentage'] = expected_percentage
        graphs[roi]['num_cell'] = len(df_sub)
    return graphs
1410
+
1411
+
1412
def interaction_analysis(df_slide_roi, outdir, name_pheno, method="distance", k=8, thres_dist=50, level="slide", clustergrid=None):
    """Quantify and plot cluster-cluster spatial interaction across ROIs / slides.

    Edge counts per cluster pair are gathered per ROI (by distance threshold or
    kNN graph), optionally pooled per slide, normalized against the expected
    co-occurrence, and shown as heatmaps / clustermaps.

    Args:
        df_slide_roi = dataframe with slide-ROI correspondence ("Slide" / "ROI" columns)
        outdir = output directory containing the pickled CytofImage objects
        name_pheno = key of the phenograph result stored on each CytofImage
        method = "distance" (threshold on pairwise distance) or "graph" (kNN graph) (default="distance")
        k = number of neighbors for the "graph" method (default=8)
        thres_dist = distance threshold for the "distance" method (default=50)
        level = pooling level; edges are summed per slide when "slide" (default="slide")
        clustergrid = optional precomputed seaborn ClusterGrid; when None, one is
                      built from the first interaction matrix and reused to order
                      all subsequent clustermaps (default=None)

    Returns:
        interacts = dict mapping slide/ROI key -> normalized interaction matrix
        clustergrid = the ClusterGrid used for row/column ordering
    """
    assert method in ["distance", "graph"], "Method can be either 'distance' or 'graph'!"

    # gather per-ROI edge counts with the chosen neighborhood definition
    if method == "distance":
        info = _gather_roi_distances(df_slide_roi, outdir, name_pheno, thres_dist)
    else:
        info = _gather_roi_kneighbor_graphs(df_slide_roi, outdir, name_pheno, k)

    interacts = {}
    if level == "slide":
        # pool ROI-level counts into their parent slide, removing the ROI entries
        for slide in df_slide_roi["Slide"].unique():
            for seen_roi, f_roi in enumerate(df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]):
                roi = f_roi.replace(".txt", "")
                if seen_roi == 0:
                    info[slide] = {}
                    info[slide]['edge_nums'] = info[roi]['edge_nums']
                    info[slide]['expected_percentage'] = info[roi]['expected_percentage']
                    info[slide]['num_cell'] = info[roi]['num_cell']

                else:
                    info[slide]['edge_nums'] += info[roi]['edge_nums']
                    info[slide]['expected_percentage'] += info[roi]['expected_percentage']
                    info[slide]['num_cell'] += info[roi]['num_cell']
                info.pop(roi)

    for key, item in info.items():
        # observed fraction of edges per cluster pair vs the fraction expected by chance
        edge_percentage = item['edge_nums'] / np.sum(item['edge_nums'])
        expected_percentage = item['expected_percentage'] / item['num_cell'] ** 2

        # Normalize: log-ratio of observed to expected (+0.1 damps extreme ratios)
        interact_norm = np.log10(edge_percentage/expected_percentage + 0.1)

        # Fix Nan (0/0 pairs) by setting them to the neutral value log10(1.1)
        interact_norm[np.isnan(interact_norm)] = np.log10(1 + 0.1)
        interacts[key] = interact_norm

    # plot: heatmap per key, plus clustermaps sharing one row/column ordering
    for f_key, interact in interacts.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(interact, center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        # build the clustergrid lazily from the first matrix seen; its dendrogram
        # ordering is then reused for every subsequent key so plots are comparable
        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(interact, center=np.log10(1 + 0.1),
                                         cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=np.arange(interact.shape[0]),
                                         yticklabels=np.arange(interact.shape[0]),
                                         figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        # re-plot with rows/columns permuted into the shared dendrogram order,
        # clustering disabled so the imposed order is kept
        plt.figure()
        sns.clustermap(interact[clustergrid.dendrogram_row.reordered_ind, :]\
                       [:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                       xticklabels=clustergrid.dendrogram_row.reordered_ind,
                       yticklabels=clustergrid.dendrogram_row.reordered_ind,
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()

    return interacts, clustergrid
cytof/hyperion_preprocess.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import pathlib
6
+ import skimage.io as skio
7
+ import warnings
8
+ from typing import Union, Optional, Type, Tuple, List
9
+ # from readimc import MCDFile
10
+
11
+ # from cytof.classes import CytofImage, CytofImageTiff
12
+
13
+ import sys
14
+ import platform
15
+ from pathlib import Path
16
+ FILE = Path(__file__).resolve()
17
+ ROOT = FILE.parents[0] # cytof root directory
18
+ if str(ROOT) not in sys.path:
19
+ sys.path.append(str(ROOT)) # add ROOT to PATH
20
+ if platform.system() != 'Windows':
21
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
22
+ from classes import CytofImage, CytofImageTiff
23
+
24
+ # ####################### Read data ########################
25
def cytof_read_data_roi(filename, slide="", roi=None, iltype="hwd", **kwargs) -> Tuple[CytofImage, list]:
    """ Read one ROI of cytof data from file.

    Tab-separated '.txt' and '.csv' files are read as a pixel table and wrapped
    in a CytofImage; '.tiff'/'.tif'/'.qptiff' files are read as an image stack,
    transposed to channels-last, and wrapped in a CytofImageTiff.

    Inputs:
        filename = full filename of the cytof data (path-name-ext)
        slide = slide name attached to the created image object (default="")
        roi = ROI name; when None it is derived from the file's base name
              for txt/csv input (default=None)
        iltype = image layout type (default="hwd");
                 NOTE(review): not referenced anywhere in this function body — confirm intent
        kwargs = optional "X"/"Y" entries naming the columns to rename to
                 'X'/'Y' (txt/csv input only)

    Returns:
        cytof_img = CytofImage (txt/csv) or CytofImageTiff (tiff) instance
        cols = column names of the dataframe, an empty list returned if not reading data from a dataframe

    :param filename: str
    :return cytof_img: CytofImage or CytofImageTiff
    :return cols: pandas.core.indexes.base.Index or list
    """
    ext = pathlib.Path(filename).suffix
    assert len(ext) > 0, "Please provide a full file name with extension!"
    assert ext.upper() in ['.TXT', '.TIFF', '.TIF', '.CSV', '.QPTIFF'], "filetypes other than '.txt', '.tiff' or '.csv' are not (yet) supported."

    if ext.upper() in ['.TXT', '.CSV']:  # the case with a dataframe
        if ext.upper() == '.TXT':
            df_cytof = pd.read_csv(filename, sep='\t')  # pd.read_table(filename)
            if roi is None:
                roi = os.path.basename(filename).split('.txt')[0]
            # initialize an instance of CytofImage
            cytof_img = CytofImage(df_cytof, slide=slide, roi=roi, filename=filename)
        elif ext.upper() == '.CSV':
            df_cytof = pd.read_csv(filename)
            if roi is None:
                roi = os.path.basename(filename).split('.csv')[0]
            # initialize an instance of CytofImage
            cytof_img = CytofImage(df_cytof, slide=slide, roi=roi, filename=filename)
        # allow the caller to map custom coordinate column names onto 'X'/'Y'
        if "X" in kwargs and "Y" in kwargs:
            cytof_img.df.rename(columns={kwargs["X"]: "X", kwargs["Y"]: 'Y'}, inplace=True)
        cols = cytof_img.df.columns


    else:  # the case without a dataframe
        image = skio.imread(filename, plugin="tifffile")
        orig_img_shape = image.shape
        sorted_shape = np.sort(orig_img_shape)

        # roll the sorted shape by one to the left
        # ref: https://numpy.org/doc/stable/reference/generated/numpy.roll.html
        # yields (mid, largest, smallest) — i.e. (h, w, c) assuming the channel
        # count is the smallest dimension. NOTE(review): confirm this assumption
        # holds for ROIs with very many channels or very small spatial extent.
        correct_shape = np.roll(sorted_shape, -1)

        # sometimes tiff could be square, this ensures images were correctly transposed:
        # each matched dimension is zeroed out in orig_temp so that duplicate
        # sizes resolve to distinct axis indices
        orig_temp = list(orig_img_shape)  # tuple is immutable
        correct_index = []
        for shape in correct_shape:
            correct_index.append(orig_temp.index(shape))

            # placeholder, since shape can't = 0
            orig_temp[orig_temp.index(shape)] = 0
        image = image.transpose(correct_index)

        # create TIFF class cytof image
        cytof_img = CytofImageTiff(image, slide=slide, roi=roi, filename=filename)
        cols = []

    return cytof_img, cols
84
+
85
def cytof_read_data_mcd(filename, verbose=False):
    """Read all acquisitions (ROIs) of an .mcd file into CytofImageTiff objects.

    Requires the `readimc` package (MCDFile); NOTE(review): its import is
    currently commented out at the top of this module — re-enable it before
    calling this function.

    Args:
        filename = full path of the .mcd file
        verbose = if True, print slide / panorama / ROI metadata while reading (default=False)

    Returns:
        cytof_imgs = dict mapping "<slide id>_<roi id>" to a CytofImageTiff instance
    """
    cytof_imgs = {}
    with MCDFile(filename) as f:
        if verbose:
            print("\n{}, \n\t{} slides, showing the 1st slide:".format(filename, len(f.slides)))

        ## slide
        for slide in f.slides:
            if verbose:
                print("\tslide ID: {}, description: {}, width: {} um, height: {}um".format(
                    slide.id,
                    slide.description,
                    slide.width_um,
                    slide.height_um)
                )
            # read the slide image
            im_slide = f.read_slide(slide)  # numpy array or None
            if verbose:
                print("\n\tslide image shape: {}".format(im_slide.shape))

            # (optional) read the first panorama image
            panorama = slide.panoramas[0]
            if verbose:
                print(
                    "\t{} panoramas, showing the 1st one. \n\tpanorama ID: {}, description: {}, width: {} um, height: {}um".format(
                        len(slide.panoramas),
                        panorama.id,
                        panorama.description,
                        panorama.width_um,
                        panorama.height_um)
                )
            im_pano = f.read_panorama(panorama)  # numpy array
            if verbose:
                print("\n\tpanorama image shape: {}".format(im_pano.shape))

            for roi in slide.acquisitions:  # for each acquisition (roi)
                im_roi = f.read_acquisition(roi)  # array, shape: (c, y, x), dtype: float32
                if verbose:
                    # BUG FIX: the original printed undefined `img_roi`
                    print("\troi {}, shape: {}".format(roi.id, im_roi.shape))
                # channels-first (c, y, x) -> channels-last (y, x, c)
                # BUG FIX: the original passed undefined `raw_f` as filename
                cytof_img = CytofImageTiff(image=im_roi.transpose((1, 2, 0)),
                                           slide=slide.id,
                                           roi=roi.id,
                                           filename=filename)
                cytof_img.set_channels(roi.channel_names, roi.channel_labels)
                cytof_imgs["{}_{}".format(slide.id, roi.id)] = cytof_img
    return cytof_imgs
138
+
139
+
140
def cytof_preprocess(df):
    """ Preprocess cytof dataframe
    Every pair of X and Y values represent for a unique physical pixel locations in the original image
    The values for Xs and Ys should be continuous integers
    The missing pixels would be filled with 0

    Inputs:
        df = cytof dataframe

    Returns:
        df = preprocessed cytof dataframe with missing pixel values filled with 0

    :param df: pandas.core.frame.DataFrame
    :return df: pandas.core.frame.DataFrame
    """
    # full grid size implied by the maximum coordinates (0-based, hence +1)
    grid_rows = max(df['Y'].values) + 1
    grid_cols = max(df['X'].values) + 1
    n_missing = grid_rows * grid_cols - len(df)
    if n_missing > 0:
        # append all-zero rows so the table covers every pixel of the grid
        filler = pd.DataFrame(np.zeros((n_missing, len(df.columns)), dtype=int),
                              columns=df.columns)
        df = pd.concat([df, filler])
    return df
162
+
163
+
164
def cytof_check_channels(df, marker_names=None, xlim=None, ylim=None):
    """A visualization function to show different markers of a cytof image

    Each marker column is reshaped into an image, normalized against its 99th
    percentile, optionally cropped, and drawn on a 5-column grid of subplots.

    Inputs:
        df = preprocessed cytof dataframe
        marker_names = marker names to visualize, should match to column names in df (default=None)
        xlim = x-axis limit of output image (default=None)
        ylim = y-axis limit of output image (default=None)

    :param df: pandas.core.frame.DataFrame
    :param marker_names: list
    :param xlim: tuple
    :param ylim: tuple
    """
    if marker_names is None:
        # by convention the first 6 columns hold coordinates/metadata, not markers
        marker_names = [df.columns[k] for k in range(6, len(df.columns))]
    img_rows = max(df['Y'].values) + 1
    img_cols = max(df['X'].values) + 1
    grid_cols = 5
    grid_rows = int(np.ceil(len(marker_names) / grid_cols))
    fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(3 * grid_cols, 3 * grid_rows))
    if grid_rows == 1:
        # keep 2-d indexing uniform when subplots returns a 1-d axes array
        axes = np.array([axes])
    for idx, marker in enumerate(marker_names):
        row, col = divmod(idx, grid_cols)
        channel = df[marker].values.reshape(img_rows, img_cols)
        # normalize against the 99th percentile to suppress hot pixels
        channel = np.clip(channel / np.quantile(channel, 0.99), 0, 1)
        axes[row, col].set_title(marker)
        if xlim is not None:
            channel = channel[:, xlim[0]:xlim[1]]
        if ylim is not None:
            channel = channel[ylim[0]:ylim[1], :]
        shown = axes[row, col].imshow(channel, cmap="gray")
        fig.colorbar(shown, ax=axes[row, col])
    plt.show()
200
+
201
+
202
def remove_special_channels(self, channels):
    """Remove the given channels from the parallel channel/marker/label lists
    and drop their columns from the dataframe.

    Args:
        channels = list of channel names to remove
    """
    for name in channels:
        position = self.channels.index(name)
        # the three lists are kept parallel, so one index removes from all
        del self.channels[position]
        del self.markers[position]
        del self.labels[position]
        self.df.drop(columns=name, inplace=True)
209
+
210
def define_special_channels(self, channels_dict):
    """Create combined channels by summing existing marker channels.

    Each entry of channels_dict maps a new channel name to a list of dicts with
    keys 'marker_name' (existing channel to add in) and 'to_keep' (whether the
    source channel survives the merge). A copy of the original dataframe is
    kept in self.df_orig. Source channels with to_keep=False are removed from
    self.channels / self.markers / self.labels and dropped from self.df.

    Args:
        channels_dict = dict mapping new channel name -> list of
                        {'marker_name': str, 'to_keep': bool} dicts
    """
    # create a copy of original dataframe
    self.df_orig = self.df.copy()
    # BUG FIX: removed leftover debug `print(new_name)` from the loop
    for new_name, old_names in channels_dict.items():
        if len(old_names) == 0:
            continue
        # keep only the requested source channels that actually exist
        old_nms = []
        for old_name in old_names:
            if old_name['marker_name'] not in self.channels:
                warnings.warn('{} is not available!'.format(old_name['marker_name']))
                continue
            old_nms.append(old_name)
        if len(old_nms) > 0:
            for i, old_name in enumerate(old_nms):
                # accumulate the new channel as the sum of its sources
                if i == 0:
                    self.df[new_name] = self.df[old_name['marker_name']]
                else:
                    self.df[new_name] += self.df[old_name['marker_name']]
                if not old_name['to_keep']:
                    idx = self.channels.index(old_name['marker_name'])
                    # Remove the unwanted channels from all parallel lists
                    self.channels.pop(idx)
                    self.markers.pop(idx)
                    self.labels.pop(idx)
                    self.df.drop(columns=old_name['marker_name'], inplace=True)
            # NOTE(review): only self.channels gains the new name; self.markers
            # and self.labels are left unchanged — confirm this asymmetry is
            # intended, since removal above assumes the lists stay parallel.
            self.channels.append(new_name)
237
+
238
+
239
def cytof_txt2img(df, marker_names):
    """ Convert from cytof dataframe to d-dimensional image, where d=length of marker names
    Each channel of the output image correspond to the pixel intensity of the corresponding marker

    Inputs:
        df = cytof dataframe
        marker_names = markers to take into consideration

    Returns:
        out_img = d-dimensional image

    :param df: pandas.core.frame.DataFrame
    :param marker_names: list
    :return out_img: numpy.ndarray
    """
    requested = len(marker_names)
    # silently-missing markers are dropped, with a warning about the mismatch
    valid_markers = [m for m in marker_names if m in df.columns.values]
    n_channels = len(valid_markers)
    if n_channels != requested:
        warnings.warn("{} markers selected instead of {}".format(n_channels, requested))
    nrow = max(df['Y'].values) + 1
    ncol = max(df['X'].values) + 1
    print("Output image shape: [{}, {}, {}]".format(nrow, ncol, n_channels))
    out_image = np.zeros([nrow, ncol, n_channels], dtype=float)
    for ch, marker in enumerate(valid_markers):
        # each marker column unrolls row-major into one image plane
        out_image[..., ch] = df[marker].values.reshape(nrow, ncol)
    return out_image
266
+
267
+
268
def cytof_merge_channels(im_cytof: np.ndarray,
                         channel_names: List,
                         channel_ids: List = None,
                         channels: List = None,
                         quantiles: List = None,
                         visualize: bool = False):
    """ Merge selected channels (given by "channel_ids") of raw cytof image and generate a RGB image

    The first three selected channels map directly to red, green and blue; the
    fourth through sixth are spread over the secondary colors cyan, magenta and
    yellow. Each channel is normalized by its 99th-percentile value.

    Inputs:
        im_cytof = raw cytof image
        channel_names = a list of names correspond to all channels of the im_cytof
        channel_ids = the indices of channels to show, no more than 6 channels can be shown the same time (default=None)
        channels = the names of channels to show, no more than 6 channels can be shown the same time (default=None)
                   Either "channel_ids" or "channels" should be provided
        quantiles = the quantile values for each channel defined by channel_ids (default=None)
        visualize = a flag indicating whether print the visualization on screen

    Returns:
        merged_im = channel merged image
        quantiles = the quantile values for each channel defined by channel_ids

    :param im_cytof: numpy.ndarray
    :param channel_names: list
    :param channel_ids: list
    :param channels: list
    :param quantiles: list
    :return merged_im: numpy.ndarray
    :return quantiles: list
    """

    assert len(channel_names) == im_cytof.shape[-1], 'The length of "channel_names" does not match the image size!'
    assert channel_ids or channels, 'At least one should be provided, either "channel_ids" or "channels"!'
    if channel_ids is None:
        channel_ids = [channel_names.index(n) for n in channels]
    assert len(channel_ids) <= 6, "No more than 6 channels can be visualized simultaneously!"
    if len(channel_ids) > 3:
        warnings.warn(
            "Visualizing more than 3 channels the same time results in deteriorated visualization. \
            It is not recommended!")

    full_colors = ['red', 'green', 'blue', 'cyan', 'magenta', 'yellow']

    selected_names = [channel_names[i] for i in channel_ids]
    info = [f"{marker} in {c}\n" for marker, c in zip(selected_names, full_colors[:len(channel_ids)])]
    print(f"Visualizing... \n{''.join(info)}")
    merged_im = np.zeros((im_cytof.shape[0], im_cytof.shape[1], 3))
    if quantiles is None:
        # default normalization value: per-channel 99th percentile
        quantiles = [np.quantile(im_cytof[..., idx], 0.99) for idx in channel_ids]

    # first (up to) three channels map one-to-one onto R, G, B
    n_direct = min(len(channel_ids), 3)
    for k in range(n_direct):
        merged_im[..., k] = np.clip(im_cytof[..., channel_ids[k]] / quantiles[k], 0, 1) * 255

    # remaining channels are added into RGB component pairs (cyan, magenta, yellow)
    secondary_pairs = [[1, 2], [0, 2], [0, 1]]
    for extra, pair in zip(range(n_direct, len(channel_ids)), secondary_pairs):
        for j in pair:
            merged_im[..., j] += np.clip(im_cytof[..., channel_ids[extra]] / quantiles[extra], 0, 1) * 255
            merged_im[..., j] = np.clip(merged_im[..., j], 0, 255)
    merged_im = merged_im.astype(np.uint8)
    if visualize:
        plt.imshow(merged_im)
        plt.show()
    return merged_im, quantiles
333
+
334
+
335
+
cytof/hyperion_segmentation.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import scipy
2
+ import skimage
3
+ from skimage import feature
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ from skimage.color import label2rgb
7
+ from skimage.segmentation import mark_boundaries
8
+
9
+ import os
10
+ import sys
11
+ import platform
12
+ from pathlib import Path
13
+ FILE = Path(__file__).resolve()
14
+ ROOT = FILE.parents[0] # cytof root directory
15
+ if str(ROOT) not in sys.path:
16
+ sys.path.append(str(ROOT)) # add ROOT to PATH
17
+ if platform.system() != 'Windows':
18
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
19
+ from segmentation_functions import generate_mask, normalize
20
+
21
+ # from cytof.segmentation_functions import generate_mask, normalize
22
+
23
+
24
def cytof_nuclei_segmentation(im_nuclei, show_process=False, size_hole=50, size_obj=7,
                              start_coords=(0, 0), side=100, colors=[], min_distance=2,
                              fg_marker_dilate=2, bg_marker_dilate=2
                              ):
    """ Segment nuclei based on the input nuclei image

    Pipeline: threshold -> clean mask (holes/specks) -> distance transform ->
    local maxima as foreground seeds -> marker-controlled watershed on the
    intensity gradient -> morphological closing of the label image.

    Inputs:
        im_nuclei = raw cytof image correspond to nuclei, size=(h, w)
        show_process = flag of whether show the process (default=False)
        size_hole = size of the hole to be removed (default=50)
        size_obj = size of the small objects to be removed (default=7)
        start_coords = the starting (x,y) coordinates of visualizing process (default=(0,0))
        side = the side length of visualizing process (default=100)
        colors = a list of colors used to visualize segmentation results (default=[])
        min_distance = minimum pixel distance between local maxima used as seeds (default=2)
        fg_marker_dilate = disk radius used to dilate foreground seed markers (default=2)
        bg_marker_dilate = disk radius used to erode the background before adding it
                           as the background marker (default=2)
    Returns:
        labels = nuclei segmentation result, where background is represented by 1, size=(h, w)
        colors = the list of colors used to visualize segmentation results

    :param im_nuclei: numpy.ndarray
    :param show_process: bool
    :param size_hole: int
    :param size_obj: int
    :param start_coords: int
    :return labels: numpy.ndarray
    :return colors: list
    """

    # build a default color table if the caller did not supply one
    # (note: `colors` is rebound, never mutated, so the mutable default is safe)
    if len(colors) == 0:
        cmap_set3 = plt.get_cmap("Set3")
        cmap_tab20c = plt.get_cmap("tab20c")
        colors = [cmap_tab20c.colors[_] for _ in range(len(cmap_tab20c.colors))] + \
                 [cmap_set3.colors[_] for _ in range(len(cmap_set3.colors))]

    x0, y0 = start_coords
    # foreground mask from the 95th-percentile-clipped intensity image
    mask = generate_mask(np.clip(im_nuclei, 0, np.quantile(im_nuclei, 0.95)), fill_hole=False, use_watershed=False)
    mask = skimage.morphology.remove_small_holes(mask.astype(bool), size_hole)
    mask = skimage.morphology.remove_small_objects(mask.astype(bool), size_obj)
    if show_process:
        plt.figure(figsize=(4, 4))
        plt.imshow(mask[x0:x0 + side, y0:y0 + side], cmap='gray')
        plt.show()

    # Find and count local maxima of the smoothed distance transform — one seed per nucleus
    distance = scipy.ndimage.distance_transform_edt(mask)
    distance = scipy.ndimage.gaussian_filter(distance, 1)
    local_maxi_idx = skimage.feature.peak_local_max(distance, exclude_border=False, min_distance=min_distance,
                                                    labels=None)
    local_maxi = np.zeros_like(distance, dtype=bool)
    local_maxi[tuple(local_maxi_idx.T)] = True
    markers = scipy.ndimage.label(local_maxi)[0]
    markers = markers > 0
    # grow seeds slightly, then relabel; shift foreground labels by +1 so that
    # label 1 can be reserved for the background marker added next
    markers = skimage.morphology.dilation(markers, skimage.morphology.disk(fg_marker_dilate))
    markers = skimage.morphology.label(markers)
    markers[markers > 0] = markers[markers > 0] + 1
    markers = markers + skimage.morphology.erosion(1 - mask, skimage.morphology.disk(bg_marker_dilate))

    # Another watershed: flood the intensity gradient from the seed markers
    temp_im = skimage.util.img_as_ubyte(normalize(np.clip(im_nuclei, 0, np.quantile(im_nuclei, 0.95))))
    gradient = skimage.filters.rank.gradient(temp_im, skimage.morphology.disk(3))
    labels = skimage.segmentation.watershed(gradient, markers)
    labels = skimage.morphology.closing(labels)
    # RGB preview with background (label 1) painted black
    labels_rgb = label2rgb(labels, bg_label=1, colors=colors)
    labels_rgb[labels == 1, ...] = (0, 0, 0)

    if show_process:
        fig, axes = plt.subplots(3, 2, figsize=(8, 12), sharex=False, sharey=False)
        ax = axes.ravel()
        ax[0].set_title("original grayscale")
        ax[0].imshow(np.clip(im_nuclei[x0:x0 + side, y0:y0 + side], 0, np.quantile(im_nuclei, 0.95)),
                     interpolation='nearest')
        ax[1].set_title("markers")
        ax[1].imshow(label2rgb(markers[x0:x0 + side, y0:y0 + side], bg_label=1, colors=colors),
                     interpolation='nearest')
        ax[2].set_title("distance")
        ax[2].imshow(-distance[x0:x0 + side, y0:y0 + side], cmap=plt.cm.nipy_spectral, interpolation='nearest')
        ax[3].set_title("gradient")
        ax[3].imshow(gradient[x0:x0 + side, y0:y0 + side], interpolation='nearest')
        ax[4].set_title("Watershed Labels")
        ax[4].imshow(labels_rgb[x0:x0 + side, y0:y0 + side, :], interpolation='nearest')
        ax[5].set_title("Watershed Labels")
        ax[5].imshow(labels_rgb, interpolation='nearest')
        plt.show()

    return labels, colors
110
+
111
+
112
+ def cytof_cell_segmentation(nuclei_seg, radius=5, membrane_channel=None, show_process=False,
113
+ start_coords=(0, 0), side=100, colors=[]):
114
+ """ Cell segmentation based on nuclei segmentation; membrane-guided cell segmentation if membrane_channel provided.
115
+ Inputs:
116
+ nuclei_seg = an index image containing nuclei instance segmentation information, where the background is
117
+ represented by 1, size=(h,w). Typically, the output of calling the cytof_nuclei_segmentation
118
+ function.
119
+ radius = assumed radius of cells (default=5)
120
+ membrane_channel = membrane image channel of original cytof image (default=None)
121
+ show_process = a flag indicating whether or not showing the segmentation process (default=False)
122
+ start_coords = the starting (x,y) coordinates of visualizing process (default=(0,0))
123
+ side = the side length of visualizing process (default=100)
124
+ colors = a list of colors used to visualize segmentation results (default=[])
125
+ Returns:
126
+ labels = an index image containing cell instance segmentation information, where the background is
127
+ represented by 1
128
+ colors = the list of colors used to visualize segmentation results
129
+
130
+ :param nuclei_seg: numpy.ndarray
131
+ :param radius: int
132
+ :param membrane_channel: numpy.ndarray
133
+ :param show_process: bool
134
+ :param start_coords: tuple
135
+ :param side: int
136
+ :return labels: numpy.ndarray
137
+ :return colors: list
138
+ """
139
+
140
+ if len(colors) == 0:
141
+ cmap_set3 = plt.get_cmap("Set3")
142
+ cmap_tab20c = plt.get_cmap("tab20c")
143
+ colors = [cmap_tab20c.colors[_] for _ in range(len(cmap_tab20c.colors))] + \
144
+ [cmap_set3.colors[_] for _ in range(len(cmap_set3.colors))]
145
+
146
+ x0, y0 = start_coords
147
+
148
+ ## nuclei segmentation -> nuclei mask
149
+ nuclei_mask = nuclei_seg > 1
150
+ if show_process:
151
+ nuclei_bg = nuclei_seg.min()
152
+ fig, ax = plt.subplots(1, 2, figsize=(8, 4))
153
+ nuclei_seg_vis = label2rgb(nuclei_seg[x0:x0 + side, y0:y0 + side], bg_label=nuclei_bg, colors=colors)
154
+ nuclei_seg_vis[nuclei_seg[x0:x0 + side, y0:y0 + side] == nuclei_bg, ...] = (0, 0, 0)
155
+
156
+ ax[0].imshow(nuclei_seg_vis), ax[0].set_title('nuclei segmentation')
157
+ ax[1].imshow(nuclei_mask[x0:x0 + side, y0:y0 + side], cmap='gray'), ax[1].set_title('nuclei mask')
158
+
159
+ if membrane_channel is not None:
160
+ membrane_mask = generate_mask(np.clip(membrane_channel, 0, np.quantile(membrane_channel, 0.95)),
161
+ fill_hole=False, use_watershed=False)
162
+ if show_process:
163
+ # visualize
164
+ nuclei_membrane = np.zeros((membrane_mask.shape[0], membrane_mask.shape[1], 3), dtype=np.uint8)
165
+ nuclei_membrane[..., 0] = nuclei_mask * 255
166
+ nuclei_membrane[..., 1] = membrane_mask
167
+
168
+ fig, ax = plt.subplots(1, 2, figsize=(8, 4))
169
+ ax[0].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray'), ax[0].set_title('membrane mask')
170
+ ax[1].imshow(nuclei_membrane[x0:x0 + side, y0:y0 + side]), ax[1].set_title('nuclei - membrane')
171
+
172
+ # postprocess raw membrane mask
173
+ membrane_mask_close = skimage.morphology.closing(membrane_mask, skimage.morphology.disk(1))
174
+ membrane_mask_open = skimage.morphology.opening(membrane_mask_close, skimage.morphology.disk(1))
175
+ membrane_mask_erode = skimage.morphology.erosion(membrane_mask_open, skimage.morphology.disk(3))
176
+
177
+ # Find skeleton
178
+ membrane_for_skeleton = (membrane_mask_open > 0) & (nuclei_mask == False)
179
+ membrane_skeleton = skimage.morphology.skeletonize(membrane_for_skeleton)
180
+ '''print(membrane_skeleton)
181
+ print(membrane_mask_erode)'''
182
+ membrane_mask = membrane_mask_erode
183
+ membrane_mask_2 = (membrane_mask_erode > 0) | membrane_skeleton
184
+
185
+ if show_process:
186
+ fig, axs = plt.subplots(1, 4, figsize=(16, 4))
187
+ axs[0].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
188
+ axs[0].set_title('raw membrane mask')
189
+ axs[1].imshow(membrane_mask_close[x0:x0 + side, y0:y0 + side], cmap='gray')
190
+ axs[1].set_title('membrane mask - closed')
191
+ axs[2].imshow(membrane_mask_open[x0:x0 + side, y0:y0 + side], cmap='gray')
192
+ axs[2].set_title('membrane mask - opened')
193
+ axs[3].imshow(membrane_mask_erode[x0:x0 + side, y0:y0 + side], cmap='gray')
194
+ axs[3].set_title('membrane mask - erosion')
195
+ plt.show()
196
+
197
+ fig, axs = plt.subplots(1, 3, figsize=(12, 4))
198
+ axs[0].imshow(membrane_skeleton[x0:x0 + side, y0:y0 + side], cmap='gray')
199
+ axs[0].set_title('skeleton')
200
+ axs[1].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
201
+ axs[1].set_title('membrane mask (final)')
202
+ axs[2].imshow(membrane_mask_2[x0:x0 + side, y0:y0 + side], cmap='gray')
203
+ axs[2].set_title('membrane mask 2')
204
+ plt.show()
205
+
206
+ # overlap and visualize
207
+ nuclei_membrane = np.zeros((membrane_mask.shape[0], membrane_mask.shape[1], 3), dtype=np.uint8)
208
+ nuclei_membrane[..., 0] = nuclei_mask * 255
209
+ nuclei_membrane[..., 1] = membrane_mask
210
+ fig, ax = plt.subplots(1, 2, figsize=(8, 4))
211
+ ax[0].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray'), ax[0].set_title('membrane mask')
212
+ ax[1].imshow(nuclei_membrane[x0:x0 + side, y0:y0 + side]), ax[1].set_title('nuclei - membrane')
213
+
214
+ # dilate nuclei mask by radius
215
+ dilate_nuclei_mask = skimage.morphology.dilation(nuclei_mask, skimage.morphology.disk(radius))
216
+ if show_process:
217
+ fig, axs = plt.subplots(1, 3, figsize=(12, 4))
218
+ axs[0].imshow(nuclei_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
219
+ axs[0].set_title('nuclei mask')
220
+ axs[1].imshow(dilate_nuclei_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
221
+ axs[1].set_title('dilated nuclei mask')
222
+ if membrane_channel is not None:
223
+ axs[2].imshow(membrane_mask[x0:x0 + side, y0:y0 + side] > 0, cmap='gray')
224
+ axs[2].set_title('membrane mask')
225
+
226
+ # define sure foreground, sure background, and unknown region
227
+ sure_fg = nuclei_mask.copy() # nuclei mask defines sure foreground
228
+
229
+ # dark region in dilated nuclei mask (dilate_nuclei_mask == False) OR bright region in cell mask (cell_mask > 0)
230
+ # defines sure background
231
+ if membrane_channel is not None:
232
+ sure_bg = ((membrane_mask > 0) | (dilate_nuclei_mask == False)) & (sure_fg == False)
233
+ sure_bg2 = ((membrane_mask_2 > 0) | (dilate_nuclei_mask == False)) & (sure_fg == False)
234
+ else:
235
+ sure_bg = (dilate_nuclei_mask == False) & (sure_fg == False)
236
+
237
+ unknown = np.logical_not(np.logical_or(sure_fg, sure_bg))
238
+
239
+ if show_process:
240
+ fig, axs = plt.subplots(1, 4, figsize=(16, 4))
241
+ axs[0].imshow(sure_fg[x0:x0 + side, y0:y0 + side], cmap='gray')
242
+ axs[0].set_title('sure fg')
243
+ axs[1].imshow(sure_bg[x0:x0 + side, y0:y0 + side], cmap='gray')
244
+ if membrane_channel is not None:
245
+ axs[1].set_title('sure bg: membrane | not (dilated nuclei)')
246
+ else:
247
+ axs[1].set_title('sure bg: not (dilated nuclei)')
248
+ axs[2].imshow(unknown[x0:x0 + side, y0:y0 + side], cmap='gray')
249
+ axs[2].set_title('unknown')
250
+
251
+ # visualize in a RGB image
252
+ fg_bg_un = np.zeros((unknown.shape[0], unknown.shape[1], 3), dtype=np.uint8)
253
+ fg_bg_un[..., 0] = sure_fg * 255 # sure foreground - red
254
+ fg_bg_un[..., 1] = sure_bg * 255 # sure background - green
255
+ fg_bg_un[..., 2] = unknown * 255 # unknown - blue
256
+ axs[3].imshow(fg_bg_un[x0:x0 + side, y0:y0 + side])
257
+ plt.show()
258
+
259
+ ## Euclidean distance transform: distance to the closest zero pixel for each pixel of the input image.
260
+ if membrane_channel is not None:
261
+ distance_bg = -scipy.ndimage.distance_transform_edt(1 - sure_bg2)
262
+ distance_fg = scipy.ndimage.distance_transform_edt(1 - sure_fg)
263
+ distance = distance_bg+distance_fg
264
+ else:
265
+ distance = scipy.ndimage.distance_transform_edt(1 - sure_fg)
266
+ distance = scipy.ndimage.gaussian_filter(distance, 1)
267
+
268
+ # watershed
269
+ markers = nuclei_seg.copy()
270
+ markers[unknown] = 0
271
+ if show_process:
272
+ fig, axs = plt.subplots(1, 2, figsize=(8, 4))
273
+ axs[0].set_title("markers")
274
+ axs[0].imshow(label2rgb(markers[x0:x0 + side, y0:y0 + side], bg_label=1, colors=colors),
275
+ interpolation='nearest')
276
+ axs[1].set_title("distance")
277
+ im = axs[1].imshow(distance[x0:x0 + side, y0:y0 + side], cmap=plt.cm.nipy_spectral, interpolation='nearest')
278
+ plt.colorbar(im, ax=axs[1])
279
+ labels = skimage.segmentation.watershed(distance, markers)
280
+ if show_process:
281
+ fig, axs = plt.subplots(1, 4, figsize=(16, 4))
282
+ axs[0].imshow(unknown[x0:x0 + side, y0:y0 + side])
283
+ axs[0].set_title('cytoplasm') # , cmap=cmap, interpolation='nearest'
284
+
285
+ nuclei_lb = label2rgb(nuclei_seg, bg_label=1, colors=colors)
286
+ nuclei_lb[nuclei_seg == 1, ...] = (0, 0, 0)
287
+ axs[1].imshow(nuclei_lb) # , cmap=cmap, interpolation='nearest')
288
+ axs[1].set_xlim(x0, x0 + side - 1), axs[1].set_ylim(y0 + side - 1, y0)
289
+ axs[1].set_title('nuclei')
290
+
291
+ cell_lb = label2rgb(labels, bg_label=1, colors=colors)
292
+ cell_lb[labels == 1, ...] = (0, 0, 0)
293
+ axs[2].imshow(cell_lb) # , cmap=cmap, interpolation='nearest')
294
+ axs[2].set_title('cells')
295
+ axs[2].set_xlim(x0, x0 + side - 1), axs[2].set_ylim(y0 + side - 1, y0)
296
+
297
+ merge_lb = cell_lb.copy()
298
+ merge_lb = cell_lb ** 2
299
+ merge_lb[nuclei_mask == 1, ...] = np.clip(nuclei_lb[nuclei_mask == 1, ...].astype(float) * 1.2, 0, 1)
300
+ axs[3].imshow(merge_lb)
301
+ axs[3].set_title('nuclei-cells')
302
+ axs[3].set_xlim(x0, x0 + side - 1), axs[3].set_ylim(y0 + side - 1, y0)
303
+ plt.show()
304
+ return labels, colors
305
+
306
+
307
def visualize_segmentation(raw_image, channels, seg, channel_ids, bound_color=(1, 1, 1), bound_mode='inner', show=True, bg_label=0):

    """Visualize an instance segmentation by drawing object boundaries on a channel composite.

    Args:
        raw_image (numpy.ndarray): raw CyTOF image stack.
        channels (list): channel names corresponding to each channel in raw_image.
        seg (numpy.ndarray): instance segmentation result (label/index image).
        channel_ids (list): indices of the channels merged to form the background image.
        bound_color (tuple): RGB color used to draw boundaries (default (1, 1, 1), white).
        bound_mode (str): boundary mode, one of {'thick', 'inner', 'outer', 'subpixel'}
            (default 'inner'); see skimage.segmentation.mark_boundaries.
        show (bool): if True, also display the marked image with matplotlib.
        bg_label (int): label value in `seg` treated as background (default 0).

    Returns:
        numpy.ndarray: the merged-channel image with segmentation boundaries drawn.
    """
    from cytof.hyperion_preprocess import cytof_merge_channels

    # mark_boundaries() highlights the segmented regions for better visualization
    # ref: https://scikit-image.org/docs/stable/api/skimage.segmentation.html#skimage.segmentation.mark_boundaries
    marked_image = mark_boundaries(cytof_merge_channels(raw_image, channels, channel_ids)[0],
                                   seg, mode=bound_mode, color=bound_color, background_label=bg_label)
    if show:
        plt.figure(figsize=(8,8))
        plt.imshow(marked_image)
        plt.show()
    return marked_image
341
+
cytof/segmentation_functions.py ADDED
@@ -0,0 +1,815 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Functions for nuclei segmentation in Kaggle PANDA challenge
2
+
3
+ import numpy as np
4
+ import matplotlib.image as mpimg
5
+ import matplotlib.pyplot as plt
6
+ from sklearn import preprocessing
7
+ import math
8
+ import scipy.misc as misc
9
+ import cv2
10
+ import skimage
11
+ from skimage import measure
12
+ from skimage import img_as_bool, io, color, morphology, segmentation
13
+ from skimage.morphology import binary_closing, binary_opening, disk, closing, opening
14
+ from PIL import Image
15
+
16
+ import time
17
+ import re
18
+ import sys
19
+ import os
20
+ # import openslide
21
+ # from openslide import open_slide, ImageSlide
22
+ import matplotlib.pyplot as plt
23
+
24
+ import pandas as pd
25
+ import xml.etree.ElementTree as ET
26
+ from skimage.draw import polygon
27
+ import random
28
+
29
+
30
+ #####################################################################
31
+ # Functions for color deconvolution
32
+ #####################################################################
33
def normalize(mat, quantile_low=0, quantile_high=1):
    """Min-max normalize an array of any dimension to [0, 1].

    Args:
        mat (numpy.ndarray): input array.
        quantile_low (float): quantile mapped to 0 (default 0, the minimum).
        quantile_high (float): quantile mapped to 1 (default 1, the maximum).

    Returns:
        numpy.ndarray: normalized array. A constant input (zero quantile range)
        yields an all-zero array instead of dividing by zero (NaN/inf).
    """
    low = np.quantile(mat, quantile_low)
    high = np.quantile(mat, quantile_high)
    if high == low:
        # BUGFIX: the original divided by zero for constant inputs.
        return np.zeros_like(np.asarray(mat, dtype=float))
    return (mat - low) / (high - low)
38
+
39
+
40
def convert_to_optical_densities(img_RGB, r0=255, g0=255, b0=255):
    """Convert an RGB image to optical densities (same shape as the input).

    Each channel is divided by its reference intensity (r0/g0/b0) and the
    negative logarithm is taken; a small epsilon avoids log(0).
    """
    scaled = img_RGB.astype(float)
    for channel_index, reference in enumerate((r0, g0, b0)):
        scaled[:, :, channel_index] /= reference
    return -np.log(scaled + 0.00001)
47
+
48
+
49
def channel_deconvolution(img_RGB, staining_type, plot_image=False, to_normalize=True):
    """Deconvolute an RGB image into its staining channels.
    Ref: https://blog.bham.ac.uk/intellimic/g-landini-software/colour-deconvolution/

    Args:
        img_RGB: A uint8 numpy array with RGB channels.
        staining_type: Dyes used to stain the image; one of ("HDB", "HRB", "HDR", "HEB").
        plot_image: Set True to display the channels interactively. Default is False.
        to_normalize: If True, normalize each output channel to [0, 1] separately. Default is True.

    Returns:
        The unnormalized h*w*3 deconvoluted matrix and the 3 individual channels.

    Raises:
        Exception: If staining_type is not one of the supported types.
    """
    # Stain optical-density (absorbance) vectors for each supported dye combination.
    if staining_type == "HDB":
        channels = ("Hematoxylin", "DAB", "Background")
        stain_OD = np.asarray([[0.650, 0.704, 0.286], [0.268, 0.570, 0.776], [0.754, 0.077, 0.652]])
    elif staining_type == "HRB":
        channels = ("Hematoxylin", "Red", "Background")
        stain_OD = np.asarray([[0.650, 0.704, 0.286], [0.214, 0.851, 0.478], [0.754, 0.077, 0.652]])
    elif staining_type == "HDR":
        channels = ("Hematoxylin", "DAB", "Red")
        stain_OD = np.asarray([[0.650, 0.704, 0.286], [0.268, 0.570, 0.776], [0.214, 0.851, 0.478]])
    elif staining_type == "HEB":
        channels = ("Hematoxylin", "Eosin", "Background")
        # stain_OD = np.asarray([[0.550,0.758,0.351],[0.398,0.634,0.600],[0.754,0.077,0.652]])
        stain_OD = np.asarray([[0.644211, 0.716556, 0.266844], [0.092789, 0.964111, 0.283111], [0.754, 0.077, 0.652]])
    else:
        raise Exception("Staining type not defined. Choose one from the following: HDB, HRB, HDR, HEB.")

    # Normalize each stain vector to unit length, then invert the basis for deconvolution.
    normalized_stain_OD = []
    for r in stain_OD:
        normalized_stain_OD.append(r / np.linalg.norm(r))
    normalized_stain_OD = np.asarray(normalized_stain_OD)
    stain_OD_inverse = np.linalg.inv(normalized_stain_OD)

    # Calculate optical density of input image
    OD = convert_to_optical_densities(img_RGB, 255, 255, 255)

    # Deconvolution: project each pixel's OD onto the stain basis.
    img_deconvoluted = np.reshape(np.dot(np.reshape(OD, (-1, 3)), stain_OD_inverse), OD.shape)

    # Define each channel
    if to_normalize:
        channel1 = normalize(img_deconvoluted[:, :, 0])  # First dye
        channel2 = normalize(img_deconvoluted[:, :, 1])  # Second dye
        channel3 = normalize(img_deconvoluted[:, :, 2])  # Third dye or background
    else:
        channel1 = img_deconvoluted[:, :, 0]  # First dye
        channel2 = img_deconvoluted[:, :, 1]  # Second dye
        channel3 = img_deconvoluted[:, :, 2]  # Third dye or background

    if plot_image:
        # BUGFIX: subplot_kw={'adjustable': 'box-forced'} was removed in
        # Matplotlib >= 2.2 and raised ValueError here; the kwarg is dropped
        # (shared axes get 'box' adjustment automatically when needed).
        fig, axes = plt.subplots(2, 2, figsize=(15, 15), sharex=True, sharey=True)
        ax = axes.ravel()
        ax[0].imshow(img_RGB)
        ax[0].set_title("Original image")
        ax[1].imshow(channel1, cmap="gray")
        ax[1].set_title(channels[0])
        ax[2].imshow(channel2, cmap="gray")
        ax[2].set_title(channels[1])
        ax[3].imshow(channel3, cmap="gray")
        ax[3].set_title(channels[2])
        plt.show()

    return img_deconvoluted, channel1, channel2, channel3
118
+
119
+
120
+ ##################################################################
121
+ # Functions for morphological operations
122
+ ##################################################################
123
def make_8UC(mat, normalized=True):
    """Convert a matrix to its unsigned 8-bit integer equivalent.

    If ``normalized`` is True the input is assumed to lie in [0, 1] and is
    simply scaled by 255; otherwise it is min-max normalized first.
    """
    source = mat.copy() if normalized else normalize(mat)
    return np.array(source * 255, dtype=np.uint8)
130
+
131
+
132
def make_8UC3(mat, normalized=True):
    """Convert a matrix to an unsigned 8-bit image replicated over 3 channels."""
    gray = make_8UC(mat, normalized)
    return np.stack((gray, gray, gray), axis=-1)
137
+
138
+
139
def check_channel(channel):
    """Return 1 if the channel contains any signal, 0 otherwise.

    The channel is min-max normalized and scaled to uint8 first; a variance
    below 0.02 then indicates an (almost) constant, i.e. empty, channel.
    """
    as_uint8 = make_8UC(normalize(channel))
    return 0 if np.var(as_uint8) < 0.02 else 1
146
+
147
+
148
def fill_holes(img_bw):
    """Fill holes in a 0/255 binary image; equivalent of MATLAB's imfill(BW, 'holes').

    Flood-fills the background starting from the border of a padded copy;
    any pixel the flood fill cannot reach is a hole and is OR-ed back in.
    """
    height, width = img_bw.shape

    # cv2.floodFill requires a mask 2 pixels larger than the image it is given.
    flood_mask = np.zeros((height + 4, width + 4), np.uint8)

    # Pad by one pixel on every side so objects touching the border are not
    # filled against the border.
    padded = np.zeros((height + 2, width + 2), np.uint8)
    padded[1:(height + 1), 1:(width + 1)] = img_bw
    cv2.floodFill(padded, flood_mask, (0, 0), 255)

    # Pixels the flood fill did not reach are holes; merge them into the input.
    return img_bw | (255 - padded[1:(height + 1), 1:(width + 1)])
161
+
162
+
163
def otsu_thresholding(img, thresh=None, plot_image=False, fill_hole=False):
    """Threshold an image into a 0/255 mask.

    Args:
        img: A uint8 matrix to threshold.
        thresh: If provided, do binary thresholding at this value; otherwise
            use Otsu's method to pick the threshold automatically.
        plot_image: Set True to display results interactively. Default is False.
        fill_hole: Set True to fill holes in the generated mask. Default is False.

    Returns:
        A uint8 0/255 mask the same size as img (object: 255, background: 0).
    """
    if thresh is None:
        # Perform Otsu thresholding
        thresh, mask = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    else:
        # Manually set threshold
        thresh, mask = cv2.threshold(img, thresh, 255, cv2.THRESH_BINARY)

    # BUGFIX: remove_small_objects interprets an integer array as a *label*
    # image, so the previous call on the 0/255 uint8 mask treated all 255
    # pixels as one label and never removed anything. Convert to boolean,
    # drop specks smaller than 2 px, then restore the 0/255 uint8 encoding
    # expected by the callers (fill_holes, cv2 ops, '== 255' comparisons).
    cleaned = skimage.morphology.remove_small_objects(mask.astype(bool), 2)
    mask = np.where(cleaned, 255, 0).astype(np.uint8)

    # Fill holes
    if fill_hole:
        mask = fill_holes(mask)

    if plot_image:
        plt.figure()
        plt.imshow(img, cmap="gray")
        plt.title("Original")
        plt.figure()
        plt.imshow(mask)
        plt.title("After Thresholding")
        plt.colorbar()
        plt.show()

    return mask
199
+
200
+
201
def watershed(mask, img, plot_image=False, kernel_size=2):
    """Do watershed segmentation for input mask and image.

    Args:
        mask: A 0/255 matrix with 255 indicating objects.
        img: An 8UC3 matrix for watershed segmentation.
        plot_image: Set True if want to real-time display results. Default is False.
        kernel_size: Kernel size for inner marker erosion. Default is 2.

    Returns:
        A label matrix same size as input image, with -1 indicating boundary,
        1 indicating background, and numbers > 1 indicating objects
        (the convention produced by cv2.watershed).
    """
    img_copy = img.copy()
    mask_copy = np.array(mask.copy(), dtype=np.uint8)

    # Sure foreground area (inner marker): close twice to seal small gaps,
    # then erode so touching objects separate into distinct seeds.
    mask_closed = closing(np.array(mask_copy, dtype=np.uint8))
    mask_closed = closing(np.array(mask_closed, dtype=np.uint8))
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    sure_fg = cv2.erode(mask_closed, kernel, iterations=2)
    sure_fg = skimage.morphology.closing(np.array(sure_fg, dtype=np.uint8))

    # Sure background area (outer marker): the skeleton of the inverted
    # foreground forms thin background ridges between objects.
    sure_fg_bool = 1 - img_as_bool(sure_fg)
    sure_bg = np.uint8(1 - morphology.skeletonize(sure_fg_bool))

    # Unknown region (the region other than inner or outer marker)
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Markers for cv2.watershed: background becomes 1, objects 2..N,
    # and the unknown region is set to 0 for the algorithm to resolve.
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1  # Set background to 1
    markers[unknown == 1] = 0

    # Watershed
    # TODO(shidan.wang@utsouthwestern.edu): Replace cv2.watershed with skimage.morphology.watershed
    marker = cv2.watershed(img_copy, markers.copy())

    if plot_image:
        plt.figure()
        plt.imshow(sure_fg)
        plt.title("Inner Marker")
        plt.figure()
        plt.imshow(sure_bg)
        plt.title("Outer Marker")
        plt.figure()
        plt.imshow(unknown)
        plt.title("Unknown")
        plt.figure()
        plt.imshow(markers, cmap='jet')
        plt.title("Markers")
        plt.figure()
        plt.imshow(marker, cmap='jet')
        plt.title("Mask")
        plt.figure()
        plt.imshow(img)
        plt.title("Original Image")
        plt.figure()
        # Boundaries (label -1) are painted green on a copy of the input.
        img_copy[marker == -1] = [0, 255, 0]
        plt.imshow(img_copy)
        plt.title("Marked Image")
        plt.show()

    return marker
267
+
268
+
269
def generate_mask(channel, original_img=None, overlap_color=(0, 1, 0),
                  plot_process=False, plot_result=False, title="",
                  fill_hole=False, thresh=None,
                  use_watershed=True, watershed_kernel_size=2,
                  save_img=False, save_path=None):
    """Generate mask for a gray-value image.

    Args:
        channel: Channel returned by function 'channel_deconvolution'. A gray-value image is also accepted.
        original_img: An image used for plotting overlapped segmentation result, optional.
        overlap_color: A 3-value tuple setting the color used to mark segmentation boundaries on the
            original image. Default is green (0, 1, 0).
        plot_process: Set True if want to display the whole mask generation process. Default is False.
        plot_result: Set True if want to display the final result. Default is False.
        title: The title used for plot_result, optional.
        fill_hole: Set True if want to fill mask holes. Default is False.
        thresh: Provide this value to do binary thresholding instead of default Otsu thresholding.
        use_watershed: Set False if want to skip the watershed segmentation step. Default is True.
        watershed_kernel_size: Kernel size of inner marker erosion. Default is 2.
        save_img: Set True if want to save the mask image. Default is False.
        save_path: The path to save the mask image, optional. Prefer *.png or *.pdf.

    Returns:
        A mask with nonzero values indicating objects and 0 indicating background.
        NOTE(review): the watershed branch returns 0/1 values, but the
        non-watershed branch returns otsu_thresholding's 0/255 mask unchanged —
        confirm which encoding downstream callers expect.

    Raises:
        IOError: An error occurred writing the image to save_path.
    """
    if not check_channel(channel):
        # If there is not any signal
        print("No signals detected for this channel")
        return np.zeros(channel.shape)
    else:
        channel = normalize(channel)
        if use_watershed:
            # Threshold first, then refine object boundaries with watershed.
            mask_threshold = otsu_thresholding(make_8UC(channel),
                                               plot_image=plot_process, fill_hole=fill_hole, thresh=thresh)
            marker = watershed(mask_threshold, make_8UC3(channel),
                               plot_image=plot_process, kernel_size=watershed_kernel_size)
            # Create mask: watershed labels background as 1; everything else is object.
            mask = np.zeros(marker.shape)
            mask[marker == 1] = 1
            mask = 1 - mask
            # Set boundary as mask from Otsu thresholding, since cv2.watershed automatically
            # labels the image border as -1 (which would otherwise count as object).
            mask[0, :] = mask_threshold[0, :] == 255
            mask[-1, :] = mask_threshold[-1, :] == 255
            mask[:, 0] = mask_threshold[:, 0] == 255
            mask[:, -1] = mask_threshold[:, -1] == 255
        else:
            mask = otsu_thresholding(make_8UC(channel),
                                     plot_image=plot_process, fill_hole=fill_hole, thresh=thresh)

        if plot_result or save_img:
            if original_img is None:
                # If original image is not provided, plot mask only
                plt.figure()
                plt.imshow(mask, cmap="gray")
            else:
                # If original image is provided, overlay the mask boundaries on it.
                overlapped_img = segmentation.mark_boundaries(original_img, skimage.measure.label(mask),
                                                              overlap_color, mode="thick")
                # NOTE(review): subplot_kw={'adjustable': 'box-forced'} was removed in
                # Matplotlib >= 2.2 and raises ValueError there — confirm the pinned version.
                fig, axes = plt.subplots(1, 2, figsize=(15, 15), sharex=True, sharey=True,
                                         subplot_kw={'adjustable': 'box-forced'})
                ax = axes.ravel()
                ax[0].imshow(mask, cmap="gray")
                ax[0].set_title(str(title) + " Mask")
                ax[1].imshow(overlapped_img)
                ax[1].set_title("Overlapped with Original Image")
            if save_img:
                try:
                    plt.savefig(save_path)
                except:
                    raise IOError("Error saving image to {}".format(save_path))
            if plot_result:
                plt.show()
            plt.close()
        return mask
346
+
347
+
348
def get_mask_for_slide_image(filePath, display_progress=False):
    """Generate a tissue mask for a whole-slide image.

    NOTE(review): relies on open_slide, whose import is commented out at the
    top of this module — confirm openslide is imported where this is called.

    Args:
        filePath: Path to the slide file.
        display_progress: Set True to display intermediate images. Default is False.

    Returns:
        (mask, slide_image): the 0/255 tissue mask and the low-resolution
        slide image (RGBA numpy array) it was computed from.
    """
    slide = open_slide(filePath)

    # Use the lowest resolution level of the pyramid to keep this cheap.
    level_dims = slide.level_dimensions
    level_to_analyze = len(level_dims) - 1
    dims_of_selected = level_dims[-1]

    if display_progress:
        print('Selected image of size (' + str(dims_of_selected[0]) + ', ' + str(dims_of_selected[1]) + ')')
    slide_image = slide.read_region((0, 0), level_to_analyze, dims_of_selected)
    slide_image = np.array(slide_image)
    if display_progress:
        plt.figure()
        plt.imshow(slide_image)

    # Perform Otsu thresholding on the blue channel only (tissue is dark in blue).
    # threshR, maskR = cv2.threshold(slide_image[:, :, 0], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # threshG, maskG = cv2.threshold(slide_image[:, :, 1], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    threshB, maskB = cv2.threshold(slide_image[:, :, 2], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Invert: tissue (dark) becomes 255, background becomes 0.
    # mask = ((255-maskR) | (255-maskG) | (255-maskB))
    mask = 255 - maskB
    if display_progress:
        plt.figure()
        plt.imshow(mask)

    # Small-object removal was disabled; kept here for reference.
    # min_pixel_count = 0.005 * dims_of_selected[0] * dims_of_selected[1]
    # mask = np.array(skimage.morphology.remove_small_objects(np.array(mask/255, dtype=bool), min_pixel_count),
    #                 dtype=np.uint8)
    # if display_progress:
    #     print("Min pixel count: {}".format(min_pixel_count))
    #     plt.figure()
    #     plt.imshow(mask)
    #     plt.show()

    # Morphological smoothing: dilate/erode/dilate with a 3x3 kernel.
    kernel = np.ones((3, 3), np.uint8)
    mask = cv2.dilate(mask, kernel, iterations=1)
    mask = cv2.erode(mask, kernel, iterations=1)
    mask = cv2.dilate(mask, kernel, iterations=1)

    # Fill holes
    mask = fill_holes(mask)
    if display_progress:
        plt.figure()
        plt.imshow(mask)
        plt.show()

    return mask, slide_image
401
+
402
+
403
+ ##################################################################
404
+ # Functions for extracting patches from slide image
405
+ ##################################################################
406
+
407
def extract_patch_by_location(filepath, location, patch_size=(500, 500),
                              plot_image=False, level_to_analyze=0, save=False, savepath='.'):
    """Extract a single patch from a whole-slide image at a given location.

    Args:
        filepath: Path to the slide file (*.svs).
        location: (x, y) top-left corner of the patch in level-0 coordinates.
        patch_size: (width, height) of the patch. Default (500, 500).
        plot_image: Set True to display the patch. Default is False.
        level_to_analyze: Slide pyramid level to read from. Default is 0.
        save: Set True to also write the patch to disk as PNG. Default is False.
        savepath: Directory for the saved patch. Default '.'.

    Returns:
        The extracted patch as returned by OpenSlide's read_region.

    Raises:
        IOError: If filepath does not exist.
    """
    if not os.path.isfile(filepath):
        # BUGFIX: the original had an unreachable `return []` after this raise.
        raise IOError("Image not found!")

    slide = open_slide(filepath)
    slide_image = slide.read_region(location, level_to_analyze, patch_size)
    if plot_image:
        plt.figure()
        plt.imshow(slide_image)
        plt.show()

    if save:
        # Derive the output name from the slide filename and patch location.
        filename = re.search("(?<=/)[^/]+\.svs", filepath).group(0)[0:-4]
        savename = os.path.join(savepath, str(filename) + '_' + str(location[0]) + '_' + str(location[1]) + '.png')
        # NOTE(review): scipy.misc.imsave was removed in SciPy >= 1.2; consider
        # imageio.imwrite. Kept as-is to avoid introducing a new dependency.
        misc.imsave(savename, slide_image)
        print("Writed to " + savename)
    return slide_image
426
+
427
+
428
def extract_patch_by_tissue_area(filePath, nPatch=0, patchSize=500, maxPatch=10,
                                 filename=None, savePath=None, displayProgress=False, desiredLevel=0, random=False):
    '''Extract up to maxPatch patches from the tissue area of a slide.

    Input: slide file path (plus patch size/count options).
    Output: image patches written to savePath as PNG files; returns None.

    NOTE(review): the `random` parameter shadows the imported `random` module
    inside this function (the random branch uses np.random instead, so it
    still works, but the name is a trap).
    '''
    if filename is None:
        filename = re.search("(?<=/)[0-9]+\.svs", filePath).group(0)
    if savePath is None:
        # Hard-coded historical default output directory.
        savePath = '/home/swan15/python/brainTumor/sample_patches/'
    bwMask, slideImageCV = get_mask_for_slide_image(filePath, display_progress=displayProgress)
    slide = open_slide(filePath)
    levelDims = slide.level_dimensions
    # Find magnitude: scale factor between level 0 and the level the mask was built at.
    for i in range(0, len(levelDims)):
        if bwMask.shape[0] == levelDims[i][1]:
            magnitude = levelDims[0][1] / levelDims[i][1]
            break

    if not random:
        # Tile the slide and keep tiles whose rows intersect a tissue contour.
        nCol = int(math.ceil(levelDims[0][1] / patchSize))
        nRow = int(math.ceil(levelDims[0][0] / patchSize))
        # Get contours of the tissue mask.
        # NOTE(review): OpenCV >= 4 returns (contours, hierarchy) — this
        # 3-tuple unpacking only works with OpenCV 3.x; confirm the pinned version.
        _, contours, _ = cv2.findContours(bwMask, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
        for nContours in range(0, len(contours)):
            print(nContours)
            # i is the y axis in the image
            for i in range(0, nRow):
                # Row band of this tile in mask (low-resolution) coordinates.
                minRow = i * patchSize / magnitude
                maxRow = (i + 1) * patchSize / magnitude
                # Contour points whose y coordinate falls inside this row band.
                matches = [x for x in range(0, len(contours[nContours][:, 0, 0]))
                           if (contours[nContours][x, 0, 1] > minRow and contours[nContours][x, 0, 1] < maxRow)]
                try:
                    # min()/max() raise ValueError when `matches` is empty — caught below.
                    print([min(contours[nContours][matches, 0, 0]), max(contours[nContours][matches, 0, 0])])

                    # Column span of the contour within this row band, scaled back
                    # to level-0 coordinates and rounded to whole tiles.
                    minCol = min(contours[nContours][matches, 0, 0]) * magnitude
                    maxCol = max(contours[nContours][matches, 0, 0]) * magnitude
                    minColInt = int(math.floor(minCol / patchSize))
                    maxColInt = int(math.ceil(maxCol / patchSize))

                    for j in range(minColInt, maxColInt):
                        startCol = j * patchSize
                        startRow = i * patchSize
                        patch = slide.read_region((startCol, startRow), desiredLevel, (patchSize, patchSize))
                        patchCV = np.array(patch)
                        patchCV = patchCV[:, :, 0:3]  # drop the alpha channel

                        fname = os.path.join(savePath, filename + '_' + str(i) + '_' + str(j) + '.png')

                        # Skip tiles already on disk so reruns resume cheaply.
                        if not os.path.isfile(fname):
                            misc.imsave(fname, patchCV)
                            nPatch = nPatch + 1
                            print(nPatch)

                        if nPatch >= maxPatch:
                            break
                except ValueError:
                    # Empty `matches`: this row band does not touch the contour.
                    continue
                if nPatch >= maxPatch:
                    break
            if nPatch >= maxPatch:
                break
    else:
        # Randomly sample patch centers from the tissue mask.
        for i in range(nPatch, maxPatch):
            coords = np.transpose(np.nonzero(bwMask >= 1))
            y, x = coords[np.random.randint(0, len(coords) - 1)]
            # Scale mask coordinates back to level 0 and center the patch.
            x = int(x * magnitude) - int(patchSize / 2)
            y = int(y * magnitude) - int(patchSize / 2)

            image = np.array(slide.read_region((x, y), desiredLevel, (patchSize, patchSize)))[..., 0:3]

            fname = os.path.join(savePath, filename + '_' + str(i) + '.png')

            if not os.path.isfile(fname):
                misc.imsave(fname, image)
                print(i)
505
+
506
def parseXML(xmlFile, pattern):
    """Parse an annotation XML file and collect polygon vertices.

    Args:
        xmlFile: Path (or file-like object) of the XML annotation file.
        pattern: Region label to collect (e.g. 'ROI' or 'normal').

    Returns:
        dict: {pattern: [{'X': [...], 'Y': [...]}, ...]} with one entry per
        matching <Region>, each holding that region's float vertex coordinates.
    """
    root = ET.parse(xmlFile).getroot()  # tree representation of the XML file
    vertices = {pattern: []}

    # Walk every <Region>; keep only those whose Text label matches `pattern`.
    for region in root.iter('Region'):
        if region.get('Text') != pattern:
            continue
        coords = {'X': [], 'Y': []}
        for vertex in region.iter('Vertex'):
            coords['X'].append(float(vertex.get('X')))
            coords['Y'].append(float(vertex.get('Y')))
        vertices[pattern].append(coords)

    return vertices
534
+
535
+
536
def calculateRatio(levelDims):
    """Return the (Xratio, Yratio) scale between the highest- and lowest-resolution levels.

    levelDims is the slide's level_dimensions sequence; the first entry is the
    highest resolution, the last the lowest.
    """
    ratios = np.asarray(levelDims[0]) / np.asarray(levelDims[-1])
    Xratio, Yratio = ratios
    return (Xratio, Yratio)
544
+
545
+
546
def createMask(levelDims, vertices, pattern):
    """
    Build a low-resolution binary mask from annotation polygons.

    Input: levelDims (nested list): dimensions of each layer of the slide.
           vertices (dict as returned by parseXML)
           pattern (str): annotation label whose polygons are rasterized
    Output: numpy uint8 array of 0/1, where 1 indicates inside a region
            and 0 outside.
    """
    # Down-scale the XML polygons so the mask is built at the slide's lowest
    # resolution, to save memory and time.
    Xratio, Yratio = calculateRatio(levelDims)

    # NOTE(review): OpenSlide level_dimensions entries are (width, height);
    # unpacking the last one as (nRows, nCols) and passing X as the row
    # coordinate to polygon() looks transposed — verify against the callers
    # before changing anything here.
    nRows, nCols = levelDims[-1]
    mask = np.zeros((nRows, nCols), dtype=np.uint8)

    # Rasterize each polygon of the requested label into the mask.
    for i in range(len(vertices[pattern])):
        lowX = np.array(vertices[pattern][i]['X']) / Xratio
        lowY = np.array(vertices[pattern][i]['Y']) / Yratio
        rr, cc = polygon(lowX, lowY, (nRows, nCols))
        mask[rr, cc] = 1

    return mask
568
+
569
+
570
def getMask(xmlFile, svsFile, pattern):
    """Parse XML annotations and build the corresponding slide mask.

    @param: {string} xmlFile: xml file containing annotation vertices outlining the mask.
    @param: {string} svsFile: svs file containing the slide image.
    @param: {string} pattern: name of the xml labeling.
    Returns: slide - openslide slide object (0 when no matching annotation exists)
             mask  - 0/1 matrix mask of pattern (0 when no matching annotation exists)
    """
    vertices = parseXML(xmlFile, pattern)  # vertices of the annotated regions

    # No annotation carries this label: keep the historical (0, 0) sentinel.
    if not vertices[pattern]:
        return 0, 0

    slide = open_slide(svsFile)
    mask = createMask(slide.level_dimensions, vertices, pattern)
    return slide, mask
592
+
593
+
594
def plotMask(mask):
    """Display a mask in a tall (6x10 inch) matplotlib figure (blocking)."""
    fig, ax1 = plt.subplots(nrows=1, figsize=(6, 10))
    ax1.imshow(mask)
    plt.show()
598
+
599
+
600
def chooseRandPixel(mask):
    """Pick a uniformly random nonzero pixel of `mask`.

    NOTE: the returned [x, y] pair corresponds to [row, col] in the mask,
    so callers that read full-resolution regions must scale and reorder
    the coordinates themselves (see getPatches for an example).

    @param {numpy matrix} mask: array whose nonzero entries are candidates.
    Returns: numpy array [row, col] of one randomly selected nonzero pixel.
    """
    candidates = np.transpose(np.nonzero(mask))  # (N, 2) nonzero coords
    pick = random.randrange(len(candidates))     # uniform index in [0, N)
    return candidates[pick]
622
+
623
+
624
def plotImage(image):
    """Display `image` with matplotlib (blocking until the window closes)."""
    plt.imshow(image)
    plt.show()
627
+
628
+
629
def checkWhiteSlide(image):
    """Return True when a patch is (near-)blank white.

    The patch counts as white when the mean intensity over all RGB
    channels is at least 230 (out of 255).
    """
    rgb = np.asarray(image.convert(mode='RGB'))
    return np.mean(rgb) >= 230
634
+
635
+
636
# extractPatchByXMLLabeling
def getPatches(slide, mask, numPatches=0, dims=(0, 0), dirPath='', slideNum='', plot=False, plotMask=False):
    """ Generates and saves 'numPatches' patches with dimension 'dims' from image 'slide' contained within 'mask'.
    @param {Openslide Slide obj} slide: image object
    @param {numpy matrix} mask: where 0 is outside region of interest and 1 indicates within
    @param {int} numPatches: number of non-white patches to accept before stopping
    @param {tuple} dims: (w,h) dimensions of patches
    @param {string} dirPath: directory in which to save patches
    @param {string} slideNum: slide number
    @param {bool} plot: show each sampled patch
    @param {bool} plotMask: zero out sampled regions in `mask` and show it at the end
        (NOTE(review): this parameter shadows the module-level plotMask() function)
    Saves patches in directory specified by dirPath as [slideNum]_[Xpixel]x[Ypixel].png
    """
    w, h = dims
    levelDims = slide.level_dimensions
    Xratio, Yratio = calculateRatio(levelDims)  # low-res -> full-res scale factors

    i = 0  # counts accepted (non-white) patches
    while i < numPatches:
        firstLoop = True  # Boolean to ensure while loop runs at least once.

        while firstLoop:  # or not mask[rr,cc].all(): # True if it is the first loop or if all pixels are in the mask
            firstLoop = False
            x, y = chooseRandPixel(mask)  # Get random top left pixel of patch (low-res coords).
            # patch footprint as a polygon in low-res mask coordinates
            xVertices = np.array([x, x + (w / Xratio), x + (w / Xratio), x, x])
            yVertices = np.array([y, y, y - (h / Yratio), y - (h / Yratio), y])
            rr, cc = polygon(xVertices, yVertices)

        # read the patch at full resolution (pyramid level 0)
        image = slide.read_region((int(x * Xratio), int(y * Yratio)), 0, (w, h))

        isWhite = checkWhiteSlide(image)
        # newPath = 'other' if isWhite else dirPath
        # NOTE(review): only non-white patches advance the counter, but the
        # patch is saved below regardless — white patches are also written,
        # so more than numPatches files may be produced. Confirm intended.
        if not isWhite: i += 1

        slideName = '_'.join([slideNum, 'x'.join([str(x * Xratio), str(y * Yratio)])])
        image.save(os.path.join(dirPath, slideName + ".png"))

        if plot:
            plotImage(image)
        if plotMask: mask[rr, cc] = 0  # mark region as consumed (mutates caller's mask)

    if plotMask:
        plotImage(mask)
677
+
678
+
679
+ '''Example codes for getting patches from labeled svs files:
680
+ #define the patterns
681
+ patterns = ['small_acinar',
682
+ 'large_acinar',
683
+ 'tubular',
684
+ 'trabecular',
685
+ 'aveolar',
686
+ 'solid',
687
+ 'pseudopapillary',
688
+ 'rhabdoid',
689
+ 'sarcomatoid',
690
+ 'necrosis',
691
+ 'normal',
692
+ 'other']
693
+ #create folders
694
+ for pattern in patterns:
695
+ if not os.path.exists(pattern):
696
+ os.makedirs(pattern)
697
+ #define parameters
698
+ patchSize = 500
699
+ numPatches = 50
700
+ dirName = '/home/swan15/kidney/ccRCC/slides'
701
+ annotatedSlides = 'slide_region_of_interests.txt'
702
+
703
+ f = open(annotatedSlides, 'r+')
704
+ slides = [re.search('.*(?=\.svs)', line).group(0) for line in f
705
+ if re.search('.*(?=\.svs)', line) is not None]
706
+ print(slides)
707
+ f.close()
708
+ for slideID in slides:
709
+ print('Start '+slideID)
710
+ try:
711
+ xmlFile = slideID+'.xml'
712
+ svsFile = slideID+'.svs'
713
+
714
+ xmlFile = os.path.join(dirName, xmlFile)
715
+ svsFile = os.path.join(dirName, svsFile)
716
+
717
+ if not os.path.isfile(xmlFile):
718
+ print(xmlFile + ' not exist')
719
+ continue
720
+
721
+ for pattern in patterns:
722
+
723
+ numPatchesGenerated = len([files for files in os.listdir(pattern)
724
+ if re.search(slideID+'_.+\.png', files) is not None])
725
+ if numPatchesGenerated >= numPatches:
726
+ print(pattern+' existed')
727
+ continue
728
+ else:
729
+ numPatchesTemp = numPatches - numPatchesGenerated
730
+
731
+ slide, mask = getMask(xmlFile, svsFile, pattern)
732
+
733
+ if not slide:
734
+ #print(pattern+' not detected.')
735
+ continue
736
+
737
+ getPatches(slide, mask, numPatches = numPatchesTemp, dims = (patchSize, patchSize),
738
+ dirPath = pattern+'/', slideNum = slideID, plotMask = False) # Get Patches
739
+ print(pattern+' done.')
740
+
741
+ print('Done with ' + slideID)
742
+ print('----------------------')
743
+
744
+ except:
745
+ print('Error with ' + slideID)
746
+ '''
747
+
748
+
749
+ ##################################################################
750
+ # RGB color processing
751
+ ##################################################################
752
+
753
+ # convert RGBA image to RGB (specifically designed for masks)
754
def convert_RGBA(RGBA_img):
    """Flatten an RGBA mask image to RGB.

    Fully transparent pixels (alpha == 0) become white and fully opaque
    pixels (alpha == 255) keep their RGB values; pixels with any other
    alpha are left black (the zero-initialized default).

    Non-RGBA inputs are returned unchanged after printing a warning.
    """
    if np.shape(RGBA_img)[2] != 4:
        print("Not an RGBA image")
        return RGBA_img

    height, width = np.shape(RGBA_img)[0], np.shape(RGBA_img)[1]
    rgb = np.zeros((height, width, 3))
    alpha = RGBA_img[:, :, 3]
    rgb[alpha == 0] = [255, 255, 255]
    rgb[alpha == 255] = RGBA_img[alpha == 255, 0:3]
    return rgb
763
+
764
+
765
+ # Convert RGB mask to one-channel mask
766
+ def RGB_to_index(RGB_img, RGB_markers=None, RGB_labels=None):
767
+ """Change RGB to 2D index matrix; each RGB color corresponds to one index.
768
+
769
+ Args:
770
+ RGB_markers: start from background (marked as 0);
771
+ Example format:
772
+ [[255, 255, 255],
773
+ [160, 255, 0]]
774
+ RGB_labels: a numeric vector corresponding to the labels of RGB_markers;
775
+ length should be the same as RGB_markers.
776
+ """
777
+ if np.shape(RGB_img)[2] != 3:
778
+ print("Not an RGB image")
779
+ return RGB_img
780
+ else:
781
+ if RGB_markers == None:
782
+ RGB_markers = [[255, 255, 255]]
783
+ if RGB_labels == None:
784
+ RGB_labels = range(np.shape(RGB_markers)[0])
785
+ mask_index = np.zeros((np.shape(RGB_img)[0], np.shape(RGB_img)[1]))
786
+ for i, RGB_label in enumerate(RGB_labels):
787
+ mask_index[np.all(RGB_img == RGB_markers[i], axis=2)] = RGB_label
788
+ return mask_index
789
+
790
+
791
+ def index_to_RGB(mask_index, RGB_markers=None):
792
+ """Change index to 2D image; each index corresponds to one color"""
793
+ mask_index_copy = mask_index.copy()
794
+ mask_index_copy = np.squeeze(mask_index_copy) # In case the mask shape is not [height, width]
795
+ if RGB_markers == None:
796
+ print("RGB_markers not provided!")
797
+ RGB_markers = [[255, 255, 255]]
798
+ RGB_img = np.zeros((np.shape(mask_index_copy)[0], np.shape(mask_index_copy)[1], 3), dtype=np.uint8)
799
+ RGB_img[:, :] = RGB_markers[0] # Background
800
+ for i in range(np.shape(RGB_markers)[0]):
801
+ RGB_img[mask_index_copy == i] = RGB_markers[i]
802
+ return RGB_img
803
+
804
+
805
def shift_HSV(img, amount=(0.9, 0.9, 0.9)):
    """Scale the Hue, Saturation and Value channels of an RGB image.

    Each HSV channel is multiplied by the matching factor in `amount`,
    clipped to [0, 255], and the result is converted back to RGB.
    """
    hsv = np.array(Image.fromarray(img, 'RGB').convert('HSV'))
    for ch, factor in enumerate(amount):
        # clip, then let the uint8 assignment truncate back to integers
        hsv[..., ch] = np.clip(hsv[..., ch] * factor, a_min=0, a_max=255)
    shifted = Image.fromarray(hsv, 'HSV')
    return np.array(shifted.convert('RGB'))
815
+
cytof/utils.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle as pkl
3
+ import skimage
4
+ import matplotlib.pyplot as plt
5
+ from matplotlib.patches import Rectangle
6
+ import seaborn as sns
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.mixture import GaussianMixture
10
+ import scipy
11
+ from typing import Union, Optional, Type, Tuple, List, Dict
12
+ import itertools
13
+ from multiprocessing import Pool
14
+ from tqdm import tqdm
15
+ from readimc import MCDFile, TXTFile
16
+ import warnings
17
+
18
+
19
+
20
def load_CytofImage(savename):
    """Load a pickled CytofImage object from `savename`.

    Uses a context manager so the file handle is closed promptly instead
    of being leaked until garbage collection (the original passed an
    anonymous open() into pkl.load).

    NOTE(security): pickle.load can execute arbitrary code; only load
    files from trusted sources.
    """
    with open(savename, "rb") as f:
        return pkl.load(f)
23
+
24
def load_CytofCohort(savename):
    """Load a pickled CytofCohort object from `savename`.

    Uses a context manager so the file handle is closed promptly instead
    of being leaked until garbage collection.

    NOTE(security): pickle.load can execute arbitrary code; only load
    files from trusted sources.
    """
    with open(savename, "rb") as f:
        return pkl.load(f)
27
+
28
+
29
def process_mcd(filename: str,
                params: Dict):

    """
    Process a whole-slide .mcd (Imaging Mass Cytometry) file into a CytofCohort.

    Each readable acquisition (ROI) becomes a CytofImageTiff. When
    params['channels_dict'] maps channels to nuclei/membrane roles,
    segmentation and feature extraction are run per ROI and then
    batch-processed on the assembled cohort.

    Args:
        filename: path to the .mcd file.
        params: options dict; recognized keys (all optional):
            quality_control_thres (currently unused — the QC call below is
                commented out), channels_remove, channels_dict,
            use_membrane (default False), cell_radius (default 5),
            normalize_qs (default 75).

    Returns:
        (corrupted, cytof_cohort): list of "slide-roi" ids skipped because
        their acquisition held no data, and the assembled CytofCohort.
    """


    from classes import CytofImageTiff, CytofCohort
    quality_control_thres = params.get("quality_control_thres", None)
    channels_remove = params.get("channels_remove", None)
    channels_dict = params.get("channels_dict", None)
    use_membrane = params.get("use_membrane", False)
    cell_radius = params.get("cell_radius", 5)
    normalize_qs = params.get("normalize_qs", 75)

    df_cohort = pd.DataFrame(columns = ['Slide', 'ROI', 'input file'])
    cytof_images = {}
    corrupted = []  # "slide-roi" ids whose acquisition data is empty
    with MCDFile(filename) as f:
        for slide in f.slides:
            sid = f"{slide.description}{slide.id}"
            print(sid)
            for roi in slide.acquisitions:
                rid = roi.description
                print(f'processing slide_id-roi: {sid}-{rid}')

                # an acquisition with no bytes between start and end offsets
                # is treated as corrupted and skipped
                if roi.metadata["DataStartOffset"] < roi.metadata["DataEndOffset"]:
                    img_roi = f.read_acquisition(roi)  # array, shape: (c, y, x), dtype: float32
                    # reorder to channels-last (y, x, c) for CytofImageTiff
                    img_roi = np.transpose(img_roi, (1, 2, 0))
                    cytof_img = CytofImageTiff(slide=sid, roi = rid, image=img_roi, filename=f"{sid}-{rid}")

                    # cytof_img.quality_control(thres=quality_control_thres)
                    # channel name format: "marker(metal)"
                    channels = [f"{mk}({cn})" for (mk, cn) in zip(roi.channel_labels, roi.channel_names)]
                    cytof_img.set_markers(markers=roi.channel_labels, labels=roi.channel_names, channels=channels)  # targets, metals

                    # known corrupted channels, e.g. nan-nan1
                    if channels_remove is not None and len(channels_remove) > 0:
                        cytof_img.remove_special_channels(channels_remove)

                    # maps channel names to nuclei/membrane
                    if channels_dict is not None:

                        # remove nuclei channel for segmentation
                        channels_rm = cytof_img.define_special_channels(channels_dict, rm_key='nuclei')
                        cytof_img.remove_special_channels(channels_rm)
                        cytof_img.get_seg(radius=cell_radius, use_membrane=use_membrane)
                        cytof_img.extract_features(cytof_img.filename)
                        cytof_img.feature_quantile_normalization(qs=normalize_qs)

                    df_cohort = pd.concat([df_cohort, pd.DataFrame.from_dict([{'Slide': sid,
                                                                               'ROI': rid,
                                                                               'input file': filename}])])
                    cytof_images[f"{sid}-{rid}"] = cytof_img
                else:
                    corrupted.append(f"{sid}-{rid}")
    print(f"This cohort now contains {len(cytof_images)} ROIs, after excluding {len(corrupted)} corrupted ones from the original MCD.")

    cytof_cohort = CytofCohort(cytof_images=cytof_images, df_cohort=df_cohort)
    if channels_dict is not None:
        cytof_cohort.batch_process_feature()
    else:
        warnings.warn("Feature extraction is not done as no nuclei channels defined by 'channels_dict'!")
    return corrupted, cytof_cohort#, cytof_images
93
+
94
+
95
def save_multi_channel_img(img, savename):
    """
    Write a multi-channel image array to `savename`.

    Thin wrapper around skimage.io.imsave; the output format is chosen
    from the file extension of `savename`.
    """
    skimage.io.imsave(savename, img)
100
+
101
+
102
def generate_color_dict(names: List,
                        sort_names: bool = True,
                        ):
    """
    Generate a dictionary of colors based on provided "names", using
    matplotlib's 'tab20' qualitative palette.

    Args:
        names: legend names to assign colors to. When `sort_names` is True
            the list is sorted in place (kept for backward compatibility
            with existing callers that rely on the sorted order).
        sort_names: sort names before assigning so the mapping is stable
            across runs (default True).

    Returns:
        dict mapping each name to an (r, g, b) tuple with values in [0, 1].
    """
    if sort_names:
        names.sort()

    palette = plt.cm.get_cmap('tab20').colors
    # cycle through the palette so more than 20 names no longer raises
    # IndexError (colors simply repeat after 20 entries)
    color_dict = {n: palette[i % len(palette)] for i, n in enumerate(names)}
    return color_dict
113
+
114
def show_color_table(color_dict: dict,
                     title: str = "",
                     maxcols: int = 4,
                     emptycols: int = 0,
                     dpi: int = 72,
                     cell_width: int = 212,
                     cell_height: int = 22,
                     swatch_width: int = 48,
                     margin: int = 12,
                     topmargin: int = 40,
                     show: bool = True
                     ):
    """
    Show color dictionary: draw a legend-style table with one color
    swatch and its name per cell.
    reference: https://matplotlib.org/stable/gallery/color/named_colors.html
    args:
        color_dict = a dictionary of colors. key: color legend name - value: RGB representation of color
        title (optional) = title for the color table (default="")
        maxcols = maximum number of columns in visualization
        emptycols (optional) = number of empty columns for a maxcols-column figure,
            i.e. maxcols=4 and emptycols=3 means presenting single column plot (default=0)
        dpi / cell_width / cell_height / swatch_width / margin / topmargin =
            pixel-level layout parameters of the table
        show = NOTE(review): currently unused — the figure is created but
            never shown or returned by this function; confirm whether
            callers rely on plt state afterwards.
    """

    names = color_dict.keys()

    n = len(names)
    ncols = maxcols - emptycols
    nrows = n // ncols + int(n % ncols > 0)  # ceil(n / ncols)

    width = cell_width * ncols + 2 * margin
    height = cell_height * nrows + margin + topmargin

    # figure sized in pixels via figsize * dpi; axes use raw pixel coords
    fig, ax = plt.subplots(figsize=(width / dpi, height / dpi), dpi=dpi)
    fig.subplots_adjust(margin / width, margin / height,
                        (width - margin) / width, (height - topmargin) / height)
    ax.set_xlim(0, cell_width * ncols)
    # y axis inverted so row 0 is at the top
    ax.set_ylim(cell_height * (nrows - 0.5), -cell_height / 2.)
    ax.yaxis.set_visible(False)
    ax.xaxis.set_visible(False)
    ax.set_axis_off()
    ax.set_title(title, fontsize=16, loc="left", pad=10)

    # NOTE: the loop variable `n` shadows the count computed above
    for i, n in enumerate(names):
        row = i % nrows   # fill column-major: down first, then across
        col = i // nrows
        y = row * cell_height

        swatch_start_x = cell_width * col
        text_pos_x = cell_width * col + swatch_width + 7

        ax.text(text_pos_x, y, n, fontsize=12,
                horizontalalignment='left',
                verticalalignment='center')

        ax.add_patch(
            Rectangle(xy=(swatch_start_x, y - 9), width=swatch_width,
                      height=18, facecolor=color_dict[n], edgecolor='0.7')
        )
+ )
187
+
188
+
189
+
190
def _extract_feature_one_nuclei(nuclei_id, nuclei_seg, cell_seg, filename, morphology, nuclei_morphology, cell_morphology,
                                channels, raw_image, sum_exp_nuclei, ave_exp_nuclei, sum_exp_cell, ave_exp_cell):
    """Extract morphology and marker-expression features for one nucleus/cell.

    Returns a dict mapping feature name -> value, or an empty dict when
    `nuclei_id` has no region in either the nuclei or the cell
    segmentation (the caller drops empty results).
    """
    # regionprops on the boolean mask for this id ("* 1" yields an int label image)
    regions = skimage.measure.regionprops((nuclei_seg == nuclei_id) * 1)
    if len(regions) >= 1:
        this_nucleus = regions[0]
    else:
        return {}
    regions = skimage.measure.regionprops((cell_seg == nuclei_id) * 1)  # , coordinates='xy') (deprecated)
    if len(regions) >= 1:
        this_cell = regions[0]
    else:
        return {}

    centroid_y, centroid_x = this_nucleus.centroid  # y: rows; x: columns
    res = {"filename": filename,
           "id": nuclei_id,
           "coordinate_x": centroid_x,
           "coordinate_y": centroid_y}

    # morphology: every named regionprops attribute except the last entry
    # ("pa_ratio"), which is computed by hand below
    for i, feature in enumerate(morphology[:-1]):
        res[nuclei_morphology[i]] = getattr(this_nucleus, feature)
        res[cell_morphology[i]] = getattr(this_cell, feature)
    # perimeter-to-area ratio (perimeter^2 / filled area)
    res[nuclei_morphology[-1]] = 1.0 * this_nucleus.perimeter ** 2 / this_nucleus.filled_area
    res[cell_morphology[-1]] = 1.0 * this_cell.perimeter ** 2 / this_cell.filled_area


    # markers: sum and mean intensity per channel over the nucleus mask
    # and over the whole-cell mask
    for ch, marker in enumerate(channels):
        res[sum_exp_nuclei[ch]] = np.sum(raw_image[nuclei_seg == nuclei_id, ch])
        res[ave_exp_nuclei[ch]] = np.average(raw_image[nuclei_seg == nuclei_id, ch])
        res[sum_exp_cell[ch]] = np.sum(raw_image[cell_seg == nuclei_id, ch])
        res[ave_exp_cell[ch]] = np.average(raw_image[cell_seg == nuclei_id, ch])
    return res
+ return res
224
+
225
+
226
def extract_feature(channels: List,
                    raw_image: np.ndarray,
                    nuclei_seg: np.ndarray,
                    cell_seg: np.ndarray,
                    filename: str,
                    use_parallel: bool = True,
                    show_sample: bool = False) -> pd.DataFrame:
    """ Extract nuclei and cell level feature from cytof image based on nuclei segmentation and cell segmentation
    results
    Inputs:
        channels     = channels to extract feature from (must match raw_image's last axis)
        raw_image    = raw cytof image, channels-last
        nuclei_seg   = nuclei segmentation label image
        cell_seg     = cell segmentation label image
        filename     = filename of current cytof image
        use_parallel = extract per-nucleus features with a multiprocessing Pool (default True)
        show_sample  = print 5 random rows of the result (default False)
    Returns:
        feature_summary_df = a dataframe containing summary of extracted features

    :param channels: list
    :param raw_image: numpy.ndarray
    :param nuclei_seg: numpy.ndarray
    :param cell_seg: numpy.ndarray
    :param filename: string
    :return feature_summary_df: pandas.core.frame.DataFrame
    """
    assert (len(channels) == raw_image.shape[-1])

    # morphology features to be extracted
    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]

    ## morphology features
    nuclei_morphology = [_ + '_nuclei' for _ in morphology]  # morphology - nuclei level
    cell_morphology = [_ + '_cell' for _ in morphology]  # morphology - cell level

    ## single cell features
    # nuclei level
    sum_exp_nuclei = [_ + '_nuclei_sum' for _ in channels]  # sum expression over nuclei
    ave_exp_nuclei = [_ + '_nuclei_ave' for _ in channels]  # average expression over nuclei

    # cell level
    sum_exp_cell = [_ + '_cell_sum' for _ in channels]  # sum expression over cell
    ave_exp_cell = [_ + '_cell_ave' for _ in channels]  # average expression over cell

    # column names of final result dataframe
    column_names = ["filename", "id", "coordinate_x", "coordinate_y"] + \
                   sum_exp_nuclei + ave_exp_nuclei + nuclei_morphology + \
                   sum_exp_cell + ave_exp_cell + cell_morphology

    # Initiate
    # NOTE(review): this empty dataframe is overwritten by
    # pd.DataFrame(res) below, so column_names only documents the layout.
    n_nuclei = np.max(nuclei_seg)
    feature_summary_df = pd.DataFrame(columns=column_names)

    # labels iterate from 2 to n_nuclei inclusive
    # NOTE(review): label 1 is skipped — presumably reserved by the
    # segmentation step; confirm against get_seg's labeling convention.
    if use_parallel:
        nuclei_ids = range(2, n_nuclei + 1)
        # fan out one task per nucleus; all shared arrays are repeated args
        with Pool() as mp_pool:
            res = mp_pool.starmap(_extract_feature_one_nuclei,
                                  zip(nuclei_ids,
                                      itertools.repeat(nuclei_seg),
                                      itertools.repeat(cell_seg),
                                      itertools.repeat(filename),
                                      itertools.repeat(morphology),
                                      itertools.repeat(nuclei_morphology),
                                      itertools.repeat(cell_morphology),
                                      itertools.repeat(channels),
                                      itertools.repeat(raw_image),
                                      itertools.repeat(sum_exp_nuclei),
                                      itertools.repeat(ave_exp_nuclei),
                                      itertools.repeat(sum_exp_cell),
                                      itertools.repeat(ave_exp_cell)
                                      ))
        # print(len(res), n_nuclei)

    else:
        res = []
        for nuclei_id in tqdm(range(2, n_nuclei + 1), position=0, leave=True):
            res.append(_extract_feature_one_nuclei(nuclei_id, nuclei_seg, cell_seg, filename,
                                                   morphology, nuclei_morphology, cell_morphology,
                                                   channels, raw_image,
                                                   sum_exp_nuclei, ave_exp_nuclei, sum_exp_cell, ave_exp_cell))


    feature_summary_df = pd.DataFrame(res)
    if show_sample:
        print(feature_summary_df.sample(5))

    return feature_summary_df
+ return feature_summary_df
316
+
317
+
318
+
319
def check_feature_distribution(feature_summary_df, features):
    """Plot a log2-scale histogram for each requested feature.

    One small (3x2 inch, 100-bin) figure is shown per feature, with the
    x axis fixed to [-15, 15]; nothing is returned.

    Args:
        feature_summary_df: dataframe of extracted feature summaries.
        features: iterable of column names to plot.
    """
    for feat in features:
        print(feat)
        fig, axis = plt.subplots(1, 1, figsize=(3, 2))
        # small epsilon keeps log2 finite for zero-valued features
        log_vals = np.log2(feature_summary_df[feat] + 0.0001)
        axis.hist(log_vals, 100)
        axis.set_xlim(-15, 15)
        plt.show()
+ plt.show()
337
+
338
+
339
+ # def visualize_scatter(data, communities, n_community, title, figsize=(4,4), savename=None, show=False):
340
+ # """
341
+ # data = data to visualize (N, 2)
342
+ # communities = group indices correspond to each sample in data (N, 1) or (N, )
343
+ # n_community = total number of groups in the cohort (n_community >= unique number of communities)
344
+ # """
345
+ # fig, ax = plt.subplots(1,1, figsize=figsize)
346
+ # ax.set_title(title)
347
+ # sns.scatterplot(x=data[:,0], y=data[:,1], hue=communities, palette='tab20',
348
+ # hue_order=np.arange(n_community))
349
+ # # legend=legend,
350
+ # # hue_order=np.arange(n_community))
351
+ # plt.axis('tight')
352
+ # plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
353
+ # if savename is not None:
354
+ # print("saving plot to {}".format(savename))
355
+ # plt.savefig(savename)
356
+ # if show:
357
+ # plt.show()
358
+ # return None
359
+ # return fig
360
+
361
def visualize_scatter(data, communities, n_community, title, figsize=(5,5), savename=None, show=False, ax=None):
    """
    Scatter plot of 2D embeddings colored by community / cluster id.

    data = data to visualize (N, 2)
    communities = group indices correspond to each sample in data (N, 1) or (N, )
    n_community = total number of groups in the cohort (n_community >= unique number of communities)
    figsize = NOTE(review): accepted but not used — the figure is created
        with default size; confirm whether it should be passed to subplots.
    savename = when given, the figure is saved to this path
    show = show the figure (only honored when `ax` is None)
    ax = draw into an existing axis instead of creating a new figure

    Returns the created figure, or None when drawing into a caller axis.
    """
    # close figures afterwards only when we own the figure and won't show it
    clos = not show and ax is None
    show = show and ax is None

    if ax is None:
        fig, ax = plt.subplots(1,1)
    else:
        fig = None  # caller owns the figure
    ax.set_title(title)
    # fixed hue_order keeps colors consistent across plots of the cohort
    sns.scatterplot(x=data[:,0], y=data[:,1], hue=communities, palette='tab20',
                    hue_order=np.arange(n_community), ax=ax)

    # legend outside the axes on the right
    ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    if savename is not None:
        print("saving plot to {}".format(savename))
        plt.tight_layout()
        plt.savefig(savename)
    if show:
        plt.show()
    if clos:
        plt.close('all')
    return fig
+ return fig
391
+
392
def visualize_expression(data, markers, group_ids, title, figsize=(5,5), savename=None, show=False, ax=None):
    """Heatmap of normalized marker expression per Phenograph cluster.

    Args:
        data: 2D array (clusters x markers) of normalized expression.
        markers: x tick labels (marker names).
        group_ids: y tick labels (cluster ids).
        title: appended to the plot title.
        figsize: accepted for interface compatibility; not used here.
        savename: when given, the figure is saved to this path.
        show: show the figure (only honored when `ax` is None).
        ax: draw into an existing axis instead of creating a new figure.

    Returns:
        The created figure, or None when drawing into a caller axis.
    """
    # close figures afterwards only when we own them and won't show them
    close_after = ax is None and not show
    show = show and ax is None

    if ax is None:
        fig, ax = plt.subplots(1, 1)
    else:
        fig = None  # caller owns the figure

    sns.heatmap(data,
                cmap='magma',
                xticklabels=markers,
                yticklabels=group_ids,
                ax=ax)
    ax.set_xlabel("Markers")
    ax.set_ylabel("Phenograph clusters")
    ax.set_title("normalized expression - {}".format(title))
    ax.xaxis.set_tick_params(labelsize=8)

    if savename is not None:
        plt.tight_layout()
        plt.savefig(savename)
    if show:
        plt.show()
    if close_after:
        plt.close('all')
    return fig
+ return fig
418
+
419
def _get_thresholds(df_feature: pd.DataFrame,
                    features: List[str],
                    thres_bg: float = 0.3,
                    visualize: bool = True,
                    verbose: bool = False):
    """Calculate thresholds for each feature by assuming a Gaussian Mixture Model
    Inputs:
        df_feature = dataframe of extracted feature summary
        features   = a list of features to calculate thresholds from
        thres_bg   = a threshold such that the component with the mixing weight greater than the threshold would
                     be considered as background. (Default=0.3)
        visualize  = a flag indicating whether to visualize the feature distributions and thresholds or not.
                     (Default=True)
        verbose    = a flag indicating whether to print calculated values on screen or not. (Default=False)
    Outputs:
        thresholds = a dictionary of calculated threshold values (feature name -> mu + 2.5*sigma
                     of the background component)
    :param df_feature: pandas.core.frame.DataFrame
    :param features: list
    :param visualize: bool
    :param verbose: bool
    :return thresholds: dict
    """
    thresholds = {}
    for f, feat_name in enumerate(features):
        X = df_feature[feat_name].values.reshape(-1, 1)
        # fit a 2-component GMM: background vs signal
        gm = GaussianMixture(n_components=2, random_state=0, n_init=2).fit(X)
        # background = among components with mixing weight above thres_bg,
        # the one with the smallest mean
        mu = np.min(gm.means_[gm.weights_ > thres_bg])
        # index of that component; exact float equality is safe because mu
        # was taken from gm.means_ itself
        which_component = np.argmax(gm.means_ == mu)

        if verbose:
            print(f"GMM mean values: {gm.means_}")
            print(f"GMM weights: {gm.weights_}")
            print(f"GMM covariances: {gm.covariances_}")

        X = df_feature[feat_name].values
        hist = np.histogram(X, 150)
        sigma = np.sqrt(gm.covariances_[which_component, 0, 0])
        background_ratio = gm.weights_[which_component]
        # threshold = background mean + 2.5 standard deviations
        thres = sigma * 2.5 + mu
        thresholds[feat_name] = thres

        # how many samples exceed the threshold (treated as "positive")
        n = sum(X > thres)
        percentage = n / len(X)

        ## visualize
        if visualize:
            fig, ax = plt.subplots(1, 1)
            ax.hist(X, 150, density=True)
            ax.set_xlabel("log2({})".format(feat_name))
            # background component density (red curve)
            ax.plot(hist[1], scipy.stats.norm.pdf(hist[1], mu, sigma) * background_ratio, c='red')

            # the other component (orange curve)
            # NOTE(review): argmin of the equality mask assumes exactly 2
            # components so "not background" is unambiguous — holds here.
            _which_component = np.argmin(gm.means_ == mu)
            _mu = gm.means_[_which_component]
            _sigma = np.sqrt(gm.covariances_[_which_component, 0, 0])
            ax.plot(hist[1], scipy.stats.norm.pdf(hist[1], _mu, _sigma) * (1 - background_ratio), c='orange')

            ax.axvline(x=thres, c='red')
            ax.text(0.7, 0.9, "n={}, percentage={}".format(n, np.round(percentage, 3)), ha='center', va='center',
                    transform=ax.transAxes)
            ax.text(0.3, 0.9, "mu={}, sigma={}".format(np.round(mu, 2), np.round(sigma, 2)), ha='center', va='center',
                    transform=ax.transAxes)
            ax.text(0.3, 0.8, "background ratio={}".format(np.round(background_ratio, 2)), ha='center', va='center',
                    transform=ax.transAxes)
            ax.set_title(feat_name)
            plt.show()
    return thresholds
+ return thresholds
485
+
486
+ def _generate_summary(df_feature: pd.DataFrame, features: List[str], thresholds: dict) -> pd.DataFrame:
487
+ """Generate (cell level) summary table for each feature in features: feature name, total number (of cells),
488
+ calculated GMM threshold for this feature, number of individuals (cells) with greater than threshold values,
489
+ ratio of individuals (cells) with greater than threshold values
490
+ Inputs:
491
+ df_feature = dataframe of extracted feature summary
492
+ features = a list of features to generate summary table
493
+ thresholds = (calculated GMM-based) thresholds for each feature
494
+ Outputs:
495
+ df_info = summary table for each feature
496
+
497
+ :param df_feature: pandas.core.frame.DataFrame
498
+ :param features: list
499
+ :param thresholds: dict
500
+ :return df_info: pandas.core.frame.DataFrame
501
+ """
502
+
503
+ df_info = pd.DataFrame(columns=['feature', 'total number', 'threshold', 'positive counts', 'positive ratio'])
504
+
505
+ for feature in features: # loop over each feature
506
+ thres = thresholds[feature] # fetch threshold for the feature
507
+ X = df_feature[feature].values
508
+ n = sum(X > thres)
509
+ N = len(X)
510
+
511
+ df_new_row = pd.DataFrame({'feature': feature, 'total number': N, 'threshold': thres,
512
+ 'positive counts': n, 'positive ratio': n / N}, index=[0])
513
+ df_info = pd.concat([df_info, df_new_row])
514
+ return df_info.reset_index(drop=True)
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ matplotlib==3.6.0
2
+ numpy==1.24.3
3
+ pandas==1.5.1
4
+ PyYAML==6.0
5
+ scikit-image==0.19.3
6
+ scikit-learn==1.1.3
7
+ scipy==1.9.3
8
+ seaborn==0.12.1
9
+ tqdm==4.64.1
10
+ threadpoolctl==3.1.0
11
+ opencv-python==4.7.0.72
12
+ phenograph==1.5.7
13
+ umap-learn==0.5.3
14
+ readimc==0.6.2
15
+ gradio==4.0.1
16
+ plotly==5.18.0
17
+ imagecodecs==2023.1.23