fangjiang commited on
Commit
2990d1c
·
1 Parent(s): 1c8ba2d

initial update

Browse files
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: MultiTAP Testing
3
- emoji: 👀
4
- colorFrom: green
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.42.0
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: MultiTAP
3
+ emoji: 🌖
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.8.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import yaml
3
+ import skimage
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ from matplotlib.pyplot import cm
7
+ import plotly.express as px
8
+ import plotly.graph_objs as go
9
+ from plotly.subplots import make_subplots
10
+ import os
11
+ import seaborn as sns
12
+
13
+ from cytof import classes
14
+ from classes import CytofImage, CytofCohort, CytofImageTiff
15
+ from cytof.hyperion_preprocess import cytof_read_data_roi
16
+ from cytof.utils import show_color_table
17
+
18
+ OUTDIR = './output'
19
+
20
def cytof_tiff_eval(file_path, marker_path, cytof_state):
    """Load an uploaded ROI, resolve its markers, and render a per-channel preview.

    Args:
        file_path: path of the uploaded TXT/CSV or TIFF file.
        marker_path: path of the YAML marker file (None in the TXT/CSV case).
        cytof_state: current gr.State value; replaced by the freshly loaded image.

    Returns:
        (status message, channel-grid figure, loaded CytofImage/CytofImageTiff)
    """
    # Uploaded filenames are unpredictable, so use fixed slide/ROI identifiers.
    slide = 'slide0'
    roi = 'roi1'

    cytof_img, _ = cytof_read_data_roi(file_path, slide, roi)

    if marker_path is None:
        # Case 1: TXT/CSV upload — marker names are embedded in the table itself.
        cytof_img.get_markers()
        cytof_img.preprocess()
        cytof_img.get_image()
    else:
        # Case 2: TIFF upload — markers come from the companion YAML marker file.
        labels_markers = yaml.load(open(marker_path, "rb"), Loader=yaml.Loader)
        cytof_img.set_markers(**labels_markers)

    viz = cytof_img.check_channels(ncols=3, savedir='.')
    msg = f'Your uploaded TIFF has {len(cytof_img.markers)} markers'
    return msg, viz, cytof_img
48
+
49
+
50
def channel_select(cytof_img):
    """Return three multiselect dropdowns — unwanted / nuclei / membrane — all
    populated with the image's current channel list."""
    def _dropdown():
        return gr.Dropdown(choices=cytof_img.channels, multiselect=True)

    return _dropdown(), _dropdown(), _dropdown()
53
+
54
def nuclei_select(cytof_img):
    """Return two multiselect dropdowns — one for the nuclei channels, one for
    the membrane channels — populated with the image's current channel list."""
    def _dropdown():
        return gr.Dropdown(choices=cytof_img.channels, multiselect=True)

    return _dropdown(), _dropdown()
57
+
58
def modify_channels(cytof_img, unwanted_channels, nuc_channels, mem_channels):
    """Drop unwanted channels, then define the 'nuclei' and 'membrane' pseudo-channels.

    Works on a copy so the original image state is left untouched and the user
    can redo the channel definition.

    Returns:
        (status message listing remaining/nuclei/membrane channels, updated image copy)
    """
    updated = cytof_img.copy()
    updated.remove_special_channels(unwanted_channels)

    # 'nuclei' replaces its source channels, which are removed afterwards.
    removed_for_nuclei = updated.define_special_channels({'nuclei': nuc_channels})
    updated.remove_special_channels(removed_for_nuclei)

    # 'membrane' is defined but its source channels are kept.
    updated.define_special_channels({'membrane': mem_channels})

    # CytofImageTiff already carries its pixel data; only the dataframe-backed
    # CytofImage needs the image re-derived after the channel changes.
    if type(updated) is CytofImage:
        updated.get_image()

    msg = ('Your remaining channels are: ' + ', '.join(updated.channels)
           + '.\n\n Nuclei channels: ' + ', '.join(removed_for_nuclei)
           + '.\n\n Membrane channels: ' + ', '.join(mem_channels))
    return msg, updated
83
+
84
def update_dropdown_options(cytof_img, selected_self, selected_other1, selected_other2):
    """Keep the three channel dropdowns mutually exclusive.

    Any channel already chosen in one dropdown disappears from the other two,
    while each dropdown keeps its own current selection.

    Args:
        cytof_img: image whose `channels` is the full option pool.
        selected_self: values chosen in the dropdown that fired the event.
        selected_other1, selected_other2: current values of the other two dropdowns.

    Returns:
        Two gr.Dropdown updates for the other two dropdowns.
    """
    # Set-based filter: O(n) and tolerant of options that are missing from the
    # channel list or selected in more than one dropdown (the previous
    # list.remove() loop raised ValueError in those cases).
    taken = set(selected_self) | set(selected_other1) | set(selected_other2)
    available = [ch for ch in cytof_img.channels if ch not in taken]

    return (gr.Dropdown(choices=available + selected_other1, value=selected_other1, multiselect=True),
            gr.Dropdown(choices=available + selected_other2, value=selected_other2, multiselect=True))
94
+
95
+
96
def cell_seg(cytof_img, radius):
    """Run nuclei + cell segmentation and build an interactive Plotly overlay.

    Args:
        cytof_img: image with channels already defined (Step 2).
        radius: expected cell radius in pixels.

    Returns:
        (Plotly figure of the marked cell segmentation, updated image)
    """
    # Membrane channel is optional — only use it when the user defined one.
    has_membrane = 'membrane' in cytof_img.channels
    cytof_img.get_seg(use_membrane=has_membrane, radius=radius, show_process=False)

    # Render both overlays (the nuclei call also populates the image's
    # visualization state); only the cell overlay is displayed.
    marked_nuclei = cytof_img.visualize_seg(segtype="nuclei", show=False)
    marked_cell = cytof_img.visualize_seg(segtype="cell", show=False)

    # First remaining marker is the one shown in green in the overlay.
    first_marker = cytof_img.channels[0]

    # Equivalent of plt.imshow(), but interactive.
    fig = px.imshow(marked_cell)

    # Invisible scatter traces serve purely as legend entries for the raster.
    for color, label in (('white', 'membrane boundaries'),
                         ('yellow', 'nucleus boundaries'),
                         ('red', 'nucleus'),
                         ('green', first_marker)):
        fig.add_trace(go.Scatter(x=[None], y=[None], mode='markers',
                                 marker=dict(color=color), name=label))
    fig.update_layout(legend=dict(orientation="v", bgcolor='lightgray'))

    return fig, cytof_img
120
+
121
def feature_extraction(cytof_img, cohort_state, percentile_threshold):
    """Extract per-cell features, normalize, export a CSV, and wrap the ROI in a cohort.

    Args:
        cytof_img: segmented CytofImage/CytofImageTiff.
        cohort_state: current cohort gr.State value; replaced by the new cohort.
        percentile_threshold: quantile (e.g. 75) for feature normalization.

    Returns:
        (updated image, new CytofCohort, feature DataFrame for display)
    """
    cytof_img.extract_features(filename=cytof_img.filename)
    cytof_img.feature_quantile_normalization(qs=[percentile_threshold])

    # Export the normalized feature table (exist_ok replaces the isdir check).
    os.makedirs(OUTDIR, exist_ok=True)
    cytof_img.export_feature(f"df_feature_{percentile_threshold}normed",
                             os.path.join(OUTDIR, f"feature_{percentile_threshold}normed.csv"))
    df_feature = getattr(cytof_img, f"df_feature_{percentile_threshold}normed")

    # Every Gradio upload shares the same generic filename and the temp path is
    # long and uninformative, so hide that column in the displayed table.
    df_feature = df_feature.loc[:, df_feature.columns != 'filename']

    # Per marker/cell quantiles used by the later positivity visualizations.
    cytof_img.calculate_quantiles(qs=[75])

    # Wrap the single ROI in a cohort so the downstream batch tooling applies.
    cytof_cohort = CytofCohort(
        cytof_images={f"{cytof_img.slide}_{cytof_img.roi}": cytof_img},
        dir_out=OUTDIR)
    cytof_cohort.batch_process_feature()
    cytof_cohort.generate_summary()

    return cytof_img, cytof_cohort, df_feature
151
+
152
def co_expression(cytof_img, percentile_threshold):
    """Clustermap of log10 odds ratios between observed and expected marker co-positivity.

    Returns:
        (matplotlib Figure of the clustermap, unchanged image)
    """
    feat_name = f"{percentile_threshold}normed"
    df_observed, df_expected = cytof_img.roi_co_expression(
        feature_name=feat_name, accumul_type='sum', return_components=False)

    eps = 1e-6  # guards against divide-by-zero and log(0)
    log_odds = np.log10(df_observed.values / (df_expected.values + eps) + eps)

    # A value of log10(eps) means zero observed co-expression: the pair is
    # undetermined, not strongly negative, so neutralize it.
    log_odds[log_odds == np.log10(eps)] = 0

    # Strip the feature suffix so the axes show plain marker names.
    labels = [m.replace('_cell_sum', '') for m in df_expected.columns]

    grid = sns.clustermap(log_odds,
                          center=np.log10(1 + eps), cmap='RdBu_r', vmin=-1, vmax=3,
                          xticklabels=labels, yticklabels=labels)

    # Hand the underlying matplotlib Figure to Gradio.
    return grid.ax_heatmap.get_figure(), cytof_img
177
+
178
def spatial_interaction(cytof_img, percentile_threshold, method, cluster_threshold):
    """Clustermap of log10 odds ratios between observed and expected neighbor interactions.

    Args:
        method: 'k-neighbor' or 'distance' neighborhood definition.
        cluster_threshold: neighbor count or distance cutoff for that method.

    Returns:
        (matplotlib Figure of the clustermap, unchanged image)
    """
    feat_name = f"{percentile_threshold}normed"
    df_expected, df_observed = cytof_img.roi_interaction_graphs(
        feature_name=feat_name, accumul_type='sum', method=method, threshold=cluster_threshold)

    eps = 1e-6  # guards against divide-by-zero and log(0)
    log_odds = np.log10(df_observed.values / (df_expected.values + eps) + eps)

    # A value of log10(eps) means no observed interaction: the pair is
    # undetermined, not strongly negative, so neutralize it.
    log_odds[log_odds == np.log10(eps)] = 0

    # Strip the feature suffix so the axes show plain marker names.
    labels = [m.replace('_cell_sum', '') for m in df_expected.columns]

    grid = sns.clustermap(log_odds,
                          center=np.log10(1 + eps), cmap='bwr', vmin=-2, vmax=2,
                          xticklabels=labels, yticklabels=labels)

    # Hand the underlying matplotlib Figure to Gradio.
    return grid.ax_heatmap.get_figure(), cytof_img
204
+
205
def get_marker_pos_options(cytof_img):
    """Marker choices for the two positivity dropdowns: every channel except
    the 'nuclei'/'membrane' pseudo-channels."""
    options = cytof_img.channels.copy()

    # 'nuclei' is guaranteed to exist after the channel-definition step.
    options.remove('nuclei')

    # 'membrane' is optional — drop it only when present.
    if 'membrane' in options:
        options.remove('membrane')

    return (gr.Dropdown(choices=options, interactive=True),
            gr.Dropdown(choices=options, interactive=True))
218
+
219
def viz_pos_marker_pair(cytof_img, marker1, marker2, percentile_threshold):
    """Side-by-side Plotly view of positive cells for two markers with synced axes."""

    def _cell_stain(marker):
        # Blue = negative, green = positive, black cell boundaries.
        _, stain_cell, _ = cytof_img.visualize_marker_positive(
            marker=marker,
            feature_type="normed",
            accumul_type="sum",
            normq=percentile_threshold,
            show_boundary=True,
            color_list=[(0, 0, 1), (0, 1, 0)],
            color_bound=(0, 0, 0),
            show_colortable=False)
        return stain_cell

    fig = make_subplots(rows=1, cols=2, shared_xaxes=True, shared_yaxes=True,
                        subplot_titles=(f"positive {marker1} cells", f"positive {marker2} cells"))
    fig.add_trace(px.imshow(_cell_stain(marker1)).data[0], row=1, col=1)
    fig.add_trace(px.imshow(_cell_stain(marker2)).data[0], row=1, col=2)

    # Lock both panels to the same pan/zoom.
    fig.update_xaxes(matches='x')
    fig.update_yaxes(matches='y')
    fig.update_layout(title_text=" ")

    return fig
252
+
253
def phenograph(cytof_cohort):
    """Run PhenoGraph clustering on the cohort and return its UMAP scatter figure.

    Args:
        cytof_cohort: cohort built in the feature-extraction step.

    Returns:
        (cohort-level UMAP figure, updated cohort)
    """
    key_pheno = cytof_cohort.clustering_phenograph()

    # Only the scatter (UMAP) figures are displayed; the remaining
    # visualization outputs were previously bound but never used.
    _, _, _, _, figs_scatter, _ = cytof_cohort.vis_phenograph(
        key_pheno=key_pheno,
        level="cohort",
        save_vis=False,
        show_plots=False,
        plot_together=False)

    return figs_scatter['cohort'], cytof_cohort
267
+
268
def cluster_interaction_fn(cytof_img, cytof_cohort):
    """Visualize spatial interaction between PhenoGraph clusters for this slide.

    Returns:
        (matplotlib Figure of the cluster-interaction clustermap, image, cohort)
    """
    # Reuse the phenograph computed in Step 7 instead of clustering again;
    # the cohort is guaranteed to hold exactly one result at this point.
    key_pheno = list(cytof_cohort.phenograph.keys())[0]

    epsilon = 1e-6
    interacts, _ = cytof_cohort.cluster_interaction_analysis(key_pheno)
    interact = interacts[cytof_img.slide]
    clustergrid = sns.clustermap(interact, center=np.log10(1 + epsilon),
                                 cmap='RdBu_r', vmin=-1, vmax=1,
                                 xticklabels=np.arange(interact.shape[0]),
                                 yticklabels=np.arange(interact.shape[0]))

    # BUG FIX: the figure was previously pulled from the clustermap returned by
    # cluster_interaction_analysis(), so the slide-specific clustermap built
    # just above was computed but never displayed.
    fig = clustergrid.ax_heatmap.get_figure()

    return fig, cytof_img, cytof_cohort
284
+
285
def get_cluster_pos_options(cytof_img):
    """Marker choices for the cluster-comparison dropdown: every channel except
    the 'nuclei'/'membrane' pseudo-channels."""
    options = cytof_img.channels.copy()

    # 'nuclei' is guaranteed to exist after the channel-definition step.
    options.remove('nuclei')

    # 'membrane' is optional — drop it only when present.
    if 'membrane' in options:
        options.remove('membrane')

    return gr.Dropdown(choices=options, interactive=True)
298
+
299
def viz_cluster_positive(marker, percentile_threshold, cytof_img, cytof_cohort):
    """Compare marker-positive cells against PhenoGraph cluster assignments side by side."""
    # Reuse the single phenograph computed in Step 7 instead of re-clustering.
    key_pheno = list(cytof_cohort.phenograph.keys())[0]

    # Marker-positivity staining: blue = negative, green = positive, black boundaries.
    _, marker_stain_cell, _ = cytof_img.visualize_marker_positive(
        marker=marker,
        feature_type="normed",
        accumul_type="sum",
        normq=percentile_threshold,
        show_boundary=True,
        color_list=[(0, 0, 1), (0, 1, 0)],
        color_bound=(0, 0, 0),
        show_colortable=False)

    # Attach cohort-level PhenoGraph labels back onto this ROI, then stain by cluster.
    cytof_cohort.attach_individual_roi_pheno(key_pheno, override=True)
    _, pheno_stain_cell, _ = cytof_img.visualize_pheno(key_pheno=key_pheno)

    fig = make_subplots(rows=1, cols=2, shared_xaxes=True, shared_yaxes=True,
                        subplot_titles=(f"positive {marker} cells", "PhenoGraph clusters on cells"))
    fig.add_trace(px.imshow(marker_stain_cell).data[0], row=1, col=1)
    fig.add_trace(px.imshow(pheno_stain_cell).data[0], row=1, col=2)

    # Lock both panels to the same pan/zoom.
    fig.update_xaxes(matches='x')
    fig.update_yaxes(matches='y')
    fig.update_layout(title_text=" ")

    return fig, cytof_img, cytof_cohort
332
+
333
# Gradio App template
# Inline CSS injected once via gr.HTML(custom_css); the classes below are
# referenced through the elem_classes= arguments across the UI.
# NOTE(review): .h-2 and .h-3 currently share the same font size — confirm intended.
custom_css = """
<style>
.h-1 {
    font-size: 40px !important;
}
.h-2 {
    font-size: 20px !important;
}
.h-3 {
    font-size: 20px !important;
}
.mb-10 {
    margin-bottom: 10px !important;
}
.no-label label {
    display: none !important;
}
.cell-no-label span {
    display: none !important;
}
.no-border {
    border-width: 0 !important;
}
hr {
    padding-bottom: 10px !important;
}
.input-choices {
    padding: 10px 0 !important;
}
.input-choices > span {
    display: none;
}
.form:has(.input-choices) {
    border-width: 0 !important;
    box-shadow: none !important;
}
</style>
"""
372
+
373
with gr.Blocks() as demo:
    # Inject the custom CSS defined above.
    gr.HTML(custom_css)

    # Working image state, plus a pristine copy of the upload so users can
    # define/remove channels multiple times without compounding edits.
    cytof_state = gr.State(CytofImage())
    cytof_original_state = gr.State(CytofImage())

    # --- Step 1: upload instructions ---
    gr.Markdown('<div class="h-1">Step 1. Upload images</div>')
    gr.Markdown('<div class="h-2">You may upload one or two files depending on your use case.</div>')
    gr.Markdown('<div class="h-2">Case 1: Upload a single file.'
                '<ul><li>upload a TXT or CSV file that contains information about antibodies, rare heavy metal isotopes, and image channel names.</li>'
                '<li>files are following the CyTOF, IMC, or multiplex data convention.</li>'
                '</ul></div>')
    gr.Markdown('<div class="h-2">Case 2: Upload multiple files.'
                '<ul><li>upload a TIFF file containing Regions of Interest (ROIs) stored as multiplexed images.</li>'
                '<li>upload a Marker File listing the channels to identify the antibodies.</li>'
                '</ul></div>')

    gr.Markdown('<hr>')
    gr.Markdown('<div class="h-2">Select Input Case:</div>')

    # Radio that switches the upload widgets between the two cases (see
    # toggle_file_input and the choices.change wiring below).
    choices = gr.Radio(["Case 1", "Case 2"], value="Case 1", label="Choose Input Case", elem_classes='input-choices')
395
+
396
def toggle_file_input(choice):
    """Switch the upload widgets between Case 1 (TXT/CSV only) and Case 2 (TIFF + marker file)."""
    if choice == "Case 1":
        file_update = gr.update(visible=True, file_types=['.txt', '.csv'], label="TXT or CSV File")
        marker_update = gr.update(visible=False)
    else:
        file_update = gr.update(visible=True, file_types=[".tiff", '.tif'], label="TIFF File")
        marker_update = gr.update(visible=True)
    return file_update, marker_update
407
+
408
    with gr.Row(equal_height=True):  # row 2: file upload (left) + per-channel preview (right)
        with gr.Column(scale=2):
            gr.Markdown('<div class="h-2">File Input:</div>')
            img_path = gr.File(file_types=['.txt', '.csv'], label='TXT or CSV File')
            # Marker file is only shown in Case 2 (see toggle_file_input).
            marker_path = gr.File(file_types=['.txt'], label='Marker File', visible=False)
            with gr.Row():
                clear_btn = gr.Button("Clear")
                submit_btn = gr.Button("Upload")
        with gr.Column(scale=3):
            gr.Markdown('<div class="h-2">Marker Information:</div>')
            img_info = gr.Textbox(label='Ensure the number of markers displayed below matches the expected number.')
            gr.Markdown('<div class="h-3">Visualization of individual channels:</div>')
            with gr.Accordion("", open=True):
                img_viz = gr.Plot(elem_classes='no-label no-border')

    choices.change(fn=toggle_file_input, inputs=choices, outputs=[img_path, marker_path])

    gr.Markdown('<br>')
    gr.Markdown('<div class="h-1">Step 2. Modify existing channels</div>')
    gr.Markdown('<div class="h-2">(Required) Define channels designed to visualize nuclei. </div>')
    gr.Markdown('<div class="h-2">(Optional) Remove unwanted channel after visualizing the individual channels. </div>')
    gr.Markdown('<div class="h-2">(Optional) Define channels degisned to visualize membranes.</div>')
    gr.Markdown('<hr>')

    with gr.Row(equal_height=True):  # row 3: channel-definition dropdowns
        with gr.Column(scale=2):
            selected_nuclei = gr.Dropdown(label='(Required) Select the nuclei channel', interactive=True)
            selected_unwanted_channel = gr.Dropdown(label='(Optional) Select the unwanted channel', interactive=True)
            selected_membrane = gr.Dropdown(label='(Optional) Select the membrane channel', interactive=True)
            define_btn = gr.Button('Modify channels')
        with gr.Column(scale=3):
            channel_feedback = gr.Textbox(label='Channels info update')

    # Upload the file and gather channel info, then populate the
    # unwanted/nuclei/membrane dropdowns from the loaded image.
    submit_btn.click(
        fn=cytof_tiff_eval, inputs=[img_path, marker_path, cytof_original_state], outputs=[img_info, img_viz, cytof_original_state],
        api_name='upload'
    ).success(
        fn=channel_select, inputs=cytof_original_state, outputs=[selected_unwanted_channel, selected_nuclei, selected_membrane]
    )

    # Keep the three dropdowns mutually exclusive; api_name identifies each
    # handler in the API endpoint listing.
    selected_unwanted_channel.change(fn=update_dropdown_options, inputs=[cytof_original_state, selected_unwanted_channel, selected_nuclei, selected_membrane], outputs=[selected_nuclei, selected_membrane], api_name='dropdown_monitor1')
    selected_nuclei.change(fn=update_dropdown_options, inputs=[cytof_original_state, selected_nuclei, selected_membrane, selected_unwanted_channel], outputs=[selected_membrane, selected_unwanted_channel], api_name='dropdown_monitor2')
    selected_membrane.change(fn=update_dropdown_options, inputs=[cytof_original_state, selected_membrane, selected_nuclei, selected_unwanted_channel], outputs=[selected_nuclei, selected_unwanted_channel], api_name='dropdown_monitor3')

    # Apply the channel changes; result lands in cytof_state, leaving the
    # original upload untouched in cytof_original_state.
    define_btn.click(fn=modify_channels, inputs=[cytof_original_state, selected_unwanted_channel, selected_nuclei, selected_membrane], outputs=[channel_feedback, cytof_state])

    gr.Markdown('<br>')
    gr.Markdown('<div class="h-1">Step 3. Perform cell segmentation based on the defined nuclei and membrane channels</div>')
    gr.Markdown('<hr>')

    with gr.Row():  # row: cell-radius input + segmentation preview
        with gr.Column(scale=2):
            gr.Markdown('<div class="h-2">Cell Size:</div>')
            cell_radius = gr.Number(value=5, precision=0, label='Cell size', info='Please enter the desired radius for cell segmentation (in pixels; default value: 5)', elem_classes='cell-no-label')
            seg_btn = gr.Button("Segment")
        with gr.Column(scale=3):
            gr.Markdown('<div class="h-2">Visualization of the segmentation: </div>')
            seg_viz = gr.Plot(label="Hover over graph to zoom, pan, save, etc.")
    seg_btn.click(fn=cell_seg, inputs=[cytof_state, cell_radius], outputs=[seg_viz, cytof_state])

    gr.Markdown('<br>')
    gr.Markdown('<div class="h-1">Step 4. Extract cell features</div>')
    gr.Markdown('<div class="h-2">Note: This step will take significantly longer than the previous ones. A 300MB IMC file takes about 7 minutes to compute.</div>')
    gr.Markdown('<hr>')

    # Cohort wrapper created during feature extraction; needed for the
    # downstream PhenoGraph / interaction analyses.
    cohort_state = gr.State(CytofCohort())
    with gr.Row():  # feature-extraction controls
        with gr.Column(scale=2):
            norm_percentile = gr.Slider(minimum=50, maximum=99, step=1, value=75, interactive=True, label='Normalized quantification percentile')
            extract_btn = gr.Button('Extract')
        with gr.Column(scale=3):
            feat_df = gr.DataFrame(headers=['id','coordinate_x','coordinate_y','area_nuclei'],col_count=(4, "fixed"))

    extract_btn.click(fn=feature_extraction, inputs=[cytof_state, cohort_state, norm_percentile],
                      outputs=[cytof_state, cohort_state, feat_df])

    gr.Markdown('<br>')
    gr.Markdown('<div class="h-1">Step 5. Downstream analysis</div>')
    gr.Markdown('<hr>')

    gr.Markdown('<div class="h-2">(1) Co-expression Analysis</div>')
    with gr.Row():  # co-expression analysis controls + clustermap
        with gr.Column(scale=2):
            gr.Markdown('<div class="h-2">This analysis measures the level of co-expression for each pair of biomarkers by calculating the odds ratio between the observed co-occurrence and the expected expressing even</div>')
            co_exp_btn = gr.Button('Run co-expression analysis')
        with gr.Column(scale=3):
            gr.Markdown('<div class="h-2">Visualization of cell coexpression of markers</div>')
            co_exp_viz = gr.Plot(elem_classes='no-label')

    gr.Markdown('<div class="h-2">(2) Spatial Interactoin Analysis</div>')
502
+
503
def update_info_text(choice):
    """Return the explanatory text for the selected neighborhood-clustering method.

    The comparison is case-insensitive: the initial call below passes
    'K-neighbor' while the radio values are lower-case, which previously made
    the default description show the 'Distance' text for the 'k-neighbor'
    selection.
    """
    if choice.lower() == "k-neighbor":
        return 'K-neighbor: classifies the threshold number of surrounding cells as neighborhood pairs.'
    return 'Distance: classifies cells within threshold distance as neighborhood pairs.'
508
+
509
    with gr.Row():  # spatial-interaction controls + clustermap
        with gr.Column(scale=2):
            gr.Markdown('<div class="h-2">This analysis measures the degree of co-expression within a pair of neighborhoods.</div>')
            gr.Markdown('<div class="h-2">Select the clustering method:</div>')
            # NOTE(review): this passes 'K-neighbor' (capital K) while the radio
            # values are lower-case — with a case-sensitive update_info_text the
            # initial description does not match the default selection; confirm.
            info_text = gr.Markdown(update_info_text('K-neighbor'))
            cluster_method = gr.Radio(['k-neighbor', 'distance'], value='k-neighbor', elem_classes='test', label='')
            cluster_threshold = gr.Slider(minimum=1, maximum=100, step=1, value=30, interactive=True, label='Clustering threshold')
            spatial_btn = gr.Button('Run spatial interaction analysis')
        with gr.Column(scale=3):
            gr.Markdown('<div class="h-2">Visualization of spatial interaction of markers</div>')
            spatial_viz = gr.Plot(elem_classes='no-label')

    cluster_method.change(fn=update_info_text, inputs=cluster_method, outputs=info_text)
    co_exp_btn.click(fn=co_expression, inputs=[cytof_state, norm_percentile], outputs=[co_exp_viz, cytof_state])
    # spatial_btn wiring lives in Step 6: it also populates the marker-positive
    # dropdown options on success.

    gr.Markdown('<br>')
    gr.Markdown('<div class="h-1">Step 6. Visualize positive markers</div>')
    gr.Markdown('<div class="h-2">Select two markers for side-by-side comparison to visualize their positive states in cells. This serves two purposes. </div>')
    gr.Markdown('<div class="h-2">(1) Validate the co-expression analysis results. High expression level should mean a similar number of positive markers within the two slides, whereas low expression level mean a large difference of in the number of positive markers. </div>')
    gr.Markdown('<div class="h-2">(2) Validate teh spatial interaction analysis results. High interaction means the two positive markers are in close proximity of each other (proximity is previously defined in `clustering threshold`), and vice versa.</div>')
    gr.Markdown('<hr>')

    with gr.Row():  # marker-positive visualization — dropdown options
        with gr.Column(scale=2):
            selected_marker1 = gr.Dropdown(label='Select one marker', info='Select a marker to visualize', interactive=True)
            selected_marker2 = gr.Dropdown(label='Select another marker', info='Selecting the same marker as the previous one is allowed', interactive=True)
            pos_viz_btn = gr.Button('Visualize these two markers')
        with gr.Column(scale=3):
            gr.Markdown('<div class="h-2">Visualization of the two markers.</div>')
            marker_pos_viz = gr.Plot(label="Hover over graph to zoom, pan, save, etc.")

    # Run the spatial analysis, then (on success) populate the Step 6 dropdowns.
    spatial_btn.click(
        fn=spatial_interaction, inputs=[cytof_state, norm_percentile, cluster_method, cluster_threshold], outputs=[spatial_viz, cytof_state]
    ).success(
        fn=get_marker_pos_options, inputs=[cytof_state], outputs=[selected_marker1, selected_marker2]
    )
    pos_viz_btn.click(fn=viz_pos_marker_pair, inputs=[cytof_state, selected_marker1, selected_marker2, norm_percentile], outputs=[marker_pos_viz])

    gr.Markdown('<br>')
    gr.Markdown('<div class="h-1">Step 7. Phenogrpah Clustering</div>')
    gr.Markdown('<div class="h-2">Cells can be clustered into sub-groups based on the extracted single-cell data.</div>')
    gr.Markdown('<div class="h-2">Time reference: a 300MB IMC file takes about 2 minutes to compute.</div>')
    gr.Markdown('<hr>')

    with gr.Row():  # UMAP projection of PhenoGraph clusters
        with gr.Column(scale=2):
            gr.Markdown('<div class="h-2">We used UMAP to project the high-dimensional data onto a 2-D space.</div>')
            umap_btn = gr.Button('Run Phenograph clustering')
        with gr.Column(scale=3):
            phenograph_umap = gr.Plot(label="UMAP results")

    with gr.Row():  # cluster-to-cluster spatial interaction
        with gr.Column(scale=2):
            gr.Markdown('<div class="h-2">The previously assigned clusters are also reflected in this figure.</div>')
            cluster_interact_btn = gr.Button('Run clustering interaction')
        with gr.Column(scale=3):
            cluster_interaction = gr.Plot(label="Spatial interaction of clusters")
    cluster_interact_btn.click(cluster_interaction_fn, inputs=[cytof_state, cohort_state], outputs=[cluster_interaction, cytof_state, cohort_state])

    gr.Markdown('<br>')
    gr.Markdown('<div class="h-2">In additional, you could visualizing the cluster assignments against the positive markers to oberve any patterns:</div>')
    gr.Markdown('<hr>')
    with gr.Row():  # cluster assignment vs. marker positivity
        with gr.Column(scale=2):
            selected_cluster_marker = gr.Dropdown(label='Select one marker', info='Select a marker to visualize', interactive=True)
            cluster_positive_btn = gr.Button('Compare clusters and positive markers')
        with gr.Column(scale=3):
            cluster_v_positive = gr.Plot(label="Cluster assignment vs. positive cells. Hover over graph to zoom, pan, save, etc.")

    # Run PhenoGraph, then (on success) populate the cluster-comparison dropdown.
    umap_btn.click(
        fn=phenograph, inputs=[cohort_state], outputs=[phenograph_umap, cohort_state]
    ).success(
        fn=get_cluster_pos_options, inputs=[cytof_state], outputs=[selected_cluster_marker], api_name='selectClusterMarker'
    )
    cluster_positive_btn.click(fn=viz_cluster_positive, inputs=[selected_cluster_marker, norm_percentile, cytof_state, cohort_state], outputs=[cluster_v_positive, cytof_state, cohort_state])

    # Reset every input/output component when Clear is clicked.
    clear_components = [img_path, marker_path, img_info, img_viz, channel_feedback, seg_viz, feat_df, co_exp_viz, spatial_viz, marker_pos_viz, phenograph_umap, cluster_interaction, cluster_v_positive]
    clear_btn.click(lambda: [None]*len(clear_components), outputs=clear_components)


if __name__ == "__main__":
    demo.launch()
595
+
cytof/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # from .hyperion_analysis import *
2
+ from .hyperion_preprocess import *
3
+ from .utils import *
4
+ from .segmentation_functions import *
cytof/batch_preprocess.py ADDED
@@ -0,0 +1,279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+ import os
4
+ import glob
5
+ import matplotlib.pyplot as plt
6
+ import pickle as pkl
7
+ import numpy as np
8
+ import argparse
9
+ import yaml
10
+ import pandas as pd
11
+ import skimage
12
+
13
+ import sys
14
+ import platform
15
+ from pathlib import Path
16
+ FILE = Path(__file__).resolve()
17
+ ROOT = FILE.parents[0] # cytof root directory
18
+ if str(ROOT) not in sys.path:
19
+ sys.path.append(str(ROOT)) # add ROOT to PATH
20
+ if platform.system() != 'Windows':
21
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
22
+ from classes import CytofImage, CytofImageTiff
23
+
24
+
25
+ # import sys
26
+ # sys.path.append('../cytof')
27
+ from hyperion_preprocess import cytof_read_data_roi
28
+ from hyperion_analysis import batch_scale_feature
29
+ from utils import save_multi_channel_img
30
+
31
def makelist(string):
    """Split a comma-separated string into a list of its items.

    Args:
        string: comma-delimited text, e.g. "a,b,c".

    Returns:
        List of the substrings between commas (as strings; no conversion).
        An empty input yields [''] — same as str.split(',').
    """
    # The original list comprehension over string.split(',') was an identity
    # transform; str.split already returns the list.
    return string.split(',')
35
+
36
+
37
def parse_opt():
    """Build the option parser for the CyTOF batch-processing pipeline.

    Help is disabled (`add_help=False`) because this parser is meant to be
    used as a parent of the top-level CLI parser.

    Returns:
        argparse.ArgumentParser with the batch-processing options registered.
    """
    parser = argparse.ArgumentParser('Cytof batch process', add_help=False)

    # string-valued options
    string_options = (
        ('--cohort_file', 'a txt file with information of all file paths in the cohort'),
        ('--params_ROI', 'a txt file with parameters used to process single ROI previously'),
        ('--outdir', 'directory to save outputs'),
    )
    for flag, description in string_options:
        parser.add_argument(flag, type=str, help=description)

    # boolean switches (default False, set by presence of the flag)
    flag_options = (
        ('--save_channel_images', 'an indicator of whether save channel images'),
        ('--save_seg_vis', 'an indicator of whether save sample visualization of segmentation'),
        ('--show_seg_process', 'an indicator of whether show segmentation process'),
    )
    for flag, description in flag_options:
        parser.add_argument(flag, action='store_true', help=description)

    parser.add_argument('--quality_control_thres', type=int, default=50,
                        help='the smallest image size for an image to be kept')
    return parser
53
+
54
+
55
def main(args):
    """Batch-process every ROI listed in a cohort CSV.

    For each ROI file: read the raw data, run quality control, (optionally)
    save channel images, define nuclei/membrane channels, segment nuclei and
    cells, extract per-cell features, quantile-normalize them, and pickle the
    resulting CytofImage. Aggregated outputs (scaling parameters, an
    input/output manifest, and a list of skipped ROIs) are written to
    `args.outdir/<cohort_name>/`.

    Args:
        args: parsed argparse namespace produced by parse_opt() — expects
            cohort_file, params_ROI, outdir, save_channel_images,
            save_seg_vis, show_seg_process (quality_control_thres on the
            namespace is unused; the value from the params file wins).
    """
    # Parameters previously used to process a single ROI (YAML file).
    params_ROI = yaml.load(open(args.params_ROI, "rb"), Loader=yaml.Loader)
    channel_dict = params_ROI["channel_dict"]
    channels_remove = params_ROI["channels_remove"]
    quality_control_thres = params_ROI["quality_control_thres"]

    # Name of the batch and its output directory.
    cohort_name = os.path.basename(args.cohort_file).split('.csv')[0]
    print(cohort_name)

    outdir = os.path.join(args.outdir, cohort_name)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # One feature directory for the raw features plus one per quantile q.
    feat_dirs = {}
    feat_dirs['orig'] = os.path.join(outdir, "feature")
    if not os.path.exists(feat_dirs['orig']):
        os.makedirs(feat_dirs['orig'])

    for q in params_ROI["normalize_qs"]:
        dir_qnorm = os.path.join(outdir, f"feature_{q}normed")
        feat_dirs[f"{q}normed"] = dir_qnorm
        if not os.path.exists(dir_qnorm):
            os.makedirs(dir_qnorm)

    # Pickled CytofImage instances go here.
    dir_img_cytof = os.path.join(outdir, "cytof_images")
    if not os.path.exists(dir_img_cytof):
        os.makedirs(dir_img_cytof)

    if args.save_seg_vis:
        dir_seg_vis = os.path.join(outdir, "segmentation_visualization")
        if not os.path.exists(dir_seg_vis):
            os.makedirs(dir_seg_vis)

    # The cohort CSV must contain columns: Slide, path, ROI.
    cohort_files_ = pd.read_csv(args.cohort_file)
    print("Start processing {} files".format(cohort_files_.shape[0]))

    cytof_imgs = {}          # ROI name -> path of the pickled result
    seen = 0                 # number of ROIs successfully processed so far
    dfs_scale_params = {}    # key: quantile q; value: accumulated features to be scaled
    df_io = pd.DataFrame(columns=["Slide", "ROI", "path", "output_file"])
    df_bad_rois = pd.DataFrame(columns=["Slide", "ROI", "path", "size (W*H)"])

    for i in range(cohort_files_.shape[0]):
        slide, pth_i, f_roi_ = cohort_files_.loc[i, "Slide"], cohort_files_.loc[i, "path"], cohort_files_.loc[i, "ROI"]
        f_roi = os.path.join(pth_i, f_roi_)
        print("\nNow analyzing {}".format(f_roi))
        roi = f_roi_.split('.txt')[0]
        print("{}-{}".format(slide, roi))

        ## 1) Read and preprocess data (file name -> dataframe).
        cytof_img = cytof_read_data_roi(f_roi, slide, roi)

        # Quality control: skip ROIs smaller than the threshold and record them.
        cytof_img.quality_control(thres=quality_control_thres)
        if not cytof_img.keep:
            H = max(cytof_img.df['Y'].values) + 1
            W = max(cytof_img.df['X'].values) + 1
            df_bad_rois = pd.concat([df_bad_rois,
                                     pd.DataFrame.from_dict([{"Slide": slide,
                                                              "ROI": roi,
                                                              "path": pth_i,
                                                              "size (W*H)": (W, H)}])])
            continue

        if args.save_channel_images:
            dir_roi_channel_img = os.path.join(outdir, "channel_images", f_roi_)
            if not os.path.exists(dir_roi_channel_img):
                os.makedirs(dir_roi_channel_img)

        # Markers used when capturing the image.
        cytof_img.get_markers()

        # Preprocess: fill missing pixels with 0 so the table reshapes to H*W.
        cytof_img.preprocess()

        # First successful ROI: open the cohort readme and record the raw
        # marker/channel lists (closed further below once final lists are known).
        # NOTE(review): if processing fails mid-way on the first ROI, this
        # handle is never closed — consider a `with` block.
        if seen == 0:
            f_info = open(os.path.join(outdir, 'readme.txt'), 'w')
            f_info.write("Original markers: ")
            f_info.write('\n{}'.format(", ".join(cytof_img.markers)))
            f_info.write("\nOriginal channels: ")
            f_info.write('\n{}'.format(", ".join(cytof_img.channels)))

        ## (optional): save channel images
        if args.save_channel_images:
            cytof_img.get_image()
            cytof_img.save_channel_images(dir_roi_channel_img)

        ## Remove special channels if defined.
        if len(channels_remove) > 0:
            cytof_img.remove_special_channels(channels_remove)
            cytof_img.get_image()

        ## 2) Define nuclei & membrane channels; keep image/channel lists in sync.
        cytof_img.define_special_channels(channel_dict)
        assert len(cytof_img.channels) == cytof_img.image.shape[-1]

        ## (optional): save the newly defined channel images
        if args.save_channel_images:
            cytof_img.get_image()
            vis_channels = [k for (k, itm) in params_ROI["channel_dict"].items() if len(itm) > 0]
            cytof_img.save_channel_images(dir_roi_channel_img, channels=vis_channels)

        ## 3) Nuclei and cell segmentation.
        nuclei_seg, cell_seg = cytof_img.get_seg(use_membrane=params_ROI["use_membrane"],
                                                 radius=params_ROI["cell_radius"],
                                                 show_process=args.show_seg_process)
        if args.save_seg_vis:
            # Save only a 100x100 sample crop of each segmentation overlay.
            marked_image_nuclei = cytof_img.visualize_seg(segtype="nuclei", show=False)
            save_multi_channel_img(skimage.img_as_ubyte(marked_image_nuclei[0:100, 0:100, :]),
                                   os.path.join(dir_seg_vis, "{}_{}_nuclei_seg.png".format(slide, roi)))

            marked_image_cell = cytof_img.visualize_seg(segtype="cell", show=False)
            save_multi_channel_img(skimage.img_as_ubyte(marked_image_cell[0:100, 0:100, :]),
                                   os.path.join(dir_seg_vis, "{}_{}_cell_seg.png".format(slide, roi)))

        ## 4) Feature extraction.
        cytof_img.extract_features(f_roi)

        # Save the original extracted features.
        cytof_img.df_feature.to_csv(os.path.join(feat_dirs['orig'], "{}_{}_feature_summary.csv".format(slide, roi)),
                                    index=False)

        ### 4.1) Log transform and quantile normalization.
        cytof_img.feature_quantile_normalization(qs=params_ROI["normalize_qs"], savedir=feat_dirs['orig'])

        # On the first successful ROI, determine which feature columns get
        # scaled, then finalize and close the readme.
        if seen == 0:
            s_features = [col for key, features in cytof_img.features.items() \
                          for f in features \
                          for col in cytof_img.df_feature.columns if col.startswith(f)]

            f_info.write("\nChannels removed: ")
            f_info.write("\n{}".format(", ".join(channels_remove)))
            f_info.write("\nFinal markers: ")
            f_info.write("\n{}".format(', '.join(cytof_img.markers)))
            f_info.write("\nFinal channels: ")
            f_info.write("\n{}".format(', '.join(cytof_img.channels)))
            f_info.close()
        ## Loop over quantiles: persist per-ROI normalized features and
        ## accumulate them for cohort-level scaling parameters.
        for q, quantile in cytof_img.dict_quantiles.items():
            n_attr = f"df_feature_{q}normed"
            df_normed = getattr(cytof_img, n_attr)
            df_normed.to_csv(os.path.join(feat_dirs[f"{q}normed"],
                                          "{}_{}_feature_summary.csv".format(slide, roi)),
                             index=False)
            if seen == 0:
                dfs_scale_params[q] = df_normed[s_features]
                dict_quantiles = cytof_img.dict_quantiles
            else:
                dfs_scale_params[q] = pd.concat([dfs_scale_params[q], df_normed[s_features]])

        seen += 1

        # Persist the class instance and record the mapping in the manifest.
        out_file = os.path.join(dir_img_cytof, "{}_{}.pkl".format(slide, roi))
        cytof_img.save_cytof(out_file)
        cytof_imgs[roi] = out_file
        df_io = pd.concat([df_io,
                           pd.DataFrame.from_dict([{"Slide": slide,
                                                    "ROI": roi,
                                                    "path": pth_i,
                                                    "output_file": os.path.abspath(out_file)  # use absolute path
                                                    }])
                           ])

    # Cohort-level scaling parameters: one CSV per quantile with a "mean" row
    # and a "std" row over all accumulated ROIs.
    # NOTE(review): if every ROI failed quality control, `dict_quantiles` is
    # never assigned and this raises NameError — TODO confirm intended.
    for q in dict_quantiles.keys():
        df_scale_params = dfs_scale_params[q].mean().to_frame(name="mean").transpose()
        df_scale_params = pd.concat([df_scale_params, dfs_scale_params[q].std().to_frame(name="std").transpose()])
        df_scale_params.to_csv(os.path.join(outdir, f"{q}normed_scale_params.csv"), index=False)

    df_io.to_csv(os.path.join(outdir, "input_output.csv"), index=False)
    if len(df_bad_rois) > 0:
        df_bad_rois.to_csv(os.path.join(outdir, "skipped_rois.csv"), index=False)

    # Scale features across the cohort using the parameters computed above.
    batch_scale_feature(outdir, normqs=params_ROI["normalize_qs"], df_io=df_io)
274
+
275
+
276
# Script entry point: wrap parse_opt()'s options in a top-level parser (which
# re-enables -h/--help), parse the command line, and run the batch pipeline.
if __name__ == "__main__":
    parser = argparse.ArgumentParser('Cytof batch process', parents=[parse_opt()])
    args = parser.parse_args()
    main(args)
cytof/classes.py ADDED
@@ -0,0 +1,1894 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import itertools
2
+ import re
3
+ import warnings
4
+ import os
5
+ import sys
6
+ import copy
7
+ import pickle as pkl
8
+ import numpy as np
9
+ import pandas as pd
10
+ import skimage
11
+ from skimage.segmentation import mark_boundaries
12
+ import matplotlib.pyplot as plt
13
+ from matplotlib.pyplot import cm
14
+
15
+ import matplotlib.pyplot
16
+ matplotlib.pyplot.switch_backend('Agg')
17
+
18
+ import seaborn as sns
19
+ import phenograph
20
+
21
+ # suppress numba deprecation warning
22
+ # ref: https://github.com/Arize-ai/phoenix/pull/799
23
+ with warnings.catch_warnings():
24
+ from numba.core.errors import NumbaWarning
25
+
26
+ warnings.simplefilter("ignore", category=NumbaWarning)
27
+ import umap
28
+ from umap import UMAP
29
+
30
+
31
+ from typing import Union, Optional, Type, Tuple, List, Dict
32
+ from collections.abc import Callable
33
+ from scipy import sparse as sp
34
+ from sklearn.neighbors import kneighbors_graph as skgraph # , DistanceMetric
35
+ from sklearn.metrics import DistanceMetric
36
+ from sklearn.cluster import KMeans
37
+ from itertools import product
38
+
39
+
40
+ ## added for test
41
+ import platform
42
+ from pathlib import Path
43
+ FILE = Path(__file__).resolve()
44
+ ROOT = FILE.parents[0] # cytof root directory
45
+ if str(ROOT) not in sys.path:
46
+ sys.path.append(str(ROOT)) # add ROOT to PATH
47
+ if platform.system() != 'Windows':
48
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
49
+ from hyperion_segmentation import cytof_nuclei_segmentation, cytof_cell_segmentation, visualize_segmentation
50
+ from cytof.utils import (save_multi_channel_img, generate_color_dict, show_color_table,
51
+ visualize_scatter, visualize_expression, _get_thresholds, _generate_summary)
52
+
53
def get_name(dfrow):
    """Join a cohort-table row's 'path' and 'ROI' fields into one file path."""
    base_dir = dfrow['path']
    roi_file = dfrow['ROI']
    return os.path.join(base_dir, roi_file)
55
+
56
+
57
+ class CytofImage():
58
+ morphology = ["area", "convex_area", "eccentricity", "extent",
59
+ "filled_area", "major_axis_length", "minor_axis_length",
60
+ "orientation", "perimeter", "solidity", "pa_ratio"]
61
+
62
+ def __init__(self, df: Optional[pd.DataFrame] = None, slide: str = "", roi: str = "", filename: str = ""):
63
+ self.df = df
64
+ self.slide = slide
65
+ self.roi = roi
66
+ self.filename = filename
67
+ self.columns = None # column names in original cytof data (dataframe)
68
+ self.markers = None # protein markers
69
+ self.labels = None # metal isotopes used to tag protein
70
+
71
+ self.image = None
72
+ self.channels = None # channel names correspond to each channel of self.image
73
+
74
+ self.features = None
75
+
76
+
77
+ def copy(self):
78
+ '''
79
+ Creates a deep copy of the current CytofImage object and return it
80
+ '''
81
+ new_instance = type(self)(self.df.copy(), self.slide, self.roi, self.filename)
82
+ new_instance.columns = copy.deepcopy(self.columns)
83
+ new_instance.markers = copy.deepcopy(self.markers)
84
+ new_instance.labels = copy.deepcopy(self.labels)
85
+ new_instance.image = copy.deepcopy(self.image)
86
+ new_instance.channels = copy.deepcopy(self.channels)
87
+ new_instance.features = copy.deepcopy(self.features)
88
+ return new_instance
89
+
90
+
91
+ def __str__(self):
92
+ return f"CytofImage slide {self.slide}, ROI {self.roi}"
93
+
94
+ def __repr__(self):
95
+ return f"CytofImage(slide={self.slide}, roi={self.roi})"
96
+
97
+ def save_cytof(self, savename: str):
98
+ directory = os.path.dirname(savename)
99
+ if not os.path.exists(directory):
100
+ os.makedirs(directory)
101
+ pkl.dump(self, open(savename, "wb"))
102
+
103
+ def get_markers(self, imarker0: Optional[str] = None):
104
+ """
105
+ Get (1) the channel names correspond to each image channel
106
+ (2) a list of protein markers used to obtain the CyTOF image
107
+ (3) a list of labels tagged to each of the protein markers
108
+ """
109
+ self.columns = list(self.df.columns)
110
+ if imarker0 is not None: # if the index of the 1st marker provided
111
+ self.raw_channels = self.columns[imarker0:]
112
+ else: # assumption: channel names have the common expression: marker(label*)
113
+ pattern = "\w+.*\(\w+\)"
114
+ self.raw_channels = [re.findall(pattern, t)[0] for t in self.columns if len(re.findall(pattern, t)) > 0]
115
+
116
+ self.raw_markers = [x.split('(')[0] for x in self.raw_channels]
117
+ self.raw_labels = [x.split('(')[-1].split(')')[0] for x in self.raw_channels]
118
+
119
+ self.channels = self.raw_channels.copy()
120
+ self.markers = self.raw_markers.copy()
121
+ self.labels = self.raw_labels.copy()
122
+
123
+ def export_feature(self, feat_name: str, savename: Optional[str] = None):
124
+ """ Export a set of specified feature """
125
+ savename = savename if savename else f"{feat_name}.csv"
126
+ savename = savename if savename.endswith(".csv") else f"{feat_name}.csv"
127
+ df = getattr(self, feat_name)
128
+ df.to_csv(savename)
129
+
130
+ def preprocess(self):
131
+ nrow = int(max(self.df['Y'].values)) + 1
132
+ ncol = int(max(self.df['X'].values)) + 1
133
+ n = len(self.df)
134
+ if nrow * ncol > n:
135
+ df2 = pd.DataFrame(np.zeros((nrow * ncol - n, len(self.df.columns)), dtype=int),
136
+ columns=self.df.columns)
137
+ self.df = pd.concat([self.df, df2])
138
+
139
+ def quality_control(self, thres: int = 50) -> None:
140
+ setattr(self, "keep", False)
141
+ if (max(self.df['X']) < thres) \
142
+ or (max(self.df['Y']) < thres):
143
+ print("At least one dimension of the image {}-{} is smaller than {}, exclude from analyzing" \
144
+ .format(self.slide, self.roi, thres))
145
+ self.keep = False
146
+
147
    def check_channels(self,
                       channels: Optional[List] = None,
                       xlim: Optional[List] = None,
                       ylim: Optional[List] = None,
                       ncols: int = 5,
                       vis_q: float = 0.9,
                       colorbar: bool = False,
                       savedir: Optional[str] = None,
                       savename: str = "check_channels"
                       ):  # -> matplotlib Figure
        """Plot each requested channel as a grayscale panel in a subplot grid.

        channels = channel names to show; falls back to all channels when None
                   or when any requested name is unavailable
        xlim = a list of 2 numbers indicating the x-limits to show image (default=None)
        ylim = a list of 2 numbers indicating the y-limits to show image (default=None)
        ncols = number of subplots per row (default=5)
        vis_q = percentile q used to normalize image before visualization (default=0.9)
        colorbar = whether to attach a colorbar to each panel
        savedir = when given, the figure is saved to <savedir>/<savename>.png
                  instead of shown interactively
        Returns the matplotlib figure.
        """
        # savedir decides show-vs-save mode
        show = True if savedir is None else False
        if channels is not None:
            # NOTE(review): membership test lowercases the requested names but
            # not self.channels — mixed-case channel names will always fall
            # back to "all channels". TODO confirm intended.
            if not all([cl.lower() in self.channels for cl in channels]):
                print("At least one of the channels not available, visualizing all channels instead!")
                channels = None
        if channels is None:  # if no desired channels specified, check all channels
            channels = self.channels
        nrow = max(self.df['Y'].values) + 1
        ncol = max(self.df['X'].values) + 1
        # subplot-grid geometry: at most `ncols` panels per row
        if len(channels) <= ncols:
            ax_nrow = 1
            ax_ncol = len(channels)
        else:
            ax_ncol = ncols
            ax_nrow = int(np.ceil(len(channels) / ncols))

        fig, axes = plt.subplots(ax_nrow, ax_ncol, figsize=(3 * ax_ncol, 3 * ax_nrow))
        # normalize `axes` to a 2-D array so indexing below is uniform
        if ax_nrow == 1:
            axes = np.array([axes])
        if ax_ncol == 1:
            axes = np.expand_dims(axes, axis=1)
        for i, _ in enumerate(channels):
            _ax_nrow = int(np.floor(i / ax_ncol))
            _ax_ncol = i % ax_ncol
            # reshape the per-pixel column back into an (H, W) image
            image = self.df[_].values.reshape(nrow, ncol)
            # clip at the vis_q quantile (guard against an all-zero quantile)
            percentile_q = np.quantile(image, vis_q) if np.quantile(image, vis_q) != 0 else 1
            image = np.clip(image / percentile_q, 0, 1)
            axes[_ax_nrow, _ax_ncol].set_title(_)
            if xlim is not None:
                image = image[:, xlim[0]:xlim[1]]
            if ylim is not None:
                image = image[ylim[0]:ylim[1], :]
            im = axes[_ax_nrow, _ax_ncol].imshow(image, cmap="gray")
            if colorbar:
                fig.colorbar(im, ax=axes[_ax_nrow, _ax_ncol])
        plt.tight_layout()
        if show:
            plt.show()
        else:
            plt.savefig(os.path.join(savedir, f"{savename}.png"))
        return fig
204
+
205
+
206
    def get_image(self, channels: List = None, inplace: bool = True, verbose=False):
        """Reshape the per-pixel table into an (H, W, C) image array.

        By default all channels are used. When `channels` is None, or any
        requested channel is unavailable, the method falls back to all
        channels AND forces inplace=True (storing the result on self.image);
        `inplace=False` is only honored for an explicit, fully valid channel
        list, in which case the array is returned instead.
        """
        if channels is not None:
            if not all([cl in self.channels for cl in channels]):
                # fall back to all channels; the result replaces self.image
                print("At least one of the channels not available, using default all channels instead!")
                channels = self.channels
                inplace = True
        else:
            channels = self.channels
            inplace = True
        nc = len(channels)
        # image dimensions come from the max pixel coordinates
        nrow = max(self.df['Y'].values) + 1
        ncol = max(self.df['X'].values) + 1
        if verbose:
            print("Output image shape: [{}, {}, {}]".format(nrow, ncol, nc))

        target_image = np.zeros([nrow, ncol, nc], dtype=float)
        for _nc in range(nc):
            # one (H, W) plane per channel, in the order of `channels`
            target_image[..., _nc] = self.df[channels[_nc]].values.reshape(nrow, ncol)
        if inplace:
            self.image = target_image  # returns None in this branch
        else:
            return target_image
231
+
232
+ def visualize_single_channel(self,
233
+ channel_name: str,
234
+ color: str,
235
+ quantile: float = None,
236
+ visualize: bool = False):
237
+ """
238
+ Visualize one channel of the multi-channel image, with a specified color from red, green, and blue
239
+ """
240
+ channel_id = self.channels.index(channel_name)
241
+ if quantile is None: # calculate 99th percentile by default
242
+ quantile = np.quantile(self.image[..., channel_id], 0.99)
243
+
244
+ channel_id_ = ["red", "green", "blue"].index(color) # channel index
245
+
246
+ vis_im = np.zeros((self.image.shape[0], self.image.shape[1], 3))
247
+ gs = np.clip(self.image[..., channel_id] / quantile, 0, 1) # grayscale
248
+ vis_im[..., channel_id_] = gs
249
+ vis_im = (vis_im * 255).astype(np.uint8)
250
+
251
+ if visualize:
252
+ fig, ax = plt.subplots(1, 1)
253
+ ax.imshow(vis_im)
254
+ plt.show()
255
+ return vis_im
256
+
257
    def visualize_channels(self,
                           channel_ids: Optional[List] = None,
                           channel_names: Optional[List] = None,
                           quantiles: Optional[List] = None,
                           visualize: Optional[bool] = False,
                           show_colortable: Optional[bool] = False
                           ):
        """Merge up to 7 channels into one RGB image for visual comparison.

        The first three channels map to pure red/green/blue; channels 4-7 are
        blended into the secondary colors cyan/magenta/yellow/white. Exactly
        one of channel_ids / channel_names must be provided. Each channel is
        normalized by its entry in `quantiles` (default: per-channel 99th
        percentile) before merging.

        Returns:
            (merged_im, quantiles, color_dict): the uint8 RGB image, the
            normalization quantiles actually used, and a marker->RGB-tuple
            mapping for legend display.
        """
        assert channel_ids or channel_names, 'At least one should be provided, either "channel_ids" or "channel_names"!'
        # derive whichever of ids/names was not given
        if channel_ids is None:
            channel_ids = [self.channels.index(n) for n in channel_names]
        else:
            channel_names = [self.channels[i] for i in channel_ids]
        assert len(channel_ids) <= 7, "No more than 6 channels can be visualized simultaneously!"
        if len(channel_ids) > 3:
            warnings.warn(
                "Visualizing more than 3 channels the same time results in deteriorated visualization. \
                It is not recommended!")

        print("Visualizing channels: {}".format(', '.join(channel_names)))
        full_colors = ['red', 'green', 'blue', 'cyan', 'magenta', 'yellow', 'white']
        color_values = [(1, 0, 0), (0, 1, 0), (0, 0, 1),
                        (0, 1, 1), (1, 0, 1), (1, 1, 0),
                        (1, 1, 1)]
        info = ["{} in {}\n".format(marker, c) for (marker, c) in \
                zip([self.channels[i] for i in channel_ids], full_colors[:len(channel_ids)])]
        print("Visualizing... \n{}".format(''.join(info)))
        merged_im = np.zeros((self.image.shape[0], self.image.shape[1], 3))
        if quantiles is None:
            quantiles = [np.quantile(self.image[..., _], 0.99) for _ in channel_ids]

        # first (up to) 3 channels: assign pure R, G, B planes
        for _ in range(min(len(channel_ids), 3)):
            gs = np.clip(self.image[..., channel_ids[_]] / quantiles[_], 0, 1)  # grayscale
            merged_im[..., _] = gs * 255
            max_val = [0, 0, 0]
            max_val[_] = gs.max() * 255

        # channels 4+: blend into plane pairs (cyan/magenta/yellow) or all
        # three planes (white).
        # NOTE(review): this deliberately reuses the loop variable `_` leaked
        # from the for-loop above as the running channel index.
        chs = [[1, 2], [0, 2], [0, 1], [0, 1, 2]]
        chs_id = 0
        while _ < len(channel_ids) - 1:
            _ += 1
            max_val = [0, 0, 0]
            for j in chs[chs_id]:
                gs = np.clip(self.image[..., channel_ids[_]] / quantiles[_], 0, 1)
                merged_im[..., j] += gs * 255
                merged_im[..., j] = np.clip(merged_im[..., j], 0, 255)
                max_val[j] = gs.max() * 255
            chs_id += 1
        merged_im = merged_im.astype(np.uint8)
        if visualize:
            fig, ax = plt.subplots(1, 1)
            ax.imshow(merged_im)
            plt.show()

        # display names: marker where available, raw channel name otherwise
        vis_markers = [self.markers[i] if i < len(self.markers) else self.channels[i] for i in channel_ids]

        color_dict = dict((n, c) for (n, c) in zip(vis_markers, color_values[:len(channel_ids)]))
        if show_colortable:
            show_color_table(color_dict=color_dict, title="color dictionary", emptycols=3, sort_names=True)
        return merged_im, quantiles, color_dict
322
+
323
+ def remove_special_channels(self, channels: List):
324
+ """
325
+ Given a list of channels, remove them from the class. This typically happens when users define certain channels to be the nuclei for special processing.
326
+ """
327
+ for channel in channels:
328
+ if channel not in self.channels:
329
+ print("Channel {} not available, escaping...".format(channel))
330
+ continue
331
+ idx = self.channels.index(channel)
332
+ self.channels.pop(idx)
333
+ self.markers.pop(idx)
334
+ self.labels.pop(idx)
335
+ self.df.drop(columns=channel, inplace=True)
336
+
337
    def define_special_channels(self, channels_dict: Dict, verbose=False, rm_key: str = 'nuclei'):
        '''Create composite channels (e.g. nuclei, membrane) by summing sources.

        channels_dict maps a new channel name to a list of existing channel
        names whose per-pixel values are summed into a new df column. The new
        name is appended to self.channels and the image is rebuilt. Source
        channels of the `rm_key` entry (default 'nuclei') are collected into
        the returned list so the caller can remove them afterwards. Every key
        of channels_dict is also recorded in self.defined_channels, which
        extract_features() later uses to exclude non-marker channels.
        '''
        channels_rm = []
        for new_name, old_names in channels_dict.items():

            if len(old_names) == 0:  # no sources specified for this key
                continue

            # keep only the source channels that actually exist
            old_nms = []
            for i, old_name in enumerate(old_names):
                if old_name not in self.channels:
                    warnings.warn('{} is not available!'.format(old_name))
                    continue
                old_nms.append(old_name)
            if verbose:
                print("Defining channel '{}' by summing up channels: {}.".format(new_name, ', '.join(old_nms)))
            if len(old_nms) > 0:
                # only add channels to removal list if matching remove key
                if new_name == rm_key:
                    channels_rm += old_nms
                # accumulate the source columns into the new column
                for i, old_name in enumerate(old_nms):
                    if i == 0:
                        self.df[new_name] = self.df[old_name]
                    else:
                        self.df[new_name] += self.df[old_name]
                if new_name not in self.channels:
                    self.channels.append(new_name)

        # rebuild self.image so it includes the newly defined channel(s)
        self.get_image(verbose=verbose)
        # remember every defined key (even ones with no/invalid sources)
        if hasattr(self, "defined_channels"):
            for key in channels_dict.keys():
                self.defined_channels.add(key)
        else:
            setattr(self, "defined_channels", set(list(channels_dict.keys())))
        return channels_rm
374
+
375
    def get_seg(
            self,
            use_membrane: bool = True,
            radius: int = 5,
            sz_hole: int = 1,
            sz_obj: int = 3,
            min_distance: int = 2,
            fg_marker_dilate: int = 2,
            bg_marker_dilate: int = 2,
            show_process: bool = False,
            verbose: bool = False):
        """Segment nuclei, then expand to cells; store and return both masks.

        Requires a 'nuclei' channel (created by define_special_channels).
        Nuclei segmentation parameters (hole/object sizes, marker dilations,
        min_distance) are forwarded to cytof_nuclei_segmentation; cells are
        grown from nuclei by `radius` pixels, guided by the 'membrane'
        channel when available and use_membrane is True.

        Results are stored on self.nuclei_seg / self.cell_seg and returned as
        (nuclei_seg, cell_seg). `verbose` is currently unused.
        """
        channels = [x.lower() for x in self.channels]
        assert 'nuclei' in channels, "a 'nuclei' channel is required for segmentation!"
        # NOTE(review): the assert checks the lowercased names but the lookup
        # below uses the original casing — a channel named 'Nuclei' would pass
        # the assert and then fail here. TODO confirm channel naming is
        # always lowercase.
        nuclei_img = self.image[..., self.channels.index('nuclei')]

        if show_process:
            print("Nuclei segmentation...")
        nuclei_seg, color_dict = cytof_nuclei_segmentation(nuclei_img, show_process=show_process,
                                                           size_hole=sz_hole, size_obj=sz_obj,
                                                           fg_marker_dilate=fg_marker_dilate,
                                                           bg_marker_dilate=bg_marker_dilate,
                                                           min_distance=min_distance)

        # membrane channel is optional; None lets cell segmentation fall back
        # to pure radius-based expansion
        membrane_img = self.image[..., self.channels.index('membrane')] \
            if (use_membrane and 'membrane' in self.channels) else None
        if show_process:
            print("Cell segmentation...")
        cell_seg, _ = cytof_cell_segmentation(nuclei_seg, radius, membrane_channel=membrane_img,
                                              show_process=show_process, colors=color_dict)

        self.nuclei_seg = nuclei_seg
        self.cell_seg = cell_seg
        return nuclei_seg, cell_seg
410
+
411
    def visualize_seg(self, segtype: str = "cell", seg=None, show: bool = False, bg_label: int = 1):
        """Overlay segmentation boundaries on a 2-channel background image.

        Args:
            segtype: "nuclei" or "cell"; selects the stored mask when `seg`
                is not given and the boundary color (yellow for nuclei,
                white for cells).
            seg: optional explicit label mask to draw instead of the stored one.
            show: forwarded to visualize_segmentation for interactive display.
            bg_label: label value treated as background.

        Returns:
            The image with boundaries marked.
        """
        assert segtype in ["nuclei", "cell"], f"segtype {segtype} not supported. Accepted cell type: ['nuclei', 'cell']"
        # background: nuclei in red, membrane in green (when available)
        if "membrane" in self.channels:
            channel_ids = [self.channels.index(_) for _ in ["nuclei", "membrane"]]
        else:

            # visualize one marker channel and nuclei channel
            channel_ids = [self.channels.index("nuclei"), 0]

        if seg is None:
            if segtype == "cell":
                seg = self.cell_seg
                '''# membrane in red, nuclei in green
                channel_ids = [self.channels.index(_) for _ in ["membrane", "nuclei"]]'''
            else:
                seg = self.nuclei_seg

        # mark distinct membrane or nuclei boundary colors
        if segtype == 'cell':
            # cell boundaries in white
            marked_image = visualize_segmentation(self.image, self.channels, seg, channel_ids=channel_ids, bound_color=(1, 1, 1), show=show, bg_label=bg_label)
        else:
            # nuclei boundaries in yellow (RGB (1, 1, 0))
            marked_image = visualize_segmentation(self.image, self.channels, seg, channel_ids=channel_ids, bound_color=(1, 1, 0), show=show, bg_label=bg_label)

        seg_color = 'yellow' if segtype == 'nuclei' else 'white'
        print(f"{segtype} boundary marked by {seg_color}")
        return marked_image
438
+
439
+ def extract_features(self, filename, use_parallel=True, show_sample=False):
440
+ from cytof.utils import extract_feature
441
+
442
+ # channel indices correspond to pure markers
443
+ '''pattern = "\w+.*\(\w+\)"
444
+ marker_idx = [i for (i,x) in enumerate(self.channels) if len(re.findall(pattern, x))>0] '''
445
+ marker_idx = [i for (i, x) in enumerate(self.channels) if x not in self.defined_channels]
446
+
447
+ marker_channels = [self.channels[i] for i in marker_idx] # pure marker channels
448
+ marker_image = self.image[..., marker_idx] # channel images correspond to pure markers
449
+ morphology = self.morphology
450
+ self.features = {
451
+ "nuclei_morphology": [_ + '_nuclei' for _ in morphology], # morphology - nuclei level
452
+ "cell_morphology": [_ + '_cell' for _ in morphology], # morphology - cell level
453
+ "cell_sum": [_ + '_cell_sum' for _ in marker_channels],
454
+ "cell_ave": [_ + '_cell_ave' for _ in marker_channels],
455
+ "nuclei_sum": [_ + '_nuclei_sum' for _ in marker_channels],
456
+ "nuclei_ave": [_ + '_nuclei_ave' for _ in marker_channels],
457
+ }
458
+ self.df_feature = extract_feature(marker_channels, marker_image,
459
+ self.nuclei_seg, self.cell_seg,
460
+ filename, use_parallel=use_parallel,
461
+ show_sample=show_sample)
462
+
463
+ def calculate_quantiles(self, qs: Union[List, int] = 75, savename: Optional[str] = None, verbose: bool = False):
464
+ """
465
+ Calculate the q-quantiles of each marker with cell level summation given the q values
466
+ """
467
+ qs = [qs] if isinstance(qs, int) else qs
468
+ _expressions_cell_sum = []
469
+ quantiles = {}
470
+ colors = cm.rainbow(np.linspace(0, 1, len(qs)))
471
+ for feature_name in self.features["cell_sum"]: # all cell sum features except for nuclei_cell_sum and membrane_cell_sum
472
+ if feature_name.startswith("nuclei") or feature_name.startswith("membrane"):
473
+ continue
474
+ _expressions_cell_sum.extend(self.df_feature[feature_name])
475
+
476
+ plt.hist(np.log2(np.array(_expressions_cell_sum) + 0.0001), 100, density=True)
477
+ for q, c in zip(qs, colors):
478
+ quantiles[q] = np.quantile(_expressions_cell_sum, q / 100)
479
+ plt.axvline(np.log2(quantiles[q]), label=f"{q}th percentile", c=c)
480
+ if verbose:
481
+ print(f"{q}th percentile: {quantiles[q]}")
482
+ plt.xlim(-15, 15)
483
+ plt.xlabel("log2(expression of all markers)")
484
+ plt.legend()
485
+ if savename is not None:
486
+ plt.savefig(savename)
487
+ plt.show()
488
+ # attach quantile dictionary to self
489
+ self.dict_quantiles = quantiles
490
+
491
+ print('dict quantiles:', quantiles)
492
+ # return quantiles
493
+
494
    def _vis_normalization(self, savename: Optional[str] = None):
        """
        Compare before and after normalization.

        Plots a histogram of the pooled log2 raw expressions, then one
        histogram per normalized feature table found on self (attributes
        named f"df_feature_{q}normed" for each q in self.dict_quantiles).

        Args:
            savename: if not None, the comparison figure is saved to this path.

        Returns:
            dict mapping "original" / f"{q}_normed" to the pooled value lists.
        """
        expressions = {}
        expressions["original"] = []

        ## before normalization
        # Pool every non-morphology feature column, skipping features derived
        # from the special nuclei / membrane channels.
        for key, features in self.features.items():
            if key.endswith("morphology"):
                continue
            for feature_name in features:
                if feature_name.startswith('nuclei') or feature_name.startswith('membrane'):
                    continue
                expressions["original"].extend(self.df_feature[feature_name])
        # Raw values are linear, so log-transform them for plotting
        # (+0.0001 avoids log2(0)).
        log_exp = np.log2(np.array(expressions['original']) + 0.0001)
        plt.hist(log_exp, 100, density=True, label='before normalization')

        for q in self.dict_quantiles.keys():
            n_attr = f"df_feature_{q}normed"
            expressions[f"{q}_normed"] = []

            for key, features in self.features.items():
                if key.endswith("morphology"):
                    continue
                for feature_name in features:
                    if feature_name.startswith('nuclei') or feature_name.startswith('membrane'):
                        continue
                    expressions[f"{q}_normed"].extend(getattr(self, n_attr)[feature_name])
            # Normalized tables were produced via log2(x / quantile + 0.0001)
            # in feature_quantile_normalization, so no log transform here.
            plt.hist(expressions[f"{q}_normed"], 100, density=True, label=f"after {q}th percentile normalization")

        plt.legend()
        plt.xlabel('log2(expressions of all markers)')
        plt.ylabel('Frequency')
        if savename is not None:
            plt.savefig(savename)
        plt.show()
        return expressions
532
+
533
    def feature_quantile_normalization(self,
                                       qs: Union[List[int], int] = 75,
                                       vis_compare: bool = True,
                                       savedir: Optional[str] = None):
        """
        Normalize all features with given quantiles except for morphology features.

        For each percentile q, computes the pooled cell-sum quantile (via
        calculate_quantiles) and stores a log-quantile-normalized copy of
        self.df_feature as attribute f"df_feature_{q}normed".

        Args:
            qs: value (int) or values (list of int) of for q-th percentile normalization
            vis_compare: a boolean flag indicating whether or not visualize comparison before and after normalization
                (default=True)
            savedir: saving directory for comparison and percentiles;
                if not None, visualizations of percentiles and comparison before and after normalization will be saved in savedir
                (default=None)

        """
        qs = [qs] if isinstance(qs, int) else qs
        if savedir is not None:
            savename_quantile = os.path.join(savedir, "{}_{}_percentiles.png".format(self.slide, self.roi))
            savename_compare = os.path.join(savedir, "{}_{}_comparison.png".format(self.slide, self.roi))
        else:
            savename_quantile, savename_compare = None, None
        # Populates self.dict_quantiles with {q: quantile value}.
        self.calculate_quantiles(qs, savename=savename_quantile)
        for q, quantile_val in self.dict_quantiles.items():
            n_attr = f"df_feature_{q}normed"  # attribute name
            # Work on a deep copy so the raw feature table stays intact.
            log_normed = copy.deepcopy(self.df_feature)
            for key, features in self.features.items():
                # Morphology features are intentionally left unnormalized.
                if key.endswith("morphology"):
                    continue
                for feature_name in features:
                    # Features from the special nuclei/membrane channels are skipped too.
                    if feature_name.startswith("nuclei") or feature_name.startswith("membrane"):
                        continue
                    # log-quantile normalization (+0.0001 avoids log2(0))
                    log_normed.loc[:, feature_name] = np.log2(log_normed.loc[:, feature_name] / quantile_val + 0.0001)
            setattr(self, n_attr, log_normed)
        if vis_compare:
            _ = self._vis_normalization(savename=savename_compare)
569
+
570
+
571
+ def save_channel_images(self, savedir: str, channels: Optional[List] = None, ext: str = ".png", quantile_norm: int = 99):
572
+ """
573
+ Save channel images
574
+ """
575
+ if channels is not None:
576
+ if not all([cl in self.channels for cl in channels]):
577
+ print("At least one of the channels not available, saving all channels instead!")
578
+ channels = self.channels
579
+ else:
580
+ channels = self.channels
581
+ '''assert all([x.lower() in channels_temp for x in channels]), "Not all provided channels are available!"'''
582
+ for chn in channels:
583
+ savename = os.path.join(savedir, f"{chn}{ext}")
584
+ # i = channels_temp.index(chn.lower())
585
+ i = self.channels.index(chn)
586
+ im_temp = self.image[..., i]
587
+ quantile_temp = np.quantile(im_temp, quantile_norm / 100) \
588
+ if np.quantile(im_temp, quantile_norm / 100) != 0 else 1
589
+
590
+ im_temp_ = np.clip(im_temp / quantile_temp, 0, 1)
591
+ save_multi_channel_img((im_temp_ * 255).astype(np.uint8), savename)
592
+
593
+ def marker_positive(self, feature_type: str = "normed", accumul_type: str = "sum", normq: int = 75):
594
+ assert feature_type in ["original", "normed", "scaled"], 'accepted feature types are "original", "normed", "scaled"'
595
+ if feature_type == "original":
596
+ feat_name = ""
597
+ elif feature_type == "normed":
598
+ feat_name = f"_{normq}normed"
599
+ else:
600
+ feat_name = f"_{normq}normed_scaled"
601
+
602
+ n_attr = f"df_feature{feat_name}" # class attribute name for feature table
603
+ count_attr = f"cell_count{feat_name}_{accumul_type}" # class attribute name for feature summary table
604
+
605
+ df_feat = getattr(self, n_attr)
606
+ df_thres = getattr(self, count_attr)
607
+
608
+ thresholds_cell_marker = dict((x, y) for (x, y) in zip(df_thres["feature"], df_thres["threshold"]))
609
+
610
+ columns = ["id"] + [marker for marker in self.markers]
611
+ df_marker_positive = pd.DataFrame(columns=columns,
612
+ data=np.zeros((len(df_feat), len(self.markers) + 1), type=np.int32))
613
+ df_marker_positive["id"] = df_feat["id"]
614
+ for im, marker in enumerate(self.markers):
615
+ channel_ = f"{self.channels[im]}_cell_{accumul_type}"
616
+ df_marker_positive.loc[df_feat[channel_] > thresholds_cell_marker[channel_], marker] = 1
617
+ setattr(self, f"df_marker_positive{feat_name}", df_marker_positive)
618
+
619
+
620
    def marker_positive_summary(self,
                                thresholds: Dict,
                                feat_type: str = "normed",
                                normq: int = 75,
                                accumul_type: str = "sum"
                                ):

        """
        Generate marker positive summary for CytofImage:
        Output rendered: f"cell_count_{feat_name}_{aggre}" and f"marker_positive_{feat_name}_{aggre}"

        Args:
            thresholds: mapping from feature column name to its positivity threshold
            feat_type: "" (original), "normed" or "normed_scaled"
            normq: percentile used when feat_type refers to a normalized table
            accumul_type: accumulation type of the cell-level features (e.g. "sum")

        Returns:
            the suffix f"{feat_name}_{accumul_type}" under which the two summary
            tables were attached to self.
        """

        assert feat_type in ["normed_scaled", "normed", ""], f"feature type {feat_type} not supported!"
        feat_name = f"{feat_type}" if feat_type=="" else f"{normq}{feat_type}" # the attribute name to achieve from cytof_img
        n_attr = f"df_feature{feat_name}" if feat_type=="" else f"df_feature_{feat_name}" # the attribute name to achieve from cytof_img

        # Threshold table: one row per feature column.
        df_thres = pd.DataFrame({"feature": thresholds.keys(), "threshold": thresholds.values()})
        df_marker_pos_sum = getattr(self, n_attr).copy()

        keep_feat_set = f"cell_{accumul_type}"

        for key, feat_set in getattr(self, "features").items():
            if key == keep_feat_set:
                marker_set = self.markers
                # Transpose so each row is a feature, merge in its threshold,
                # then compare each cell's value against that threshold.
                df_marker_pos_sum_ = df_marker_pos_sum[feat_set].copy().transpose()

                comp_cols = list(df_marker_pos_sum_.columns)
                df_marker_pos_sum_.reset_index(names='feature', inplace=True)
                merged = df_marker_pos_sum_.merge(df_thres, on="feature", how="left")
                df_temp = merged[comp_cols].ge(merged["threshold"], axis=0)
                df_temp.index = merged['feature']
                df_marker_pos_sum[feat_set] = df_temp.transpose()[feat_set]
                # Rename thresholded feature columns to plain marker names.
                map_rename = dict((k, v) for (k,v) in zip(feat_set, marker_set))
                df_marker_pos_sum.rename(columns=map_rename, inplace=True)
            else:
                # Drop every feature group other than the requested accumulation type.
                df_marker_pos_sum.drop(columns=feat_set, inplace=True)

        # NOTE(review): df_temp is only bound when keep_feat_set matched a key of
        # self.features; otherwise the lines below raise NameError — confirm the
        # caller guarantees a match.
        df_thres['total number'] = df_temp.count(axis=1).values
        df_thres['positive counts'] = df_temp.sum(axis=1).values
        df_thres['positive ratio'] = df_thres['positive counts'] / df_thres['total number']

        attr_cell_count = f"cell_count_{feat_name}_{accumul_type}"
        attr_marker_pos = f"df_marker_positive_{feat_name}_{accumul_type}"
        setattr(self, attr_cell_count, df_thres)
        setattr(self, attr_marker_pos, df_marker_pos_sum)

        return f"{feat_name}_{accumul_type}"
667
+
668
+
669
    def visualize_pheno_marker_positive(self): pass
729
+
730
    def visualize_pheno(self, key_pheno: str,
                        color_dict: Optional[dict] = None,
                        show: bool = False,
                        show_colortable: bool = False):
        """
        Stain nuclei and cell masks by PhenoGraph cluster membership.

        Args:
            key_pheno: key into self.phenograph selecting a clustering result
            color_dict: optional {community id: color} mapping; defaults to
                cycling the 20 tab20 colors over community ids
            show: display the stained nuclei/cell images side by side
            show_colortable: display the community -> color legend

        Returns:
            (stain_nuclei, stain_cell, color_dict)
        """
        assert key_pheno in self.phenograph, "Pheno-Graph with {} not available!".format(key_pheno)
        phenograph = self.phenograph[key_pheno]
        communities = phenograph['communities']  # phenograph clustering community IDs
        seg_id = self.df_feature['id']  # nuclei / cell segmentation IDs

        if color_dict is None:
            # Default palette: tab20 colors cycled over the community ids.
            color_dict = dict((_, plt.cm.get_cmap('tab20').colors[_ % 20]) \
                              for _ in np.unique(communities))
        # rgba_colors = np.array([color_dict[_] for _ in communities])

        if show_colortable:
            show_color_table(color_dict=color_dict,
                             title="phenograph clusters",
                             emptycols=3, dpi=60)

        # Create image with nuclei / cells stained by PhenoGraph clustering output
        # stain rule: same color for same cluster, stain nuclei
        # Background starts as white (zeros + 1); label ids below 2 are skipped.
        stain_nuclei = np.zeros((self.nuclei_seg.shape[0], self.nuclei_seg.shape[1], 3)) + 1
        stain_cell = np.zeros((self.cell_seg.shape[0], self.cell_seg.shape[1], 3)) + 1

        for i in range(2, np.max(self.nuclei_seg) + 1):
            # NOTE(review): assumes `communities` aligns row-for-row with
            # self.df_feature and that every label i appears in seg_id — confirm.
            commu_id = communities[seg_id == i][0]
            stain_nuclei[self.nuclei_seg == i] = color_dict[commu_id]  # rgba_colors[communities[seg_id == i]][:3] #
            stain_cell[self.cell_seg == i] = color_dict[commu_id]  # rgba_colors[communities[seg_id == i]][:3] #
        if show:
            fig, axs = plt.subplots(1, 2, figsize=(16, 8))
            axs[0].imshow(stain_nuclei)
            axs[1].imshow(stain_cell)

        return stain_nuclei, stain_cell, color_dict
764
+
765
+ def get_binary_pos_express_df(self, feature_name, accumul_type):
766
+ """
767
+ returns a dataframe in the form marker1, marker2, ... vs. cell1, cell2; indicating whether each cell is positively expressed in each marker
768
+ """
769
+ df_feature_name = f"df_feature_{feature_name}"
770
+
771
+ # get the feature extraction result
772
+ df_feature = getattr(self , df_feature_name)
773
+
774
+ # select only markers with desired accumulation type
775
+ marker_col_all = [x for x in df_feature.columns if f"cell_{accumul_type}" in x]
776
+
777
+ # subset feature
778
+ df_feature_of_interst = df_feature[marker_col_all]
779
+
780
+ # reports each marker's threshold to be considered positively expressed, number of positive cells, etc
781
+ df_cell_count_info = getattr(self, f"cell_count_{feature_name}_{accumul_type}")
782
+ thresholds = df_cell_count_info.threshold
783
+
784
+ # returns a binary dataframe of whether each cell at each marker passes the positive threshold
785
+ df_binary_pos_exp = df_feature_of_interst.apply(lambda column: apply_threshold_to_column(column, threshold=thresholds[df_feature_of_interst.columns.get_loc(column.name)]))
786
+
787
+ return df_binary_pos_exp
788
+
789
    def roi_co_expression(self, feature_name, accumul_type, return_components=False):
        """
        Performs the co-expression analysis at the single ROI level.
        Can return components for cohort analysis if needed

        Args:
            feature_name: suffix of the feature table attribute (f"df_feature_{feature_name}")
            accumul_type: accumulation type selecting the marker columns ("sum"/"ave")
            return_components: if True, return raw counts plus the cell count so a
                caller can pool several ROIs before converting to probabilities

        Returns:
            (df_co_pos, df_expected, n_cells) when return_components is True,
            otherwise (df_co_pos_prob, df_expected_prob).
        """
        from itertools import product

        # returns a binary dataframe of whether each cell at each marker passes the positive threshold
        df_binary_pos_exp = self.get_binary_pos_express_df(feature_name, accumul_type)

        n_cells, n_markers = df_binary_pos_exp.shape
        df_pos_exp_val = df_binary_pos_exp.values

        # list all pair-wise combinations of the markers
        # (ordered pairs; symmetric entries are written twice, which is harmless)
        column_combinations = list(product(range(n_markers), repeat=2))

        # step to the numerator of the log odds ratio
        co_positive_count_matrix = np.zeros((n_markers, n_markers))

        # step to the denominator of the log odds ratio
        expected_count_matrix = np.zeros((n_markers, n_markers))

        for combo in column_combinations:
            marker1, marker2 = combo

            # count cells that positively expresses in both marker 1 and 2
            positive_prob_marker1_and_2 = np.sum(np.logical_and(df_pos_exp_val[:, marker1], df_pos_exp_val[:, marker2]))
            co_positive_count_matrix[marker1, marker2] = positive_prob_marker1_and_2

            # pair (A,B) counts is the same as pair (B,A) counts
            co_positive_count_matrix[marker2, marker1] = positive_prob_marker1_and_2

            # count expected cells if marker 1 and 2 are independently expressed
            # p(A and B) = p(A) * p(B) = num_pos_a * num_pos_b / (num_cells * num_cells)
            # p(A) = number of positive cells / number of cells
            exp_prob_in_marker1_and_2 = np.sum(df_pos_exp_val[:, marker1]) * np.sum(df_pos_exp_val[:, marker2])
            expected_count_matrix[marker1, marker2] = exp_prob_in_marker1_and_2
            expected_count_matrix[marker2, marker1] = exp_prob_in_marker1_and_2

        # theta(i_pos and j_pos)
        df_co_pos = pd.DataFrame(co_positive_count_matrix, index=df_binary_pos_exp.columns, columns=df_binary_pos_exp.columns)

        # E(x)
        df_expected = pd.DataFrame(expected_count_matrix, index=df_binary_pos_exp.columns, columns=df_binary_pos_exp.columns)

        if return_components:
            # hold off on calculating probabilites. Need the components from other ROIs to calculate the co-expression
            return df_co_pos, df_expected, n_cells

        # otherwise, return the probabilies
        df_co_pos_prob = df_co_pos / n_cells
        df_expected_prob = df_expected / n_cells**2
        return df_co_pos_prob, df_expected_prob
842
+
843
+ def roi_interaction_graphs(self, feature_name, accumul_type, method: str = "distance", threshold=50, return_components=False):
844
+ """ Performs spatial interaction at the ROI level.
845
+ Finds if two positive markers are in proximity with each other. Proximity can be defined either with k-nearest neighbor or distance thresholding.
846
+ Args:
847
+ key_pheno: dictionary key for a specific phenograph output
848
+ method: method to construct the adjacency matrix, choose from "distance" and "kneighbor"
849
+ threshold: either the number of neighbors or euclidean distance to qualify as neighborhood pairs. Default is 50 for distance and 20 for k-neighbor.
850
+ **kwargs: used to specify distance threshold (thres) for "distance" method or number of neighbors (k)
851
+ for "kneighbor" method
852
+ Output:
853
+ network: (dict) ROI level network that will be used for cluster interaction analysis
854
+ """
855
+ assert method in ["distance", "k-neighbor"], "Method can be either 'distance' or 'k-neighbor'!"
856
+ print(f'Calculating spatial interaction with method "{method}" and threshold at {threshold}')
857
+
858
+ df_feature_name = f"df_feature_{feature_name}"
859
+
860
+ # get the feature extraction result
861
+ df_feature = getattr(self , df_feature_name)
862
+
863
+ # select only markers with desired accumulation type
864
+ marker_col_all = [x for x in df_feature.columns if f"cell_{accumul_type}" in x]
865
+
866
+ # subset feature
867
+ df_feature_of_interst = df_feature[marker_col_all]
868
+
869
+ n_cells, n_markers = df_feature_of_interst.shape
870
+
871
+ networks = {}
872
+ if method == "distance":
873
+ dist = DistanceMetric.get_metric('euclidean')
874
+ neighbor_matrix = dist.pairwise(df_feature.loc[:, ['coordinate_x', 'coordinate_y']].values)
875
+
876
+ # returns nonzero elements of the matrix
877
+ # ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.find.html
878
+ I, J, V = sp.find(neighbor_matrix)
879
+ # finds index of values less than the distance threshold
880
+ v_keep_index = V < threshold
881
+
882
+ elif method == "k-neighbor":
883
+ neighbor_matrix = skgraph(np.array(df_feature.loc[:, ['coordinate_x', 'coordinate_y']]), n_neighbors=threshold, mode='distance')
884
+ # returns nonzero elements of the matrix
885
+ # ref: https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.find.html
886
+ I, J, V = sp.find(neighbor_matrix)
887
+ v_keep_index = V > 0 # any non-zero distance neighbor qualifies
888
+
889
+ # finds index of values less than the distance threshold
890
+ i_keep, j_keep = I[v_keep_index], J[v_keep_index]
891
+ assert len(i_keep) == len(j_keep) # these are paired indexes for the cell. must equal in length.
892
+
893
+ n_neighbor_pairs = len(i_keep)
894
+
895
+ # (i,j) now tells you the index of the two cells that are in close proximity (within {thres} distance of each other)
896
+ # now we need a list that tells you the positive expressed marker index in each cell
897
+
898
+ # returns a binary dataframe of whether each cell at each marker passes the positive threshold
899
+ df_binary_pos_exp = self.get_binary_pos_express_df(feature_name, accumul_type)
900
+ df_pos_exp_val = df_binary_pos_exp.values # convert to matrix operation
901
+
902
+ # cell-marker positive list, 1-D. len = n_cells. Each element indicates the positively expressed marker of that cell index
903
+ # only wants where the x condition is True. x refers to the docs x, not the actual array direction
904
+ # ref: https://numpy.org/doc/stable/reference/generated/numpy.where.html
905
+ cell_marker_pos_list = [np.where(cell)[0] for cell in df_pos_exp_val]
906
+
907
+ cell_interaction_in_markers_counts = np.zeros((n_markers, n_markers))
908
+
909
+ # used to calculate E(x)
910
+ expected_marker_count_1d = np.zeros(n_markers)
911
+
912
+ # go through each close proxmity cell pair
913
+ for i, j in zip(i_keep, j_keep):
914
+ # locate the cell via index, then
915
+ marker_index_neighbor_pair1 = cell_marker_pos_list[i]
916
+ marker_index_neighbor_pair2 = cell_marker_pos_list[j]
917
+
918
+ # within each neighbor pair (i.e. pairs of cells) contains the positively expressed markers index in that cell
919
+ # the product of these markers index from each cell indicates interaction pair
920
+ marker_matrix_update_coords = list(product(marker_index_neighbor_pair1, marker_index_neighbor_pair2))
921
+
922
+ # update the counts between each marker interaction pair
923
+ # example coords: (pos_marker_index_in_cell1, pos_marker_index_in_cell2)
924
+ for coords in marker_matrix_update_coords:
925
+ cell_interaction_in_markers_counts[coords] += 1
926
+
927
+ # find the marker index that appeared in both pairs of the neighbor cells
928
+ markers_index_both_neighbor_pair = np.union1d(marker_index_neighbor_pair1, marker_index_neighbor_pair2)
929
+ expected_marker_count_1d[markers_index_both_neighbor_pair] += 1 # increase the markers that appears in either neighborhood pair
930
+
931
+
932
+ # expected counts
933
+ # expected_marker_count_1d = np.sum(df_pos_exp_val, axis=0)
934
+ # ref: https://numpy.org/doc/stable/reference/generated/numpy.outer.html
935
+ expected_counts = np.outer(expected_marker_count_1d, expected_marker_count_1d)
936
+
937
+ # expected and observed needs to match dimension to perform element-wise operation
938
+ assert expected_counts.shape == cell_interaction_in_markers_counts.shape
939
+
940
+ df_expected_counts = pd.DataFrame(expected_counts, index=df_feature_of_interst.columns, columns=df_feature_of_interst.columns)
941
+ df_cell_interaction_counts = pd.DataFrame(cell_interaction_in_markers_counts, index=df_feature_of_interst.columns, columns=df_feature_of_interst.columns)
942
+ if return_components:
943
+ return df_expected_counts, df_cell_interaction_counts, n_neighbor_pairs
944
+
945
+ # calculates percentage within function if not return compoenents
946
+ # df_expected_prob = df_expected_counts / n_cells**2
947
+ df_expected_prob = df_expected_counts / n_neighbor_pairs**2
948
+
949
+ # theta(i_pos and j_pos)
950
+ df_cell_interaction_prob = df_cell_interaction_counts / n_neighbor_pairs
951
+
952
+ return df_expected_prob, df_cell_interaction_prob
953
+
954
+
955
class CytofImageTiff(CytofImage):
    """
    CytofImage for Tiff images, inherit from Cytofimage

    The pixel data is supplied directly as an array (height x width x channel);
    channel metadata is attached afterwards via set_channels / set_markers.
    """

    def __init__(self, image, slide="", roi="", filename=""):
        """
        Args:
            image: multi-channel image array (H, W, C)
            slide: slide identifier
            roi: ROI identifier
            filename: path of the source TIFF file
        """
        self.image = image

        self.markers = None  # marker names, one per channel
        self.labels = None  # metal/tag labels, one per channel
        self.slide = slide
        self.roi = roi
        self.filename = filename

        # Combined channel names, filled by set_channels / set_markers.
        self.channels = None  # ["{}({})".format(marker, label) for (marker, label) in zip(self.markers, self.labels)]

    def copy(self):
        '''
        Creates a deep copy of the current CytofImageTIFF object and return it
        '''
        new_instance = type(self)(self.image.copy(), self.slide, self.roi, self.filename)
        new_instance.markers = copy.deepcopy(self.markers)
        new_instance.labels = copy.deepcopy(self.labels)
        new_instance.channels = copy.deepcopy(self.channels)
        return new_instance

    def quality_control(self, thres: int = 50) -> None:
        """
        Flag the image for exclusion when it is too small to analyze.

        Sets self.keep to False when any dimension is below `thres` pixels.
        """
        # BUG FIX: `keep` was initialized to False and only ever re-assigned
        # False, so every image was flagged for exclusion regardless of size.
        setattr(self, "keep", True)
        # NOTE(review): self.image.shape includes the channel axis, so an image
        # with fewer than `thres` channels is also excluded — confirm intended.
        if any([x < thres for x in self.image.shape]):
            print(f"At least one dimension of the image {self.slide}-{self.roi} is smaller than {thres}, \
                hence exclude from analyzing" )
            self.keep = False

    def set_channels(self, markers: List, labels: List):
        """Attach marker/label metadata; channel names become 'marker(label)'."""
        self.markers = markers
        self.labels = labels
        self.channels = ["{}({})".format(marker, label) for (marker, label) in zip(self.markers, self.labels)]

    def set_markers(self,
                    markers: list,
                    labels: list,
                    channels: Optional[list] = None
                    ):
        """This deprecates set_channels """
        self.raw_markers = markers
        self.raw_labels = labels
        if channels is not None:
            self.raw_channels = channels
        else:
            # Default channel name format here is 'marker-label'.
            self.raw_channels = [f"{marker}-{label}" for (marker, label) in zip(markers, labels)]
        # Working copies; the raw_* attributes preserve the originals.
        self.channels = self.raw_channels.copy()
        self.markers = self.raw_markers.copy()
        self.labels = self.raw_labels.copy()

    def check_channels(self,
                       channels: Optional[List] = None,
                       xlim: Optional[List] = None,
                       ylim: Optional[List] = None,
                       ncols: int = 5, vis_q: int = 0.9,
                       colorbar: bool = False,
                       savedir: Optional[str] = None,
                       savename: str = "check_channels"):
        """
        Visualize (a subset of) the channels in a grid of subplots.

        xlim = a list of 2 numbers indicating the ylimits to show image (default=None)
        ylim = a list of 2 numbers indicating the ylimits to show image (default=None)
        ncols = number of subplots per row (default=5)
        vis_q = percentile q used to normalize image before visualization (default=0.9)

        When savedir is None the figure is shown; otherwise the figure object
        is returned to the caller.
        """
        show = True if savedir is None else False
        if channels is not None:
            if not all([cl in self.channels for cl in channels]):
                print("At least one of the channels not available, visualizing all channels instead!")
                channels = None
        if channels is None:  # if no desired channels specified, check all channels
            channels = self.channels
        if len(channels) <= ncols:
            ax_nrow = 1
            ax_ncol = len(channels)
        else:
            ax_ncol = ncols
            ax_nrow = int(np.ceil(len(channels) / ncols))
        fig, axes = plt.subplots(ax_nrow, ax_ncol, figsize=(3 * ax_ncol, 3 * ax_nrow))
        # Normalize axes to a 2-D array so indexing works for any grid shape.
        if ax_nrow == 1:
            axes = np.array([axes])
        if ax_ncol == 1:
            axes = np.expand_dims(axes, axis=1)
        for i, _ in enumerate(channels):
            _ax_nrow = int(np.floor(i / ax_ncol))
            _ax_ncol = i % ax_ncol
            _i = self.channels.index(_)
            image = self.image[..., _i]
            # Quantile-normalize for display; guard against an all-zero quantile.
            percentile_q = np.quantile(image, vis_q) if np.quantile(image, vis_q) != 0 else 1
            image = np.clip(image / percentile_q, 0, 1)
            axes[_ax_nrow, _ax_ncol].set_title(_)
            if xlim is not None:
                image = image[:, xlim[0]:xlim[1]]
            if ylim is not None:
                image = image[ylim[0]:ylim[1], :]
            im = axes[_ax_nrow, _ax_ncol].imshow(image, cmap="gray")
            if colorbar:
                fig.colorbar(im, ax=axes[_ax_nrow, _ax_ncol])
        plt.tight_layout(pad=1.2)
        if show:
            plt.show()
        else:
            return fig

    def remove_special_channels(self, channels: List):
        """Delete the given channels (and their marker/label entries) from the image."""
        for channel in channels:
            if channel not in self.channels:
                print("Channel {} not available, escaping...".format(channel))
                continue
            idx = self.channels.index(channel)
            self.channels.pop(idx)
            self.markers.pop(idx)
            self.labels.pop(idx)
            self.image = np.delete(self.image, idx, axis=2)

            # Keep the tabular view (if any) in sync with the image channels.
            if hasattr(self, "df"):
                self.df.drop(columns=channel, inplace=True)

    def define_special_channels(
            self,
            channels_dict: Dict,
            q: float = 0.95,
            overwrite: bool = False,
            verbose: bool = False,
            rm_key: str = 'nuclei'):
        """
        Create synthesized channels (e.g. 'nuclei', 'membrane') by summing
        quantile-normalized existing channels.

        Args:
            channels_dict: {new channel name: [existing channel names to sum]}
            q: quantile used to normalize each source channel before summing
            overwrite: replace an already-defined channel of the same name
            verbose: print progress information
            rm_key: only source channels of this new channel are collected for removal

        Returns:
            list of source channel names that may be removed afterwards.
        """
        channels_rm = []

        # new_name is the key from channels_dict, old_names contains a list of existing channel names
        for new_name, old_names in channels_dict.items():
            if len(old_names) == 0:
                continue
            if new_name in self.channels and (not overwrite):
                print("Warning: {} is already present, skipping...".format(new_name))
                continue
            if new_name in self.channels and overwrite:
                print("Warning: {} is already present, overwriting...".format(new_name))
                idx = self.channels.index(new_name)
                self.image = np.delete(self.image, idx, axis=2)
                self.channels.pop(idx)

            # Keep only the source channels that actually exist.
            old_nms = []
            for i, old_name in enumerate(old_names):
                if old_name not in self.channels:
                    warnings.warn('{} is not available!'.format(old_name))
                    continue
                old_nms.append(old_name)
            if verbose:
                print("Defining channel '{}' by summing up channels: {}.".format(new_name, ', '.join(old_nms)))

            if len(old_nms) > 0:
                # only add channels to removal list if matching remove key
                if new_name == rm_key:
                    channels_rm += old_nms
                for i, old_name in enumerate(old_nms):
                    _i = self.channels.index(old_name)
                    _image = self.image[..., _i]
                    percentile_q = np.quantile(_image, q) if np.quantile(_image, q) != 0 else 1
                    _image = np.clip(_image / percentile_q, 0, 1)  # quantile normalization
                    if i == 0:
                        image = _image
                    else:
                        image += _image
                if verbose:
                    print(f"Original image shape: {self.image.shape}")
                # Append the synthesized channel as the last image plane.
                self.image = np.dstack([self.image, image[:, :, None]])
                if verbose:
                    print(f"Image shape after defining special channel(s) {self.image.shape}")

                if new_name not in self.channels:
                    self.channels.append(new_name)

        # Track which channel names were synthesized (used by extract_features).
        if hasattr(self, "defined_channels"):
            for key in channels_dict.keys():
                self.defined_channels.add(key)
        else:
            setattr(self, "defined_channels", set(list(channels_dict.keys())))
        return channels_rm
1143
+
1144
# Helper used when binarizing marker-expression columns.
def apply_threshold_to_column(column, threshold):
    """
    Apply a threshold to a column of data and convert it to binary.

    @param column: The input column of data to be thresholded.
    @param threshold: The threshold value to compare the elements in the column.

    @return: A binary array where True represents values meeting or exceeding the threshold,
             and False represents values below the threshold.
    """
    is_positive = column >= threshold
    return is_positive
1157
class CytofCohort():
    # Container for a cohort (multiple slides/ROIs) of CyTOF images and the
    # cohort-level feature tables, clustering and interaction analyses.

    def __init__(self, cytof_images: Optional[dict] = None,
                 df_cohort: Optional[pd.DataFrame] = None,
                 dir_out: str = "./",
                 cohort_name: str = "cohort1"):
        """Create a cohort.

        @param cytof_images: dict mapping "{slide}_{roi}" -> CytofImage object.
        @param df_cohort: dataframe with columns Slide | ROI | input file
            (built automatically by batch_process_feature when None).
        @param dir_out: parent output directory; results go to dir_out/cohort_name.
        @param cohort_name: name used for the output sub-directory.
        """
        self.cytof_images = cytof_images or {}
        self.df_cohort = df_cohort  # the slide-ROI table (may be None until batch processing)
        # named feature subsets; "morphology" entries are always appended when present
        self.feat_sets = {
            "all": ["cell_sum", "cell_ave", "cell_morphology"],
            "cell_sum": ["cell_sum", "cell_morphology"],
            "cell_ave": ["cell_ave", "cell_morphology"],
            "cell_sum_only": ["cell_sum"],
            "cell_ave_only": ["cell_ave"]
        }

        self.name = cohort_name
        self.dir_out = os.path.join(dir_out, self.name)
        if not os.path.exists(self.dir_out):
            os.makedirs(self.dir_out)

    def __getitem__(self, key):
        """Extract a particular cytof image ("{slide}_{roi}") from the cohort."""
        return self.cytof_images[key]

    def __str__(self):
        return f"CytofCohort {self.name}"

    def __repr__(self):
        return f"CytofCohort(name={self.name})"

    def save_cytof_cohort(self, savename):
        """Pickle the whole cohort to ``savename``, creating parent dirs as needed."""
        directory = os.path.dirname(savename)
        if not os.path.exists(directory):
            os.makedirs(directory)
        pkl.dump(self, open(savename, "wb"))
1196
def batch_process_feature(self):
    """
    Batch process: if the CytofCohort is initialized by a dictionary of CytofImages.

    Collects per-image metadata (slide, ROI, input filename), records the set
    of normalization quantiles common to every image in ``self.normqs``, runs
    batch feature scaling, and builds ``self.df_cohort`` when it was not
    supplied at construction time.
    """

    slides, rois, fs_input = [], [], []
    qs = None  # running intersection of normalization quantiles across images
    for n, cytof_img in self.cytof_images.items():
        # adopt the feature dictionary / marker list from the first image seen
        if not hasattr(self, "dict_feat"):
            setattr(self, "dict_feat", cytof_img.features)
        if not hasattr(self, "markers"):
            setattr(self, "markers", cytof_img.markers)

        print('dict quantiles in batch process:', cytof_img.dict_quantiles)
        # BUGFIX: the original did `qs &= ...` wrapped in a bare `except:` to
        # initialize qs on the first iteration; that also silently reset qs on
        # *any* other error (e.g. a missing dict_quantiles attribute).
        # Initialize explicitly instead.
        keys = set(cytof_img.dict_quantiles.keys())
        qs = keys if qs is None else (qs & keys)

        slides.append(cytof_img.slide)
        rois.append(cytof_img.roi)
        fs_input.append(cytof_img.filename)

    setattr(self, "normqs", qs)
    # scale feature (in a batch)
    df_scale_params = self.scale_feature()
    setattr(self, "df_scale_params", df_scale_params)
    if self.df_cohort is None:
        self.df_cohort = pd.DataFrame({"Slide": slides, "ROI": rois, "input file": fs_input})
1226
def batch_process(self, params: Dict):
    """Run the full single-ROI pipeline for every row of ``self.df_cohort``.

    Each row supplies (Slide, ROI, input file); the external
    ``process_single_roi`` CLI module performs segmentation/feature
    extraction and the resulting CytofImage is stored under "{slide}_{roi}".
    Finally cohort-level feature aggregation is run.

    @param params: dict of optional pipeline parameters (thresholds, channel
        maps, cell radius, normalization quantiles, ...). Missing keys fall
        back to the defaults shown below.
    """
    # NOTE(review): relies on the sibling CLIscripts package being importable
    # relative to the working directory — confirm deployment layout.
    sys.path.append("../CLIscripts")
    from process_single_roi import process_single, SetParameters
    # assumes df_cohort has exactly the three columns Slide | ROI | input file,
    # in that order — iterrows() row unpacking depends on it
    for i, (slide, roi, fname) in self.df_cohort.iterrows():
        paramsi = SetParameters(filename=fname,
                                outdir=self.dir_out,
                                label_marker_file=params.get('label_marker_file', None),
                                slide=slide,
                                roi=roi,
                                quality_control_thres=params.get("quality_control_thres", 50),
                                channels_remove=params.get("channels_remove", None),
                                channels_dict=params.get("channels_dict", None),
                                use_membrane=params.get("use_membrane", True),
                                cell_radius=params.get("cell_radius", 5),
                                normalize_qs=params.get("normalize_qs", 75),
                                iltype=params.get('iltype', None))

        cytof_img = process_single(paramsi, downstream_analysis=False, verbose=False)
        self.cytof_images[f"{slide}_{roi}"] = cytof_img

    self.batch_process_feature()
1248
def get_feature(self,
                normq: int = 75,
                feat_type: str = "normed_scaled",
                verbose: bool = False):
    """
    Get a specific set of feature for the cohort.
    The set is defined by `normq` and `feat_type`.

    Concatenates the per-ROI feature dataframes of every CytofImage into one
    cohort-level dataframe and stores it as attribute
    ``df_feature`` (feat_type == "") or ``df_feature_{normq}{feat_type}``.

    @param normq: normalization quantile identifying the feature variant.
    @param feat_type: one of "normed_scaled", "normed", "" (raw features).
    @param verbose: print the name of the attribute that was set.
    """

    assert feat_type in ["normed_scaled", "normed", ""], f"feature type {feat_type} not supported!"

    # lazily build the raw cohort-level feature table the first time any
    # normalized/scaled variant is requested (used later for filename lookups)
    if feat_type != "" and not hasattr(self, "df_feature"):
        orig_dfs = {}
        for f_roi, cytof_img in self.cytof_images.items():
            orig_dfs[f_roi] = getattr(cytof_img, "df_feature")
        setattr(self, "df_feature", pd.concat([_ for key, _ in orig_dfs.items()]).reset_index(drop=True))

    feat_name = feat_type if feat_type == "" else f"_{normq}{feat_type}"
    n_attr = f"df_feature{feat_name}"

    # concatenate the requested per-ROI variant across the cohort
    dfs = {}
    for f_roi, cytof_img in self.cytof_images.items():
        dfs[f_roi] = getattr(cytof_img, n_attr)
    setattr(self, n_attr, pd.concat([_ for key, _ in dfs.items()]).reset_index(drop=True))
    if verbose:
        print("The attribute name of the feature: {}".format(n_attr))
1275
def scale_feature(self):
    """Z-score-scale features for all normalization q values.

    For each normq in ``self.normqs``: take the cohort-level
    ``df_feature_{normq}normed`` table, compute per-column mean/std over the
    scalable feature columns, and store the scaled copy as
    ``df_feature_{normq}normed_scaled``.

    @return: df_scale_params — a two-row dataframe (row 0 = mean, row 1 = std)
        for the LAST normq processed (original behavior preserved).
    """
    cytof_img = list(self.cytof_images.values())[0]
    # features to be scaled: every df_feature column whose name starts with a
    # known per-cell feature name (may contain duplicates if prefixes overlap)
    s_features = [col for key, features in cytof_img.features.items()
                  for f in features
                  for col in cytof_img.df_feature.columns if col.startswith(f)]

    for normq in self.normqs:
        n_attr = f"df_feature_{normq}normed"
        n_attr_scaled = f"df_feature_{normq}normed_scaled"

        if not hasattr(self, n_attr):
            self.get_feature(normq=normq, feat_type="normed")

        df_feature = getattr(self, n_attr)

        # calculate scaling parameters: row 0 = mean, row 1 = std
        df_scale_params = df_feature[s_features].mean().to_frame(name="mean").transpose()
        df_scale_params = pd.concat([df_scale_params, df_feature[s_features].std().to_frame(name="std").transpose()])

        m = df_scale_params[df_scale_params.columns].iloc[0]  # mean
        s = df_scale_params[df_scale_params.columns].iloc[1]  # std.dev

        df_feature_scale = copy.deepcopy(df_feature)

        # BUGFIX: the original asserted that df_scale_params' columns are a
        # subset of *themselves* (always true). The intended sanity check is
        # that every scaling column exists in the feature table being scaled.
        missing = [x for x in df_scale_params.columns if x not in df_feature_scale.columns]
        assert len(missing) == 0, f"scaling columns missing from feature table: {missing}"

        # z-score scaling: (x - mean) / std
        df_feature_scale[df_scale_params.columns] = (df_feature_scale[df_scale_params.columns] - m) / s
        setattr(self, n_attr_scaled, df_feature_scale)
    return df_scale_params
1309
def _get_feature_subset(self,
                        normq: int = 75,
                        feat_type: str = "normed_scaled",
                        feat_set: str = "all",
                        markers: str = "all",
                        verbose: bool = False):
    """Select a subset of cohort features by variant, feature set and markers.

    @param normq: normalization quantile identifying the feature variant.
    @param feat_type: "normed_scaled", "normed" or "" (raw).
    @param feat_set: key into ``self.feat_sets`` (e.g. "all", "cell_sum_only").
    @param markers: "all" or a list of marker names (subset of self.markers).
    @param verbose: print the source attribute name.
    @return: (df_feature subset, markers actually used, selected feature names,
        human-readable description, source attribute name).
    """

    assert feat_type in ["normed_scaled", "normed", ""], f"feature type {feat_type} not supported!"
    assert (markers == "all" or isinstance(markers, list))
    assert feat_set in self.feat_sets.keys(), f"feature set {feat_set} not supported!"

    description = "original" if feat_type == "" else f"{normq}{feat_type}"
    n_attr = f"df_feature{feat_type}" if feat_type == "" else f"df_feature_{normq}{feat_type}"  # the attribute name to achieve from cytof_img

    if not hasattr(self, n_attr):
        self.get_feature(normq, feat_type)
    if verbose:
        print("\nThe attribute name of the feature: {}".format(n_attr))

    feat_names = []  # a list of feature names
    for y in self.feat_sets[feat_set]:
        if "morphology" in y:
            # morphology features are marker-independent — always kept whole
            feat_names += self.dict_feat[y]
        else:
            if markers == "all":  # features extracted from all markers are kept
                feat_names += self.dict_feat[y]
                markers = self.markers
            else:  # only features correspond to markers kept (markers are a subset of self.markers)
                # assumes dict_feat[y] is ordered the same as self.markers
                ids = [self.markers.index(x) for x in markers]  # TODO: the case where marker in markers not in self.markers???
                feat_names += [self.dict_feat[y][x] for x in ids]

    df_feature = getattr(self, n_attr)[feat_names]
    return df_feature, markers, feat_names, description, n_attr
1343
###############################################################
################## PhenoGraph Clustering ######################
###############################################################
def clustering_phenograph(self,
                          normq: int = 75,
                          feat_type: str = "normed_scaled",
                          feat_set: str = "all",
                          pheno_markers: Union[str, List] = "all",
                          k: int = None,
                          save_vis: bool = False,
                          verbose: bool = True):
    """Cluster cells across the cohort with PhenoGraph (or KMeans for small k).

    Results (input data, cluster labels, 2-D UMAP projection, metadata) are
    stored in ``self.phenograph`` under a descriptive key which is returned.

    @param normq / feat_type / feat_set / pheno_markers: feature selection,
        forwarded to ``_get_feature_subset``.
    @param k: number of nearest neighbors for PhenoGraph; defaults to
        n_cells / 100. Values below 10 switch to KMeans clustering.
    @param save_vis: kept for interface compatibility (not used here).
    @param verbose: print progress information.
    @return: key into ``self.phenograph`` for this clustering run.
    """

    # tag used in the phenograph key to record whether all markers were used
    if pheno_markers == "all":
        pheno_markers_ = "_all"
    else:
        pheno_markers_ = "_subset1"

    assert feat_type in ["normed_scaled", "normed", ""], f"feature type {feat_type} not supported!"
    df_feature, pheno_markers, feat_names, description, n_attr = self._get_feature_subset(normq=normq,
                                                                                          feat_type=feat_type,
                                                                                          feat_set=feat_set,
                                                                                          markers=pheno_markers,
                                                                                          verbose=verbose)
    # set number of nearest neighbors k and run PhenoGraph for phenotype clustering
    k = k if k else int(df_feature.shape[0] / 100)
    if k < 10:
        # too few cells for a meaningful PhenoGraph run — fall back to k-means
        k = min(df_feature.shape[0] - 1, 10)

        # perform k-means algorithm for small k
        kmeans = KMeans(n_clusters=k, random_state=42).fit(df_feature)
        communities = kmeans.labels_
    else:
        communities, graph, Q = phenograph.cluster(df_feature, k=k, n_jobs=-1)  # run PhenoGraph

    # project to 2D using UMAP
    umap_2d = umap.UMAP(n_components=2, init='random', random_state=0)
    proj_2d = umap_2d.fit_transform(df_feature)

    if not hasattr(self, "phenograph"):
        setattr(self, "phenograph", {})
    key_pheno = f"{description}_{feat_set}_feature_{k}"
    key_pheno += f"{pheno_markers_}_markers"

    N = len(np.unique(communities))
    self.phenograph[key_pheno] = {
        "data": df_feature,
        "markers": pheno_markers,
        "features": feat_names,
        "description": {"normalization": description, "feature_set": feat_set},  # normalization and/or scaling | set of feature (in self.feat_sets)
        "communities": communities,
        "proj_2d": proj_2d,
        "N": N,
        "feat_attr": n_attr
    }

    if verbose:
        print(f"\n{N} communities found. The dictionary key for phenograph: {key_pheno}.")
    return key_pheno
1403
def _gather_roi_pheno(self, key_pheno):
    """Split whole-cohort phenograph output into per-ROI pieces.

    Rows are matched to ROIs by comparing the feature table's "filename"
    column against the "input file" column of ``self.df_cohort``.

    @param key_pheno: key into ``self.phenograph``.
    @return: three dicts keyed by ROI name — feature dataframe, 2-D UMAP
        projection, and cluster labels.
    """
    df_slide_roi = self.df_cohort
    pheno_out = self.phenograph[key_pheno]
    df_feat_all = getattr(self, pheno_out['feat_attr'])  # original feature (to use the slide/roi/filename info) data
    df_pheno_all = pheno_out['data']  # phenograph data
    proj_2d_all = pheno_out['proj_2d']
    communities_all = pheno_out['communities']

    df_feature_roi, proj_2d_roi, communities_roi = {}, {}, {}
    for i in self.df_cohort.index:  # Slide | ROI | input file
        roi_i = df_slide_roi.loc[i, "ROI"]
        f_in = df_slide_roi.loc[i, "input file"]
        # boolean mask selecting the rows that came from this ROI's input file
        cond = df_feat_all["filename"] == f_in
        df_feature_roi[roi_i] = df_pheno_all.loc[cond, :]
        proj_2d_roi[roi_i] = proj_2d_all[cond, :]
        communities_roi[roi_i] = communities_all[cond]
    return df_feature_roi, proj_2d_roi, communities_roi
1423
def vis_phenograph(self,
                   key_pheno: str,
                   level: str = "cohort",
                   accumul_type: Union[List[str], str] = "cell_sum",  # ["cell_sum", "cell_ave"]
                   normalize: bool = False,
                   save_vis: bool = False,
                   show_plots: bool = False,
                   plot_together: bool = True,
                   fig_width: int = 5  # only when plot_together is True
                   ):
    """Visualize a phenograph clustering at cohort / slide / ROI level.

    Produces (a) a 2-D scatter of the UMAP projection colored by cluster and
    (b) per-cluster mean marker-expression heatmaps for each accumulation
    type, either combined in one figure (plot_together) or separately.

    @param key_pheno: key into ``self.phenograph``.
    @param level: "cohort", "slide" or "roi" — granularity of the plots.
    @param accumul_type: accumulation feature type(s) to plot; None plots all
        non-morphology members of the feature set.
    @param normalize: subtract the per-marker median across clusters.
    @param save_vis: write PNGs under dir_out/phenograph/.
    @param show_plots / plot_together / fig_width: display options.
    @return: (per-key feature dfs, cluster labels, expression matrices,
        combined figures, scatter figures, expression figures).
    """
    assert level.upper() in ["COHORT", "SLIDE", "ROI"], "Only 'cohort', 'slide' and 'roi' are accetable values for level"
    this_pheno = self.phenograph[key_pheno]
    feat_names = this_pheno['features']
    descrip = this_pheno['description']
    n_community = this_pheno['N']
    markers = this_pheno['markers']
    feat_set = self.feat_sets[descrip['feature_set']]

    if save_vis:
        vis_savedir = os.path.join(self.dir_out, "phenograph", key_pheno + f"-{n_community}clusters")
        if not os.path.exists(vis_savedir):
            os.makedirs(vis_savedir)
    else:
        vis_savedir = None

    if accumul_type is None:  # by default, visualize all accumulation types
        accumul_type = [_ for _ in feat_set if "morphology" not in _]
    if isinstance(accumul_type, str):
        accumul_type = [accumul_type]

    proj_2d = this_pheno['proj_2d']
    df_feature = this_pheno['data']
    communities = this_pheno['communities']

    # assemble the per-plot dicts at the requested granularity
    if level.upper() == "COHORT":
        proj_2ds = {"cohort": proj_2d}
        df_feats = {"cohort": df_feature}
        commus = {"cohort": communities}
    else:
        df_feats, proj_2ds, commus = self._gather_roi_pheno(key_pheno)
        if level.upper() == "SLIDE":
            for slide in self.df_cohort["Slide"].unique():  # for each slide
                # merge the slide's ROIs into one entry, then drop the ROI entries
                f_rois = [roi_i.replace(".txt", "") for roi_i in
                          self.df_cohort.loc[self.df_cohort["Slide"] == slide, "ROI"]]
                df_feats[slide] = pd.concat([df_feats[f_roi] for f_roi in f_rois])
                proj_2ds[slide] = np.concatenate([proj_2ds[f_roi] for f_roi in f_rois])
                commus[slide] = np.concatenate([commus[f_roi] for f_roi in f_rois])
                for f_roi in f_rois:
                    df_feats.pop(f_roi)
                    proj_2ds.pop(f_roi)
                    commus.pop(f_roi)

    figs = {}  # if plot_together

    figs_scatter = {}  # if not plot_together
    figs_exps = {}

    cluster_protein_exps = {}
    for key, df_feature in df_feats.items():
        if plot_together:
            ncol = len(accumul_type) + 1
            fig, axs = plt.subplots(1, ncol, figsize=(ncol * fig_width, fig_width))
        proj_2d = proj_2ds[key]
        commu = commus[key]
        # Visualize 1: plot 2d projection together
        print("Visualization in 2d - {}-{}".format(level, key))
        savename = os.path.join(vis_savedir, f"cluster_scatter_{level}_{key}.png") if (save_vis and not plot_together) else None
        ax = axs[0] if plot_together else None
        fig_scatter = visualize_scatter(data=proj_2d, communities=commu, n_community=n_community,
                                        title=key, savename=savename, show=show_plots, ax=ax)
        figs_scatter[key] = fig_scatter

        figs_exps[key] = {}
        # Visualize 2: protein expression
        for axid, acm_tpe in enumerate(accumul_type):
            # select feature columns belonging to this accumulation type
            ids = [i for (i, x) in enumerate(feat_names) if re.search(".{}".format(acm_tpe), x)]
            feat_names_ = [feat_names[i] for i in ids]

            cluster_protein_exp = np.zeros((n_community, len(markers)))

            group_ids = np.arange(len(np.unique(communities)))
            for cluster in range(len(np.unique(communities))):  # for each (global) community
                df_sub = df_feature.loc[commu == cluster]
                if df_sub.shape[0] == 0:
                    # cluster not present at this level — drop it from the plot
                    group_ids = np.delete(group_ids, group_ids == cluster)
                    continue

                # number of markers should match # of features extracted.
                for i, feat in enumerate(feat_names_):
                    cluster_protein_exp[cluster, i] = np.average(df_sub[feat])

            if normalize:
                # center each marker on its median across clusters
                cluster_protein_exp_norm = cluster_protein_exp - np.median(cluster_protein_exp, axis=0)
                # set non-exist clusters to NaN rather than removing their rows
                rid = set(np.arange(len(np.unique(communities)))) - set(group_ids)
                if len(rid) > 0:
                    rid = np.array(list(rid))
                    cluster_protein_exp_norm[rid, :] = np.nan
                group_ids = np.arange(len(np.unique(communities)))
            savename = os.path.join(vis_savedir, f"protein_expression_{level}_{acm_tpe}_{key}.png") \
                if (save_vis and not plot_together) else None
            vis_exp = cluster_protein_exp_norm if normalize else cluster_protein_exp
            ax = axs[axid + 1] if plot_together else None
            fig_exps = visualize_expression(data=vis_exp, markers=markers,
                                            group_ids=group_ids, title="{} - {}-{}".format(level, acm_tpe, key),
                                            savename=savename, show=show_plots, ax=ax)
            figs_exps[key][acm_tpe] = fig_exps
        cluster_protein_exps[key] = vis_exp
        plt.tight_layout()
        if plot_together:
            figs[key] = fig
            if save_vis:
                # NOTE(review): acm_tpe here is the *last* accumulation type of
                # the loop above — confirm the combined-figure filename is intended
                plt.savefig(os.path.join(vis_savedir, f"phenograph_{level}_{acm_tpe}_{key}.png"), dpi=300)
            if show_plots:
                plt.show()
    if not show_plots:
        plt.close("all")
    return df_feats, commus, cluster_protein_exps, figs, figs_scatter, figs_exps
1546
def attach_individual_roi_pheno(self, key_pheno, override=False):
    """Attach PhenoGraph outputs to each individual CytofImage (ROI).

    Rows of the cohort-level phenograph output are routed to each image by
    matching the "filename" column of self.df_feature against the image's
    original filename.

    @param key_pheno: key into ``self.phenograph``.
    @param override: re-attach even if the image already has this key.
    """
    assert key_pheno in self.phenograph.keys(), "Pheno-Graph with {} not available!".format(key_pheno)
    phenograph = self.phenograph[key_pheno]  # data, markers, features, description, communities, proj_2d, N

    for n, cytof_img in self.cytof_images.items():
        if not hasattr(cytof_img, "phenograph"):
            setattr(cytof_img, "phenograph", {})
        if key_pheno in cytof_img.phenograph and not override:
            print("\n{} already attached for {}-{}, skipping ... ".format(key_pheno, cytof_img.slide, cytof_img.roi))
            continue

        # mask of cohort-level rows that belong to this image
        cond = self.df_feature['filename'] == cytof_img.filename  # cytof_img.filename: original file name
        data = phenograph['data'].loc[cond, :]

        communities = phenograph['communities'][cond.values]
        proj_2d = phenograph['proj_2d'][cond.values]

        # phenograph for this image
        this_phenograph = {"data": data,
                           "markers": phenograph["markers"],
                           "features": phenograph["features"],
                           "description": phenograph["description"],
                           "communities": communities,
                           "proj_2d": proj_2d,
                           "N": phenograph["N"]
                           }

        cytof_img.phenograph[key_pheno] = this_phenograph
1579
def _gather_roi_kneighbor_graphs(self, key_pheno: str, method: str = "distance", **kwars: dict) -> dict:
    """ Define adjacency community for each cell based on either k-nearest neighbor or distance
    Args:
        key_pheno: dictionary key for a specific phenograph output
        method: method to construct the adjacency matrix, choose from "distance" and "kneighbor"
        **kwars: used to specify distance threshold (thres) for "distance" method or number of
            neighbors (k) for "kneighbor" method
            (NOTE: parameter name is "kwars" — kept as-is for interface compatibility)
    Output:
        networks: (dict) ROI level network that will be used for cluster interaction analysis;
            per ROI: edge counts per cluster pair ('edge_nums'), expected co-occurrence
            products ('expected_percentage') and cell count ('num_cell').
    """
    assert method in ["distance", "kneighbor"], "Method can be either 'distance' or 'kneighbor'!"
    default_thres = {
        "thres": 50,
        "k": 8
    }
    # pick the parameter relevant to the chosen method
    _ = "k" if method == "kneighbor" else "thres"
    thres = kwars.get(_, default_thres[_])
    print("{}: {}".format(_, thres))
    df_pheno_feat = getattr(self, self.phenograph[key_pheno]['feat_attr'])
    n_cluster = self.phenograph[key_pheno]['N']
    cluster = self.phenograph[key_pheno]['communities']
    df_slide_roi = getattr(self, "df_cohort")

    networks = {}
    if method == "kneighbor":  # construct K-neighbor graph
        for i, row in df_slide_roi.iterrows():  # Slide | ROI | input file
            slide, roi, f_in = row["Slide"], row["ROI"], row["input file"]
            cond = df_pheno_feat['filename'] == f_in
            if cond.sum() == 0:
                continue
            _cluster = cluster[cond.values]
            df_sub = df_pheno_feat.loc[cond, :]
            # here `thres` plays the role of k (number of neighbors)
            graph = skgraph(np.array(df_sub.loc[:, ['coordinate_x', 'coordinate_y']]),
                            n_neighbors=thres, mode='distance')
            # NOTE(review): the toarray() result is discarded — likely a leftover
            graph.toarray()
            I, J, V = sp.find(graph)
            networks[roi] = dict()
            networks[roi]['I'] = I  # from cell
            networks[roi]['J'] = J  # to cell
            networks[roi]['V'] = V  # distance value
            networks[roi]['network'] = graph

            # Edge type summary: count edges between each cluster pair
            edge_nums = np.zeros((n_cluster, n_cluster))
            for _i, _j in zip(I, J):
                edge_nums[_cluster[_i], _cluster[_j]] += 1
            networks[roi]['edge_nums'] = edge_nums

            # product of cluster sizes — normalized by num_cell**2 downstream
            expected_percentage = np.zeros((n_cluster, n_cluster))
            for _i in range(n_cluster):
                for _j in range(n_cluster):
                    expected_percentage[_i, _j] = sum(_cluster == _i) * sum(_cluster == _j)  # / len(df_sub)**2
            networks[roi]['expected_percentage'] = expected_percentage
            networks[roi]['num_cell'] = len(df_sub)
    else:  # construct neighborhood matrix using distance cut-off
        cal_dist = DistanceMetric.get_metric('euclidean')
        for i, row in df_slide_roi.iterrows():  # Slide | ROI | input file
            slide, roi, f_in = row["Slide"], row["ROI"], row["input file"]
            cond = df_pheno_feat['filename'] == f_in
            if cond.sum() == 0:
                continue
            networks[roi] = dict()
            _cluster = cluster[cond.values]
            df_sub = df_pheno_feat.loc[cond, :]
            # full pairwise Euclidean distance matrix (O(n^2) memory)
            dist = cal_dist.pairwise(df_sub.loc[:, ['coordinate_x', 'coordinate_y']].values)
            networks[roi]['dist'] = dist

            # expected percentage (product of cluster sizes)
            expected_percentage = np.zeros((n_cluster, n_cluster))
            for _i in range(n_cluster):
                for _j in range(n_cluster):
                    expected_percentage[_i, _j] = sum(_cluster == _i) * sum(_cluster == _j)  # / len(df_sub)**2
            networks[roi]['expected_percentage'] = expected_percentage
            n_cells = len(df_sub)

            # edge num: count cell pairs closer than the distance cut-off
            edge_nums = np.zeros_like(expected_percentage)
            for _i in range(n_cells):
                for _j in range(n_cells):
                    if dist[_i, _j] > 0 and dist[_i, _j] < thres:
                        edge_nums[_cluster[_i], _cluster[_j]] += 1
            networks[roi]['edge_nums'] = edge_nums
            networks[roi]['num_cell'] = n_cells
    return networks
1664
def cluster_interaction_analysis(self, key_pheno, method="distance", level="slide", clustergrid=None, viz=False, **kwars):
    """Interaction analysis for clusters.

    Builds spatial neighbor graphs per ROI, optionally aggregates them per
    slide, and computes a log-odds interaction score per cluster pair:
    log10(observed edge fraction / expected fraction). Heatmaps and
    clustermaps are plotted for each slide/ROI, and the phenograph output is
    attached back to each individual image.

    @param key_pheno: key into ``self.phenograph``.
    @param method: "distance" or "kneighbor" neighbor definition.
    @param level: "slide" (sum ROI graphs per slide) or "roi".
    @param clustergrid: reuse a previous seaborn clustermap's row ordering;
        computed from the data when None.
    @param viz: NOTE(review): currently unused — plots are always produced.
    @param kwars: "thres" (distance cut-off) or "k" (neighbor count).
    @return: (interacts dict of score matrices, clustergrid).
    """
    assert method in ["distance", "kneighbor"], "Method can be either 'distance' or 'kneighbor'!"
    assert level in ["slide", "roi"], "Level can be either 'slide' or 'roi'!"
    default_thres = {
        "thres": 50,
        "k": 8
    }
    _ = "k" if method == "kneighbor" else "thres"
    thres = kwars.get(_, default_thres[_])
    networks = self._gather_roi_kneighbor_graphs(key_pheno, method=method, **{_: thres})

    if level == "slide":
        # sum the additive per-ROI summaries into one entry per slide
        keys = ['edge_nums', 'expected_percentage', 'num_cell']
        for slide in self.df_cohort['Slide'].unique():
            cond = self.df_cohort['Slide'] == slide
            df_slide = self.df_cohort.loc[cond, :]
            rois = df_slide['ROI'].values
            networks[slide] = {}
            for key in keys:
                networks[slide][key] = sum([networks[roi][key] for roi in rois if roi in networks])
            for roi in rois:
                if roi in networks:
                    networks.pop(roi)

    interacts = {}
    epsilon = 1e-6  # avoids division by zero and log(0)
    for key, item in networks.items():
        edge_percentage = item['edge_nums'] / np.sum(item['edge_nums'])
        expected_percentage = item['expected_percentage'] / item['num_cell'] ** 2

        # Normalize: log-odds of observed vs expected interaction
        interact_norm = np.log10(edge_percentage / (expected_percentage + epsilon) + epsilon)
        # a score of exactly log10(epsilon) means "no observation", not strong avoidance
        interact_norm[interact_norm == np.log10(epsilon)] = 0
        interacts[key] = interact_norm

    # plot
    for f_key, interact in interacts.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(interact, center=np.log10(1 + epsilon),
                         cmap='RdBu_r', vmin=-1, vmax=1)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            # first pass: derive a row ordering from this matrix
            plt.figure()
            clustergrid = sns.clustermap(interact, center=np.log10(1 + epsilon),
                                         cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=np.arange(interact.shape[0]),
                                         yticklabels=np.arange(interact.shape[0]),
                                         figsize=(6, 6))

            plt.title(f_key)
            plt.show()

        # re-plot with rows/cols re-ordered by the (shared) dendrogram order
        plt.figure()
        sns.clustermap(interact[clustergrid.dendrogram_row.reordered_ind, :] \
                       [:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                       xticklabels=clustergrid.dendrogram_row.reordered_ind,
                       yticklabels=clustergrid.dendrogram_row.reordered_ind,
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()

    # IMPORTANT: attach to individual ROIs
    self.attach_individual_roi_pheno(key_pheno, override=True)
    return interacts, clustergrid
1739
###############################################################
###################### Marker Level ###########################
###############################################################

def generate_summary(self,
                     feat_type: str = "normed",
                     normq: int = 75,
                     vis_thres: bool = False,
                     accumul_type: Union[List[str], str] = "sum",
                     verbose: bool = False,
                     get_thresholds: Callable = _get_thresholds,
                     ) -> List:

    """ Generate marker positive summaries and attach to each individual CyTOF image in the cohort.

    Cohort-level positivity thresholds are computed per marker feature, then
    each image's features are thresholded via its marker_positive_summary.

    @param feat_type: "normed_scaled", "normed" or "".
    @param normq: normalization quantile identifying the feature variant.
    @param vis_thres: visualize threshold estimation.
    @param accumul_type: "sum" and/or "ave" accumulation features.
    @param verbose: verbosity flag for threshold estimation.
    @param get_thresholds: injectable threshold-estimation function.
    @return: list of attribute names created by marker_positive_summary
        (collected from the first image only).
    """
    accumul_type = [accumul_type] if isinstance(accumul_type, str) else accumul_type
    assert feat_type in ["normed_scaled", "normed", ""], f"feature type {feat_type} not supported!"
    feat_name = f"{feat_type}" if feat_type == "" else f"{normq}{feat_type}"
    n_attr = f"df_feature{feat_name}" if feat_type == "" else f"df_feature_{feat_name}"  # the attribute name to achieve from cytof_img
    df_feat = getattr(self, n_attr)

    # get thresholds (cached per normq/feat_type combination)
    thres = getattr(self, "marker_thresholds", {})
    thres[f"{normq}_{feat_type}"] = {}
    for _ in accumul_type:  # for either marker sum or marker average
        print(f"Getting thresholds for cell {_} of all markers.")
        thres[f"{normq}_{feat_type}"][f"cell_{_}"] = get_thresholds(df_feature=df_feat,
                                                                    features=self.dict_feat[f"cell_{_}"],
                                                                    visualize=vis_thres,
                                                                    verbose=verbose)
    setattr(self, "marker_thresholds", thres)

    # split to each ROI
    _attr_marker_pos, seen = [], 0
    self.df_cohort['Slide_ROI'] = self.df_cohort[['Slide', 'ROI']].agg('_'.join, axis=1)
    for n, cytof_img in self.cytof_images.items():  # ({slide}_{roi}, CytofImage)
        if not hasattr(cytof_img, n_attr):  # cytof_img object instance may not contain _scaled feature
            # NOTE(review): `cond` below is computed but never used
            cond = self.df_cohort['Slide_ROI'] == n
            input_file = self.df_cohort.loc[self.df_cohort['Slide_ROI'] == n, 'input file'].values[0]
            _df_feat = df_feat.loc[df_feat['filename'] == input_file].reset_index(drop=True)
            setattr(cytof_img, n_attr, _df_feat)
        else:
            _df_feat = getattr(cytof_img, n_attr)
        for _ in accumul_type:  # for either marker sum or marker average accumulation
            attr_marker_pos = cytof_img.marker_positive_summary(
                thresholds=thres[f"{normq}_{feat_type}"][f"cell_{_}"],
                feat_type=feat_type,
                normq=normq,
                accumul_type=_
            )
            # record the created attribute names once (same for every image)
            if seen == 0:
                _attr_marker_pos.append(attr_marker_pos)
        seen += 1
    return _attr_marker_pos
1795
def co_expression_analysis(self,
                           normq: int = 75,
                           feat_type: str = "normed",
                           co_exp_markers: Union[str, List] = "all",
                           accumul_type: Union[str, List[str]] = "sum",
                           verbose: bool = False,
                           clustergrid=None):
    """Per-slide marker co-expression analysis from binarized expression.

    ROI-level binary positive-expression tables are concatenated per slide;
    for every marker pair the log-odds of joint positivity vs the
    independence expectation is computed.

    @param normq / feat_type: identify the feature variant to use.
    @param co_exp_markers: NOTE(review): currently unused — all markers of the
        binary table are analyzed.
    @param accumul_type: accumulation type forwarded to each image's
        get_binary_pos_express_df.
    @param verbose / clustergrid: NOTE(review): currently unused here.
    @return: dict {slide: (log-odds matrix, marker column index)}.
    """

    # parameter checks and preprocess for analysis
    assert feat_type in ["original", "normed", "scaled"]
    if feat_type == "original":
        feat_name = ""
    elif feat_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    # go through each roi, get their binary marker-cell expression
    roi_binary_express_dict = dict()
    for i, cytof_img in enumerate(self.cytof_images.values()):
        slide, roi = cytof_img.slide, cytof_img.roi
        df_binary_pos_exp = cytof_img.get_binary_pos_express_df(feat_name, accumul_type)
        roi_binary_express_dict[roi] = df_binary_pos_exp

    df_slide_roi = self.df_cohort

    # in cohort analysis, co-expression is always analyzed per Slide.
    # per ROI analysis can be done by calling the cytof_img individually
    slide_binary_express_dict = dict()

    # concatenate all ROIs into one, for each slide
    for slide in df_slide_roi["Slide"].unique():
        rois_of_one_slide = df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]

        for i, filename_roi in enumerate(rois_of_one_slide):
            ind_roi = filename_roi.replace('.txt', '')

            if ind_roi not in roi_binary_express_dict:
                print(f'ROI {ind_roi} in self.df_cohort, but not found in co-expression dicts')
                continue

            try:  # adding to existing slide key
                # append dataframe row-wise, then perform co-expression analysis at the slide level
                slide_binary_express_dict[slide] = pd.concat([slide_binary_express_dict[slide], roi_binary_express_dict[ind_roi]], ignore_index=True)
            except KeyError:  # first iteration writing to slide, couldn't find the slide key
                slide_binary_express_dict[slide] = roi_binary_express_dict[ind_roi].copy()

    slide_co_expression_dict = dict()

    # for each slide, perform co-expression analysis
    for slide_key, large_binary_express in slide_binary_express_dict.items():

        n_cells, n_markers = large_binary_express.shape
        df_pos_exp_val = large_binary_express.values

        # list all pair-wise combinations of the markers
        column_combinations = list(product(range(n_markers), repeat=2))

        # numerator of the log odds ratio: observed joint-positive fraction
        co_positive_prob_matrix = np.zeros((n_markers, n_markers))

        # denominator of the log odds ratio: expected fraction under independence
        expected_prob_matrix = np.zeros((n_markers, n_markers))

        for combo in column_combinations:
            marker1, marker2 = combo

            # count cells that positively expresses in both marker 1 and 2
            positive_prob_marker1_and_2 = np.sum(np.logical_and(df_pos_exp_val[:, marker1], df_pos_exp_val[:, marker2])) / n_cells
            co_positive_prob_matrix[marker1, marker2] = positive_prob_marker1_and_2

            # pair (A,B) counts is the same as pair (B,A) counts
            co_positive_prob_matrix[marker2, marker1] = positive_prob_marker1_and_2

            # count expected cells if marker 1 and 2 are independently expressed
            # p(A and B) = p(A) * p(B) = num_pos_a * num_pos_b / (num_cells * num_cells)
            # p(A) = number of positive cells / number of cells
            exp_prob_in_marker1_and_2 = np.sum(df_pos_exp_val[:, marker1]) * np.sum(df_pos_exp_val[:, marker2]) / n_cells**2
            expected_prob_matrix[marker1, marker2] = exp_prob_in_marker1_and_2
            expected_prob_matrix[marker2, marker1] = exp_prob_in_marker1_and_2

        # theta(i_pos and j_pos)
        # NOTE(review): column labels come from df_binary_pos_exp, i.e. the
        # *last* ROI processed above — confirm all ROIs share column order
        df_co_pos = pd.DataFrame(co_positive_prob_matrix, index=df_binary_pos_exp.columns, columns=df_binary_pos_exp.columns)

        # E(x)
        df_expected = pd.DataFrame(expected_prob_matrix, index=df_binary_pos_exp.columns, columns=df_binary_pos_exp.columns)

        epsilon = 1e-6  # avoid divide by 0 or log(0)

        # Normalize and fix Nan
        edge_percentage_norm = np.log10(df_co_pos.values / (df_expected.values + epsilon) + epsilon)

        # if observed/expected = 0, then log odds ratio will have log10(epsilon)
        # no observed means co-expression cannot be determined, does not mean strong negative co-expression
        edge_percentage_norm[edge_percentage_norm == np.log10(epsilon)] = 0

        slide_co_expression_dict[slide_key] = (edge_percentage_norm, df_expected.columns)

    return slide_co_expression_dict
cytof/hyperion_analysis.py ADDED
@@ -0,0 +1,1477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import glob
4
+ import pickle as pkl
5
+
6
+ import copy
7
+ import numpy as np
8
+ import pandas as pd
9
+ import matplotlib.pyplot as plt
10
+ from matplotlib.pyplot import cm
11
+ import warnings
12
+ from tqdm import tqdm
13
+ import skimage
14
+
15
+ import phenograph
16
+ import umap
17
+ import seaborn as sns
18
+ from scipy.stats import spearmanr
19
+
20
+ import sys
21
+ import platform
22
+ from pathlib import Path
23
+ FILE = Path(__file__).resolve()
24
+ ROOT = FILE.parents[0] # cytof root directory
25
+ if str(ROOT) not in sys.path:
26
+ sys.path.append(str(ROOT)) # add ROOT to PATH
27
+ if platform.system() != 'Windows':
28
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
29
+ from classes import CytofImage, CytofImageTiff
30
+
31
+ import hyperion_preprocess as pre
32
+ import hyperion_segmentation as seg
33
+ from utils import load_CytofImage
34
+
35
+ # from cytof import hyperion_preprocess as pre
36
+ # from cytof import hyperion_segmentation as seg
37
+ # from cytof.utils import load_CytofImage
38
+
39
+
40
+
41
+
42
+
43
+ def _longest_substring(str1, str2):
44
+ ans = ""
45
+ len1, len2 = len(str1), len(str2)
46
+ for i in range(len1):
47
+ for j in range(len2):
48
+ match = ""
49
+ _len = 0
50
+ while ((i+_len < len1) and (j+_len < len2) and str1[i+_len] == str2[j+_len]):
51
+ match += str1[i+_len]
52
+ _len += 1
53
+ if len(match) > len(ans):
54
+ ans = match
55
+ return ans
56
+
57
def extract_feature(channels, raw_image, nuclei_seg, cell_seg, filename, show_head=False):
    """ Extract nuclei and cell level features from a cytof image based on the nuclei
    segmentation and cell segmentation results.
    Inputs:
        channels = channels to extract feature from (one name per image channel)
        raw_image = raw cytof image (H x W x n_channels)
        nuclei_seg = nuclei segmentation result (integer label image)
        cell_seg = cell segmentation result (integer label image)
        filename = filename of current cytof image (repeated into every output row)
        show_head = if True, print the first rows of the resulting dataframe
    Returns:
        feature_summary_df = a dataframe containing summary of extracted features,
                             one row per successfully segmented nucleus/cell pair

    :param channels: list
    :param raw_image: numpy.ndarray
    :param nuclei_seg: numpy.ndarray
    :param cell_seg: numpy.ndarray
    :param filename: string
    :param show_head: bool
    :return feature_summary_df: pandas.core.frame.DataFrame
    """
    assert len(channels) == raw_image.shape[-1]

    # morphology features to be extracted ("pa_ratio" is derived, not a regionprops attribute)
    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]

    ## morphology features
    nuclei_morphology = [_ + '_nuclei' for _ in morphology]  # morphology - nuclei level
    cell_morphology = [_ + '_cell' for _ in morphology]      # morphology - cell level

    ## single cell features
    # nuclei level
    sum_exp_nuclei = [_ + '_nuclei_sum' for _ in channels]  # sum expression over nuclei
    ave_exp_nuclei = [_ + '_nuclei_ave' for _ in channels]  # average expression over nuclei

    # cell level
    sum_exp_cell = [_ + '_cell_sum' for _ in channels]  # sum expression over cell
    ave_exp_cell = [_ + '_cell_ave' for _ in channels]  # average expression over cell

    # column names of final result dataframe
    column_names = ["filename", "id", "coordinate_x", "coordinate_y"] + \
                   sum_exp_nuclei + ave_exp_nuclei + nuclei_morphology + \
                   sum_exp_cell + ave_exp_cell + cell_morphology

    # Initiate: one list per output column
    res = {column_name: [] for column_name in column_names}

    n_nuclei = np.max(nuclei_seg)
    # labels start at 2 (matching the original code's convention for segmentation output)
    for nuclei_id in tqdm(range(2, n_nuclei + 1), position=0, leave=True):
        # Bug fix: locate BOTH regions before appending anything. The previous
        # version appended "filename"/"id" first and then `continue`d when a
        # region was missing, leaving those columns longer than the others and
        # making the final pd.DataFrame(res) construction fail.
        regions = skimage.measure.regionprops((nuclei_seg == nuclei_id) * 1)  # , coordinates='xy') (deprecated)
        if len(regions) < 1:
            continue
        this_nucleus = regions[0]

        regions = skimage.measure.regionprops((cell_seg == nuclei_id) * 1)  # , coordinates='xy') (deprecated)
        if len(regions) < 1:
            continue
        this_cell = regions[0]

        res["filename"].append(filename)
        res["id"].append(nuclei_id)

        centroid_y, centroid_x = this_nucleus.centroid  # y: rows; x: columns
        res['coordinate_x'].append(centroid_x)
        res['coordinate_y'].append(centroid_y)

        # morphology (all regionprops attributes except the derived last one)
        for i, feature in enumerate(morphology[:-1]):
            res[nuclei_morphology[i]].append(getattr(this_nucleus, feature))
            res[cell_morphology[i]].append(getattr(this_cell, feature))
        # pa_ratio: perimeter^2 / filled_area, a dimensionless shape descriptor
        res[nuclei_morphology[-1]].append(1.0 * this_nucleus.perimeter ** 2 / this_nucleus.filled_area)
        res[cell_morphology[-1]].append(1.0 * this_cell.perimeter ** 2 / this_cell.filled_area)

        # markers: sum / average expression of each channel inside the nucleus and cell masks
        for i, marker in enumerate(channels):
            res[sum_exp_nuclei[i]].append(np.sum(raw_image[nuclei_seg == nuclei_id, i]))
            res[ave_exp_nuclei[i]].append(np.average(raw_image[nuclei_seg == nuclei_id, i]))
            res[sum_exp_cell[i]].append(np.sum(raw_image[cell_seg == nuclei_id, i]))
            res[ave_exp_cell[i]].append(np.average(raw_image[cell_seg == nuclei_id, i]))

    feature_summary_df = pd.DataFrame(res)
    if show_head:
        print(feature_summary_df.head())
    return feature_summary_df
145
+
146
+
147
+ ###############################################################################
148
+ # def check_feature_distribution(feature_summary_df, features):
149
+ # """ Visualize feature distribution for each feature
150
+ # Inputs:
151
+ # feature_summary_df = dataframe of extracted feature summary
152
+ # features = features to check distribution
153
+ # Returns:
154
+ # None
155
+
156
+ # :param feature_summary_df: pandas.core.frame.DataFrame
157
+ # :param features: list
158
+ # """
159
+
160
+ # for feature in features:
161
+ # print(feature)
162
+ # fig, ax = plt.subplots(1, 1, figsize=(3, 2))
163
+ # ax.hist(np.log2(feature_summary_df[feature] + 0.0001), 100)
164
+ # ax.set_xlim(-15, 15)
165
+ # plt.show()
166
+
167
+
168
+
169
def feature_quantile_normalization(feature_summary_df, features, qs=[75,99]):
    """ Calculate the q-quantiles of selected features given quantile q values. Then perform q-quantile normalization
    on these features using calculated quantile values. The feature_summary_df will be updated in-place with new
    columns "feature_qnormed" generated and added. Meanwhile, visualize distribution of log2 features before and after
    q-normalization
    Inputs:
        feature_summary_df = dataframe of extracted feature summary
        features = features to be normalized
        qs = quantile q values (default=[75,99])
    Returns:
        quantiles = quantile values for each q
    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list
    :param qs: list
    :return quantiles: dict
    """
    # NOTE(review): qs=[75,99] is a mutable default argument; harmless here only
    # because the list is never mutated — confirm before extending this function.
    expressions = []  # pooled raw expression values of all features processed so far
    expressions_normed = dict((key, []) for key in qs)  # pooled normalized values, keyed by q
    quantiles = {}  # per-feature {q: quantile value} mapping (returned)
    colors = cm.rainbow(np.linspace(0, 1, len(qs)))  # one color per q for the axvline markers
    for feat in features:
        quantiles[feat] = {}
        expressions.extend(feature_summary_df[feat])

        # histogram of the pooled log2 expressions, with each q-quantile marked
        plt.hist(np.log2(np.array(expressions) + 0.0001), 100, density=True)
        for q, c in zip(qs, colors):
            # NOTE(review): the quantile is computed over `expressions`, which pools the
            # current feature with every feature processed before it (cumulative pooling),
            # not over this feature's values alone — confirm this is intended.
            quantile_val = np.quantile(expressions, q/100)
            quantiles[feat][q] = quantile_val
            plt.axvline(np.log2(quantile_val), label=f"{q}th percentile", c=c)
            print(f"{q}th percentile: {quantile_val}")

            # log-quantile normalization: divide by the quantile value, then log2;
            # the +0.0001 offset guards against log2(0)
            normed = np.log2(feature_summary_df.loc[:, feat] / quantile_val + 0.0001)
            feature_summary_df.loc[:, f"{feat}_{q}normed"] = normed
            expressions_normed[q].extend(normed)
        plt.xlim(-15, 15)
        plt.xlabel("log2(expression of all markers)")
        plt.legend()
        plt.show()

    # visualize before & after quantile normalization
    '''N = len(qs)+1 # (len(qs)+1) // 2 + (len(qs)+1) %2'''
    log_expressions = tuple([np.log2(np.array(expressions) + 0.0001)] + [expressions_normed[q] for q in qs])
    labels = ["before normalization"] + [f"after {q} normalization" for q in qs]
    fig, ax = plt.subplots(1, 1, figsize=(12, 7))
    ax.hist(log_expressions, 100, density=True, label=labels)
    ax.set_xlabel("log2(expressions for all markers)")
    plt.legend()
    plt.show()
    return quantiles
219
+
220
+
221
def feature_scaling(feature_summary_df, features, inplace=False):
    """Standardize the selected feature columns to zero mean and unit variance.

    Columns listed in `features` but absent from the dataframe are skipped with
    a printed warning. Normally, do not scale the nuclei sum feature.

    :param feature_summary_df: pandas.core.frame.DataFrame - extracted feature summary
    :param features: list - column names to scale
    :param inplace: bool - when True, modify `feature_summary_df` directly and return None;
                    otherwise scale a copy and return it (Default=False)
    :return: pandas.core.frame.DataFrame or None
    """
    target = feature_summary_df if inplace else feature_summary_df.copy()

    for name in features:
        if name not in feature_summary_df.columns:
            print(f"Warning: {name} not available!")
            continue
        column = target[name]
        # np.std uses the population formula (ddof=0)
        target[name] = (column - np.average(column)) / np.std(column)

    if not inplace:
        return target
245
+
246
+
247
+
248
+
249
+
250
+
251
def generate_summary(feature_summary_df, features, thresholds):
    """Generate a (cell level) summary table for each feature in `features`: feature name,
    total number (of cells), the calculated GMM threshold for this feature, number of cells
    with greater-than-threshold values, and the corresponding positive ratio.
    Inputs:
        feature_summary_df = dataframe of extracted feature summary
        features = a list of features to generate summary table
        thresholds = (calculated GMM-based) thresholds for each feature
    Outputs:
        df_info = summary table with one row per feature (RangeIndex 0..n-1)

    :param feature_summary_df: pandas.core.frame.DataFrame
    :param features: list
    :param thresholds: dict
    :return df_info: pandas.core.frame.DataFrame
    """
    columns = ['feature', 'total number', 'threshold', 'positive counts', 'positive ratio']

    # Collect all rows first and build the DataFrame once: the previous
    # row-by-row pd.concat was quadratic and left every row with index 0.
    rows = []
    for feature in features:
        thres = thresholds[feature]
        X = feature_summary_df[feature].values
        n = int(np.sum(X > thres))  # cells exceeding the threshold ("positive")
        N = len(X)
        rows.append({'feature': feature, 'total number': N, 'threshold': thres,
                     'positive counts': n, 'positive ratio': n / N})

    df_info = pd.DataFrame(rows, columns=columns)
    return df_info
281
+
282
+
283
+ # def visualize_thresholding_outcome(feat,
284
+ # feature_summary_df,
285
+ # raw_image,
286
+ # channel_names,
287
+ # thres,
288
+ # nuclei_seg,
289
+ # cell_seg,
290
+ # vis_quantile_q=0.9, savepath=None):
291
+ # """ Visualize calculated threshold for a feature by mapping back to nuclei and cell segmentation outputs - showing
292
+ # greater than threshold pixels in red color, others with blue color.
293
+ # Meanwhile, visualize the original image with red color indicating the channel correspond to the feature.
294
+ # Inputs:
295
+ # feat = name of the feature to visualize
296
+ # feature_summary_df = dataframe of extracted feature summary
297
+ # raw_image = raw cytof image
298
+ # channel_names = a list of marker names, which is consistent with each channel in the raw_image
299
+ # thres = threshold value for feature "feat"
300
+ # nuclei_seg = nuclei segmentation output
301
+ # cell_seg = cell segmentation output
302
+ # Outputs:
303
+ # stain_nuclei = nuclei segmentation output stained with threshold information
304
+ # stain_cell = cell segmentation output stained with threshold information
305
+ # :param feat: string
306
+ # :param feature_summary_df: pandas.core.frame.DataFrame
307
+ # :param raw_image: numpy.ndarray
308
+ # :param channel_names: list
309
+ # :param thres: float
310
+ # :param nuclei_seg: numpy.ndarray
311
+ # :param cell_seg: numpy.ndarray
312
+ # :return stain_nuclei: numpy.ndarray
313
+ # :return stain_cell: numpy.ndarray
314
+ # """
315
+ # col_name = channel_names[np.argmax([len(_longest_substring(feat, x)) for x in channel_names])]
316
+ # col_id = channel_names.index(col_name)
317
+ # df_temp = pd.DataFrame(columns=[f"{feat}_overthres"], data=np.zeros(len(feature_summary_df), dtype=np.int32))
318
+ # df_temp.loc[feature_summary_df[feat] > thres, f"{feat}_overthres"] = 1
319
+ # feature_summary_df = pd.concat([feature_summary_df, df_temp], axis=1)
320
+ # # feature_summary_df.loc[:, f"{feat}_overthres"] = 0
321
+ # # feature_summary_df.loc[feature_summary_df[feat] > thres, f"{feat}_overthres"] = 1
322
+ #
323
+ # '''rgba_color = [plt.cm.get_cmap('tab20').colors[_ % 20] for _ in feature_summary_df.loc[:, f"{feat}_overthres"]]'''
324
+ # color_ids = []
325
+ #
326
+ # # stained Nuclei image
327
+ # stain_nuclei = np.zeros((nuclei_seg.shape[0], nuclei_seg.shape[1], 3)) + 1
328
+ # for i in range(2, np.max(nuclei_seg) + 1):
329
+ # color_id = feature_summary_df[f"{feat}_overthres"][feature_summary_df['id'] == i].values[0] * 2
330
+ # if color_id not in color_ids:
331
+ # color_ids.append(color_id)
332
+ # stain_nuclei[nuclei_seg == i] = plt.cm.get_cmap('tab20').colors[color_id][:3]
333
+ #
334
+ # # stained Cell image
335
+ # stain_cell = np.zeros((cell_seg.shape[0], cell_seg.shape[1], 3)) + 1
336
+ # for i in range(2, np.max(cell_seg) + 1):
337
+ # color_id = feature_summary_df[f"{feat}_overthres"][feature_summary_df['id'] == i].values[0] * 2
338
+ # stain_cell[cell_seg == i] = plt.cm.get_cmap('tab20').colors[color_id][:3]
339
+ #
340
+ # fig, axs = plt.subplots(1,3,figsize=(16, 8))
341
+ # if col_id != 0:
342
+ # channel_ids = (col_id, 0)
343
+ # else:
344
+ # channel_ids = (col_id, -1)
345
+ # '''print(channel_ids)'''
346
+ # quantiles = [np.quantile(raw_image[..., _], vis_quantile_q) for _ in channel_ids]
347
+ # vis_img, _ = pre.cytof_merge_channels(raw_image, channel_names=channel_names,
348
+ # channel_ids=channel_ids, quantiles=quantiles)
349
+ # marker = feat.split("(")[0]
350
+ # print(f"Nuclei and cell with high {marker} expression shown in orange, low in blue.")
351
+ #
352
+ # axs[0].imshow(vis_img)
353
+ # axs[1].imshow(stain_nuclei)
354
+ # axs[2].imshow(stain_cell)
355
+ # axs[0].set_title("pseudo-colored original image")
356
+ # axs[1].set_title(f"{marker} expression shown in nuclei")
357
+ # axs[2].set_title(f"{marker} expression shown in cell")
358
+ # if savepath is not None:
359
+ # plt.savefig(savepath)
360
+ # plt.show()
361
+ # return stain_nuclei, stain_cell, vis_img
362
+
363
+
364
+ ########################################################################################################################
365
+ ############################################### batch functions ########################################################
366
+ ########################################################################################################################
367
def batch_extract_feature(files, markers, nuclei_markers, membrane_markers=None, show_vis=False):
    """Extract features for cytof images from a list of files. Normally this list contains ROIs of the same slide
    Inputs:
        files = a list of files to be processed
        markers = a list of marker names used when generating the image
        nuclei_markers = a list of markers define the nuclei channel (used for nuclei segmentation)
        membrane_markers = a list of markers define the membrane channel (used for cell segmentation) (Default=None)
        show_vis = an indicator of showing visualization during process
    Outputs:
        file_features = a dictionary contains extracted features for each file

    :param files: list
    :param markers: list
    :param nuclei_markers: list
    :param membrane_markers: list
    :param show_vis: bool
    :return file_features: dict
    """
    file_features = {}
    for f in tqdm(files):
        # read data
        df = pre.cytof_read_data(f)
        # preprocess
        df_ = pre.cytof_preprocess(df)
        # build channel list: synthetic 'nuclei' channel first, optional 'membrane' channel last
        column_names = markers[:]
        df_output = pre.define_special_channel(df_, 'nuclei', markers=nuclei_markers)
        column_names.insert(0, 'nuclei')
        if membrane_markers is not None:
            df_output = pre.define_special_channel(df_output, 'membrane', markers=membrane_markers)
            column_names.append('membrane')
        # convert the tabular cytof data to an image (H x W x n_channels)
        raw_image = pre.cytof_txt2img(df_output, marker_names=column_names)

        if show_vis:
            # pseudo-color preview of a fixed sub-region of the raw image
            merged_im, _ = pre.cytof_merge_channels(raw_image, channel_ids=[0, -1], quantiles=None, visualize=False)
            plt.imshow(merged_im[0:200, 200:400, ...])
            plt.title('Selected region of raw cytof image')
            plt.show()

        # nuclei and cell segmentation
        nuclei_img = raw_image[..., column_names.index('nuclei')]
        nuclei_seg, color_dict = seg.cytof_nuclei_segmentation(nuclei_img, show_process=False)
        if membrane_markers is not None:
            # a membrane channel guides the cell-boundary expansion
            membrane_img = raw_image[..., column_names.index('membrane')]
            cell_seg, _ = seg.cytof_cell_segmentation(nuclei_seg, membrane_channel=membrane_img, show_process=False)
        else:
            cell_seg, _ = seg.cytof_cell_segmentation(nuclei_seg, show_process=False)
        if show_vis:
            # overlay segmentation boundaries on the same fixed sub-region
            marked_image_nuclei = seg.visualize_segmentation(raw_image, nuclei_seg, channel_ids=(0, -1), show=False)
            marked_image_cell = seg.visualize_segmentation(raw_image, cell_seg, channel_ids=(-1, 0), show=False)
            fig, axs = plt.subplots(1, 2, figsize=(10, 6))
            axs[0].imshow(marked_image_nuclei[0:200, 200:400, :]), axs[0].set_title('nuclei segmentation')
            axs[1].imshow(marked_image_cell[0:200, 200:400, :]), axs[1].set_title('cell segmentation')
            plt.show()

        # feature extraction: per-channel expression + morphology, keyed by input file
        feat_names = markers[:]
        feat_names.insert(0, 'nuclei')
        df_feat_sum = extract_feature(feat_names, raw_image, nuclei_seg, cell_seg, filename=f)
        file_features[f] = df_feat_sum
    return file_features
427
+
428
+
429
+
430
def batch_norm_scale(file_features, column_names, qs=None):
    """Perform feature log transform, quantile normalization and scaling in a batch
    Inputs:
        file_features = A dictionary of dataframes containing extracted features. key - file name, item - feature table
        column_names = A list of markers. Should be consistent with column names in dataframe of features
        qs = quantile q values (Default=None, interpreted as [75, 99])
    Outputs:
        file_features_out = log transformed, quantile normalized and scaled features for each file in the batch
        quantiles = a dictionary of quantile values for each file in the batch

    :param file_features: dict
    :param column_names: list
    :param qs: list
    :return file_features_out: dict
    :return quantiles: dict
    """
    # avoid the shared mutable-default-argument pitfall while keeping the historical default
    if qs is None:
        qs = [75, 99]

    file_features_out = copy.deepcopy(file_features)  # maintain a copy of original file_features

    # marker features (per-cell / per-nucleus sum and average expression columns)
    cell_markers_sum = [_ + '_cell_sum' for _ in column_names]
    cell_markers_ave = [_ + '_cell_ave' for _ in column_names]
    nuclei_markers_sum = [_ + '_nuclei_sum' for _ in column_names]
    nuclei_markers_ave = [_ + '_nuclei_ave' for _ in column_names]

    # morphology features
    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]
    nuclei_morphology = [_ + '_nuclei' for _ in morphology]  # morphology - nuclei level
    cell_morphology = [_ + '_cell' for _ in morphology]      # morphology - cell level

    marker_features = nuclei_markers_sum + nuclei_markers_ave + cell_markers_sum + cell_markers_ave

    # features to be normalized: all marker columns except the synthetic 'nuclei' channel
    features_to_norm = [x for x in marker_features if not x.startswith('nuclei')]

    # features to be scaled: every morphology and marker column; marker columns that get
    # normalized also have their "<feature>_<q>normed" counterparts scaled
    morphology_features = nuclei_morphology + cell_morphology
    morphology_set = set(morphology_features)  # O(1) membership instead of list concat per iteration
    scale_features = []
    for feature_name in morphology_features + marker_features:
        scale_features.append(feature_name)
        if feature_name not in morphology_set and not feature_name.startswith('nuclei'):
            scale_features.extend(f"{feature_name}_{q}normed" for q in qs)

    quantiles = {}
    for f, df in file_features_out.items():
        print(f)
        # quantile-normalize (adds the "<q>normed" columns in place), then mean/std scale
        quantiles[f] = feature_quantile_normalization(df, features=features_to_norm, qs=qs)
        feature_scaling(df, features=scale_features, inplace=True)
    return file_features_out, quantiles
485
+
486
+
487
def batch_scale_feature(outdir, normqs, df_io=None, files_scale=None):
    """Scale the quantile-normalized features of every saved CytofImage in a batch.
    Inputs:
        outdir = output saving directory, which contains the scale file generated previously,
            the input_output.csv file with the list of available cytof_img class instances in the batch,
            as well as previously saved cytof_img class instances in .pkl files
        normqs = a list of q values of percentile normalization
        df_io = optional pre-loaded input/output table; read from "input_output.csv" in outdir when None
        files_scale = full file names of the scaling information, one per q in normqs; when None,
            the default "<q>normed_scale_params.csv" files in outdir are used

    Outputs: None
        Scaled feature are saved as .csv files in subfolder "feature_<q>normed_scaled" in outdir.
        A new attribute "df_feature_<q>normed_scaled" is added to each cytof_img class instance,
        and the updated instance is written back to its original .pkl file.
    """
    if df_io is None:
        df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))

    for _i, normq in enumerate(normqs):
        n_attr = f"df_feature_{normq}normed"
        n_attr_scaled = f"{n_attr}_scaled"
        file_scale = files_scale[_i] if files_scale is not None \
            else os.path.join(outdir, "{}normed_scale_params.csv".format(normq))
        # saving directory of scaled normed feature
        dirq = os.path.join(outdir, f"feature_{normq}normed_scaled")
        os.makedirs(dirq, exist_ok=True)

        # load scaling parameters: row 0 holds the means, row 1 the std. deviations
        df_scale = pd.read_csv(file_scale, index_col=False)
        m = df_scale[df_scale.columns].iloc[0]  # mean
        s = df_scale[df_scale.columns].iloc[1]  # std.dev

        for f_cytof in df_io['output_file']:
            # use context managers so file handles are closed even on error
            with open(f_cytof, "rb") as fh:
                cytof_img = pkl.load(fh)
            assert hasattr(cytof_img, n_attr), f"attribute {n_attr} not exist"
            df_feat = copy.deepcopy(getattr(cytof_img, n_attr))

            # every scaling-parameter column must exist in the feature table
            assert len([x for x in df_scale.columns if x not in df_feat.columns]) == 0

            # scale
            df_feat[df_scale.columns] = (df_feat[df_scale.columns] - m) / s

            # save scaled feature to csv
            df_feat.to_csv(os.path.join(dirq, os.path.basename(f_cytof).replace('.pkl', '.csv')), index=False)

            # add attribute "df_feature_<q>normed_scaled"
            setattr(cytof_img, n_attr_scaled, df_feat)

            # save updated cytof_img class instance
            with open(f_cytof, "wb") as fh:
                pkl.dump(cytof_img, fh)
539
+
540
+
541
def batch_generate_summary(outdir, feature_type="normed", normq=75, scaled=True, vis_thres=False):
    """Compute batch-wide marker thresholds and write per-ROI cell-count summaries.
    Inputs:
        outdir = output saving directory, which contains the scale file generated previously, as well as previously saved
                 cytof_img class instances in .pkl files
        feature_type = type of feature to be used, available choices: "original", "normed", "scaled"
        normq = q value of quantile normalization
        scaled = a flag indicating whether or not use the scaled version of features (Default=False)
        vis_thres = a flag indicating whether or not visualize the process of calculating thresholds (Default=False)
    Outputs:
        dir_sum = the directory the per-ROI summary csv files were written to
        Two .csv files, one for cell sum and the other for cell average features, are saved for each ROI, containing the
        threshold and cell count information of each feature, in the subfolder "marker_summary" under outdir

    NOTE(review): the `scaled` parameter is not referenced in this body — feature choice is
    driven entirely by `feature_type`; confirm whether `scaled` is vestigial.
    """
    assert feature_type in ["original", "normed", "scaled"], 'accepted feature types are "original", "normed", "scaled"'
    # map the requested feature type to the attribute-name suffix used on CytofImage
    if feature_type == "original":
        feat_name = ""  # NOTE(review): yields attribute name "df_feature_" — confirm this matches the class
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    n_attr = f"df_feature_{feat_name}"

    dir_sum = os.path.join(outdir, "marker_summary", feat_name)
    print(dir_sum)
    if not os.path.exists(dir_sum):
        os.makedirs(dir_sum)

    # first pass: load every ROI's feature table so thresholds are computed batch-wide
    seen = 0
    dfs = {}
    cytofs = {}
    df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))
    for f in df_io['output_file'].tolist():
        f_roi = os.path.basename(f).split(".pkl")[0]
        cytof_img = pkl.load(open(f, "rb"))

        ##### updated #####
        df_feat = getattr(cytof_img, n_attr)
        dfs[f] = getattr(cytof_img, n_attr)
        cytofs[f] = cytof_img
        ##### end updated #####

        # the feature-name lists are taken from the first ROI only (assumed identical across ROIs)
        if seen == 0:
            feat_cell_sum = cytof_img.features['cell_sum']
            feat_cell_ave = cytof_img.features['cell_ave']
        seen += 1

    ##### updated #####
    # pool all ROIs, then derive one threshold per feature for the whole batch
    # (_get_thresholds is defined elsewhere in this module)
    all_df = pd.concat(dfs.values(), ignore_index=True)
    print("Getting thresholds for marker sum")
    thres_sum = _get_thresholds(all_df, feat_cell_sum, visualize=vis_thres)
    print("Getting thresholds for marker average")
    thres_ave = _get_thresholds(all_df, feat_cell_ave, visualize=vis_thres)
    # second pass: per-ROI summaries against the batch-wide thresholds
    for f, cytof_img in cytofs.items():
        f_roi = os.path.basename(f).split(".pkl")[0]
        df_info_cell_sum_f = generate_summary(dfs[f], features=feat_cell_sum, thresholds=thres_sum)
        df_info_cell_ave_f = generate_summary(dfs[f], features=feat_cell_ave, thresholds=thres_ave)
        setattr(cytof_img, f"cell_count_{feat_name}_sum", df_info_cell_sum_f)
        setattr(cytof_img, f"cell_count_{feat_name}_ave", df_info_cell_ave_f)
        df_info_cell_sum_f.to_csv(os.path.join(dir_sum, f"{f_roi}_cell_count_sum.csv"), index=False)
        df_info_cell_ave_f.to_csv(os.path.join(dir_sum, f"{f_roi}_cell_count_ave.csv"), index=False)
        # persist the updated instance (now carrying the cell_count_* attributes)
        pkl.dump(cytof_img, open(f, "wb"))
    return dir_sum
604
+
605
+
606
+
607
def _gather_roi_expressions(df_io, normqs=None):
    """Collect cell-level *sum* expression values for every ROI in the batch.

    For each ROI listed in df_io, loads its saved CytofImage and pools the values
    of all 'cell_sum' feature columns, both raw and (for each q in normqs) their
    quantile-normalized version.

    :param df_io: pandas.core.frame.DataFrame - must contain "ROI" and "output_file" columns
    :param normqs: list - percentile-normalization q values (Default=None, interpreted as [75])
    :return expressions: dict - {roi: [raw values...]}
    :return expressions_normed: dict - {roi: {q: [normalized values...]}}
    """
    # avoid the shared mutable-default-argument pitfall
    if normqs is None:
        normqs = [75]

    expressions = {}
    expressions_normed = {}
    for roi in df_io["ROI"].unique():
        f_cytof_im = df_io.loc[df_io["ROI"] == roi, "output_file"].values[0]
        cytof_im = load_CytofImage(f_cytof_im)
        sum_features = cytof_im.features['cell_sum']

        # raw expression values, pooled across all cell-sum features
        expressions[roi] = []
        for feature_name in sum_features:
            expressions[roi].extend(cytof_im.df_feature[feature_name])

        # normalized counterparts, pooled per q
        expressions_normed[roi] = {}
        for q in normqs:
            normed_feat = getattr(cytof_im, "df_feature_{}normed".format(q))
            pooled = []
            for feature_name in sum_features:
                pooled.extend(normed_feat[feature_name])
            expressions_normed[roi][q] = pooled
    return expressions, expressions_normed
624
+
625
+
626
def visualize_normalization(df_slide_roi, normqs=[75], level="slide"):
    """Plot pooled cell-sum expression histograms before and after percentile normalization.

    Gathers per-ROI expressions via _gather_roi_expressions, optionally aggregates them
    per slide, then shows one figure per slide (or ROI): the raw log2 distribution
    followed by one panel per q in normqs.

    :param df_slide_roi: pandas.core.frame.DataFrame - must contain "Slide" and "ROI" columns
    :param normqs: list - percentile-normalization q values
                   (NOTE(review): mutable default argument; never mutated here)
    :param level: string - "slide" pools all ROIs of a slide; any other value keeps ROI level
    :return expressions: dict - raw pooled values per slide (or ROI)
    :return expressions_normed: dict - normalized pooled values per slide (or ROI), keyed by q
    """
    expressions_, expressions_normed_ = _gather_roi_expressions(df_slide_roi, normqs=normqs)
    if level == "slide":
        prefix = "Slide"
        # re-pool the ROI-level values into one entry per slide
        expressions, expressions_normed = {}, {}
        for slide in df_slide_roi["Slide"].unique():
            f_rois = df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"].values
            rois = [x.replace('.txt', '') for x in f_rois]  # ROI keys are filenames without extension
            expressions[slide] = []
            expressions_normed[slide] = dict((q, []) for q in normqs)
            for roi in rois:
                expressions[slide].extend(expressions_[roi])

                for q in expressions_normed[slide].keys():
                    expressions_normed[slide][q].extend(expressions_normed_[roi][q])

    else:
        # keep the per-ROI grouping as-is
        expressions, expressions_normed = expressions_, expressions_normed_
        prefix = "ROI"
    num_q = len(normqs)
    for key, key_exp in expressions.items():  # create a new plot for each slide (or ROI)
        print("Showing {} {}".format(prefix, key))
        fig, ax = plt.subplots(1, num_q + 1, figsize=(4 * (num_q + 1), 4))
        # raw values need the log2 transform here
        ax[0].hist((np.log2(np.array(key_exp) + 0.0001),), 100, density=True)
        ax[0].set_title("Before normalization")
        ax[0].set_xlabel("log2(cellular expression of all markers)")
        for i, q in enumerate(normqs):
            # normalized values are already log2-scaled by feature_quantile_normalization,
            # so only the +0.0001 offset is applied (no second log)
            ax[i + 1].hist((np.array(expressions_normed[key][q]) + 0.0001,), 100, density=True)
            ax[i + 1].set_title("After {}-th percentile normalization".format(q))
            ax[i + 1].set_xlabel("log2(cellular expression of all markers)")
        plt.show()
    return expressions, expressions_normed
658
+
659
+
660
+ ###########################################################
661
+ ############# marker level analysis functions #############
662
+ ###########################################################
663
+
664
+ ############# marker co-expression analysis #############
665
def _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type):
    """ROI-level co-expression statistics.

    For each ROI, build two N-by-N matrices over the N markers:
    the expected co-occurrence (outer product of per-marker positive counts)
    and the observed co-occurrence (cells positive for both markers, using the
    per-marker thresholds stored on the CytofImage).

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: directory containing "cytof_images/<slide>_<roi>.pkl" files.
        feat_name: feature suffix ("", "<q>normed" or "<q>normed_scaled").
        accumul_type: accumulation type selecting the "cell_<type>" columns.

    Returns:
        expected_percentages: {roi: (N, N) positive-count product matrix}.
        edge_percentages: {roi: (N, N) both-positive cell counts}.
        num_cells: {roi: number of cells in that ROI}.
        marker_all: marker names (from the first successfully loaded ROI).
        marker_col_all: corresponding feature column names.
    """
    n_attr = f"df_feature_{feat_name}"
    expected_percentages = {}
    edge_percentages = {}
    num_cells = {}

    first_loaded = True  # initialize marker metadata from the first ROI that loads
    for f_roi in df_slide_roi["ROI"].unique():
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)

        if first_loaded:
            # BUGFIX: this block previously ran only for the first *listed* ROI;
            # if that ROI's .pkl was missing, marker_col_all/thresholds were
            # never defined and the function crashed with NameError.
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            n_marker = len(marker_col_all)
            # marker-positivity info (counts/thresholds) from this first ROI.
            # NOTE(review): these are reused for every other ROI — confirm the
            # thresholds are cohort-wide rather than per-ROI.
            df_info_cell = getattr(cytof_im, "cell_count_{}_{}".format(feat_name, accumul_type))
            pos_nums = df_info_cell["positive counts"].values
            thresholds = df_info_cell["threshold"].values
            first_loaded = False

        # BUGFIX: n_cell was previously set only inside the first-ROI branch,
        # so every ROI recorded the first ROI's cell count.
        n_cell = len(df_feat)

        # expected_percentage[i, j]: product of the positive counts of markers
        # i and j (co-occurrence expected under independence, unnormalized).
        expected_percentage = np.zeros((n_marker, n_marker))
        for ii in range(n_marker):
            for jj in range(n_marker):
                expected_percentage[ii, jj] = pos_nums[ii] * pos_nums[jj]
        expected_percentages[roi] = expected_percentage

        # edge_nums[i, j]: number of cells positive for both markers i and j.
        edge_nums = np.zeros_like(expected_percentage)
        for ii in range(n_marker):
            _x = df_feat[marker_col_all[ii]].values > thresholds[ii]
            for jj in range(n_marker):
                _y = df_feat[marker_col_all[jj]].values > thresholds[jj]
                edge_nums[ii, jj] = np.sum(np.all([_x, _y], axis=0))
        edge_percentages[roi] = edge_nums
        num_cells[roi] = n_cell
    return expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all
718
+
719
+
720
def co_expression_analysis(df_slide_roi, outdir, feature_type, accumul_type, co_exp_markers="all", normq=75,
                           level="slide", clustergrid=None):
    """Marker co-expression analysis at slide or ROI level.

    For each slide (or ROI), compares the observed fraction of cells positive
    for each marker pair against the fraction expected under independence, and
    plots log10(observed / expected + 0.1) as a heatmap and a clustered heatmap.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: output directory containing "cytof_images/*.pkl".
        feature_type: "original", "normed" or "scaled".
        accumul_type: feature accumulation type (e.g. "sum").
        co_exp_markers: "all" or a list of marker names to restrict to.
        normq: quantile used for normalization (ignored for "original").
        level: "slide" (pool ROIs per slide) or "roi".
        clustergrid: optional seaborn ClusterGrid whose row order is reused;
            if None, one is created from the first plotted matrix.

    Returns:
        (co_exps, marker_idx, clustergrid): normalized co-expression matrices
        keyed by slide/ROI, selected marker indices, and the ClusterGrid used.
    """
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)

    expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all = \
        _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type)

    if co_exp_markers != "all":
        assert (isinstance(co_exp_markers, list) and all(x in marker_all for x in co_exp_markers))
        marker_idx = np.array([marker_all.index(x) for x in co_exp_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))

    if level == "slide":
        # Pool per-ROI matrices into per-slide matrices, dropping the ROI keys.
        for slide in df_slide_roi["Slide"].unique():
            slide_started = False
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in expected_percentages:
                    continue
                if not slide_started:
                    # BUGFIX: initialization used to key off "seen_roi == 0";
                    # if a slide's first ROI was skipped, the slide entry was
                    # never created and the "+=" below raised KeyError.
                    # .copy() avoids mutating the first ROI's array in place.
                    expected_percentages[slide] = expected_percentages[roi].copy()
                    edge_percentages[slide] = edge_percentages[roi].copy()
                    num_cells[slide] = num_cells[roi]
                    slide_started = True
                else:
                    expected_percentages[slide] += expected_percentages[roi]
                    edge_percentages[slide] += edge_percentages[roi]
                    num_cells[slide] += num_cells[roi]
                expected_percentages.pop(roi)
                edge_percentages.pop(roi)
                num_cells.pop(roi)

    co_exps = {}
    for key, expected_percentage in expected_percentages.items():
        expected_percentage = expected_percentage / num_cells[key] ** 2
        edge_percentage = edge_percentages[key] / num_cells[key]

        # log-ratio of observed vs expected; +0.1 keeps log10 finite at 0
        edge_percentage_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # 0/0 pairs (no positive cells for either marker) -> neutral value
        edge_percentage_norm[np.isnan(edge_percentage_norm)] = np.log10(1 + 0.1)

        co_exps[key] = edge_percentage_norm

    # plot one heatmap + clustered heatmap per slide/ROI
    for f_key, edge_percentage_norm in co_exps.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(edge_percentage_norm[marker_idx, :][:, marker_idx], center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=3,
                         xticklabels=marker_all, yticklabels=marker_all)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            # first matrix defines the row ordering reused for the rest
            plt.figure()
            clustergrid = sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx],
                                         center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
                                         xticklabels=marker_all, yticklabels=marker_all, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        plt.figure()
        sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx]
                       [clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
                       xticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       yticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return co_exps, marker_idx, clustergrid
812
+
813
+ ############# marker correlation #############
814
+ from scipy.stats import spearmanr
815
+
816
def _gather_roi_corr(df_slide_roi, outdir, feat_name, accumul_type):
    """Collect per-ROI feature tables for correlation analysis.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: directory containing "cytof_images/<slide>_<roi>.pkl" files.
        feat_name: feature suffix ("", "<q>normed" or "<q>normed_scaled").
        accumul_type: accumulation type selecting the "cell_<type>" columns.

    Returns:
        feats: {roi: feature dataframe}.
        marker_all: marker names (from the first successfully loaded ROI).
        marker_col_all: corresponding feature column names.
    """
    n_attr = f"df_feature_{feat_name}"
    feats = {}

    first_loaded = True  # initialize marker lists from the first ROI that loads
    for f_roi in df_slide_roi["ROI"].unique():  # for each ROI
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)
        feats[roi] = df_feat

        if first_loaded:
            # BUGFIX: previously keyed off "seen_roi == 0", so a missing first
            # .pkl left marker_all/marker_col_all undefined at return.
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            first_loaded = False
    return feats, marker_all, marker_col_all
838
+
839
+
840
def correlation_analysis(df_slide_roi, outdir, feature_type, accumul_type, corr_markers="all", normq=75, level="slide",
                         clustergrid=None):
    """Spearman correlation between marker expressions at slide or ROI level.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: output directory containing "cytof_images/*.pkl".
        feature_type: "original", "normed" or "scaled".
        accumul_type: feature accumulation type (e.g. "sum").
        corr_markers: "all" or a list of marker names to restrict the plots to.
        normq: quantile used for normalization (ignored for "original").
        level: "slide" (pool ROIs per slide) or "roi".
        clustergrid: optional seaborn ClusterGrid whose row order is reused;
            if None, one is created from the first plotted matrix.

    Returns:
        (corrs, marker_idx, clustergrid): correlation matrices keyed by
        slide/ROI, selected marker indices, and the ClusterGrid used.
    """
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)

    feats, marker_all, marker_col_all = _gather_roi_corr(df_slide_roi, outdir, feat_name, accumul_type)
    n_marker = len(marker_all)

    corrs = {}
    if level == "slide":
        # Pool per-ROI feature tables into per-slide tables.
        for slide in df_slide_roi["Slide"].unique():
            slide_started = False
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in feats:
                    continue
                if not slide_started:
                    # BUGFIX: initialization used to key off "seen_roi == 0";
                    # a skipped first ROI meant feats[slide] was never created.
                    feats[slide] = feats[roi]
                    slide_started = True
                else:
                    feats[slide] = pd.concat([feats[slide], feats[roi]])
                feats.pop(roi)

    for key, feat in feats.items():
        correlation = np.zeros((n_marker, n_marker))
        for i, feature_i in enumerate(marker_col_all):
            for j, feature_j in enumerate(marker_col_all):
                correlation[i, j] = spearmanr(feat[feature_i].values, feat[feature_j].values).correlation
        corrs[key] = correlation

    if corr_markers != "all":
        assert (isinstance(corr_markers, list) and all(x in marker_all for x in corr_markers))
        marker_idx = np.array([marker_all.index(x) for x in corr_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))

    # plot; BUGFIX: tick labels previously used corr_markers, which is the
    # literal string "all" in the default case — use marker_all instead.
    for f_key, corr in corrs.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(corr[marker_idx, :][:, marker_idx], center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1,
                         xticklabels=marker_all, yticklabels=marker_all)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(corr[marker_idx, :][:, marker_idx],
                                         center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=marker_all, yticklabels=marker_all, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        plt.figure()
        sns.clustermap(corr[marker_idx, :][:, marker_idx]
                       [clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                       xticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       yticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return corrs, marker_idx, clustergrid
917
+
918
+ ############# marker interaction #############
919
+
920
+ from sklearn.neighbors import DistanceMetric
921
+ from tqdm import tqdm
922
+
923
def _gather_roi_interact(df_slide_roi, outdir, feat_name, accumul_type, interact_markers="all", thres_dist=50):
    """ROI-level marker interaction counts based on cell-cell proximity.

    Two cells interact when their Euclidean distance is in (0, thres_dist).
    For each interacting (ordered) cell pair, every (marker m of cell 1,
    marker n of cell 2) combination of their positive markers increments
    edge_nums[m, n]; pairs are therefore counted in both directions.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: directory containing "cytof_images/<slide>_<roi>.pkl" files.
        feat_name: feature suffix ("", "<q>normed" or "<q>normed_scaled").
        accumul_type: accumulation type selecting the "cell_<type>" columns.
        interact_markers: accepted for interface compatibility; currently unused
            here — marker subsetting happens in the caller.
        thres_dist: distance threshold (pixels) defining an interaction edge.

    Returns:
        edge_percentages: {roi: (N, N) interaction-edge counts}.
        num_edges: {roi: total number of interaction edges}.
        marker_all: marker names (from the first successfully loaded ROI).
        marker_col_all: corresponding feature column names.
    """
    dist = DistanceMetric.get_metric('euclidean')
    n_attr = f"df_feature_{feat_name}"
    edge_percentages = {}
    num_edges = {}

    first_loaded = True  # initialize marker metadata from the first ROI that loads
    for f_roi in df_slide_roi["ROI"].unique():  # for each ROI
        roi = f_roi.replace(".txt", "")
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_feat = getattr(cytof_im, n_attr)
        n_cell = len(df_feat)
        dist_matrix = dist.pairwise(df_feat.loc[:, ['coordinate_x', 'coordinate_y']].values)

        if first_loaded:
            # BUGFIX: previously keyed off "seen_roi == 0"; a missing first
            # .pkl left marker lists and thresholds undefined (NameError).
            marker_col_all = [x for x in df_feat.columns if "cell_{}".format(accumul_type) in x]
            marker_all = [x.split('(')[0] for x in marker_col_all]
            n_marker = len(marker_col_all)
            # NOTE(review): thresholds come from this first ROI and are reused
            # for every other ROI — confirm they are cohort-wide.
            df_info_cell = getattr(cytof_im, "cell_count_{}_{}".format(feat_name, accumul_type))
            thresholds = df_info_cell["threshold"].values
            first_loaded = False

        n_edges = 0
        edge_nums = np.zeros((n_marker, n_marker))

        # cluster_sub[i]: set of marker indices cell i is positive for
        cluster_sub = []
        for i_cell in range(n_cell):
            _temp = set()
            for k in range(n_marker):
                if df_feat[marker_col_all[k]].values[i_cell] > thresholds[k]:
                    _temp = _temp | {k}
            cluster_sub.append(_temp)

        for i in tqdm(range(n_cell)):
            for j in range(n_cell):
                if dist_matrix[i, j] > 0 and dist_matrix[i, j] < thres_dist:
                    n_edges += 1
                    for m in cluster_sub[i]:
                        for n in cluster_sub[j]:
                            edge_nums[m, n] += 1

        edge_percentages[roi] = edge_nums
        num_edges[roi] = n_edges
    return edge_percentages, num_edges, marker_all, marker_col_all
974
+
975
+
976
def interaction_analysis(df_slide_roi,
                         outdir,
                         feature_type,
                         accumul_type,
                         interact_markers="all",
                         normq=75,
                         level="slide",
                         thres_dist=50,
                         clustergrid=None):
    """Marker-marker interaction analysis from cell-cell proximity edges.

    Compares the observed fraction of proximity edges linking each marker pair
    against the co-occurrence expected under independence, plotted as
    log10(observed / expected + 0.1) heatmaps.

    NOTE(review): a second `interaction_analysis` (phenotype-level) is defined
    later in this module and shadows this one at import time — consider
    renaming one of them.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: output directory containing "cytof_images/*.pkl".
        feature_type: "original", "normed" or "scaled".
        accumul_type: feature accumulation type (e.g. "sum").
        interact_markers: "all" or a list of marker names to restrict plots to.
        normq: quantile used for normalization (ignored for "original").
        level: "slide" (pool ROIs per slide) or "roi".
        thres_dist: distance threshold (pixels) defining an interaction edge.
        clustergrid: optional seaborn ClusterGrid whose row order is reused.

    Returns:
        (interacts, clustergrid): normalized interaction matrices keyed by
        slide/ROI and the ClusterGrid used for ordering.
    """
    assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
    assert feature_type in ["original", "normed", "scaled"]
    if feature_type == "original":
        feat_name = ""
    elif feature_type == "normed":
        feat_name = f"{normq}normed"
    else:
        feat_name = f"{normq}normed_scaled"

    print(feat_name)

    expected_percentages, _, num_cells, _marker_all_full, _marker_col_all_full = \
        _gather_roi_co_exp(df_slide_roi, outdir, feat_name, accumul_type)
    edge_percentages, num_edges, marker_all, marker_col_all = \
        _gather_roi_interact(df_slide_roi, outdir, feat_name, accumul_type, interact_markers="all",
                             thres_dist=thres_dist)

    if level == "slide":
        # Pool per-ROI matrices into per-slide matrices, dropping the ROI keys.
        for slide in df_slide_roi["Slide"].unique():
            slide_started = False
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                if roi not in expected_percentages:
                    continue
                if not slide_started:
                    # BUGFIX: initialization used to key off "seen_roi == 0";
                    # a skipped first ROI left the slide entry missing and the
                    # "+=" below raised KeyError. .copy() avoids aliasing.
                    expected_percentages[slide] = expected_percentages[roi].copy()
                    edge_percentages[slide] = edge_percentages[roi].copy()
                    num_edges[slide] = num_edges[roi]
                    num_cells[slide] = num_cells[roi]
                    slide_started = True
                else:
                    expected_percentages[slide] += expected_percentages[roi]
                    edge_percentages[slide] += edge_percentages[roi]
                    num_edges[slide] += num_edges[roi]
                    num_cells[slide] += num_cells[roi]
                expected_percentages.pop(roi)
                edge_percentages.pop(roi)
                num_edges.pop(roi)
                num_cells.pop(roi)

    if interact_markers != "all":
        assert (isinstance(interact_markers, list) and all(x in marker_all for x in interact_markers))
        marker_idx = np.array([marker_all.index(x) for x in interact_markers])
        marker_all = [marker_all[x] for x in marker_idx]
        marker_col_all = [marker_col_all[x] for x in marker_idx]
    else:
        marker_idx = np.arange(len(marker_all))

    interacts = {}
    for key, edge_percentage in edge_percentages.items():
        expected_percentage = expected_percentages[key] / num_cells[key] ** 2
        edge_percentage = edge_percentage / num_edges[key]

        # log-ratio of observed vs expected; +0.1 keeps log10 finite at 0
        edge_percentage_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # 0/0 pairs -> neutral value
        edge_percentage_norm[np.isnan(edge_percentage_norm)] = np.log10(1 + 0.1)
        interacts[key] = edge_percentage_norm

    # plot; BUGFIX: tick labels previously used interact_markers, which is the
    # literal string "all" in the default case — use marker_all instead.
    for f_key, interact_ in interacts.items():
        interact = interact_[marker_idx, :][:, marker_idx]
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(interact, center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1,
                         xticklabels=marker_all, yticklabels=marker_all)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(interact, center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=marker_all, yticklabels=marker_all, figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        plt.figure()
        sns.clustermap(
            interact[clustergrid.dendrogram_row.reordered_ind, :][:, clustergrid.dendrogram_row.reordered_ind],
            center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
            xticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
            yticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
            figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()
    return interacts, clustergrid
1074
+
1075
+ ###########################################################
1076
+ ######## Pheno-Graph clustering analysis functions ########
1077
+ ###########################################################
1078
+
1079
def clustering_phenograph(cohort_file, outdir, normq=75, feat_comb="all", k=None, save_vis=False, pheno_markers="all"):
    """Perform Pheno-Graph clustering for the cohort.

    Inputs:
        cohort_file = a .csv file including the whole cohort
        outdir = output saving directory, previously saved cytof_img class instances in .pkl files
        normq = q value for quantile normalization
        feat_comb = feature combination used for clustering; one of "all",
            "cell_sum", "cell_ave", "cell_sum_only", "cell_ave_only" (default "all")
        k = number of initial neighbors for Pheno-Graph (default None;
            if not provided, k = N / 100 where N is the total number of cells)
        save_vis = whether to save the visualization output (default False)
        pheno_markers = list of markers used in clustering (must be a subset of
            cytof_img.markers) or "all"
    Outputs:
        df_all = dataframe of features for all cells, with clustering output in
            column 'phenotype_total{n_community}' plus 2-D UMAP projections
        feat_names = feature columns used to run Pheno-Graph
        k = the k actually used
        pheno_name = name of the added cluster column
        vis_savedir = directory where visualizations were saved ("" if not saved)
        markers = the list of markers used (for visualization)
    """
    vis_savedir = ""
    feat_groups = {
        "all": ["cell_sum", "cell_ave", "cell_morphology"],
        "cell_sum": ["cell_sum", "cell_morphology"],
        "cell_ave": ["cell_ave", "cell_morphology"],
        "cell_sum_only": ["cell_sum"],
        "cell_ave_only": ["cell_ave"]
    }
    assert feat_comb in feat_groups.keys(), f"{feat_comb} not supported!"

    feat_name = f"_{normq}normed_scaled"
    n_attr = f"df_feature{feat_name}"

    dfs = {}
    cytof_ims = {}

    df_io = pd.read_csv(os.path.join(outdir, "input_output.csv"))
    df_slide_roi = pd.read_csv(cohort_file)  # read for parity with callers; not used below

    # load all scaled features in the cohort
    first_loaded = True
    for i in df_io.index:
        f_out = df_io.loc[i, "output_file"]
        f_roi = f_out.split('/')[-1].split('.pkl')[0]
        if not os.path.isfile(f_out):
            print("{} not found, skip".format(f_out))
            continue

        cytof_img = load_CytofImage(f_out)
        if first_loaded:
            # BUGFIX: previously keyed off "i == 0"; if the first output file
            # was missing (or the index didn't start at 0), dict_feat/markers
            # were never set and the code crashed below.
            dict_feat = cytof_img.features
            markers = cytof_img.markers
            first_loaded = False
        cytof_ims[f_roi] = cytof_img
        dfs[f_roi] = getattr(cytof_img, n_attr)

    # resolve the feature columns to cluster on
    feat_names = []
    for y in feat_groups[feat_comb]:
        if "morphology" in y:
            feat_names += dict_feat[y]
        else:
            if pheno_markers == "all":
                feat_names += dict_feat[y]
                pheno_markers = markers
            else:
                assert isinstance(pheno_markers, list)
                ids = [markers.index(x) for x in pheno_markers]
                feat_names += [dict_feat[y][x] for x in ids]

    # concatenate feature dataframes of all ROIs in the cohort
    df_all = pd.concat(list(dfs.values()))

    # set number of nearest neighbors k and run PhenoGraph
    k = k if k else int(df_all.shape[0] / 100)
    communities, graph, Q = phenograph.cluster(df_all[feat_names], k=k, n_jobs=-1)
    n_community = len(np.unique(communities))

    # Visualize: project to 2D with UMAP
    umap_2d = umap.UMAP(n_components=2, init='random', random_state=0)
    proj_2d = umap_2d.fit_transform(df_all[feat_names])

    print("Visualization in 2d - cohort")
    plt.figure(figsize=(4, 4))
    plt.title("cohort")
    sns.scatterplot(x=proj_2d[:, 0], y=proj_2d[:, 1], hue=communities, palette='tab20',
                    hue_order=np.arange(n_community))
    plt.axis('tight')
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    if save_vis:
        vis_savedir = os.path.join(outdir, "phenograph_{}_{}normed_{}".format(feat_comb, normq, k))
        if not os.path.exists(vis_savedir):
            os.makedirs(vis_savedir)
        plt.savefig(os.path.join(vis_savedir, "cluster_scatter.png"))
    plt.show()

    # attach clustering output and projection to df_all
    pheno_name = f'phenotype_total{n_community}'
    df_all[pheno_name] = communities
    df_all['{}_projx'.format(pheno_name)] = proj_2d[:, 0]
    df_all['{}_projy'.format(pheno_name)] = proj_2d[:, 1]
    return df_all, feat_names, k, pheno_name, vis_savedir, markers
1185
+
1186
+
1187
+ def _gather_roi_pheno(df_slide_roi, df_all):
1188
+ """Split whole df into df for each ROI"""
1189
+ pheno_roi = {}
1190
+
1191
+ for i in df_slide_roi.index:
1192
+ path_i = df_slide_roi.loc[i, "path"]
1193
+ roi_i = df_slide_roi.loc[i, "ROI"]
1194
+ f_in = os.path.join(path_i, roi_i)
1195
+ cond = df_all["filename"] == f_in
1196
+ pheno_roi[roi_i.replace(".txt", "")] = df_all.loc[cond, :]
1197
+ return pheno_roi
1198
+
1199
+
1200
def _vis_cell_phenotypes(df_feat, communities, n_community, markers, list_features, accumul_type="sum", savedir=None, savename=""):
    """Visualize cell phenotypes for a given dataframe of features.

    Builds a (n_community, len(markers)) matrix of per-cluster mean feature
    values, median-centers it per marker, and shows it as a heatmap.
    Assumes len(list_features) == len(markers) (one feature column per marker).

    Args:
        df_feat: dataframe of features.
        communities: cluster label per row of df_feat (may be a subset of the
            cohort's communities, but must align with df_feat).
        n_community: total number of communities in the cohort.
        markers: marker names shown on the heatmap x-axis.
        list_features: feature column names, aligned with markers.
        accumul_type: "sum" or "ave" (default "sum").
        savedir: if not None, the heatmap is saved under this directory.
        savename: filename stem for the saved plot.

    Returns:
        (cell_cluster, cell_cluster_norm): the raw per-cluster mean matrix and
        its median-centered form.
    """
    assert accumul_type in ["sum", "ave"], "Wrong accumulation type! Choose from 'sum' and 'ave'!"
    n_present = len(np.unique(communities))
    cell_cluster = np.zeros((n_community, len(markers)))
    for label in range(n_present):
        members = df_feat[communities == label]
        if members.shape[0] == 0:
            continue  # leave empty clusters as zeros
        for col, feat_col in enumerate(list_features):
            cell_cluster[label, col] = np.average(members[feat_col])

    # center each marker column at its median across clusters
    cell_cluster_norm = cell_cluster - np.median(cell_cluster, axis=0)
    sns.heatmap(cell_cluster_norm,
                cmap='magma',
                xticklabels=markers,
                yticklabels=np.arange(n_present))
    plt.xlabel("Markers - {}".format(accumul_type))
    plt.ylabel("Phenograph clusters")
    plt.title("normalized expression - cell {}".format(accumul_type))
    savename += "_cell_{}.png".format(accumul_type)
    if savedir is not None:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        plt.savefig(os.path.join(savedir, savename))
    plt.show()
    return cell_cluster, cell_cluster_norm
1240
+
1241
def vis_phenograph(df_slide_roi, df_all, pheno_name, markers, used_feat, level="cohort", accumul_type="sum",
                   to_save=False, savepath="./", vis_scatter=False):
    """Visualize Pheno-Graph clustering results at cohort, slide or ROI level.

    Args:
        df_slide_roi = dataframe with slide-ROI correspondence information
        df_all = dataframe with features and clustering results
        pheno_name = name (key) of the phenograph output column
        markers = a (minimal) list of markers used in Pheno-Graph (to visualize)
        used_feat = list of features used (consistent with columns in df_all)
        level = level to visualize: "cohort", "slide", or "roi" (default "cohort")
        accumul_type = type of feature accumulation used (default "sum")
        to_save = whether to save output (default False)
        savepath = visualization saving directory (default "./")
        vis_scatter = whether to also plot the 2-D projection scatter

    Returns:
        phenos: dict of per-key (cohort/slide/ROI) phenotype dataframes.
    """
    if to_save:
        if not os.path.exists(savepath):
            # BUGFIX: os.makedirs was referenced without being called
            os.makedirs(savepath)

    # feature columns matching the requested accumulation type
    ids = [i for (i, x) in enumerate(used_feat) if re.search(".{}".format(accumul_type), x)]
    list_feat = [used_feat[i] for i in ids]

    assert level in ["cohort", "slide", "roi"], "Only 'cohort', 'slide' or 'roi' levels are accepted!"

    n_community = len(df_all[pheno_name].unique())
    if level == "cohort":
        phenos = {level: df_all}
    else:
        phenos = _gather_roi_pheno(df_slide_roi, df_all)
        if level == "slide":
            # BUGFIX: this previously iterated df_io["Slide"], but df_io is not
            # defined in this function — use df_slide_roi like everywhere else.
            for slide in df_slide_roi["Slide"].unique():
                slide_started = False
                for roi_i in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                    f_roi = roi_i.replace(".txt", "")
                    if f_roi not in phenos:
                        continue
                    if not slide_started:
                        phenos[slide] = phenos[f_roi]
                        slide_started = True
                    else:
                        phenos[slide] = pd.concat([phenos[slide], phenos[f_roi]])
                    phenos.pop(f_roi)

    savename = ""
    savepath_ = None  # BUGFIX: previously undefined (NameError) when to_save is False
    for key, df_pheno in phenos.items():
        if to_save:
            savepath_ = os.path.join(savepath, level)
            savename = key
        communities = df_pheno[pheno_name]

        _vis_cell_phenotypes(df_pheno, communities, n_community, markers, list_feat, accumul_type,
                             savedir=savepath_, savename=savename)

        # visualize scatter (2-D projection)
        if vis_scatter:
            proj_2d = df_pheno[['{}_projx'.format(pheno_name), '{}_projy'.format(pheno_name)]].to_numpy()
            plt.figure(figsize=(4, 4))
            plt.title("cohort")
            sns.scatterplot(x=proj_2d[:, 0], y=proj_2d[:, 1], hue=communities, palette='tab20',
                            hue_order=np.arange(n_community))
            plt.axis('tight')
            plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
            if to_save:
                plt.savefig(os.path.join(savepath_, "scatter_{}.png".format(savename)))
            plt.show()
    return phenos
1320
+
1321
+
1322
+ import sklearn.neighbors
1323
+ from sklearn.neighbors import kneighbors_graph as skgraph
1324
+ from sklearn.metrics import DistanceMetric# from sklearn.neighbors import DistanceMetric
1325
+ from scipy import sparse as sp
1326
+ import networkx as nx
1327
+
1328
def _gather_roi_distances(df_slide_roi, outdir, name_pheno, thres_dist=50):
    """Per-ROI pairwise cell distances and phenotype interaction counts.

    For each ROI, computes the full Euclidean distance matrix between cells,
    the expected phenotype co-occurrence (product of per-cluster cell counts),
    and the number of cell pairs closer than thres_dist for each phenotype pair.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: directory containing "cytof_images/<slide>_<roi>.pkl" files.
        name_pheno: key of the phenograph result stored on each CytofImage.
        thres_dist: distance threshold (pixels) defining an interaction edge.

    Returns:
        dist_matrices: {roi: {"dist", "expected_percentage", "num_cell",
        "edge_nums"}}.
    """
    dist = DistanceMetric.get_metric('euclidean')
    dist_matrices = {}
    first_loaded = True  # take num_community from the first ROI that loads
    for f_roi in df_slide_roi['ROI'].unique():
        roi = f_roi.replace('.txt', '')
        slide = df_slide_roi.loc[df_slide_roi["ROI"] == f_roi, "Slide"].values[0]
        f_cytof_im = "{}_{}.pkl".format(slide, roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_sub = cytof_im.df_feature
        dist_matrices[roi] = {}
        dist_matrices[roi]['dist'] = dist.pairwise(df_sub.loc[:, ['coordinate_x', 'coordinate_y']].values)

        phenograph = getattr(cytof_im, 'phenograph')[name_pheno]
        cluster = phenograph['clusters'].values

        if first_loaded:
            # BUGFIX: previously keyed off "i == 0"; a missing first .pkl left
            # n_cluster undefined (NameError below).
            n_cluster = phenograph['num_community']
            first_loaded = False

        # expected co-occurrence: product of per-cluster cell counts
        expected_percentage = np.zeros((n_cluster, n_cluster))
        for _i in range(n_cluster):
            for _j in range(n_cluster):
                expected_percentage[_i, _j] = sum(cluster == _i) * sum(cluster == _j)
        dist_matrices[roi]['expected_percentage'] = expected_percentage
        dist_matrices[roi]['num_cell'] = len(df_sub)

        # observed: cell pairs closer than thres_dist, binned by phenotype pair
        edge_nums = np.zeros_like(expected_percentage)
        dist_matrix = dist_matrices[roi]['dist']
        n_cells = dist_matrix.shape[0]
        for _i in range(n_cells):
            for _j in range(n_cells):
                if dist_matrix[_i, _j] > 0 and dist_matrix[_i, _j] < thres_dist:
                    edge_nums[cluster[_i], cluster[_j]] += 1
        dist_matrices[roi]['edge_nums'] = edge_nums
    return dist_matrices
1368
+
1369
+
1370
def _gather_roi_kneighbor_graphs(df_slide_roi, outdir, name_pheno, k=8):
    """Per-ROI k-nearest-neighbor graphs and phenotype interaction counts.

    For each ROI, builds a k-NN graph on cell coordinates and counts, for each
    phenotype pair, how many graph edges link cells of those phenotypes.

    NOTE(review): the pickle filename here is "<roi>.pkl", while
    _gather_roi_distances uses "<slide>_<roi>.pkl" — confirm which naming the
    pipeline actually writes.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: directory containing "cytof_images/*.pkl" files.
        name_pheno: key of the phenograph result stored on each CytofImage.
        k: number of nearest neighbors per cell (default 8).

    Returns:
        graphs: {roi: {"I", "J", "V", "graph", "edge_nums",
        "expected_percentage", "num_cell"}} where (I, J, V) are the sparse
        edge endpoints (center, neighbor) and distances.
    """
    graphs = {}
    first_loaded = True  # take num_community from the first ROI that loads
    for f_roi in df_slide_roi['ROI'].unique():
        roi = f_roi.replace('.txt', '')
        f_cytof_im = "{}.pkl".format(roi)
        if f_cytof_im not in os.listdir(os.path.join(outdir, "cytof_images")):
            print("{} not found, skip".format(f_cytof_im))
            continue
        cytof_im = load_CytofImage(os.path.join(outdir, "cytof_images", f_cytof_im))
        df_sub = cytof_im.df_feature
        graph = skgraph(np.array(df_sub.loc[:, ['coordinate_x', 'coordinate_y']]), n_neighbors=k, mode='distance')
        I, J, V = sp.find(graph)

        graphs[roi] = {}
        graphs[roi]['I'] = I  # edge start (center cell)
        graphs[roi]['J'] = J  # edge end (neighbor cell)
        graphs[roi]['V'] = V  # edge length (distance)
        graphs[roi]['graph'] = graph

        phenograph = getattr(cytof_im, 'phenograph')[name_pheno]
        cluster = phenograph['clusters'].values

        if first_loaded:
            # BUGFIX: previously keyed off "i == 0"; a missing first .pkl left
            # n_cluster undefined (NameError below).
            n_cluster = phenograph['num_community']
            first_loaded = False

        # edge counts per phenotype pair
        edge_nums = np.zeros((n_cluster, n_cluster))
        for _i, _j in zip(I, J):
            edge_nums[cluster[_i], cluster[_j]] += 1
        graphs[roi]['edge_nums'] = edge_nums

        # expected co-occurrence: product of per-cluster cell counts
        expected_percentage = np.zeros((n_cluster, n_cluster))
        for _i in range(n_cluster):
            for _j in range(n_cluster):
                expected_percentage[_i, _j] = sum(cluster == _i) * sum(cluster == _j)
        graphs[roi]['expected_percentage'] = expected_percentage
        graphs[roi]['num_cell'] = len(df_sub)
    return graphs
1410
+
1411
+
1412
def interaction_analysis(df_slide_roi, outdir, name_pheno, method="distance", k=8, thres_dist=50, level="slide", clustergrid=None):
    """Phenotype-phenotype interaction analysis from spatial neighborhoods.

    Compares, per slide (or ROI), the observed fraction of neighbor edges
    between each phenotype pair against the co-occurrence expected under
    independence, plotted as log10(observed / expected + 0.1) heatmaps.

    NOTE(review): this redefines the marker-level `interaction_analysis`
    defined earlier in this module and shadows it at import time — consider
    renaming one of them.

    Args:
        df_slide_roi: dataframe with "Slide" and "ROI" columns.
        outdir: output directory containing "cytof_images/*.pkl".
        name_pheno: key of the phenograph result stored on each CytofImage.
        method: "distance" (threshold on pairwise distance) or "graph" (k-NN).
        k: neighbors per cell when method == "graph".
        thres_dist: distance threshold (pixels) when method == "distance".
        level: "slide" (pool ROIs per slide) or anything else for ROI level.
        clustergrid: optional seaborn ClusterGrid whose row order is reused.

    Returns:
        (interacts, clustergrid): normalized interaction matrices keyed by
        slide/ROI and the ClusterGrid used for ordering.
    """
    assert method in ["distance", "graph"], "Method can be either 'distance' or 'graph'!"

    if method == "distance":
        info = _gather_roi_distances(df_slide_roi, outdir, name_pheno, thres_dist)
    else:
        info = _gather_roi_kneighbor_graphs(df_slide_roi, outdir, name_pheno, k)

    interacts = {}
    if level == "slide":
        # Pool per-ROI statistics into per-slide entries, dropping ROI keys.
        for slide in df_slide_roi["Slide"].unique():
            slide_started = False
            for f_roi in df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]:
                roi = f_roi.replace(".txt", "")
                # BUGFIX: skipped ROIs (missing .pkl) previously raised KeyError
                if roi not in info:
                    continue
                if not slide_started:
                    # BUGFIX: initialization used to key off "seen_roi == 0",
                    # breaking when a slide's first ROI was skipped; .copy()
                    # avoids in-place "+=" mutating the first ROI's arrays.
                    info[slide] = {}
                    info[slide]['edge_nums'] = info[roi]['edge_nums'].copy()
                    info[slide]['expected_percentage'] = info[roi]['expected_percentage'].copy()
                    info[slide]['num_cell'] = info[roi]['num_cell']
                    slide_started = True
                else:
                    info[slide]['edge_nums'] += info[roi]['edge_nums']
                    info[slide]['expected_percentage'] += info[roi]['expected_percentage']
                    info[slide]['num_cell'] += info[roi]['num_cell']
                info.pop(roi)

    for key, item in info.items():
        edge_percentage = item['edge_nums'] / np.sum(item['edge_nums'])
        expected_percentage = item['expected_percentage'] / item['num_cell'] ** 2

        # log-ratio of observed vs expected; +0.1 keeps log10 finite at 0
        interact_norm = np.log10(edge_percentage / expected_percentage + 0.1)

        # 0/0 pairs -> neutral value
        interact_norm[np.isnan(interact_norm)] = np.log10(1 + 0.1)
        interacts[key] = interact_norm

    # plot one heatmap + clustered heatmap per slide/ROI
    for f_key, interact in interacts.items():
        plt.figure(figsize=(6, 6))
        ax = sns.heatmap(interact, center=np.log10(1 + 0.1),
                         cmap='RdBu_r', vmin=-1, vmax=1)
        ax.set_aspect('equal')
        plt.title(f_key)
        plt.show()

        if clustergrid is None:
            plt.figure()
            clustergrid = sns.clustermap(interact, center=np.log10(1 + 0.1),
                                         cmap='RdBu_r', vmin=-1, vmax=1,
                                         xticklabels=np.arange(interact.shape[0]),
                                         yticklabels=np.arange(interact.shape[0]),
                                         figsize=(6, 6))
            plt.title(f_key)
            plt.show()

        plt.figure()
        sns.clustermap(interact[clustergrid.dendrogram_row.reordered_ind, :]
                       [:, clustergrid.dendrogram_row.reordered_ind],
                       center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=1,
                       xticklabels=clustergrid.dendrogram_row.reordered_ind,
                       yticklabels=clustergrid.dendrogram_row.reordered_ind,
                       figsize=(6, 6), row_cluster=False, col_cluster=False)
        plt.title(f_key)
        plt.show()

    return interacts, clustergrid
cytof/hyperion_preprocess.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import pathlib
6
+ import skimage.io as skio
7
+ import warnings
8
+ from typing import Union, Optional, Type, Tuple, List
9
+ # from readimc import MCDFile
10
+
11
+ # from cytof.classes import CytofImage, CytofImageTiff
12
+
13
+ import sys
14
+ import platform
15
+ from pathlib import Path
16
+ FILE = Path(__file__).resolve()
17
+ ROOT = FILE.parents[0] # cytof root directory
18
+ if str(ROOT) not in sys.path:
19
+ sys.path.append(str(ROOT)) # add ROOT to PATH
20
+ if platform.system() != 'Windows':
21
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
22
+ from classes import CytofImage, CytofImageTiff
23
+
24
+ # ####################### Read data ########################
25
def cytof_read_data_roi(filename, slide="", roi=None, iltype="hwd", **kwargs) -> Tuple[CytofImage, list]:
    """ Read cytof data (.txt file) as a dataframe

    Dispatches on file extension: tabular formats (.txt tab-separated, .csv)
    become a CytofImage wrapping a dataframe; image formats (.tiff/.tif/.qptiff)
    become a CytofImageTiff wrapping a channels-last array.

    Inputs:
        filename = full filename of the cytof data (path-name-ext)
        slide    = slide identifier stored on the returned object (default="")
        roi      = ROI identifier; when None it is derived from the file's basename
        iltype   = image layout string (unused in this function body)
        kwargs   = optionally "X" and "Y", the names of the coordinate columns to
                   rename to "X"/"Y" in the dataframe case

    Returns:
        df_cytof = dataframe of the cytof data
        cols = column names of the dataframe, an empty list returned if not reading data from a dataframe

    :param filename: str
    :return df_cytof: pandas.core.frame.DataFrame
    """
    ext = pathlib.Path(filename).suffix
    assert len(ext) > 0, "Please provide a full file name with extension!"
    assert ext.upper() in ['.TXT', '.TIFF', '.TIF', '.CSV', '.QPTIFF'], "filetypes other than '.txt', '.tiff' or '.csv' are not (yet) supported."

    if ext.upper() in ['.TXT', '.CSV']:  # the case with a dataframe
        if ext.upper() == '.TXT':
            df_cytof = pd.read_csv(filename, sep='\t')  # pd.read_table(filename)
            if roi is None:
                roi = os.path.basename(filename).split('.txt')[0]
            # initialize an instance of CytofImage
            cytof_img = CytofImage(df_cytof, slide=slide, roi=roi, filename=filename)
        elif ext.upper() == '.CSV':
            df_cytof = pd.read_csv(filename)
            if roi is None:
                roi = os.path.basename(filename).split('.csv')[0]
            # initialize an instance of CytofImage
            cytof_img = CytofImage(df_cytof, slide=slide, roi=roi, filename=filename)
        # caller may pass the original coordinate column names via kwargs
        if "X" in kwargs and "Y" in kwargs:
            cytof_img.df.rename(columns={kwargs["X"]: "X", kwargs["Y"]: 'Y'}, inplace=True)
        cols = cytof_img.df.columns

    else:  # the case without a dataframe
        image = skio.imread(filename, plugin="tifffile")
        orig_img_shape = image.shape
        sorted_shape = np.sort(orig_img_shape)

        # roll the sorted shape by one to the left, so the smallest dimension
        # (assumed to be the channel axis) ends up last: (h, w, c)
        # ref: https://numpy.org/doc/stable/reference/generated/numpy.roll.html
        correct_shape = np.roll(sorted_shape, -1)

        # sometimes tiff could be square, this ensures images were correctly transposed
        orig_temp = list(orig_img_shape)  # tuple is immutable
        correct_index = []
        for shape in correct_shape:
            correct_index.append(orig_temp.index(shape))

            # placeholder, since shape can't = 0
            # NOTE(review): indentation reconstructed — this zeroing must run
            # inside the loop so repeated (square) dimensions map to distinct
            # axes; confirm against the original file.
            orig_temp[orig_temp.index(shape)] = 0
        image = image.transpose(correct_index)

        # create TIFF class cytof image
        cytof_img = CytofImageTiff(image, slide=slide, roi=roi, filename=filename)
        cols = []

    return cytof_img, cols
84
+
85
def cytof_read_data_mcd(filename, verbose=False):
    """Read an IMC .mcd file and wrap every acquisition (ROI) as a CytofImageTiff.

    Inputs:
        filename = full path to the .mcd file
        verbose  = if True, print slide / panorama / ROI metadata while reading

    Returns:
        cytof_imgs = dict mapping "<slide id>_<roi id>" to a CytofImageTiff

    :param filename: str
    :param verbose: bool
    :return cytof_imgs: dict

    NOTE: requires `from readimc import MCDFile` to be enabled at the top of
    this module (currently commented out there).
    """
    cytof_imgs = {}
    with MCDFile(filename) as f:
        if verbose:
            print("\n{}, \n\t{} slides, showing the 1st slide:".format(filename, len(f.slides)))

        for slide in f.slides:
            if verbose:
                print("\tslide ID: {}, description: {}, width: {} um, height: {}um".format(
                    slide.id,
                    slide.description,
                    slide.width_um,
                    slide.height_um)
                )
            # read the slide-level overview image
            im_slide = f.read_slide(slide)  # numpy array or None
            # guard: read_slide may return None when no slide image is stored
            if verbose and im_slide is not None:
                print("\n\tslide image shape: {}".format(im_slide.shape))

            # (optional) read the first panorama image, if any exist
            if slide.panoramas:
                panorama = slide.panoramas[0]
                if verbose:
                    print(
                        "\t{} panoramas, showing the 1st one. \n\tpanorama ID: {}, description: {}, width: {} um, height: {}um".format(
                            len(slide.panoramas),
                            panorama.id,
                            panorama.description,
                            panorama.width_um,
                            panorama.height_um)
                    )
                im_pano = f.read_panorama(panorama)  # numpy array
                if verbose:
                    print("\n\tpanorama image shape: {}".format(im_pano.shape))

            for roi in slide.acquisitions:  # for each acquisition (roi)
                im_roi = f.read_acquisition(roi)  # array, shape: (c, y, x), dtype: float32
                if verbose:
                    # BUG FIX: was `img_roi.shape` (undefined name); variable is `im_roi`
                    print("\troi {}, shape: {}".format(roi.id, im_roi.shape))
                # transpose (c, y, x) -> (y, x, c): channels-last layout
                cytof_img = CytofImageTiff(image=im_roi.transpose((1, 2, 0)),
                                           slide=slide.id,
                                           roi=roi.id,
                                           # BUG FIX: was `filename=raw_f` (undefined name)
                                           filename=filename)
                cytof_img.set_channels(roi.channel_names, roi.channel_labels)
                cytof_imgs["{}_{}".format(slide.id, roi.id)] = cytof_img
    return cytof_imgs
138
+
139
+
140
def cytof_preprocess(df):
    """ Preprocess cytof dataframe
    Every pair of X and Y values represent for a unique physical pixel locations in the original image
    The values for Xs and Ys should be continuous integers
    The missing pixels would be filled with 0

    Inputs:
        df = cytof dataframe

    Returns:
        df = preprocessed cytof dataframe with missing pixel values filled with 0

    :param df: pandas.core.frame.DataFrame
    :return df: pandas.core.frame.DataFrame
    """
    # full image extent implied by the 0-based X/Y coordinate columns
    height = max(df['Y'].values) + 1
    width = max(df['X'].values) + 1
    missing = height * width - len(df)
    if missing > 0:
        # append all-zero rows so every column can later reshape to (height, width)
        filler = pd.DataFrame(np.zeros((missing, len(df.columns)), dtype=int),
                              columns=df.columns)
        df = pd.concat([df, filler])
    return df
162
+
163
+
164
def cytof_check_channels(df, marker_names=None, xlim=None, ylim=None):
    """A visualization function to show different markers of a cytof image

    Inputs:
        df = preprocessed cytof dataframe
        marker_names = marker names to visualize, should match to column names in df (default=None)
        xlim = x-axis limit of output image (default=None)
        ylim = y-axis limit of output image (default=None)

    :param df: pandas.core.frame.DataFrame
    :param marker_names: list
    :param xlim: tuple
    :param ylim: tuple
    """
    if marker_names is None:
        # columns 0-5 are assumed to be metadata; markers start at column 6
        marker_names = list(df.columns[6:])
    nrow = max(df['Y'].values) + 1
    ncol = max(df['X'].values) + 1
    grid_cols = 5
    grid_rows = int(np.ceil(len(marker_names) / grid_cols))
    fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(3 * grid_cols, 3 * grid_rows))
    if grid_rows == 1:
        # keep 2-D indexing uniform when subplots returns a 1-D axes array
        axes = np.array([axes])
    for idx, marker in enumerate(marker_names):
        row, col = divmod(idx, grid_cols)
        image = df[marker].values.reshape(nrow, ncol)
        # rescale by the 99th percentile so hot pixels do not wash out the view
        image = np.clip(image / np.quantile(image, 0.99), 0, 1)
        axes[row, col].set_title(marker)
        if xlim is not None:
            image = image[:, xlim[0]:xlim[1]]
        if ylim is not None:
            image = image[ylim[0]:ylim[1], :]
        shown = axes[row, col].imshow(image, cmap="gray")
        fig.colorbar(shown, ax=axes[row, col])
    plt.show()
200
+
201
+
202
def remove_special_channels(self, channels):
    """Remove each named channel from the channel/marker/label lists and drop
    its column from the dataframe (in place)."""
    for name in channels:
        pos = self.channels.index(name)
        for bookkeeping in (self.channels, self.markers, self.labels):
            bookkeeping.pop(pos)
        self.df.drop(columns=name, inplace=True)
209
+
210
def define_special_channels(self, channels_dict):
    """Create new summed channels from existing marker channels.

    channels_dict maps a new channel name to a list of dicts, each with keys
    'marker_name' (source channel) and 'to_keep' (False removes the source
    channel after summing). The original dataframe is preserved on self.df_orig.
    """
    # create a copy of original dataframe
    self.df_orig = self.df.copy()
    for new_name, old_names in channels_dict.items():
        print(new_name)
        if len(old_names) == 0:
            continue
        # keep only the entries whose source marker actually exists
        usable = []
        for entry in old_names:
            if entry['marker_name'] not in self.channels:
                warnings.warn('{} is not available!'.format(entry['marker_name']))
            else:
                usable.append(entry)
        if len(usable) > 0:
            for pos, entry in enumerate(usable):
                marker = entry['marker_name']
                # first source initializes the column; the rest accumulate
                if pos == 0:
                    self.df[new_name] = self.df[marker]
                else:
                    self.df[new_name] += self.df[marker]
                if not entry['to_keep']:
                    # Remove the unwanted source channel everywhere
                    idx = self.channels.index(marker)
                    self.channels.pop(idx)
                    self.markers.pop(idx)
                    self.labels.pop(idx)
                    self.df.drop(columns=marker, inplace=True)
            self.channels.append(new_name)
237
+
238
+
239
def cytof_txt2img(df, marker_names):
    """ Convert from cytof dataframe to d-dimensional image, where d=length of marker names
    Each channel of the output image correspond to the pixel intensity of the corresponding marker

    Inputs:
        df = cytof dataframe
        marker_names = markers to take into consideration

    Returns:
        out_img = d-dimensional image

    :param df: pandas.core.frame.DataFrame
    :param marker_names: list
    :return out_img: numpy.ndarray
    """
    requested = len(marker_names)
    # silently drop markers not present in the dataframe, but warn about it
    marker_names = [m for m in marker_names if m in df.columns.values]
    nc = len(marker_names)
    if nc != requested:
        warnings.warn("{} markers selected instead of {}".format(nc, requested))
    nrow = max(df['Y'].values) + 1
    ncol = max(df['X'].values) + 1
    print("Output image shape: [{}, {}, {}]".format(nrow, ncol, nc))
    out_image = np.zeros([nrow, ncol, nc], dtype=float)
    # one image plane per marker, rows ordered by Y then X
    for plane, marker in enumerate(marker_names):
        out_image[..., plane] = df[marker].values.reshape(nrow, ncol)
    return out_image
266
+
267
+
268
def cytof_merge_channels(im_cytof: np.ndarray,
                         channel_names: List,
                         channel_ids: List = None,
                         channels: List = None,
                         quantiles: List = None,
                         visualize: bool = False):
    """ Merge selected channels (given by "channel_ids") of raw cytof image and generate a RGB image

    The first three selected channels map directly to red / green / blue; the
    4th-6th map to cyan / magenta / yellow by adding into two RGB planes each.

    Inputs:
        im_cytof = raw cytof image
        channel_names = a list of names correspond to all channels of the im_cytof
        channel_ids = the indices of channels to show, no more than 6 channels can be shown the same time (default=None)
        channels = the names of channels to show, no more than 6 channels can be shown the same time (default=None)
                   Either "channel_ids" or "channels" should be provided
        quantiles = the quantile values for each channel defined by channel_ids (default=None)
        visualize = a flag indicating whether print the visualization on screen

    Returns:
        merged_im = channel merged image (uint8)
        quantiles = the quantile values for each channel defined by channel_ids

    :param im_cytof: numpy.ndarray
    :param channel_names: list
    :param channel_ids: list
    :param channels: list
    :param quantiles: list
    :return merged_im: numpy.ndarray
    :return quantiles: list
    """
    assert len(channel_names) == im_cytof.shape[-1], 'The length of "channel_names" does not match the image size!'
    assert channel_ids or channels, 'At least one should be provided, either "channel_ids" or "channels"!'
    if channel_ids is None:
        channel_ids = [channel_names.index(n) for n in channels]
    assert len(channel_ids) <= 6, "No more than 6 channels can be visualized simultaneously!"
    if len(channel_ids) > 3:
        warnings.warn(
            "Visualizing more than 3 channels the same time results in deteriorated visualization. \
            It is not recommended!")

    full_colors = ['red', 'green', 'blue', 'cyan', 'magenta', 'yellow']

    info = [f"{marker} in {c}\n" for (marker, c) in \
            zip([channel_names[i] for i in channel_ids], full_colors[:len(channel_ids)])]
    print(f"Visualizing... \n{''.join(info)}")
    merged_im = np.zeros((im_cytof.shape[0], im_cytof.shape[1], 3))
    if quantiles is None:
        # 99th percentile per channel as the saturation point
        quantiles = [np.quantile(im_cytof[..., cid], 0.99) for cid in channel_ids]

    # first (up to) three channels map one-to-one onto the R, G, B planes
    for pos in range(min(len(channel_ids), 3)):
        merged_im[..., pos] = np.clip(im_cytof[..., channel_ids[pos]] / quantiles[pos], 0, 1) * 255

    # channels 4-6 are rendered cyan / magenta / yellow: each adds into two planes
    extra_planes = [[1, 2], [0, 2], [0, 1]]
    for extra, cid in enumerate(channel_ids[3:]):
        for plane in extra_planes[extra]:
            merged_im[..., plane] += np.clip(im_cytof[..., cid] / quantiles[extra + 3], 0, 1) * 255  # /2
            merged_im[..., plane] = np.clip(merged_im[..., plane], 0, 255)
    merged_im = merged_im.astype(np.uint8)
    if visualize:
        plt.imshow(merged_im)
        plt.show()
    return merged_im, quantiles
333
+
334
+
335
+
cytof/hyperion_segmentation.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import scipy
2
+ import skimage
3
+ from skimage import feature
4
+ import numpy as np
5
+ import matplotlib.pyplot as plt
6
+ from skimage.color import label2rgb
7
+ from skimage.segmentation import mark_boundaries
8
+
9
+ import os
10
+ import sys
11
+ import platform
12
+ from pathlib import Path
13
+ FILE = Path(__file__).resolve()
14
+ ROOT = FILE.parents[0] # cytof root directory
15
+ if str(ROOT) not in sys.path:
16
+ sys.path.append(str(ROOT)) # add ROOT to PATH
17
+ if platform.system() != 'Windows':
18
+ ROOT = Path(os.path.relpath(ROOT, Path.cwd())) # relative
19
+ from segmentation_functions import generate_mask, normalize
20
+
21
+ # from cytof.segmentation_functions import generate_mask, normalize
22
+
23
+
24
def cytof_nuclei_segmentation(im_nuclei, show_process=False, size_hole=50, size_obj=7,
                              start_coords=(0, 0), side=100, colors=[], min_distance=2,
                              fg_marker_dilate=2, bg_marker_dilate=2
                              ):
    """ Segment nuclei based on the input nuclei image

    Pipeline: threshold (generate_mask) -> hole/object cleanup -> distance
    transform -> local-maxima seed markers -> watershed on the image gradient.

    Inputs:
        im_nuclei = raw cytof image correspond to nuclei, size=(h, w)
        show_process = flag of whether show the process (default=False)
        size_hole = size of the hole to be removed (default=50)
        size_obj = size of the small objects to be removed (default=7)
        start_coords = the starting (x,y) coordinates of visualizing process (default=(0,0))
        side = the side length of visualizing process (default=100)
        colors = a list of colors used to visualize segmentation results (default=[])
        min_distance = minimum pixel separation between local maxima used as
                       foreground seeds (default=2)
        fg_marker_dilate = disk radius used to dilate the foreground seed markers (default=2)
        bg_marker_dilate = disk radius used to erode the background marker (default=2)
    Returns:
        labels = nuclei segmentation result, where background is represented by 1, size=(h, w)
        colors = the list of colors used to visualize segmentation results

    :param im_nuclei: numpy.ndarray
    :param show_process: bool
    :param size_hole: int
    :param size_obj: int
    :param start_coords: int
    :return labels: numpy.ndarray
    :return colors: list
    """
    # NOTE: mutable default colors=[] is only rebuilt here, never mutated in
    # place, so the shared-default pitfall does not apply.
    if len(colors) == 0:
        cmap_set3 = plt.get_cmap("Set3")
        cmap_tab20c = plt.get_cmap("tab20c")
        colors = [cmap_tab20c.colors[_] for _ in range(len(cmap_tab20c.colors))] + \
                 [cmap_set3.colors[_] for _ in range(len(cmap_set3.colors))]

    x0, y0 = start_coords
    # clip intensities at the 95th percentile before thresholding so hot
    # pixels do not dominate the mask
    mask = generate_mask(np.clip(im_nuclei, 0, np.quantile(im_nuclei, 0.95)), fill_hole=False, use_watershed=False)
    mask = skimage.morphology.remove_small_holes(mask.astype(bool), size_hole)
    mask = skimage.morphology.remove_small_objects(mask.astype(bool), size_obj)
    if show_process:
        plt.figure(figsize=(4, 4))
        plt.imshow(mask[x0:x0 + side, y0:y0 + side], cmap='gray')
        plt.show()

    # Find and count local maxima
    distance = scipy.ndimage.distance_transform_edt(mask)
    distance = scipy.ndimage.gaussian_filter(distance, 1)
    local_maxi_idx = skimage.feature.peak_local_max(distance, exclude_border=False, min_distance=min_distance,
                                                    labels=None)
    # peak_local_max returns coordinates; convert to a boolean peak image
    local_maxi = np.zeros_like(distance, dtype=bool)
    local_maxi[tuple(local_maxi_idx.T)] = True
    markers = scipy.ndimage.label(local_maxi)[0]
    markers = markers > 0
    markers = skimage.morphology.dilation(markers, skimage.morphology.disk(fg_marker_dilate))
    markers = skimage.morphology.label(markers)
    # shift foreground labels up by one so label 1 can be the background marker
    markers[markers > 0] = markers[markers > 0] + 1
    # eroded inverse of the mask contributes the background (value 1) marker
    markers = markers + skimage.morphology.erosion(1 - mask, skimage.morphology.disk(bg_marker_dilate))

    # Another watershed
    temp_im = skimage.util.img_as_ubyte(normalize(np.clip(im_nuclei, 0, np.quantile(im_nuclei, 0.95))))
    gradient = skimage.filters.rank.gradient(temp_im, skimage.morphology.disk(3))
    # gradient = skimage.filters.rank.gradient(normalize(np.clip(im_nuclei, 0, np.quantile(im_nuclei, 0.95))),
    #                                          skimage.morphology.disk(3))
    labels = skimage.segmentation.watershed(gradient, markers)
    labels = skimage.morphology.closing(labels)
    labels_rgb = label2rgb(labels, bg_label=1, colors=colors)
    labels_rgb[labels == 1, ...] = (0, 0, 0)

    if show_process:
        fig, axes = plt.subplots(3, 2, figsize=(8, 12), sharex=False, sharey=False)
        ax = axes.ravel()
        ax[0].set_title("original grayscale")
        ax[0].imshow(np.clip(im_nuclei[x0:x0 + side, y0:y0 + side], 0, np.quantile(im_nuclei, 0.95)),
                     interpolation='nearest')
        ax[1].set_title("markers")
        ax[1].imshow(label2rgb(markers[x0:x0 + side, y0:y0 + side], bg_label=1, colors=colors),
                     interpolation='nearest')
        ax[2].set_title("distance")
        ax[2].imshow(-distance[x0:x0 + side, y0:y0 + side], cmap=plt.cm.nipy_spectral, interpolation='nearest')
        ax[3].set_title("gradient")
        ax[3].imshow(gradient[x0:x0 + side, y0:y0 + side], interpolation='nearest')
        ax[4].set_title("Watershed Labels")
        ax[4].imshow(labels_rgb[x0:x0 + side, y0:y0 + side, :], interpolation='nearest')
        ax[5].set_title("Watershed Labels")
        ax[5].imshow(labels_rgb, interpolation='nearest')
        plt.show()

    return labels, colors
110
+
111
+
112
def cytof_cell_segmentation(nuclei_seg, radius=5, membrane_channel=None, show_process=False,
                            start_coords=(0, 0), side=100, colors=[]):
    """ Cell segmentation based on nuclei segmentation; membrane-guided cell segmentation if membrane_channel provided.
    Inputs:
        nuclei_seg = an index image containing nuclei instance segmentation information, where the background is
                     represented by 1, size=(h,w). Typically, the output of calling the cytof_nuclei_segmentation
                     function.
        radius = assumed radius of cells (default=5)
        membrane_channel = membrane image channel of original cytof image (default=None)
        show_process = a flag indicating whether or not showing the segmentation process (default=False)
        start_coords = the starting (x,y) coordinates of visualizing process (default=(0,0))
        side = the side length of visualizing process (default=100)
        colors = a list of colors used to visualize segmentation results (default=[])
    Returns:
        labels = an index image containing cell instance segmentation information, where the background is
                 represented by 1
        colors = the list of colors used to visualize segmentation results

    :param nuclei_seg: numpy.ndarray
    :param radius: int
    :param membrane_channel: numpy.ndarray
    :param show_process: bool
    :param start_coords: tuple
    :param side: int
    :return labels: numpy.ndarray
    :return colors: list
    """

    if len(colors) == 0:
        cmap_set3 = plt.get_cmap("Set3")
        cmap_tab20c = plt.get_cmap("tab20c")
        colors = [cmap_tab20c.colors[_] for _ in range(len(cmap_tab20c.colors))] + \
                 [cmap_set3.colors[_] for _ in range(len(cmap_set3.colors))]

    x0, y0 = start_coords

    ## nuclei segmentation -> nuclei mask (labels > 1 are nuclei; 1 is background)
    nuclei_mask = nuclei_seg > 1
    if show_process:
        nuclei_bg = nuclei_seg.min()
        fig, ax = plt.subplots(1, 2, figsize=(8, 4))
        nuclei_seg_vis = label2rgb(nuclei_seg[x0:x0 + side, y0:y0 + side], bg_label=nuclei_bg, colors=colors)
        nuclei_seg_vis[nuclei_seg[x0:x0 + side, y0:y0 + side] == nuclei_bg, ...] = (0, 0, 0)

        ax[0].imshow(nuclei_seg_vis), ax[0].set_title('nuclei segmentation')
        ax[1].imshow(nuclei_mask[x0:x0 + side, y0:y0 + side], cmap='gray'), ax[1].set_title('nuclei mask')

    if membrane_channel is not None:
        # threshold the (hot-pixel-clipped) membrane channel into a mask
        membrane_mask = generate_mask(np.clip(membrane_channel, 0, np.quantile(membrane_channel, 0.95)),
                                      fill_hole=False, use_watershed=False)
        if show_process:
            # visualize
            nuclei_membrane = np.zeros((membrane_mask.shape[0], membrane_mask.shape[1], 3), dtype=np.uint8)
            nuclei_membrane[..., 0] = nuclei_mask * 255
            nuclei_membrane[..., 1] = membrane_mask

            fig, ax = plt.subplots(1, 2, figsize=(8, 4))
            ax[0].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray'), ax[0].set_title('membrane mask')
            ax[1].imshow(nuclei_membrane[x0:x0 + side, y0:y0 + side]), ax[1].set_title('nuclei - membrane')

        # postprocess raw membrane mask
        membrane_mask_close = skimage.morphology.closing(membrane_mask, skimage.morphology.disk(1))
        membrane_mask_open = skimage.morphology.opening(membrane_mask_close, skimage.morphology.disk(1))
        membrane_mask_erode = skimage.morphology.erosion(membrane_mask_open, skimage.morphology.disk(3))

        # Find skeleton (thin membrane ridges outside the nuclei)
        membrane_for_skeleton = (membrane_mask_open > 0) & (nuclei_mask == False)
        membrane_skeleton = skimage.morphology.skeletonize(membrane_for_skeleton)
        '''print(membrane_skeleton)
        print(membrane_mask_erode)'''
        membrane_mask = membrane_mask_erode
        membrane_mask_2 = (membrane_mask_erode > 0) | membrane_skeleton

        if show_process:
            fig, axs = plt.subplots(1, 4, figsize=(16, 4))
            axs[0].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
            axs[0].set_title('raw membrane mask')
            axs[1].imshow(membrane_mask_close[x0:x0 + side, y0:y0 + side], cmap='gray')
            axs[1].set_title('membrane mask - closed')
            axs[2].imshow(membrane_mask_open[x0:x0 + side, y0:y0 + side], cmap='gray')
            axs[2].set_title('membrane mask - opened')
            axs[3].imshow(membrane_mask_erode[x0:x0 + side, y0:y0 + side], cmap='gray')
            axs[3].set_title('membrane mask - erosion')
            plt.show()

            fig, axs = plt.subplots(1, 3, figsize=(12, 4))
            axs[0].imshow(membrane_skeleton[x0:x0 + side, y0:y0 + side], cmap='gray')
            axs[0].set_title('skeleton')
            axs[1].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
            axs[1].set_title('membrane mask (final)')
            axs[2].imshow(membrane_mask_2[x0:x0 + side, y0:y0 + side], cmap='gray')
            axs[2].set_title('membrane mask 2')
            plt.show()

            # overlap and visualize
            nuclei_membrane = np.zeros((membrane_mask.shape[0], membrane_mask.shape[1], 3), dtype=np.uint8)
            nuclei_membrane[..., 0] = nuclei_mask * 255
            nuclei_membrane[..., 1] = membrane_mask
            fig, ax = plt.subplots(1, 2, figsize=(8, 4))
            ax[0].imshow(membrane_mask[x0:x0 + side, y0:y0 + side], cmap='gray'), ax[0].set_title('membrane mask')
            ax[1].imshow(nuclei_membrane[x0:x0 + side, y0:y0 + side]), ax[1].set_title('nuclei - membrane')

    # dilate nuclei mask by radius (approximate cell extent around each nucleus)
    dilate_nuclei_mask = skimage.morphology.dilation(nuclei_mask, skimage.morphology.disk(radius))
    if show_process:
        fig, axs = plt.subplots(1, 3, figsize=(12, 4))
        axs[0].imshow(nuclei_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
        axs[0].set_title('nuclei mask')
        axs[1].imshow(dilate_nuclei_mask[x0:x0 + side, y0:y0 + side], cmap='gray')
        axs[1].set_title('dilated nuclei mask')
        if membrane_channel is not None:
            axs[2].imshow(membrane_mask[x0:x0 + side, y0:y0 + side] > 0, cmap='gray')
            axs[2].set_title('membrane mask')

    # define sure foreground, sure background, and unknown region
    sure_fg = nuclei_mask.copy()  # nuclei mask defines sure foreground

    # dark region in dilated nuclei mask (dilate_nuclei_mask == False) OR bright region in cell mask (cell_mask > 0)
    # defines sure background
    if membrane_channel is not None:
        sure_bg = ((membrane_mask > 0) | (dilate_nuclei_mask == False)) & (sure_fg == False)
        sure_bg2 = ((membrane_mask_2 > 0) | (dilate_nuclei_mask == False)) & (sure_fg == False)
    else:
        sure_bg = (dilate_nuclei_mask == False) & (sure_fg == False)

    # pixels that are neither sure foreground nor sure background
    unknown = np.logical_not(np.logical_or(sure_fg, sure_bg))

    if show_process:
        fig, axs = plt.subplots(1, 4, figsize=(16, 4))
        axs[0].imshow(sure_fg[x0:x0 + side, y0:y0 + side], cmap='gray')
        axs[0].set_title('sure fg')
        axs[1].imshow(sure_bg[x0:x0 + side, y0:y0 + side], cmap='gray')
        if membrane_channel is not None:
            axs[1].set_title('sure bg: membrane | not (dilated nuclei)')
        else:
            axs[1].set_title('sure bg: not (dilated nuclei)')
        axs[2].imshow(unknown[x0:x0 + side, y0:y0 + side], cmap='gray')
        axs[2].set_title('unknown')

        # visualize in a RGB image
        fg_bg_un = np.zeros((unknown.shape[0], unknown.shape[1], 3), dtype=np.uint8)
        fg_bg_un[..., 0] = sure_fg * 255  # sure foreground - red
        fg_bg_un[..., 1] = sure_bg * 255  # sure background - green
        fg_bg_un[..., 2] = unknown * 255  # unknown - blue
        axs[3].imshow(fg_bg_un[x0:x0 + side, y0:y0 + side])
        plt.show()

    ## Euclidean distance transform: distance to the closest zero pixel for each pixel of the input image.
    if membrane_channel is not None:
        # signed distance: negative near sure background, positive near nuclei
        distance_bg = -scipy.ndimage.distance_transform_edt(1 - sure_bg2)
        distance_fg = scipy.ndimage.distance_transform_edt(1 - sure_fg)
        distance = distance_bg+distance_fg
    else:
        distance = scipy.ndimage.distance_transform_edt(1 - sure_fg)
        # NOTE(review): indentation reconstructed — smoothing is applied only
        # in this membrane-free branch here; confirm against the original file.
        distance = scipy.ndimage.gaussian_filter(distance, 1)

    # watershed: seed with nuclei labels, let the unknown ring be flooded
    markers = nuclei_seg.copy()
    markers[unknown] = 0
    if show_process:
        fig, axs = plt.subplots(1, 2, figsize=(8, 4))
        axs[0].set_title("markers")
        axs[0].imshow(label2rgb(markers[x0:x0 + side, y0:y0 + side], bg_label=1, colors=colors),
                      interpolation='nearest')
        axs[1].set_title("distance")
        im = axs[1].imshow(distance[x0:x0 + side, y0:y0 + side], cmap=plt.cm.nipy_spectral, interpolation='nearest')
        plt.colorbar(im, ax=axs[1])
    labels = skimage.segmentation.watershed(distance, markers)
    if show_process:
        fig, axs = plt.subplots(1, 4, figsize=(16, 4))
        axs[0].imshow(unknown[x0:x0 + side, y0:y0 + side])
        axs[0].set_title('cytoplasm')  # , cmap=cmap, interpolation='nearest'

        nuclei_lb = label2rgb(nuclei_seg, bg_label=1, colors=colors)
        nuclei_lb[nuclei_seg == 1, ...] = (0, 0, 0)
        axs[1].imshow(nuclei_lb)  # , cmap=cmap, interpolation='nearest')
        axs[1].set_xlim(x0, x0 + side - 1), axs[1].set_ylim(y0 + side - 1, y0)
        axs[1].set_title('nuclei')

        cell_lb = label2rgb(labels, bg_label=1, colors=colors)
        cell_lb[labels == 1, ...] = (0, 0, 0)
        axs[2].imshow(cell_lb)  # , cmap=cmap, interpolation='nearest')
        axs[2].set_title('cells')
        axs[2].set_xlim(x0, x0 + side - 1), axs[2].set_ylim(y0 + side - 1, y0)

        # NOTE(review): the first assignment is immediately overwritten below —
        # looks like leftover experimentation; kept unchanged.
        merge_lb = cell_lb.copy()
        merge_lb = cell_lb ** 2
        merge_lb[nuclei_mask == 1, ...] = np.clip(nuclei_lb[nuclei_mask == 1, ...].astype(float) * 1.2, 0, 1)
        axs[3].imshow(merge_lb)
        axs[3].set_title('nuclei-cells')
        axs[3].set_xlim(x0, x0 + side - 1), axs[3].set_ylim(y0 + side - 1, y0)
        plt.show()
    return labels, colors
305
+
306
+
307
def visualize_segmentation(raw_image, channels, seg, channel_ids, bound_color=(1, 1, 1), bound_mode='inner', show=True, bg_label=0):

    """ Visualize segmentation results with boundaries
    Inputs:
        raw_image = raw cytof image
        channels = a list of channels correspond to each channel in raw_image
        seg = instance segmentation result (index image)
        channel_ids = indices of desired channels to visualize results
        bound_color = desired color in RGB to show boundaries (default=(1,1,1), white color)
        bound_mode = the mode for finding boundaries, string in {'thick', 'inner', 'outer', 'subpixel'}.
                     (default="inner"). For more details, see
                     [skimage.segmentation.mark_boundaries](https://scikit-image.org/docs/stable/api/skimage.segmentation.html)
        show = a flag indicating whether or not print result image on screen
        bg_label = label value treated as background by mark_boundaries (default=0)
    Returns:
        marked_image
    :param raw_image: numpy.ndarray
    :param seg: numpy.ndarray
    :param channel_ids: int
    :param bound_color: tuple
    :param bound_mode: string
    :param show: bool
    :return marked_image
    """
    # local import avoids a circular dependency with the preprocessing module
    from cytof.hyperion_preprocess import cytof_merge_channels

    # build the RGB backdrop, then draw instance boundaries on top of it
    # ref: https://scikit-image.org/docs/stable/api/skimage.segmentation.html#skimage.segmentation.mark_boundaries
    rgb_backdrop, _ = cytof_merge_channels(raw_image, channels, channel_ids)
    marked_image = mark_boundaries(rgb_backdrop, seg, mode=bound_mode,
                                   color=bound_color, background_label=bg_label)
    if show:
        plt.figure(figsize=(8, 8))
        plt.imshow(marked_image)
        plt.show()
    return marked_image
341
+
cytof/segmentation_functions.py ADDED
@@ -0,0 +1,815 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Functions for nuclei segmentation in Kaggle PANDA challenge
2
+
3
+ import numpy as np
4
+ import matplotlib.image as mpimg
5
+ import matplotlib.pyplot as plt
6
+ from sklearn import preprocessing
7
+ import math
8
+ import scipy.misc as misc
9
+ import cv2
10
+ import skimage
11
+ from skimage import measure
12
+ from skimage import img_as_bool, io, color, morphology, segmentation
13
+ from skimage.morphology import binary_closing, binary_opening, disk, closing, opening
14
+ from PIL import Image
15
+
16
+ import time
17
+ import re
18
+ import sys
19
+ import os
20
+ # import openslide
21
+ # from openslide import open_slide, ImageSlide
22
+ import matplotlib.pyplot as plt
23
+
24
+ import pandas as pd
25
+ import xml.etree.ElementTree as ET
26
+ from skimage.draw import polygon
27
+ import random
28
+
29
+
30
+ #####################################################################
31
+ # Functions for color deconvolution
32
+ #####################################################################
33
def normalize(mat, quantile_low=0, quantile_high=1):
    """Min-max normalize *mat* (any dimension) between two quantiles.

    :param mat: numpy array of any shape.
    :param quantile_low: quantile used as the minimum (default 0 == min).
    :param quantile_high: quantile used as the maximum (default 1 == max).
    :return: array of the same shape, mapped so that the low quantile -> 0
        and the high quantile -> 1 (values outside fall outside [0, 1]).
    """
    low, high = np.quantile(mat, [quantile_low, quantile_high])
    return (mat - low) / (high - low)
38
+
39
+
40
def convert_to_optical_densities(img_RGB, r0=255, g0=255, b0=255):
    """Convert an RGB image to optical densities (same shape as the input).

    :param img_RGB: h*w*3 array; channel order R, G, B.
    :param r0, g0, b0: per-channel reference (blank) intensities.
    :return: h*w*3 float array of optical densities; a small epsilon avoids
        log(0) for fully dark pixels.
    """
    scaled = img_RGB.astype(float) / np.array([r0, g0, b0], dtype=float)
    return -np.log(scaled + 0.00001)
47
+
48
+
49
def channel_deconvolution(img_RGB, staining_type, plot_image=False, to_normalize=True):
    """Deconvolute RGB image into different staining channels.
    Ref: https://blog.bham.ac.uk/intellimic/g-landini-software/colour-deconvolution/

    Args:
        img_RGB: A uint8 numpy array with RGB channels.
        staining_type: Dyes used to stain the image; choose one from ("HDB", "HRB", "HDR", "HEB").
        plot_image: Set True if want to real-time display results. Default is False.
        to_normalize: Set False to return the raw deconvoluted channels instead of
            per-channel min-max normalized ones. Default is True.

    Returns:
        An unnormalized h*w*3 deconvoluted matrix and 3 different channels
        normalized to [0, 1] separately (or raw if to_normalize is False).

    Raises:
        Exception: An error occurred if staining_type is not defined.
    """
    # Each row of stain_OD is the optical-density (absorbance) vector of one dye.
    if staining_type == "HDB":
        channels = ("Hematoxylin", "DAB", "Background")
        stain_OD = np.asarray([[0.650, 0.704, 0.286], [0.268, 0.570, 0.776], [0.754, 0.077, 0.652]])
    elif staining_type == "HRB":
        channels = ("Hematoxylin", "Red", "Background")
        stain_OD = np.asarray([[0.650, 0.704, 0.286], [0.214, 0.851, 0.478], [0.754, 0.077, 0.652]])
    elif staining_type == "HDR":
        channels = ("Hematoxylin", "DAB", "Red")
        stain_OD = np.asarray([[0.650, 0.704, 0.286], [0.268, 0.570, 0.776], [0.214, 0.851, 0.478]])
    elif staining_type == "HEB":
        channels = ("Hematoxylin", "Eosin", "Background")
        # stain_OD = np.asarray([[0.550,0.758,0.351],[0.398,0.634,0.600],[0.754,0.077,0.652]])
        stain_OD = np.asarray([[0.644211, 0.716556, 0.266844], [0.092789, 0.964111, 0.283111], [0.754, 0.077, 0.652]])
    else:
        raise Exception("Staining type not defined. Choose one from the following: HDB, HRB, HDR, HEB.")

    # Stain absorbance matrix normalization (each dye vector to unit length)
    normalized_stain_OD = []
    for r in stain_OD:
        normalized_stain_OD.append(r / np.linalg.norm(r))
    normalized_stain_OD = np.asarray(normalized_stain_OD)
    stain_OD_inverse = np.linalg.inv(normalized_stain_OD)

    # Calculate optical density of input image
    OD = convert_to_optical_densities(img_RGB, 255, 255, 255)

    # Deconvolution: project each pixel's OD onto the inverse stain basis
    img_deconvoluted = np.reshape(np.dot(np.reshape(OD, (-1, 3)), stain_OD_inverse), OD.shape)

    # Define each channel
    if to_normalize:
        channel1 = normalize(img_deconvoluted[:, :, 0])  # First dye
        channel2 = normalize(img_deconvoluted[:, :, 1])  # Second dye
        channel3 = normalize(img_deconvoluted[:, :, 2])  # Third dye or background
    else:
        channel1 = img_deconvoluted[:, :, 0]  # First dye
        channel2 = img_deconvoluted[:, :, 1]  # Second dye
        channel3 = img_deconvoluted[:, :, 2]  # Third dye or background

    if plot_image:
        # 'box-forced' was deprecated in matplotlib 2.2 and removed in 3.x;
        # 'box' is the supported equivalent.
        fig, axes = plt.subplots(2, 2, figsize=(15, 15), sharex=True, sharey=True,
                                 subplot_kw={'adjustable': 'box'})
        ax = axes.ravel()
        ax[0].imshow(img_RGB)
        ax[0].set_title("Original image")
        ax[1].imshow(channel1, cmap="gray")
        ax[1].set_title(channels[0])
        ax[2].imshow(channel2, cmap="gray")
        ax[2].set_title(channels[1])
        ax[3].imshow(channel3, cmap="gray")
        ax[3].set_title(channels[2])
        plt.show()

    return img_deconvoluted, channel1, channel2, channel3
118
+
119
+
120
+ ##################################################################
121
+ # Functions for morphological operations
122
+ ##################################################################
123
def make_8UC(mat, normalized=True):
    """Convert the matrix to an equivalent unsigned 8-bit integer matrix.

    :param mat: numeric array; expected in [0, 1] when normalized is True.
    :param normalized: set False to min-max normalize before scaling to 0..255.
    :return: uint8 array of the same shape.
    """
    source = mat.copy() * 255 if normalized else normalize(mat) * 255
    return np.array(source, dtype=np.uint8)
130
+
131
+
132
def make_8UC3(mat, normalized=True):
    """Convert the matrix to uint8 and replicate it into 3 identical channels.

    :param mat: 2D numeric array.
    :param normalized: forwarded to make_8UC.
    :return: h*w*3 uint8 array (grayscale stacked as RGB).
    """
    gray = make_8UC(mat, normalized)
    return np.stack([gray, gray, gray], axis=-1)
137
+
138
+
139
def check_channel(channel):
    """Check whether a channel carries any signal.

    :param channel: 2D numeric array.
    :return: 1 when the uint8-scaled channel has variance >= 0.02
        (i.e. it is not essentially flat), 0 otherwise.
    """
    scaled = make_8UC(normalize(channel))
    return 0 if np.var(scaled) < 0.02 else 1
146
+
147
+
148
def fill_holes(img_bw):
    """Fill holes in input 0/255 matrix; equivalent of MATLAB's imfill(BW, 'holes').

    :param img_bw: 2D uint8 array, foreground == 255 and background == 0.
    :return: 0/255 array of the same shape with enclosed background regions filled.
    """
    height, width = img_bw.shape

    # Needs to be 2 pixels larger than image sent to cv2.floodFill
    # (floodFill requires its mask to exceed the flooded image by 2 px per side;
    # the flooded image below is itself padded, hence height + 4).
    mask = np.zeros((height + 4, width + 4), np.uint8)

    # Add one pixel of padding all around so that objects touching border aren't filled against border
    img_bw_copy = np.zeros((height + 2, width + 2), np.uint8)
    img_bw_copy[1:(height + 1), 1:(width + 1)] = img_bw
    # Flood the outer background from corner (0, 0), which the padding guarantees
    # to be background; after this, any pixel still 0 is an enclosed hole.
    cv2.floodFill(img_bw_copy, mask, (0, 0), 255)
    # OR the inverted flood result back in: the unflooded zeros (holes) become 255.
    img_bw = img_bw | (255 - img_bw_copy[1:(height + 1), 1:(width + 1)])
    return img_bw
161
+
162
+
163
def otsu_thresholding(img, thresh=None, plot_image=False, fill_hole=False):
    """Do image thresholding.

    Args:
        img: A uint8 matrix for thresholding.
        thresh: If provided, do binary thresholding using this threshold.
            If None, do default Otsu thresholding.
        plot_image: Set True if want to real-time display results. Default is False.
        fill_hole: Set True if want to fill holes in the generated mask. Default is False.

    Returns:
        A 0/255 uint8 mask matrix same size as img; object: 255; background: 0.
    """
    if thresh is None:
        # Perform Otsu thresholding
        thresh, mask = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    else:
        # Manually set threshold
        thresh, mask = cv2.threshold(img, thresh, 255, cv2.THRESH_BINARY)

    # remove_small_objects() interprets a non-boolean integer array as a
    # *labeled* image, so the raw 0/255 mask was treated as one single label
    # ("255") — tiny isolated specks were never removed per connected
    # component, and skimage emitted a warning. Convert to bool for the
    # cleanup, then back to the 0/255 uint8 convention used elsewhere.
    cleaned = skimage.morphology.remove_small_objects(mask.astype(bool), 2)
    mask = np.where(cleaned, 255, 0).astype(np.uint8)

    # Fill holes
    if fill_hole:
        mask = fill_holes(mask)

    if plot_image:
        plt.figure()
        plt.imshow(img, cmap="gray")
        plt.title("Original")
        plt.figure()
        plt.imshow(mask)
        plt.title("After Thresholding")
        plt.colorbar()
        plt.show()

    return mask
199
+
200
+
201
def watershed(mask, img, plot_image=False, kernel_size=2):
    """Do watershed segmentation for input mask and image.

    Args:
        mask: A 0/255 matrix with 255 indicating objects.
        img: An 8UC3 matrix for watershed segmentation.
        plot_image: Set True if want to real-time display results. Default is False.
        kernel_size: Kernel size for inner marker erosion. Default is 2.

    Returns:
        A mask same size as input image, with -1 indicating boundary, 1 indicating background,
        and numbers>1 indicating objects.
    """
    img_copy = img.copy()
    mask_copy = np.array(mask.copy(), dtype=np.uint8)

    # Sure foreground area (inner marker): close twice to fuse nearby fragments,
    # then erode so the marker sits safely inside each object.
    mask_closed = closing(np.array(mask_copy, dtype=np.uint8))
    mask_closed = closing(np.array(mask_closed, dtype=np.uint8))
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    sure_fg = cv2.erode(mask_closed, kernel, iterations=2)
    sure_fg = skimage.morphology.closing(np.array(sure_fg, dtype=np.uint8))

    # Sure background area (outer marker): the skeleton of the non-foreground
    # region is guaranteed background; everything except that skeleton is
    # kept as potential object territory.
    sure_fg_bool = 1 - img_as_bool(sure_fg)
    sure_bg = np.uint8(1 - morphology.skeletonize(sure_fg_bool))

    # Unknown region (the region other than inner or outer marker)
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Marker for cv2.watershed: each connected foreground component gets a
    # distinct positive label; background becomes 1 and the unknown band 0,
    # which is the labeling convention cv2.watershed expects.
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1  # Set background to 1
    markers[unknown == 1] = 0

    # Watershed
    # TODO(shidan.wang@utsouthwestern.edu): Replace cv2.watershed with skimage.morphology.watershed
    marker = cv2.watershed(img_copy, markers.copy())

    if plot_image:
        plt.figure()
        plt.imshow(sure_fg)
        plt.title("Inner Marker")
        plt.figure()
        plt.imshow(sure_bg)
        plt.title("Outer Marker")
        plt.figure()
        plt.imshow(unknown)
        plt.title("Unknown")
        plt.figure()
        plt.imshow(markers, cmap='jet')
        plt.title("Markers")
        plt.figure()
        plt.imshow(marker, cmap='jet')
        plt.title("Mask")
        plt.figure()
        plt.imshow(img)
        plt.title("Original Image")
        plt.figure()
        # cv2.watershed labels boundaries -1; paint them green on a copy.
        img_copy[marker == -1] = [0, 255, 0]
        plt.imshow(img_copy)
        plt.title("Marked Image")
        plt.show()

    return marker
267
+
268
+
269
def generate_mask(channel, original_img=None, overlap_color=(0, 1, 0),
                  plot_process=False, plot_result=False, title="",
                  fill_hole=False, thresh=None,
                  use_watershed=True, watershed_kernel_size=2,
                  save_img=False, save_path=None):
    """Generate mask for a gray-value image.

    Args:
        channel: Channel returned by function 'channel_deconvolution'. A gray-value image is also accepted.
        original_img: A image used for plotting overlapped segmentation result, optional.
        overlap_color: A 3-value tuple setting the color used to mark segmentation boundaries on original
            image. Default is green (0, 1, 0).
        plot_process: Set True if want to display the whole mask generation process. Default is False.
        plot_result: Set True if want to display the final result. Default is False.
        title: The title used for plot_result, optional.
        fill_hole: Set True if want to fill mask holes. Default is False.
        thresh: Provide this value to do binary thresholding instead of default otsu thresholding.
        use_watershed: Set False if want to skip the watershed segmentation step. Default is True.
        watershed_kernel_size: Kernel size of inner marker erosion. Default is 2.
        save_img: Set True if want to save the mask image. Default is False.
        save_path: The path to save the mask image, optional. Prefer *.png or *.pdf.

    Returns:
        A binary mask with 1 indicating an object and 0 indicating background.

    Raises:
        IOError: An error occured writing image to save_path.
    """
    if not check_channel(channel):
        # If there is not any signal
        print("No signals detected for this channel")
        return np.zeros(channel.shape)
    else:
        channel = normalize(channel)
        if use_watershed:
            mask_threshold = otsu_thresholding(make_8UC(channel),
                                               plot_image=plot_process, fill_hole=fill_hole, thresh=thresh)
            marker = watershed(mask_threshold, make_8UC3(channel),
                               plot_image=plot_process, kernel_size=watershed_kernel_size)
            # create mask: watershed labels background 1, so invert that region
            mask = np.zeros(marker.shape)
            mask[marker == 1] = 1
            mask = 1 - mask
            # Set boundary as mask from Otsu_thresholding, since cv2.watershed automatically set boundary as -1
            mask[0, :] = mask_threshold[0, :] == 255
            mask[-1, :] = mask_threshold[-1, :] == 255
            mask[:, 0] = mask_threshold[:, 0] == 255
            mask[:, -1] = mask_threshold[:, -1] == 255
        else:
            mask = otsu_thresholding(make_8UC(channel),
                                     plot_image=plot_process, fill_hole=fill_hole, thresh=thresh)

        if plot_result or save_img:
            if original_img is None:
                # If original image is not provided, plot mask only
                plt.figure()
                plt.imshow(mask, cmap="gray")
            else:
                # If original image is provided, overlay the mask boundaries
                overlapped_img = segmentation.mark_boundaries(original_img, skimage.measure.label(mask),
                                                              overlap_color, mode="thick")
                # 'box-forced' was removed from matplotlib (>=3.x); 'box' is the
                # supported equivalent.
                fig, axes = plt.subplots(1, 2, figsize=(15, 15), sharex=True, sharey=True,
                                         subplot_kw={'adjustable': 'box'})
                ax = axes.ravel()
                ax[0].imshow(mask, cmap="gray")
                ax[0].set_title(str(title) + " Mask")
                ax[1].imshow(overlapped_img)
                ax[1].set_title("Overlapped with Original Image")
            if save_img:
                try:
                    plt.savefig(save_path)
                except Exception as err:
                    # Narrowed from a bare `except:` (which also swallowed
                    # KeyboardInterrupt/SystemExit) and chain the cause.
                    raise IOError("Error saving image to {}".format(save_path)) from err
            if plot_result:
                plt.show()
            plt.close()
        return mask
346
+
347
+
348
def get_mask_for_slide_image(filePath, display_progress=False):
    """Generate a tissue mask for a whole-slide image.

    :param filePath: path to the slide file readable by openslide.
    :param display_progress: set True to show intermediate images.
    :return: (mask, slide_image) — a 0/255 uint8 tissue mask and the RGBA
        numpy array of the lowest-resolution slide level the mask was built on.

    NOTE(review): `open_slide` comes from openslide, whose import is commented
    out at the top of this module — confirm openslide is importable before
    calling this function.
    """
    slide = open_slide(filePath)

    # Use the lowest resolution
    level_dims = slide.level_dimensions
    level_to_analyze = len(level_dims) - 1
    dims_of_selected = level_dims[-1]

    if display_progress:
        print('Selected image of size (' + str(dims_of_selected[0]) + ', ' + str(dims_of_selected[1]) + ')')
    slide_image = slide.read_region((0, 0), level_to_analyze, dims_of_selected)
    slide_image = np.array(slide_image)
    if display_progress:
        plt.figure()
        plt.imshow(slide_image)

    # Perform Otsu thresholding
    # Only the blue channel is thresholded; the R/G variants below were tried
    # and left commented out.
    # threshR, maskR = cv2.threshold(slide_image[:, :, 0], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # threshG, maskG = cv2.threshold(slide_image[:, :, 1], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    threshB, maskB = cv2.threshold(slide_image[:, :, 2], 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Add the channels together
    # mask = ((255-maskR) | (255-maskG) | (255-maskB))
    # Invert: tissue is darker than the bright background, so it falls below
    # the Otsu threshold.
    mask = 255 - maskB
    if display_progress:
        plt.figure()
        plt.imshow(mask)

    # Delete small objects
    # min_pixel_count = 0.005 * dims_of_selected[0] * dims_of_selected[1]
    # mask = np.array(skimage.morphology.remove_small_objects(np.array(mask/255, dtype=bool), min_pixel_count),
    # dtype=np.uint8)
    # if display_progress:
    # print("Min pixel count: {}".format(min_pixel_count))
    # plt.figure()
    # plt.imshow(mask)
    # plt.show()

    # Dilate the image
    # dilate-erode-dilate: closes small gaps, then leaves the mask one
    # dilation larger than the thresholded tissue.
    kernel = np.ones((3, 3), np.uint8)
    mask = cv2.dilate(mask, kernel, iterations=1)
    mask = cv2.erode(mask, kernel, iterations=1)
    mask = cv2.dilate(mask, kernel, iterations=1)

    # Fill holes
    mask = fill_holes(mask)
    if display_progress:
        plt.figure()
        plt.imshow(mask)
        plt.show()

    return mask, slide_image
401
+
402
+
403
+ ##################################################################
404
+ # Functions for extracting patches from slide image
405
+ ##################################################################
406
+
407
def extract_patch_by_location(filepath, location, patch_size=(500, 500),
                              plot_image=False, level_to_analyze=0, save=False, savepath='.'):
    """Read one patch from a whole-slide image at a given location.

    Args:
        filepath: path to the .svs slide file.
        location: (x, y) top-left corner of the patch at level 0.
        patch_size: (width, height) of the patch. Default (500, 500).
        plot_image: set True to display the extracted patch.
        level_to_analyze: openslide pyramid level to read from. Default 0.
        save: set True to save the patch as a PNG under savepath.
        savepath: directory in which to save the patch.

    Returns:
        The extracted patch (PIL image as returned by slide.read_region).

    Raises:
        IOError: if filepath does not exist.
    """
    if not os.path.isfile(filepath):
        # The original had an unreachable `return []` after this raise; removed.
        raise IOError("Image not found!")

    slide = open_slide(filepath)
    slide_image = slide.read_region(location, level_to_analyze, patch_size)
    if plot_image:
        plt.figure()
        plt.imshow(slide_image)
        plt.show()

    if save:
        # Raw string avoids the invalid "\." escape-sequence warning.
        filename = re.search(r"(?<=/)[^/]+\.svs", filepath).group(0)[0:-4]
        savename = os.path.join(savepath, str(filename) + '_' + str(location[0]) + '_' + str(location[1]) + '.png')
        # scipy.misc.imsave was removed in SciPy 1.2; use skimage.io.imsave
        # (the `io` module is already imported from skimage at the top).
        io.imsave(savename, np.asarray(slide_image))
        print("Writed to " + savename)
    return slide_image
426
+
427
+
428
def extract_patch_by_tissue_area(filePath, nPatch=0, patchSize=500, maxPatch=10,
                                 filename=None, savePath=None, displayProgress=False, desiredLevel=0, random=False):
    '''Input: slide
    Output: image patches

    Extracts up to maxPatch tissue patches from a whole-slide image, either
    by walking the tissue-mask contours row by row (random=False) or by
    sampling random mask pixels (random=True). Patches are written as PNGs
    under savePath; nothing is returned.

    NOTE(review): the `random` parameter shadows the imported `random`
    module inside this function (the module is not used here, but callers
    should be aware).
    NOTE(review): `_, contours, _ = cv2.findContours(...)` matches the
    OpenCV 3.x API; OpenCV 4.x returns a 2-tuple — confirm the installed
    version.
    '''
    if filename is None:
        filename = re.search("(?<=/)[0-9]+\.svs", filePath).group(0)
    if savePath is None:
        # Hard-coded default output directory from the original project.
        savePath = '/home/swan15/python/brainTumor/sample_patches/'
    bwMask, slideImageCV = get_mask_for_slide_image(filePath, display_progress=displayProgress)
    slide = open_slide(filePath)
    levelDims = slide.level_dimensions
    # find magnitude: scale factor from the mask's (low-res) level to level 0
    for i in range(0, len(levelDims)):
        if bwMask.shape[0] == levelDims[i][1]:
            magnitude = levelDims[0][1] / levelDims[i][1]
            break

    if not random:
        nCol = int(math.ceil(levelDims[0][1] / patchSize))
        nRow = int(math.ceil(levelDims[0][0] / patchSize))
        # get contour of the tissue mask
        _, contours, _ = cv2.findContours(bwMask, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
        for nContours in range(0, len(contours)):
            print(nContours)
            # i is the y axis in the image
            for i in range(0, nRow):
                # Row band of this patch row, expressed in mask coordinates
                minRow = i * patchSize / magnitude
                maxRow = (i + 1) * patchSize / magnitude
                # Contour points that fall inside this row band
                matches = [x for x in range(0, len(contours[nContours][:, 0, 0]))
                           if (contours[nContours][x, 0, 1] > minRow and contours[nContours][x, 0, 1] < maxRow)]
                try:
                    # min()/max() raise ValueError when `matches` is empty,
                    # which the except below uses to skip empty bands.
                    print([min(contours[nContours][matches, 0, 0]), max(contours[nContours][matches, 0, 0])])

                    # save image: sweep the columns spanned by the contour in
                    # this band, reading level-0 patches
                    minCol = min(contours[nContours][matches, 0, 0]) * magnitude
                    maxCol = max(contours[nContours][matches, 0, 0]) * magnitude
                    minColInt = int(math.floor(minCol / patchSize))
                    maxColInt = int(math.ceil(maxCol / patchSize))

                    for j in range(minColInt, maxColInt):
                        startCol = j * patchSize
                        startRow = i * patchSize
                        patch = slide.read_region((startCol, startRow), desiredLevel, (patchSize, patchSize))
                        patchCV = np.array(patch)
                        patchCV = patchCV[:, :, 0:3]  # drop the alpha channel

                        fname = os.path.join(savePath, filename + '_' + str(i) + '_' + str(j) + '.png')

                        # Skip patches that were already written on a previous run
                        if not os.path.isfile(fname):
                            misc.imsave(fname, patchCV)
                            nPatch = nPatch + 1
                            print(nPatch)

                        if nPatch >= maxPatch:
                            break
                except ValueError:
                    continue
                if nPatch >= maxPatch:
                    break
            if nPatch >= maxPatch:
                break
    else:
        # randomly pick up image: sample a mask pixel, center a patch on it
        for i in range(nPatch, maxPatch):
            coords = np.transpose(np.nonzero(bwMask >= 1))
            y, x = coords[np.random.randint(0, len(coords) - 1)]
            x = int(x * magnitude) - int(patchSize / 2)
            y = int(y * magnitude) - int(patchSize / 2)

            image = np.array(slide.read_region((x, y), desiredLevel, (patchSize, patchSize)))[..., 0:3]

            fname = os.path.join(savePath, filename + '_' + str(i) + '.png')

            if not os.path.isfile(fname):
                misc.imsave(fname, image)
                print(i)
504
+
505
+
506
def parseXML(xmlFile, pattern):
    """
    Parse XML File and returns an object containing all the vertices
    Verticies: (dict)
    pattern: (list) of dicts, each with 'X' and 'Y' key
        [{ 'X': [1,2,3],
           'Y': [1,2,3] }]
    """
    root = ET.parse(xmlFile).getroot()  # Tree representation of the XML file

    vertices = {pattern: []}  # All matching regions, keyed by the pattern

    # Keep only the regions whose 'Text' label matches the requested pattern
    for region in root.iter('Region'):
        if region.get('Text') != pattern:
            continue
        xs, ys = [], []
        for vertex in region.iter('Vertex'):
            xs.append(float(vertex.get('X')))
            ys.append(float(vertex.get('Y')))
        vertices[pattern].append({'X': xs, 'Y': ys})

    return vertices
534
+
535
+
536
def calculateRatio(levelDims):
    """ Calculates the ratio between the highest resolution image and lowest resolution image.
    Returns the ratio as a tuple (Xratio, Yratio).
    """
    ratios = np.asarray(levelDims[0]) / np.asarray(levelDims[-1])
    return (ratios[0], ratios[1])
544
+
545
+
546
def createMask(levelDims, vertices, pattern):
    """
    Input: levelDims (nested list): dimensions of each layer of the slide.
           vertices (dict object as describe above)
    Output: (tuple) mask
            numpy nd array of 0/1, where 1 indicates inside the region
            and 0 is outside the region
    """
    # Down scale the XML region to create a low reso image mask, and then
    # rescale the image to retain reso of image mask to save memory and time
    Xratio, Yratio = calculateRatio(levelDims)

    # NOTE(review): openslide level_dimensions are (width, height); here the
    # first element is used as rows and the annotation X coordinate is used as
    # the row index of polygon(). This is internally consistent with
    # chooseRandPixel/getPatches downstream, but confirm the axis convention
    # before reusing this function elsewhere.
    nRows, nCols = levelDims[-1]
    mask = np.zeros((nRows, nCols), dtype=np.uint8)

    # Rasterize each annotated polygon at the low resolution and mark its
    # interior pixels with 1 (overlapping polygons simply overwrite with 1).
    for i in range(len(vertices[pattern])):
        lowX = np.array(vertices[pattern][i]['X']) / Xratio
        lowY = np.array(vertices[pattern][i]['Y']) / Yratio
        rr, cc = polygon(lowX, lowY, (nRows, nCols))
        mask[rr, cc] = 1

    return mask
568
+
569
+
570
def getMask(xmlFile, svsFile, pattern):
    """ Parses XML File to get mask vertices and returns matrix masks
    where 1 indicates the pixel is inside the mask, and 0 indicates outside the mask.

    @param: {string} xmlFile: name of xml file that contains annotation vertices outlining the mask.
    @param: {string} svsFile: name of svs file that contains the slide image.
    @param: {pattern} string: name of the xml labeling
    Returns: slide - openslide slide Object
             mask - matrix mask of pattern
             (both are 0 when the pattern has no annotated regions)
    """
    # Parse XML to get vertices of mask
    vertices = parseXML(xmlFile, pattern)

    # No regions labeled with this pattern: signal the caller with (0, 0)
    if len(vertices[pattern]) == 0:
        return 0, 0

    slide = open_slide(svsFile)
    mask = createMask(slide.level_dimensions, vertices, pattern)
    return slide, mask
592
+
593
+
594
def plotMask(mask):
    """Display *mask* in a single tall figure."""
    _, axis = plt.subplots(nrows=1, figsize=(6, 10))
    axis.imshow(mask)
    plt.show()
598
+
599
+
600
def chooseRandPixel(mask):
    """ Returns [x,y] numpy array of a random nonzero pixel of *mask*.

    NOTE: the returned [x, y] correspond to [row, col] in the mask.

    @param {numpy matrix} mask from which to choose random pixel.
    Example usage (inside a patch-sampler class):
        self.slide, mask = getMask(xml_file, slide_file, pattern)
        self.mask = cv2.erode(mask, np.ones((50, 50)))
        x, y = chooseRandPixel(self.mask)
        patch = self.slide.read_region((int(x * zoom), int(y * zoom)), 0, (W, H))
    """
    nonzero_coords = np.transpose(np.nonzero(mask))  # (row, col) of every nonzero pixel
    pick = random.randint(0, len(nonzero_coords) - 1)  # uniform random index
    return nonzero_coords[pick]
622
+
623
+
624
def plotImage(image):
    """Show *image* in the current matplotlib figure."""
    plt.imshow(image)
    plt.show()
627
+
628
+
629
def checkWhiteSlide(image):
    """Return True when the patch is essentially blank (mean RGB >= 230).

    :param image: PIL image (any mode convertible to RGB).
    """
    rgb = np.array(image.convert(mode='RGB'))
    return np.mean(np.ravel(rgb)) >= 230
634
+
635
+
636
+ # extractPatchByXMLLabeling
637
def getPatches(slide, mask, numPatches=0, dims=(0, 0), dirPath='', slideNum='', plot=False, plotMask=False):
    """ Generates and saves 'numPatches' patches with dimension 'dims' from image 'slide' contained within 'mask'.
    @param {Openslide Slide obj} slide: image object
    @param {numpy matrix} mask: where 0 is outside region of interest and 1 indicates within
    @param {int} numPatches
    @param {tuple} dims: (w,h) dimensions of patches
    @param {string} dirPath: directory in which to save patches
    @param {string} slideNum: slide number
    @param {bool} plot: show each extracted patch
    @param {bool} plotMask: zero out sampled areas in the mask and show it at the end
    Saves patches in directory specified by dirPath as [slideNum]_[patchNum]_[Xpixel]x[Ypixel].png

    NOTE(review): the `plotMask` parameter shadows the module-level plotMask()
    function inside this body. Also, every sampled patch is saved regardless
    of whether it was counted (white patches are saved but don't increment i)
    — confirm that is intentional.
    """
    w, h = dims
    levelDims = slide.level_dimensions
    # Scale between level-0 coordinates and the (low-res) mask coordinates
    Xratio, Yratio = calculateRatio(levelDims)

    i = 0
    while i < numPatches:
        firstLoop = True  # Boolean to ensure while loop runs at least once.

        # The all-pixels-in-mask re-sampling condition is disabled (commented),
        # so this inner loop currently runs exactly once per outer iteration.
        while firstLoop:  # or not mask[rr,cc].all(): # True if it is the first loop or if all pixels are in the mask
            firstLoop = False
            x, y = chooseRandPixel(mask)  # Get random top left pixel of patch.
            # Patch footprint in mask coordinates, used below to blank the mask
            xVertices = np.array([x, x + (w / Xratio), x + (w / Xratio), x, x])
            yVertices = np.array([y, y, y - (h / Yratio), y - (h / Yratio), y])
            rr, cc = polygon(xVertices, yVertices)

            image = slide.read_region((int(x * Xratio), int(y * Yratio)), 0, (w, h))

            # Only non-white (tissue) patches count toward numPatches
            isWhite = checkWhiteSlide(image)
            # newPath = 'other' if isWhite else dirPath
            if not isWhite: i += 1

            slideName = '_'.join([slideNum, 'x'.join([str(x * Xratio), str(y * Yratio)])])
            image.save(os.path.join(dirPath, slideName + ".png"))

            if plot:
                plotImage(image)
            if plotMask: mask[rr, cc] = 0

    if plotMask:
        plotImage(mask)
677
+
678
+
679
+ '''Example codes for getting patches from labeled svs files:
680
+ #define the patterns
681
+ patterns = ['small_acinar',
682
+ 'large_acinar',
683
+ 'tubular',
684
+ 'trabecular',
685
+ 'aveolar',
686
+ 'solid',
687
+ 'pseudopapillary',
688
+ 'rhabdoid',
689
+ 'sarcomatoid',
690
+ 'necrosis',
691
+ 'normal',
692
+ 'other']
693
+ #create folders
694
+ for pattern in patterns:
695
+ if not os.path.exists(pattern):
696
+ os.makedirs(pattern)
697
+ #define parameters
698
+ patchSize = 500
699
+ numPatches = 50
700
+ dirName = '/home/swan15/kidney/ccRCC/slides'
701
+ annotatedSlides = 'slide_region_of_interests.txt'
702
+
703
+ f = open(annotatedSlides, 'r+')
704
+ slides = [re.search('.*(?=\.svs)', line).group(0) for line in f
705
+ if re.search('.*(?=\.svs)', line) is not None]
706
+ print(slides)
707
+ f.close()
708
+ for slideID in slides:
709
+ print('Start '+slideID)
710
+ try:
711
+ xmlFile = slideID+'.xml'
712
+ svsFile = slideID+'.svs'
713
+
714
+ xmlFile = os.path.join(dirName, xmlFile)
715
+ svsFile = os.path.join(dirName, svsFile)
716
+
717
+ if not os.path.isfile(xmlFile):
718
+ print(xmlFile+' not exist')
719
+ continue
720
+
721
+ for pattern in patterns:
722
+
723
+ numPatchesGenerated = len([files for files in os.listdir(pattern)
724
+ if re.search(slideID+'_.+\.png', files) is not None])
725
+ if numPatchesGenerated >= numPatches:
726
+ print(pattern+' existed')
727
+ continue
728
+ else:
729
+ numPatchesTemp = numPatches - numPatchesGenerated
730
+
731
+ slide, mask = getMask(xmlFile, svsFile, pattern)
732
+
733
+ if not slide:
734
+ #print(pattern+' not detected.')
735
+ continue
736
+
737
+ getPatches(slide, mask, numPatches = numPatchesTemp, dims = (patchSize, patchSize),
738
+ dirPath = pattern+'/', slideNum = slideID, plotMask = False) # Get Patches
739
+ print(pattern+' done.')
740
+
741
+ print('Done with ' + slideID)
742
+ print('----------------------')
743
+
744
+ except:
745
+ print('Error with ' + slideID)
746
+ '''
747
+
748
+
749
+ ##################################################################
750
+ # RGB color processing
751
+ ##################################################################
752
+
753
+ # convert RGBA image to RGB (specifically designed for masks)
754
# convert RGBA image to RGB (specifically designed for masks)
def convert_RGBA(RGBA_img):
    """Map fully-transparent pixels to white and fully-opaque pixels to
    their RGB values; pixels with intermediate alpha stay zero.
    Non-RGBA inputs are returned unchanged (with a message)."""
    if np.shape(RGBA_img)[2] != 4:
        print("Not an RGBA image")
        return RGBA_img
    height, width = np.shape(RGBA_img)[0], np.shape(RGBA_img)[1]
    RGB_img = np.zeros((height, width, 3))
    alpha = RGBA_img[:, :, 3]
    RGB_img[alpha == 0] = [255, 255, 255]
    RGB_img[alpha == 255] = RGBA_img[alpha == 255, 0:3]
    return RGB_img
763
+
764
+
765
+ # Convert RGB mask to one-channel mask
766
# Convert RGB mask to one-channel mask
def RGB_to_index(RGB_img, RGB_markers=None, RGB_labels=None):
    """Change RGB to 2D index matrix; each RGB color corresponds to one index.

    Args:
        RGB_img: h*w*3 image; returned unchanged (with a message) if not RGB.
        RGB_markers: start from background (marked as 0);
            Example format:
            [[255, 255, 255],
             [160, 255, 0]]
        RGB_labels: a numeric vector corresponding to the labels of RGB_markers;
            length should be the same as RGB_markers.

    Returns:
        h*w float array where each pixel holds the label of its matching marker
        color (pixels matching no marker stay 0).
    """
    if np.shape(RGB_img)[2] != 3:
        print("Not an RGB image")
        return RGB_img
    else:
        # `== None` is an elementwise comparison when callers pass numpy
        # arrays (it raised/warned); identity is the correct None check.
        if RGB_markers is None:
            RGB_markers = [[255, 255, 255]]
        if RGB_labels is None:
            RGB_labels = range(np.shape(RGB_markers)[0])
        mask_index = np.zeros((np.shape(RGB_img)[0], np.shape(RGB_img)[1]))
        for i, RGB_label in enumerate(RGB_labels):
            # Pixel matches marker i only when all three channels agree
            mask_index[np.all(RGB_img == RGB_markers[i], axis=2)] = RGB_label
        return mask_index
789
+
790
+
791
+ def index_to_RGB(mask_index, RGB_markers=None):
792
+ """Change index to 2D image; each index corresponds to one color"""
793
+ mask_index_copy = mask_index.copy()
794
+ mask_index_copy = np.squeeze(mask_index_copy) # In case the mask shape is not [height, width]
795
+ if RGB_markers == None:
796
+ print("RGB_markers not provided!")
797
+ RGB_markers = [[255, 255, 255]]
798
+ RGB_img = np.zeros((np.shape(mask_index_copy)[0], np.shape(mask_index_copy)[1], 3), dtype=np.uint8)
799
+ RGB_img[:, :] = RGB_markers[0] # Background
800
+ for i in range(np.shape(RGB_markers)[0]):
801
+ RGB_img[mask_index_copy == i] = RGB_markers[i]
802
+ return RGB_img
803
+
804
+
805
def shift_HSV(img, amount=(0.9, 0.9, 0.9)):
    """Scale the Hue, Saturation, and Value channels of RGB image *img*.

    :param img: h*w*3 uint8 RGB array.
    :param amount: per-channel (H, S, V) multiplicative factors.
    :return: h*w*3 uint8 RGB array after the HSV adjustment.
    """
    # Round-trip through PIL's HSV mode to do the per-channel scaling
    hsv = np.array(Image.fromarray(img, 'RGB').convert('HSV'))
    for channel, factor in enumerate(amount):
        hsv[..., channel] = np.clip(hsv[..., channel] * factor, a_max=255, a_min=0)
    return np.array(Image.fromarray(hsv, 'HSV').convert('RGB'))
815
+
cytof/utils.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle as pkl
3
+ import skimage
4
+ import matplotlib.pyplot as plt
5
+ from matplotlib.patches import Rectangle
6
+ import seaborn as sns
7
+ import numpy as np
8
+ import pandas as pd
9
+ from sklearn.mixture import GaussianMixture
10
+ import scipy
11
+ from typing import Union, Optional, Type, Tuple, List, Dict
12
+ import itertools
13
+ from multiprocessing import Pool
14
+ from tqdm import tqdm
15
+ from readimc import MCDFile, TXTFile
16
+ import warnings
17
+
18
+
19
+
20
def load_CytofImage(savename):
    """Load a pickled CytofImage object from disk.

    Args:
        savename: path to the pickle file.

    Returns:
        The unpickled object.
    """
    # `with` guarantees the file handle is closed (the original
    # `pkl.load(open(...))` leaked it)
    with open(savename, "rb") as f:
        cytof_img = pkl.load(f)
    return cytof_img
23
+
24
def load_CytofCohort(savename):
    """Load a pickled CytofCohort object from disk.

    Args:
        savename: path to the pickle file.

    Returns:
        The unpickled object.
    """
    # `with` guarantees the file handle is closed (the original
    # `pkl.load(open(...))` leaked it)
    with open(savename, "rb") as f:
        cytof_cohort = pkl.load(f)
    return cytof_cohort
27
+
28
+
29
def process_mcd(filename: str,
                params: Dict):
    """Process a whole-slide .mcd file into a CytofCohort.

    Iterates over every slide/ROI acquisition in the MCD file, wraps each
    readable ROI in a CytofImageTiff, optionally removes channels, and —
    when a nuclei channel mapping is provided — runs segmentation and
    feature extraction per ROI.

    Args:
        filename: path to the .mcd file.
        params: options dict with the keys:
            quality_control_thres: currently unused (the QC call below is
                commented out).
            channels_remove: channel names to drop (e.g. known-corrupted).
            channels_dict: maps channel names to special roles (e.g.
                'nuclei'); required for segmentation/feature extraction.
            use_membrane: whether segmentation uses a membrane channel.
            cell_radius: cell radius used for segmentation (default 5).
            normalize_qs: quantile(s) for feature normalization (default 75).

    Returns:
        (corrupted, cytof_cohort): list of "slide-roi" ids skipped because
        their acquisition data is empty, and the assembled CytofCohort.
    """
    # local import avoids a circular dependency with the classes module
    from classes import CytofImageTiff, CytofCohort
    quality_control_thres = params.get("quality_control_thres", None)
    channels_remove = params.get("channels_remove", None)
    channels_dict = params.get("channels_dict", None)
    use_membrane = params.get("use_membrane", False)
    cell_radius = params.get("cell_radius", 5)
    normalize_qs = params.get("normalize_qs", 75)

    df_cohort = pd.DataFrame(columns = ['Slide', 'ROI', 'input file'])
    cytof_images = {}   # "slide-roi" id -> CytofImageTiff
    corrupted = []      # "slide-roi" ids with no acquisition data
    with MCDFile(filename) as f:
        for slide in f.slides:
            sid = f"{slide.description}{slide.id}"
            print(sid)
            for roi in slide.acquisitions:
                rid = roi.description
                print(f'processing slide_id-roi: {sid}-{rid}')

                # an acquisition with no byte range is corrupted/empty
                if roi.metadata["DataStartOffset"] < roi.metadata["DataEndOffset"]:
                    img_roi = f.read_acquisition(roi)  # array, shape: (c, y, x), dtype: float3
                    # reorder to channels-last: (y, x, c)
                    img_roi = np.transpose(img_roi, (1, 2, 0))
                    cytof_img = CytofImageTiff(slide=sid, roi = rid, image=img_roi, filename=f"{sid}-{rid}")

                    # cytof_img.quality_control(thres=quality_control_thres)
                    # channel label "marker(metal)" combines target and isotope
                    channels = [f"{mk}({cn})" for (mk, cn) in zip(roi.channel_labels, roi.channel_names)]
                    cytof_img.set_markers(markers=roi.channel_labels, labels=roi.channel_names, channels=channels)  # targets, metals

                    # known corrupted channels, e.g. nan-nan1
                    if channels_remove is not None and len(channels_remove) > 0:
                        cytof_img.remove_special_channels(channels_remove)

                    # maps channel names to nuclei/membrane
                    if channels_dict is not None:
                        # remove nuclei channel for segmentation
                        channels_rm = cytof_img.define_special_channels(channels_dict, rm_key='nuclei')
                        cytof_img.remove_special_channels(channels_rm)
                        cytof_img.get_seg(radius=cell_radius, use_membrane=use_membrane)
                        cytof_img.extract_features(cytof_img.filename)
                        cytof_img.feature_quantile_normalization(qs=normalize_qs)

                    df_cohort = pd.concat([df_cohort, pd.DataFrame.from_dict([{'Slide': sid,
                                                                              'ROI': rid,
                                                                              'input file': filename}])])
                    cytof_images[f"{sid}-{rid}"] = cytof_img
                else:
                    corrupted.append(f"{sid}-{rid}")
    print(f"This cohort now contains {len(cytof_images)} ROIs, after excluding {len(corrupted)} corrupted ones from the original MCD.")

    cytof_cohort = CytofCohort(cytof_images=cytof_images, df_cohort=df_cohort)
    if channels_dict is not None:
        cytof_cohort.batch_process_feature()
    else:
        warnings.warn("Feature extraction is not done as no nuclei channels defined by 'channels_dict'!")
    return corrupted, cytof_cohort
93
+
94
+
95
def save_multi_channel_img(img, savename):
    """Save a (possibly multi-channel) image array to disk.

    Thin wrapper around ``skimage.io.imsave``; the output format is
    inferred from the ``savename`` extension.

    Args:
        img: image array to write.
        savename: destination file path.
    """
    skimage.io.imsave(savename, img)
100
+
101
+
102
def generate_color_dict(names: List,
                        sort_names: bool = True,
                        ):
    """Randomly generate a dictionary of colors based on provided "names".

    Colors are taken from matplotlib's 'tab20' palette.

    Args:
        names: color legend names. NOTE: when ``sort_names`` is True the
            list is sorted **in place** (kept for backward compatibility).
        sort_names: sort the names before assigning colors (default True).

    Returns:
        dict mapping each name to an RGB tuple.
    """
    if sort_names:
        names.sort()

    palette = plt.cm.get_cmap('tab20').colors  # hoisted: fetch the palette once, not per name
    # wrap with modulo so more than len(palette)=20 names no longer raises IndexError;
    # colors repeat past 20 entries
    color_dict = dict((n, palette[i % len(palette)]) for (i, n) in enumerate(names))
    return color_dict
113
+
114
def show_color_table(color_dict: dict, # = None,
                     # names: List = ['1'],
                     title: str = "",
                     maxcols: int = 4,
                     emptycols: int = 0,
                     # sort_names: bool = True,
                     dpi: int = 72,
                     cell_width: int = 212,
                     cell_height: int = 22,
                     swatch_width: int = 48,
                     margin: int = 12,
                     topmargin: int = 40,
                     show: bool = True
                     ):
    """Render a color legend table for a color dictionary.

    One swatch + label per entry of ``color_dict``, laid out column-major.
    reference: https://matplotlib.org/stable/gallery/color/named_colors.html

    Args:
        color_dict: key: color legend name - value: RGB representation of color.
        title: title for the color table (default="").
        maxcols: maximum number of columns in visualization.
        emptycols: number of empty columns for a maxcols-column figure,
            i.e. maxcols=4 and emptycols=3 means a single-column plot (default=0).
        dpi / cell_width / cell_height / swatch_width / margin / topmargin:
            pixel-level layout parameters.
        show: NOTE(review): accepted but never used in this body — confirm
            whether a plt.show() call was intended.
    """
    names = color_dict.keys()

    n = len(names)
    ncols = maxcols - emptycols
    # ceil(n / ncols) rows
    nrows = n // ncols + int(n % ncols > 0)

    # figure size in pixels, converted to inches via dpi below
    # width = cell_width * 4 + 2 * margin
    width = cell_width * ncols + 2 * margin
    height = cell_height * nrows + margin + topmargin

    fig, ax = plt.subplots(figsize=(width / dpi, height / dpi), dpi=dpi)
    fig.subplots_adjust(margin / width, margin / height,
                        (width - margin) / width, (height - topmargin) / height)
    # ax.set_xlim(0, cell_width * 4)
    ax.set_xlim(0, cell_width * ncols)
    # inverted y-axis so the first entry is drawn at the top
    ax.set_ylim(cell_height * (nrows - 0.5), -cell_height / 2.)
    ax.yaxis.set_visible(False)
    ax.xaxis.set_visible(False)
    ax.set_axis_off()
    ax.set_title(title, fontsize=16, loc="left", pad=10)

    # NOTE(review): loop variable `n` shadows the count computed above;
    # harmless here since the count is not reused after this point
    for i, n in enumerate(names):
        # column-major placement: fill a column top-to-bottom, then move right
        row = i % nrows
        col = i // nrows
        y = row * cell_height

        swatch_start_x = cell_width * col
        text_pos_x = cell_width * col + swatch_width + 7

        ax.text(text_pos_x, y, n, fontsize=12,
                horizontalalignment='left',
                verticalalignment='center')

        ax.add_patch(
            Rectangle(xy=(swatch_start_x, y - 9), width=swatch_width,
                      height=18, facecolor=color_dict[n], edgecolor='0.7')
        )
187
+
188
+
189
+
190
def _extract_feature_one_nuclei(nuclei_id, nuclei_seg, cell_seg, filename, morphology, nuclei_morphology, cell_morphology,
                                channels, raw_image, sum_exp_nuclei, ave_exp_nuclei, sum_exp_cell, ave_exp_cell):
    """Collect morphology and per-channel expression features for one nucleus/cell pair.

    Returns an empty dict when either the nucleus mask or the cell mask
    for ``nuclei_id`` contains no labeled region.
    """
    nucleus_regions = skimage.measure.regionprops((nuclei_seg == nuclei_id) * 1)
    if not nucleus_regions:
        return {}
    this_nucleus = nucleus_regions[0]

    cell_regions = skimage.measure.regionprops((cell_seg == nuclei_id) * 1)
    if not cell_regions:
        return {}
    this_cell = cell_regions[0]

    # regionprops centroid order is (row, col) -> (y, x)
    centroid_y, centroid_x = this_nucleus.centroid
    res = {"filename": filename,
           "id": nuclei_id,
           "coordinate_x": centroid_x,
           "coordinate_y": centroid_y}

    # regionprops-backed morphology features; the last entry (pa_ratio:
    # perimeter^2 / filled area) is derived manually below
    for idx, feat in enumerate(morphology[:-1]):
        res[nuclei_morphology[idx]] = getattr(this_nucleus, feat)
        res[cell_morphology[idx]] = getattr(this_cell, feat)
    res[nuclei_morphology[-1]] = 1.0 * this_nucleus.perimeter ** 2 / this_nucleus.filled_area
    res[cell_morphology[-1]] = 1.0 * this_cell.perimeter ** 2 / this_cell.filled_area

    # per-channel expression: summed and averaged intensity over the
    # nucleus mask and over the whole-cell mask
    nucleus_mask = nuclei_seg == nuclei_id
    cell_mask = cell_seg == nuclei_id
    for ch in range(len(channels)):
        res[sum_exp_nuclei[ch]] = np.sum(raw_image[nucleus_mask, ch])
        res[ave_exp_nuclei[ch]] = np.average(raw_image[nucleus_mask, ch])
        res[sum_exp_cell[ch]] = np.sum(raw_image[cell_mask, ch])
        res[ave_exp_cell[ch]] = np.average(raw_image[cell_mask, ch])
    return res
224
+
225
+
226
def extract_feature(channels: List,
                    raw_image: np.ndarray,
                    nuclei_seg: np.ndarray,
                    cell_seg: np.ndarray,
                    filename: str,
                    use_parallel: bool = True,
                    show_sample: bool = False) -> pd.DataFrame:
    """ Extract nuclei and cell level feature from cytof image based on nuclei segmentation and cell segmentation
    results
    Inputs:
        channels     = channels to extract feature from (one per image channel)
        raw_image    = raw cytof image, channels-last
        nuclei_seg   = nuclei segmentation result (integer label mask)
        cell_seg     = cell segmentation result (integer label mask)
        filename     = filename of current cytof image
        use_parallel = extract per-nucleus features with a multiprocessing
                       Pool (True) or a sequential tqdm loop (False)
        show_sample  = print 5 sample rows of the result (default False)
    Returns:
        feature_summary_df = a dataframe containing summary of extracted features,
                             one row per nucleus/cell pair

    :param channels: list
    :param raw_image: numpy.ndarray
    :param nuclei_seg: numpy.ndarray
    :param cell_seg: numpy.ndarray
    :param filename: string
    :param use_parallel: bool
    :param show_sample: bool
    :return feature_summary_df: pandas.core.frame.DataFrame
    """
    # one channel name per image channel is required
    assert (len(channels) == raw_image.shape[-1])

    # morphology features to be extracted
    morphology = ["area", "convex_area", "eccentricity", "extent",
                  "filled_area", "major_axis_length", "minor_axis_length",
                  "orientation", "perimeter", "solidity", "pa_ratio"]

    ## morphology features
    nuclei_morphology = [_ + '_nuclei' for _ in morphology]  # morphology - nuclei level
    cell_morphology = [_ + '_cell' for _ in morphology]  # morphology - cell level

    ## single cell features
    # nuclei level
    sum_exp_nuclei = [_ + '_nuclei_sum' for _ in channels]  # sum expression over nuclei
    ave_exp_nuclei = [_ + '_nuclei_ave' for _ in channels]  # average expression over nuclei

    # cell level
    sum_exp_cell = [_ + '_cell_sum' for _ in channels]  # sum expression over cell
    ave_exp_cell = [_ + '_cell_ave' for _ in channels]  # average expression over cell

    # column names of final result dataframe
    # NOTE(review): the final dataframe is rebuilt from the result dicts below,
    # so these names document the expected layout rather than enforce it
    column_names = ["filename", "id", "coordinate_x", "coordinate_y"] + \
                   sum_exp_nuclei + ave_exp_nuclei + nuclei_morphology + \
                   sum_exp_cell + ave_exp_cell + cell_morphology

    # Initiate
    n_nuclei = np.max(nuclei_seg)
    feature_summary_df = pd.DataFrame(columns=column_names)

    if use_parallel:
        # ids start at 2 — presumably labels 0/1 are background/border; TODO confirm
        nuclei_ids = range(2, n_nuclei + 1)
        with Pool() as mp_pool:
            # fan one worker call out per nucleus id; all other arguments are
            # broadcast unchanged via itertools.repeat
            res = mp_pool.starmap(_extract_feature_one_nuclei,
                                  zip(nuclei_ids,
                                      itertools.repeat(nuclei_seg),
                                      itertools.repeat(cell_seg),
                                      itertools.repeat(filename),
                                      itertools.repeat(morphology),
                                      itertools.repeat(nuclei_morphology),
                                      itertools.repeat(cell_morphology),
                                      itertools.repeat(channels),
                                      itertools.repeat(raw_image),
                                      itertools.repeat(sum_exp_nuclei),
                                      itertools.repeat(ave_exp_nuclei),
                                      itertools.repeat(sum_exp_cell),
                                      itertools.repeat(ave_exp_cell)
                                      ))
        # print(len(res), n_nuclei)

    else:
        # sequential fallback with a progress bar
        res = []
        for nuclei_id in tqdm(range(2, n_nuclei + 1), position=0, leave=True):
            res.append(_extract_feature_one_nuclei(nuclei_id, nuclei_seg, cell_seg, filename,
                                                   morphology, nuclei_morphology, cell_morphology,
                                                   channels, raw_image,
                                                   sum_exp_nuclei, ave_exp_nuclei, sum_exp_cell, ave_exp_cell))

    feature_summary_df = pd.DataFrame(res)
    if show_sample:
        print(feature_summary_df.sample(5))

    return feature_summary_df
316
+
317
+
318
+
319
def check_feature_distribution(feature_summary_df, features):
    """Display a log2-scale histogram for each requested feature.

    Args:
        feature_summary_df: dataframe of extracted feature summary.
        features: column names whose distributions should be shown.

    Returns:
        None; one figure is displayed per feature.
    """
    for feat_name in features:
        print(feat_name)
        figure, axis = plt.subplots(1, 1, figsize=(3, 2))
        # small epsilon avoids log2(0) for zero-valued features
        log_values = np.log2(feature_summary_df[feat_name] + 0.0001)
        axis.hist(log_values, 100)
        axis.set_xlim(-15, 15)
        plt.show()
337
+
338
+
339
+ # def visualize_scatter(data, communities, n_community, title, figsize=(4,4), savename=None, show=False):
340
+ # """
341
+ # data = data to visualize (N, 2)
342
+ # communities = group indices correspond to each sample in data (N, 1) or (N, )
343
+ # n_community = total number of groups in the cohort (n_community >= unique number of communities)
344
+ # """
345
+ # fig, ax = plt.subplots(1,1, figsize=figsize)
346
+ # ax.set_title(title)
347
+ # sns.scatterplot(x=data[:,0], y=data[:,1], hue=communities, palette='tab20',
348
+ # hue_order=np.arange(n_community))
349
+ # # legend=legend,
350
+ # # hue_order=np.arange(n_community))
351
+ # plt.axis('tight')
352
+ # plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
353
+ # if savename is not None:
354
+ # print("saving plot to {}".format(savename))
355
+ # plt.savefig(savename)
356
+ # if show:
357
+ # plt.show()
358
+ # return None
359
+ # return fig
360
+
361
def visualize_scatter(data, communities, n_community, title, figsize=(5,5), savename=None, show=False, ax=None):
    """Scatter plot of 2D embedded data colored by community assignment.

    Args:
        data: data to visualize (N, 2).
        communities: group indices corresponding to each sample in data, (N, 1) or (N, ).
        n_community: total number of groups in the cohort
            (n_community >= unique number of communities).
        title: plot title.
        figsize: figure size used when a new figure is created.
        savename: if provided, path the figure is saved to.
        show: display the figure (only honored when ``ax`` is None).
        ax: existing axes to draw into; when provided, no new figure is
            created and None is returned.

    Returns:
        The created matplotlib figure, or None when ``ax`` was supplied.
    """
    # close the figure when we created it ourselves and are not showing it
    close_fig = not show and ax is None
    show = show and ax is None

    if ax is None:
        # fixed: figsize was accepted but never passed to plt.subplots
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    else:
        fig = None
    ax.set_title(title)
    sns.scatterplot(x=data[:,0], y=data[:,1], hue=communities, palette='tab20',
                    hue_order=np.arange(n_community), ax=ax)

    # legend outside the axes, to the upper right
    ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    if savename is not None:
        print("saving plot to {}".format(savename))
        plt.tight_layout()
        plt.savefig(savename)
    if show:
        plt.show()
    if close_fig:
        plt.close('all')
    return fig
391
+
392
def visualize_expression(data, markers, group_ids, title, figsize=(5,5), savename=None, show=False, ax=None):
    """Heatmap of normalized marker expression per phenograph cluster.

    Args:
        data: 2D expression matrix (clusters x markers).
        markers: x tick labels (marker names).
        group_ids: y tick labels (cluster ids).
        title: suffix appended to the plot title.
        figsize: figure size used when a new figure is created.
        savename: if provided, path the figure is saved to.
        show: display the figure (only honored when ``ax`` is None).
        ax: existing axes to draw into; when provided, no new figure is
            created and None is returned.

    Returns:
        The created matplotlib figure, or None when ``ax`` was supplied.
    """
    # close the figure when we created it ourselves and are not showing it
    close_fig = not show and ax is None
    show = show and ax is None
    if ax is None:
        # fixed: figsize was accepted but never passed to plt.subplots
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    else:
        fig = None

    sns.heatmap(data,
                cmap='magma',
                xticklabels=markers,
                yticklabels=group_ids,
                ax=ax
                )
    ax.set_xlabel("Markers")
    ax.set_ylabel("Phenograph clusters")
    ax.set_title("normalized expression - {}".format(title))
    ax.xaxis.set_tick_params(labelsize=8)
    if savename is not None:
        plt.tight_layout()
        plt.savefig(savename)
    if show:
        plt.show()
    if close_fig:
        plt.close('all')
    return fig
418
+
419
def _get_thresholds(df_feature: pd.DataFrame,
                    features: List[str],
                    thres_bg: float = 0.3,
                    visualize: bool = True,
                    verbose: bool = False):
    """Calculate thresholds for each feature by assuming a Gaussian Mixture Model

    A 2-component GMM is fit per feature; the component treated as background
    is the one with the smallest mean among components whose mixing weight
    exceeds ``thres_bg``. The threshold is that component's mean + 2.5 sigma.

    Inputs:
        df_feature = dataframe of extracted feature summary
        features   = a list of features to calculate thresholds from
        thres_bg   = a threshold such that the component with the mixing weight greater than the threshold would
                     be considered as background. (Default=0.3)
        visualize  = a flag indicating whether to visualize the feature distributions and thresholds or not.
                     (Default=True)
        verbose    = a flag indicating whether to print calculated values on screen or not. (Default=False)
    Outputs:
        thresholds = a dictionary of calculated threshold values
    :param df_feature: pandas.core.frame.DataFrame
    :param features: list
    :param visualize: bool
    :param verbose: bool
    :return thresholds: dict
    """
    thresholds = {}
    for f, feat_name in enumerate(features):
        # GaussianMixture expects 2D input: (n_samples, 1)
        X = df_feature[feat_name].values.reshape(-1, 1)
        gm = GaussianMixture(n_components=2, random_state=0, n_init=2).fit(X)
        # smallest mean among sufficiently-weighted components = background mean
        mu = np.min(gm.means_[gm.weights_ > thres_bg])
        # recover the index of the component whose mean equals mu
        which_component = np.argmax(gm.means_ == mu)

        if verbose:
            print(f"GMM mean values: {gm.means_}")
            print(f"GMM weights: {gm.weights_}")
            print(f"GMM covariances: {gm.covariances_}")

        # back to 1D values for histogram / counting
        X = df_feature[feat_name].values
        hist = np.histogram(X, 150)
        sigma = np.sqrt(gm.covariances_[which_component, 0, 0])
        background_ratio = gm.weights_[which_component]
        # threshold: 2.5 standard deviations above the background mean
        thres = sigma * 2.5 + mu
        thresholds[feat_name] = thres

        # count of values called "positive" under this threshold
        n = sum(X > thres)
        percentage = n / len(X)

        ## visualize
        if visualize:
            fig, ax = plt.subplots(1, 1)
            ax.hist(X, 150, density=True)
            ax.set_xlabel("log2({})".format(feat_name))
            # background component density, scaled by its mixing weight
            ax.plot(hist[1], scipy.stats.norm.pdf(hist[1], mu, sigma) * background_ratio, c='red')

            # the other (signal) component, scaled by the remaining weight
            _which_component = np.argmin(gm.means_ == mu)
            _mu = gm.means_[_which_component]
            _sigma = np.sqrt(gm.covariances_[_which_component, 0, 0])
            ax.plot(hist[1], scipy.stats.norm.pdf(hist[1], _mu, _sigma) * (1 - background_ratio), c='orange')

            ax.axvline(x=thres, c='red')
            ax.text(0.7, 0.9, "n={}, percentage={}".format(n, np.round(percentage, 3)), ha='center', va='center',
                    transform=ax.transAxes)
            ax.text(0.3, 0.9, "mu={}, sigma={}".format(np.round(mu, 2), np.round(sigma, 2)), ha='center', va='center',
                    transform=ax.transAxes)
            ax.text(0.3, 0.8, "background ratio={}".format(np.round(background_ratio, 2)), ha='center', va='center',
                    transform=ax.transAxes)
            ax.set_title(feat_name)
            plt.show()
    return thresholds
485
+
486
+ def _generate_summary(df_feature: pd.DataFrame, features: List[str], thresholds: dict) -> pd.DataFrame:
487
+ """Generate (cell level) summary table for each feature in features: feature name, total number (of cells),
488
+ calculated GMM threshold for this feature, number of individuals (cells) with greater than threshold values,
489
+ ratio of individuals (cells) with greater than threshold values
490
+ Inputs:
491
+ df_feature = dataframe of extracted feature summary
492
+ features = a list of features to generate summary table
493
+ thresholds = (calculated GMM-based) thresholds for each feature
494
+ Outputs:
495
+ df_info = summary table for each feature
496
+
497
+ :param df_feature: pandas.core.frame.DataFrame
498
+ :param features: list
499
+ :param thresholds: dict
500
+ :return df_info: pandas.core.frame.DataFrame
501
+ """
502
+
503
+ df_info = pd.DataFrame(columns=['feature', 'total number', 'threshold', 'positive counts', 'positive ratio'])
504
+
505
+ for feature in features: # loop over each feature
506
+ thres = thresholds[feature] # fetch threshold for the feature
507
+ X = df_feature[feature].values
508
+ n = sum(X > thres)
509
+ N = len(X)
510
+
511
+ df_new_row = pd.DataFrame({'feature': feature, 'total number': N, 'threshold': thres,
512
+ 'positive counts': n, 'positive ratio': n / N}, index=[0])
513
+ df_info = pd.concat([df_info, df_new_row])
514
+ return df_info.reset_index(drop=True)
requirements.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ matplotlib==3.6.0
2
+ numpy==1.24.3
3
+ pandas==1.5.1
4
+ PyYAML==6.0
5
+ scikit-image==0.19.3
6
+ scikit-learn==1.1.3
7
+ scipy==1.9.3
8
+ seaborn==0.12.1
9
+ tqdm==4.64.1
10
+ threadpoolctl==3.1.0
11
+ opencv-python==4.7.0.72
12
+ phenograph==1.5.7
13
+ umap-learn==0.5.3
14
+ readimc==0.6.2
15
+ gradio==4.0.1
16
+ plotly==5.18.0
17
+ imagecodecs==2023.1.23