ivangzf commited on
Commit
f0ed68b
·
1 Parent(s): 53cad39

add minor improvements to loading and saving

Browse files
Files changed (1) hide show
  1. cytof/classes.py +10 -231
cytof/classes.py CHANGED
@@ -37,17 +37,6 @@ from sklearn.cluster import KMeans
37
  from itertools import product
38
 
39
 
40
- # SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
41
- # sys.path.append(os.path.dirname(SCRIPT_DIR))
42
-
43
- # # from hyperion_segmentation import cytof_nuclei_segmentation, cytof_cell_segmentation, visualize_segmentation
44
- # # from utils import (save_multi_channel_img, generate_color_dict, show_color_table,
45
- # # visualize_scatter, visualize_expression, _get_thresholds, _generate_summary)
46
-
47
- # from cytof.hyperion_segmentation import cytof_nuclei_segmentation, cytof_cell_segmentation, visualize_segmentation
48
- # from cytof.utils import (save_multi_channel_img, generate_color_dict, show_color_table,
49
- # visualize_scatter, visualize_expression, _get_thresholds, _generate_summary)
50
-
51
  ## added for test
52
  import platform
53
  from pathlib import Path
@@ -61,18 +50,6 @@ from hyperion_segmentation import cytof_nuclei_segmentation, cytof_cell_segmenta
61
  from cytof.utils import (save_multi_channel_img, generate_color_dict, show_color_table,
62
  visualize_scatter, visualize_expression, _get_thresholds, _generate_summary)
63
 
64
- # def _get_colors(n):
65
- # base_colors = np.array([(1, 0, 0), (0, 1, 0), (0, 0, 1),
66
- # (0, 1, 1), (1, 0, 1), (1, 1, 0),
67
- # (1, 1, 1)])
68
-
69
- # n0 = len(base_colors)
70
- # if n <= n0:
71
- # colours = base_colors[:n]
72
- # else:
73
- # colours = np.vstack((base_colors, cm.rainbow(np.linspace(0, 1, n-n0))[:,:-1]))
74
- # return colours
75
-
76
  def get_name(dfrow):
77
  return os.path.join(dfrow['path'], dfrow['ROI'])
78
 
@@ -118,6 +95,9 @@ class CytofImage():
118
  return f"CytofImage(slide={self.slide}, roi={self.roi})"
119
 
120
  def save_cytof(self, savename: str):
 
 
 
121
  pkl.dump(self, open(savename, "wb"))
122
 
123
  def get_markers(self, imarker0: Optional[str] = None):
@@ -599,11 +579,6 @@ class CytofImage():
599
  else:
600
  channels = self.channels
601
  '''assert all([x.lower() in channels_temp for x in channels]), "Not all provided channels are available!"'''
602
- # for (i, chn) in enumerate(channels):
603
- # savename = os.path.join(savedir, f"{chn}.tiff")
604
- # im_temp = self.image[..., i]
605
- # im_temp_ = np.clip(im_temp / np.quantile(im_temp, 0.99), 0, 1)
606
- # save_multi_channel_img((im_temp_ * 255).astype(np.uint8), savename)
607
  for chn in channels:
608
  savename = os.path.join(savedir, f"{chn}{ext}")
609
  # i = channels_temp.index(chn.lower())
@@ -1190,7 +1165,6 @@ class CytofCohort():
1190
  """
1191
  self.cytof_images = cytof_images or {}
1192
  self.df_cohort = df_cohort# or None# pd.read_csv(file_cohort) # the slide-ROI
1193
- # self.df_io = None pd.read_csv(file_io) # the input-output correspondence file
1194
  self.feat_sets = {
1195
  "all": ["cell_sum", "cell_ave", "cell_morphology"],
1196
  "cell_sum": ["cell_sum", "cell_morphology"],
@@ -1214,6 +1188,9 @@ class CytofCohort():
1214
  return f"CytofCohort(name={self.name})"
1215
 
1216
  def save_cytof_cohort(self, savename):
 
 
 
1217
  pkl.dump(self, open(savename, "wb"))
1218
 
1219
  def batch_process_feature(self):
@@ -1247,8 +1224,8 @@ class CytofCohort():
1247
 
1248
 
1249
  def batch_process(self, params: Dict):
1250
- sys.path.append("../../CLIscripts")
1251
- from single_roi import process_single, SetParameters
1252
  for i, (slide, roi, fname) in self.df_cohort.iterrows():
1253
  paramsi = SetParameters(filename=fname,
1254
  outdir=self.dir_out,
@@ -1395,9 +1372,8 @@ class CytofCohort():
1395
  kmeans = KMeans(n_clusters=k, random_state=42).fit(df_feature)
1396
  communities = kmeans.labels_
1397
  else:
1398
- communities, graph, Q = phenograph.cluster(df_feature, k=k, seed=42, n_jobs=10) # run PhenoGraph
1399
 
1400
- print('Performing dimensionality reduction with UMAP now...')
1401
  # project to 2D using UMAP
1402
  umap_2d = umap.UMAP(n_components=2, init='random', random_state=0)
1403
  proj_2d = umap_2d.fit_transform(df_feature)
@@ -1699,14 +1675,12 @@ class CytofCohort():
1699
  thres = kwars.get(_, default_thres[_])
1700
  """print("{}: {}".format(_, thres))"""
1701
  networks = self._gather_roi_kneighbor_graphs(key_pheno, method=method, **{_: thres})
1702
- # networks = _gather_roi_kneighbor_graphs(self, key_pheno, method=method, **{_: thres})
1703
 
1704
  if level == "slide":
1705
  keys = ['edge_nums', 'expected_percentage', 'num_cell']
1706
  for slide in self.df_cohort['Slide'].unique():
1707
  cond = self.df_cohort['Slide'] == slide
1708
  df_slide = self.df_cohort.loc[cond, :]
1709
- # rois = df_slide.apply(lambda row: get_name(row), axis=1).values
1710
  rois = df_slide['ROI'].values
1711
  '''keys = list(networks.values())[0].keys()'''
1712
  networks[slide] = {}
@@ -1756,6 +1730,7 @@ class CytofCohort():
1756
  figsize=(6, 6), row_cluster=False, col_cluster=False)
1757
  plt.title(f_key)
1758
  plt.show()
 
1759
  # IMPORTANT: attach to individual ROIs
1760
  self.attach_individual_roi_pheno(key_pheno, override=True)
1761
  return interacts, clustergrid
@@ -1792,7 +1767,6 @@ class CytofCohort():
1792
  visualize=vis_thres,
1793
  verbose=verbose)
1794
  setattr(self, "marker_thresholds", thres)
1795
- print('done getting threshold for summary')
1796
 
1797
  # split to each ROI
1798
  _attr_marker_pos, seen = [], 0
@@ -1818,29 +1792,6 @@ class CytofCohort():
1818
  seen += 1
1819
  return _attr_marker_pos
1820
 
1821
- def get_roi_co_exp_compoent(self, feature_name, accumul_type):
1822
- """
1823
- Creates the components for ROI level co-expression. This is expected to be used in cohort analysis
1824
- """
1825
-
1826
- # initialize dictionaries to record components of the log odds-ratio
1827
- co_positive_counts_dict = dict() # theta in manuscript
1828
- expected_counts_dict = dict() # E in manuscript
1829
- num_cells_dict = dict()
1830
-
1831
- for i, cytof_img in enumerate(self.cytof_images.values()):
1832
- slide, roi = cytof_img.slide, cytof_img.roi
1833
-
1834
- # compute the co-expression at the ROI level, but does not compute the final probability
1835
- df_co_pos_prob, df_expected_prob, n_cell = cytof_img.roi_co_expression(feature_name=feature_name, accumul_type=accumul_type, return_components=True)
1836
-
1837
- # store the counts and number of cells separately
1838
- co_positive_counts_dict[roi] = df_co_pos_prob
1839
- expected_counts_dict[roi] = df_expected_prob
1840
- num_cells_dict[roi] = n_cell
1841
-
1842
- return co_positive_counts_dict, expected_counts_dict, num_cells_dict
1843
-
1844
  def co_expression_analysis(self,
1845
  normq: int = 75,
1846
  feat_type: str = "normed",
@@ -1941,175 +1892,3 @@ class CytofCohort():
1941
  slide_co_expression_dict[slide_key] = (edge_percentage_norm, df_expected.columns)
1942
 
1943
  return slide_co_expression_dict
1944
-
1945
-
1946
- def _gather_roi_co_exp(self,
1947
- feat_name: str,
1948
- accumul_type: str = "sum"):
1949
- """roi level co-expression analysis"""
1950
- n_attr = f"df_feature_{feat_name}"
1951
- expected_percentages = {}
1952
- edge_percentages = {}
1953
- num_cells = {}
1954
- df_slide_roi = self.df_cohort
1955
-
1956
- for i, cytof_img in enumerate(self.cytof_images.values()):
1957
- slide, roi = cytof_img.slide, cytof_img.roi
1958
- df_feat = getattr(cytof_img , n_attr)
1959
-
1960
- if i == 0:
1961
- # all gene (marker) columns
1962
- marker_col_all = [x for x in df_feat.columns if f"cell_{accumul_type}" in x]
1963
- ids = [cytof_img.channels.index(x.split(f"_cell_{accumul_type}")[0]) for x in marker_col_all]
1964
- marker_all = list(np.array(cytof_img.markers, dtype=object)[np.array(ids)])
1965
- n_marker = len(marker_col_all)
1966
- n_cell = len(df_feat)
1967
-
1968
- # corresponding marker positive info file
1969
- if not hasattr(cytof_img, f"cell_count_{feat_name}_{accumul_type}"):
1970
- print('no marker positive analysis found. Generating analysis...')
1971
- self.generate_summary()
1972
-
1973
- df_info_cell = getattr(cytof_img, f"cell_count_{feat_name}_{accumul_type}")
1974
- pos_nums = df_info_cell["positive counts"].values
1975
- pos_ratios = df_info_cell["positive ratio"].values
1976
- thresholds = df_info_cell["threshold"].values
1977
-
1978
- # create new expected_percentage matrix for each ROI
1979
- expected_percentage = np.zeros((n_marker, n_marker))
1980
- edge_percentage = np.zeros_like(expected_percentage)
1981
-
1982
- """expected_percentage
1983
- an N by N matrix, where N represent for the number of total gene (marker)
1984
- each ij-th element represents for the percentage that both the i-th and the j-th gene is "positive"
1985
- based on the threshold defined previously"""
1986
- for ii in range(n_marker):
1987
- for jj in range(n_marker):
1988
- expected_percentage[ii, jj] = pos_nums[ii] * pos_nums[jj]
1989
- expected_percentages[roi] = expected_percentage
1990
-
1991
- """edge_percentage
1992
- an N by N matrix, where N represent for the number of gene (marker)
1993
- each ij-th element represents for the percentage of cells that show positive in both i-th and j-th gene
1994
- """
1995
- edge_nums = np.zeros_like(expected_percentage)
1996
- for ii in range(n_marker):
1997
- _x = df_feat[marker_col_all[ii]].values > thresholds[ii]
1998
- # _x = df_feat[marker_col_all[ii]].values > thresholds[marker_idx[ii]]
1999
- for jj in range(n_marker):
2000
- _y = df_feat[marker_col_all[jj]].values > thresholds[jj]
2001
- # _y = df_feat[marker_col_all[jj]].values > thresholds[marker_idx[jj]]
2002
- edge_nums[ii, jj] = np.sum(np.all([_x, _y], axis=0)) # / n_cell
2003
- edge_percentages[roi] = edge_nums
2004
- num_cells[roi] = n_cell
2005
-
2006
- return expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all
2007
-
2008
- def _co_expression_analysis_(self,
2009
- normq: int = 75,
2010
- feat_type: str = "normed",
2011
- co_exp_markers: Union[str, List] = "all",
2012
- accumul_type: Union[str, List[str]] = "sum",
2013
- level: str = "slide",
2014
- verbose: bool = False,
2015
- clustergrid=None):
2016
-
2017
- assert level in ["slide", "roi"], "Only slide or roi levels are accepted!"
2018
- assert feat_type in ["original", "normed", "scaled"]
2019
- if feat_type == "original":
2020
- feat_name = ""
2021
- elif feat_type == "normed":
2022
- feat_name = f"{normq}normed"
2023
- else:
2024
- feat_name = f"{normq}normed_scaled"
2025
- if verbose:
2026
- print(feat_name)
2027
-
2028
- expected_percentages, edge_percentages, num_cells, marker_all, marker_col_all = \
2029
- self._gather_roi_co_exp(feat_name)
2030
-
2031
-
2032
- if co_exp_markers != "all":
2033
- # assert (isinstance(co_exp_markers, list) and all([x in cytof_img.markers for x in co_exp_markers]))
2034
- assert (isinstance(co_exp_markers, list) and all([x in marker_all for x in co_exp_markers]))
2035
- marker_idx = np.array([marker_all.index(x) for x in co_exp_markers])
2036
- marker_all = [marker_all[x] for x in marker_idx]
2037
- marker_col_all = [marker_col_all[x] for x in marker_idx]
2038
- else:
2039
- marker_idx = np.arange(len(marker_all))
2040
-
2041
- df_slide_roi = self.df_cohort
2042
- if level == "slide":
2043
- # expected_percentages, edge_percentages = {}, {}
2044
- for slide in df_slide_roi["Slide"].unique(): ## for each slide
2045
- print('co-exp slide:', slide)
2046
- print('exp-perc keys before:', expected_percentages.keys())
2047
- for seen_roi, f_roi in enumerate(df_slide_roi.loc[df_slide_roi["Slide"] == slide, "ROI"]): ## for each ROI
2048
- roi = f_roi.replace(".txt", "")
2049
- print('roi finding:', roi)
2050
- if roi not in expected_percentages:
2051
- continue
2052
- if seen_roi == 0:
2053
- expected_percentages[slide] = expected_percentages[roi]
2054
- edge_percentages[slide] = edge_percentages[roi]
2055
- num_cells[slide] = num_cells[roi]
2056
- else:
2057
- expected_percentages[slide] += expected_percentages[roi]
2058
- edge_percentages[slide] += edge_percentages[roi]
2059
- num_cells[slide] += num_cells[roi]
2060
- expected_percentages.pop(roi)
2061
- edge_percentages.pop(roi)
2062
- num_cells.pop(roi)
2063
-
2064
- print('exp-perc keys after:', expected_percentages.keys())
2065
- # print('exp_after:', expected_percentages)
2066
-
2067
- co_exps = {}
2068
- for key, expected_percentage in expected_percentages.items():
2069
- print('key for co-exp:', key)
2070
- # print(type(expected_percentage), type(num_cells[key]))
2071
- # print('exp_perc:', expected_percentage)
2072
- expected_percentage = expected_percentage / num_cells[key] ** 2
2073
- edge_percentage = edge_percentages[key] / num_cells[key]
2074
-
2075
- # Normalize
2076
- edge_percentage_norm = np.log10(edge_percentage / expected_percentage + 0.1)
2077
-
2078
- # Fix Nan
2079
- edge_percentage_norm[np.isnan(edge_percentage_norm)] = np.log10(1 + 0.1)
2080
-
2081
- co_exps[key] = edge_percentage_norm
2082
-
2083
- # plot
2084
- for f_key, edge_percentage_norm in co_exps.items():
2085
- # fig, ax = plt.subplots(1,1, figsize=(6,6))
2086
- # # ax = sns.heatmap(edge_percentage_norm, center=np.log10(1 + 0.1),
2087
- # sns.heatmap(edge_percentage_norm[marker_idx, :][:, marker_idx], center=np.log10(1 + 0.1),
2088
- # cmap='RdBu_r', vmin=-1, vmax=3, xticklabels=marker_all, yticklabels=marker_all, ax=ax)
2089
- # ax.set_aspect('equal')
2090
- # plt.title(f_key)
2091
- # plt.show()
2092
-
2093
- if clustergrid is None:
2094
- plt.figure()
2095
- clustergrid = sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx],
2096
- # clustergrid = sns.clustermap(edge_percentage_norm,
2097
- center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
2098
- xticklabels=marker_all, yticklabels=marker_all)
2099
- if verbose:
2100
- plt.title(f_key)
2101
- plt.show()
2102
-
2103
- else:
2104
- if verbose:
2105
- plt.figure()
2106
- sns.clustermap(edge_percentage_norm[marker_idx, :][:, marker_idx] \
2107
- [clustergrid.dendrogram_row.reordered_ind, :]\
2108
- [:, clustergrid.dendrogram_row.reordered_ind],
2109
- center=np.log10(1 + 0.1), cmap='RdBu_r', vmin=-1, vmax=3,
2110
- xticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
2111
- yticklabels=np.array(marker_all)[clustergrid.dendrogram_row.reordered_ind],
2112
- figsize=(6, 6), row_cluster=False, col_cluster=False)
2113
- plt.title(f_key)
2114
- plt.show()
2115
- return co_exps, marker_all, marker_idx, clustergrid
 
37
  from itertools import product
38
 
39
 
 
 
 
 
 
 
 
 
 
 
 
40
  ## added for test
41
  import platform
42
  from pathlib import Path
 
50
  from cytof.utils import (save_multi_channel_img, generate_color_dict, show_color_table,
51
  visualize_scatter, visualize_expression, _get_thresholds, _generate_summary)
52
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def get_name(dfrow):
54
  return os.path.join(dfrow['path'], dfrow['ROI'])
55
 
 
95
  return f"CytofImage(slide={self.slide}, roi={self.roi})"
96
 
97
  def save_cytof(self, savename: str):
98
+ directory = os.path.dirname(savename)
99
+ if not os.path.exists(directory):
100
+ os.makedirs(directory)
101
  pkl.dump(self, open(savename, "wb"))
102
 
103
  def get_markers(self, imarker0: Optional[str] = None):
 
579
  else:
580
  channels = self.channels
581
  '''assert all([x.lower() in channels_temp for x in channels]), "Not all provided channels are available!"'''
 
 
 
 
 
582
  for chn in channels:
583
  savename = os.path.join(savedir, f"{chn}{ext}")
584
  # i = channels_temp.index(chn.lower())
 
1165
  """
1166
  self.cytof_images = cytof_images or {}
1167
  self.df_cohort = df_cohort# or None# pd.read_csv(file_cohort) # the slide-ROI
 
1168
  self.feat_sets = {
1169
  "all": ["cell_sum", "cell_ave", "cell_morphology"],
1170
  "cell_sum": ["cell_sum", "cell_morphology"],
 
1188
  return f"CytofCohort(name={self.name})"
1189
 
1190
  def save_cytof_cohort(self, savename):
1191
+ directory = os.path.dirname(savename)
1192
+ if not os.path.exists(directory):
1193
+ os.makedirs(directory)
1194
  pkl.dump(self, open(savename, "wb"))
1195
 
1196
  def batch_process_feature(self):
 
1224
 
1225
 
1226
  def batch_process(self, params: Dict):
1227
+ sys.path.append("../CLIscripts")
1228
+ from process_single_roi import process_single, SetParameters
1229
  for i, (slide, roi, fname) in self.df_cohort.iterrows():
1230
  paramsi = SetParameters(filename=fname,
1231
  outdir=self.dir_out,
 
1372
  kmeans = KMeans(n_clusters=k, random_state=42).fit(df_feature)
1373
  communities = kmeans.labels_
1374
  else:
1375
+ communities, graph, Q = phenograph.cluster(df_feature, k=k, n_jobs=-1) # run PhenoGraph
1376
 
 
1377
  # project to 2D using UMAP
1378
  umap_2d = umap.UMAP(n_components=2, init='random', random_state=0)
1379
  proj_2d = umap_2d.fit_transform(df_feature)
 
1675
  thres = kwars.get(_, default_thres[_])
1676
  """print("{}: {}".format(_, thres))"""
1677
  networks = self._gather_roi_kneighbor_graphs(key_pheno, method=method, **{_: thres})
 
1678
 
1679
  if level == "slide":
1680
  keys = ['edge_nums', 'expected_percentage', 'num_cell']
1681
  for slide in self.df_cohort['Slide'].unique():
1682
  cond = self.df_cohort['Slide'] == slide
1683
  df_slide = self.df_cohort.loc[cond, :]
 
1684
  rois = df_slide['ROI'].values
1685
  '''keys = list(networks.values())[0].keys()'''
1686
  networks[slide] = {}
 
1730
  figsize=(6, 6), row_cluster=False, col_cluster=False)
1731
  plt.title(f_key)
1732
  plt.show()
1733
+
1734
  # IMPORTANT: attach to individual ROIs
1735
  self.attach_individual_roi_pheno(key_pheno, override=True)
1736
  return interacts, clustergrid
 
1767
  visualize=vis_thres,
1768
  verbose=verbose)
1769
  setattr(self, "marker_thresholds", thres)
 
1770
 
1771
  # split to each ROI
1772
  _attr_marker_pos, seen = [], 0
 
1792
  seen += 1
1793
  return _attr_marker_pos
1794
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1795
  def co_expression_analysis(self,
1796
  normq: int = 75,
1797
  feat_type: str = "normed",
 
1892
  slide_co_expression_dict[slide_key] = (edge_percentage_norm, df_expected.columns)
1893
 
1894
  return slide_co_expression_dict