Xue-Jun committed on
Commit
4a4b152
·
verified ·
1 Parent(s): fe92153

Upload 6 files

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. app.py +2 -8
  3. r_functions.py +19 -49
  4. usalign_runner.py +27 -31
  5. utils.py +52 -164
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /data
2
+ /__pycache__
app.py CHANGED
@@ -1,21 +1,14 @@
1
- import hashlib
2
  import os
3
- import sys
4
- from io import StringIO
5
- from pathlib import Path
6
 
7
  import gradio as gr
8
- import matplotlib.pyplot as plt
9
- import pandas as pd
10
 
11
- from usalign_runner import USalignRunner
12
  from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
13
 
14
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
15
 
16
  # Create Gradio interface
17
  with gr.Blocks() as demo:
18
- gr.Markdown("# Structure-Based Similarity Network")
19
 
20
  with gr.Row():
21
  file_input = gr.File(
@@ -93,6 +86,7 @@ with gr.Blocks() as demo:
93
  outputs=[
94
  tm_matrix_output,
95
  newick_output,
 
96
  download_tm,
97
  ],
98
  )
 
 
1
  import os
 
 
 
2
 
3
  import gradio as gr
 
 
4
 
 
5
  from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
6
 
7
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
8
 
9
  # Create Gradio interface
10
  with gr.Blocks() as demo:
11
+ gr.Markdown("# This is a Temp Title")
12
 
13
  with gr.Row():
14
  file_input = gr.File(
 
86
  outputs=[
87
  tm_matrix_output,
88
  newick_output,
89
+ # network_plot,
90
  download_tm,
91
  ],
92
  )
r_functions.py CHANGED
@@ -1,21 +1,16 @@
1
- import pandas as pd
2
- import numpy as np
3
- from rpy2.robjects import pandas2ri, r, Formula
4
- from rpy2.robjects.packages import importr
5
- from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
6
- from rpy2.robjects.conversion import localconverter
7
  import rpy2.robjects as ro
8
- import os
9
-
 
10
 
11
  pandas2ri.activate()
12
 
13
  # 导入必要的 R 包
14
- stats = importr('stats')
15
- ape = importr('ape')
16
- igraph = importr('igraph', robject_translations={'.env': '_env_'})
17
- openxlsx = importr('openxlsx')
18
- # dplyr = importr('dplyr')
19
 
20
  def get_r_matrix(df):
21
  with localconverter(ro.default_converter + pandas2ri.converter):
@@ -23,34 +18,31 @@ def get_r_matrix(df):
23
  return r_tm_matrix
24
 
25
 
26
- export_matrix_to_newick_r = ro.r("""
 
27
  convert_to_newick <- function(tm_matrix, output_file) {
28
  # 导入 ape 包
29
  if (!require(ape, quietly = TRUE)) {
30
  install.packages("ape", repos = "https://cran.r-project.org")
31
  library(ape)
32
  }
33
-
34
  # 计算距离矩阵
35
  dist_matrix <- dist(tm_matrix)
36
-
37
  # 层次聚类
38
  hclust_tree <- hclust(dist_matrix, method = "ward.D2")
39
-
40
  # 转为 phylo 对象
41
  phylo_tree <- as.phylo(hclust_tree)
42
-
43
  # 导出为 Newick 格式
44
  write.tree(phylo_tree, file = output_file)
45
-
46
  newick_str <- write.tree(phylo_tree)
47
-
48
  return(newick_str)
49
  }
50
- """)
 
51
 
52
- export_similarity_network_r = ro.r("""
53
- create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_path) {
 
54
  # 导入必要的包
55
  if (!require(igraph, quietly = TRUE)) {
56
  install.packages("igraph", repos = "https://cran.r-project.org")
@@ -60,74 +52,53 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_pa
60
  install.packages("openxlsx", repos = "https://cran.r-project.org")
61
  library(openxlsx)
62
  }
63
-
64
  # 根据相似性阈值创建边缘列表,并过滤掉自环
65
  overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
66
  overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
67
-
68
  # 创建空的图形对象
69
  graph <- graph.empty()
70
-
71
  # 添加节点
72
  nodes <- rownames(tm_matrix)
73
  graph <- add_vertices(graph, nv = length(nodes), name = nodes)
74
-
75
  # 添加边
76
  for (i in 1:nrow(overthresholdedges)) {
77
  graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
78
  }
79
-
80
  # 转换为无向图
81
  graph <- as.undirected(graph, mode = "collapse")
82
-
83
  # 计算聚类
84
  clusters <- fastgreedy.community(graph)
85
-
86
  # 获取每个聚类的大小
87
  cluster_sizes <- sizes(clusters)
88
-
89
  # 按聚类大小降序排序
90
  sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
91
-
92
  # 获取每个聚类的成员
93
  cluster_members <- membership(clusters)
94
-
95
  # 找到孤立节点
96
  singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
97
-
98
  # 创建Cytoscape导出文件
99
  cytoscape_export <- createWorkbook()
100
-
101
  # 创建边Sheet
102
  addWorksheet(cytoscape_export, sheetName = "Edges")
103
  writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
104
  writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
105
-
106
  # 获取边列表
107
  edges <- get.edgelist(graph)
108
-
109
  # 填充边Sheet数据
110
  if (nrow(edges) > 0) {
111
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
112
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
113
- }
114
-
115
  # 找到当前边Sheet的最后一行
116
  last_edge_row <- nrow(edges) + 1
117
-
118
  # 添加孤立节点
119
  if (length(singleton_nodes) > 0) {
120
  writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
121
  }
122
-
123
  # 保存Excel文件
124
  saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
125
-
126
- saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)
127
-
128
  # 创建一个空的数据框用于储存节点和聚类信息
129
  export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
130
-
131
  # 遍历 sorted_clusters
132
  cluster_index <- 1 # 初始化簇索引
133
  for (cluster_name in names(sorted_clusters)) {
@@ -143,10 +114,9 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_pa
143
  export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
144
  }
145
  cluster_index <- cluster_index + 1 # 索引加1
146
- }
147
-
148
- write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
149
  # 返回聚类结果
150
  return(list(cluster_data = export_clusters, graph = graph))
151
  }
152
- """)
 
 
 
 
 
 
 
 
1
  import rpy2.robjects as ro
2
+ from rpy2.robjects import pandas2ri
3
+ from rpy2.robjects.conversion import localconverter
4
+ from rpy2.robjects.packages import importr
5
 
6
  pandas2ri.activate()
7
 
8
  # 导入必要的 R 包
9
+ stats = importr("stats")
10
+ ape = importr("ape")
11
+ igraph = importr("igraph", robject_translations={".env": "_env_"})
12
+ openxlsx = importr("openxlsx")
13
+
14
 
15
  def get_r_matrix(df):
16
  with localconverter(ro.default_converter + pandas2ri.converter):
 
18
  return r_tm_matrix
19
 
20
 
21
+ export_matrix_to_newick_r = ro.r(
22
+ """
23
  convert_to_newick <- function(tm_matrix, output_file) {
24
  # 导入 ape 包
25
  if (!require(ape, quietly = TRUE)) {
26
  install.packages("ape", repos = "https://cran.r-project.org")
27
  library(ape)
28
  }
 
29
  # 计算距离矩阵
30
  dist_matrix <- dist(tm_matrix)
 
31
  # 层次聚类
32
  hclust_tree <- hclust(dist_matrix, method = "ward.D2")
 
33
  # 转为 phylo 对象
34
  phylo_tree <- as.phylo(hclust_tree)
 
35
  # 导出为 Newick 格式
36
  write.tree(phylo_tree, file = output_file)
 
37
  newick_str <- write.tree(phylo_tree)
 
38
  return(newick_str)
39
  }
40
+ """
41
+ )
42
 
43
+ export_similarity_network_r = ro.r(
44
+ """
45
+ create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
46
  # 导入必要的包
47
  if (!require(igraph, quietly = TRUE)) {
48
  install.packages("igraph", repos = "https://cran.r-project.org")
 
52
  install.packages("openxlsx", repos = "https://cran.r-project.org")
53
  library(openxlsx)
54
  }
 
55
  # 根据相似性阈值创建边缘列表,并过滤掉自环
56
  overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
57
  overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
 
58
  # 创建空的图形对象
59
  graph <- graph.empty()
 
60
  # 添加节点
61
  nodes <- rownames(tm_matrix)
62
  graph <- add_vertices(graph, nv = length(nodes), name = nodes)
 
63
  # 添加边
64
  for (i in 1:nrow(overthresholdedges)) {
65
  graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
66
  }
 
67
  # 转换为无向图
68
  graph <- as.undirected(graph, mode = "collapse")
 
69
  # 计算聚类
70
  clusters <- fastgreedy.community(graph)
 
71
  # 获取每个聚类的大小
72
  cluster_sizes <- sizes(clusters)
 
73
  # 按聚类大小降序排序
74
  sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
 
75
  # 获取每个聚类的成员
76
  cluster_members <- membership(clusters)
 
77
  # 找到孤立节点
78
  singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
 
79
  # 创建Cytoscape导出文件
80
  cytoscape_export <- createWorkbook()
 
81
  # 创建边Sheet
82
  addWorksheet(cytoscape_export, sheetName = "Edges")
83
  writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
84
  writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
 
85
  # 获取边列表
86
  edges <- get.edgelist(graph)
 
87
  # 填充边Sheet数据
88
  if (nrow(edges) > 0) {
89
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
90
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
91
+ }
 
92
  # 找到当前边Sheet的最后一行
93
  last_edge_row <- nrow(edges) + 1
 
94
  # 添加孤立节点
95
  if (length(singleton_nodes) > 0) {
96
  writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
97
  }
 
98
  # 保存Excel文件
99
  saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
 
 
 
100
  # 创建一个空的数据框用于储存节点和聚类信息
101
  export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
 
102
  # 遍历 sorted_clusters
103
  cluster_index <- 1 # 初始化簇索引
104
  for (cluster_name in names(sorted_clusters)) {
 
114
  export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
115
  }
116
  cluster_index <- cluster_index + 1 # 索引加1
117
+ }
 
 
118
  # 返回聚类结果
119
  return(list(cluster_data = export_clusters, graph = graph))
120
  }
121
+ """
122
+ )
usalign_runner.py CHANGED
@@ -1,29 +1,27 @@
1
  import subprocess
2
- import os
3
- from typing import List, Optional
4
  from pathlib import Path
 
 
5
  import yaml
6
 
 
7
  class USalignRunner:
8
  def __init__(self, config_path: str = "config.yaml"):
9
  """
10
  Initialize USalignRunner with parameters from config file.
11
-
12
  Args:
13
  config_path (str): Path to the configuration file
14
  """
15
- with open(config_path, 'r',encoding="utf-8") as f:
16
  config = yaml.safe_load(f)
17
-
18
- self.usalign_path = Path(config['USalign']['path'])
19
  self.default_params = {
20
- 'tmscore': config['USalign']['tmscore'],
21
- 'outfmt': config['USalign']['outfmt'],
22
- 'mol': 'protein' # Default to protein alignment
23
  }
24
-
25
- if not self.usalign_path.exists():
26
- raise FileNotFoundError(f"USalign executable not found at {self.usalign_path}")
27
 
28
  def run_alignment(
29
  self,
@@ -32,37 +30,35 @@ class USalignRunner:
32
  tmscore: Optional[float] = None,
33
  outfmt: Optional[int] = None,
34
  ) -> tuple[int, str, str]:
35
- tmscore = tmscore if tmscore is not None else self.default_params['tmscore']
36
- outfmt = outfmt if outfmt is not None else self.default_params['outfmt']
37
-
38
  # Create the command
39
  cmd = [
40
  str(self.usalign_path),
41
- "-mol", self.default_params['mol'],
42
- "-dir", str(target_dir),
 
 
43
  pdb_list_file,
44
- "-TMscore", str(tmscore),
45
- "-outfmt", str(outfmt)
 
 
46
  ]
47
  print(cmd)
48
-
49
  # Convert command list to string
50
  cmd_str = " ".join(cmd)
51
-
52
  try:
53
  # Execute the command
54
- process = subprocess.Popen(
55
- cmd_str,
56
- stdout=subprocess.PIPE,
57
- stderr=subprocess.PIPE,
58
- shell=True,
59
- text=True
60
- )
61
-
62
  # Get output
63
  stdout, stderr = process.communicate()
64
-
65
  return process.returncode, stdout, stderr
66
-
67
  except Exception as e:
68
  return -1, "", str(e)
 
1
  import subprocess
 
 
2
  from pathlib import Path
3
+ from typing import Optional
4
+
5
  import yaml
6
 
7
+
8
  class USalignRunner:
9
  def __init__(self, config_path: str = "config.yaml"):
10
  """
11
  Initialize USalignRunner with parameters from config file.
12
+
13
  Args:
14
  config_path (str): Path to the configuration file
15
  """
16
+ with open(config_path, "r", encoding="utf-8") as f:
17
  config = yaml.safe_load(f)
18
+
19
+ self.usalign_path = Path(config["USalign"]["path"])
20
  self.default_params = {
21
+ "tmscore": config["USalign"]["tmscore"],
22
+ "outfmt": config["USalign"]["outfmt"],
23
+ "mol": "protein", # Default to protein alignment
24
  }
 
 
 
25
 
26
  def run_alignment(
27
  self,
 
30
  tmscore: Optional[float] = None,
31
  outfmt: Optional[int] = None,
32
  ) -> tuple[int, str, str]:
33
+ tmscore = tmscore if tmscore is not None else self.default_params["tmscore"]
34
+ outfmt = outfmt if outfmt is not None else self.default_params["outfmt"]
35
+
36
  # Create the command
37
  cmd = [
38
  str(self.usalign_path),
39
+ "-mol",
40
+ self.default_params["mol"],
41
+ "-dir",
42
+ str(target_dir),
43
  pdb_list_file,
44
+ "-TMscore",
45
+ str(tmscore),
46
+ "-outfmt",
47
+ str(outfmt),
48
  ]
49
  print(cmd)
50
+
51
  # Convert command list to string
52
  cmd_str = " ".join(cmd)
53
+
54
  try:
55
  # Execute the command
56
+ process = subprocess.Popen(cmd_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True)
57
+
 
 
 
 
 
 
58
  # Get output
59
  stdout, stderr = process.communicate()
60
+
61
  return process.returncode, stdout, stderr
62
+
63
  except Exception as e:
64
  return -1, "", str(e)
utils.py CHANGED
@@ -1,224 +1,138 @@
1
- import numpy as np
2
- import pandas as pd
3
- # import fastcluster
4
- import networkx as nx
5
- from community import community_louvain
6
- from scipy.spatial.distance import pdist, squareform
7
- from scipy.cluster.hierarchy import linkage, to_tree
8
- from networkx.algorithms.community import greedy_modularity_communities
9
- from Bio import Phylo
10
- from Bio.Phylo.BaseTree import Tree, Clade
11
- import matplotlib.pyplot as plt
12
- import sys
13
- import gradio as gr
14
- import os
15
  import hashlib
16
- from pathlib import Path
17
- import pandas as pd
18
  from io import StringIO
19
- from usalign_runner import USalignRunner
20
- import pandas as pd
21
  import numpy as np
22
- from rpy2.robjects import pandas2ri, r, Formula
23
- from rpy2.robjects.packages import importr
24
- from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
25
- from rpy2.robjects.conversion import localconverter
26
  import rpy2.robjects as ro
27
- import os
 
 
 
 
28
 
29
- from r_functions import get_r_matrix,export_matrix_to_newick_r,export_similarity_network_r
30
 
31
  def get_TM_mat_from_df(df):
32
- chain1_unique = df['#PDBchain1'].unique()
33
- chain2_unique = df['PDBchain2'].unique()
34
- unique_chains = sorted(set(df['#PDBchain1'].unique()).union(set(df['PDBchain2'].unique())))
35
  chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
36
  n = len(unique_chains)
37
  matrix = np.eye(n)
38
  for _, row in df.iterrows():
39
- chain1 = row['#PDBchain1']
40
- chain2 = row['PDBchain2']
41
  if chain1 in chain_to_idx and chain2 in chain_to_idx:
42
  i = chain_to_idx[chain1]
43
  j = chain_to_idx[chain2]
44
- matrix[j, i] = row['TM1']
45
- matrix[i, j] = row['TM2']
46
 
47
- columns_names = [chain.replace("/","").replace(".pdb:A","") for chain in unique_chains]
48
- df = pd.DataFrame(np.array(matrix),
49
- columns=columns_names,
50
- index=columns_names)
51
  return df
52
 
53
 
54
- # def get_cluster_z_from_df(df):
55
- # dist_matrix = pdist(df, metric='euclidean')
56
- # Z = fastcluster.linkage(dist_matrix, method='ward')
57
- # return Z
58
-
59
- def scipy_to_biopython(Z, labels):
60
- """将scipy的linkage矩阵转换为Bio.Phylo树"""
61
- tree = to_tree(Z, rd=False)
62
-
63
- def build_clade(node):
64
- if node.is_leaf():
65
- return Clade(branch_length=node.dist, name=labels[node.id])
66
- else:
67
- left = build_clade(node.left)
68
- right = build_clade(node.right)
69
- return Clade(branch_length=node.dist, clades=[left, right])
70
-
71
- root = build_clade(tree)
72
- return Tree(root)
73
-
74
- def write_str_to_file(s:str,file_path:str):
75
- with open(file_path,'w',encoding="utf8") as f:
76
- f.write(s)
77
-
78
-
79
- def build_graph_from_mat_df(TM_score_matrix,threshold = 0.75):
80
-
81
- G = nx.Graph()
82
- G.add_nodes_from(TM_score_matrix.index)
83
- matrix_values = TM_score_matrix.values
84
- # np.fill_diagonal(matrix_values, 0) # 排除自环
85
- rows, cols = np.where(matrix_values >= threshold)
86
- edges = [(TM_score_matrix.index[i], TM_score_matrix.index[j])
87
- for i, j in zip(rows, cols) if i != j]
88
- G.add_edges_from(edges)
89
- return G
90
-
91
- def fill_community_to_graph(G):
92
- partition = community_louvain.best_partition(G)
93
- nx.set_node_attributes(G, partition, 'cluster')
94
- return partition
95
-
96
-
97
- def get_graph_fig(G,partition):
98
- plt.figure(figsize=(12, 10))
99
- pos = nx.spring_layout(G)
100
- nx.draw_networkx_nodes(G, pos, node_size=50,
101
- cmap=plt.cm.tab20, node_color=list(partition.values()))
102
- nx.draw_networkx_edges(G, pos, alpha=0.3)
103
- plt.title("Structure Similarity Network")
104
- plt.axis('off')
105
- fig = plt.gcf()
106
- return fig
107
-
108
-
109
-
110
  def calculate_md5(files):
111
- """
112
- Calculate MD5 hash for a list of files.
113
- The hash is calculated by combining the content of all files in sorted order.
114
-
115
- Args:
116
- files: List of file objects from Gradio upload
117
-
118
- Returns:
119
- str: MD5 hash of the combined file contents
120
- """
121
  hash_md5 = hashlib.md5()
122
-
123
- # Sort files by name to ensure consistent hash regardless of upload order
124
  sorted_files = sorted(files, key=lambda x: x.name)
125
-
126
  for file in sorted_files:
127
  with open(file.name, "rb") as f:
128
  for chunk in iter(lambda: f.read(4096), b""):
129
  hash_md5.update(chunk)
130
-
131
  return hash_md5.hexdigest()
132
 
133
- def save_pdb_files(files, data_dir='./data'):
 
134
  """Save uploaded PDB files to the specified directory."""
135
  if not files:
136
  return "No files uploaded"
137
-
138
  # Create data directory if it doesn't exist
139
  data_path = Path(data_dir)
140
  data_path.mkdir(parents=True, exist_ok=True)
141
-
142
  # Calculate MD5 hash for all files
143
  md5_hash = calculate_md5(files)
144
-
145
- file_dir = os.path.join(data_path , md5_hash )
146
  # file_dir.mkdir(exist_ok=True)
147
  try:
148
  os.mkdir(file_dir)
149
- except:
150
  pass
151
- file_dir = os.path.join(data_path , md5_hash , "pdb")
152
  try:
153
  os.mkdir(file_dir)
154
- except:
155
  pass
156
  print(f"Created directory: {file_dir}")
157
-
158
  # Create list file
159
- list_file = os.path.join(data_path , md5_hash , "pdb_list")
160
 
161
  filenames = []
162
-
163
  results = []
164
  for file in files:
165
  # Get original filename
166
  original_filename = os.path.basename(file.name)
167
  filenames.append(original_filename)
168
  # Check if file already exists
169
- target_path = os.path.join(file_dir,original_filename )
170
  print(f"Saving to: {target_path}")
171
-
172
  # Save the file
173
  with open(target_path, "wb") as f:
174
  f.write(open(file.name, "rb").read())
175
  results.append(f"Saved {original_filename}")
176
-
177
  # Write list file
178
  with open(list_file, "w") as f:
179
  f.write("\n".join(filenames))
180
  results.append(f"Created list file: {list_file}")
181
-
182
  return "\n".join(results)
183
 
 
184
  def run_usalign(md5_hash):
185
  """Run USalign on the uploaded PDB files and return results as DataFrame."""
186
  try:
187
  runner = USalignRunner()
188
  data_path = Path("./data")
189
- pdb_dir = os.path.join(data_path , md5_hash , "pdb")
190
- list_file = os.path.join(data_path , md5_hash , "pdb_list")
191
  print(str(pdb_dir))
192
  print(str(list_file))
193
- return_code, stdout, stderr = runner.run_alignment(
194
- target_dir=str(pdb_dir),
195
- pdb_list_file=str(list_file)
196
- )
197
  print(stdout)
198
  print(stderr)
199
  if return_code == 0:
200
  # Handle potential encoding issues
201
  df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
202
-
203
  # Clean up any potential encoding artifacts in column names
204
  df.columns = [col.strip() for col in df.columns]
205
  return df
206
  else:
207
  return pd.DataFrame({"Error": [stderr]})
208
  except Exception as e:
209
- return pd.DataFrame({"Error": [stderr]})
 
210
 
211
- def run_community_analysis(results_df, data_dir, md5_hash,threshold):
212
  """Run community analysis pipeline and return results."""
213
  try:
214
  # Generate TM matrix
215
  tm_matrix = get_TM_mat_from_df(results_df)
216
 
217
- tm_file = os.path.join("data",md5_hash,"tm_matrix.csv")
218
- newick_file = os.path.join("data",md5_hash,"clustering.newick")
219
  # network_file = os.path.join("data",md5_hash,"network.svg")
220
- network_edges_file = os.path.join("data",md5_hash,"network_cytoscape_export.xlsx")
221
- cluster_file = os.path.join("data",md5_hash,"cluster_assignments.csv")
222
 
223
  with localconverter(ro.default_converter + pandas2ri.converter):
224
  r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
@@ -226,8 +140,7 @@ def run_community_analysis(results_df, data_dir, md5_hash,threshold):
226
  result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
227
  newick_str = result[0]
228
 
229
- export_similarity_network_r(threshold, r_tm_matrix,network_edges_file, cluster_file)
230
-
231
 
232
  # cluster_df.to_csv(cluster_file,index=False)
233
  # combined_df.to_csv(network_edges_file,index=False)
@@ -237,44 +150,19 @@ def run_community_analysis(results_df, data_dir, md5_hash,threshold):
237
  # Phylo.write(tree, newick_file, "newick")
238
  # fig.savefig(network_file, format="svg", bbox_inches="tight")
239
  # plt.close(fig)
240
-
241
  return {
242
  "tm_matrix": tm_matrix,
243
  "newick_str": newick_str,
244
  # "network_fig": fig,
245
- "files":[
246
  tm_file,
247
  newick_file,
248
  # network_file,
249
  network_edges_file,
250
- cluster_file
251
- ]
252
  }
253
  except Exception as e:
254
  print("Error", str(e))
255
  return {"Error": str(e)}
256
-
257
-
258
-
259
- def get_dataframe_from_network(G,partition):
260
- edges_data = [list(edge) for edge in G.edges()]
261
- edges_df = pd.DataFrame(edges_data, columns=["Source", "Target"])
262
- cluster_membership = {}
263
- for idx, comm in enumerate(partition):
264
- for node in comm:
265
- cluster_membership[node] = f"cluster_{idx+1}"
266
-
267
- singleton_nodes = [n for n in G.nodes if G.degree[n] == 0]
268
- for node in singleton_nodes:
269
- cluster_membership[node] = "singleton"
270
-
271
- # 创建孤立节点的数据
272
- singleton_data = [[node, ""] for node in singleton_nodes]
273
- singleton_df = pd.DataFrame(singleton_data, columns=["Source", "Target"])
274
-
275
- # 合并数据
276
- combined_df = pd.concat([edges_df, singleton_df], ignore_index=True)
277
- return combined_df
278
-
279
- # # 导出为 CSV 文件
280
- # combined_df.to_csv("structure_based_similarity_network_cytoscape_export.csv", index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import hashlib
2
+ import os
3
+ import sys
4
  from io import StringIO
5
+ from pathlib import Path
6
+
7
  import numpy as np
8
+ import pandas as pd
 
 
 
9
  import rpy2.robjects as ro
10
+ from rpy2.robjects import pandas2ri
11
+ from rpy2.robjects.conversion import localconverter
12
+
13
+ from r_functions import export_matrix_to_newick_r, export_similarity_network_r
14
+ from usalign_runner import USalignRunner
15
 
 
16
 
17
def get_TM_mat_from_df(df):
    """Build a square TM-score matrix from a pairwise alignment table.

    Args:
        df: DataFrame with the columns ``#PDBchain1``, ``PDBchain2``,
            ``TM1`` and ``TM2`` (one row per aligned pair).

    Returns:
        pandas.DataFrame: an ``n x n`` frame whose index and columns are the
        sorted, cleaned chain labels. The diagonal starts at 1.0; for every
        input row, ``TM1`` is stored at ``[chain2, chain1]`` and ``TM2`` at
        ``[chain1, chain2]``. Later rows overwrite earlier ones.
    """
    first_chains = df["#PDBchain1"]
    second_chains = df["PDBchain2"]

    # Every chain seen on either side of a pair gets one row/column.
    labels = sorted(set(first_chains.unique()) | set(second_chains.unique()))
    slot_of = {label: position for position, label in enumerate(labels)}

    scores = np.eye(len(labels))
    for left, right, tm1, tm2 in zip(first_chains, second_chains, df["TM1"], df["TM2"]):
        row_slot = slot_of.get(left)
        col_slot = slot_of.get(right)
        if row_slot is None or col_slot is None:
            continue
        # Same write order as before: TM1 first, then TM2.
        scores[col_slot, row_slot] = tm1
        scores[row_slot, col_slot] = tm2

    # Strip the path separator and the ".pdb:A" suffix from the chain labels.
    clean_labels = [label.replace("/", "").replace(".pdb:A", "") for label in labels]
    return pd.DataFrame(data=scores, index=clean_labels, columns=clean_labels)
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
def calculate_md5(files):
    """Return the MD5 hex digest of the concatenated contents of *files*.

    The files are hashed in order of their path so that the digest does not
    depend on the order in which they were supplied.

    Args:
        files: Iterable of uploaded files. Each item may be an object with a
            ``name`` attribute holding the file path, or a plain path
            (``str`` / ``os.PathLike``).

    Returns:
        str: Hex digest of all file contents, hashed back to back. An empty
        iterable yields the digest of empty input.

    Raises:
        OSError: If one of the files cannot be opened or read.
    """

    def _path_of(item):
        # Check for real paths first: pathlib.Path also has a ``.name``
        # attribute, but it is only the basename, not the full path.
        if isinstance(item, (str, os.PathLike)):
            return os.fspath(item)
        return item.name

    digest = hashlib.md5()
    for path in sorted(_path_of(item) for item in files):
        with open(path, "rb") as handle:
            # Read in fixed-size chunks so large files are never fully in memory.
            while True:
                chunk = handle.read(4096)
                if not chunk:
                    break
                digest.update(chunk)

    return digest.hexdigest()
46
 
47
+
48
def save_pdb_files(files, data_dir="./data"):
    """Save uploaded PDB files under ``<data_dir>/<md5>/pdb`` and write a list file.

    The target folder is named after the MD5 digest returned by
    ``calculate_md5(files)``. Next to the ``pdb`` folder, a ``pdb_list`` file
    is written with one original file name per line.

    Args:
        files: Uploaded file objects; each must expose the source path as
            ``.name``. A falsy value (``None`` or an empty list) means that
            nothing was uploaded.
        data_dir: Root directory for stored uploads.

    Returns:
        str: ``"No files uploaded"`` when *files* is falsy, otherwise a
        newline-joined log with one line per saved file plus the list file.

    Raises:
        OSError: If a directory cannot be created or a file cannot be copied.
            These failures used to be swallowed at directory creation and
            only surfaced later as a less helpful error.
    """
    if not files:
        return "No files uploaded"

    data_path = Path(data_dir)
    md5_hash = calculate_md5(files)

    # One call creates data_dir, the per-upload folder and its "pdb" subfolder;
    # exist_ok makes re-uploading the same file set a no-op.
    file_dir = os.path.join(data_path, md5_hash, "pdb")
    os.makedirs(file_dir, exist_ok=True)
    print(f"Created directory: {file_dir}")

    list_file = os.path.join(data_path, md5_hash, "pdb_list")

    filenames = []
    results = []
    for file in files:
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)

        target_path = os.path.join(file_dir, original_filename)
        print(f"Saving to: {target_path}")

        # read_bytes/write_bytes open and close both files themselves, so no
        # handle is leaked, and the source is fully read before the target is
        # opened for writing.
        Path(target_path).write_bytes(Path(file.name).read_bytes())
        results.append(f"Saved {original_filename}")

    # Fixed encoding so the list file does not depend on the host locale.
    with open(list_file, "w", encoding="utf-8") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")

    return "\n".join(results)
98
 
99
+
100
def run_usalign(md5_hash):
    """Run USalign on the PDB set stored under ``./data/<md5_hash>``.

    Args:
        md5_hash: Name of the upload folder inside ``./data``; it is expected
            to contain a ``pdb`` directory and a ``pdb_list`` file.

    Returns:
        pandas.DataFrame: The tab-separated USalign output with stripped
        column names when the process exits with code 0. Otherwise a frame
        with a single ``"Error"`` column holding the process stderr, or the
        exception message (followed by stderr, if any was captured) when an
        exception occurred.
    """
    # Bound before the try block so the except handler can always read it,
    # even if the failure happens before USalign has produced any output.
    stderr = ""
    try:
        runner = USalignRunner()
        data_path = Path("./data")
        pdb_dir = os.path.join(data_path, md5_hash, "pdb")
        list_file = os.path.join(data_path, md5_hash, "pdb_list")
        print(str(pdb_dir))
        print(str(list_file))
        return_code, stdout, stderr = runner.run_alignment(target_dir=str(pdb_dir), pdb_list_file=str(list_file))
        print(stdout)
        print(stderr)
        if return_code != 0:
            return pd.DataFrame({"Error": [stderr]})

        # stdout is already decoded text, so no encoding argument is needed.
        df = pd.read_csv(StringIO(stdout), sep="\t")

        # Strip stray whitespace from the header cells.
        df.columns = [col.strip() for col in df.columns]
        return df
    except Exception as e:
        # Report the failure as text; include stderr only when there is some.
        details = [str(e)]
        if stderr:
            details.append(stderr)
        return pd.DataFrame({"Error": details})
123
+
124
 
125
+ def run_community_analysis(results_df, data_dir, md5_hash, threshold):
126
  """Run community analysis pipeline and return results."""
127
  try:
128
  # Generate TM matrix
129
  tm_matrix = get_TM_mat_from_df(results_df)
130
 
131
+ tm_file = os.path.join("data", md5_hash, "tm_matrix.csv")
132
+ newick_file = os.path.join("data", md5_hash, "clustering.newick")
133
  # network_file = os.path.join("data",md5_hash,"network.svg")
134
+ network_edges_file = os.path.join("data", md5_hash, "network_cytoscape_export.xlsx")
135
+ # cluster_file = os.path.join("data", md5_hash, "cluster_assignments.csv")
136
 
137
  with localconverter(ro.default_converter + pandas2ri.converter):
138
  r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
 
140
  result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
141
  newick_str = result[0]
142
 
143
+ export_similarity_network_r(threshold, r_tm_matrix, network_edges_file)
 
144
 
145
  # cluster_df.to_csv(cluster_file,index=False)
146
  # combined_df.to_csv(network_edges_file,index=False)
 
150
  # Phylo.write(tree, newick_file, "newick")
151
  # fig.savefig(network_file, format="svg", bbox_inches="tight")
152
  # plt.close(fig)
153
+
154
  return {
155
  "tm_matrix": tm_matrix,
156
  "newick_str": newick_str,
157
  # "network_fig": fig,
158
+ "files": [
159
  tm_file,
160
  newick_file,
161
  # network_file,
162
  network_edges_file,
163
+ # cluster_file,
164
+ ],
165
  }
166
  except Exception as e:
167
  print("Error", str(e))
168
  return {"Error": str(e)}