Files changed (5) hide show
  1. .gitignore +0 -2
  2. app.py +8 -2
  3. r_functions.py +49 -19
  4. usalign_runner.py +31 -27
  5. utils.py +164 -52
.gitignore DELETED
@@ -1,2 +0,0 @@
1
- /data
2
- /__pycache__
 
 
 
app.py CHANGED
@@ -1,14 +1,21 @@
 
1
  import os
 
 
 
2
 
3
  import gradio as gr
 
 
4
 
 
5
  from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
6
 
7
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
8
 
9
  # Create Gradio interface
10
  with gr.Blocks() as demo:
11
- gr.Markdown("# This is a Temp Title")
12
 
13
  with gr.Row():
14
  file_input = gr.File(
@@ -86,7 +93,6 @@ with gr.Blocks() as demo:
86
  outputs=[
87
  tm_matrix_output,
88
  newick_output,
89
- # network_plot,
90
  download_tm,
91
  ],
92
  )
 
1
+ import hashlib
2
  import os
3
+ import sys
4
+ from io import StringIO
5
+ from pathlib import Path
6
 
7
  import gradio as gr
8
+ import matplotlib.pyplot as plt
9
+ import pandas as pd
10
 
11
+ from usalign_runner import USalignRunner
12
  from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
13
 
14
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
15
 
16
  # Create Gradio interface
17
  with gr.Blocks() as demo:
18
+ gr.Markdown("# Structure-Based Similarity Network")
19
 
20
  with gr.Row():
21
  file_input = gr.File(
 
93
  outputs=[
94
  tm_matrix_output,
95
  newick_output,
 
96
  download_tm,
97
  ],
98
  )
r_functions.py CHANGED
@@ -1,16 +1,21 @@
1
- import rpy2.robjects as ro
2
- from rpy2.robjects import pandas2ri
3
- from rpy2.robjects.conversion import localconverter
4
  from rpy2.robjects.packages import importr
 
 
 
 
 
5
 
6
  pandas2ri.activate()
7
 
8
  # 导入必要的 R 包
9
- stats = importr("stats")
10
- ape = importr("ape")
11
- igraph = importr("igraph", robject_translations={".env": "_env_"})
12
- openxlsx = importr("openxlsx")
13
-
14
 
15
  def get_r_matrix(df):
16
  with localconverter(ro.default_converter + pandas2ri.converter):
@@ -18,31 +23,34 @@ def get_r_matrix(df):
18
  return r_tm_matrix
19
 
20
 
21
- export_matrix_to_newick_r = ro.r(
22
- """
23
  convert_to_newick <- function(tm_matrix, output_file) {
24
  # 导入 ape 包
25
  if (!require(ape, quietly = TRUE)) {
26
  install.packages("ape", repos = "https://cran.r-project.org")
27
  library(ape)
28
  }
 
29
  # 计算距离矩阵
30
  dist_matrix <- dist(tm_matrix)
 
31
  # 层次聚类
32
  hclust_tree <- hclust(dist_matrix, method = "ward.D2")
 
33
  # 转为 phylo 对象
34
  phylo_tree <- as.phylo(hclust_tree)
 
35
  # 导出为 Newick 格式
36
  write.tree(phylo_tree, file = output_file)
 
37
  newick_str <- write.tree(phylo_tree)
 
38
  return(newick_str)
39
  }
40
- """
41
- )
42
 
43
- export_similarity_network_r = ro.r(
44
- """
45
- create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
46
  # 导入必要的包
47
  if (!require(igraph, quietly = TRUE)) {
48
  install.packages("igraph", repos = "https://cran.r-project.org")
@@ -52,53 +60,74 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
52
  install.packages("openxlsx", repos = "https://cran.r-project.org")
53
  library(openxlsx)
54
  }
 
55
  # 根据相似性阈值创建边缘列表,并过滤掉自环
56
  overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
57
  overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
 
58
  # 创建空的图形对象
59
  graph <- graph.empty()
 
60
  # 添加节点
61
  nodes <- rownames(tm_matrix)
62
  graph <- add_vertices(graph, nv = length(nodes), name = nodes)
 
63
  # 添加边
64
  for (i in 1:nrow(overthresholdedges)) {
65
  graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
66
  }
 
67
  # 转换为无向图
68
  graph <- as.undirected(graph, mode = "collapse")
 
69
  # 计算聚类
70
  clusters <- fastgreedy.community(graph)
 
71
  # 获取每个聚类的大小
72
  cluster_sizes <- sizes(clusters)
 
73
  # 按聚类大小降序排序
74
  sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
 
75
  # 获取每个聚类的成员
76
  cluster_members <- membership(clusters)
 
77
  # 找到孤立节点
78
  singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
 
79
  # 创建Cytoscape导出文件
80
  cytoscape_export <- createWorkbook()
 
81
  # 创建边Sheet
82
  addWorksheet(cytoscape_export, sheetName = "Edges")
83
  writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
84
  writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
 
85
  # 获取边列表
86
  edges <- get.edgelist(graph)
 
87
  # 填充边Sheet数据
88
  if (nrow(edges) > 0) {
89
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
90
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
91
- }
 
92
  # 找到当前边Sheet的最后一行
93
  last_edge_row <- nrow(edges) + 1
 
94
  # 添加孤立节点
95
  if (length(singleton_nodes) > 0) {
96
  writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
97
  }
 
98
  # 保存Excel文件
99
  saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
 
 
 
100
  # 创建一个空的数据框用于储存节点和聚类信息
101
  export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
 
102
  # 遍历 sorted_clusters
103
  cluster_index <- 1 # 初始化簇索引
104
  for (cluster_name in names(sorted_clusters)) {
@@ -114,9 +143,10 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
114
  export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
115
  }
116
  cluster_index <- cluster_index + 1 # 索引加1
117
- }
 
 
118
  # 返回聚类结果
119
  return(list(cluster_data = export_clusters, graph = graph))
120
  }
121
- """
122
- )
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from rpy2.robjects import pandas2ri, r, Formula
4
  from rpy2.robjects.packages import importr
5
+ from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
6
+ from rpy2.robjects.conversion import localconverter
7
+ import rpy2.robjects as ro
8
+ import os
9
+
10
 
11
  pandas2ri.activate()
12
 
13
  # 导入必要的 R 包
14
+ stats = importr('stats')
15
+ ape = importr('ape')
16
+ igraph = importr('igraph', robject_translations={'.env': '_env_'})
17
+ openxlsx = importr('openxlsx')
18
+ # dplyr = importr('dplyr')
19
 
20
  def get_r_matrix(df):
21
  with localconverter(ro.default_converter + pandas2ri.converter):
 
23
  return r_tm_matrix
24
 
25
 
26
+ export_matrix_to_newick_r = ro.r("""
 
27
  convert_to_newick <- function(tm_matrix, output_file) {
28
  # 导入 ape 包
29
  if (!require(ape, quietly = TRUE)) {
30
  install.packages("ape", repos = "https://cran.r-project.org")
31
  library(ape)
32
  }
33
+
34
  # 计算距离矩阵
35
  dist_matrix <- dist(tm_matrix)
36
+
37
  # 层次聚类
38
  hclust_tree <- hclust(dist_matrix, method = "ward.D2")
39
+
40
  # 转为 phylo 对象
41
  phylo_tree <- as.phylo(hclust_tree)
42
+
43
  # 导出为 Newick 格式
44
  write.tree(phylo_tree, file = output_file)
45
+
46
  newick_str <- write.tree(phylo_tree)
47
+
48
  return(newick_str)
49
  }
50
+ """)
 
51
 
52
+ export_similarity_network_r = ro.r("""
53
+ create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_path) {
 
54
  # 导入必要的包
55
  if (!require(igraph, quietly = TRUE)) {
56
  install.packages("igraph", repos = "https://cran.r-project.org")
 
60
  install.packages("openxlsx", repos = "https://cran.r-project.org")
61
  library(openxlsx)
62
  }
63
+
64
  # 根据相似性阈值创建边缘列表,并过滤掉自环
65
  overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
66
  overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
67
+
68
  # 创建空的图形对象
69
  graph <- graph.empty()
70
+
71
  # 添加节点
72
  nodes <- rownames(tm_matrix)
73
  graph <- add_vertices(graph, nv = length(nodes), name = nodes)
74
+
75
  # 添加边
76
  for (i in 1:nrow(overthresholdedges)) {
77
  graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
78
  }
79
+
80
  # 转换为无向图
81
  graph <- as.undirected(graph, mode = "collapse")
82
+
83
  # 计算聚类
84
  clusters <- fastgreedy.community(graph)
85
+
86
  # 获取每个聚类的大小
87
  cluster_sizes <- sizes(clusters)
88
+
89
  # 按聚类大小降序排序
90
  sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
91
+
92
  # 获取每个聚类的成员
93
  cluster_members <- membership(clusters)
94
+
95
  # 找到孤立节点
96
  singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
97
+
98
  # 创建Cytoscape导出文件
99
  cytoscape_export <- createWorkbook()
100
+
101
  # 创建边Sheet
102
  addWorksheet(cytoscape_export, sheetName = "Edges")
103
  writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
104
  writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
105
+
106
  # 获取边列表
107
  edges <- get.edgelist(graph)
108
+
109
  # 填充边Sheet数据
110
  if (nrow(edges) > 0) {
111
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
112
  writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
113
+ }
114
+
115
  # 找到当前边Sheet的最后一行
116
  last_edge_row <- nrow(edges) + 1
117
+
118
  # 添加孤立节点
119
  if (length(singleton_nodes) > 0) {
120
  writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
121
  }
122
+
123
  # 保存Excel文件
124
  saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
125
+
126
+ saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)
127
+
128
  # 创建一个空的数据框用于储存节点和聚类信息
129
  export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
130
+
131
  # 遍历 sorted_clusters
132
  cluster_index <- 1 # 初始化簇索引
133
  for (cluster_name in names(sorted_clusters)) {
 
143
  export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
144
  }
145
  cluster_index <- cluster_index + 1 # 索引加1
146
+ }
147
+
148
+ write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
149
  # 返回聚类结果
150
  return(list(cluster_data = export_clusters, graph = graph))
151
  }
152
+ """)
 
usalign_runner.py CHANGED
@@ -1,27 +1,29 @@
1
  import subprocess
 
 
2
  from pathlib import Path
3
- from typing import Optional
4
-
5
  import yaml
6
 
7
-
8
  class USalignRunner:
9
  def __init__(self, config_path: str = "config.yaml"):
10
  """
11
  Initialize USalignRunner with parameters from config file.
12
-
13
  Args:
14
  config_path (str): Path to the configuration file
15
  """
16
- with open(config_path, "r", encoding="utf-8") as f:
17
  config = yaml.safe_load(f)
18
-
19
- self.usalign_path = Path(config["USalign"]["path"])
20
  self.default_params = {
21
- "tmscore": config["USalign"]["tmscore"],
22
- "outfmt": config["USalign"]["outfmt"],
23
- "mol": "protein", # Default to protein alignment
24
  }
 
 
 
25
 
26
  def run_alignment(
27
  self,
@@ -30,35 +32,37 @@ class USalignRunner:
30
  tmscore: Optional[float] = None,
31
  outfmt: Optional[int] = None,
32
  ) -> tuple[int, str, str]:
33
- tmscore = tmscore if tmscore is not None else self.default_params["tmscore"]
34
- outfmt = outfmt if outfmt is not None else self.default_params["outfmt"]
35
-
36
  # Create the command
37
  cmd = [
38
  str(self.usalign_path),
39
- "-mol",
40
- self.default_params["mol"],
41
- "-dir",
42
- str(target_dir),
43
  pdb_list_file,
44
- "-TMscore",
45
- str(tmscore),
46
- "-outfmt",
47
- str(outfmt),
48
  ]
49
  print(cmd)
50
-
51
  # Convert command list to string
52
  cmd_str = " ".join(cmd)
53
-
54
  try:
55
  # Execute the command
56
- process = subprocess.Popen(cmd_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True)
57
-
 
 
 
 
 
 
58
  # Get output
59
  stdout, stderr = process.communicate()
60
-
61
  return process.returncode, stdout, stderr
62
-
63
  except Exception as e:
64
  return -1, "", str(e)
 
1
  import subprocess
2
+ import os
3
+ from typing import List, Optional
4
  from pathlib import Path
 
 
5
  import yaml
6
 
 
7
  class USalignRunner:
8
  def __init__(self, config_path: str = "config.yaml"):
9
  """
10
  Initialize USalignRunner with parameters from config file.
11
+
12
  Args:
13
  config_path (str): Path to the configuration file
14
  """
15
+ with open(config_path, 'r',encoding="utf-8") as f:
16
  config = yaml.safe_load(f)
17
+
18
+ self.usalign_path = Path(config['USalign']['path'])
19
  self.default_params = {
20
+ 'tmscore': config['USalign']['tmscore'],
21
+ 'outfmt': config['USalign']['outfmt'],
22
+ 'mol': 'protein' # Default to protein alignment
23
  }
24
+
25
+ if not self.usalign_path.exists():
26
+ raise FileNotFoundError(f"USalign executable not found at {self.usalign_path}")
27
 
28
  def run_alignment(
29
  self,
 
32
  tmscore: Optional[float] = None,
33
  outfmt: Optional[int] = None,
34
  ) -> tuple[int, str, str]:
35
+ tmscore = tmscore if tmscore is not None else self.default_params['tmscore']
36
+ outfmt = outfmt if outfmt is not None else self.default_params['outfmt']
37
+
38
  # Create the command
39
  cmd = [
40
  str(self.usalign_path),
41
+ "-mol", self.default_params['mol'],
42
+ "-dir", str(target_dir),
 
 
43
  pdb_list_file,
44
+ "-TMscore", str(tmscore),
45
+ "-outfmt", str(outfmt)
 
 
46
  ]
47
  print(cmd)
48
+
49
  # Convert command list to string
50
  cmd_str = " ".join(cmd)
51
+
52
  try:
53
  # Execute the command
54
+ process = subprocess.Popen(
55
+ cmd_str,
56
+ stdout=subprocess.PIPE,
57
+ stderr=subprocess.PIPE,
58
+ shell=True,
59
+ text=True
60
+ )
61
+
62
  # Get output
63
  stdout, stderr = process.communicate()
64
+
65
  return process.returncode, stdout, stderr
66
+
67
  except Exception as e:
68
  return -1, "", str(e)
utils.py CHANGED
@@ -1,138 +1,224 @@
1
- import hashlib
2
- import os
 
 
 
 
 
 
 
 
 
3
  import sys
4
- from io import StringIO
 
 
5
  from pathlib import Path
6
-
7
- import numpy as np
8
  import pandas as pd
9
- import rpy2.robjects as ro
10
- from rpy2.robjects import pandas2ri
11
- from rpy2.robjects.conversion import localconverter
12
-
13
- from r_functions import export_matrix_to_newick_r, export_similarity_network_r
14
  from usalign_runner import USalignRunner
 
 
 
 
 
 
 
 
15
 
 
16
 
17
  def get_TM_mat_from_df(df):
18
- unique_chains = sorted(set(df["#PDBchain1"].unique()).union(set(df["PDBchain2"].unique())))
 
 
19
  chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
20
  n = len(unique_chains)
21
  matrix = np.eye(n)
22
  for _, row in df.iterrows():
23
- chain1 = row["#PDBchain1"]
24
- chain2 = row["PDBchain2"]
25
  if chain1 in chain_to_idx and chain2 in chain_to_idx:
26
  i = chain_to_idx[chain1]
27
  j = chain_to_idx[chain2]
28
- matrix[j, i] = row["TM1"]
29
- matrix[i, j] = row["TM2"]
30
 
31
- columns_names = [chain.replace("/", "").replace(".pdb:A", "") for chain in unique_chains]
32
- df = pd.DataFrame(np.array(matrix), columns=columns_names, index=columns_names)
 
 
33
  return df
34
 
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  def calculate_md5(files):
 
 
 
 
 
 
 
 
 
 
37
  hash_md5 = hashlib.md5()
 
 
38
  sorted_files = sorted(files, key=lambda x: x.name)
39
-
40
  for file in sorted_files:
41
  with open(file.name, "rb") as f:
42
  for chunk in iter(lambda: f.read(4096), b""):
43
  hash_md5.update(chunk)
44
-
45
  return hash_md5.hexdigest()
46
 
47
-
48
- def save_pdb_files(files, data_dir="./data"):
49
  """Save uploaded PDB files to the specified directory."""
50
  if not files:
51
  return "No files uploaded"
52
-
53
  # Create data directory if it doesn't exist
54
  data_path = Path(data_dir)
55
  data_path.mkdir(parents=True, exist_ok=True)
56
-
57
  # Calculate MD5 hash for all files
58
  md5_hash = calculate_md5(files)
59
-
60
- file_dir = os.path.join(data_path, md5_hash)
61
  # file_dir.mkdir(exist_ok=True)
62
  try:
63
  os.mkdir(file_dir)
64
- except Exception:
65
  pass
66
- file_dir = os.path.join(data_path, md5_hash, "pdb")
67
  try:
68
  os.mkdir(file_dir)
69
- except Exception:
70
  pass
71
  print(f"Created directory: {file_dir}")
72
-
73
  # Create list file
74
- list_file = os.path.join(data_path, md5_hash, "pdb_list")
75
 
76
  filenames = []
77
-
78
  results = []
79
  for file in files:
80
  # Get original filename
81
  original_filename = os.path.basename(file.name)
82
  filenames.append(original_filename)
83
  # Check if file already exists
84
- target_path = os.path.join(file_dir, original_filename)
85
  print(f"Saving to: {target_path}")
86
-
87
  # Save the file
88
  with open(target_path, "wb") as f:
89
  f.write(open(file.name, "rb").read())
90
  results.append(f"Saved {original_filename}")
91
-
92
  # Write list file
93
  with open(list_file, "w") as f:
94
  f.write("\n".join(filenames))
95
  results.append(f"Created list file: {list_file}")
96
-
97
  return "\n".join(results)
98
 
99
-
100
  def run_usalign(md5_hash):
101
  """Run USalign on the uploaded PDB files and return results as DataFrame."""
102
  try:
103
  runner = USalignRunner()
104
  data_path = Path("./data")
105
- pdb_dir = os.path.join(data_path, md5_hash, "pdb")
106
- list_file = os.path.join(data_path, md5_hash, "pdb_list")
107
  print(str(pdb_dir))
108
  print(str(list_file))
109
- return_code, stdout, stderr = runner.run_alignment(target_dir=str(pdb_dir), pdb_list_file=str(list_file))
 
 
 
110
  print(stdout)
111
  print(stderr)
112
  if return_code == 0:
113
  # Handle potential encoding issues
114
  df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
115
-
116
  # Clean up any potential encoding artifacts in column names
117
  df.columns = [col.strip() for col in df.columns]
118
  return df
119
  else:
120
  return pd.DataFrame({"Error": [stderr]})
121
  except Exception as e:
122
- return pd.DataFrame({"Error": [e, stderr]})
123
-
124
 
125
- def run_community_analysis(results_df, data_dir, md5_hash, threshold):
126
  """Run community analysis pipeline and return results."""
127
  try:
128
  # Generate TM matrix
129
  tm_matrix = get_TM_mat_from_df(results_df)
130
 
131
- tm_file = os.path.join("data", md5_hash, "tm_matrix.csv")
132
- newick_file = os.path.join("data", md5_hash, "clustering.newick")
133
  # network_file = os.path.join("data",md5_hash,"network.svg")
134
- network_edges_file = os.path.join("data", md5_hash, "network_cytoscape_export.xlsx")
135
- # cluster_file = os.path.join("data", md5_hash, "cluster_assignments.csv")
136
 
137
  with localconverter(ro.default_converter + pandas2ri.converter):
138
  r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
@@ -140,7 +226,8 @@ def run_community_analysis(results_df, data_dir, md5_hash, threshold):
140
  result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
141
  newick_str = result[0]
142
 
143
- export_similarity_network_r(threshold, r_tm_matrix, network_edges_file)
 
144
 
145
  # cluster_df.to_csv(cluster_file,index=False)
146
  # combined_df.to_csv(network_edges_file,index=False)
@@ -150,19 +237,44 @@ def run_community_analysis(results_df, data_dir, md5_hash, threshold):
150
  # Phylo.write(tree, newick_file, "newick")
151
  # fig.savefig(network_file, format="svg", bbox_inches="tight")
152
  # plt.close(fig)
153
-
154
  return {
155
  "tm_matrix": tm_matrix,
156
  "newick_str": newick_str,
157
  # "network_fig": fig,
158
- "files": [
159
  tm_file,
160
  newick_file,
161
  # network_file,
162
  network_edges_file,
163
- # cluster_file,
164
- ],
165
  }
166
  except Exception as e:
167
  print("Error", str(e))
168
  return {"Error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ # import fastcluster
4
+ import networkx as nx
5
+ from community import community_louvain
6
+ from scipy.spatial.distance import pdist, squareform
7
+ from scipy.cluster.hierarchy import linkage, to_tree
8
+ from networkx.algorithms.community import greedy_modularity_communities
9
+ from Bio import Phylo
10
+ from Bio.Phylo.BaseTree import Tree, Clade
11
+ import matplotlib.pyplot as plt
12
  import sys
13
+ import gradio as gr
14
+ import os
15
+ import hashlib
16
  from pathlib import Path
 
 
17
  import pandas as pd
18
+ from io import StringIO
 
 
 
 
19
  from usalign_runner import USalignRunner
20
+ import pandas as pd
21
+ import numpy as np
22
+ from rpy2.robjects import pandas2ri, r, Formula
23
+ from rpy2.robjects.packages import importr
24
+ from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
25
+ from rpy2.robjects.conversion import localconverter
26
+ import rpy2.robjects as ro
27
+ import os
28
 
29
+ from r_functions import get_r_matrix,export_matrix_to_newick_r,export_similarity_network_r
30
 
31
def get_TM_mat_from_df(df):
    """Build a square TM-score matrix from a pairwise alignment table.

    Args:
        df: DataFrame with the columns ``#PDBchain1``, ``PDBchain2``,
            ``TM1`` and ``TM2``, one row per aligned pair.

    Returns:
        pandas.DataFrame: an ``n x n`` matrix over the sorted union of the
        chain labels found in both chain columns. The diagonal is 1.0,
        pairs that never appear stay 0.0, and for every input row ``TM1``
        is stored at ``[chain2, chain1]`` and ``TM2`` at
        ``[chain1, chain2]``. Row and column labels are the chain labels
        with every "/" and ".pdb:A" removed.
    """
    first_chains = df['#PDBchain1']
    second_chains = df['PDBchain2']

    unique_chains = sorted(set(first_chains.unique()).union(second_chains.unique()))
    chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
    matrix = np.eye(len(unique_chains))

    # Walk the four columns in lockstep instead of building a Series per row.
    for chain1, chain2, tm1, tm2 in zip(first_chains, second_chains, df['TM1'], df['TM2']):
        # Defensive: skip rows whose labels are missing from the index map.
        if chain1 in chain_to_idx and chain2 in chain_to_idx:
            i = chain_to_idx[chain1]
            j = chain_to_idx[chain2]
            matrix[j, i] = tm1
            matrix[i, j] = tm2

    columns_names = [chain.replace("/", "").replace(".pdb:A", "") for chain in unique_chains]
    return pd.DataFrame(matrix, columns=columns_names, index=columns_names)
52
 
53
 
54
+ # def get_cluster_z_from_df(df):
55
+ # dist_matrix = pdist(df, metric='euclidean')
56
+ # Z = fastcluster.linkage(dist_matrix, method='ward')
57
+ # return Z
58
+
59
def scipy_to_biopython(Z, labels):
    """Convert a SciPy linkage matrix into a Bio.Phylo tree.

    Args:
        Z: Linkage matrix accepted by ``scipy.cluster.hierarchy.to_tree``.
        labels: Sequence indexed by leaf id that supplies the leaf names.

    Returns:
        Bio.Phylo.BaseTree.Tree: tree whose clades mirror the cluster
        hierarchy, each clade using the node's ``dist`` as branch length.
    """

    def as_clade(cluster_node):
        # Internal nodes recurse into both children (left first);
        # leaves become named clades.
        if not cluster_node.is_leaf():
            children = [as_clade(cluster_node.left), as_clade(cluster_node.right)]
            return Clade(branch_length=cluster_node.dist, clades=children)
        return Clade(branch_length=cluster_node.dist, name=labels[cluster_node.id])

    hierarchy_root = to_tree(Z, rd=False)
    return Tree(as_clade(hierarchy_root))
73
+
74
def write_str_to_file(s: str, file_path: str):
    """Write the string ``s`` to ``file_path`` as UTF-8, replacing any existing content."""
    with open(file_path, mode='w', encoding="utf8") as out_file:
        out_file.write(s)
77
+
78
+
79
def build_graph_from_mat_df(TM_score_matrix, threshold=0.75):
    """Build an undirected similarity graph from a labelled score matrix.

    Args:
        TM_score_matrix: Square DataFrame whose index supplies the node names.
        threshold: Minimum score (inclusive) for two nodes to be connected.

    Returns:
        networkx.Graph: one node per index label and one edge for every
        off-diagonal cell whose value is at least ``threshold``.
    """
    node_labels = TM_score_matrix.index
    scores = TM_score_matrix.values

    G = nx.Graph()
    G.add_nodes_from(node_labels)

    row_ids, col_ids = np.where(scores >= threshold)
    for r, c in zip(row_ids, col_ids):
        # Diagonal hits would be self-loops; leave them out.
        if r == c:
            continue
        G.add_edge(node_labels[r], node_labels[c])
    return G
90
+
91
def fill_community_to_graph(G):
    """Compute a Louvain partition of ``G`` and store it on the nodes.

    The result of ``community_louvain.best_partition(G)`` is written to
    each node as the ``cluster`` attribute and also returned.
    """
    node_to_cluster = community_louvain.best_partition(G)
    nx.set_node_attributes(G, node_to_cluster, 'cluster')
    return node_to_cluster
95
+
96
+
97
def get_graph_fig(G, partition):
    """Draw the similarity network coloured by cluster and return the figure.

    Args:
        G: networkx graph to draw.
        partition: Mapping from node to a numeric cluster id; it must
            contain every node of ``G``.

    Returns:
        The matplotlib figure holding the drawing.
    """
    fig = plt.figure(figsize=(12, 10))
    pos = nx.spring_layout(G)

    # Look colours up per node so they stay aligned with the drawn nodes
    # even when the partition mapping is ordered differently from the graph.
    nodelist = list(G.nodes())
    node_colors = [partition[node] for node in nodelist]

    nx.draw_networkx_nodes(
        G,
        pos,
        nodelist=nodelist,
        node_size=50,
        cmap=plt.cm.tab20,
        node_color=node_colors,
    )
    nx.draw_networkx_edges(G, pos, alpha=0.3)
    plt.title("Structure Similarity Network")
    plt.axis('off')
    return fig
107
+
108
+
109
+
110
def calculate_md5(files):
    """
    Compute one MD5 digest over the contents of several files.

    The files are processed in order of their ``name`` attribute, so the
    digest does not depend on the order in which they were supplied.

    Args:
        files: File objects exposing their path as ``.name``
            (e.g. Gradio uploads)

    Returns:
        str: Hex MD5 digest of the concatenated file contents
    """
    digest = hashlib.md5()

    for upload in sorted(files, key=lambda item: item.name):
        with open(upload.name, "rb") as stream:
            # Feed the file in 4 KiB blocks until read() returns b"".
            while True:
                block = stream.read(4096)
                if block == b"":
                    break
                digest.update(block)

    return digest.hexdigest()
132
 
133
def save_pdb_files(files, data_dir='./data'):
    """Save uploaded PDB files to the specified directory.

    The files are copied into ``<data_dir>/<md5>/pdb``, where ``<md5>`` is
    the value returned by ``calculate_md5(files)``, and their base names
    are written one per line to ``<data_dir>/<md5>/pdb_list``.

    Args:
        files: Uploaded file objects; each must expose its path as ``.name``.
        data_dir: Root directory under which uploads are stored.

    Returns:
        str: "No files uploaded" when ``files`` is empty; otherwise a
        newline-joined log with one "Saved ..." line per file followed by
        a "Created list file: ..." line.
    """
    if not files:
        return "No files uploaded"

    # Create data directory if it doesn't exist
    data_path = Path(data_dir)
    data_path.mkdir(parents=True, exist_ok=True)

    # Calculate MD5 hash for all files
    md5_hash = calculate_md5(files)

    # exist_ok tolerates re-uploads of the same file set, while real
    # failures (e.g. permissions) surface here instead of being swallowed.
    file_dir = os.path.join(data_path, md5_hash, "pdb")
    os.makedirs(file_dir, exist_ok=True)
    print(f"Created directory: {file_dir}")

    # Create list file
    list_file = os.path.join(data_path, md5_hash, "pdb_list")

    filenames = []
    results = []
    for file in files:
        # Get original filename
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)
        target_path = os.path.join(file_dir, original_filename)
        print(f"Saving to: {target_path}")

        # Read the source fully before opening the target, so both handles
        # are closed and a source that is already the target is not truncated.
        payload = Path(file.name).read_bytes()
        Path(target_path).write_bytes(payload)
        results.append(f"Saved {original_filename}")

    # Write list file
    with open(list_file, "w") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")

    return "\n".join(results)
183
 
 
184
def run_usalign(md5_hash):
    """Run USalign on the uploaded PDB files and return results as DataFrame.

    Args:
        md5_hash: Name of the upload directory under ``./data`` that holds
            the ``pdb`` folder and the ``pdb_list`` file.

    Returns:
        pandas.DataFrame: the tab-separated USalign stdout parsed into a
        frame with stripped column names when the exit code is 0.
        Otherwise a frame with a single ``Error`` column holding USalign's
        stderr, or the exception message if anything raised.
    """
    try:
        runner = USalignRunner()
        data_path = Path("./data")
        pdb_dir = os.path.join(data_path, md5_hash, "pdb")
        list_file = os.path.join(data_path, md5_hash, "pdb_list")
        print(str(pdb_dir))
        print(str(list_file))
        return_code, stdout, stderr = runner.run_alignment(
            target_dir=str(pdb_dir),
            pdb_list_file=str(list_file),
        )
        print(stdout)
        print(stderr)
        if return_code != 0:
            return pd.DataFrame({"Error": [stderr]})

        # Handle potential encoding issues
        df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())

        # Clean up any potential encoding artifacts in column names
        df.columns = [col.strip() for col in df.columns]
        return df
    except Exception as e:
        # stderr is unbound when the failure happens before run_alignment
        # returns (e.g. the runner cannot be constructed), so report the
        # exception itself.
        return pd.DataFrame({"Error": [str(e)]})
 
210
 
211
+ def run_community_analysis(results_df, data_dir, md5_hash,threshold):
212
  """Run community analysis pipeline and return results."""
213
  try:
214
  # Generate TM matrix
215
  tm_matrix = get_TM_mat_from_df(results_df)
216
 
217
+ tm_file = os.path.join("data",md5_hash,"tm_matrix.csv")
218
+ newick_file = os.path.join("data",md5_hash,"clustering.newick")
219
  # network_file = os.path.join("data",md5_hash,"network.svg")
220
+ network_edges_file = os.path.join("data",md5_hash,"network_cytoscape_export.xlsx")
221
+ cluster_file = os.path.join("data",md5_hash,"cluster_assignments.csv")
222
 
223
  with localconverter(ro.default_converter + pandas2ri.converter):
224
  r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
 
226
  result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
227
  newick_str = result[0]
228
 
229
+ export_similarity_network_r(threshold, r_tm_matrix,network_edges_file, cluster_file)
230
+
231
 
232
  # cluster_df.to_csv(cluster_file,index=False)
233
  # combined_df.to_csv(network_edges_file,index=False)
 
237
  # Phylo.write(tree, newick_file, "newick")
238
  # fig.savefig(network_file, format="svg", bbox_inches="tight")
239
  # plt.close(fig)
240
+
241
  return {
242
  "tm_matrix": tm_matrix,
243
  "newick_str": newick_str,
244
  # "network_fig": fig,
245
+ "files":[
246
  tm_file,
247
  newick_file,
248
  # network_file,
249
  network_edges_file,
250
+ cluster_file
251
+ ]
252
  }
253
  except Exception as e:
254
  print("Error", str(e))
255
  return {"Error": str(e)}
256
+
257
+
258
+
259
def get_dataframe_from_network(G, partition):
    """Export a graph as a Source/Target edge table that also lists isolated nodes.

    Args:
        G: Graph exposing ``edges()``, ``nodes`` and ``degree`` in the
            networkx style.
        partition: Unused; kept so existing callers keep working. The table
            depends only on ``G``.

    Returns:
        pandas.DataFrame: columns ``Source`` and ``Target`` with one row
        per edge, followed by one row per degree-0 node whose ``Target``
        is the empty string.
    """
    edges_data = [list(edge) for edge in G.edges()]
    edges_df = pd.DataFrame(edges_data, columns=["Source", "Target"])

    # Isolated nodes get a row of their own with an empty target.
    singleton_nodes = [n for n in G.nodes if G.degree[n] == 0]
    singleton_data = [[node, ""] for node in singleton_nodes]
    singleton_df = pd.DataFrame(singleton_data, columns=["Source", "Target"])

    # Edge rows first, isolated-node rows after.
    return pd.concat([edges_df, singleton_df], ignore_index=True)
278
+
279
+ # # 导出为 CSV 文件
280
+ # combined_df.to_csv("structure_based_similarity_network_cytoscape_export.csv", index=False)