Upload 6 files
Browse files
- .gitignore +2 -0
- app.py +2 -8
- r_functions.py +19 -49
- usalign_runner.py +27 -31
- utils.py +52 -164
.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/data
|
| 2 |
+
/__pycache__
|
app.py
CHANGED
|
@@ -1,21 +1,14 @@
|
|
| 1 |
-
import hashlib
|
| 2 |
import os
|
| 3 |
-
import sys
|
| 4 |
-
from io import StringIO
|
| 5 |
-
from pathlib import Path
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
-
import matplotlib.pyplot as plt
|
| 9 |
-
import pandas as pd
|
| 10 |
|
| 11 |
-
from usalign_runner import USalignRunner
|
| 12 |
from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
|
| 13 |
|
| 14 |
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
|
| 15 |
|
| 16 |
# Create Gradio interface
|
| 17 |
with gr.Blocks() as demo:
|
| 18 |
-
gr.Markdown("#
|
| 19 |
|
| 20 |
with gr.Row():
|
| 21 |
file_input = gr.File(
|
|
@@ -93,6 +86,7 @@ with gr.Blocks() as demo:
|
|
| 93 |
outputs=[
|
| 94 |
tm_matrix_output,
|
| 95 |
newick_output,
|
|
|
|
| 96 |
download_tm,
|
| 97 |
],
|
| 98 |
)
|
|
|
|
|
|
|
| 1 |
import os
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import gradio as gr
|
|
|
|
|
|
|
| 4 |
|
|
|
|
| 5 |
from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
|
| 6 |
|
| 7 |
os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
|
| 8 |
|
| 9 |
# Create Gradio interface
|
| 10 |
with gr.Blocks() as demo:
|
| 11 |
+
gr.Markdown("# This is a Temp Title")
|
| 12 |
|
| 13 |
with gr.Row():
|
| 14 |
file_input = gr.File(
|
|
|
|
| 86 |
outputs=[
|
| 87 |
tm_matrix_output,
|
| 88 |
newick_output,
|
| 89 |
+
# network_plot,
|
| 90 |
download_tm,
|
| 91 |
],
|
| 92 |
)
|
r_functions.py
CHANGED
|
@@ -1,21 +1,16 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
import numpy as np
|
| 3 |
-
from rpy2.robjects import pandas2ri, r, Formula
|
| 4 |
-
from rpy2.robjects.packages import importr
|
| 5 |
-
from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
|
| 6 |
-
from rpy2.robjects.conversion import localconverter
|
| 7 |
import rpy2.robjects as ro
|
| 8 |
-
import
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
pandas2ri.activate()
|
| 12 |
|
| 13 |
# 导入必要的 R 包
|
| 14 |
-
stats = importr(
|
| 15 |
-
ape = importr(
|
| 16 |
-
igraph = importr(
|
| 17 |
-
openxlsx = importr(
|
| 18 |
-
|
| 19 |
|
| 20 |
def get_r_matrix(df):
|
| 21 |
with localconverter(ro.default_converter + pandas2ri.converter):
|
|
@@ -23,34 +18,31 @@ def get_r_matrix(df):
|
|
| 23 |
return r_tm_matrix
|
| 24 |
|
| 25 |
|
| 26 |
-
export_matrix_to_newick_r = ro.r(
|
|
|
|
| 27 |
convert_to_newick <- function(tm_matrix, output_file) {
|
| 28 |
# 导入 ape 包
|
| 29 |
if (!require(ape, quietly = TRUE)) {
|
| 30 |
install.packages("ape", repos = "https://cran.r-project.org")
|
| 31 |
library(ape)
|
| 32 |
}
|
| 33 |
-
|
| 34 |
# 计算距离矩阵
|
| 35 |
dist_matrix <- dist(tm_matrix)
|
| 36 |
-
|
| 37 |
# 层次聚类
|
| 38 |
hclust_tree <- hclust(dist_matrix, method = "ward.D2")
|
| 39 |
-
|
| 40 |
# 转为 phylo 对象
|
| 41 |
phylo_tree <- as.phylo(hclust_tree)
|
| 42 |
-
|
| 43 |
# 导出为 Newick 格式
|
| 44 |
write.tree(phylo_tree, file = output_file)
|
| 45 |
-
|
| 46 |
newick_str <- write.tree(phylo_tree)
|
| 47 |
-
|
| 48 |
return(newick_str)
|
| 49 |
}
|
| 50 |
-
"""
|
|
|
|
| 51 |
|
| 52 |
-
export_similarity_network_r = ro.r(
|
| 53 |
-
|
|
|
|
| 54 |
# 导入必要的包
|
| 55 |
if (!require(igraph, quietly = TRUE)) {
|
| 56 |
install.packages("igraph", repos = "https://cran.r-project.org")
|
|
@@ -60,74 +52,53 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_pa
|
|
| 60 |
install.packages("openxlsx", repos = "https://cran.r-project.org")
|
| 61 |
library(openxlsx)
|
| 62 |
}
|
| 63 |
-
|
| 64 |
# 根据相似性阈值创建边缘列表,并过滤掉自环
|
| 65 |
overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
|
| 66 |
overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
|
| 67 |
-
|
| 68 |
# 创建空的图形对象
|
| 69 |
graph <- graph.empty()
|
| 70 |
-
|
| 71 |
# 添加节点
|
| 72 |
nodes <- rownames(tm_matrix)
|
| 73 |
graph <- add_vertices(graph, nv = length(nodes), name = nodes)
|
| 74 |
-
|
| 75 |
# 添加边
|
| 76 |
for (i in 1:nrow(overthresholdedges)) {
|
| 77 |
graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
|
| 78 |
}
|
| 79 |
-
|
| 80 |
# 转换为无向图
|
| 81 |
graph <- as.undirected(graph, mode = "collapse")
|
| 82 |
-
|
| 83 |
# 计算聚类
|
| 84 |
clusters <- fastgreedy.community(graph)
|
| 85 |
-
|
| 86 |
# 获取每个聚类的大小
|
| 87 |
cluster_sizes <- sizes(clusters)
|
| 88 |
-
|
| 89 |
# 按聚类大小降序排序
|
| 90 |
sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
|
| 91 |
-
|
| 92 |
# 获取每个聚类的成员
|
| 93 |
cluster_members <- membership(clusters)
|
| 94 |
-
|
| 95 |
# 找到孤立节点
|
| 96 |
singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
|
| 97 |
-
|
| 98 |
# 创建Cytoscape导出文件
|
| 99 |
cytoscape_export <- createWorkbook()
|
| 100 |
-
|
| 101 |
# 创建边Sheet
|
| 102 |
addWorksheet(cytoscape_export, sheetName = "Edges")
|
| 103 |
writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
|
| 104 |
writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
|
| 105 |
-
|
| 106 |
# 获取边列表
|
| 107 |
edges <- get.edgelist(graph)
|
| 108 |
-
|
| 109 |
# 填充边Sheet数据
|
| 110 |
if (nrow(edges) > 0) {
|
| 111 |
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
|
| 112 |
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
|
| 113 |
-
}
|
| 114 |
-
|
| 115 |
# 找到当前边Sheet的最后一行
|
| 116 |
last_edge_row <- nrow(edges) + 1
|
| 117 |
-
|
| 118 |
# 添加孤立节点
|
| 119 |
if (length(singleton_nodes) > 0) {
|
| 120 |
writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
|
| 121 |
}
|
| 122 |
-
|
| 123 |
# 保存Excel文件
|
| 124 |
saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
|
| 125 |
-
|
| 126 |
-
saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)
|
| 127 |
-
|
| 128 |
# 创建一个空的数据框用于储存节点和聚类信息
|
| 129 |
export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
|
| 130 |
-
|
| 131 |
# 遍历 sorted_clusters
|
| 132 |
cluster_index <- 1 # 初始化簇索引
|
| 133 |
for (cluster_name in names(sorted_clusters)) {
|
|
@@ -143,10 +114,9 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_pa
|
|
| 143 |
export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
|
| 144 |
}
|
| 145 |
cluster_index <- cluster_index + 1 # 索引加1
|
| 146 |
-
}
|
| 147 |
-
|
| 148 |
-
write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
|
| 149 |
# 返回聚类结果
|
| 150 |
return(list(cluster_data = export_clusters, graph = graph))
|
| 151 |
}
|
| 152 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import rpy2.robjects as ro
|
| 2 |
+
from rpy2.robjects import pandas2ri
|
| 3 |
+
from rpy2.robjects.conversion import localconverter
|
| 4 |
+
from rpy2.robjects.packages import importr
|
| 5 |
|
| 6 |
pandas2ri.activate()
|
| 7 |
|
| 8 |
# 导入必要的 R 包
|
| 9 |
+
stats = importr("stats")
|
| 10 |
+
ape = importr("ape")
|
| 11 |
+
igraph = importr("igraph", robject_translations={".env": "_env_"})
|
| 12 |
+
openxlsx = importr("openxlsx")
|
| 13 |
+
|
| 14 |
|
| 15 |
def get_r_matrix(df):
|
| 16 |
with localconverter(ro.default_converter + pandas2ri.converter):
|
|
|
|
| 18 |
return r_tm_matrix
|
| 19 |
|
| 20 |
|
| 21 |
+
export_matrix_to_newick_r = ro.r(
|
| 22 |
+
"""
|
| 23 |
convert_to_newick <- function(tm_matrix, output_file) {
|
| 24 |
# 导入 ape 包
|
| 25 |
if (!require(ape, quietly = TRUE)) {
|
| 26 |
install.packages("ape", repos = "https://cran.r-project.org")
|
| 27 |
library(ape)
|
| 28 |
}
|
|
|
|
| 29 |
# 计算距离矩阵
|
| 30 |
dist_matrix <- dist(tm_matrix)
|
|
|
|
| 31 |
# 层次聚类
|
| 32 |
hclust_tree <- hclust(dist_matrix, method = "ward.D2")
|
|
|
|
| 33 |
# 转为 phylo 对象
|
| 34 |
phylo_tree <- as.phylo(hclust_tree)
|
|
|
|
| 35 |
# 导出为 Newick 格式
|
| 36 |
write.tree(phylo_tree, file = output_file)
|
|
|
|
| 37 |
newick_str <- write.tree(phylo_tree)
|
|
|
|
| 38 |
return(newick_str)
|
| 39 |
}
|
| 40 |
+
"""
|
| 41 |
+
)
|
| 42 |
|
| 43 |
+
export_similarity_network_r = ro.r(
|
| 44 |
+
"""
|
| 45 |
+
create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
|
| 46 |
# 导入必要的包
|
| 47 |
if (!require(igraph, quietly = TRUE)) {
|
| 48 |
install.packages("igraph", repos = "https://cran.r-project.org")
|
|
|
|
| 52 |
install.packages("openxlsx", repos = "https://cran.r-project.org")
|
| 53 |
library(openxlsx)
|
| 54 |
}
|
|
|
|
| 55 |
# 根据相似性阈值创建边缘列表,并过滤掉自环
|
| 56 |
overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
|
| 57 |
overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
|
|
|
|
| 58 |
# 创建空的图形对象
|
| 59 |
graph <- graph.empty()
|
|
|
|
| 60 |
# 添加节点
|
| 61 |
nodes <- rownames(tm_matrix)
|
| 62 |
graph <- add_vertices(graph, nv = length(nodes), name = nodes)
|
|
|
|
| 63 |
# 添加边
|
| 64 |
for (i in 1:nrow(overthresholdedges)) {
|
| 65 |
graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
|
| 66 |
}
|
|
|
|
| 67 |
# 转换为无向图
|
| 68 |
graph <- as.undirected(graph, mode = "collapse")
|
|
|
|
| 69 |
# 计算聚类
|
| 70 |
clusters <- fastgreedy.community(graph)
|
|
|
|
| 71 |
# 获取每个聚类的大小
|
| 72 |
cluster_sizes <- sizes(clusters)
|
|
|
|
| 73 |
# 按聚类大小降序排序
|
| 74 |
sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
|
|
|
|
| 75 |
# 获取每个聚类的成员
|
| 76 |
cluster_members <- membership(clusters)
|
|
|
|
| 77 |
# 找到孤立节点
|
| 78 |
singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
|
|
|
|
| 79 |
# 创建Cytoscape导出文件
|
| 80 |
cytoscape_export <- createWorkbook()
|
|
|
|
| 81 |
# 创建边Sheet
|
| 82 |
addWorksheet(cytoscape_export, sheetName = "Edges")
|
| 83 |
writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
|
| 84 |
writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
|
|
|
|
| 85 |
# 获取边列表
|
| 86 |
edges <- get.edgelist(graph)
|
|
|
|
| 87 |
# 填充边Sheet数据
|
| 88 |
if (nrow(edges) > 0) {
|
| 89 |
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
|
| 90 |
writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
|
| 91 |
+
}
|
|
|
|
| 92 |
# 找到当前边Sheet的最后一行
|
| 93 |
last_edge_row <- nrow(edges) + 1
|
|
|
|
| 94 |
# 添加孤立节点
|
| 95 |
if (length(singleton_nodes) > 0) {
|
| 96 |
writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
|
| 97 |
}
|
|
|
|
| 98 |
# 保存Excel文件
|
| 99 |
saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
|
|
|
|
|
|
|
|
|
|
| 100 |
# 创建一个空的数据框用于储存节点和聚类信息
|
| 101 |
export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
|
|
|
|
| 102 |
# 遍历 sorted_clusters
|
| 103 |
cluster_index <- 1 # 初始化簇索引
|
| 104 |
for (cluster_name in names(sorted_clusters)) {
|
|
|
|
| 114 |
export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
|
| 115 |
}
|
| 116 |
cluster_index <- cluster_index + 1 # 索引加1
|
| 117 |
+
}
|
|
|
|
|
|
|
| 118 |
# 返回聚类结果
|
| 119 |
return(list(cluster_data = export_clusters, graph = graph))
|
| 120 |
}
|
| 121 |
+
"""
|
| 122 |
+
)
|
usalign_runner.py
CHANGED
|
@@ -1,29 +1,27 @@
|
|
| 1 |
import subprocess
|
| 2 |
-
import os
|
| 3 |
-
from typing import List, Optional
|
| 4 |
from pathlib import Path
|
|
|
|
|
|
|
| 5 |
import yaml
|
| 6 |
|
|
|
|
| 7 |
class USalignRunner:
|
| 8 |
def __init__(self, config_path: str = "config.yaml"):
|
| 9 |
"""
|
| 10 |
Initialize USalignRunner with parameters from config file.
|
| 11 |
-
|
| 12 |
Args:
|
| 13 |
config_path (str): Path to the configuration file
|
| 14 |
"""
|
| 15 |
-
with open(config_path,
|
| 16 |
config = yaml.safe_load(f)
|
| 17 |
-
|
| 18 |
-
self.usalign_path = Path(config[
|
| 19 |
self.default_params = {
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
}
|
| 24 |
-
|
| 25 |
-
if not self.usalign_path.exists():
|
| 26 |
-
raise FileNotFoundError(f"USalign executable not found at {self.usalign_path}")
|
| 27 |
|
| 28 |
def run_alignment(
|
| 29 |
self,
|
|
@@ -32,37 +30,35 @@ class USalignRunner:
|
|
| 32 |
tmscore: Optional[float] = None,
|
| 33 |
outfmt: Optional[int] = None,
|
| 34 |
) -> tuple[int, str, str]:
|
| 35 |
-
tmscore = tmscore if tmscore is not None else self.default_params[
|
| 36 |
-
outfmt = outfmt if outfmt is not None else self.default_params[
|
| 37 |
-
|
| 38 |
# Create the command
|
| 39 |
cmd = [
|
| 40 |
str(self.usalign_path),
|
| 41 |
-
"-mol",
|
| 42 |
-
"
|
|
|
|
|
|
|
| 43 |
pdb_list_file,
|
| 44 |
-
"-TMscore",
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
]
|
| 47 |
print(cmd)
|
| 48 |
-
|
| 49 |
# Convert command list to string
|
| 50 |
cmd_str = " ".join(cmd)
|
| 51 |
-
|
| 52 |
try:
|
| 53 |
# Execute the command
|
| 54 |
-
process = subprocess.Popen(
|
| 55 |
-
|
| 56 |
-
stdout=subprocess.PIPE,
|
| 57 |
-
stderr=subprocess.PIPE,
|
| 58 |
-
shell=True,
|
| 59 |
-
text=True
|
| 60 |
-
)
|
| 61 |
-
|
| 62 |
# Get output
|
| 63 |
stdout, stderr = process.communicate()
|
| 64 |
-
|
| 65 |
return process.returncode, stdout, stderr
|
| 66 |
-
|
| 67 |
except Exception as e:
|
| 68 |
return -1, "", str(e)
|
|
|
|
| 1 |
import subprocess
|
|
|
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
+
from typing import Optional
|
| 4 |
+
|
| 5 |
import yaml
|
| 6 |
|
| 7 |
+
|
| 8 |
class USalignRunner:
|
| 9 |
def __init__(self, config_path: str = "config.yaml"):
|
| 10 |
"""
|
| 11 |
Initialize USalignRunner with parameters from config file.
|
| 12 |
+
|
| 13 |
Args:
|
| 14 |
config_path (str): Path to the configuration file
|
| 15 |
"""
|
| 16 |
+
with open(config_path, "r", encoding="utf-8") as f:
|
| 17 |
config = yaml.safe_load(f)
|
| 18 |
+
|
| 19 |
+
self.usalign_path = Path(config["USalign"]["path"])
|
| 20 |
self.default_params = {
|
| 21 |
+
"tmscore": config["USalign"]["tmscore"],
|
| 22 |
+
"outfmt": config["USalign"]["outfmt"],
|
| 23 |
+
"mol": "protein", # Default to protein alignment
|
| 24 |
}
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
def run_alignment(
|
| 27 |
self,
|
|
|
|
| 30 |
tmscore: Optional[float] = None,
|
| 31 |
outfmt: Optional[int] = None,
|
| 32 |
) -> tuple[int, str, str]:
|
| 33 |
+
tmscore = tmscore if tmscore is not None else self.default_params["tmscore"]
|
| 34 |
+
outfmt = outfmt if outfmt is not None else self.default_params["outfmt"]
|
| 35 |
+
|
| 36 |
# Create the command
|
| 37 |
cmd = [
|
| 38 |
str(self.usalign_path),
|
| 39 |
+
"-mol",
|
| 40 |
+
self.default_params["mol"],
|
| 41 |
+
"-dir",
|
| 42 |
+
str(target_dir),
|
| 43 |
pdb_list_file,
|
| 44 |
+
"-TMscore",
|
| 45 |
+
str(tmscore),
|
| 46 |
+
"-outfmt",
|
| 47 |
+
str(outfmt),
|
| 48 |
]
|
| 49 |
print(cmd)
|
| 50 |
+
|
| 51 |
# Convert command list to string
|
| 52 |
cmd_str = " ".join(cmd)
|
| 53 |
+
|
| 54 |
try:
|
| 55 |
# Execute the command
|
| 56 |
+
process = subprocess.Popen(cmd_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True)
|
| 57 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# Get output
|
| 59 |
stdout, stderr = process.communicate()
|
| 60 |
+
|
| 61 |
return process.returncode, stdout, stderr
|
| 62 |
+
|
| 63 |
except Exception as e:
|
| 64 |
return -1, "", str(e)
|
utils.py
CHANGED
|
@@ -1,224 +1,138 @@
|
|
| 1 |
-
import numpy as np
|
| 2 |
-
import pandas as pd
|
| 3 |
-
# import fastcluster
|
| 4 |
-
import networkx as nx
|
| 5 |
-
from community import community_louvain
|
| 6 |
-
from scipy.spatial.distance import pdist, squareform
|
| 7 |
-
from scipy.cluster.hierarchy import linkage, to_tree
|
| 8 |
-
from networkx.algorithms.community import greedy_modularity_communities
|
| 9 |
-
from Bio import Phylo
|
| 10 |
-
from Bio.Phylo.BaseTree import Tree, Clade
|
| 11 |
-
import matplotlib.pyplot as plt
|
| 12 |
-
import sys
|
| 13 |
-
import gradio as gr
|
| 14 |
-
import os
|
| 15 |
import hashlib
|
| 16 |
-
|
| 17 |
-
import
|
| 18 |
from io import StringIO
|
| 19 |
-
from
|
| 20 |
-
|
| 21 |
import numpy as np
|
| 22 |
-
|
| 23 |
-
from rpy2.robjects.packages import importr
|
| 24 |
-
from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
|
| 25 |
-
from rpy2.robjects.conversion import localconverter
|
| 26 |
import rpy2.robjects as ro
|
| 27 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
from r_functions import get_r_matrix,export_matrix_to_newick_r,export_similarity_network_r
|
| 30 |
|
| 31 |
def get_TM_mat_from_df(df):
|
| 32 |
-
|
| 33 |
-
chain2_unique = df['PDBchain2'].unique()
|
| 34 |
-
unique_chains = sorted(set(df['#PDBchain1'].unique()).union(set(df['PDBchain2'].unique())))
|
| 35 |
chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
|
| 36 |
n = len(unique_chains)
|
| 37 |
matrix = np.eye(n)
|
| 38 |
for _, row in df.iterrows():
|
| 39 |
-
chain1 = row[
|
| 40 |
-
chain2 = row[
|
| 41 |
if chain1 in chain_to_idx and chain2 in chain_to_idx:
|
| 42 |
i = chain_to_idx[chain1]
|
| 43 |
j = chain_to_idx[chain2]
|
| 44 |
-
matrix[j, i] = row[
|
| 45 |
-
matrix[i, j] = row[
|
| 46 |
|
| 47 |
-
columns_names = [chain.replace("/","").replace(".pdb:A","") for chain in unique_chains]
|
| 48 |
-
df = pd.DataFrame(np.array(matrix),
|
| 49 |
-
columns=columns_names,
|
| 50 |
-
index=columns_names)
|
| 51 |
return df
|
| 52 |
|
| 53 |
|
| 54 |
-
# def get_cluster_z_from_df(df):
|
| 55 |
-
# dist_matrix = pdist(df, metric='euclidean')
|
| 56 |
-
# Z = fastcluster.linkage(dist_matrix, method='ward')
|
| 57 |
-
# return Z
|
| 58 |
-
|
| 59 |
-
def scipy_to_biopython(Z, labels):
|
| 60 |
-
"""将scipy的linkage矩阵转换为Bio.Phylo树"""
|
| 61 |
-
tree = to_tree(Z, rd=False)
|
| 62 |
-
|
| 63 |
-
def build_clade(node):
|
| 64 |
-
if node.is_leaf():
|
| 65 |
-
return Clade(branch_length=node.dist, name=labels[node.id])
|
| 66 |
-
else:
|
| 67 |
-
left = build_clade(node.left)
|
| 68 |
-
right = build_clade(node.right)
|
| 69 |
-
return Clade(branch_length=node.dist, clades=[left, right])
|
| 70 |
-
|
| 71 |
-
root = build_clade(tree)
|
| 72 |
-
return Tree(root)
|
| 73 |
-
|
| 74 |
-
def write_str_to_file(s:str,file_path:str):
|
| 75 |
-
with open(file_path,'w',encoding="utf8") as f:
|
| 76 |
-
f.write(s)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def build_graph_from_mat_df(TM_score_matrix,threshold = 0.75):
|
| 80 |
-
|
| 81 |
-
G = nx.Graph()
|
| 82 |
-
G.add_nodes_from(TM_score_matrix.index)
|
| 83 |
-
matrix_values = TM_score_matrix.values
|
| 84 |
-
# np.fill_diagonal(matrix_values, 0) # 排除自环
|
| 85 |
-
rows, cols = np.where(matrix_values >= threshold)
|
| 86 |
-
edges = [(TM_score_matrix.index[i], TM_score_matrix.index[j])
|
| 87 |
-
for i, j in zip(rows, cols) if i != j]
|
| 88 |
-
G.add_edges_from(edges)
|
| 89 |
-
return G
|
| 90 |
-
|
| 91 |
-
def fill_community_to_graph(G):
|
| 92 |
-
partition = community_louvain.best_partition(G)
|
| 93 |
-
nx.set_node_attributes(G, partition, 'cluster')
|
| 94 |
-
return partition
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
def get_graph_fig(G,partition):
|
| 98 |
-
plt.figure(figsize=(12, 10))
|
| 99 |
-
pos = nx.spring_layout(G)
|
| 100 |
-
nx.draw_networkx_nodes(G, pos, node_size=50,
|
| 101 |
-
cmap=plt.cm.tab20, node_color=list(partition.values()))
|
| 102 |
-
nx.draw_networkx_edges(G, pos, alpha=0.3)
|
| 103 |
-
plt.title("Structure Similarity Network")
|
| 104 |
-
plt.axis('off')
|
| 105 |
-
fig = plt.gcf()
|
| 106 |
-
return fig
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
def calculate_md5(files):
|
| 111 |
-
"""
|
| 112 |
-
Calculate MD5 hash for a list of files.
|
| 113 |
-
The hash is calculated by combining the content of all files in sorted order.
|
| 114 |
-
|
| 115 |
-
Args:
|
| 116 |
-
files: List of file objects from Gradio upload
|
| 117 |
-
|
| 118 |
-
Returns:
|
| 119 |
-
str: MD5 hash of the combined file contents
|
| 120 |
-
"""
|
| 121 |
hash_md5 = hashlib.md5()
|
| 122 |
-
|
| 123 |
-
# Sort files by name to ensure consistent hash regardless of upload order
|
| 124 |
sorted_files = sorted(files, key=lambda x: x.name)
|
| 125 |
-
|
| 126 |
for file in sorted_files:
|
| 127 |
with open(file.name, "rb") as f:
|
| 128 |
for chunk in iter(lambda: f.read(4096), b""):
|
| 129 |
hash_md5.update(chunk)
|
| 130 |
-
|
| 131 |
return hash_md5.hexdigest()
|
| 132 |
|
| 133 |
-
|
|
|
|
| 134 |
"""Save uploaded PDB files to the specified directory."""
|
| 135 |
if not files:
|
| 136 |
return "No files uploaded"
|
| 137 |
-
|
| 138 |
# Create data directory if it doesn't exist
|
| 139 |
data_path = Path(data_dir)
|
| 140 |
data_path.mkdir(parents=True, exist_ok=True)
|
| 141 |
-
|
| 142 |
# Calculate MD5 hash for all files
|
| 143 |
md5_hash = calculate_md5(files)
|
| 144 |
-
|
| 145 |
-
file_dir = os.path.join(data_path
|
| 146 |
# file_dir.mkdir(exist_ok=True)
|
| 147 |
try:
|
| 148 |
os.mkdir(file_dir)
|
| 149 |
-
except:
|
| 150 |
pass
|
| 151 |
-
file_dir = os.path.join(data_path
|
| 152 |
try:
|
| 153 |
os.mkdir(file_dir)
|
| 154 |
-
except:
|
| 155 |
pass
|
| 156 |
print(f"Created directory: {file_dir}")
|
| 157 |
-
|
| 158 |
# Create list file
|
| 159 |
-
list_file = os.path.join(data_path
|
| 160 |
|
| 161 |
filenames = []
|
| 162 |
-
|
| 163 |
results = []
|
| 164 |
for file in files:
|
| 165 |
# Get original filename
|
| 166 |
original_filename = os.path.basename(file.name)
|
| 167 |
filenames.append(original_filename)
|
| 168 |
# Check if file already exists
|
| 169 |
-
target_path = os.path.join(file_dir,
|
| 170 |
print(f"Saving to: {target_path}")
|
| 171 |
-
|
| 172 |
# Save the file
|
| 173 |
with open(target_path, "wb") as f:
|
| 174 |
f.write(open(file.name, "rb").read())
|
| 175 |
results.append(f"Saved {original_filename}")
|
| 176 |
-
|
| 177 |
# Write list file
|
| 178 |
with open(list_file, "w") as f:
|
| 179 |
f.write("\n".join(filenames))
|
| 180 |
results.append(f"Created list file: {list_file}")
|
| 181 |
-
|
| 182 |
return "\n".join(results)
|
| 183 |
|
|
|
|
| 184 |
def run_usalign(md5_hash):
|
| 185 |
"""Run USalign on the uploaded PDB files and return results as DataFrame."""
|
| 186 |
try:
|
| 187 |
runner = USalignRunner()
|
| 188 |
data_path = Path("./data")
|
| 189 |
-
pdb_dir = os.path.join(data_path
|
| 190 |
-
list_file = os.path.join(data_path
|
| 191 |
print(str(pdb_dir))
|
| 192 |
print(str(list_file))
|
| 193 |
-
return_code, stdout, stderr = runner.run_alignment(
|
| 194 |
-
target_dir=str(pdb_dir),
|
| 195 |
-
pdb_list_file=str(list_file)
|
| 196 |
-
)
|
| 197 |
print(stdout)
|
| 198 |
print(stderr)
|
| 199 |
if return_code == 0:
|
| 200 |
# Handle potential encoding issues
|
| 201 |
df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
|
| 202 |
-
|
| 203 |
# Clean up any potential encoding artifacts in column names
|
| 204 |
df.columns = [col.strip() for col in df.columns]
|
| 205 |
return df
|
| 206 |
else:
|
| 207 |
return pd.DataFrame({"Error": [stderr]})
|
| 208 |
except Exception as e:
|
| 209 |
-
return pd.DataFrame({"Error": [stderr]})
|
|
|
|
| 210 |
|
| 211 |
-
def run_community_analysis(results_df, data_dir, md5_hash,threshold):
|
| 212 |
"""Run community analysis pipeline and return results."""
|
| 213 |
try:
|
| 214 |
# Generate TM matrix
|
| 215 |
tm_matrix = get_TM_mat_from_df(results_df)
|
| 216 |
|
| 217 |
-
tm_file = os.path.join("data",md5_hash,"tm_matrix.csv")
|
| 218 |
-
newick_file = os.path.join("data",md5_hash,"clustering.newick")
|
| 219 |
# network_file = os.path.join("data",md5_hash,"network.svg")
|
| 220 |
-
network_edges_file = os.path.join("data",md5_hash,"network_cytoscape_export.xlsx")
|
| 221 |
-
cluster_file = os.path.join("data",md5_hash,"cluster_assignments.csv")
|
| 222 |
|
| 223 |
with localconverter(ro.default_converter + pandas2ri.converter):
|
| 224 |
r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
|
|
@@ -226,8 +140,7 @@ def run_community_analysis(results_df, data_dir, md5_hash,threshold):
|
|
| 226 |
result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
|
| 227 |
newick_str = result[0]
|
| 228 |
|
| 229 |
-
export_similarity_network_r(threshold, r_tm_matrix,
|
| 230 |
-
|
| 231 |
|
| 232 |
# cluster_df.to_csv(cluster_file,index=False)
|
| 233 |
# combined_df.to_csv(network_edges_file,index=False)
|
|
@@ -237,44 +150,19 @@ def run_community_analysis(results_df, data_dir, md5_hash,threshold):
|
|
| 237 |
# Phylo.write(tree, newick_file, "newick")
|
| 238 |
# fig.savefig(network_file, format="svg", bbox_inches="tight")
|
| 239 |
# plt.close(fig)
|
| 240 |
-
|
| 241 |
return {
|
| 242 |
"tm_matrix": tm_matrix,
|
| 243 |
"newick_str": newick_str,
|
| 244 |
# "network_fig": fig,
|
| 245 |
-
"files":[
|
| 246 |
tm_file,
|
| 247 |
newick_file,
|
| 248 |
# network_file,
|
| 249 |
network_edges_file,
|
| 250 |
-
cluster_file
|
| 251 |
-
]
|
| 252 |
}
|
| 253 |
except Exception as e:
|
| 254 |
print("Error", str(e))
|
| 255 |
return {"Error": str(e)}
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
def get_dataframe_from_network(G,partition):
|
| 260 |
-
edges_data = [list(edge) for edge in G.edges()]
|
| 261 |
-
edges_df = pd.DataFrame(edges_data, columns=["Source", "Target"])
|
| 262 |
-
cluster_membership = {}
|
| 263 |
-
for idx, comm in enumerate(partition):
|
| 264 |
-
for node in comm:
|
| 265 |
-
cluster_membership[node] = f"cluster_{idx+1}"
|
| 266 |
-
|
| 267 |
-
singleton_nodes = [n for n in G.nodes if G.degree[n] == 0]
|
| 268 |
-
for node in singleton_nodes:
|
| 269 |
-
cluster_membership[node] = "singleton"
|
| 270 |
-
|
| 271 |
-
# 创建孤立节点的数据
|
| 272 |
-
singleton_data = [[node, ""] for node in singleton_nodes]
|
| 273 |
-
singleton_df = pd.DataFrame(singleton_data, columns=["Source", "Target"])
|
| 274 |
-
|
| 275 |
-
# 合并数据
|
| 276 |
-
combined_df = pd.concat([edges_df, singleton_df], ignore_index=True)
|
| 277 |
-
return combined_df
|
| 278 |
-
|
| 279 |
-
# # 导出为 CSV 文件
|
| 280 |
-
# combined_df.to_csv("structure_based_similarity_network_cytoscape_export.csv", index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import hashlib
|
| 2 |
+
import os
|
| 3 |
+
import sys
|
| 4 |
from io import StringIO
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
import numpy as np
|
| 8 |
+
import pandas as pd
|
|
|
|
|
|
|
|
|
|
| 9 |
import rpy2.robjects as ro
|
| 10 |
+
from rpy2.robjects import pandas2ri
|
| 11 |
+
from rpy2.robjects.conversion import localconverter
|
| 12 |
+
|
| 13 |
+
from r_functions import export_matrix_to_newick_r, export_similarity_network_r
|
| 14 |
+
from usalign_runner import USalignRunner
|
| 15 |
|
|
|
|
| 16 |
|
| 17 |
def get_TM_mat_from_df(df):
    """Build a square TM-score matrix from a pairwise alignment table.

    Args:
        df: DataFrame with the columns ``#PDBchain1``, ``PDBchain2``,
            ``TM1`` and ``TM2``, one row per aligned pair.

    Returns:
        pandas.DataFrame: an n x n matrix whose index and columns are the
        sorted chain identifiers with ``"/"`` and ``".pdb:A"`` removed.
        The matrix starts as the identity; for a row pairing chain ``i``
        (``#PDBchain1``) with chain ``j`` (``PDBchain2``), ``TM1`` is
        stored at ``[j, i]`` and ``TM2`` at ``[i, j]``. Cells for pairs
        that never appear in ``df`` keep their initial value.
    """
    unique_chains = sorted(set(df["#PDBchain1"].unique()).union(set(df["PDBchain2"].unique())))
    chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
    matrix = np.eye(len(unique_chains))

    # Walk the four columns in lockstep instead of iterrows(): same row
    # order (so later rows still overwrite earlier ones), but no per-row
    # Series construction.
    pairs = zip(df["#PDBchain1"], df["PDBchain2"], df["TM1"], df["TM2"])
    for chain1, chain2, tm1, tm2 in pairs:
        if chain1 in chain_to_idx and chain2 in chain_to_idx:
            i = chain_to_idx[chain1]
            j = chain_to_idx[chain2]
            matrix[j, i] = tm1
            matrix[i, j] = tm2

    labels = [chain.replace("/", "").replace(".pdb:A", "") for chain in unique_chains]
    return pd.DataFrame(matrix, columns=labels, index=labels)
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def calculate_md5(files):
    """Return a single MD5 hex digest covering the contents of all files.

    The files are processed in ascending order of their ``name`` attribute,
    so the digest does not depend on the order in which they are passed.
    All contents are fed into one hash object back to back, i.e. the result
    is the MD5 of the concatenated file contents.

    Args:
        files: Iterable of objects exposing a ``name`` attribute that is a
            readable file path.

    Returns:
        str: Hexadecimal MD5 digest. For an empty ``files`` this is the
        digest of the empty byte string.
    """
    hash_md5 = hashlib.md5()

    for file in sorted(files, key=lambda x: x.name):
        with open(file.name, "rb") as f:
            # Read in fixed-size chunks so large files are never held in
            # memory as a whole.
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)

    return hash_md5.hexdigest()
|
| 46 |
|
| 47 |
+
|
| 48 |
+
def save_pdb_files(files, data_dir="./data"):
    """Save uploaded PDB files and write a list of their file names.

    The files are copied to ``<data_dir>/<md5>/pdb/`` and their base names
    are written, one per line, to ``<data_dir>/<md5>/pdb_list``, where
    ``<md5>`` is the value returned by ``calculate_md5(files)``.

    Args:
        files: Uploaded file objects; each must expose a ``name`` attribute
            holding the path of the file to copy.
        data_dir: Root directory under which the per-upload directory is
            created.

    Returns:
        str: ``"No files uploaded"`` when ``files`` is empty or ``None``;
        otherwise a newline-joined log with one ``Saved ...`` line per file
        followed by the ``Created list file: ...`` line.

    Raises:
        OSError: If a directory cannot be created or a file cannot be read
            or written.
    """
    if not files:
        return "No files uploaded"

    data_path = Path(data_dir)
    md5_hash = calculate_md5(files)

    # One call creates data_dir, the hash directory and the pdb directory.
    # An already existing directory is fine; any other failure is raised
    # here instead of being swallowed and surfacing later at write time.
    file_dir = data_path / md5_hash / "pdb"
    file_dir.mkdir(parents=True, exist_ok=True)
    print(f"Created directory: {file_dir}")

    list_file = data_path / md5_hash / "pdb_list"

    filenames = []
    results = []
    for file in files:
        # Only the base name is kept, so every file lands directly in file_dir.
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)

        target_path = file_dir / original_filename
        print(f"Saving to: {target_path}")

        # read_bytes/write_bytes open and close both handles themselves.
        target_path.write_bytes(Path(file.name).read_bytes())
        results.append(f"Saved {original_filename}")

    with open(list_file, "w") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")

    return "\n".join(results)
|
| 98 |
|
| 99 |
+
|
| 100 |
def run_usalign(md5_hash):
    """Run USalign on a previously saved upload and return its table.

    The PDB directory is ``./data/<md5_hash>/pdb`` and the list file is
    ``./data/<md5_hash>/pdb_list``.

    Args:
        md5_hash: Name of the upload directory below ``./data``.

    Returns:
        pandas.DataFrame: On return code 0, the tab-separated standard
        output parsed into a frame with whitespace-stripped column names.
        On a non-zero return code, a frame with a single ``Error`` column
        holding the captured stderr. If any step raises, a frame whose
        ``Error`` column holds the exception message followed by whatever
        stderr had been captured up to that point (empty if none).
    """
    # Bound before the try block so the except handler can always reference
    # it, even when the failure happens before the alignment has run.
    stderr = ""
    try:
        runner = USalignRunner()
        data_path = Path("./data")
        pdb_dir = os.path.join(data_path, md5_hash, "pdb")
        list_file = os.path.join(data_path, md5_hash, "pdb_list")
        print(str(pdb_dir))
        print(str(list_file))
        return_code, stdout, stderr = runner.run_alignment(target_dir=str(pdb_dir), pdb_list_file=str(list_file))
        print(stdout)
        print(stderr)
        if return_code != 0:
            return pd.DataFrame({"Error": [stderr]})

        df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
        # Header cells may carry surrounding whitespace; strip it.
        df.columns = [col.strip() for col in df.columns]
        return df
    except Exception as e:
        # Store the message as text rather than the exception object.
        return pd.DataFrame({"Error": [str(e), stderr]})
|
| 123 |
+
|
| 124 |
|
| 125 |
+
def run_community_analysis(results_df, data_dir, md5_hash, threshold):
|
| 126 |
"""Run community analysis pipeline and return results."""
|
| 127 |
try:
|
| 128 |
# Generate TM matrix
|
| 129 |
tm_matrix = get_TM_mat_from_df(results_df)
|
| 130 |
|
| 131 |
+
tm_file = os.path.join("data", md5_hash, "tm_matrix.csv")
|
| 132 |
+
newick_file = os.path.join("data", md5_hash, "clustering.newick")
|
| 133 |
# network_file = os.path.join("data",md5_hash,"network.svg")
|
| 134 |
+
network_edges_file = os.path.join("data", md5_hash, "network_cytoscape_export.xlsx")
|
| 135 |
+
# cluster_file = os.path.join("data", md5_hash, "cluster_assignments.csv")
|
| 136 |
|
| 137 |
with localconverter(ro.default_converter + pandas2ri.converter):
|
| 138 |
r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
|
|
|
|
| 140 |
result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
|
| 141 |
newick_str = result[0]
|
| 142 |
|
| 143 |
+
export_similarity_network_r(threshold, r_tm_matrix, network_edges_file)
|
|
|
|
| 144 |
|
| 145 |
# cluster_df.to_csv(cluster_file,index=False)
|
| 146 |
# combined_df.to_csv(network_edges_file,index=False)
|
|
|
|
| 150 |
# Phylo.write(tree, newick_file, "newick")
|
| 151 |
# fig.savefig(network_file, format="svg", bbox_inches="tight")
|
| 152 |
# plt.close(fig)
|
| 153 |
+
|
| 154 |
return {
|
| 155 |
"tm_matrix": tm_matrix,
|
| 156 |
"newick_str": newick_str,
|
| 157 |
# "network_fig": fig,
|
| 158 |
+
"files": [
|
| 159 |
tm_file,
|
| 160 |
newick_file,
|
| 161 |
# network_file,
|
| 162 |
network_edges_file,
|
| 163 |
+
# cluster_file,
|
| 164 |
+
],
|
| 165 |
}
|
| 166 |
except Exception as e:
|
| 167 |
print("Error", str(e))
|
| 168 |
return {"Error": str(e)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|