Spaces:

Xue-Jun
/

StructureBasedSimilarityNetwork

Sleeping

App Files Files Community

test

by Xue-Jun - opened Oct 7, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+252

-102

This PR is in draft mode

Files changed (5) hide show

.gitignore +0 -2
app.py +8 -2
r_functions.py +49 -19
usalign_runner.py +31 -27
utils.py +164 -52

.gitignore DELETED Viewed

	@@ -1,2 +0,0 @@
1	- /data
2	- /__pycache__

app.py CHANGED Viewed

@@ -1,14 +1,21 @@
 import os
 import gradio as gr
 from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 # Create Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# This is a Temp Title")
     with gr.Row():
         file_input = gr.File(
@@ -86,7 +93,6 @@ with gr.Blocks() as demo:
         outputs=[
             tm_matrix_output,
             newick_output,
-            # network_plot,
             download_tm,
         ],
     )

+import hashlib
 import os
+import sys
+from io import StringIO
+from pathlib import Path
 import gradio as gr
+import matplotlib.pyplot as plt
+import pandas as pd
+from usalign_runner import USalignRunner
 from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 # Create Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Structure-Based Similarity Network")
     with gr.Row():
         file_input = gr.File(
         outputs=[
             tm_matrix_output,
             newick_output,
             download_tm,
         ],
     )

r_functions.py CHANGED Viewed

@@ -1,16 +1,21 @@
-import rpy2.robjects as ro
-from rpy2.robjects import pandas2ri
-from rpy2.robjects.conversion import localconverter
 from rpy2.robjects.packages import importr
 pandas2ri.activate()
 # 导入必要的 R 包
-stats = importr("stats")
-ape = importr("ape")
-igraph = importr("igraph", robject_translations={".env": "_env_"})
-openxlsx = importr("openxlsx")
 def get_r_matrix(df):
     with localconverter(ro.default_converter + pandas2ri.converter):
@@ -18,31 +23,34 @@ def get_r_matrix(df):
     return r_tm_matrix
-export_matrix_to_newick_r = ro.r(
-    """
     convert_to_newick <- function(tm_matrix, output_file) {
         # 导入 ape 包
         if (!require(ape, quietly = TRUE)) {
             install.packages("ape", repos = "https://cran.r-project.org")
             library(ape)
         }
         # 计算距离矩阵
         dist_matrix <- dist(tm_matrix)
         # 层次聚类
         hclust_tree <- hclust(dist_matrix, method = "ward.D2")
         # 转为 phylo 对象
         phylo_tree <- as.phylo(hclust_tree)
         # 导出为 Newick 格式
         write.tree(phylo_tree, file = output_file)
         newick_str <- write.tree(phylo_tree)
         return(newick_str)
     }
-    """
-)
-export_similarity_network_r = ro.r(
-    """
-create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
     # 导入必要的包
     if (!require(igraph, quietly = TRUE)) {
         install.packages("igraph", repos = "https://cran.r-project.org")
@@ -52,53 +60,74 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
         install.packages("openxlsx", repos = "https://cran.r-project.org")
         library(openxlsx)
     }
     # 根据相似性阈值创建边缘列表，并过滤掉自环
     overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
     overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
     # 创建空的图形对象
     graph <- graph.empty()
     # 添加节点
     nodes <- rownames(tm_matrix)
     graph <- add_vertices(graph, nv = length(nodes), name = nodes)
     # 添加边
     for (i in 1:nrow(overthresholdedges)) {
         graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
     }
     # 转换为无向图
     graph <- as.undirected(graph, mode = "collapse")
     # 计算聚类
     clusters <- fastgreedy.community(graph)
     # 获取每个聚类的大小
     cluster_sizes <- sizes(clusters)
     # 按聚类大小降序排序
     sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
     # 获取每个聚类的成员
     cluster_members <- membership(clusters)
     # 找到孤立节点
     singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
     # 创建Cytoscape导出文件
     cytoscape_export <- createWorkbook()
     # 创建边Sheet
     addWorksheet(cytoscape_export, sheetName = "Edges")
     writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
     writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
     # 获取边列表
     edges <- get.edgelist(graph)
     # 填充边Sheet数据
     if (nrow(edges) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
-    }
     # 找到当前边Sheet的最后一行
     last_edge_row <- nrow(edges) + 1
     # 添加孤立节点
     if (length(singleton_nodes) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
     }
     # 保存Excel文件
     saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
     # 创建一个空的数据框用于储存节点和聚类信息
     export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
     # 遍历 sorted_clusters
     cluster_index <- 1  # 初始化簇索引
     for (cluster_name in names(sorted_clusters)) {
@@ -114,9 +143,10 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
             export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
         }
         cluster_index <- cluster_index + 1  # 索引加1
-    }
     # 返回聚类结果
     return(list(cluster_data = export_clusters, graph = graph))
 }
-"""
-)

+import pandas as pd
+import numpy as np
+from rpy2.robjects import pandas2ri, r, Formula
 from rpy2.robjects.packages import importr
+from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
+from rpy2.robjects.conversion import localconverter
+import rpy2.robjects as ro
+import os
 pandas2ri.activate()
 # 导入必要的 R 包
+stats = importr('stats')
+ape = importr('ape')
+igraph = importr('igraph', robject_translations={'.env': '_env_'})
+openxlsx = importr('openxlsx')
+# dplyr = importr('dplyr')
 def get_r_matrix(df):
     with localconverter(ro.default_converter + pandas2ri.converter):
     return r_tm_matrix
+export_matrix_to_newick_r = ro.r("""
     convert_to_newick <- function(tm_matrix, output_file) {
         # 导入 ape 包
         if (!require(ape, quietly = TRUE)) {
             install.packages("ape", repos = "https://cran.r-project.org")
             library(ape)
         }
         # 计算距离矩阵
         dist_matrix <- dist(tm_matrix)
         # 层次聚类
         hclust_tree <- hclust(dist_matrix, method = "ward.D2")
         # 转为 phylo 对象
         phylo_tree <- as.phylo(hclust_tree)
         # 导出为 Newick 格式
         write.tree(phylo_tree, file = output_file)
         newick_str <- write.tree(phylo_tree)
         return(newick_str)
     }
+    """)
+export_similarity_network_r = ro.r("""
+create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_path) {
     # 导入必要的包
     if (!require(igraph, quietly = TRUE)) {
         install.packages("igraph", repos = "https://cran.r-project.org")
         install.packages("openxlsx", repos = "https://cran.r-project.org")
         library(openxlsx)
     }
     # 根据相似性阈值创建边缘列表，并过滤掉自环
     overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
     overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
     # 创建空的图形对象
     graph <- graph.empty()
     # 添加节点
     nodes <- rownames(tm_matrix)
     graph <- add_vertices(graph, nv = length(nodes), name = nodes)
     # 添加边
     for (i in 1:nrow(overthresholdedges)) {
         graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
     }
     # 转换为无向图
     graph <- as.undirected(graph, mode = "collapse")
     # 计算聚类
     clusters <- fastgreedy.community(graph)
     # 获取每个聚类的大小
     cluster_sizes <- sizes(clusters)
     # 按聚类大小降序排序
     sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
     # 获取每个聚类的成员
     cluster_members <- membership(clusters)
     # 找到孤立节点
     singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
     # 创建Cytoscape导出文件
     cytoscape_export <- createWorkbook()
     # 创建边Sheet
     addWorksheet(cytoscape_export, sheetName = "Edges")
     writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
     writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
     # 获取边列表
     edges <- get.edgelist(graph)
     # 填充边Sheet数据
     if (nrow(edges) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
+    }
     # 找到当前边Sheet的最后一行
     last_edge_row <- nrow(edges) + 1
     # 添加孤立节点
     if (length(singleton_nodes) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
     }
     # 保存Excel文件
     saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
+    saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)
     # 创建一个空的数据框用于储存节点和聚类信息
     export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
     # 遍历 sorted_clusters
     cluster_index <- 1  # 初始化簇索引
     for (cluster_name in names(sorted_clusters)) {
             export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
         }
         cluster_index <- cluster_index + 1  # 索引加1
+    }
+    write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
     # 返回聚类结果
     return(list(cluster_data = export_clusters, graph = graph))
 }
+""")

usalign_runner.py CHANGED Viewed

@@ -1,27 +1,29 @@
 import subprocess
 from pathlib import Path
-from typing import Optional
 import yaml
 class USalignRunner:
     def __init__(self, config_path: str = "config.yaml"):
         """
         Initialize USalignRunner with parameters from config file.
         Args:
             config_path (str): Path to the configuration file
         """
-        with open(config_path, "r", encoding="utf-8") as f:
             config = yaml.safe_load(f)
-        self.usalign_path = Path(config["USalign"]["path"])
         self.default_params = {
-            "tmscore": config["USalign"]["tmscore"],
-            "outfmt": config["USalign"]["outfmt"],
-            "mol": "protein",  # Default to protein alignment
         }
     def run_alignment(
         self,
@@ -30,35 +32,37 @@ class USalignRunner:
         tmscore: Optional[float] = None,
         outfmt: Optional[int] = None,
     ) -> tuple[int, str, str]:
-        tmscore = tmscore if tmscore is not None else self.default_params["tmscore"]
-        outfmt = outfmt if outfmt is not None else self.default_params["outfmt"]
         # Create the command
         cmd = [
             str(self.usalign_path),
-            "-mol",
-            self.default_params["mol"],
-            "-dir",
-            str(target_dir),
             pdb_list_file,
-            "-TMscore",
-            str(tmscore),
-            "-outfmt",
-            str(outfmt),
         ]
         print(cmd)
         # Convert command list to string
         cmd_str = "  ".join(cmd)
         try:
             # Execute the command
-            process = subprocess.Popen(cmd_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True)
             # Get output
             stdout, stderr = process.communicate()
             return process.returncode, stdout, stderr
         except Exception as e:
             return -1, "", str(e)

 import subprocess
+import os
+from typing import List, Optional
 from pathlib import Path
 import yaml
 class USalignRunner:
     def __init__(self, config_path: str = "config.yaml"):
         """
         Initialize USalignRunner with parameters from config file.
         Args:
             config_path (str): Path to the configuration file
         """
+        with open(config_path, 'r',encoding="utf-8") as f:
             config = yaml.safe_load(f)
+        self.usalign_path = Path(config['USalign']['path'])
         self.default_params = {
+            'tmscore': config['USalign']['tmscore'],
+            'outfmt': config['USalign']['outfmt'],
+            'mol': 'protein'  # Default to protein alignment
         }
+        if not self.usalign_path.exists():
+            raise FileNotFoundError(f"USalign executable not found at {self.usalign_path}")
     def run_alignment(
         self,
         tmscore: Optional[float] = None,
         outfmt: Optional[int] = None,
     ) -> tuple[int, str, str]:
+        tmscore = tmscore if tmscore is not None else self.default_params['tmscore']
+        outfmt = outfmt if outfmt is not None else self.default_params['outfmt']
         # Create the command
         cmd = [
             str(self.usalign_path),
+            "-mol", self.default_params['mol'],
+            "-dir", str(target_dir),
             pdb_list_file,
+            "-TMscore", str(tmscore),
+            "-outfmt", str(outfmt)
         ]
         print(cmd)
         # Convert command list to string
         cmd_str = "  ".join(cmd)
         try:
             # Execute the command
+            process = subprocess.Popen(
+                cmd_str,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                shell=True,
+                text=True
+            )
             # Get output
             stdout, stderr = process.communicate()
             return process.returncode, stdout, stderr
         except Exception as e:
             return -1, "", str(e)

utils.py CHANGED Viewed

@@ -1,138 +1,224 @@
-import hashlib
-import os
 import sys
-from io import StringIO
 from pathlib import Path
-import numpy as np
 import pandas as pd
-import rpy2.robjects as ro
-from rpy2.robjects import pandas2ri
-from rpy2.robjects.conversion import localconverter
-from r_functions import export_matrix_to_newick_r, export_similarity_network_r
 from usalign_runner import USalignRunner
 def get_TM_mat_from_df(df):
-    unique_chains = sorted(set(df["#PDBchain1"].unique()).union(set(df["PDBchain2"].unique())))
     chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
     n = len(unique_chains)
     matrix = np.eye(n)
     for _, row in df.iterrows():
-        chain1 = row["#PDBchain1"]
-        chain2 = row["PDBchain2"]
         if chain1 in chain_to_idx and chain2 in chain_to_idx:
             i = chain_to_idx[chain1]
             j = chain_to_idx[chain2]
-            matrix[j, i] = row["TM1"]
-            matrix[i, j] = row["TM2"]
-    columns_names = [chain.replace("/", "").replace(".pdb:A", "") for chain in unique_chains]
-    df = pd.DataFrame(np.array(matrix), columns=columns_names, index=columns_names)
     return df
 def calculate_md5(files):
     hash_md5 = hashlib.md5()
     sorted_files = sorted(files, key=lambda x: x.name)
     for file in sorted_files:
         with open(file.name, "rb") as f:
             for chunk in iter(lambda: f.read(4096), b""):
                 hash_md5.update(chunk)
     return hash_md5.hexdigest()
-def save_pdb_files(files, data_dir="./data"):
     """Save uploaded PDB files to the specified directory."""
     if not files:
         return "No files uploaded"
     # Create data directory if it doesn't exist
     data_path = Path(data_dir)
     data_path.mkdir(parents=True, exist_ok=True)
     # Calculate MD5 hash for all files
     md5_hash = calculate_md5(files)
-    file_dir = os.path.join(data_path, md5_hash)
     # file_dir.mkdir(exist_ok=True)
     try:
         os.mkdir(file_dir)
-    except Exception:
         pass
-    file_dir = os.path.join(data_path, md5_hash, "pdb")
     try:
         os.mkdir(file_dir)
-    except Exception:
         pass
     print(f"Created directory: {file_dir}")
     # Create list file
-    list_file = os.path.join(data_path, md5_hash, "pdb_list")
     filenames = []
     results = []
     for file in files:
         # Get original filename
         original_filename = os.path.basename(file.name)
         filenames.append(original_filename)
         # Check if file already exists
-        target_path = os.path.join(file_dir, original_filename)
         print(f"Saving to: {target_path}")
         # Save the file
         with open(target_path, "wb") as f:
             f.write(open(file.name, "rb").read())
         results.append(f"Saved {original_filename}")
     # Write list file
     with open(list_file, "w") as f:
         f.write("\n".join(filenames))
     results.append(f"Created list file: {list_file}")
     return "\n".join(results)
 def run_usalign(md5_hash):
     """Run USalign on the uploaded PDB files and return results as DataFrame."""
     try:
         runner = USalignRunner()
         data_path = Path("./data")
-        pdb_dir = os.path.join(data_path, md5_hash, "pdb")
-        list_file = os.path.join(data_path, md5_hash, "pdb_list")
         print(str(pdb_dir))
         print(str(list_file))
-        return_code, stdout, stderr = runner.run_alignment(target_dir=str(pdb_dir), pdb_list_file=str(list_file))
         print(stdout)
         print(stderr)
         if return_code == 0:
             # Handle potential encoding issues
             df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
             # Clean up any potential encoding artifacts in column names
             df.columns = [col.strip() for col in df.columns]
             return df
         else:
             return pd.DataFrame({"Error": [stderr]})
     except Exception as e:
-        return pd.DataFrame({"Error": [e, stderr]})
-def run_community_analysis(results_df, data_dir, md5_hash, threshold):
     """Run community analysis pipeline and return results."""
     try:
         # Generate TM matrix
         tm_matrix = get_TM_mat_from_df(results_df)
-        tm_file = os.path.join("data", md5_hash, "tm_matrix.csv")
-        newick_file = os.path.join("data", md5_hash, "clustering.newick")
         # network_file = os.path.join("data",md5_hash,"network.svg")
-        network_edges_file = os.path.join("data", md5_hash, "network_cytoscape_export.xlsx")
-        # cluster_file = os.path.join("data", md5_hash, "cluster_assignments.csv")
         with localconverter(ro.default_converter + pandas2ri.converter):
             r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
@@ -140,7 +226,8 @@ def run_community_analysis(results_df, data_dir, md5_hash, threshold):
             result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
             newick_str = result[0]
-            export_similarity_network_r(threshold, r_tm_matrix, network_edges_file)
         # cluster_df.to_csv(cluster_file,index=False)
         # combined_df.to_csv(network_edges_file,index=False)
@@ -150,19 +237,44 @@ def run_community_analysis(results_df, data_dir, md5_hash, threshold):
         # Phylo.write(tree, newick_file, "newick")
         # fig.savefig(network_file, format="svg", bbox_inches="tight")
         # plt.close(fig)
         return {
             "tm_matrix": tm_matrix,
             "newick_str": newick_str,
             # "network_fig": fig,
-            "files": [
                 tm_file,
                 newick_file,
                 # network_file,
                 network_edges_file,
-                # cluster_file,
-            ],
         }
     except Exception as e:
         print("Error", str(e))
         return {"Error": str(e)}

+import numpy as np
+import pandas as pd
+# import fastcluster
+import networkx as nx
+from community import community_louvain
+from scipy.spatial.distance import pdist, squareform
+from scipy.cluster.hierarchy import linkage, to_tree
+from networkx.algorithms.community import greedy_modularity_communities
+from Bio import Phylo
+from Bio.Phylo.BaseTree import Tree, Clade
+import matplotlib.pyplot as plt
 import sys
+import gradio as gr
+import os
+import hashlib
 from pathlib import Path
 import pandas as pd
+from io import StringIO
 from usalign_runner import USalignRunner
+import pandas as pd
+import numpy as np
+from rpy2.robjects import pandas2ri, r, Formula
+from rpy2.robjects.packages import importr
+from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
+from rpy2.robjects.conversion import localconverter
+import rpy2.robjects as ro
+import os
+from r_functions import get_r_matrix,export_matrix_to_newick_r,export_similarity_network_r
 def get_TM_mat_from_df(df):
+    chain1_unique = df['#PDBchain1'].unique()
+    chain2_unique = df['PDBchain2'].unique()
+    unique_chains = sorted(set(df['#PDBchain1'].unique()).union(set(df['PDBchain2'].unique())))
     chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
     n = len(unique_chains)
     matrix = np.eye(n)
     for _, row in df.iterrows():
+        chain1 = row['#PDBchain1']
+        chain2 = row['PDBchain2']
         if chain1 in chain_to_idx and chain2 in chain_to_idx:
             i = chain_to_idx[chain1]
             j = chain_to_idx[chain2]
+            matrix[j, i] = row['TM1']
+            matrix[i, j] = row['TM2']
+    columns_names = [chain.replace("/","").replace(".pdb:A","") for chain in unique_chains]
+    df = pd.DataFrame(np.array(matrix),
+                      columns=columns_names,
+                      index=columns_names)
     return df
+# def get_cluster_z_from_df(df):
+#     dist_matrix = pdist(df, metric='euclidean')
+#     Z = fastcluster.linkage(dist_matrix, method='ward')
+#     return Z
def scipy_to_biopython(Z, labels):
    """Convert a SciPy linkage matrix into a Bio.Phylo tree.

    Args:
        Z: linkage matrix accepted by ``scipy.cluster.hierarchy.to_tree``.
        labels: sequence indexed by leaf id; supplies the leaf names.

    Returns:
        Bio.Phylo.BaseTree.Tree whose clades mirror the cluster hierarchy.
        Every clade's ``branch_length`` is the ``dist`` attribute of the
        corresponding SciPy cluster node.
    """
    def _as_clade(cluster_node):
        # Leaves become named clades; internal nodes wrap their two
        # children (left first, then right).
        if cluster_node.is_leaf():
            return Clade(branch_length=cluster_node.dist, name=labels[cluster_node.id])
        children = [_as_clade(cluster_node.left), _as_clade(cluster_node.right)]
        return Clade(branch_length=cluster_node.dist, clades=children)

    return Tree(_as_clade(to_tree(Z, rd=False)))
def write_str_to_file(s: str, file_path: str):
    """Write the string ``s`` to ``file_path`` as UTF-8, replacing any existing content."""
    Path(file_path).write_text(s, encoding="utf8")
def build_graph_from_mat_df(TM_score_matrix, threshold=0.75):
    """Build an undirected similarity graph from a labelled score matrix.

    Args:
        TM_score_matrix: square DataFrame; its index supplies the node names.
        threshold: minimum score for two different nodes to be connected.

    Returns:
        networkx.Graph with one node per index label and an edge between
        every off-diagonal pair whose score is ``>= threshold``.
    """
    labels = TM_score_matrix.index
    network = nx.Graph()
    network.add_nodes_from(labels)

    hit_rows, hit_cols = np.where(TM_score_matrix.values >= threshold)
    for src, dst in zip(hit_rows, hit_cols):
        # Diagonal hits would be self-loops; leave them out.
        if src == dst:
            continue
        network.add_edge(labels[src], labels[dst])
    return network
def fill_community_to_graph(G):
    """Detect communities in ``G`` and record them on its nodes.

    Runs ``community_louvain.best_partition`` on the graph, stores the
    result on the nodes under the ``'cluster'`` attribute (``G`` is
    modified in place) and returns that same partition object.
    """
    node_communities = community_louvain.best_partition(G)
    nx.set_node_attributes(G, values=node_communities, name='cluster')
    return node_communities
def get_graph_fig(G, partition):
    """Draw the similarity network, colouring each node by its community.

    Args:
        G: networkx graph to draw.
        partition: mapping of node -> community id; must contain every
            node of ``G``.

    Returns:
        matplotlib.figure.Figure: the newly created 12x10 inch figure.
        The figure is left open; the caller is responsible for saving
        and/or closing it.
    """
    # Keep the handle returned by plt.figure() rather than asking pyplot
    # for the "current" figure afterwards.
    fig = plt.figure(figsize=(12, 10))
    pos = nx.spring_layout(G)

    # Colours are matched to nodes by position, so build them in the
    # graph's own node order instead of relying on the dict's value order.
    node_colors = [partition[node] for node in G.nodes()]
    nx.draw_networkx_nodes(G, pos, node_size=50,
                           cmap=plt.cm.tab20, node_color=node_colors)
    nx.draw_networkx_edges(G, pos, alpha=0.3)
    plt.title("Structure Similarity Network")
    plt.axis('off')
    return fig
 def calculate_md5(files):
+    """
+    Calculate MD5 hash for a list of files.
+    The hash is calculated by combining the content of all files in sorted order.
+    Args:
+        files: List of file objects from Gradio upload
+    Returns:
+        str: MD5 hash of the combined file contents
+    """
     hash_md5 = hashlib.md5()
+    # Sort files by name to ensure consistent hash regardless of upload order
     sorted_files = sorted(files, key=lambda x: x.name)
     for file in sorted_files:
         with open(file.name, "rb") as f:
             for chunk in iter(lambda: f.read(4096), b""):
                 hash_md5.update(chunk)
     return hash_md5.hexdigest()
def save_pdb_files(files, data_dir='./data'):
    """Save uploaded PDB files into a per-upload directory.

    The upload is stored under ``<data_dir>/<md5>/`` where ``<md5>`` is the
    value returned by ``calculate_md5(files)``:

    * ``<data_dir>/<md5>/pdb/<basename>`` - a copy of every uploaded file
    * ``<data_dir>/<md5>/pdb_list``       - the basenames, one per line

    Args:
        files: uploaded-file objects whose ``.name`` is the path of the
            temporary copy on disk.
        data_dir: root directory for all uploads.

    Returns:
        str: ``"No files uploaded"`` when ``files`` is empty/None, otherwise
        a newline-separated log of the actions performed.

    Raises:
        OSError: if a directory cannot be created or a file cannot be
            read or written.
    """
    if not files:
        return "No files uploaded"

    data_path = Path(data_dir)
    md5_hash = calculate_md5(files)

    # One call creates data_dir, the per-upload directory and its "pdb"
    # sub-directory.  Already-existing directories are fine; any other
    # failure (e.g. permissions) is raised here instead of being swallowed
    # and resurfacing later as a confusing open() error.
    file_dir = os.path.join(data_path, md5_hash, "pdb")
    os.makedirs(file_dir, exist_ok=True)
    print(f"Created directory: {file_dir}")

    list_file = os.path.join(data_path, md5_hash, "pdb_list")
    filenames = []
    results = []
    for file in files:
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)

        # An existing file with the same name is overwritten.
        target_path = os.path.join(file_dir, original_filename)
        print(f"Saving to: {target_path}")

        # Both handles are context-managed so the source file is closed too.
        with open(file.name, "rb") as src, open(target_path, "wb") as dst:
            dst.write(src.read())
        results.append(f"Saved {original_filename}")

    with open(list_file, "w") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")
    return "\n".join(results)
 def run_usalign(md5_hash):
     """Run USalign on the uploaded PDB files and return results as DataFrame."""
     try:
         runner = USalignRunner()
         data_path = Path("./data")
+        pdb_dir = os.path.join(data_path , md5_hash , "pdb")
+        list_file = os.path.join(data_path , md5_hash , "pdb_list")
         print(str(pdb_dir))
         print(str(list_file))
+        return_code, stdout, stderr = runner.run_alignment(
+            target_dir=str(pdb_dir),
+            pdb_list_file=str(list_file)
+        )
         print(stdout)
         print(stderr)
         if return_code == 0:
             # Handle potential encoding issues
             df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
             # Clean up any potential encoding artifacts in column names
             df.columns = [col.strip() for col in df.columns]
             return df
         else:
             return pd.DataFrame({"Error": [stderr]})
     except Exception as e:
+        return pd.DataFrame({"Error": [stderr]})
+def run_community_analysis(results_df, data_dir, md5_hash,threshold):
     """Run community analysis pipeline and return results."""
     try:
         # Generate TM matrix
         tm_matrix = get_TM_mat_from_df(results_df)
+        tm_file = os.path.join("data",md5_hash,"tm_matrix.csv")
+        newick_file = os.path.join("data",md5_hash,"clustering.newick")
         # network_file = os.path.join("data",md5_hash,"network.svg")
+        network_edges_file = os.path.join("data",md5_hash,"network_cytoscape_export.xlsx")
+        cluster_file = os.path.join("data",md5_hash,"cluster_assignments.csv")
         with localconverter(ro.default_converter + pandas2ri.converter):
             r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
             result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
             newick_str = result[0]
+            export_similarity_network_r(threshold, r_tm_matrix,network_edges_file, cluster_file)
         # cluster_df.to_csv(cluster_file,index=False)
         # combined_df.to_csv(network_edges_file,index=False)
         # Phylo.write(tree, newick_file, "newick")
         # fig.savefig(network_file, format="svg", bbox_inches="tight")
         # plt.close(fig)
         return {
             "tm_matrix": tm_matrix,
             "newick_str": newick_str,
             # "network_fig": fig,
+            "files":[
                 tm_file,
                 newick_file,
                 # network_file,
                 network_edges_file,
+                cluster_file
+            ]
         }
     except Exception as e:
         print("Error", str(e))
         return {"Error": str(e)}
def get_dataframe_from_network(G, partition):
    """Export the graph as a two-column Source/Target table.

    Every edge of ``G`` becomes one row.  Every node with degree 0 is
    appended afterwards as a row whose ``Target`` is the empty string, so
    isolated nodes are not lost in the export.

    Args:
        G: graph exposing ``edges()``, ``nodes`` and ``degree``.
        partition: community assignment; accepted for interface
            compatibility but not needed to build the table.

    Returns:
        pandas.DataFrame with the columns ``"Source"`` and ``"Target"``.
    """
    # A per-node cluster lookup used to be built here but was never
    # returned or used, and it iterated a node->community dict as if it
    # were a list of node collections.  It has been dropped.
    rows = [list(edge) for edge in G.edges()]
    rows.extend([node, ""] for node in G.nodes if G.degree[node] == 0)

    # Build the frame in one go rather than concatenating two frames, one
    # of which is often empty.
    return pd.DataFrame(rows, columns=["Source", "Target"])
+    # # 导出为 CSV 文件
+    # combined_df.to_csv("structure_based_similarity_network_cytoscape_export.csv", index=False)