Spaces:

Xue-Jun
/

StructureBasedSimilarityNetwork

Sleeping

App Files Files Community

Xue-Jun committed on Oct 7, 2025

Commit

4a4b152

verified ·

1 Parent(s): fe92153

Upload 6 files

Browse files

Files changed (5) hide show

.gitignore +2 -0
app.py +2 -8
r_functions.py +19 -49
usalign_runner.py +27 -31
utils.py +52 -164

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ /data
2	+ /__pycache__

app.py CHANGED Viewed

@@ -1,21 +1,14 @@
-import hashlib
 import os
-import sys
-from io import StringIO
-from pathlib import Path
 import gradio as gr
-import matplotlib.pyplot as plt
-import pandas as pd
-from usalign_runner import USalignRunner
 from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 # Create Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Structure-Based Similarity Network")
     with gr.Row():
         file_input = gr.File(
@@ -93,6 +86,7 @@ with gr.Blocks() as demo:
         outputs=[
             tm_matrix_output,
             newick_output,
             download_tm,
         ],
     )

 import os
 import gradio as gr
 from utils import calculate_md5, run_community_analysis, run_usalign, save_pdb_files
 os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
 # Create Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# This is a Temp Title")
     with gr.Row():
         file_input = gr.File(
         outputs=[
             tm_matrix_output,
             newick_output,
+            # network_plot,
             download_tm,
         ],
     )

r_functions.py CHANGED Viewed

@@ -1,21 +1,16 @@
-import pandas as pd
-import numpy as np
-from rpy2.robjects import pandas2ri, r, Formula
-from rpy2.robjects.packages import importr
-from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
-from rpy2.robjects.conversion import localconverter
 import rpy2.robjects as ro
-import os
 pandas2ri.activate()
 # 导入必要的 R 包
-stats = importr('stats')
-ape = importr('ape')
-igraph = importr('igraph', robject_translations={'.env': '_env_'})
-openxlsx = importr('openxlsx')
-# dplyr = importr('dplyr')
 def get_r_matrix(df):
     with localconverter(ro.default_converter + pandas2ri.converter):
@@ -23,34 +18,31 @@ def get_r_matrix(df):
     return r_tm_matrix
-export_matrix_to_newick_r = ro.r("""
     convert_to_newick <- function(tm_matrix, output_file) {
         # 导入 ape 包
         if (!require(ape, quietly = TRUE)) {
             install.packages("ape", repos = "https://cran.r-project.org")
             library(ape)
         }
         # 计算距离矩阵
         dist_matrix <- dist(tm_matrix)
         # 层次聚类
         hclust_tree <- hclust(dist_matrix, method = "ward.D2")
         # 转为 phylo 对象
         phylo_tree <- as.phylo(hclust_tree)
         # 导出为 Newick 格式
         write.tree(phylo_tree, file = output_file)
         newick_str <- write.tree(phylo_tree)
         return(newick_str)
     }
-    """)
-export_similarity_network_r = ro.r("""
-create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_path) {
     # 导入必要的包
     if (!require(igraph, quietly = TRUE)) {
         install.packages("igraph", repos = "https://cran.r-project.org")
@@ -60,74 +52,53 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_pa
         install.packages("openxlsx", repos = "https://cran.r-project.org")
         library(openxlsx)
     }
     # 根据相似性阈值创建边缘列表，并过滤掉自环
     overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
     overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
     # 创建空的图形对象
     graph <- graph.empty()
     # 添加节点
     nodes <- rownames(tm_matrix)
     graph <- add_vertices(graph, nv = length(nodes), name = nodes)
     # 添加边
     for (i in 1:nrow(overthresholdedges)) {
         graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
     }
     # 转换为无向图
     graph <- as.undirected(graph, mode = "collapse")
     # 计算聚类
     clusters <- fastgreedy.community(graph)
     # 获取每个聚类的大小
     cluster_sizes <- sizes(clusters)
     # 按聚类大小降序排序
     sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
     # 获取每个聚类的成员
     cluster_members <- membership(clusters)
     # 找到孤立节点
     singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
     # 创建Cytoscape导出文件
     cytoscape_export <- createWorkbook()
     # 创建边Sheet
     addWorksheet(cytoscape_export, sheetName = "Edges")
     writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
     writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
     # 获取边列表
     edges <- get.edgelist(graph)
     # 填充边Sheet数据
     if (nrow(edges) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
-    }
     # 找到当前边Sheet的最后一行
     last_edge_row <- nrow(edges) + 1
     # 添加孤立节点
     if (length(singleton_nodes) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
     }
     # 保存Excel文件
     saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
-    saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)
     # 创建一个空的数据框用于储存节点和聚类信息
     export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
     # 遍历 sorted_clusters
     cluster_index <- 1  # 初始化簇索引
     for (cluster_name in names(sorted_clusters)) {
@@ -143,10 +114,9 @@ create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_pa
             export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
         }
         cluster_index <- cluster_index + 1  # 索引加1
-    }
-    write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
     # 返回聚类结果
     return(list(cluster_data = export_clusters, graph = graph))
 }
-""")

 import rpy2.robjects as ro
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.conversion import localconverter
+from rpy2.robjects.packages import importr
 pandas2ri.activate()
 # 导入必要的 R 包
+stats = importr("stats")
+ape = importr("ape")
+igraph = importr("igraph", robject_translations={".env": "_env_"})
+openxlsx = importr("openxlsx")
 def get_r_matrix(df):
     with localconverter(ro.default_converter + pandas2ri.converter):
     return r_tm_matrix
+export_matrix_to_newick_r = ro.r(
+    """
     convert_to_newick <- function(tm_matrix, output_file) {
         # 导入 ape 包
         if (!require(ape, quietly = TRUE)) {
             install.packages("ape", repos = "https://cran.r-project.org")
             library(ape)
         }
         # 计算距离矩阵
         dist_matrix <- dist(tm_matrix)
         # 层次聚类
         hclust_tree <- hclust(dist_matrix, method = "ward.D2")
         # 转为 phylo 对象
         phylo_tree <- as.phylo(hclust_tree)
         # 导出为 Newick 格式
         write.tree(phylo_tree, file = output_file)
         newick_str <- write.tree(phylo_tree)
         return(newick_str)
     }
+    """
+)
+export_similarity_network_r = ro.r(
+    """
+create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
     # 导入必要的包
     if (!require(igraph, quietly = TRUE)) {
         install.packages("igraph", repos = "https://cran.r-project.org")
         install.packages("openxlsx", repos = "https://cran.r-project.org")
         library(openxlsx)
     }
     # 根据相似性阈值创建边缘列表，并过滤掉自环
     overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
     overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], ]
     # 创建空的图形对象
     graph <- graph.empty()
     # 添加节点
     nodes <- rownames(tm_matrix)
     graph <- add_vertices(graph, nv = length(nodes), name = nodes)
     # 添加边
     for (i in 1:nrow(overthresholdedges)) {
         graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
     }
     # 转换为无向图
     graph <- as.undirected(graph, mode = "collapse")
     # 计算聚类
     clusters <- fastgreedy.community(graph)
     # 获取每个聚类的大小
     cluster_sizes <- sizes(clusters)
     # 按聚类大小降序排序
     sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
     # 获取每个聚类的成员
     cluster_members <- membership(clusters)
     # 找到孤立节点
     singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
     # 创建Cytoscape导出文件
     cytoscape_export <- createWorkbook()
     # 创建边Sheet
     addWorksheet(cytoscape_export, sheetName = "Edges")
     writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
     writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
     # 获取边列表
     edges <- get.edgelist(graph)
     # 填充边Sheet数据
     if (nrow(edges) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
         writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
+    }
     # 找到当前边Sheet的最后一行
     last_edge_row <- nrow(edges) + 1
     # 添加孤立节点
     if (length(singleton_nodes) > 0) {
         writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
     }
     # 保存Excel文件
     saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
     # 创建一个空的数据框用于储存节点和聚类信息
     export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
     # 遍历 sorted_clusters
     cluster_index <- 1  # 初始化簇索引
     for (cluster_name in names(sorted_clusters)) {
             export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
         }
         cluster_index <- cluster_index + 1  # 索引加1
+    }
     # 返回聚类结果
     return(list(cluster_data = export_clusters, graph = graph))
 }
+"""
+)

usalign_runner.py CHANGED Viewed

@@ -1,29 +1,27 @@
 import subprocess
-import os
-from typing import List, Optional
 from pathlib import Path
 import yaml
 class USalignRunner:
     def __init__(self, config_path: str = "config.yaml"):
         """
         Initialize USalignRunner with parameters from config file.
         Args:
             config_path (str): Path to the configuration file
         """
-        with open(config_path, 'r',encoding="utf-8") as f:
             config = yaml.safe_load(f)
-        self.usalign_path = Path(config['USalign']['path'])
         self.default_params = {
-            'tmscore': config['USalign']['tmscore'],
-            'outfmt': config['USalign']['outfmt'],
-            'mol': 'protein'  # Default to protein alignment
         }
-        if not self.usalign_path.exists():
-            raise FileNotFoundError(f"USalign executable not found at {self.usalign_path}")
     def run_alignment(
         self,
@@ -32,37 +30,35 @@ class USalignRunner:
         tmscore: Optional[float] = None,
         outfmt: Optional[int] = None,
     ) -> tuple[int, str, str]:
-        tmscore = tmscore if tmscore is not None else self.default_params['tmscore']
-        outfmt = outfmt if outfmt is not None else self.default_params['outfmt']
         # Create the command
         cmd = [
             str(self.usalign_path),
-            "-mol", self.default_params['mol'],
-            "-dir", str(target_dir),
             pdb_list_file,
-            "-TMscore", str(tmscore),
-            "-outfmt", str(outfmt)
         ]
         print(cmd)
         # Convert command list to string
         cmd_str = "  ".join(cmd)
         try:
             # Execute the command
-            process = subprocess.Popen(
-                cmd_str,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE,
-                shell=True,
-                text=True
-            )
             # Get output
             stdout, stderr = process.communicate()
             return process.returncode, stdout, stderr
         except Exception as e:
             return -1, "", str(e)

 import subprocess
 from pathlib import Path
+from typing import Optional
 import yaml
 class USalignRunner:
     def __init__(self, config_path: str = "config.yaml"):
         """
         Initialize USalignRunner with parameters from config file.
         Args:
             config_path (str): Path to the configuration file
         """
+        with open(config_path, "r", encoding="utf-8") as f:
             config = yaml.safe_load(f)
+        self.usalign_path = Path(config["USalign"]["path"])
         self.default_params = {
+            "tmscore": config["USalign"]["tmscore"],
+            "outfmt": config["USalign"]["outfmt"],
+            "mol": "protein",  # Default to protein alignment
         }
     def run_alignment(
         self,
         tmscore: Optional[float] = None,
         outfmt: Optional[int] = None,
     ) -> tuple[int, str, str]:
+        tmscore = tmscore if tmscore is not None else self.default_params["tmscore"]
+        outfmt = outfmt if outfmt is not None else self.default_params["outfmt"]
         # Create the command
         cmd = [
             str(self.usalign_path),
+            "-mol",
+            self.default_params["mol"],
+            "-dir",
+            str(target_dir),
             pdb_list_file,
+            "-TMscore",
+            str(tmscore),
+            "-outfmt",
+            str(outfmt),
         ]
         print(cmd)
         # Convert command list to string
         cmd_str = "  ".join(cmd)
         try:
             # Execute the command
+            process = subprocess.Popen(cmd_str, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True, text=True)
             # Get output
             stdout, stderr = process.communicate()
             return process.returncode, stdout, stderr
         except Exception as e:
             return -1, "", str(e)

utils.py CHANGED Viewed

@@ -1,224 +1,138 @@
-import numpy as np
-import pandas as pd
-# import fastcluster
-import networkx as nx
-from community import community_louvain
-from scipy.spatial.distance import pdist, squareform
-from scipy.cluster.hierarchy import linkage, to_tree
-from networkx.algorithms.community import greedy_modularity_communities
-from Bio import Phylo
-from Bio.Phylo.BaseTree import Tree, Clade
-import matplotlib.pyplot as plt
-import sys
-import gradio as gr
-import os
 import hashlib
-from pathlib import Path
-import pandas as pd
 from io import StringIO
-from usalign_runner import USalignRunner
-import pandas as pd
 import numpy as np
-from rpy2.robjects import pandas2ri, r, Formula
-from rpy2.robjects.packages import importr
-from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
-from rpy2.robjects.conversion import localconverter
 import rpy2.robjects as ro
-import os
-from r_functions import get_r_matrix,export_matrix_to_newick_r,export_similarity_network_r
 def get_TM_mat_from_df(df):
-    chain1_unique = df['#PDBchain1'].unique()
-    chain2_unique = df['PDBchain2'].unique()
-    unique_chains = sorted(set(df['#PDBchain1'].unique()).union(set(df['PDBchain2'].unique())))
     chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}
     n = len(unique_chains)
     matrix = np.eye(n)
     for _, row in df.iterrows():
-        chain1 = row['#PDBchain1']
-        chain2 = row['PDBchain2']
         if chain1 in chain_to_idx and chain2 in chain_to_idx:
             i = chain_to_idx[chain1]
             j = chain_to_idx[chain2]
-            matrix[j, i] = row['TM1']
-            matrix[i, j] = row['TM2']
-    columns_names = [chain.replace("/","").replace(".pdb:A","") for chain in unique_chains]
-    df = pd.DataFrame(np.array(matrix),
-                      columns=columns_names,
-                      index=columns_names)
     return df
-# def get_cluster_z_from_df(df):
-#     dist_matrix = pdist(df, metric='euclidean')
-#     Z = fastcluster.linkage(dist_matrix, method='ward')
-#     return Z
-def scipy_to_biopython(Z, labels):
-    """将scipy的linkage矩阵转换为Bio.Phylo树"""
-    tree = to_tree(Z, rd=False)
-    def build_clade(node):
-        if node.is_leaf():
-            return Clade(branch_length=node.dist, name=labels[node.id])
-        else:
-            left = build_clade(node.left)
-            right = build_clade(node.right)
-            return Clade(branch_length=node.dist, clades=[left, right])
-    root = build_clade(tree)
-    return Tree(root)
-def write_str_to_file(s:str,file_path:str):
-    with open(file_path,'w',encoding="utf8") as f:
-        f.write(s)
-def build_graph_from_mat_df(TM_score_matrix,threshold = 0.75):
-    G = nx.Graph()
-    G.add_nodes_from(TM_score_matrix.index)
-    matrix_values = TM_score_matrix.values
-    # np.fill_diagonal(matrix_values, 0)  # 排除自环
-    rows, cols = np.where(matrix_values >= threshold)
-    edges = [(TM_score_matrix.index[i], TM_score_matrix.index[j])
-            for i, j in zip(rows, cols) if i != j]
-    G.add_edges_from(edges)
-    return G
-def fill_community_to_graph(G):
-    partition = community_louvain.best_partition(G)
-    nx.set_node_attributes(G, partition, 'cluster')
-    return partition
-def get_graph_fig(G,partition):
-    plt.figure(figsize=(12, 10))
-    pos = nx.spring_layout(G)
-    nx.draw_networkx_nodes(G, pos, node_size=50,
-                        cmap=plt.cm.tab20, node_color=list(partition.values()))
-    nx.draw_networkx_edges(G, pos, alpha=0.3)
-    plt.title("Structure Similarity Network")
-    plt.axis('off')
-    fig = plt.gcf()
-    return fig
 def calculate_md5(files):
-    """
-    Calculate MD5 hash for a list of files.
-    The hash is calculated by combining the content of all files in sorted order.
-    Args:
-        files: List of file objects from Gradio upload
-    Returns:
-        str: MD5 hash of the combined file contents
-    """
     hash_md5 = hashlib.md5()
-    # Sort files by name to ensure consistent hash regardless of upload order
     sorted_files = sorted(files, key=lambda x: x.name)
     for file in sorted_files:
         with open(file.name, "rb") as f:
             for chunk in iter(lambda: f.read(4096), b""):
                 hash_md5.update(chunk)
     return hash_md5.hexdigest()
-def save_pdb_files(files, data_dir='./data'):
     """Save uploaded PDB files to the specified directory."""
     if not files:
         return "No files uploaded"
     # Create data directory if it doesn't exist
     data_path = Path(data_dir)
     data_path.mkdir(parents=True, exist_ok=True)
     # Calculate MD5 hash for all files
     md5_hash = calculate_md5(files)
-    file_dir = os.path.join(data_path , md5_hash )
     # file_dir.mkdir(exist_ok=True)
     try:
         os.mkdir(file_dir)
-    except:
         pass
-    file_dir = os.path.join(data_path , md5_hash , "pdb")
     try:
         os.mkdir(file_dir)
-    except:
         pass
     print(f"Created directory: {file_dir}")
     # Create list file
-    list_file = os.path.join(data_path , md5_hash , "pdb_list")
     filenames = []
     results = []
     for file in files:
         # Get original filename
         original_filename = os.path.basename(file.name)
         filenames.append(original_filename)
         # Check if file already exists
-        target_path = os.path.join(file_dir,original_filename )
         print(f"Saving to: {target_path}")
         # Save the file
         with open(target_path, "wb") as f:
             f.write(open(file.name, "rb").read())
         results.append(f"Saved {original_filename}")
     # Write list file
     with open(list_file, "w") as f:
         f.write("\n".join(filenames))
     results.append(f"Created list file: {list_file}")
     return "\n".join(results)
 def run_usalign(md5_hash):
     """Run USalign on the uploaded PDB files and return results as DataFrame."""
     try:
         runner = USalignRunner()
         data_path = Path("./data")
-        pdb_dir = os.path.join(data_path , md5_hash , "pdb")
-        list_file = os.path.join(data_path , md5_hash , "pdb_list")
         print(str(pdb_dir))
         print(str(list_file))
-        return_code, stdout, stderr = runner.run_alignment(
-            target_dir=str(pdb_dir),
-            pdb_list_file=str(list_file)
-        )
         print(stdout)
         print(stderr)
         if return_code == 0:
             # Handle potential encoding issues
             df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
             # Clean up any potential encoding artifacts in column names
             df.columns = [col.strip() for col in df.columns]
             return df
         else:
             return pd.DataFrame({"Error": [stderr]})
     except Exception as e:
-        return pd.DataFrame({"Error": [stderr]})
-def run_community_analysis(results_df, data_dir, md5_hash,threshold):
     """Run community analysis pipeline and return results."""
     try:
         # Generate TM matrix
         tm_matrix = get_TM_mat_from_df(results_df)
-        tm_file = os.path.join("data",md5_hash,"tm_matrix.csv")
-        newick_file = os.path.join("data",md5_hash,"clustering.newick")
         # network_file = os.path.join("data",md5_hash,"network.svg")
-        network_edges_file = os.path.join("data",md5_hash,"network_cytoscape_export.xlsx")
-        cluster_file = os.path.join("data",md5_hash,"cluster_assignments.csv")
         with localconverter(ro.default_converter + pandas2ri.converter):
             r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
@@ -226,8 +140,7 @@ def run_community_analysis(results_df, data_dir, md5_hash,threshold):
             result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
             newick_str = result[0]
-            export_similarity_network_r(threshold, r_tm_matrix,network_edges_file, cluster_file)
         # cluster_df.to_csv(cluster_file,index=False)
         # combined_df.to_csv(network_edges_file,index=False)
@@ -237,44 +150,19 @@ def run_community_analysis(results_df, data_dir, md5_hash,threshold):
         # Phylo.write(tree, newick_file, "newick")
         # fig.savefig(network_file, format="svg", bbox_inches="tight")
         # plt.close(fig)
         return {
             "tm_matrix": tm_matrix,
             "newick_str": newick_str,
             # "network_fig": fig,
-            "files":[
                 tm_file,
                 newick_file,
                 # network_file,
                 network_edges_file,
-                cluster_file
-            ]
         }
     except Exception as e:
         print("Error", str(e))
         return {"Error": str(e)}
-def get_dataframe_from_network(G,partition):
-    edges_data = [list(edge) for edge in G.edges()]
-    edges_df = pd.DataFrame(edges_data, columns=["Source", "Target"])
-    cluster_membership = {}
-    for idx, comm in enumerate(partition):
-        for node in comm:
-            cluster_membership[node] = f"cluster_{idx+1}"
-    singleton_nodes = [n for n in G.nodes if G.degree[n] == 0]
-    for node in singleton_nodes:
-        cluster_membership[node] = "singleton"
-    # 创建孤立节点的数据
-    singleton_data = [[node, ""] for node in singleton_nodes]
-    singleton_df = pd.DataFrame(singleton_data, columns=["Source", "Target"])
-    # 合并数据
-    combined_df = pd.concat([edges_df, singleton_df], ignore_index=True)
-    return combined_df
-    # # 导出为 CSV 文件
-    # combined_df.to_csv("structure_based_similarity_network_cytoscape_export.csv", index=False)

 import hashlib
+import os
+import sys
 from io import StringIO
+from pathlib import Path
 import numpy as np
+import pandas as pd
 import rpy2.robjects as ro
+from rpy2.robjects import pandas2ri
+from rpy2.robjects.conversion import localconverter
+from r_functions import export_matrix_to_newick_r, export_similarity_network_r
+from usalign_runner import USalignRunner
 def get_TM_mat_from_df(df):
     """Build a square TM-score matrix from a table of pairwise alignments.

     Args:
         df: DataFrame with the columns ``#PDBchain1``, ``PDBchain2``,
             ``TM1`` and ``TM2``; one row per aligned pair.

     Returns:
         pandas.DataFrame: an ``n x n`` matrix (``n`` = number of distinct
         chain identifiers across both chain columns). The matrix starts as
         the identity; for every input row, ``TM1`` is written to
         ``[chain2, chain1]`` and ``TM2`` to ``[chain1, chain2]``. Row and
         column labels are the chain identifiers with ``"/"`` and
         ``".pdb:A"`` removed.
     """
     first_chains = df["#PDBchain1"]
     second_chains = df["PDBchain2"]

     # Sorted union of both columns gives a stable row/column ordering.
     unique_chains = sorted(set(first_chains.unique()).union(set(second_chains.unique())))
     chain_to_idx = {chain: idx for idx, chain in enumerate(unique_chains)}

     matrix = np.eye(len(unique_chains))

     # Iterate the four columns in lockstep instead of df.iterrows(): no
     # per-row Series is built, and values keep their column dtype.
     for chain1, chain2, tm1, tm2 in zip(first_chains, second_chains, df["TM1"], df["TM2"]):
         # Guard kept from the original: values that cannot be looked up
         # (e.g. NaN, which never compares equal to itself) are skipped.
         if chain1 in chain_to_idx and chain2 in chain_to_idx:
             i = chain_to_idx[chain1]
             j = chain_to_idx[chain2]
             matrix[j, i] = tm1
             matrix[i, j] = tm2

     labels = [chain.replace("/", "").replace(".pdb:A", "") for chain in unique_chains]
     return pd.DataFrame(matrix, columns=labels, index=labels)
 def calculate_md5(files):
     """Return one MD5 hex digest over the contents of all given files.

     Files are processed in ascending order of their ``name`` attribute and
     their bytes are fed into a single running digest, so the result does
     not depend on the order in which the files were passed in.

     Args:
         files: iterable of objects exposing a ``name`` attribute that is a
             readable file path.

     Returns:
         str: hexadecimal MD5 digest of the concatenated file contents.
     """
     digest = hashlib.md5()
     for upload in sorted(files, key=lambda item: item.name):
         with open(upload.name, "rb") as stream:
             # Read in 4 KiB pieces until the stream is exhausted.
             while True:
                 piece = stream.read(4096)
                 if piece == b"":
                     break
                 digest.update(piece)
     return digest.hexdigest()
def save_pdb_files(files, data_dir="./data"):
    """Save uploaded PDB files to the specified directory.

    The files are copied to ``<data_dir>/<md5>/pdb/`` where ``<md5>`` is the
    value returned by ``calculate_md5(files)``, and the base names of all
    files are written, one per line, to ``<data_dir>/<md5>/pdb_list``.

    Args:
        files: uploaded file objects exposing a ``name`` attribute holding
            the path of the uploaded file.
        data_dir: root directory under which the per-upload folder is created.

    Returns:
        str: ``"No files uploaded"`` when ``files`` is empty or ``None``;
        otherwise a newline-joined log with one ``Saved <name>`` line per
        file followed by a ``Created list file: <path>`` line.

    Raises:
        OSError: if a directory cannot be created or a file cannot be
            read or written.
    """
    if not files:
        return "No files uploaded"

    data_path = Path(data_dir)
    md5_hash = calculate_md5(files)

    # makedirs(exist_ok=True) creates data_dir, the hash folder and the pdb
    # folder in one call; an already-existing tree is fine, while genuine
    # failures (e.g. permissions) surface here instead of being swallowed.
    file_dir = os.path.join(data_path, md5_hash, "pdb")
    os.makedirs(file_dir, exist_ok=True)
    print(f"Created directory: {file_dir}")

    list_file = os.path.join(data_path, md5_hash, "pdb_list")
    filenames = []
    results = []
    for file in files:
        # Only the base name is kept; the upload's directory is discarded.
        original_filename = os.path.basename(file.name)
        filenames.append(original_filename)
        target_path = os.path.join(file_dir, original_filename)
        print(f"Saving to: {target_path}")
        # Both handles are context-managed so the source file is closed
        # promptly; an existing target is overwritten.
        with open(file.name, "rb") as src, open(target_path, "wb") as dst:
            dst.write(src.read())
        results.append(f"Saved {original_filename}")

    # Write list file
    with open(list_file, "w") as f:
        f.write("\n".join(filenames))
    results.append(f"Created list file: {list_file}")
    return "\n".join(results)
 def run_usalign(md5_hash):
     """Run USalign on the uploaded PDB files and return results as DataFrame.

     Looks for the inputs under ``./data/<md5_hash>/pdb`` (structure files)
     and ``./data/<md5_hash>/pdb_list`` (list file), runs
     ``USalignRunner().run_alignment`` on them and parses its standard
     output as tab-separated text.

     Args:
         md5_hash: name of the per-upload folder below ``./data``.

     Returns:
         pandas.DataFrame: the parsed alignment table with whitespace
         stripped from the column names when the return code is 0.
         Otherwise a frame with a single ``"Error"`` column: the captured
         stderr for a non-zero return code, or the exception message
         (followed by stderr, if any was captured) when an exception
         occurred. This function does not raise.
     """
     # Bound before the try block so the except handler can always refer to
     # it, even when the failure happens before run_alignment returns.
     stderr = ""
     try:
         runner = USalignRunner()
         data_path = Path("./data")
         pdb_dir = os.path.join(data_path, md5_hash, "pdb")
         list_file = os.path.join(data_path, md5_hash, "pdb_list")
         print(str(pdb_dir))
         print(str(list_file))
         return_code, stdout, stderr = runner.run_alignment(target_dir=str(pdb_dir), pdb_list_file=str(list_file))
         print(stdout)
         print(stderr)
         if return_code != 0:
             return pd.DataFrame({"Error": [stderr]})
         df = pd.read_csv(StringIO(stdout), sep="\t", encoding=sys.getdefaultencoding())
         # Clean up stray whitespace in column names
         df.columns = [col.strip() for col in df.columns]
         return df
     except Exception as e:
         # Report the message as text rather than the exception object.
         messages = [str(e)]
         if stderr:
             messages.append(stderr)
         return pd.DataFrame({"Error": messages})
+def run_community_analysis(results_df, data_dir, md5_hash, threshold):
     """Run community analysis pipeline and return results."""
     try:
         # Generate TM matrix
         tm_matrix = get_TM_mat_from_df(results_df)
+        tm_file = os.path.join("data", md5_hash, "tm_matrix.csv")
+        newick_file = os.path.join("data", md5_hash, "clustering.newick")
         # network_file = os.path.join("data",md5_hash,"network.svg")
+        network_edges_file = os.path.join("data", md5_hash, "network_cytoscape_export.xlsx")
+        # cluster_file = os.path.join("data", md5_hash, "cluster_assignments.csv")
         with localconverter(ro.default_converter + pandas2ri.converter):
             r_tm_matrix = ro.conversion.py2rpy(tm_matrix)
             result = export_matrix_to_newick_r(r_tm_matrix, newick_file)
             newick_str = result[0]
+            export_similarity_network_r(threshold, r_tm_matrix, network_edges_file)
         # cluster_df.to_csv(cluster_file,index=False)
         # combined_df.to_csv(network_edges_file,index=False)
         # Phylo.write(tree, newick_file, "newick")
         # fig.savefig(network_file, format="svg", bbox_inches="tight")
         # plt.close(fig)
         return {
             "tm_matrix": tm_matrix,
             "newick_str": newick_str,
             # "network_fig": fig,
+            "files": [
                 tm_file,
                 newick_file,
                 # network_file,
                 network_edges_file,
+                # cluster_file,
+            ],
         }
     except Exception as e:
         print("Error", str(e))
         return {"Error": str(e)}