# Spaces: Xue-Jun / StructureBasedSimilarityNetwork
# Sleeping
# File size: 4,835 Bytes

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

# Module-level side effect: switch on rpy2's pandas conversion as soon as this
# module is imported.
pandas2ri.activate()

# Import the required R packages. Each call runs at import time, so the
# corresponding R package presumably has to be installed already -- confirm
# against the deployment environment.
stats = importr("stats")
ape = importr("ape")
# `robject_translations` renames the R symbol ".env" to "_env_" on the Python
# side (presumably to avoid a name clash in the generated wrapper -- confirm).
igraph = importr("igraph", robject_translations={".env": "_env_"})
openxlsx = importr("openxlsx")


def get_r_matrix(df):
    """Convert *df* into its R counterpart through rpy2.

    The conversion runs inside a local converter context that combines
    rpy2's default converter with the pandas converter, so the rules apply
    only for the duration of this call.

    Args:
        df: The Python object to hand over to R (callers presumably pass a
            pandas DataFrame holding the score matrix -- confirm).

    Returns:
        The object produced by ``ro.conversion.py2rpy`` for *df*.
    """
    pandas_aware_converter = ro.default_converter + pandas2ri.converter
    with localconverter(pandas_aware_converter):
        return ro.conversion.py2rpy(df)


# R helper evaluated through rpy2. `ro.r` runs the R source below, which
# defines `convert_to_newick(tm_matrix, output_file)`; the name bound here is
# whatever `ro.r` returns for that definition (presumably the callable R
# function -- confirm against rpy2).
#
# Steps performed by the R function, in order:
#   1. load `ape`, installing it from CRAN first when `require` fails;
#   2. compute a distance matrix with `dist(tm_matrix)` (default settings);
#   3. run hierarchical clustering via `hclust(..., method = "ward.D2")`;
#   4. convert the clustering result to a `phylo` object with `as.phylo`;
#   5. write the tree to `output_file` with `write.tree`, then call
#      `write.tree` again without a file to obtain the Newick text, which is
#      the function's return value.
#
# NOTE: the R source is a runtime string; its inline comments are left as
# written so the evaluated text stays unchanged.
export_matrix_to_newick_r = ro.r(
    """
    convert_to_newick <- function(tm_matrix, output_file) {
        # 导入 ape 包
        if (!require(ape, quietly = TRUE)) {
            install.packages("ape", repos = "https://cran.r-project.org")
            library(ape)
        }
        # 计算距离矩阵
        dist_matrix <- dist(tm_matrix)
        # 层次聚类
        hclust_tree <- hclust(dist_matrix, method = "ward.D2")
        # 转为 phylo 对象
        phylo_tree <- as.phylo(hclust_tree)
        # 导出为 Newick 格式
        write.tree(phylo_tree, file = output_file)
        newick_str <- write.tree(phylo_tree)
        return(newick_str)
    }
    """
)

# R helper evaluated through rpy2. `ro.r` runs the R source below, which
# defines `create_similarity_network_r(threshold, tm_matrix, excel_path)`; the
# name bound here is whatever `ro.r` returns for that definition (presumably
# the callable R function -- confirm against rpy2).
#
# Parameters of the R function:
#   threshold  -- entries of `tm_matrix` that are >= this value become edges.
#   tm_matrix  -- square score table whose row names are used as node names.
#   excel_path -- destination of the workbook written with `saveWorkbook`
#                 (an existing file is overwritten).
#
# Steps performed by the R function, in order:
#   1. load `igraph` and `openxlsx`, installing from CRAN when `require` fails;
#   2. collect every (row, col) pair with a score >= threshold and drop the
#      diagonal (self-loops);
#   3. build a graph with one named vertex per row, add one edge per pair,
#      and collapse it to an undirected graph;
#   4. detect communities with `fastgreedy.community`, order them by size
#      (largest first) and identify members of size-1 communities;
#   5. write an "Edges" sheet: header "Source"/"Target", one row per edge,
#      followed by the singleton node names in the first column;
#   6. return `list(cluster_data = <data.frame protein/cluster_name>,
#      graph = <igraph object>)`, where `cluster_name` is "singleton" for
#      size-1 communities and otherwise the community's rank by size.
#
# Fixes relative to the previous version (edge construction only):
#   * the self-loop filter keeps its matrix shape with `drop = FALSE`; before,
#     a single remaining pair collapsed to a vector and `nrow()` became NULL;
#   * the edge loop iterates over `seq_len(nrow(...))`; before, `1:nrow(...)`
#     expanded to c(1, 0) when no pair passed the threshold and indexing
#     failed with "subscript out of bounds".
# NOTE(review): how `fastgreedy.community` behaves on a graph without any
# edges is not changed here -- verify if that case matters to callers.
# NOTE: the pre-existing inline R comments are left as written; only the two
# fixed statements carry new (English) R comments.
export_similarity_network_r = ro.r(
    """
create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
    # 导入必要的包
    if (!require(igraph, quietly = TRUE)) {
        install.packages("igraph", repos = "https://cran.r-project.org")
        library(igraph)
    }
    if (!require(openxlsx, quietly = TRUE)) {
        install.packages("openxlsx", repos = "https://cran.r-project.org")
        library(openxlsx)
    }
    # 根据相似性阈值创建边缘列表，并过滤掉自环
    overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
    # drop = FALSE keeps a two-column matrix even when 0 or 1 rows remain
    overthresholdedges <- overthresholdedges[overthresholdedges[, 1] != overthresholdedges[, 2], , drop = FALSE]
    # 创建空的图形对象
    graph <- graph.empty()
    # 添加节点
    nodes <- rownames(tm_matrix)
    graph <- add_vertices(graph, nv = length(nodes), name = nodes)
    # 添加边
    # seq_len() yields an empty sequence for zero rows (1:0 would not)
    for (i in seq_len(nrow(overthresholdedges))) {
        graph <- add_edges(graph, c(overthresholdedges[i, 1], overthresholdedges[i, 2]))
    }
    # 转换为无向图
    graph <- as.undirected(graph, mode = "collapse")
    # 计算聚类
    clusters <- fastgreedy.community(graph)
    # 获取每个聚类的大小
    cluster_sizes <- sizes(clusters)
    # 按聚类大小降序排序
    sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
    # 获取每个聚类的成员
    cluster_members <- membership(clusters)
    # 找到孤立节点
    singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])
    # 创建Cytoscape导出文件
    cytoscape_export <- createWorkbook()
    # 创建边Sheet
    addWorksheet(cytoscape_export, sheetName = "Edges")
    writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
    writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
    # 获取边列表
    edges <- get.edgelist(graph)
    # 填充边Sheet数据
    if (nrow(edges) > 0) {
        writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
        writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
    }    
    # 找到当前边Sheet的最后一行
    last_edge_row <- nrow(edges) + 1
    # 添加孤立节点
    if (length(singleton_nodes) > 0) {
        writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
    }
    # 保存Excel文件
    saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)  
    # 创建一个空的数据框用于储存节点和聚类信息
    export_clusters <- data.frame(protein = character(), cluster_name = character(), stringsAsFactors = FALSE)
    # 遍历 sorted_clusters
    cluster_index <- 1  # 初始化簇索引
    for (cluster_name in names(sorted_clusters)) {
        proteins <- sorted_clusters[[cluster_name]]
        # 将每个 protein 和对应的 cluster_name 添加到数据框
        for (protein in proteins) {
            # 检查 protein 是否在 singleton_nodes 中
            if (protein %in% singleton_nodes) {
                current_cluster_name <- "singleton"  # 修改为 "singleton"
            } else {
                current_cluster_name <- as.character(cluster_index)  # 使用簇索引
            }
            export_clusters <- rbind(export_clusters, data.frame(protein = protein, cluster_name = current_cluster_name))
        }
        cluster_index <- cluster_index + 1  # 索引加1
    }        
    # 返回聚类结果
    return(list(cluster_data = export_clusters, graph = graph))
}
"""
)