# Source: Hugging Face Space "Xue-Jun/StructureBasedSimilarityNetwork"
# (Space status when captured: Sleeping; file size: 5,356 bytes; revision 9b9c66d).

import pandas as pd
import numpy as np
from rpy2.robjects import pandas2ri, r, Formula
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import StrVector, FloatVector, IntVector
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
import os


# Turn on rpy2's pandas conversion at import time (module-level side effect).
pandas2ri.activate()

# Import the required R packages.
stats = importr('stats')
ape = importr('ape')
# robject_translations exposes igraph's R object `.env` under the Python name
# `_env_` (presumably to avoid a name clash during import -- confirm against
# the rpy2 `importr` documentation).
igraph = importr('igraph', robject_translations={'.env': '_env_'})
openxlsx = importr('openxlsx')
# dplyr = importr('dplyr')

def get_r_matrix(df):
    """Convert ``df`` into its R-side representation through rpy2.

    The conversion is performed inside a local converter context built from
    rpy2's default converter combined with the pandas converter.

    Args:
        df: The Python object to convert; presumably a pandas DataFrame
            holding a similarity matrix (verify against callers).

    Returns:
        The object produced by ``ro.conversion.py2rpy(df)``.
    """
    pandas_aware_converter = ro.default_converter + pandas2ri.converter
    with localconverter(pandas_aware_converter):
        return ro.conversion.py2rpy(df)


# Evaluate an R snippet that defines `convert_to_newick(tm_matrix, output_file)`
# in the embedded R session and bind the result of that evaluation (the R
# function) to `export_matrix_to_newick_r` so it can be called from Python.
#
# What the R function does, step by step:
#   1. Loads the `ape` package; if it cannot be loaded, installs it from CRAN
#      and loads it.
#   2. Computes `dist(tm_matrix)` with dist()'s default settings.
#   3. Runs hierarchical clustering with `hclust(..., method = "ward.D2")`.
#   4. Converts the clustering result to a phylo object with `as.phylo`.
#   5. Writes the tree in Newick format to `output_file` and also returns the
#      Newick text as a string.
#
# NOTE: the comments inside the string are part of the string literal handed to
# R, so they are deliberately left unchanged here.
export_matrix_to_newick_r = ro.r("""
    convert_to_newick <- function(tm_matrix, output_file) {
        # 导入 ape 包
        if (!require(ape, quietly = TRUE)) {
            install.packages("ape", repos = "https://cran.r-project.org")
            library(ape)
        }
        
        # 计算距离矩阵
        dist_matrix <- dist(tm_matrix)
        
        # 层次聚类
        hclust_tree <- hclust(dist_matrix, method = "ward.D2")
        
        # 转为 phylo 对象
        phylo_tree <- as.phylo(hclust_tree)
        
        # 导出为 Newick 格式
        write.tree(phylo_tree, file = output_file)
        
        newick_str <- write.tree(phylo_tree)
        
        return(newick_str)
    }
    """)

# Evaluate an R snippet that defines
# `create_similarity_network_r(threshold, tm_matrix, excel_path, csv_path)` in
# the embedded R session and bind the resulting R function to
# `export_similarity_network_r` so it can be called from Python.
#
# R function contract:
#   threshold  - similarity cut-off; a pair (i, j) becomes an edge when
#                tm_matrix[i, j] >= threshold and i != j.
#   tm_matrix  - similarity table whose row names are the node names; row and
#                column indices are both used as vertex ids, so it is assumed
#                to be square with identical row/column order (confirm with
#                callers).
#   excel_path - where the Cytoscape "Edges" workbook is written.
#   csv_path   - where the protein/cluster table is written.
#   returns    - list(cluster_data = <data.frame protein, cluster_name>,
#                     graph = <igraph graph>).
#
# Side effects: writes `excel_path`, `csv_path`, and an extra copy of the
# workbook named "structure_based_similarity_network_cytoscape_export.xlsx" in
# R's current working directory.
export_similarity_network_r = ro.r("""
create_similarity_network_r <- function(threshold, tm_matrix, excel_path, csv_path) {
    # Load the required packages, installing them from CRAN when missing.
    if (!require(igraph, quietly = TRUE)) {
        install.packages("igraph", repos = "https://cran.r-project.org")
        library(igraph)
    }
    if (!require(openxlsx, quietly = TRUE)) {
        install.packages("openxlsx", repos = "https://cran.r-project.org")
        library(openxlsx)
    }

    # Collect the (row, col) index pairs that reach the threshold and discard
    # self-loops. drop = FALSE keeps a two-column matrix even when zero or one
    # pair survives; without it the result collapses to a plain vector and
    # nrow() would return NULL.
    overthresholdedges <- which(tm_matrix >= threshold, arr.ind = TRUE)
    is_not_self_loop <- overthresholdedges[, 1] != overthresholdedges[, 2]
    overthresholdedges <- overthresholdedges[is_not_self_loop, , drop = FALSE]

    # Start from an empty graph and add one named vertex per matrix row.
    graph <- graph.empty()
    nodes <- rownames(tm_matrix)
    graph <- add_vertices(graph, nv = length(nodes), name = nodes)

    # Add every edge in a single call. as.vector(t(m)) flattens the matrix row
    # by row, giving from1, to1, from2, to2, ... as add_edges() expects. The
    # guard skips the call when no pair passed the threshold (the previous
    # 1:nrow() loop iterated over c(1, 0) in that case and failed).
    if (nrow(overthresholdedges) > 0) {
        graph <- add_edges(graph, as.vector(t(overthresholdedges)))
    }

    # Convert to an undirected graph, collapsing the (i, j) / (j, i) duplicates.
    graph <- as.undirected(graph, mode = "collapse")

    # Detect communities.
    clusters <- fastgreedy.community(graph)

    # Size of each community.
    cluster_sizes <- sizes(clusters)

    # Communities ordered from largest to smallest.
    sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]

    # Community id of every vertex.
    cluster_members <- membership(clusters)

    # Vertices that sit alone in a community of size one.
    singleton_nodes <- names(cluster_members[cluster_members %in% which(sizes(clusters) == 1)])

    # Build the workbook used for the Cytoscape export.
    cytoscape_export <- createWorkbook()

    # "Edges" sheet with a Source / Target header row.
    addWorksheet(cytoscape_export, sheetName = "Edges")
    writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
    writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)

    # Edge list of the undirected graph.
    edges <- get.edgelist(graph)

    # Fill the sheet with the edge endpoints, starting below the header.
    if (nrow(edges) > 0) {
        writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 1]]$name, startCol = 1, startRow = 2)
        writeData(cytoscape_export, sheet = "Edges", x = V(graph)[edges[, 2]]$name, startCol = 2, startRow = 2)
    }

    # Last row occupied by the header plus the edges.
    last_edge_row <- nrow(edges) + 1

    # Append singleton nodes (Source column only) right after the edges.
    if (length(singleton_nodes) > 0) {
        writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
    }

    # Save the workbook to the requested path.
    saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)

    # Also save a copy under a fixed name in the working directory; kept
    # because existing callers may rely on this file being present.
    saveWorkbook(cytoscape_export, "structure_based_similarity_network_cytoscape_export.xlsx", overwrite = TRUE)

    # Build the protein -> cluster table in one vectorised step instead of
    # growing a data frame with rbind() inside a loop. Clusters are numbered by
    # their position in sorted_clusters (1 = largest); members of size-one
    # communities are labelled "singleton" instead of a number.
    proteins <- as.character(unlist(sorted_clusters, use.names = FALSE))
    cluster_index <- rep(seq_along(sorted_clusters), as.vector(lengths(sorted_clusters)))
    cluster_name <- as.character(cluster_index)
    cluster_name[proteins %in% singleton_nodes] <- "singleton"
    export_clusters <- data.frame(protein = proteins, cluster_name = cluster_name, stringsAsFactors = FALSE)

    write.csv(export_clusters, csv_path, row.names = FALSE, quote = TRUE)
    # Return the cluster table and the graph.
    return(list(cluster_data = export_clusters, graph = graph))
}
""")