File size: 4,835 Bytes
9b9c66d
4a4b152
 
 
9b9c66d
 
 
 
4a4b152
 
 
 
 
9b9c66d
 
 
 
 
 
 
4a4b152
 
9b9c66d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a4b152
 
9b9c66d
4a4b152
 
 
9b9c66d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a4b152
9b9c66d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a4b152
9b9c66d
 
 
4a4b152
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

# Switch on rpy2's pandas conversion at module import time.
# NOTE(review): activate() is reported as deprecated in newer rpy2 releases in
# favour of localconverter(); confirm against the rpy2 version this project pins.
pandas2ri.activate()

# Import the required R packages
stats = importr("stats")
ape = importr("ape")
# robject_translations gives the R symbol `.env` the explicit Python name `_env_`
# (presumably to avoid a name clash while importing igraph — confirm).
igraph = importr("igraph", robject_translations={".env": "_env_"})
openxlsx = importr("openxlsx")


def get_r_matrix(df):
    """Convert ``df`` into its R representation through rpy2.

    The conversion runs inside a local converter context that combines rpy2's
    default rules with the pandas rules, so it does not depend on any globally
    activated converter.

    Args:
        df: The Python object to convert (presumably a pandas DataFrame —
            confirm against callers).

    Returns:
        The object produced by ``ro.conversion.py2rpy`` for ``df``.
    """
    pandas_aware_rules = ro.default_converter + pandas2ri.converter
    with localconverter(pandas_aware_rules):
        return ro.conversion.py2rpy(df)


# R helper `convert_to_newick(tm_matrix, output_file)`, exposed to Python as
# `export_matrix_to_newick_r`.
#   tm_matrix   - numeric matrix / data frame; its rows are the items clustered.
#   output_file - path the Newick text is written to (overwritten).
# Returns the Newick string that was written.
export_matrix_to_newick_r = ro.r(
    """
    convert_to_newick <- function(tm_matrix, output_file) {
        # Load ape, installing it from CRAN on first use if it is missing
        if (!require(ape, quietly = TRUE)) {
            install.packages("ape", repos = "https://cran.r-project.org")
            library(ape)
        }
        # Pairwise distances between the rows of tm_matrix (dist() defaults)
        dist_matrix <- dist(tm_matrix)
        # Hierarchical clustering with the ward.D2 linkage
        hclust_tree <- hclust(dist_matrix, method = "ward.D2")
        # Convert the clustering result to an ape phylo object
        phylo_tree <- as.phylo(hclust_tree)
        # Serialise the tree once, then write that same string to disk
        # (mirrors what write.tree(file = ...) does internally)
        newick_str <- write.tree(phylo_tree)
        cat(newick_str, file = output_file, sep = "\\n")
        newick_str
    }
    """
)

# R helper `create_similarity_network_r(threshold, tm_matrix, excel_path)`,
# exposed to Python as `export_similarity_network_r`.
#   threshold  - entries of tm_matrix >= threshold become edges.
#   tm_matrix  - square pairwise matrix; row names are used as node labels.
#   excel_path - .xlsx file to write (overwritten); sheet "Edges" holds the
#                Source/Target columns followed by the singleton nodes.
# Returns list(cluster_data = <data.frame protein, cluster_name>, graph = <igraph>).
export_similarity_network_r = ro.r(
    """
create_similarity_network_r <- function(threshold, tm_matrix, excel_path) {
    # Load igraph and openxlsx, installing from CRAN on first use if missing
    if (!require(igraph, quietly = TRUE)) {
        install.packages("igraph", repos = "https://cran.r-project.org")
        library(igraph)
    }
    if (!require(openxlsx, quietly = TRUE)) {
        install.packages("openxlsx", repos = "https://cran.r-project.org")
        library(openxlsx)
    }
    # Node labels come from the row names; fall back to row indices so the
    # graph still gets one vertex per row when the matrix is unnamed
    nodes <- rownames(tm_matrix)
    if (is.null(nodes)) {
        nodes <- as.character(seq_len(nrow(tm_matrix)))
    }
    # (row, col) index pairs at or above the threshold, without the diagonal.
    # drop = FALSE keeps a two-column matrix even when 0 or 1 pair survives;
    # without it a single pair collapses to a vector and nrow() becomes NULL.
    edge_pairs <- which(tm_matrix >= threshold, arr.ind = TRUE)
    edge_pairs <- edge_pairs[edge_pairs[, 1] != edge_pairs[, 2], , drop = FALSE]
    # Start from an empty directed graph and add one named vertex per node
    graph <- make_empty_graph(n = 0, directed = TRUE)
    graph <- add_vertices(graph, nv = length(nodes), name = nodes)
    # Add every edge in one call; add_edges() expects the flattened sequence
    # from1, to1, from2, to2, ... which is the row-wise reading of edge_pairs
    if (nrow(edge_pairs) > 0) {
        graph <- add_edges(graph, as.vector(t(edge_pairs)))
    }
    # Collapse to an undirected graph with one edge per connected pair.
    # as.undirected() is kept (rather than as_undirected()) because the
    # underscore name only exists in recent igraph releases.
    graph <- as.undirected(graph, mode = "collapse")
    # Community detection
    clusters <- cluster_fast_greedy(graph)
    # Size of every community
    cluster_sizes <- sizes(clusters)
    # Communities ordered from largest to smallest (named list of members)
    sorted_clusters <- clusters[order(cluster_sizes, decreasing = TRUE)]
    # Community id of every node
    cluster_members <- membership(clusters)
    # Nodes that sit alone in a community of size one
    singleton_nodes <- names(cluster_members[cluster_members %in% which(cluster_sizes == 1)])
    # Workbook for the Cytoscape export
    cytoscape_export <- createWorkbook()
    # "Edges" sheet with its two header cells
    addWorksheet(cytoscape_export, sheetName = "Edges")
    writeData(cytoscape_export, sheet = "Edges", x = "Source", startCol = 1, startRow = 1)
    writeData(cytoscape_export, sheet = "Edges", x = "Target", startCol = 2, startRow = 1)
    # Edge list as vertex names (the graph carries a name attribute)
    edges <- as_edgelist(graph, names = TRUE)
    # Fill in the edge rows below the header
    if (nrow(edges) > 0) {
        writeData(cytoscape_export, sheet = "Edges", x = edges[, 1], startCol = 1, startRow = 2)
        writeData(cytoscape_export, sheet = "Edges", x = edges[, 2], startCol = 2, startRow = 2)
    }
    # Last row currently used on the sheet (header + edges)
    last_edge_row <- nrow(edges) + 1
    # Append the singleton nodes in the Source column after the edges
    if (length(singleton_nodes) > 0) {
        writeData(cytoscape_export, sheet = "Edges", x = singleton_nodes, startCol = 1, startRow = last_edge_row + 1)
    }
    # Save the Excel file
    saveWorkbook(cytoscape_export, excel_path, overwrite = TRUE)
    # Build the node -> cluster table in one step instead of rbind() in a loop.
    # Clusters are numbered by their rank in sorted_clusters; clusters with a
    # single member are labelled "singleton" instead of their rank.
    members_per_cluster <- unname(lengths(sorted_clusters))
    cluster_labels <- ifelse(
        members_per_cluster == 1,
        "singleton",
        as.character(seq_along(sorted_clusters))
    )
    export_clusters <- data.frame(
        protein = as.character(unlist(sorted_clusters, use.names = FALSE)),
        cluster_name = as.character(rep(cluster_labels, times = members_per_cluster)),
        stringsAsFactors = FALSE
    )
    # Return the clustering table together with the graph
    list(cluster_data = export_clusters, graph = graph)
}
"""
)