Yuanclaw / modules /differential_analysis.R
huashu's picture
Export YuanSeq to Hugging Face without binary assets
7e6a9d1
# =====================================================
# 差异分析模块
# =====================================================
differential_analysis_server <- function(input, output, session) {
# 获取数据输入模块的函数
data_input <- data_input_server(input, output, session)
# --- 样本数量显示 ---
output$sample_count_display <- renderUI({
req(data_input$raw_data(), input$control_group, input$treat_group)
# 获取组别信息
groups_list <- list(
Control = input$control_group,
Treatment = input$treat_group
)
# 计算每组样本数
group_counts <- sapply(groups_list, length)
min_replicates <- min(group_counts)
# 确定分析方法
if (min_replicates >= 3) {
method_text <- "limma-voom (样本充足)"
method_color <- "success"
} else {
method_text <- "edgeR (样本较少)"
method_color <- "warning"
}
tagList(
tags$div(class = "alert alert-info",
tags$h5("📊 样本统计与对比设置"),
tags$p(tags$strong("对照组 (Control): "), group_counts["Control"], " 个样本"),
tags$p(tags$strong("处理组 (Treatment): "), group_counts["Treatment"], " 个样本"),
tags$p(tags$strong("最小重复数: "), min_replicates),
tags$hr(),
tags$p(tags$strong("分析方法: "),
tags$span(class = paste0("text-", method_color), method_text)),
tags$p(class = "text-muted small",
"规则: 每组样本数≥3时使用limma-voom,<3时使用edgeR"),
tags$hr(style = "margin-top: 10px; margin-bottom: 10px;"),
tags$p(tags$strong("🔄 对比方向: "),
tags$span(class = "text-primary", "Treatment vs Control"),
tags$br(),
tags$small(class = "text-muted",
"log2FC > 0: 基因在处理组中上调表达",
tags$br(),
"log2FC < 0: 基因在处理组中下调表达")
)
)
)
})
# --- 差异分析函数 ---
perform_differential_analysis <- function(df_use, group, min_replicates) {
# 数据预处理
dge <- DGEList(counts = df_use, group = group)
dge <- calcNormFactors(dge)
# 过滤低表达基因
keep <- filterByExpr(dge)
dge <- dge[keep, , keep.lib.sizes=FALSE]
if (min_replicates >= 3) {
# limma-voom 分析流程
design <- model.matrix(~0 + group)
colnames(design) <- levels(group)
v <- voom(dge, design, plot = FALSE)
fit <- lmFit(v, design)
# 设置对比(使用字符串引用design矩阵的列名)
cm <- makeContrasts(
Treatment_vs_Control = Treatment - Control,
levels = design
)
fit2 <- contrasts.fit(fit, cm)
fit2 <- eBayes(fit2)
# 获取结果
res <- topTable(fit2, coef = "Treatment_vs_Control", number = Inf)
res$GeneID <- rownames(res)
res <- res %>%
dplyr::rename(
log2FoldChange = logFC,
pvalue = P.Value, # limma的原始p值
padj = adj.P.Val, # limma的BH校正p值
t_stat = t # moderated t统计量
)
# 验证关键列是否存在
required_cols <- c("log2FoldChange", "pvalue", "padj")
missing_cols <- setdiff(required_cols, colnames(res))
if (length(missing_cols) > 0) {
stop(sprintf("limma-voom结果缺少必要列: %s。现有列: %s",
paste(missing_cols, collapse = ", "),
paste(colnames(res), collapse = ", ")))
}
} else {
# edgeR 分析流程
# 明确指定对比方向:Treatment vs Control
# edgeR的exactTest默认对比是第2个水平 vs 第1个水平
# 由于我们设置了levels = c("Control", "Treatment"),所以默认就是Treatment vs Control
if (min_replicates > 1) {
# 有重复时估计离散度
dge <- estimateDisp(dge)
# 明确指定对比:Treatment vs Control
et <- exactTest(dge, pair = c("Control", "Treatment"))
} else {
# 无重复时使用固定离散度
user_disp_sqrt <- 0.1 # 可以通过UI设置
# 明确指定对比:Treatment vs Control
et <- exactTest(dge, pair = c("Control", "Treatment"), dispersion = user_disp_sqrt^2)
}
# 获取结果
res <- topTags(et, n = Inf)$table
res$GeneID <- rownames(res)
res <- res %>%
dplyr::rename(
log2FoldChange = logFC,
pvalue = PValue,
padj = FDR # edgeR的topTags返回FDR列(Benjamini-Hochberg校正)
)
# 验证关键列是否存在
required_cols <- c("log2FoldChange", "pvalue", "padj")
missing_cols <- setdiff(required_cols, colnames(res))
if (length(missing_cols) > 0) {
stop(sprintf("edgeR结果缺少必要列: %s。现有列: %s",
paste(missing_cols, collapse = ", "),
paste(colnames(res), collapse = ", ")))
}
}
# 添加基础均值(使用标准化后的CPM值)
res$baseMean <- rowMeans(edgeR::cpm(dge, log = FALSE, prior.count = 1))
# 添加logCPM
res$logCPM <- edgeR::cpm(dge, log = TRUE, prior.count = 1) %>%
rowMeans()
return(res)
}
# --- 执行差异分析 ---
deg_results <- eventReactive(input$analyze, {
req(data_input$raw_data(), input$control_group, input$treat_group)
# 获取数据
df <- data_input$raw_data()
ctrl <- input$control_group
trt <- input$treat_group
# 验证输入
if (length(ctrl) == 0 || length(trt) == 0) {
showNotification("请至少选择一个对照组和处理组样本", type = "error")
return(NULL)
}
# 检查样本重叠
if (length(intersect(ctrl, trt)) > 0) {
showNotification("对照组和处理组不能有重叠样本", type = "error")
return(NULL)
}
# 准备数据
df_use <- df[, c(ctrl, trt)]
group <- factor(c(rep("Control", length(ctrl)),
rep("Treatment", length(trt))),
levels = c("Control", "Treatment"))
min_replicates <- min(length(ctrl), length(trt))
# 执行差异分析
tryCatch({
res <- perform_differential_analysis(df_use, group, min_replicates)
# === LogFC方向验证和提示 ===
# 计算上调和下调基因数量,用于方向验证
n_up <- sum(res$log2FoldChange > 0, na.rm = TRUE)
n_down <- sum(res$log2FoldChange < 0, na.rm = TRUE)
n_total <- n_up + n_down
# 计算显著性基因数量
n_significant <- sum(res$padj < input$pval_cutoff & abs(res$log2FoldChange) > input$log2fc_cutoff, na.rm = TRUE)
n_up_sig <- sum(res$padj < input$pval_cutoff & res$log2FoldChange > input$log2fc_cutoff, na.rm = TRUE)
n_down_sig <- sum(res$padj < input$pval_cutoff & res$log2FoldChange < -input$log2fc_cutoff, na.rm = TRUE)
# p值统计
pval_stats <- summary(res$pvalue)
padj_stats <- summary(res$padj)
# 显示对比方向和统计信息
cat("\n========== 差异分析结果摘要 ==========\n")
cat("分析方法:", if(min_replicates >= 3) "limma-voom" else "edgeR", "\n")
cat("对比组别: Treatment vs Control\n")
cat("含义: 相对于Control组,Treatment组的基因表达变化\n")
cat("----------------------------------------\n")
cat("P值校正方法: Benjamini-Hochberg (BH FDR)\n")
cat("筛选阈值: padj <", input$pval_cutoff, "且 |log2FC| >", input$log2fc_cutoff, "\n")
cat("----------------------------------------\n")
cat("总体分布:\n")
cat(sprintf(" 上调基因 (log2FC > 0): %d (%.1f%%)\n", n_up, 100*n_up/n_total))
cat(sprintf(" 下调基因 (log2FC < 0): %d (%.1f%%)\n", n_down, 100*n_down/n_total))
cat("----------------------------------------\n")
cat("显著差异基因:\n")
cat(sprintf(" 总计: %d\n", n_significant))
cat(sprintf(" 上调: %d\n", n_up_sig))
cat(sprintf(" 下调: %d\n", n_down_sig))
cat("----------------------------------------\n")
cat("P值分布:\n")
cat(sprintf(" 最小值: %.2e\n", pval_stats["Min."]))
cat(sprintf(" 中位数: %.2e\n", pval_stats["Median"]))
cat(sprintf(" 最大值: %.2e\n", pval_stats["Max."]))
cat("校正P值 (FDR) 分布:\n")
cat(sprintf(" 最小值: %.2e\n", padj_stats["Min."]))
cat(sprintf(" 中位数: %.2e\n", padj_stats["Median"]))
cat(sprintf(" 最大值: %.2e\n", padj_stats["Max."]))
cat("========================================\n\n")
# 添加差异状态
res$Status <- ifelse(
res$padj < input$pval_cutoff & abs(res$log2FoldChange) > input$log2fc_cutoff,
ifelse(res$log2FoldChange > 0, "Up", "Down"),
"Not DE"
)
# 添加t统计量(如果没有从topTable获取到)
if (!"t_stat" %in% colnames(res)) {
# 对于edgeR结果,使用近似t统计量
res$t_stat <- qnorm(1 - res$pvalue/2) * sign(res$log2FoldChange)
}
# 基因注释
anno <- data_input$annotate_genes(res$GeneID, input$species_select)
if (!is.null(anno)) {
# 改进的GeneID清理逻辑 - 保留Ensembl ID的版本号
clean_geneid <- res$GeneID
clean_geneid <- trimws(clean_geneid)
clean_geneid <- gsub("[\t\n\r]", "", clean_geneid)
# 对于非Ensembl ID,才移除特殊字符
non_ensembl <- !grepl("^ENS", clean_geneid, ignore.case = TRUE)
clean_geneid[non_ensembl] <- gsub("[^[:alnum:]]", "", clean_geneid[non_ensembl])
# 清理anno中的列名
anno_clean <- anno
if ("SYMBOL" %in% colnames(anno_clean)) {
# 使用与原始数据相同的清理逻辑
anno_clean$SYMBOL_CLEAN <- gsub("[^[:alnum:]]", "", anno_clean$SYMBOL)
if (input$species_select == "Mm") {
anno_clean$SYMBOL_CLEAN <- sapply(anno_clean$SYMBOL_CLEAN, function(x) {
if (grepl("^[A-Za-z]", x) && nchar(x) > 0) {
paste0(toupper(substr(x, 1, 1)), tolower(substr(x, 2, nchar(x))))
} else {
x
}
}, USE.NAMES = FALSE)
} else {
anno_clean$SYMBOL_CLEAN <- toupper(anno_clean$SYMBOL_CLEAN)
}
}
# 初始化结果列
if (!"SYMBOL" %in% colnames(res)) res$SYMBOL <- res$GeneID
if (!"ENTREZID" %in% colnames(res)) res$ENTREZID <- NA
# 🔥 修复:记录原始GeneID,用于后续显示
res$Original_GeneID <- res$GeneID
# 🔥 新的匹配策略:优先使用ENSEMBL列匹配Ensembl ID
cat("开始SYMBOL匹配流程...\n")
cat("注释数据列名:", paste(colnames(anno_clean), collapse=", "), "\n")
if ("ENSEMBL" %in% colnames(anno_clean)) {
# 检查哪些基因是Ensembl ID
is_ensembl_id <- grepl("^ENS", clean_geneid, ignore.case = TRUE)
n_ensembl <- sum(is_ensembl_id)
cat("发现", n_ensembl, "个Ensembl ID格式的基因\n")
if (any(is_ensembl_id)) {
# 调试:显示前5个Ensembl ID
ensembl_ids <- clean_geneid[is_ensembl_id]
cat("前5个Ensembl ID:", paste(head(ensembl_ids, 5), collapse=", "), "\n")
cat("注释数据库中ENSEMBL列数量:", length(anno_clean$ENSEMBL), "\n")
# 直接通过ENSEMBL列匹配
ensembl_match_idx <- match(clean_geneid[is_ensembl_id], anno_clean$ENSEMBL)
matched_ensembl <- !is.na(ensembl_match_idx)
n_matched <- sum(matched_ensembl)
cat("通过ENSEMBL列匹配成功:", n_matched, "/", n_ensembl, "个基因\n")
if (any(matched_ensembl)) {
ensembl_indices <- which(is_ensembl_id)
indices_to_update <- ensembl_indices[matched_ensembl]
res$SYMBOL[indices_to_update] <- anno_clean$SYMBOL[ensembl_match_idx[matched_ensembl]]
res$ENTREZID[indices_to_update] <- anno_clean$ENTREZID[ensembl_match_idx[matched_ensembl]]
}
# 🔥 调试:显示未匹配的基因
if (n_matched < n_ensembl) {
unmatched_ids <- clean_geneid[is_ensembl_id][!matched_ensembl]
cat("警告:", sum(!matched_ensembl), "个Ensembl ID未能匹配\n")
cat("未匹配示例(前5个):", paste(head(unmatched_ids, 5), collapse=", "), "\n")
}
}
} else {
cat("错误:注释数据中没有ENSEMBL列!\n")
}
# 第一步:尝试SYMBOL匹配(对于非Ensembl ID或未匹配的基因)
if ("SYMBOL" %in% colnames(anno_clean)) {
# 只对尚未匹配ENTREZID的基因尝试SYMBOL匹配
unmatched <- is.na(res$ENTREZID) | res$ENTREZID == ""
if (any(unmatched)) {
match_idx <- match(clean_geneid[unmatched], anno_clean$SYMBOL_CLEAN)
matched_genes <- !is.na(match_idx)
if (any(matched_genes)) {
unmatched_indices <- which(unmatched)
indices_to_update <- unmatched_indices[matched_genes]
res$SYMBOL[indices_to_update] <- anno_clean$SYMBOL[match_idx[matched_genes]]
res$ENTREZID[indices_to_update] <- anno_clean$ENTREZID[match_idx[matched_genes]]
cat("通过SYMBOL列匹配成功:", sum(matched_genes), "个基因\n")
}
}
# 🔥 第二步:对于仍未匹配的基因,尝试使用ENTREZID反向查询
still_unmatched <- is.na(res$ENTREZID) | res$ENTREZID == ""
if (any(still_unmatched)) {
# 尝试通过ENTREZID匹配
entrez_match_idx <- match(res$GeneID[still_unmatched], anno_clean$ENTREZID)
matched_entrez <- !is.na(entrez_match_idx)
if (any(matched_entrez)) {
unmatched_indices <- which(still_unmatched)
indices_to_update <- unmatched_indices[matched_entrez]
res$SYMBOL[indices_to_update] <- anno_clean$SYMBOL[entrez_match_idx[matched_entrez]]
res$ENTREZID[indices_to_update] <- anno_clean$ENTREZID[entrez_match_idx[matched_entrez]]
cat("通过ENTREZID反向匹配成功:", sum(matched_entrez), "个基因\n")
}
}
}
}
# 确保有SYMBOL列
if (!"SYMBOL" %in% colnames(res)) res$SYMBOL <- res$GeneID
if (!"ENTREZID" %in% colnames(res)) res$ENTREZID <- NA
# 过滤假基因
res <- data_input$filter_pseudo_genes(res)
# 改进的基因去重逻辑 - 保留统计显著性最高的基因
if (any(duplicated(res$SYMBOL))) {
# 按照p值和log2FoldChange的显著性排序
res <- res %>%
dplyr::arrange(SYMBOL, padj, abs(log2FoldChange)) %>%
dplyr::distinct(SYMBOL, .keep_all = TRUE)
# 记录去重信息
n_duplicates <- sum(duplicated(res$SYMBOL))
if (n_duplicates > 0) {
cat(sprintf("移除了 %d 个重复的基因记录\n", n_duplicates))
}
}
return(res)
}, error = function(e) {
showNotification(paste("差异分析失败:", e$message), type = "error")
return(NULL)
})
})
# 增强的列映射函数
enhanced_column_mapping <- function(df) {
cat("检查上传的差异基因文件列结构...\n")
cat("原始列名:", paste(colnames(df), collapse = ", "), "\n")
# 可能的列名映射
column_mappings <- list(
log2FoldChange = c("log2FoldChange", "log2FC", "avg_log2FC", "logFC", "log2_fold_change", "log2fc", "log2fc_adj"),
pvalue = c("pvalue", "p_val", "p.value", "P.Value", "pvalue_adj"),
padj = c("padj", "p_val_adj", "p_adj", "adj.P.Val", "pvalue_adj", "FDR"),
GeneID = c("GeneID", "gene", "Gene", "SYMBOL", "symbol", "gene_symbol", "ensembl", "ENSEMBL")
)
# 检查并重命名列
for (target_col in names(column_mappings)) {
possible_names <- column_mappings[[target_col]]
found <- FALSE
for (col_name in possible_names) {
if (col_name %in% colnames(df)) {
if (col_name != target_col) {
cat(" 重命名列:", col_name, "->", target_col, "\n")
colnames(df)[colnames(df) == col_name] <- target_col
} else {
cat(" 找到列:", target_col, "\n")
}
found <- TRUE
break
}
}
if (!found) {
cat(" ⚠️ 缺失列:", target_col, "\n")
}
}
# 确保log2FoldChange是数值类型
if ("log2FoldChange" %in% colnames(df)) {
if (!is.numeric(df$log2FoldChange)) {
cat(" 转换log2FoldChange为数值类型\n")
original_type <- class(df$log2FoldChange)[1]
df$log2FoldChange <- as.numeric(as.character(df$log2FoldChange))
n_na <- sum(is.na(df$log2FoldChange))
if (n_na > 0) {
warning(sprintf("log2FoldChange列从%s转换为数值时产生了%d个NA值", original_type, n_na))
}
}
}
# 确保pvalue和padj是数值类型
for (col in c("pvalue", "padj")) {
if (col %in% colnames(df)) {
if (!is.numeric(df[[col]])) {
cat(" 转换", col, "为数值类型\n")
original_type <- class(df[[col]])[1]
df[[col]] <- as.numeric(as.character(df[[col]]))
n_na <- sum(is.na(df[[col]]))
if (n_na > 0) {
warning(sprintf("%s列从%s转换为数值时产生了%d个NA值", col, original_type, n_na))
}
}
}
}
return(df)
}
# --- 加载差异基因结果 ---
deg_results_from_file <- eventReactive(input$load_deg, {
req(data_input$deg_file_data())
showNotification("正在加载差异基因结果...", type = "message")
df <- data_input$deg_file_data()
cat("上传的文件列名:", paste(colnames(df), collapse = ", "), "\n")
# 应用增强的列映射
df <- enhanced_column_mapping(df)
# 检查必要的列是否存在
required_cols <- c("pvalue", "log2FoldChange")
missing_cols <- setdiff(required_cols, colnames(df))
if (length(missing_cols) > 0) {
showNotification(paste("缺少必要的列:", paste(missing_cols, collapse = ", ")), type = "error")
showNotification("请确保上传的文件包含pvalue和log2FoldChange列,或使用以下列名之一:", type = "warning")
showNotification("log2FoldChange: log2FC, avg_log2FC, logFC, log2_fold_change, log2fc, log2fc_adj", type = "message")
showNotification("pvalue: p_val, p.value, P.Value, pvalue_adj", type = "message")
return(NULL)
}
# 确保有padj列,如果没有则使用pvalue(但会标记警告)
if (!"padj" %in% colnames(df)) {
df$padj <- df$pvalue
showNotification("⚠️ 警告:未找到校正p值(padj/FDR)列,将使用原始p值代替。", type = "warning")
showNotification("建议:差异分析结果应包含多重假设检验校正后的p值。", type = "message")
# 添加标记列,以便后续分析知道这是未校正的数据
df$using_unadjusted_pval <- TRUE
} else {
df$using_unadjusted_pval <- FALSE
}
# 重命名列以匹配内部格式
res <- df
# 确保所有必要列都存在
if (!"GeneID" %in% colnames(res)) {
if ("SYMBOL" %in% colnames(res)) {
res$GeneID <- res$SYMBOL
} else {
# 如果都没有,使用行名
res$GeneID <- rownames(res)
}
}
# 添加缺失的列
if (!"baseMean" %in% colnames(res)) res$baseMean <- 1
if (!"logCPM" %in% colnames(res)) res$logCPM <- 0
# 确保SYMBOL列存在
if (!"SYMBOL" %in% colnames(res)) {
res$SYMBOL <- res$GeneID
}
# --- 差异状态判断 ---
pval_col <- if(input$deg_pval_type == "p_val_adj") "padj" else "pvalue"
res$Status <- ifelse(res[[pval_col]] < input$deg_pval_cutoff & abs(res$log2FoldChange) > input$deg_log2fc_cutoff,
ifelse(res$log2FoldChange > 0, "Up", "Down"), "Not DE")
# TF 活性分析需要
# 🔥 关键修复:确保t_stat不会产生Inf值
res <- res %>%
dplyr::mutate(
# 限制pvalue的最小值,避免-log10(pvalue)过大
pvalue_safe = pmax(pvalue, 1e-300), # 防止log10(0) = Inf
# 计算t_stat,并限制范围
t_stat = -log10(pvalue_safe) * log2FoldChange
) %>%
# 移除Inf和NA值
dplyr::mutate(
t_stat = ifelse(is.finite(t_stat), t_stat, NA)
)
cat(sprintf("📊 差异分析: %d 个基因的t_stat\n", sum(!is.na(res$t_stat))))
cat(sprintf("📊 t_stat范围: %.2f 至 %.2f\n",
min(res$t_stat, na.rm = TRUE),
max(res$t_stat, na.rm = TRUE)))
# --- 注释基因 ---
anno <- data_input$annotate_genes(res$GeneID, input$deg_species)
if (!is.null(anno)) {
# 改进的GeneID清理逻辑 - 保留Ensembl ID的版本号
clean_geneid <- res$GeneID
clean_geneid <- trimws(clean_geneid)
clean_geneid <- gsub("[\t\n\r]", "", clean_geneid)
# 对于非Ensembl ID,才移除特殊字符
non_ensembl <- !grepl("^ENS", clean_geneid, ignore.case = TRUE)
clean_geneid[non_ensembl] <- gsub("[^[:alnum:]]", "", clean_geneid[non_ensembl])
# 清理anno中的列名
anno_clean <- anno
if ("SYMBOL" %in% colnames(anno_clean)) {
# 使用与原始数据相同的清理逻辑
anno_clean$SYMBOL_CLEAN <- gsub("[^[:alnum:]]", "", anno_clean$SYMBOL)
if (input$deg_species == "Mm") {
anno_clean$SYMBOL_CLEAN <- sapply(anno_clean$SYMBOL_CLEAN, function(x) {
if (grepl("^[A-Za-z]", x) && nchar(x) > 0) {
paste0(toupper(substr(x, 1, 1)), tolower(substr(x, 2, nchar(x))))
} else {
x
}
}, USE.NAMES = FALSE)
} else {
anno_clean$SYMBOL_CLEAN <- toupper(anno_clean$SYMBOL_CLEAN)
}
}
# 初始化结果列
if (!"SYMBOL" %in% colnames(res)) res$SYMBOL <- res$GeneID
if (!"ENTREZID" %in% colnames(res)) res$ENTREZID <- NA
# 🔥 修复:记录原始GeneID,用于后续显示
res$Original_GeneID <- res$GeneID
# 🔥 新的匹配策略:优先使用ENSEMBL列匹配Ensembl ID
if ("ENSEMBL" %in% colnames(anno_clean)) {
# 检查哪些基因是Ensembl ID
is_ensembl_id <- grepl("^ENS", clean_geneid, ignore.case = TRUE)
if (any(is_ensembl_id)) {
# 直接通过ENSEMBL列匹配
ensembl_match_idx <- match(clean_geneid[is_ensembl_id], anno_clean$ENSEMBL)
matched_ensembl <- !is.na(ensembl_match_idx)
if (any(matched_ensembl)) {
ensembl_indices <- which(is_ensembl_id)
indices_to_update <- ensembl_indices[matched_ensembl]
res$SYMBOL[indices_to_update] <- anno_clean$SYMBOL[ensembl_match_idx[matched_ensembl]]
res$ENTREZID[indices_to_update] <- anno_clean$ENTREZID[ensembl_match_idx[matched_ensembl]]
cat("通过ENSEMBL列匹配成功:", sum(matched_ensembl), "个基因\n")
}
}
}
# 第一步:尝试SYMBOL匹配(对于非Ensembl ID或未匹配的基因)
if ("SYMBOL" %in% colnames(anno_clean)) {
# 只对尚未匹配ENTREZID的基因尝试SYMBOL匹配
unmatched <- is.na(res$ENTREZID) | res$ENTREZID == ""
if (any(unmatched)) {
match_idx <- match(clean_geneid[unmatched], anno_clean$SYMBOL_CLEAN)
matched_genes <- !is.na(match_idx)
if (any(matched_genes)) {
unmatched_indices <- which(unmatched)
indices_to_update <- unmatched_indices[matched_genes]
res$SYMBOL[indices_to_update] <- anno_clean$SYMBOL[match_idx[matched_genes]]
res$ENTREZID[indices_to_update] <- anno_clean$ENTREZID[match_idx[matched_genes]]
cat("通过SYMBOL列匹配成功:", sum(matched_genes), "个基因\n")
}
}
# 🔥 第二步:对于仍未匹配的基因,尝试使用ENTREZID反向查询
still_unmatched <- is.na(res$ENTREZID) | res$ENTREZID == ""
if (any(still_unmatched)) {
# 尝试通过ENTREZID匹配
entrez_match_idx <- match(res$GeneID[still_unmatched], anno_clean$ENTREZID)
matched_entrez <- !is.na(entrez_match_idx)
if (any(matched_entrez)) {
unmatched_indices <- which(still_unmatched)
indices_to_update <- unmatched_indices[matched_entrez]
res$SYMBOL[indices_to_update] <- anno_clean$SYMBOL[entrez_match_idx[matched_entrez]]
res$ENTREZID[indices_to_update] <- anno_clean$ENTREZID[entrez_match_idx[matched_entrez]]
cat("通过ENTREZID反向匹配成功:", sum(matched_entrez), "个基因\n")
}
}
}
} else {
res$SYMBOL <- res$GeneID
res$ENTREZID <- NA
# 🔥 修复:记录原始GeneID
res$Original_GeneID <- res$GeneID
}
if (!"SYMBOL" %in% colnames(res)) res$SYMBOL <- res$GeneID
if (!"ENTREZID" %in% colnames(res)) res$ENTREZID <- NA
# 🌟 过滤假基因
res <- data_input$filter_pseudo_genes(res)
# 改进的基因去重逻辑 - 保留统计显著性最高的基因
if (any(duplicated(res$SYMBOL))) {
# 按照p值和log2FoldChange的显著性排序
res <- res %>%
dplyr::arrange(SYMBOL, padj, abs(log2FoldChange)) %>%
dplyr::distinct(SYMBOL, .keep_all = TRUE)
# 记录去重信息
n_duplicates_before <- nrow(res) - nrow(dplyr::distinct(res, SYMBOL))
if (n_duplicates_before > 0) {
cat(sprintf("移除了 %d 个重复的基因记录\n", n_duplicates_before))
}
}
# 最终检查
cat("最终数据列:", paste(colnames(res), collapse = ", "), "\n")
cat("log2FoldChange类型:", class(res$log2FoldChange), "\n")
cat("log2FoldChange范围:", range(res$log2FoldChange, na.rm=TRUE), "\n")
return(res)
})
# 🆕 --- 加载芯片差异结果 ---
chip_results_from_file <- eventReactive(input$load_chip, {
req(data_input$chip_file_data())
showNotification("正在加载芯片差异结果...", type = "message")
df <- data_input$chip_file_data()
cat("芯片文件列名:", paste(colnames(df), collapse = ", "), "\n")
# 检查必要的列是否存在(芯片limma结果的列名)
required_cols <- c("logFC", "P.Value", "SYMBOL", "ID")
missing_cols <- setdiff(required_cols, colnames(df))
if (length(missing_cols) > 0) {
showNotification(paste("缺少必要的列:", paste(missing_cols, collapse = ", ")), type = "error")
showNotification("请确保上传的文件包含: logFC, AveExpr, t, P.Value, adj.P.Val, B, SYMBOL, ID", type = "warning")
return(NULL)
}
# 转换为标准格式
res <- data.frame(
ID = as.character(df$ID), # Entrez Gene ID (保持字符)
SYMBOL = df$SYMBOL, # 基因符号
log2FoldChange = df$logFC, # log2倍数变化
pvalue = df$P.Value, # 原始p值
padj = df$adj.P.Val, # BH校正p值
baseMean = df$AveExpr, # 平均表达
t = df$t, # t统计量
ENTREZID = as.numeric(as.character(df$ID)), # 🔧 转换为numeric(clusterProfiler需要)
GeneID = df$SYMBOL, # 基因符号(用于兼容)
Original_GeneID = df$SYMBOL,
stringsAsFactors = FALSE
)
# 检查ENTREZID转换结果
na_count <- sum(is.na(res$ENTREZID))
if (na_count > 0) {
cat(sprintf("⚠️ 警告: %d个基因的ENTREZID转换为NA(可能包含非数字ID)\n", na_count))
}
# --- 差异状态判断 ---
pval_col <- if(input$chip_pval_type == "adj.P.Val") "padj" else "pvalue"
res$Status <- ifelse(res[[pval_col]] < input$chip_pval_cutoff & abs(res$log2FoldChange) > input$chip_log2fc_cutoff,
ifelse(res$log2FoldChange > 0, "Up", "Down"), "Not DE")
# TF 活性分析需要 - 计算t_stat
res <- res %>%
dplyr::mutate(
pvalue_safe = pmax(pvalue, 1e-300),
t_stat = -log10(pvalue_safe) * log2FoldChange
) %>%
dplyr::mutate(
t_stat = ifelse(is.finite(t_stat), t_stat, NA)
)
cat(sprintf("📊 芯片分析: %d 个基因的t_stat\n", sum(!is.na(res$t_stat))))
# 芯片数据已经包含了ID和SYMBOL,不需要额外注释
# 🔧 过滤掉ENTREZID为NA的基因(clusterProfiler需要有效的Entrez ID)
before_filter <- nrow(res)
res <- res[!is.na(res$ENTREZID), ]
after_filter <- nrow(res)
if (before_filter > after_filter) {
cat(sprintf("⚠️ 过滤了 %d 个ENTREZID为NA的基因\n", before_filter - after_filter))
}
# 去重
if (any(duplicated(res$SYMBOL))) {
res <- res %>%
dplyr::arrange(SYMBOL, padj, abs(log2FoldChange)) %>%
dplyr::distinct(SYMBOL, .keep_all = TRUE)
}
cat("✅ 芯片数据加载完成:", nrow(res), "个基因\n")
showNotification(sprintf("✅ 芯片数据加载完成: %d 个基因", nrow(res)), type = "message")
return(res)
})
# --- 获取过滤后的表达矩阵基因列表(用于背景基因集) ---
get_filtered_expr_genes <- reactive({
if (input$data_source == "counts") {
# 从原始数据开始
req(data_input$raw_data(), input$control_group, input$treat_group)
df <- data_input$raw_data()
ctrl <- input$control_group
trt <- input$treat_group
df_use <- df[, c(ctrl, trt)]
group <- factor(c(rep("Control", length(ctrl)), rep("Treatment", length(trt))),
levels = c("Control", "Treatment"))
# 使用与deg_results()相同的过滤逻辑
# 注意:filterByExpr()对两种方法使用相同的逻辑
dge <- DGEList(counts = df_use, group = group)
dge <- calcNormFactors(dge)
keep <- filterByExpr(dge)
filtered_genes <- rownames(dge)[keep]
return(filtered_genes)
} else {
# 对于上传的差异基因文件,无法获取原始表达矩阵
# 返回NULL,让富集分析模块使用默认背景
return(NULL)
}
})
# --- 统一的差异结果获取函数 ---
get_deg_results <- reactive({
if (input$data_source == "counts") {
# 返回完整数据:deg_df + 表达矩阵 + 分组信息
return(list(
deg_df = deg_results(),
background_genes = get_filtered_expr_genes(),
expr_matrix = data_input$raw_data(), # 完整表达矩阵
ctrl_samples = input$ctrl_samples, # 对照组样本
trt_samples = input$trt_samples # 处理组样本
))
} else if (input$data_source == "deg") {
# 上传差异文件时无法获取表达矩阵
return(list(
deg_df = deg_results_from_file(),
background_genes = NULL,
expr_matrix = NULL,
ctrl_samples = NULL,
trt_samples = NULL
))
} else if (input$data_source == "chip") {
# 🆕 芯片差异结果
return(list(
deg_df = chip_results_from_file(),
background_genes = NULL, # 芯片数据无法提供背景基因
expr_matrix = NULL,
ctrl_samples = NULL,
trt_samples = NULL
))
}
})
# --- 恢复主差异分析结果下载 ---
output$download_results <- downloadHandler(
filename = function() {
paste0("DEG_Results_", Sys.Date(), ".csv")
},
content = function(file) {
req(get_deg_results())
write.csv(get_deg_results()$deg_df, file, row.names = FALSE)
}
)
# --- 差异基因统计信息UI ---
output$deg_summary <- renderUI({
req(get_deg_results())
data <- get_deg_results()$deg_df
# 计算统计信息
n_total <- nrow(data)
n_up <- sum(data$Status == "Up", na.rm = TRUE)
n_down <- sum(data$Status == "Down", na.rm = TRUE)
n_not_de <- sum(data$Status == "Not DE", na.rm = TRUE)
# 计算比例
if (n_total > 0) {
pct_up <- round(100 * n_up / n_total, 1)
pct_down <- round(100 * n_down / n_total, 1)
pct_not_de <- round(100 * n_not_de / n_total, 1)
} else {
pct_up <- pct_down <- pct_not_de <- 0
}
# 创建统计卡片
tagList(
tags$div(
class = "row",
style = "margin-bottom: 20px;",
tags$div(
class = "col-sm-3",
tags$div(
class = "card",
style = "border-left: 4px solid #6c757d;",
tags$div(class = "card-body", style = "padding: 15px;",
tags$h6(class = "card-subtitle mb-2 text-muted", "总基因数"),
tags$h3(class = "card-title mb-0", style = "color: #6c757d;",
format(n_total, big.mark = ",")
),
tags$small(class = "text-muted", paste0("100%"))
)
)
),
tags$div(
class = "col-sm-3",
tags$div(
class = "card",
style = "border-left: 4px solid #28a745;",
tags$div(class = "card-body", style = "padding: 15px;",
tags$h6(class = "card-subtitle mb-2 text-muted", "上调基因"),
tags$h3(class = "card-title mb-0", style = "color: #28a745;",
format(n_up, big.mark = ",")
),
tags$small(class = "text-muted", paste0(pct_up, "%"))
)
)
),
tags$div(
class = "col-sm-3",
tags$div(
class = "card",
style = "border-left: 4px solid #dc3545;",
tags$div(class = "card-body", style = "padding: 15px;",
tags$h6(class = "card-subtitle mb-2 text-muted", "下调基因"),
tags$h3(class = "card-title mb-0", style = "color: #dc3545;",
format(n_down, big.mark = ",")
),
tags$small(class = "text-muted", paste0(pct_down, "%"))
)
)
),
tags$div(
class = "col-sm-3",
tags$div(
class = "card",
style = "border-left: 4px solid #17a2b8;",
tags$div(class = "card-body", style = "padding: 15px;",
tags$h6(class = "card-subtitle mb-2 text-muted", "非显著"),
tags$h3(class = "card-title mb-0", style = "color: #17a2b8;",
format(n_not_de, big.mark = ",")
),
tags$small(class = "text-muted", paste0(pct_not_de, "%"))
)
)
)
)
)
})
output$deg_table <- DT::renderDataTable({
req(get_deg_results())
data_to_display <- get_deg_results()$deg_df
# 只格式化存在的列
numeric_cols <- c("log2FoldChange", "pvalue", "padj", "t_stat")
existing_numeric_cols <- numeric_cols[numeric_cols %in% colnames(data_to_display)]
if (length(existing_numeric_cols) > 0) {
DT::datatable(data_to_display, options = list(scrollX=T, pageLength=10), rownames=F) %>%
formatRound(existing_numeric_cols, 4)
} else {
DT::datatable(data_to_display, options = list(scrollX=T, pageLength=10), rownames=F)
}
})
# --- 自定义基因显示 ---
custom_genes <- reactiveVal(NULL)
observeEvent(input$show_custom_genes, {
req(input$custom_genes_input)
# 解析用户输入的基因
genes <- strsplit(input$custom_genes_input, ",")[[1]]
genes <- trimws(genes) # 去除空格
genes <- genes[genes != ""] # 去除空字符串
if (length(genes) > 0) {
custom_genes(genes)
showNotification(paste("已设置显示", length(genes), "个自定义基因"), type = "message")
} else {
custom_genes(NULL)
showNotification("请输入有效的基因名称", type = "warning")
}
})
# 清除自定义基因
observeEvent(input$clear_custom_genes, {
custom_genes(NULL)
updateTextInput(session, "custom_genes_input", value = "")
showNotification("已清除自定义基因", type = "message")
})
# --- 火山图 ---
output$interactive_volcano <- renderPlotly({
req(get_deg_results())
res_data <- get_deg_results()
res <- res_data$deg_df # 获取实际的数据框
# 添加调试信息
cat("火山图数据检查:\n")
cat("数据类型:", class(res), "\n")
cat("数据列名:", paste(colnames(res), collapse = ", "), "\n")
if ("log2FoldChange" %in% colnames(res)) {
cat("log2FoldChange类型:", class(res$log2FoldChange), "\n")
}
# 根据数据来源选择p值类型
if (input$data_source == "counts") {
pval_col <- input$pval_type
} else {
pval_col <- if(input$deg_pval_type == "p_val_adj") "padj" else "pvalue"
}
# 使用用户选择的Y轴类型
y_axis_col <- input$y_axis_type
# 检查log2FoldChange列
if (!("log2FoldChange" %in% colnames(res) && is.numeric(res$log2FoldChange))) {
showNotification("错误:log2FoldChange列不存在或不是数值类型", type = "error")
showNotification(paste("当前列名:", paste(colnames(res), collapse = ", ")), type = "message")
return(NULL)
}
# 安全计算-log10值,处理非数值和NA值
if (y_axis_col %in% colnames(res) && is.numeric(res[[y_axis_col]])) {
# 确保数值有效且大于0(log10需要正数)
valid_values <- res[[y_axis_col]]
# 使用机器最小正值代替0,避免log10(0)的问题
min_positive <- .Machine$double.xmin # 约为2.2e-308
valid_values[valid_values <= 0 & !is.na(valid_values)] <- min_positive
valid_values[is.na(valid_values)] <- NA
res$y_value <- -log10(valid_values)
# 检查是否有有效的y值
if (all(is.na(res$y_value))) {
showNotification(paste("错误:所有", y_axis_col, "值无效(<=0或NA),无法绘制火山图"), type = "error")
return(NULL)
}
# 如果有极小值被替换,给出警告
n_replaced <- sum(res[[y_axis_col]] <= 0 & !is.na(res[[y_axis_col]]))
if (n_replaced > 0) {
showNotification(sprintf("注意:有 %d 个p值为0或负值的基因被替换为最小正值", n_replaced), type = "message")
}
} else {
showNotification(paste("错误:列", y_axis_col, "不存在或不是数值类型"), type = "error")
return(NULL)
}
color_map <- c("Not DE"="#95a5a6", "Up"=input$up_color, "Down"=input$down_color)
txt_col <- if(input$theme_toggle) "#00e0ff" else "black"
# 创建基础火山图
p <- plot_ly(res,
x = ~log2FoldChange, y = ~y_value, color = ~Status,
colors = color_map,
text = ~SYMBOL, type = 'scatter', mode = 'markers',
marker = list(size = input$point_size, opacity = input$point_alpha),
hoverinfo = 'text',
hovertext = ~paste("Gene:", SYMBOL,
"<br>log2FC:", round(log2FoldChange, 3),
"<br>-log10(", y_axis_col, "):", round(y_value, 3),
"<br>Status:", Status)) %>%
layout(
xaxis = list(
title = "log2(Fold Change)",
range = c(input$x_axis_min, input$x_axis_max),
titlefont = list(size = input$axis_title_size),
tickfont = list(size = input$axis_label_size),
showgrid = input$show_grid,
gridcolor = if(input$show_grid) "#ddd" else "transparent",
gridwidth = 1
),
yaxis = list(
title = paste0("-log10(", y_axis_col, ")"),
titlefont = list(size = input$axis_title_size),
tickfont = list(size = input$axis_label_size),
showgrid = input$show_grid,
gridcolor = if(input$show_grid) "#ddd" else "transparent",
gridwidth = 1
),
font = list(color = txt_col),
paper_bgcolor = "rgba(0,0,0,0)",
plot_bgcolor = "rgba(0,0,0,0)"
)
# 添加自定义基因标签
if (!is.null(custom_genes())) {
selected_genes <- custom_genes()
# 在结果中查找这些基因
gene_data <- res[res$SYMBOL %in% selected_genes | res$GeneID %in% selected_genes, ]
if (nrow(gene_data) > 0) {
# 添加基因标签
p <- p %>%
add_annotations(
x = gene_data$log2FoldChange,
y = gene_data$y_value,
text = gene_data$SYMBOL,
xref = "x",
yref = "y",
showarrow = TRUE,
arrowhead = 2,
arrowsize = 1,
arrowwidth = 1,
arrowcolor = input$gene_label_color,
ax = 20,
ay = -40,
font = list(
size = input$gene_label_size,
color = input$gene_label_color,
family = "Arial",
weight = if(input$gene_label_bold) "bold" else "normal"
),
bgcolor = "rgba(255,255,255,0.8)",
bordercolor = input$gene_label_color,
borderwidth = 1,
borderpad = 4,
opacity = 0.8
)
}
}
p
})
# --- 静态火山图用于导出 ---
volcano_static_plot <- reactive({
req(get_deg_results())
res_data <- get_deg_results()
res <- res_data$deg_df # 获取实际的数据框
# 根据数据来源选择p值类型
if (input$data_source == "counts") {
pval_col <- input$pval_type
} else {
pval_col <- if(input$deg_pval_type == "p_val_adj") "padj" else "pvalue"
}
# 使用用户选择的Y轴类型
y_axis_col <- input$y_axis_type
# 检查log2FoldChange列
if (!("log2FoldChange" %in% colnames(res) && is.numeric(res$log2FoldChange))) {
showNotification("错误:log2FoldChange列不存在或不是数值类型", type = "error")
showNotification(paste("当前列名:", paste(colnames(res), collapse = ", ")), type = "message")
return(NULL)
}
# 安全计算-log10值,处理非数值和NA值
if (y_axis_col %in% colnames(res) && is.numeric(res[[y_axis_col]])) {
# 确保数值有效且大于0(log10需要正数)
valid_values <- res[[y_axis_col]]
# 使用机器最小正值代替0,避免log10(0)的问题
min_positive <- .Machine$double.xmin # 约为2.2e-308
valid_values[valid_values <= 0 & !is.na(valid_values)] <- min_positive
valid_values[is.na(valid_values)] <- NA
res$y_value <- -log10(valid_values)
# 检查是否有有效的y值
if (all(is.na(res$y_value))) {
showNotification(paste("错误:所有", y_axis_col, "值无效(<=0或NA),无法绘制火山图"), type = "error")
return(NULL)
}
# 如果有极小值被替换,给出警告
n_replaced <- sum(res[[y_axis_col]] <= 0 & !is.na(res[[y_axis_col]]))
if (n_replaced > 0) {
showNotification(sprintf("注意:有 %d 个p值为0或负值的基因被替换为最小正值", n_replaced), type = "message")
}
} else {
showNotification(paste("错误:列", y_axis_col, "不存在或不是数值类型"), type = "error")
return(NULL)
}
# 设置颜色
res$color <- ifelse(res$Status == "Up", input$up_color,
ifelse(res$Status == "Down", input$down_color, "#95a5a6"))
# 创建ggplot火山图(与交互图保持一致的大小比例)
p <- ggplot(res, aes(x = log2FoldChange, y = y_value, color = Status)) +
geom_point(alpha = input$point_alpha, size = input$point_size) +
scale_color_manual(values = c("Up" = input$up_color, "Down" = input$down_color, "Not DE" = "#95a5a6")) +
labs(
x = "log2(Fold Change)",
y = paste0("-log10(", y_axis_col, ")"),
title = "Volcano Plot"
) +
theme_minimal() +
theme(
axis.title = element_text(size = input$axis_title_size),
axis.text = element_text(size = input$axis_label_size),
legend.title = element_text(size = input$axis_title_size),
legend.text = element_text(size = input$axis_label_size),
plot.title = element_text(size = input$axis_title_size + 2, hjust = 0.5),
panel.grid.major = element_line(
color = if(input$show_grid) "gray" else "transparent",
linewidth = if(input$show_grid) 0.5 else 0
),
panel.grid.minor = element_line(
color = if(input$show_grid) "gray" else "transparent",
linewidth = if(input$show_grid) 0.25 else 0
)
) +
xlim(input$x_axis_min, input$x_axis_max)
# 添加自定义基因标签(与交互图保持一致的大小)
if (!is.null(custom_genes())) {
selected_genes <- custom_genes()
gene_data <- res[res$SYMBOL %in% selected_genes | res$GeneID %in% selected_genes, ]
if (nrow(gene_data) > 0) {
p <- p +
geom_text_repel(
data = gene_data,
aes(label = SYMBOL),
size = input$gene_label_size,
color = input$gene_label_color,
fontface = if(input$gene_label_bold) "bold" else "plain",
box.padding = 0.5,
point.padding = 0.3,
max.overlaps = Inf
)
}
}
return(p)
})
# --- 火山图导出 ---
output$download_volcano <- downloadHandler(
filename = function() {
paste0("volcano_plot_", Sys.Date(), ".", input$export_format)
},
content = function(file) {
req(volcano_static_plot())
if (input$export_format == "png") {
png(file, width = input$export_width, height = input$export_height, units = "in", res = 300)
} else if (input$export_format == "pdf") {
pdf(file, width = input$export_width, height = input$export_height)
} else if (input$export_format == "svg") {
svg(file, width = input$export_width, height = input$export_height)
}
print(volcano_static_plot())
dev.off()
}
)
# 返回差异分析结果
return(get_deg_results)
}