# File size: 8,795 Bytes
# 7e6a9d1
# =====================================================
# 数据输入模块
# =====================================================
data_input_server <- function(input, output, session) {
# --- Expression data upload ---
# Reactive that parses the CSV uploaded through `input$file`.
# When the table has at least two columns, the first column is turned into
# syntactically valid, unique row names (via make.names) and removed from
# the data; otherwise the table is returned exactly as read.
raw_data <- reactive({
  req(input$file)
  expr_table <- read.csv(input$file$datapath, header = TRUE)
  has_id_column <- ncol(expr_table) >= 2
  if (has_id_column) {
    row_labels <- make.names(expr_table[, 1], unique = TRUE)
    expr_table <- expr_table[, -1, drop = FALSE]
    rownames(expr_table) <- row_labels
  }
  expr_table
})
# --- Differential-expression result upload ---
# Reactive that reads the CSV uploaded through `input$deg_file` and returns
# it unchanged as a data.frame (first row is treated as the header).
deg_file_data <- reactive({
  req(input$deg_file)
  uploaded_path <- input$deg_file$datapath
  read.csv(uploaded_path, header = TRUE)
})
# --- Microarray (chip) differential result upload ---
# Reactive that reads the CSV uploaded through `input$chip_file` and returns
# it unchanged as a data.frame (first row is treated as the header).
chip_file_data <- reactive({
  req(input$chip_file)
  uploaded_path <- input$chip_file$datapath
  read.csv(uploaded_path, header = TRUE)
})
# Renders two multi-select inputs that let the user assign the columns of
# the uploaded expression table to the control and treatment groups.
# NOTE(review): the input ids are not wrapped in session$ns(); that is only
# correct if this server function runs in the top-level session -- confirm.
output$group_selector <- renderUI({
  req(raw_data())
  sample_names <- colnames(raw_data())
  control_picker <- selectInput(
    "control_group", "Control组",
    choices = sample_names, multiple = TRUE
  )
  treatment_picker <- selectInput(
    "treat_group", "Treatment组",
    choices = sample_names, multiple = TRUE
  )
  tagList(control_picker, treatment_picker)
})
# --- Gene annotation ---
# Looks up gene identifiers in the organism annotation package and returns
# every annotation row that could be resolved.
#
# Args:
#   gene_ids:     vector of gene identifiers: gene symbols, Ensembl ids
#                 (with or without a ".version" suffix) or numeric Entrez ids.
#   species_code: "Mm" selects org.Mm.eg.db; any other value selects
#                 org.Hs.eg.db.
#
# Returns:
#   A de-duplicated data.frame holding the union of the columns produced by
#   the successful lookups (SYMBOL and ENTREZID, plus ENSEMBL and
#   ORIGINAL_ENSEMBL when Ensembl ids were resolved; columns a lookup did not
#   produce are NA). NULL when the annotation package cannot be loaded or no
#   identifier matched.
#
# Side effects: attaches the annotation package and writes progress lines to
# stdout with cat().
annotate_genes <- function(gene_ids, species_code) {
  db_pkg <- if (species_code == "Mm") "org.Mm.eg.db" else "org.Hs.eg.db"
  # require() is used (and its result checked) on purpose: get() below needs
  # the package attached to find the OrgDb object by name.
  if (!require(db_pkg, character.only = TRUE, quietly = TRUE)) {
    warning("数据库包 ", db_pkg, " 未安装")
    return(NULL)
  }
  db_obj <- get(db_pkg)

  # An Ensembl id is "ENS", an optional species/feature code, then digits.
  # Requiring a digit keeps symbols such as ENSA / Ensa from being mistaken
  # for Ensembl ids.
  ensembl_pattern <- "^ENS[A-Z]*[0-9]"

  # Row-binds data frames with different column sets, padding with NA.
  bind_fill <- function(frames) {
    if (length(frames) == 0) {
      return(data.frame())
    }
    all_cols <- unique(unlist(lapply(frames, colnames)))
    padded <- lapply(frames, function(d) {
      for (col in setdiff(all_cols, colnames(d))) {
        d[[col]] <- NA_character_
      }
      d[, all_cols, drop = FALSE]
    })
    do.call(rbind, padded)
  }

  # Thin wrapper so every lookup goes through the same select() call.
  query_db <- function(ids, keytype, columns) {
    AnnotationDbi::select(db_obj, keys = ids, columns = columns,
                          keytype = keytype)
  }

  # ---- Clean the identifiers ----
  clean_ids <- trimws(as.character(gene_ids))
  clean_ids <- gsub("[\t\n\r]", "", clean_ids)
  # Ensembl ids keep their "." so the version suffix can be stripped later;
  # every other id loses all non-alphanumeric characters.
  # NOTE(review): this also removes "-" from symbols such as HLA-A or H2-K1,
  # which then cannot match the database -- kept as in the original, confirm
  # whether that is intended.
  raw_is_ensembl <- grepl(ensembl_pattern, clean_ids, ignore.case = TRUE)
  clean_ids[!raw_is_ensembl] <-
    gsub("[^[:alnum:]]", "", clean_ids[!raw_is_ensembl])
  is_ensembl <- grepl(ensembl_pattern, clean_ids, ignore.case = TRUE)

  # ---- Normalise symbol case per species (Ensembl ids are left as-is) ----
  is_symbol <- !is_ensembl & !is.na(clean_ids)
  if (species_code == "Mm") {
    # Mouse symbols: first letter upper case, the rest lower case.
    alpha_start <- is_symbol & grepl("^[A-Za-z]", clean_ids)
    clean_ids[alpha_start] <- paste0(
      toupper(substr(clean_ids[alpha_start], 1, 1)),
      tolower(substring(clean_ids[alpha_start], 2))
    )
  } else {
    # Human symbols: all upper case.
    clean_ids[is_symbol] <- toupper(clean_ids[is_symbol])
  }

  cat("基因注释: 清理后基因数量 =", length(clean_ids), "\n")
  cat("前5个清理后的基因:", paste(head(clean_ids, 5), collapse=", "), "\n")

  # Successful lookups are collected here and combined once at the end, so
  # results with different columns no longer break the merge.
  results <- list()

  # 1. SYMBOL (most common input)
  tryCatch({
    symbol_keys <- AnnotationDbi::keys(db_obj, keytype = "SYMBOL")
    valid_symbols <- clean_ids[clean_ids %in% symbol_keys]
    if (length(valid_symbols) > 0) {
      cat("找到", length(valid_symbols), "个有效的SYMBOL\n")
      anno <- query_db(valid_symbols, "SYMBOL", c("SYMBOL", "ENTREZID"))
      if (nrow(anno) > 0) {
        anno <- anno[!duplicated(anno$SYMBOL), , drop = FALSE]
        results[[length(results) + 1L]] <- anno
        cat("SYMBOL注释成功:", nrow(anno), "个基因\n")
      }
    } else {
      cat("没有有效的SYMBOL\n")
    }
  }, error = function(e) {
    cat("SYMBOL注释错误:", e$message, "\n")
  })

  # 2. ENSEMBL ids: first as given, then with the version suffix removed
  tryCatch({
    ensembl_ids <- clean_ids[is_ensembl]
    if (length(ensembl_ids) > 0) {
      ensembl_keys <- AnnotationDbi::keys(db_obj, keytype = "ENSEMBL")
      ensembl_cols <- c("ENSEMBL", "SYMBOL", "ENTREZID")

      valid_ensembl <- ensembl_ids[ensembl_ids %in% ensembl_keys]
      if (length(valid_ensembl) > 0) {
        cat("找到", length(valid_ensembl), "个有效的ENSEMBL ID (带版本号)\n")
        anno <- query_db(valid_ensembl, "ENSEMBL", ensembl_cols)
        if (nrow(anno) > 0) {
          anno <- anno[!duplicated(anno$ENSEMBL), , drop = FALSE]
          results[[length(results) + 1L]] <- anno
          cat("ENSEMBL注释成功:", nrow(anno), "个基因\n")
        }
      }

      unmatched_ensembl <- ensembl_ids[!ensembl_ids %in% valid_ensembl]
      if (length(unmatched_ensembl) > 0) {
        ensembl_no_version <- gsub("\\..*", "", unmatched_ensembl)
        valid_no_version <-
          ensembl_no_version[ensembl_no_version %in% ensembl_keys]
        if (length(valid_no_version) > 0) {
          cat("找到", length(valid_no_version), "个有效的ENSEMBL ID (不带版本号)\n")
          anno <- query_db(valid_no_version, "ENSEMBL", ensembl_cols)
          if (nrow(anno) > 0) {
            # Map each returned row back to the id the user supplied. The
            # match is done on the returned ENSEMBL column because select()
            # may return more (or fewer) rows than keys.
            anno$ORIGINAL_ENSEMBL <-
              unmatched_ensembl[match(anno$ENSEMBL, ensembl_no_version)]
            anno <- anno[!duplicated(anno$ENSEMBL), , drop = FALSE]
            results[[length(results) + 1L]] <- anno
            cat("ENSEMBL注释成功 (无版本号):", nrow(anno), "个基因\n")
          }
        }
      }
    }
  }, error = function(e) {
    cat("ENSEMBL注释错误:", e$message, "\n")
  })

  # 3. ENTREZID (input that is already a numeric id)
  tryCatch({
    numeric_ids <- clean_ids[grepl("^[0-9]+$", clean_ids)]
    if (length(numeric_ids) > 0) {
      entrez_keys <- AnnotationDbi::keys(db_obj, keytype = "ENTREZID")
      valid_entrez <- numeric_ids[numeric_ids %in% entrez_keys]
      if (length(valid_entrez) > 0) {
        cat("找到", length(valid_entrez), "个有效的ENTREZID\n")
        anno <- query_db(valid_entrez, "ENTREZID", c("ENTREZID", "SYMBOL"))
        if (nrow(anno) > 0) {
          anno <- anno[!duplicated(anno$ENTREZID), , drop = FALSE]
          results[[length(results) + 1L]] <- anno
          cat("ENTREZID注释成功:", nrow(anno), "个基因\n")
        }
      }
    }
  }, error = function(e) {
    cat("ENTREZID注释错误:", e$message, "\n")
  })

  all_anno <- bind_fill(results)
  if (nrow(all_anno) == 0) {
    cat("所有注释尝试都失败\n")
    return(NULL)
  }

  # Drop rows that are identical across every column.
  all_anno <- all_anno[!duplicated(all_anno), , drop = FALSE]
  cat("总注释成功:", nrow(all_anno), "个基因\n")
  # Callers expect a SYMBOL column to exist.
  if (!"SYMBOL" %in% colnames(all_anno)) {
    all_anno$SYMBOL <- NA
  }
  all_anno
}
# --- Pseudo-gene filter ---
# Removes rows whose identifier looks like a predicted gene model or
# pseudo-gene: "Gm" followed only by digits, a "Rik" suffix, or a "-ps"
# suffix (all case-insensitive).
#
# The "Gm" rule requires digits up to the end of the name, so real genes
# that merely start with "Gm" (e.g. Gmps, Gmnn, Gm2a, GMPS) are kept.
#
# Args:
#   df: data.frame with a SYMBOL and/or GeneID column. Whichever of the two
#       columns is present is checked; a row is dropped when any checked
#       column matches. NA and empty identifiers never match.
#
# Returns:
#   `df` without the matching rows. Automatic row names are renumbered;
#   explicit row names are kept.
#
# Side effects: shows a Shiny notification with the number of removed rows
# when at least one row was removed.
filter_pseudo_genes <- function(df) {
  pseudo_pattern <- "^Gm[0-9]+$|Rik$|-ps$"

  is_pseudo <- rep(FALSE, nrow(df))
  for (id_col in intersect(c("SYMBOL", "GeneID"), colnames(df))) {
    # grepl() returns FALSE for NA, so missing ids are always kept.
    is_pseudo <- is_pseudo |
      grepl(pseudo_pattern, as.character(df[[id_col]]), ignore.case = TRUE)
  }

  df_filtered <- df[!is_pseudo, , drop = FALSE]
  # Renumber automatic row names so the result has no gaps.
  if (.row_names_info(df) < 0L) {
    rownames(df_filtered) <- NULL
  }

  removed_count <- sum(is_pseudo)
  if (removed_count > 0) {
    showNotification(paste("过滤了", removed_count, "个假基因(Gm开头、Rik或-ps结尾)"), type = "message")
  }
  df_filtered
}
# Public interface of this server function: the three upload reactives plus
# the annotation and pseudo-gene filtering helpers, returned as a named list.
module_exports <- list()
module_exports$raw_data <- raw_data
module_exports$deg_file_data <- deg_file_data
module_exports$chip_file_data <- chip_file_data # chip (microarray) data
module_exports$annotate_genes <- annotate_genes
module_exports$filter_pseudo_genes <- filter_pseudo_genes
module_exports
}