2026年4月2日 研究日志¶
今日研究内容:数据清理并绘图。今日也来分享几个我用来做单细胞测序数据处理的函数吧。
↓我用来规范化聚类分析流程的函数。先创建好用于存放绘图,报告和基因分析结果的文件夹。因为写的时候完全是考虑着自己用所以习惯都是按照自己喜欢来的。
# Cluster####
# Preparation before clustering
Preparation_folders <- function() {
# Define the list of folders that need to be created
folders <- c("plot", "report", "marker_gene")
# Creat folders
for (folder in folders) {
if (!dir.exists(folder)) {
dir.create(folder, recursive = TRUE, showWarnings = FALSE)
message("Folder created successfully: ", folder)
} else {
message("Folder already exists: ", folder)
}
}
# Returns the creation result summary
results <- sapply(folders, dir.exists)
return(invisible(results))
}
↓用来一键化PCA和拐点图,分门别类把数据放在合适位置,不同样本放在不同文件夹。
# Run PCA and Plot Elbow to find suitable dimensions
RunPCA_PlotElbow <- function(scdata,
sample_name = "Sample",
plot_dir = "./plot/",
ndims = 50,
width = 6,
height = 6) {
# Load necessary packages
require(Seurat)
# Run PCA analysis
scdata <- Seurat::RunPCA(scdata)
# Create the output directory if it does not exist
sample_dir <- file.path(plot_dir, sample_name)
if (!dir.exists(sample_dir)) {
dir.create(sample_dir, recursive = TRUE, showWarnings = FALSE)
message("Created directory: ", sample_dir)
}
# Create a file path
file_path <- file.path(sample_dir, paste0(sample_name, "_elbow.tiff"))
# Generate and save the elbow plot
elbow_plot <- Seurat::ElbowPlot(scdata, ndims = ndims)
ggplot2::ggsave(
filename = file_path,
plot = elbow_plot,
device = "tiff",
width = width,
height = height
)
message("Elbow plot saved to: ", file_path)
return(scdata)
}
↓当你进行单细胞测序数据分析时,你需要考虑分辨率,这决定了你会把细胞分为多少个亚群。然而究竟多少亚群合适呢?优质答案:我不知道。什么分辨率合适要看你的研究目的和数据本身的情况。那么具体该怎么解决分辨率问题呢?答案是多试试就知道了。你可以使用下面的Try_different_resolutions函数,将数据集scdata和聚类分辨率列表resulution键入,便可以一键获得你想尝试的所有分辨率的UMAP图,放在plot文件夹中(你愿意改也行啦……我是这个习惯啦),总之反复尝试总能获得较优的结果的。
# Run cluster and umap repeat to find the best resolution
Try_different_resolutions <- function(scdata,
resolutions,
dims = 1:15,
min.dist = 0.5,
spread = 1.5,
plot_dir = "./plot/") {
# Load necessary packages
require(Seurat)
require(ggplot2)
# Obtain the variable name of the passed object
dataset_name <- deparse(substitute(scdata))
# Clean special characters in dataset names
clean_name <- gsub("[[:space:][:punct:]]", "_", dataset_name)
# Create a directory structure: plot_dir/dataset_name/UMAP/
dataset_dir <- file.path(plot_dir, clean_name)
umap_dir <- file.path(dataset_dir, "UMAP")
# Safely create directories
if (!dir.exists(umap_dir)) {
dir.create(umap_dir, recursive = TRUE, showWarnings = FALSE)
message("Creat PATH: ", umap_dir)
}
# Seurat analysis workflow
scdata <- FindNeighbors(object = scdata, dims = dims)
scdata <- RunUMAP(scdata, dims = dims, min.dist = min.dist, spread = spread)
# Looping through different resolutions
for (res in resolutions) {
cluster_name <- paste0("TEST_res_", res)
scdata <- FindClusters(object = scdata, resolution = res, cluster.name = cluster_name)
# Creating UMAP Plots
umap_plot <- DimPlot(scdata, reduction = "umap", label = TRUE, group.by = cluster_name) +
ggtitle(paste0("UMAP Plot (Resolution = ", res, ")"))
# Constructing file paths
plot_file <- paste0("umap_res_", res, ".png")
plot_path <- file.path(umap_dir, plot_file)
# Save the image
ggsave(
filename = plot_path,
plot = umap_plot,
device = "png",
width = 8,
height = 6,
dpi = 300
)
message("Save UMAP plot: ", plot_path)
}
# Returns the Seurat object
return(scdata)
}
↓Get_marker_genes家族。说是家族其实没有互相调用的关系,只是Get_marker_genes()对所有的分辨率都运行一次FindAllMarkers并只选择topN(默认为5)的基因保存csv,Get_marker_genes_all()则保存完整的marker gene表。我是用运行较快的Get_marker_genes()检查每种分辨率的合理性,再使用确信合适的分辨率运行Get_marker_genes_all()生成最终的分析结果。
# Run cluster and find marker gene repeat to make sure the best resolution
Get_marker_genes <- function(scdata,
resolutions,
genes_amount = 5,
table_dir = "./marker_gene/",
recompute_clusters = FALSE) {
# Load necessary package
require(dplyr)
require(readr)
# Obtain the variable name of the passed object
dataset_name <- deparse(substitute(scdata))
# Clean special characters in dataset names
clean_name <- gsub("[[:space:][:punct:]]", "_", dataset_name)
# Get the precision file path
table_dir <- paste0(table_dir, clean_name)
# Make sure the output directory exists
if (!dir.exists(table_dir)) {
dir.create(table_dir, recursive = TRUE, showWarnings = FALSE)
message("Created directory: ", table_dir)
}
# Traversing different resolutions
for (res in resolutions) {
cluster_col_name <- paste0("TEST_res_", res)
# Conditional computation clustering
if(recompute_clusters || !cluster_col_name %in% colnames(scdata@meta.data)) {
scdata <- FindClusters(
object = scdata,
resolution = res,
cluster.name = cluster_col_name
)
}
# Set the current cluster as the active flag
Idents(scdata) <- cluster_col_name
# Finding marker genes
cluster_markers <- FindAllMarkers(
scdata,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25
)
# Extract the top genes for each cluster
top_markers <- cluster_markers %>%
group_by(cluster) %>%
arrange(desc(avg_log2FC)) %>%
slice_head(n = genes_amount)
# Creating a gene list
top_marks_table <- top_markers %>%
group_by(cluster) %>%
summarise(
top_genes = paste(gene, collapse = "/"),
.groups = "drop"
)
# Constructing file paths
file_path <- file.path(table_dir, paste0("/cluster_genes_res", res, ".csv"))
# Save the results
write_csv(top_marks_table, file_path)
message("Saved marker genes for resolution ", res, " to: ", file_path)
}
# Returns the modified Seurat object
return(scdata)
}
Get_marker_genes_all <- function(scdata,
resolutions,
genes_amount = 5,
table_dir = "./marker_gene_all/",
recompute_clusters = FALSE) {
# Load necessary packages
require(dplyr)
require(readr)
require(Seurat)
# Get the dataset name (clean special characters)
dataset_name <- gsub("[[:space:][:punct:]]", "_", deparse(substitute(scdata)))
# Create Output Directory
dir.create(file.path(table_dir, dataset_name),
recursive = TRUE, showWarnings = FALSE)
# Traversing different resolutions
for (res in resolutions) {
cluster_col <- paste0("TEST_res_", res)
# Recalculate clusters (if necessary)
if(recompute_clusters || !cluster_col %in% colnames(scdata@meta.data)) {
scdata <- FindClusters(scdata, resolution = res, cluster.name = cluster_col)
}
# Set the active cluster ID
Idents(scdata) <- cluster_col
clusters <- levels(Idents(scdata))
# Create a separate directory for each cluster
res_dir <- file.path(table_dir, dataset_name, paste0("res_", res))
dir.create(res_dir, recursive = TRUE, showWarnings = FALSE)
# Initialize the summary table
top_marks_summary <- data.frame()
# Traverse each cluster to find marker genes
for (cluster_id in clusters) {
# Find the marker gene of the current cluster (compared with all other clusters)
markers <- FindMarkers(
object = scdata,
ident.1 = cluster_id,
only.pos = TRUE,
min.pct = 0.25,
logfc.threshold = 0.25
)
# Add gene name and cluster information
markers$gene <- rownames(markers)
markers$cluster <- cluster_id
# Save complete results to a separate CSV
cluster_file <- file.path(res_dir, paste0("cluster_", cluster_id, "_markers.csv"))
write_csv(markers, cluster_file)
# directly merge all genes into the total table
all_marks_summary <- bind_rows(all_marks_summary, markers)
}
# Save COMPLETE summary table for current resolution
summary_file <- file.path(table_dir, dataset_name, paste0("res_", res, "_all_markers.csv"))
write_csv(all_marks_summary, summary_file)
message(sprintf("Saved ALL markers for res=%g to: %s", res, summary_file))
}
return(scdata)
}
↓最后是这个我没有真正用上的细胞类型标记基因数据库。写这个数据库是为了自动根据标记基因来推断cluster的细胞类型。不过后来发现用不着自己做,而且项目实际上不大(满打满算4个月的独立项目),所以没来得及用上这个数据库就手动解决掉了问题。但这玩意可是个S3对象,有class的,我甚至写了add_markers函数拿来扩展数据库,我对此很满意,所以在这里分享出来,要是有人喜欢自己拿走吧(笑)
# Marker_gene_dictionary
# Create a cell type-marker gene database (expandable S3 object)
create_marker_db <- function() {
marker_db <- list(
"T cell" = c("CD3D", "IL7R", "CD4", "CD8A"),
"B cell" = c("CD19", "MS4A1", "CD79A"),
"Macrophages" = c("LYZ", "CD14", "FCGR3A"),
"NK cell" = c("GNLY", "NKG7"),
"Melanocytes" = c("TYR", "MITF", "SOX10"),
"Malignant cell" = c("BRAF", "CDKN2A", "TP53"),
"Tumor-associated fibroblasts" = c("ACTA2","CTSK","CCND1","BCAN")
)
class(marker_db) <- "cell_marker_db"
return(marker_db)
}
# Generic function for updating the database (for subsequent expansion)
add_markers <- function(db, cell_type, genes) {
UseMethod("add_markers")
}
add_markers.cell_marker_db <- function(db, cell_type, genes) {
db[[cell_type]] <- unique(c(db[[cell_type]], genes))
return(db)
}
# Initialize the database
marker_db <- create_marker_db()
今天就到这里了,这周早些时候完成了组内的文献汇报,准备汇报真的好累,明天说不定就休息了。看看情况,明天是不是休息日就看明天有没有技术博客了(笑)