2026年4月2日 研究日志¶

今日研究内容:数据清理并绘图。今日也来分享几个我用来做单细胞测序数据处理的函数吧。

↓我用来规范化聚类分析流程的函数。先创建好用于存放绘图,报告和基因分析结果的文件夹。因为写的时候完全是考虑着自己用所以习惯都是按照自己喜欢来的。

In [ ]:
# Cluster####
# Preparation before clustering
Preparation_folders <- function() {
  # Define the list of folders that need to be created
  folders <- c("plot", "report", "marker_gene")
  
  # Creat folders
  for (folder in folders) {
    if (!dir.exists(folder)) {
      dir.create(folder, recursive = TRUE, showWarnings = FALSE)
      message("Folder created successfully: ", folder)
    } else {
      message("Folder already exists: ", folder)
    }
  }
  
  # Returns the creation result summary
  results <- sapply(folders, dir.exists)
  return(invisible(results))
}

↓用来一键化PCA和拐点图,分门别类把数据放在合适位置,不同样本放在不同文件夹。

In [ ]:
# Run PCA and Plot Elbow to find suitable dimensions
RunPCA_PlotElbow <- function(scdata, 
                            sample_name = "Sample",
                            plot_dir = "./plot/",
                            ndims = 50,
                            width = 6, 
                            height = 6) {
  # Load necessary packages
  require(Seurat)
  
  # Run PCA analysis
  scdata <- Seurat::RunPCA(scdata)
  
  # Create the output directory if it does not exist
  sample_dir <- file.path(plot_dir, sample_name)
  if (!dir.exists(sample_dir)) {
    dir.create(sample_dir, recursive = TRUE, showWarnings = FALSE)
    message("Created directory: ", sample_dir)
  }
  
  # Create a file path
  file_path <- file.path(sample_dir, paste0(sample_name, "_elbow.tiff"))
  
  # Generate and save the elbow plot
  elbow_plot <- Seurat::ElbowPlot(scdata, ndims = ndims)
  ggplot2::ggsave(
    filename = file_path,
    plot = elbow_plot,
    device = "tiff",
    width = width,
    height = height
  )
  
  message("Elbow plot saved to: ", file_path)
  return(scdata)
}

↓当你进行单细胞测序数据分析时,你需要考虑分辨率,这决定了你会把细胞分为多少个亚群。然而究竟多少亚群合适呢?优质答案:我不知道。什么分辨率合适要看你的研究目的和数据本身的情况。那么具体该怎么解决分辨率问题呢?答案是多试试就知道了。你可以使用下面的Try_different_resolutions函数,将数据集scdata和聚类分辨率列表resulution键入,便可以一键获得你想尝试的所有分辨率的UMAP图,放在plot文件夹中(你愿意改也行啦……我是这个习惯啦),总之反复尝试总能获得较优的结果的。

In [ ]:
# Run cluster and umap repeat to find the best resolution
Try_different_resolutions <- function(scdata,
                                      resolutions,
                                      dims = 1:15,
                                      min.dist = 0.5,
                                      spread = 1.5,
                                      plot_dir = "./plot/") {
  # Load necessary packages
  require(Seurat)
  require(ggplot2)
  
  # Obtain the variable name of the passed object
  dataset_name <- deparse(substitute(scdata))
  
  # Clean special characters in dataset names
  clean_name <- gsub("[[:space:][:punct:]]", "_", dataset_name)
  
  # Create a directory structure: plot_dir/dataset_name/UMAP/
  dataset_dir <- file.path(plot_dir, clean_name)
  umap_dir <- file.path(dataset_dir, "UMAP")
  
  # Safely create directories
  if (!dir.exists(umap_dir)) {
    dir.create(umap_dir, recursive = TRUE, showWarnings = FALSE)
    message("Creat PATH: ", umap_dir)
  }
  
  # Seurat analysis workflow
  scdata <- FindNeighbors(object = scdata, dims = dims)
  scdata <- RunUMAP(scdata, dims = dims, min.dist = min.dist, spread = spread)
  
  # Looping through different resolutions
  for (res in resolutions) {
    cluster_name <- paste0("TEST_res_", res)
    scdata <- FindClusters(object = scdata, resolution = res, cluster.name = cluster_name)
    
    # Creating UMAP Plots
    umap_plot <- DimPlot(scdata, reduction = "umap", label = TRUE, group.by = cluster_name) + 
      ggtitle(paste0("UMAP Plot (Resolution = ", res, ")"))
    
    # Constructing file paths
    plot_file <- paste0("umap_res_", res, ".png")
    plot_path <- file.path(umap_dir, plot_file)
    
    # Save the image
    ggsave(
      filename = plot_path,
      plot = umap_plot,
      device = "png",
      width = 8,
      height = 6,
      dpi = 300
    )
    
    message("Save UMAP plot: ", plot_path)
  }
  
  # Returns the Seurat object
  return(scdata)
}

↓Get_marker_genes家族。说是家族其实没有互相调用的关系,只是Get_marker_genes()对所有的分辨率都运行一次FindAllMarkers并只选择topN(默认为5)的基因保存csv,Get_marker_genes_all()则保存完整的marker gene表。我是用运行较快的Get_marker_genes()检查每种分辨率的合理性,再使用确信合适的分辨率运行Get_marker_genes_all()生成最终的分析结果。

In [ ]:
# Run cluster and find marker gene repeat to make sure the best resolution
Get_marker_genes <- function(scdata,
                             resolutions,
                             genes_amount = 5,
                             table_dir = "./marker_gene/",
                             recompute_clusters = FALSE) {
  
  # Load necessary package
  require(dplyr)
  require(readr)
  
  # Obtain the variable name of the passed object
  dataset_name <- deparse(substitute(scdata))
  
  # Clean special characters in dataset names
  clean_name <- gsub("[[:space:][:punct:]]", "_", dataset_name)
  
  # Get the precision file path
  table_dir <- paste0(table_dir, clean_name)
  
  # Make sure the output directory exists
  if (!dir.exists(table_dir)) {
    dir.create(table_dir, recursive = TRUE, showWarnings = FALSE)
    message("Created directory: ", table_dir)
  }
  
  # Traversing different resolutions
  for (res in resolutions) {
    cluster_col_name <- paste0("TEST_res_", res)
    
    # Conditional computation clustering
    if(recompute_clusters || !cluster_col_name %in% colnames(scdata@meta.data)) {
      scdata <- FindClusters(
        object = scdata,
        resolution = res,
        cluster.name = cluster_col_name
      )
    }
    
    # Set the current cluster as the active flag
    Idents(scdata) <- cluster_col_name
    
    # Finding marker genes
    cluster_markers <- FindAllMarkers(
      scdata,
      only.pos = TRUE,
      min.pct = 0.25,
      logfc.threshold = 0.25
    )
    
    # Extract the top genes for each cluster
    top_markers <- cluster_markers %>%
      group_by(cluster) %>%
      arrange(desc(avg_log2FC)) %>%
      slice_head(n = genes_amount)  
    
    # Creating a gene list
    top_marks_table <- top_markers %>%
      group_by(cluster) %>%
      summarise(
        top_genes = paste(gene, collapse = "/"),
        .groups = "drop"
      )
    
    # Constructing file paths
    file_path <- file.path(table_dir, paste0("/cluster_genes_res", res, ".csv"))
    
    # Save the results
    write_csv(top_marks_table, file_path)
    message("Saved marker genes for resolution ", res, " to: ", file_path)
  }
  
  # Returns the modified Seurat object
  return(scdata)
}


Get_marker_genes_all <- function(scdata,
                             resolutions,
                             genes_amount = 5,
                             table_dir = "./marker_gene_all/",
                             recompute_clusters = FALSE) {
  
  # Load necessary packages
  require(dplyr)
  require(readr)
  require(Seurat)
  
  # Get the dataset name (clean special characters)
  dataset_name <- gsub("[[:space:][:punct:]]", "_", deparse(substitute(scdata)))
  
  # Create Output Directory
  dir.create(file.path(table_dir, dataset_name), 
             recursive = TRUE, showWarnings = FALSE)
  
  # Traversing different resolutions
  for (res in resolutions) {
    cluster_col <- paste0("TEST_res_", res)
    
    # Recalculate clusters (if necessary)
    if(recompute_clusters || !cluster_col %in% colnames(scdata@meta.data)) {
      scdata <- FindClusters(scdata, resolution = res, cluster.name = cluster_col)
    }
    
    # Set the active cluster ID
    Idents(scdata) <- cluster_col
    clusters <- levels(Idents(scdata))
    
    # Create a separate directory for each cluster
    res_dir <- file.path(table_dir, dataset_name, paste0("res_", res))
    dir.create(res_dir, recursive = TRUE, showWarnings = FALSE)
    
    # Initialize the summary table
    top_marks_summary <- data.frame()
    
    # Traverse each cluster to find marker genes
    for (cluster_id in clusters) {
      # Find the marker gene of the current cluster (compared with all other clusters)
      markers <- FindMarkers(
        object = scdata,
        ident.1 = cluster_id,
        only.pos = TRUE,
        min.pct = 0.25,
        logfc.threshold = 0.25
      )
      
      # Add gene name and cluster information
      markers$gene <- rownames(markers)
      markers$cluster <- cluster_id
      
      # Save complete results to a separate CSV
      cluster_file <- file.path(res_dir, paste0("cluster_", cluster_id, "_markers.csv"))
      write_csv(markers, cluster_file)
      
      # directly merge all genes into the total table
      all_marks_summary <- bind_rows(all_marks_summary, markers)
    }
    
    # Save COMPLETE summary table for current resolution
    summary_file <- file.path(table_dir, dataset_name, paste0("res_", res, "_all_markers.csv"))
    write_csv(all_marks_summary, summary_file)
    message(sprintf("Saved ALL markers for res=%g to: %s", res, summary_file))
  }
  
  return(scdata)
}

↓最后是这个我没有真正用上的细胞类型标记基因数据库。写这个数据库是为了自动根据标记基因来推断cluster的细胞类型。不过后来发现用不着自己做,而且项目实际上不大(满打满算4个月的独立项目),所以没来得及用上这个数据库就手动解决掉了问题。但这玩意可是个S3对象,有class的,我甚至写了add_markers函数拿来扩展数据库,我对此很满意,所以在这里分享出来,要是有人喜欢自己拿走吧(笑)

In [ ]:
# Marker_gene_dictionary
# Create a cell type-marker gene database (expandable S3 object)
create_marker_db <- function() {
  marker_db <- list(
    "T cell" = c("CD3D", "IL7R", "CD4", "CD8A"),
    "B cell" = c("CD19", "MS4A1", "CD79A"),
    "Macrophages" = c("LYZ", "CD14", "FCGR3A"),
    "NK cell" = c("GNLY", "NKG7"),
    "Melanocytes" = c("TYR", "MITF", "SOX10"),
    "Malignant cell" = c("BRAF", "CDKN2A", "TP53"),
    "Tumor-associated fibroblasts" = c("ACTA2","CTSK","CCND1","BCAN")
  )
  class(marker_db) <- "cell_marker_db"
  return(marker_db)
}

# Generic function for updating the database (for subsequent expansion)
add_markers <- function(db, cell_type, genes) {
  UseMethod("add_markers")
}

add_markers.cell_marker_db <- function(db, cell_type, genes) {
  db[[cell_type]] <- unique(c(db[[cell_type]], genes))
  return(db)
}

# Initialize the database
marker_db <- create_marker_db()

今天就到这里了,这周早些时候完成了组内的文献汇报,准备汇报真的好累,明天说不定就休息了。看看情况,明天是不是休息日就看明天有没有技术博客了(笑)