library(janitor)
library(tidyr)
library(dplyr)
library(stringr)

# ================================
# 1. 清洗函数
# ================================
#' Standardise the column names of a data frame.
#'
#' The current names are passed through `janitor::make_clean_names()` and the
#' result is assigned back as the object's names.
#'
#' @param df A data frame (or other named object).
#' @return `df` with its names replaced by the cleaned names.
clean_colnames <- function(df) {
  cleaned <- janitor::make_clean_names(names(df))
  stats::setNames(df, cleaned)
}

#' Drop empty rows and empty columns.
#'
#' Thin wrapper around `janitor::remove_empty()` that requests both the
#' `"rows"` and the `"cols"` targets in a single call.
#'
#' @param df A data frame.
#' @return The value returned by `janitor::remove_empty()` for `df`.
drop_empty <- function(df) {
  janitor::remove_empty(df, which = c("rows", "cols"))
}

#' Trim surrounding whitespace from every character column.
#'
#' Note: despite its name, this step performs no type conversion; it only
#' applies `trimws()` to the columns selected by `where(is.character)`.
#'
#' @param df A data frame.
#' @return `df` with all character columns whitespace-trimmed.
fix_types <- function(df) {
  mutate(df, across(where(is.character), trimws))
}

#' Lower-case every character column.
#'
#' Applies `tolower()` to the columns selected by `where(is.character)`;
#' all other columns are left as they are.
#'
#' @param df A data frame.
#' @return `df` with all character columns converted to lower case.
normalize_strings <- function(df) {
  mutate(df, across(where(is.character), tolower))
}

#' Remove duplicated rows.
#'
#' Calls `distinct()` with no column arguments, so rows are compared across
#' all columns.
#'
#' @param df A data frame.
#' @return `df` reduced to its distinct rows.
remove_duplicates <- function(df) {
  distinct(df)
}

#' Replace missing values in numeric columns with the column median.
#'
#' For every column selected by `where(is.numeric)`, `NA` entries are
#' overwritten with `median(col, na.rm = TRUE)`.
#'
#' Columns without any `NA` (including zero-length columns) are returned
#' untouched. The previous `ifelse()`-based version built its result from the
#' logical test vector, which turned numeric columns of a zero-row data frame
#' into `logical(0)` and dropped column attributes.
#'
#' @param df A data frame.
#' @return `df` with `NA`s in numeric columns replaced by that column's
#'   median. A column that is entirely `NA` stays `NA`, because its median is
#'   `NA` as well.
impute_missing <- function(df) {
  fill_with_median <- function(col) {
    missing <- is.na(col)
    # Nothing to impute: hand the column back unchanged so its type and
    # attributes survive (this also covers zero-length columns).
    if (any(missing)) {
      col[missing] <- median(col, na.rm = TRUE)
    }
    col
  }

  df %>% mutate(across(where(is.numeric), fill_with_median))
}

# ================================


# ================================
# 2. 日志函数
# ================================
#' Print an informational log line to standard output.
#'
#' Each element of `msg` is formatted as `"[INFO] <msg>"` followed by a
#' newline and written with `cat()`.
#'
#' @param msg Message text to log.
#' @return The result of `cat()`, i.e. `NULL`, invisibly.
log_step <- function(msg) {
  formatted <- sprintf("[INFO] %s\n", msg)
  cat(formatted)
}


# ================================
# 3. 主 Pipeline
# ================================
#' Run the data-cleaning pipeline on a data frame.
#'
#' The enabled built-in steps are applied in a fixed order (column names,
#' empty rows/cols, whitespace trimming, lower-casing, de-duplication, median
#' imputation), followed by any user-supplied `extra_steps`. A log line is
#' emitted via `log_step()` before each step and once at the end.
#'
#' Each step is logged by name. Built-in steps use their function name; extra
#' steps use their list name, or `extra_step_<i>` when unnamed. (Previously
#' `deparse(substitute(f))` was used on the loop variable, which yields the
#' deparsed function source rather than a name and flooded the log.)
#'
#' @param df A data frame to clean.
#' @param do_colnames Run `clean_colnames()`?
#' @param do_drop_empty Run `drop_empty()`?
#' @param do_fix_types Run `fix_types()`?
#' @param do_normalize Run `normalize_strings()`?
#' @param do_dedup Run `remove_duplicates()`?
#' @param do_impute Run `impute_missing()`?
#' @param extra_steps A list of functions (optionally named), each taking a
#'   data frame and returning a data frame; run after the built-in steps. A
#'   single bare function is also accepted.
#' @return The data frame produced by the last step.
clean_pipeline <- function(df,
                           do_colnames   = TRUE,
                           do_drop_empty = TRUE,
                           do_fix_types  = TRUE,
                           do_normalize  = TRUE,
                           do_dedup      = TRUE,
                           do_impute     = TRUE,
                           extra_steps   = list()) {
  # A bare function used to work because c() wrapped it; keep accepting it.
  if (is.function(extra_steps)) {
    extra_steps <- list(extra_steps)
  }

  # Fail before any step runs instead of halfway through the pipeline.
  is_fn <- vapply(extra_steps, is.function, logical(1))
  if (!all(is_fn)) {
    stop("`extra_steps` must be a function or a list of functions.",
         call. = FALSE)
  }

  # Built-in steps in execution order; a disabled step evaluates to NULL and
  # is filtered out below.
  builtin_steps <- list(
    clean_colnames    = if (do_colnames) clean_colnames,
    drop_empty        = if (do_drop_empty) drop_empty,
    fix_types         = if (do_fix_types) fix_types,
    normalize_strings = if (do_normalize) normalize_strings,
    remove_duplicates = if (do_dedup) remove_duplicates,
    impute_missing    = if (do_impute) impute_missing
  )
  builtin_steps <- Filter(Negate(is.null), builtin_steps)

  # Label extra steps: use the supplied name, else a positional placeholder.
  extra_names <- names(extra_steps)
  if (is.null(extra_names)) {
    extra_names <- rep("", length(extra_steps))
  }
  unnamed <- is.na(extra_names) | extra_names == ""
  extra_names[unnamed] <-
    sprintf("extra_step_%d", seq_along(extra_steps))[unnamed]

  steps <- c(builtin_steps, extra_steps)
  step_names <- c(names(builtin_steps), extra_names)

  # Run the pipeline.
  for (i in seq_along(steps)) {
    log_step(paste("Running step:", step_names[[i]]))
    df <- steps[[i]](df)
  }

  log_step("Pipeline completed.")
  df
}


# ================================
# 4. 使用
# ================================
# df_clean <- clean_pipeline(raw_df)

# 带额外步骤
# df_clean <- clean_pipeline(raw_df, extra_steps = list(
#   function(x) mutate(x, new_col = row_number())
# ))
# 2026年4月7日 研究日志

# 2026年4月7日 研究日志