import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

np.random.seed(42)  # fixed seed so every run draws the same random numbers

# -----------------------------
# 1. Parameters: two cell types, measured before and after a stimulus
# -----------------------------
# Number of simulated cells in each condition.
n_cells_before = 5000
n_cells_after = 5000

# Cell-type composition: type A is the minority and type B the majority
# before the stimulus; afterwards type A has expanded to most of the cells.
prop_A_before = 0.2
prop_B_before = 0.8
prop_A_after = 0.7
prop_B_after = 0.3

# Mean gene expression per cell type before the stimulus.
mu_A_before = 10.0  # type A: high expression
mu_B_before = 3.0  # type B: low expression

# Mean gene expression after the stimulus. Expression drops *within* both
# cell types -- this is the key assumption of the simulation.
mu_A_after = 7.0  # decrease inside type A
mu_B_after = 1.5  # decrease inside type B

# Expression noise (used as the scale of the normal draws).
sigma = 1.0

# -----------------------------
# 2. 生成单细胞数据
# -----------------------------
def simulate_condition(n_cells, prop_A, mu_A, mu_B, sigma, label, *, rng=None):
    """Simulate single-cell expression values for one condition.

    The population is split into cell types "A" and "B". Expression of each
    cell is drawn from a normal distribution centred on its type's mean.

    Parameters
    ----------
    n_cells : int
        Total number of cells to simulate.
    prop_A : float
        Fraction of cells that are type A; must lie in [0, 1].
    mu_A, mu_B : float
        Mean expression of type A and type B cells.
    sigma : float
        Scale (standard deviation) of the normal noise, shared by both types.
    label : str
        Value stored in the ``condition`` column for every row.
    rng : optional
        Object providing ``normal(loc, scale, size)``, e.g. a
        ``numpy.random.Generator``. When omitted, NumPy's global random
        state (``np.random``) is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per cell with columns ``condition``, ``cell_type`` and
        ``expression``; all type A rows come first, then type B.

    Raises
    ------
    ValueError
        If ``prop_A`` is outside [0, 1].
    """
    if not 0.0 <= prop_A <= 1.0:
        raise ValueError(f"prop_A must be between 0 and 1, got {prop_A!r}")

    sampler = np.random if rng is None else rng

    # Round instead of truncating: floating-point error can leave the product
    # just below the intended integer (100 * 0.29 == 28.999999999999996),
    # and plain int() would then silently drop one type A cell.
    n_A = int(round(n_cells * prop_A))
    n_B = n_cells - n_A

    # Type A is drawn before type B so the random stream is consumed in a
    # fixed order.
    expr_A = sampler.normal(mu_A, sigma, n_A)
    expr_B = sampler.normal(mu_B, sigma, n_B)

    return pd.DataFrame({
        "condition": label,
        "cell_type": ["A"] * n_A + ["B"] * n_B,
        "expression": np.concatenate([expr_A, expr_B]),
    })

# Simulate the unstimulated population first and the stimulated one second
# (this order fixes how the random stream is consumed), then stack both
# into a single long table with a fresh 0..N-1 index.
df_before = simulate_condition(
    n_cells=n_cells_before,
    prop_A=prop_A_before,
    mu_A=mu_A_before,
    mu_B=mu_B_before,
    sigma=sigma,
    label="Before",
)
df_after = simulate_condition(
    n_cells=n_cells_after,
    prop_A=prop_A_after,
    mu_A=mu_A_after,
    mu_B=mu_B_after,
    sigma=sigma,
    label="After",
)

df = pd.concat((df_before, df_after), ignore_index=True)

# -----------------------------
# 3. Mean expression: per cell type vs. bulk
# -----------------------------
# Ordered categorical so that "Before" is placed to the left of "After".
df["condition"] = pd.Categorical(
    df["condition"], categories=["Before", "After"], ordered=True
)

# `observed` is spelled out because grouping by a categorical key otherwise
# relies on a deprecated implicit default (pandas emits a FutureWarning).
# observed=False keeps the behaviour this code has had so far.
group_means = (
    df.groupby(["condition", "cell_type"], observed=False)["expression"]
    .mean()
    .reset_index()
)

# Bulk mean: average over all cells of a condition, ignoring cell type.
bulk_means = (
    df.groupby("condition", observed=False)["expression"]
    .mean()
    .reset_index()
    .assign(cell_type="Bulk")
)

# Wide layout: one row per cell type, one column per condition.
pivot_table = group_means.pivot(
    index="cell_type", columns="condition", values="expression"
)

# `display` is provided by IPython/Jupyter only; fall back to `print` so the
# code also runs as a plain Python script instead of raising NameError.
try:
    display
except NameError:
    display = print

display(pivot_table)
display(bulk_means)

# -----------------------------
# 4. Visualisation
# -----------------------------
sns.set(style="whitegrid", context="talk")

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: within each cell type, expression goes down after the stimulus.
# Right panel: the bulk (all-cell) average nevertheless goes up.
panel_specs = (
    (group_means, "Within each cell type\n(expression decreases after stimulus)"),
    (bulk_means, "Bulk measurement\n(overall expression increases)"),
)

for ax, (panel_data, panel_title) in zip(axes, panel_specs):
    sns.barplot(
        data=panel_data,
        x="cell_type",
        y="expression",
        hue="condition",
        ax=ax,
    )
    ax.set_title(panel_title)
    # Drop the x-axis label and use the same y-axis label on both panels.
    ax.set_xlabel("")
    ax.set_ylabel("Expression")

plt.tight_layout()
plt.show()

# Cell-type composition: type A is the minority and type B the majority
# before the stimulus; afterwards type A has expanded to most of the cells.
prop_A_before = 0.2
prop_B_before = 0.8
prop_A_after = 0.7
prop_B_after = 0.3

2026年4月15日 研究日志

首先，从“肥胖悖论”开始吧

为什么要从这里讲起？

为何是单细胞？

为什么单细胞测序是解决这一悖论的终极方案？

单细胞测序让我们第一次“看见”每个细胞，而不是平均值

单细胞测序天然避免细胞组成偏倚

单细胞测序让我们第一次能够“解构”组织层面的信号

最后，从肥胖悖论到单细胞测序，我们学到的是什么？

condition	Before	After
cell_type
A	10.019332	6.983551
B	3.002169	1.505469

	condition	expression	cell_type
0	Before	4.405602	Bulk
1	After	5.340126	Bulk

2026年4月15日 研究日志

首先，从“肥胖悖论”开始吧

为什么要从这里讲起？

为何是单细胞？

为什么单细胞测序是解决这一悖论的终极方案？

单细胞测序让我们第一次“看见”每个细胞，而不是平均值

单细胞测序天然避免细胞组成偏倚

单细胞测序让我们第一次能够“解构”组织层面的信号

最后，从肥胖悖论到单细胞测序，我们学到的是什么？

2026年4月15日 研究日志

首先，从“肥胖悖论”开始吧