Hunman Breast Cancer Clustering Analysis

2_HBC_Clustering

2: Human breast cancer_Clustering¶

HiSTaR can process 10X Visium data to extract low-dimension representation. In this tutorial, we use HBC dataset to introduce the analysis.
The raw breast cancer dataset can be downloaded from: https://www.10xgenomics.com/datasets/human-breast-cancer-block-a-section-1-1-standard-1-1-0;
The complete experimental dataset is available here https://zenodo.org/records/15599070

In [1]:

import scanpy as sc
import matplotlib.pyplot as plt
from pathlib import Path
import torch
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:

import HiSTaR
HiSTaR.fix_seed(2023)

In [3]:

data_root = Path('../data/BRCA1')
data_dir = data_root / 'V1_Human_Breast_Cancer_Block_A_Section_1'
adata = sc.read_visium(path=data_dir , count_file='filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()

In [4]:

#Normalization
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=2000)  
sc.pp.log1p(adata)

from sklearn.decomposition import PCA
adata_X = PCA(n_components=200, random_state=42).fit_transform(adata.X)
adata.obsm['X_pca'] = adata_X

In [5]:

adata

Out[5]:

AnnData object with n_obs × n_vars = 3798 × 36601
    obs: 'in_tissue', 'array_row', 'array_col'
    var: 'gene_ids', 'feature_types', 'genome', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'spatial', 'hvg', 'log1p'
    obsm: 'spatial', 'X_pca'

Constructing the spatial network¶

In [6]:

# construct graph
graph_dict = HiSTaR.graph_construction(adata, 6)

Running HiSTaR¶

In [7]:

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
histar_net = HiSTaR.histar(adata.obsm['X_pca'], graph_dict, device=device)
histar_net.train()
histar_feat, _, _, _ = histar_net.process()
adata.obsm['HiSTaR'] = histar_feat

Clusting¶

In [8]:

#HiSTaR.configure_r_environment()   # If you encounter problems loading R packages, you can manually configure your path in this function.

In [9]:

HiSTaR.mclust_R(adata, n_clusters=20, use_rep='HiSTaR', key_added='HiSTaR_clust')

R[write to console]:                    __           __ 
   ____ ___  _____/ /_  _______/ /_
  / __ `__ \/ ___/ / / / / ___/ __/
 / / / / / / /__/ / /_/ (__  ) /_  
/_/ /_/ /_/\___/_/\__,_/____/\__/   version 6.1.1
Type 'citation("mclust")' for citing this R package in publications.

fitting ...
  |======================================================================| 100%

Out[9]:

AnnData object with n_obs × n_vars = 3798 × 36601
    obs: 'in_tissue', 'array_row', 'array_col', 'HiSTaR_clust'
    var: 'gene_ids', 'feature_types', 'genome', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'spatial', 'hvg', 'log1p'
    obsm: 'spatial', 'X_pca', 'HiSTaR'

Visualization¶

In [10]:

plt.rcParams.update({
    "figure.figsize": (4, 4),
    "figure.facecolor": "white",  
    "axes.facecolor": "white" 
})

plt.figure(facecolor='white')
sc.pl.spatial(
    adata,
    img_key=None,
    color="HiSTaR_clust",
    size=1.5,
    title="HiSTaR",
    frameon=True,           
    legend_loc=None,        
    show=False,         
)
plt.tight_layout()
# plt.savefig('HBC_spatial.png',dpi=300, bbox_inches='tight')

<Figure size 400x400 with 0 Axes>

No description has been provided for this image

In [11]:

sc.pp.neighbors(adata, use_rep='HiSTaR')
sc.tl.umap(adata)
plt.rcParams["figure.figsize"] = (4, 3)
sc.pl.umap(
    adata,
    color='HiSTaR_clust',
    title='HiSTaR',
    show=False,
    s=16,
    frameon=False
)
plt.tight_layout()
# plt.savefig('HBC_umap.png',dpi=300, bbox_inches='tight')

In [12]:

def get_cluster_markers(
    adata,
    cluster_key='HiSTaR_clust',
    n_markers=3,
    pval_cutoff=1e-5,
    logfc_cutoff=2,
    allow_partial=True,
    key_added='diff_genes'
):
    sc.tl.rank_genes_groups(
        adata,
        groupby=cluster_key,
        method='wilcoxon',
        key_added=key_added,
        pts=True
    )

    clusters = adata.obs[cluster_key].cat.categories.astype(str).tolist()
    marker_df = pd.DataFrame()
    for cluster in clusters:
        try:
            de_df = sc.get.rank_genes_groups_df(
                adata,
                group=cluster,
                key=key_added
            ).sort_values('logfoldchanges', key=abs, ascending=False)
            
            filtered = de_df[
                (de_df['pvals_adj'] < pval_cutoff) &
                (de_df['logfoldchanges'].abs() > logfc_cutoff)
            ]
            
            if len(filtered) < n_markers:
                filtered = de_df[
                    (de_df['pvals_adj'] < 0.05) &
                    (de_df['logfoldchanges'].abs() > logfc_cutoff)
                ]
            
            top_genes = filtered.head(n_markers)['names'].tolist()
            if allow_partial:
                genes = top_genes
            else:
                genes = top_genes + [f'None_{i}' for i in range(n_markers - len(top_genes))]
            
            cluster_df = pd.DataFrame(
                [genes],
                columns=[f'Gene{i+1}' for i in range(n_markers)],
                index=[f'Cluster_{cluster}']
            )
            marker_df = pd.concat([marker_df, cluster_df])
            
        except Exception as e:
            print(f"Cluster {cluster} error: {str(e)}")
            continue
    
    return marker_df


n_markers=3
marker_df = get_cluster_markers(
    adata,
    n_markers=n_markers,
    pval_cutoff=1e-5,
    logfc_cutoff=1,
    allow_partial=False
)

selected_genes = [g for g in marker_df.values.flatten() if g is not None]
print(marker_df)

                Gene1       Gene2         Gene3
Cluster_1      ARPP21    Z82214.2         SPAG6
Cluster_2     SLC31A2       TRIT1        ZSCAN1
Cluster_3      CRISP3     SLITRK6  KIAA2012-AS1
Cluster_4        DSC3       MMP10         KRT6B
Cluster_5      CLEC3A       LIN7A         THBS4
Cluster_6       IGSF3      TSEN15     HIST1H2BC
Cluster_7         ALB        TCF4         PGGHG
Cluster_8   LINC00052  AC129507.4        SNAP25
Cluster_9    PGM5-AS1  AC087379.2         KRT37
Cluster_10       KRT5       KRT17         KRT14
Cluster_11     KIF20A      CHRDL1       NKIRAS1
Cluster_12     FCGR3B        CPB1         CELF3
Cluster_13  LINC00645        CDSN       RASSF10
Cluster_14       CPB1         LTF         NPY1R
Cluster_15       SYT1   MRPS30-DT         PDE5A
Cluster_16    RASGRP2        JAML          IL7R
Cluster_17       DLX2       KLK11     LINC02551
Cluster_18      IGLC2       IGHG3         IGLC3
Cluster_19     CRISP3  AC089998.4    AC089998.1
Cluster_20     CRISP3     SLITRK6        ABCC12

In [13]:

import scanpy as sc
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

colors = ["#FFFFFF", "#CAB2D6", "#6A3D9B", "#2E0A4B"]
custom_cmap = LinearSegmentedColormap.from_list("graphst_cmap", colors)

plt.rcParams.update({
    'figure.facecolor': 'white',   
    'savefig.facecolor': 'white',  
    'axes.facecolor': 'white',     
    'axes.edgecolor': 'black',     
    'axes.labelcolor': 'black',    
    'xtick.color': 'black',        
    'ytick.color': 'black',
})

fig = sc.pl.spatial(
    adata,
    color=selected_genes,
    ncols=6,
    spot_size=200,
    img_key=None,
    cmap=custom_cmap,
    vmin='p1',
    vmax='p99',
    frameon=False,    
    return_fig=True,
    show=False
)

for ax in fig.axes:
    ax.set_facecolor('white')

# fig.savefig(
#     f'clust_{n_markers}_white.png',
#     dpi=600,
#     bbox_inches='tight',
#     facecolor='white',   
#     pad_inches=0
# )

In [14]:

import matplotlib
import matplotlib.pyplot as plt
import scanpy as sc

matplotlib.rcParams.update({
    'font.family': 'Arial',
    'font.size': 14,
    'axes.labelcolor': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
    'axes.titlecolor': 'black',
    'figure.facecolor': 'white',  
    'axes.facecolor': 'white',    
    'savefig.facecolor': 'white'
})


custom_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "custom_gradient", 
    ["#FFE5E5", "#FF7F7F", "#D93F3F", "#8B0000"]
)


dot_plot = sc.pl.DotPlot(
    adata,
    var_names=selected_genes,
    groupby='HiSTaR_clust',
    standard_scale='var',
    cmap=custom_cmap,
    vmin=0,
    vmax=1
)


dot_plot.make_figure()
for ax in plt.gcf().axes:
    ax.set_facecolor('white')

# plt.savefig(
#     f'dot_plot_{n_markers}_white.png',
#     bbox_inches='tight',
#     dpi=600,
#     facecolor='white',
#     pad_inches=0
# )