Prepare data input#
[ ]:
import os
import sys
os.environ["OMP_NUM_THREADS"] = "11"
os.environ["OPENBLAS_NUM_THREADS"] = "8" # export OPENBLAS_NUM_THREADS=4
os.environ["MKL_NUM_THREADS"] = "11" # export MKL_NUM_THREADS=6
os.environ["VECLIB_MAXIMUM_THREADS"] = "8" # export VECLIB_MAXIMUM_THREADS=4
os.environ["NUMEXPR_NUM_THREADS"] = "11" # export NUMEXPR_NUM_THREADS=6
os.environ["NUMBA_CACHE_DIR"]='/tmp/numba_cache'
import numpy as np
import pandas as pd
In this notebook, we will demonstrate how to prepare input of multi-omics data for scVAEIT. To begin with, we use a simple example with toy data.
Toy data#
We first illustrate the procedure of preparing input data for scVAEIT. Consider the following example, where we have 2 datasets, each with 2 modalities measured.
Dataset |
RNA |
ADT |
ATAC |
|---|---|---|---|
1 |
√ |
√ |
|
2 |
√ |
√ |
We will use similar data structure as annData to demonstrate how to prepare input data for scVAEIT.
Raw data#
Counts#
The counts can be represented as a list of tuples, where each tuple contains the count matrix for each modality. For example, the input data can be represented as:
[
(dat1_count_rna, dat1_count_adt, None)
(dat2_count_rna, None, dat2_count_atac)
]
Each of the variable is a numpy array or None.
Features#
The feature names used to align different datasets can be represented as a list of tuples, where each tuple contains the feature names for each modality. For example, the feature names can be represented as:
[
(dat1_var_rna, dat1_var_adt, None)
(dat2_var_rna, None, dat2_var_atac)
]
Each of the variable can be either a dataframe or a numpy array, or None.
Metadata#
The metadata can also be represented by a list of arrays, where each array contains the metadata for each modality. For example, the metadata can be represented as:
[
dat1_obs,
dat2_obs
]
Each of the variable can be either a dataframe or a numpy array.
[2]:
# Toy examples
n1 = 2; p1 = [2, 2, 0]
dat1_count_rna, dat1_count_adt, dat1_count_atac = np.ones((n1, 2)), np.ones((n1, 2)), None
dat1_var_rna, dat1_var_adt, dat1_var_atac = np.array(['gene1', 'gene2']), np.array(['adt1', 'adt2']), None
dat1_obs = pd.DataFrame({'covariate': [1, 2], 'dataset_id': [0, 0]})
n2 = 3; p2 = [3, 0, 3]
dat2_count_rna, dat2_count_adt, dat2_count_atac = 2*np.ones((n2, 3)), None, 2*np.ones((n2, 3))
dat2_var_rna, dat2_var_adt, dat2_var_atac = np.array(['gene1', 'gene2', 'gene3']), None, np.array(['atac1', 'atac2', 'atac3'])
dat2_obs = pd.DataFrame({'covariate': [1, 2, 3], 'dataset_id': [1, 1, 1]})
Vertical concatenation of single modality of multiple datasets#
[3]:
from functools import reduce
from tqdm import tqdm
def vertical_concat(
counts, vars, obs, compact_mask=False
):
'''
Concatenate multiple datasets of one modality for mosaic integration.
Parameters
----------
counts : list
A list of counts from different datasets each of shape (n_i, p_i), None if missing.
vars : list
A list of pandas.Dataframe or numpy.array, each of shape (p_i, ), None if missing
Variables from different datasets used to align across datasets.
obs : list
A list of pandas.Dataframe or numpy.array, each of shape (n_i, c), where c is the number of covariates/batches to be adjust.
The covariates/batches from different datasets, the last column should be the dataset id.
The measurements should be the same across different datasets.
compact_mask : bool
If True, the mask is of shape (N, p) where N is the number of datasets and p=sum_i p_i is the number of features;
otherwise it is a matrix of shape (n, p), where n=sum_i n_i is the number of samples.
Returns
-------
data : np.ndarray
The concatenated data matrix.
batches : np.ndarray
The covariates for each cell.
masks : tf.Tensor
The masks for missing pattern.
'''
n_datasets = len(counts)
# Check the input data
# and determine the total number of unique features
total_features = 0
features = []
for i in range(n_datasets):
if isinstance(obs[i], pd.DataFrame):
obs[i] = obs[i].values
elif not isinstance(obs[i], np.ndarray):
raise ValueError(f"obs[{i}] must be a DataFrame or numpy array")
if obs[i].ndim != 2:
obs[i] = obs[i][:,None]
if isinstance(vars[i], pd.DataFrame):
vars[i] = np.array(vars[i].columns)
elif vars[i] is None:
continue
elif not isinstance(vars[i], np.ndarray):
raise ValueError(f"vars[{i}] must be a DataFrame or numpy array")
features.append(vars[i])
features = reduce(np.union1d, *features)
total_features = len(features)
print('Total number of features:', total_features)
concatenated_data = []
concatenated_batches = []
concatenated_masks = []
print('Concatenate {} datasets...'.format(n_datasets))
for i in tqdm(range(n_datasets)):
data = np.zeros((obs[i].shape[0], total_features), dtype=np.float32)
if compact_mask:
mask = -np.ones((1, total_features), dtype=np.float32)
else:
mask = -np.ones((obs[i].shape[0], total_features), dtype=np.float32)
if counts[i] is not None:
id_obs_features = np.where(np.isin(vars[i], features))[0]
data[:,id_obs_features] = counts[i]
mask[:,id_obs_features] = 0
concatenated_data.append(data)
concatenated_batches.append(obs[i])
concatenated_masks.append(mask)
data = np.vstack(concatenated_data)
batches = np.vstack(concatenated_batches)
masks = np.vstack(concatenated_masks)
return data, batches, masks
[4]:
# Call the function
data_rna, batches, masks_rna = vertical_concat([dat1_count_rna, dat2_count_rna], [dat1_var_rna, dat2_var_rna], [dat1_obs, dat2_obs])
# Print results
print("Data:\n", data_rna)
print("Batches:\n", batches)
print("Masks:\n", masks_rna)
Total number of features: 3
Concatenate 2 datasets...
100%|██████████| 2/2 [00:00<00:00, 2384.48it/s]
Data:
[[1. 1. 0.]
[1. 1. 0.]
[2. 2. 2.]
[2. 2. 2.]
[2. 2. 2.]]
Batches:
[[1 0]
[2 0]
[1 1]
[2 1]
[3 1]]
Masks:
[[ 0. 0. -1.]
[ 0. 0. -1.]
[ 0. 0. 0.]
[ 0. 0. 0.]
[ 0. 0. 0.]]
Similarly, we can concatenate the ADT and ATAC modalities, separately. Because batches would be the same for every modality, we simply ignore the output for it below.
[5]:
data_adt, _, masks_adt = vertical_concat([dat1_count_adt, dat2_count_adt], [dat1_var_adt, dat2_var_adt], [dat1_obs, dat2_obs])
data_atac, _, masks_atac = vertical_concat([dat1_count_atac, dat2_count_atac], [dat1_var_atac, dat2_var_atac], [dat1_obs, dat2_obs])
Total number of features: 2
Concatenate 2 datasets...
100%|██████████| 2/2 [00:00<00:00, 12985.46it/s]
Total number of features: 3
Concatenate 2 datasets...
100%|██████████| 2/2 [00:00<00:00, 14716.86it/s]
100%|██████████| 2/2 [00:00<00:00, 14716.86it/s]
Horizontal concatenation of multimodalities#
[6]:
def horizontal_concat(
vertical_concated_data, vertical_concated_masks
):
'''
Concatenate multiple modalities of datasets for mosaic integration.
Parameters
----------
vertical_concated_data : list
A list of counts from different modalities.
vertical_concated_masks : list
A list of masks from different modalities.
Returns
-------
data : np.ndarray
The concatenated data matrix.
masks : tf.Tensor
The masks for missing pattern.
'''
n_modalities = len(vertical_concated_data)
# Initialize lists to store concatenated results
concatenated_data = []
concatenated_masks = []
# Determine the total number of features
dim_block = np.array([data.shape[1] for data in vertical_concated_data])
print('Concatenate {} modalities...'.format(n_modalities))
for i in tqdm(range(n_modalities)):
data = vertical_concated_data[i]
masks = vertical_concated_masks[i]
concatenated_data.append(data)
concatenated_masks.append(masks)
# Concatenate data, batches, and masks horizontally
data = np.hstack(concatenated_data)
masks = np.hstack(concatenated_masks)
return data, masks, dim_block
[7]:
# Call the function
data, masks, dim_block = horizontal_concat([data_rna, data_adt, data_atac], [masks_rna, masks_adt, masks_atac])
# Print results
print("Data:\n", data)
print("Masks:\n", masks)
print('Dim of modalities:', dim_block)
Concatenate 3 modalities...
100%|██████████| 3/3 [00:00<00:00, 62291.64it/s]
Data:
[[1. 1. 0. 1. 1. 0. 0. 0.]
[1. 1. 0. 1. 1. 0. 0. 0.]
[2. 2. 2. 0. 0. 2. 2. 2.]
[2. 2. 2. 0. 0. 2. 2. 2.]
[2. 2. 2. 0. 0. 2. 2. 2.]]
Masks:
[[ 0. 0. -1. 0. 0. -1. -1. -1.]
[ 0. 0. -1. 0. 0. -1. -1. -1.]
[ 0. 0. 0. -1. -1. 0. 0. 0.]
[ 0. 0. 0. -1. -1. 0. 0. 0.]
[ 0. 0. 0. -1. -1. 0. 0. 0.]]
Dim of modalities: [3 2 3]