step by step example( for developer)

import

import sys
import numpy as np
import vireoSNP
from vireoSNP.utils.vireo_base import match
from vireoSNP.utils.vcf_utils import load_VCF, write_VCF, parse_donor_GPb
from vireoSNP.utils.vcf_utils import read_sparse_GeneINFO, GenoINFO_maker
import matplotlib.pyplot as plt
import pandas as pd
from vireoSNP.utils.io_utils import match_donor_VCF
import re as re
import os
## set workdiretory on vireobulk's folder
os.chdir("/home/ccheng/Vireobulk_analysis/")
## import seperated function from vireo-bulk's base file
from base import merged_VCF_to_sdf,rematch_merge_vcf_annotated,find_all,removedfault,prediction_bulk

loadfile & data clean

##donor vcf is the donor reference
##bulk_merged_data is the piled up result
##df_anno is the preprocessed annovar annotation of SNPs
##the pipeline of generating all three input files is in preprocessing https://vireobulk-analysis.readthedocs.io/en/latest/preprocessing.html
donor_vcf=load_VCF(   "/media/wgs/subset_stimulation_bam/filter_pbmc10donors.vcf.gz",    biallelic_only=True,    load_sample=True, sparse=False , format_list=None)
bulk_merged_data is =load_VCF("/media/wgs/subset_stimulation_bam/10pbmcdonor/cellSNP.base.vcf.gz", biallelic_only=True,load_sample=False,format_list=None,)
df_anno=pd.read_csv("/media/wgs/subset_stimulation_bam/10pbmcdonor/pbmcvcf_annotated_processed.csv")
donor_number=10
#this step is to ensure the order and numbers of SNPs of three files are consistent and remove unannoatation ./. (probilibity 1/3 of 3 genotype)

df_b,df_d,m_GT=rematch_merge_vcf_annotated(bulk_merged_data,donor_vcf,df_anno)
df_b,df_d,m_GT=removedfault(df_b,df_d,m_GT)
df_b
variants AD DP OTH
0 chr1_14574_A_G 2.0 35.0 1.0
1 chr1_14590_G_A 0.0 50.0 0.0
2 chr1_14599_T_A 0.0 53.0 2.0
3 chr1_14604_A_G 1.0 60.0 0.0
4 chr1_14610_T_C 1.0 62.0 1.0
... ... ... ... ...
128180 chrX_155999818_C_T 66.0 80.0 0.0
128181 chrX_155999869_A_G 88.0 89.0 0.0
128182 chrX_155999957_G_C 73.0 86.0 0.0
128183 chrX_156025297_G_A 65.0 195.0 1.0
128187 chrY_3120449_C_G 0.0 28.0 1.0

127155 rows × 4 columns

df_d
variants function genes avsnp GT
0 chr1_14574_A_G ncRNA_exonic WASH7P rs28503599 [0/0, 0/1, 0/1, 0/0, 0/0, 0/0, 0/1, 0/0, 0/1, ...
1 chr1_14590_G_A ncRNA_exonic WASH7P rs707679 [0/0, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/0, 0/0, ...
2 chr1_14599_T_A ncRNA_exonic WASH7P rs707680 [0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ...
3 chr1_14604_A_G ncRNA_exonic WASH7P rs541940975 [0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ...
4 chr1_14610_T_C ncRNA_exonic WASH7P rs878986575 [0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ...
... ... ... ... ... ...
128180 chrX_155999818_C_T intronic IL9R rs3093470 [1/1, 0/0, 1/1, 1/1, 0/1, 0/1, 0/1, 1/1, 1/1, ...
128181 chrX_155999869_A_G intronic IL9R rs3091261 [1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, ...
128182 chrX_155999957_G_C intronic IL9R rs3093472 [1/1, 0/0, 1/1, 1/1, 0/1, 0/1, 0/1, 1/1, 1/1, ...
128183 chrX_156025297_G_A downstream DDX11L16 rs185932868 [0/1, 0/0, 0/1, 0/0, 0/1, 0/0, 0/0, 0/1, 0/1, ...
128187 chrY_3120449_C_G intergenic LINC00278;TGIF2LY rs778218266 [0/1, 1/1, 0/1, 1/1, 0/0, 1/1, 1/1, 0/0, 0/1, ...

127155 rows × 5 columns

##processing data to vector, import to model
gene_unique=np.unique(df_d["genes"].tolist())
a=np.array(df_b["AD"])
d=np.array(df_b["DP"])
a=np.array(list(map(float,a)))
d=np.array(list(map(float,d)))

run donor decomposition

## establish model ,only parameter is the number of donors
model =vireoSNP.VireoBulk(donor_number)
## fit model learn_theta is the flag whether using learned allele frequency ,check its meaning  on document and manuscript
model.fit(a,d,m_GT,learn_theta=True )
## this results represent high confidence
model.theta
array([0.01164797, 0.44852633, 0.99267153])
donor_vcf['samples']
['18-G-017_R1',
 '18-G-018_R1',
 '18-G-019_R1',
 '18-G-020_R1',
 '18-G-021_R1',
 '18-G-022_R1',
 '18-G-023_R1',
 '18-G-024_R1',
 '18-G-025_R1',
 '18-G-026_R1']
## the percentage of sample donors , the order is  the same as donor_vcf['samples']
model.psi
array([0.06418119, 0.04299516, 0.05283177, 0.08967812, 0.14387843,
       0.06906367, 0.04872266, 0.11876265, 0.29342823, 0.07645813])
## premodel represent thes the NULL hypothesis that the gene abundance is the same as the donor composition 

premodel=model.psi

gene level abundance estimation

##RUN gene level demutipl

bulk_demuti_result= prediction_bulk(df_b,df_d,m_GT,donor_vcf,premodel,gene_unique)
###the 10 donor gene level demutiplexing may not as accurate as it in two donors  this result is just for demonstration
bulk_demuti_result
18-G-017_R1 18-G-018_R1 18-G-019_R1 18-G-020_R1 18-G-021_R1 18-G-022_R1 18-G-023_R1 18-G-024_R1 18-G-025_R1 18-G-026_R1 chi p numSNP
A1BG 0.240487 0.031524 0.246912 0.0424 0.212023 0.01513 0.073156 0.081881 0.022049 0.034438 3.290594 0.951652 3
A1BG-AS1 0.322538 0.000538 0.043626 0.000298 0.024924 0.110349 0.003099 0.307019 0.003918 0.183691 2.721691 0.974315 2
A2M 0.111379 0.186228 0.015524 0.070369 0.002524 0.00208 0.113103 0.210372 0.260654 0.027765 6.853791 0.652339 2
A2M;PZP 0.297699 0.057649 0.159085 0.049461 0.025333 0.013277 0.182469 0.030398 0.16116 0.023469 5.948257 0.745086 1
A2MP1 0.054614 0.433896 0.016346 0.109016 0.105064 0.050424 0.014692 0.170016 0.045928 0.000005 22.90447 0.006414 3
... ... ... ... ... ... ... ... ... ... ... ... ... ...
ZXDC 0.183335 0.069798 0.073204 0.156388 0.011695 0.013849 0.084582 0.1278 0.164845 0.114505 2.391586 0.983658 13
ZYG11B 0.084576 0.162609 0.022348 0.183719 0.0511 0.027481 0.088382 0.223326 0.086492 0.069967 10.147248 0.338697 9
ZYX 0.026771 0.061399 0.018724 0.008064 0.084577 0.122853 0.104039 0.288391 0.283356 0.001826 7.684969 0.566179 2
ZZEF1 0.063709 0.064758 0.008241 0.103454 0.050555 0.040779 0.000061 0.09356 0.474027 0.100855 1.673724 0.995641 29
ZZZ3 0.036555 0.076785 0.262047 0.230531 0.154715 0.06554 0.002896 0.050482 0.036244 0.084205 4.397809 0.883336 25

13350 rows × 13 columns