step by step example（ for developer）

import

import sys
import numpy as np
import vireoSNP
from vireoSNP.utils.vireo_base import match
from vireoSNP.utils.vcf_utils import load_VCF, write_VCF, parse_donor_GPb
from vireoSNP.utils.vcf_utils import read_sparse_GeneINFO, GenoINFO_maker
import matplotlib.pyplot as plt
import pandas as pd
from vireoSNP.utils.io_utils import match_donor_VCF
import re as re
import os

## set workdiretory on vireobulk's folder
os.chdir("/home/ccheng/Vireobulk_analysis/")

## import seperated function from vireo-bulk's base file
from base import merged_VCF_to_sdf,rematch_merge_vcf_annotated,find_all,removedfault,prediction_bulk

loadfile & data clean

##donor vcf is the donor reference
##bulk_merged_data is the piled up result
##df_anno is the preprocessed annovar annotation of SNPs
##the pipeline of generating all three input files is in preprocessing https://vireobulk-analysis.readthedocs.io/en/latest/preprocessing.html
donor_vcf=load_VCF(   "/media/wgs/subset_stimulation_bam/filter_pbmc10donors.vcf.gz",    biallelic_only=True,    load_sample=True, sparse=False , format_list=None)
bulk_merged_data is =load_VCF("/media/wgs/subset_stimulation_bam/10pbmcdonor/cellSNP.base.vcf.gz", biallelic_only=True,load_sample=False,format_list=None,)
df_anno=pd.read_csv("/media/wgs/subset_stimulation_bam/10pbmcdonor/pbmcvcf_annotated_processed.csv")
donor_number=10

#this step is to ensure the order and numbers of SNPs of three files are consistent and remove unannoatation ./. （probilibity 1/3 of 3 genotype）

df_b,df_d,m_GT=rematch_merge_vcf_annotated(bulk_merged_data,donor_vcf,df_anno)
df_b,df_d,m_GT=removedfault(df_b,df_d,m_GT)

df_b

	variants	AD	DP	OTH
0	chr1_14574_A_G	2.0	35.0	1.0
1	chr1_14590_G_A	0.0	50.0	0.0
2	chr1_14599_T_A	0.0	53.0	2.0
3	chr1_14604_A_G	1.0	60.0	0.0
4	chr1_14610_T_C	1.0	62.0	1.0
...	...	...	...	...
128180	chrX_155999818_C_T	66.0	80.0	0.0
128181	chrX_155999869_A_G	88.0	89.0	0.0
128182	chrX_155999957_G_C	73.0	86.0	0.0
128183	chrX_156025297_G_A	65.0	195.0	1.0
128187	chrY_3120449_C_G	0.0	28.0	1.0

127155 rows × 4 columns

df_d

	variants	function	genes	avsnp	GT
0	chr1_14574_A_G	ncRNA_exonic	WASH7P	rs28503599	[0/0, 0/1, 0/1, 0/0, 0/0, 0/0, 0/1, 0/0, 0/1, ...
1	chr1_14590_G_A	ncRNA_exonic	WASH7P	rs707679	[0/0, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/0, 0/0, ...
2	chr1_14599_T_A	ncRNA_exonic	WASH7P	rs707680	[0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ...
3	chr1_14604_A_G	ncRNA_exonic	WASH7P	rs541940975	[0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ...
4	chr1_14610_T_C	ncRNA_exonic	WASH7P	rs878986575	[0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ...
...	...	...	...	...	...
128180	chrX_155999818_C_T	intronic	IL9R	rs3093470	[1/1, 0/0, 1/1, 1/1, 0/1, 0/1, 0/1, 1/1, 1/1, ...
128181	chrX_155999869_A_G	intronic	IL9R	rs3091261	[1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, ...
128182	chrX_155999957_G_C	intronic	IL9R	rs3093472	[1/1, 0/0, 1/1, 1/1, 0/1, 0/1, 0/1, 1/1, 1/1, ...
128183	chrX_156025297_G_A	downstream	DDX11L16	rs185932868	[0/1, 0/0, 0/1, 0/0, 0/1, 0/0, 0/0, 0/1, 0/1, ...
128187	chrY_3120449_C_G	intergenic	LINC00278;TGIF2LY	rs778218266	[0/1, 1/1, 0/1, 1/1, 0/0, 1/1, 1/1, 0/0, 0/1, ...

127155 rows × 5 columns

##processing data to vector, import to model
gene_unique=np.unique(df_d["genes"].tolist())
a=np.array(df_b["AD"])
d=np.array(df_b["DP"])
a=np.array(list(map(float,a)))
d=np.array(list(map(float,d)))

run donor decomposition

## establish model ,only parameter is the number of donors
model =vireoSNP.VireoBulk(donor_number)

## fit model learn_theta is the flag whether using learned allele frequency ,check its meaning  on document and manuscript
model.fit(a,d,m_GT,learn_theta=True )

## this results represent high confidence
model.theta

array([0.01164797, 0.44852633, 0.99267153])

donor_vcf['samples']

['18-G-017_R1',
 '18-G-018_R1',
 '18-G-019_R1',
 '18-G-020_R1',
 '18-G-021_R1',
 '18-G-022_R1',
 '18-G-023_R1',
 '18-G-024_R1',
 '18-G-025_R1',
 '18-G-026_R1']

## the percentage of sample donors , the order is  the same as donor_vcf['samples']
model.psi

array([0.06418119, 0.04299516, 0.05283177, 0.08967812, 0.14387843,
       0.06906367, 0.04872266, 0.11876265, 0.29342823, 0.07645813])

## premodel represent thes the NULL hypothesis that the gene abundance is the same as the donor composition 

premodel=model.psi

gene level abundance estimation

##RUN gene level demutipl

bulk_demuti_result= prediction_bulk(df_b,df_d,m_GT,donor_vcf,premodel,gene_unique)

###the 10 donor gene level demutiplexing may not as accurate as it in two donors  this result is just for demonstration
bulk_demuti_result

	18-G-017_R1	18-G-018_R1	18-G-019_R1	18-G-020_R1	18-G-021_R1	18-G-022_R1	18-G-023_R1	18-G-024_R1	18-G-025_R1	18-G-026_R1	chi	p	numSNP
A1BG	0.240487	0.031524	0.246912	0.0424	0.212023	0.01513	0.073156	0.081881	0.022049	0.034438	3.290594	0.951652	3
A1BG-AS1	0.322538	0.000538	0.043626	0.000298	0.024924	0.110349	0.003099	0.307019	0.003918	0.183691	2.721691	0.974315	2
A2M	0.111379	0.186228	0.015524	0.070369	0.002524	0.00208	0.113103	0.210372	0.260654	0.027765	6.853791	0.652339	2
A2M;PZP	0.297699	0.057649	0.159085	0.049461	0.025333	0.013277	0.182469	0.030398	0.16116	0.023469	5.948257	0.745086	1
A2MP1	0.054614	0.433896	0.016346	0.109016	0.105064	0.050424	0.014692	0.170016	0.045928	0.000005	22.90447	0.006414	3
...	...	...	...	...	...	...	...	...	...	...	...	...	...
ZXDC	0.183335	0.069798	0.073204	0.156388	0.011695	0.013849	0.084582	0.1278	0.164845	0.114505	2.391586	0.983658	13
ZYG11B	0.084576	0.162609	0.022348	0.183719	0.0511	0.027481	0.088382	0.223326	0.086492	0.069967	10.147248	0.338697	9
ZYX	0.026771	0.061399	0.018724	0.008064	0.084577	0.122853	0.104039	0.288391	0.283356	0.001826	7.684969	0.566179	2
ZZEF1	0.063709	0.064758	0.008241	0.103454	0.050555	0.040779	0.000061	0.09356	0.474027	0.100855	1.673724	0.995641	29
ZZZ3	0.036555	0.076785	0.262047	0.230531	0.154715	0.06554	0.002896	0.050482	0.036244	0.084205	4.397809	0.883336	25

13350 rows × 13 columns