step by step example( for developer)
import
import sys
import numpy as np
import vireoSNP
from vireoSNP.utils.vireo_base import match
from vireoSNP.utils.vcf_utils import load_VCF, write_VCF, parse_donor_GPb
from vireoSNP.utils.vcf_utils import read_sparse_GeneINFO, GenoINFO_maker
import matplotlib.pyplot as plt
import pandas as pd
from vireoSNP.utils.io_utils import match_donor_VCF
import re as re
import os
## set workdiretory on vireobulk's folder
os.chdir("/home/ccheng/Vireobulk_analysis/")
## import seperated function from vireo-bulk's base file
from base import merged_VCF_to_sdf,rematch_merge_vcf_annotated,find_all,removedfault,prediction_bulk
loadfile & data clean
##donor vcf is the donor reference
##bulk_merged_data is the piled up result
##df_anno is the preprocessed annovar annotation of SNPs
##the pipeline of generating all three input files is in preprocessing https://vireobulk-analysis.readthedocs.io/en/latest/preprocessing.html
donor_vcf=load_VCF( "/media/wgs/subset_stimulation_bam/filter_pbmc10donors.vcf.gz", biallelic_only=True, load_sample=True, sparse=False , format_list=None)
bulk_merged_data is =load_VCF("/media/wgs/subset_stimulation_bam/10pbmcdonor/cellSNP.base.vcf.gz", biallelic_only=True,load_sample=False,format_list=None,)
df_anno=pd.read_csv("/media/wgs/subset_stimulation_bam/10pbmcdonor/pbmcvcf_annotated_processed.csv")
donor_number=10
#this step is to ensure the order and numbers of SNPs of three files are consistent and remove unannoatation ./. (probilibity 1/3 of 3 genotype)
df_b,df_d,m_GT=rematch_merge_vcf_annotated(bulk_merged_data,donor_vcf,df_anno)
df_b,df_d,m_GT=removedfault(df_b,df_d,m_GT)
df_b
| variants | AD | DP | OTH | |
|---|---|---|---|---|
| 0 | chr1_14574_A_G | 2.0 | 35.0 | 1.0 |
| 1 | chr1_14590_G_A | 0.0 | 50.0 | 0.0 |
| 2 | chr1_14599_T_A | 0.0 | 53.0 | 2.0 |
| 3 | chr1_14604_A_G | 1.0 | 60.0 | 0.0 |
| 4 | chr1_14610_T_C | 1.0 | 62.0 | 1.0 |
| ... | ... | ... | ... | ... |
| 128180 | chrX_155999818_C_T | 66.0 | 80.0 | 0.0 |
| 128181 | chrX_155999869_A_G | 88.0 | 89.0 | 0.0 |
| 128182 | chrX_155999957_G_C | 73.0 | 86.0 | 0.0 |
| 128183 | chrX_156025297_G_A | 65.0 | 195.0 | 1.0 |
| 128187 | chrY_3120449_C_G | 0.0 | 28.0 | 1.0 |
127155 rows × 4 columns
df_d
| variants | function | genes | avsnp | GT | |
|---|---|---|---|---|---|
| 0 | chr1_14574_A_G | ncRNA_exonic | WASH7P | rs28503599 | [0/0, 0/1, 0/1, 0/0, 0/0, 0/0, 0/1, 0/0, 0/1, ... |
| 1 | chr1_14590_G_A | ncRNA_exonic | WASH7P | rs707679 | [0/0, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/0, 0/0, ... |
| 2 | chr1_14599_T_A | ncRNA_exonic | WASH7P | rs707680 | [0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ... |
| 3 | chr1_14604_A_G | ncRNA_exonic | WASH7P | rs541940975 | [0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ... |
| 4 | chr1_14610_T_C | ncRNA_exonic | WASH7P | rs878986575 | [0/1, 0/1, 0/1, 0/0, 0/0, 0/1, 0/1, 0/1, 0/1, ... |
| ... | ... | ... | ... | ... | ... |
| 128180 | chrX_155999818_C_T | intronic | IL9R | rs3093470 | [1/1, 0/0, 1/1, 1/1, 0/1, 0/1, 0/1, 1/1, 1/1, ... |
| 128181 | chrX_155999869_A_G | intronic | IL9R | rs3091261 | [1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, 1/1, ... |
| 128182 | chrX_155999957_G_C | intronic | IL9R | rs3093472 | [1/1, 0/0, 1/1, 1/1, 0/1, 0/1, 0/1, 1/1, 1/1, ... |
| 128183 | chrX_156025297_G_A | downstream | DDX11L16 | rs185932868 | [0/1, 0/0, 0/1, 0/0, 0/1, 0/0, 0/0, 0/1, 0/1, ... |
| 128187 | chrY_3120449_C_G | intergenic | LINC00278;TGIF2LY | rs778218266 | [0/1, 1/1, 0/1, 1/1, 0/0, 1/1, 1/1, 0/0, 0/1, ... |
127155 rows × 5 columns
##processing data to vector, import to model
gene_unique=np.unique(df_d["genes"].tolist())
a=np.array(df_b["AD"])
d=np.array(df_b["DP"])
a=np.array(list(map(float,a)))
d=np.array(list(map(float,d)))
run donor decomposition
## establish model ,only parameter is the number of donors
model =vireoSNP.VireoBulk(donor_number)
## fit model learn_theta is the flag whether using learned allele frequency ,check its meaning on document and manuscript
model.fit(a,d,m_GT,learn_theta=True )
## this results represent high confidence
model.theta
array([0.01164797, 0.44852633, 0.99267153])
donor_vcf['samples']
['18-G-017_R1',
'18-G-018_R1',
'18-G-019_R1',
'18-G-020_R1',
'18-G-021_R1',
'18-G-022_R1',
'18-G-023_R1',
'18-G-024_R1',
'18-G-025_R1',
'18-G-026_R1']
## the percentage of sample donors , the order is the same as donor_vcf['samples']
model.psi
array([0.06418119, 0.04299516, 0.05283177, 0.08967812, 0.14387843,
0.06906367, 0.04872266, 0.11876265, 0.29342823, 0.07645813])
## premodel represent thes the NULL hypothesis that the gene abundance is the same as the donor composition
premodel=model.psi
gene level abundance estimation
##RUN gene level demutipl
bulk_demuti_result= prediction_bulk(df_b,df_d,m_GT,donor_vcf,premodel,gene_unique)
###the 10 donor gene level demutiplexing may not as accurate as it in two donors this result is just for demonstration
bulk_demuti_result
| 18-G-017_R1 | 18-G-018_R1 | 18-G-019_R1 | 18-G-020_R1 | 18-G-021_R1 | 18-G-022_R1 | 18-G-023_R1 | 18-G-024_R1 | 18-G-025_R1 | 18-G-026_R1 | chi | p | numSNP | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| A1BG | 0.240487 | 0.031524 | 0.246912 | 0.0424 | 0.212023 | 0.01513 | 0.073156 | 0.081881 | 0.022049 | 0.034438 | 3.290594 | 0.951652 | 3 |
| A1BG-AS1 | 0.322538 | 0.000538 | 0.043626 | 0.000298 | 0.024924 | 0.110349 | 0.003099 | 0.307019 | 0.003918 | 0.183691 | 2.721691 | 0.974315 | 2 |
| A2M | 0.111379 | 0.186228 | 0.015524 | 0.070369 | 0.002524 | 0.00208 | 0.113103 | 0.210372 | 0.260654 | 0.027765 | 6.853791 | 0.652339 | 2 |
| A2M;PZP | 0.297699 | 0.057649 | 0.159085 | 0.049461 | 0.025333 | 0.013277 | 0.182469 | 0.030398 | 0.16116 | 0.023469 | 5.948257 | 0.745086 | 1 |
| A2MP1 | 0.054614 | 0.433896 | 0.016346 | 0.109016 | 0.105064 | 0.050424 | 0.014692 | 0.170016 | 0.045928 | 0.000005 | 22.90447 | 0.006414 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ZXDC | 0.183335 | 0.069798 | 0.073204 | 0.156388 | 0.011695 | 0.013849 | 0.084582 | 0.1278 | 0.164845 | 0.114505 | 2.391586 | 0.983658 | 13 |
| ZYG11B | 0.084576 | 0.162609 | 0.022348 | 0.183719 | 0.0511 | 0.027481 | 0.088382 | 0.223326 | 0.086492 | 0.069967 | 10.147248 | 0.338697 | 9 |
| ZYX | 0.026771 | 0.061399 | 0.018724 | 0.008064 | 0.084577 | 0.122853 | 0.104039 | 0.288391 | 0.283356 | 0.001826 | 7.684969 | 0.566179 | 2 |
| ZZEF1 | 0.063709 | 0.064758 | 0.008241 | 0.103454 | 0.050555 | 0.040779 | 0.000061 | 0.09356 | 0.474027 | 0.100855 | 1.673724 | 0.995641 | 29 |
| ZZZ3 | 0.036555 | 0.076785 | 0.262047 | 0.230531 | 0.154715 | 0.06554 | 0.002896 | 0.050482 | 0.036244 | 0.084205 | 4.397809 | 0.883336 | 25 |
13350 rows × 13 columns