#!/usr/bin/env python
from bgen_reader import open_bgen
import numpy as np
import h5py, argparse
from snipar.ibd import write_segs_from_matrix
from snipar.map import decode_map_from_pos
from snipar.utilities import *
from snipar.simulate import *
parser = argparse.ArgumentParser()
parser.add_argument('--bgen',
type=str,help='Address of the phased genotypes in .bgen format. If there is a @ in the address, @ is replaced by the chromosome numbers in the range of chr_range for each chromosome (chr_range is an optional parameters for this script).')
parser.add_argument('--chr_range',
type=parseNumRange,
nargs='*',
action=NumRangeAction,
help='number of the chromosomes to be imputed. Should be a series of ranges with x-y format or integers.', default=None)
parser.add_argument('n_causal',type=int,help='Number of causal loci')
parser.add_argument('outprefix',type=str,help='Prefix for simulation output files')
parser.add_argument('--n_random',type=int,help='Number of generations of random mating',default=1)
parser.add_argument('--n_am',type=int,help='Number of generations of assortative mating',default=0)
parser.add_argument('--r_par',type=float,help='Phenotypic correlation of parents (for assortative mating)',default=None)
parser.add_argument('--h2_direct',type=float,help='Heritability due to direct effects in first generation',default=None)
parser.add_argument('--h2_total',type=float,help='Total variance explained by direct effects and indirect genetic effects from parents',default=None)
parser.add_argument('--r_dir_indir',type=float,help='Correlation between direct and indirect genetic effects',default=None)
parser.add_argument('--beta_vert',type=float,help='Vertical transmission coefficient',default=0)
parser.add_argument('--save_par_gts',action='store_true',help='Save the genotypes of the parents of the final generation',default=False)
parser.add_argument('--impute',action='store_true',help='Impute parental genotypes from phased sibling genotypes & IBD',default=False)
parser.add_argument('--unphased_impute',action='store_true',help='Impute parental genotypes from unphased sibling genotypes & IBD',default=False)
[docs]def main(args):
if args.beta_vert > 0 and args.h2_total is not None:
raise(ValueError('Cannot simulate both indirect effects and vertical transmission separately. Choose one'))
if args.beta_vert > 0 and args.h2_direct is None:
raise(ValueError('Must provide h2_direct if simulating vertical transmission'))
if args.h2_direct is None and args.h2_total is None:
raise(ValueError('Must provide one of h2_direct or h2_total'))
if args.h2_direct is not None and args.h2_total is not None:
raise(ValueError('Must provide one of h2_direct or h2_total'))
if args.n_random >= 0:
ngen_random = args.n_random
else:
raise(ValueError('Number of generations cannot be negative'))
if args.n_am > 0:
if args.r_par is not None:
if (args.r_par**2) <= 1:
r_y = args.r_par
else:
raise(ValueError('Parental correlation must be between -1 and 1'))
else:
raise(ValueError('Must specify parental phenotypic correlation for assortative mating'))
ngen_am = args.n_am
if args.h2_direct is not None:
if 0 <= args.h2_direct <= 1:
h2 = args.h2_direct
else:
raise(ValueError('Heritability must be between 0 and 1'))
if args.h2_total is not None:
if args.r_dir_indir is None:
raise(ValueError('Must specify correlation between direct and indirect genetic effects'))
else:
if (args.r_dir_indir**2) <= 1:
r_dir_alpha = args.r_dir_indir
else:
raise(ValueError('Correlation between direct and indirect effects must be between -1 and 1'))
if 0 <= args.h2_total <= 1:
h2_total = args.h2_total
else:
raise(ValueError('Heritability must be between 0 and 1'))
else:
h2_total = 0
beta_vert = args.beta_vert
ncausal = args.n_causal
bgenfiles, chroms = parse_obsfiles(args.bgen, obsformat='bgen', chromosomes=args.chr_range)
# Read genotypes
haps = []
maps = []
snp_ids = []
alleles = []
positions = []
for i in range(bgenfiles.shape[0]):
print('Reading in chromosome '+str(chroms[i]))
# Read bgen
bgen = open_bgen(bgenfiles[i], verbose=True)
# Read map
map = decode_map_from_pos(chroms[i], bgen.positions)
not_nan = np.logical_not(np.isnan(map))
maps.append(map[not_nan])
positions.append(bgen.positions[not_nan])
# Snp
snp_ids.append(bgen.rsids[not_nan])
# Alleles
alleles.append(np.array([x.split(',') for x in bgen.allele_ids[not_nan]]))
# Read genotypes
gts = bgen.read(([x for x in range(bgen.samples.shape[0])], [x for x in range(bgen.ids.shape[0])]), np.bool_)[:, :,
np.array([0, 2])]
gts = gts[:,not_nan]
nfam = int(np.floor(gts.shape[0] / 2))
ngen = np.zeros((nfam, 2, gts.shape[1], 2), dtype=np.bool_)
ngen[:, 0, :, :] = gts[0:nfam, :, :]
ngen[:, 1, :, :] = gts[nfam:(2 * nfam), :, :]
del gts
haps.append(ngen)
# Simulate population
total_matings = ngen_random+ngen_am
if h2_total == 0:
V = np.zeros((total_matings+1,2))
else:
V = np.zeros((total_matings,2))
a_count = 0
# Produce next generation
old_haps = haps
for gen in range(0, total_matings):
# Simulate phenotype for AM
if gen == 0 and h2_total == 0:
print('Simulating phenotype')
# Simulate phenotype
nsnp_chr = np.array([x.shape[2] for x in old_haps])
nsnp = np.sum(nsnp_chr)
if ncausal > nsnp:
raise(ValueError('Not enough SNPs to simulate phenotype with '+str(ncausal)+' causal SNPs'))
a = np.zeros((nsnp))
causal = np.sort(np.random.choice(np.arange(0,nsnp),ncausal,replace=False))
a[causal] = np.random.randn(ncausal)
G_p, G_m = compute_genetic_component(old_haps,causal,a)
scale_fctr = np.sqrt(h2/np.var(np.hstack((G_p,G_m))))
a = a*scale_fctr
# Compute parental phenotypes
if np.abs(beta_vert) > 0 and gen>0:
print('Computing parental phenotypes')
G_p, G_m, Y_p, Y_m = compute_phenotype_vert(old_haps, causal, a, 1-h2, beta_vert, Y_p[father_indices], Y_m[mother_indices])
elif h2_total==0:
print('Computing parental phenotypes')
G_p, G_m, Y_p, Y_m = compute_phenotype(old_haps, causal, a, 1 - h2)
# Record variance components
if gen>0 or h2_total==0:
V[a_count, :] = np.array([np.var(np.hstack((G_p, G_m))), np.var(np.hstack((Y_p, Y_m)))])
print('Genetic variance: ' + str(round(V[a_count, 0], 4)))
print('Phenotypic variance: ' + str(round(V[a_count, 1], 4)))
print('Heritability: ' + str(round(V[a_count, 0] / V[a_count, 1], 4)))
a_count += 1
## Match parents
print('Mating ' + str(gen + 1))
# Random mating
if gen<ngen_random:
father_indices = random_mating_indices(nfam)
mother_indices = random_mating_indices(nfam)
# Assortative mating
if gen>=ngen_random:
# Compute parental phenotypes
print('Computing parental phenotypes')
# Match assortatively
print('Matching assortatively')
father_indices, mother_indices = am_indices(Y_p, Y_m, r_y)
# Print variances
print('Parental phenotypic correlation: '+str(round(np.corrcoef(Y_p[father_indices],Y_m[mother_indices])[0,1],4)))
print('Parental genotypic correlation: '+str(round(np.corrcoef(G_p[father_indices],G_m[mother_indices])[0,1],4)))
# Generate haplotpyes of new generation
new_haps = []
ibd = []
for chr in range(0,len(haps)):
print('Chromosome '+str(chroms[0]+chr))
new_haps_chr, ibd_chr = produce_next_gen(father_indices,mother_indices,old_haps[chr][:,0,:,:],old_haps[chr][:,1,:,:],maps[chr])
new_haps.append(new_haps_chr)
ibd.append(ibd_chr)
# Compute indirect effect component
if h2_total>0:
if gen==0:
print('Simulating indirect genetic effects')
nsnp_chr = np.array([x.shape[2] for x in old_haps])
nsnp = np.sum(nsnp_chr)
if ncausal > nsnp:
raise (ValueError('Not enough SNPs to simulate phenotype with ' + str(ncausal) + ' causal SNPs'))
ab = np.zeros((nsnp,2))
causal = np.sort(np.random.choice(np.arange(0, nsnp), ncausal, replace=False))
ab[causal,:] = np.random.multivariate_normal(np.zeros((2)),
np.array([[1,r_dir_alpha],[r_dir_alpha,1]]),
size=ncausal)
G_males, G_females, Y_males, Y_females = compute_phenotype_indirect(new_haps,old_haps,father_indices,mother_indices,causal,ab[:,0],ab[:,1],0)
scale_fctr = np.sqrt(h2_total / np.var(np.hstack((Y_males, Y_females))))
ab = ab*scale_fctr
print('Computing parental phenotype')
G_p, G_m, Y_p, Y_m = compute_phenotype_indirect(new_haps,old_haps,father_indices,mother_indices,causal,ab[:,0],ab[:,1],1-h2_total)
if gen<(total_matings-1):
old_haps = new_haps
print('Computing final generation phenotypes')
if np.abs(beta_vert)>0:
G_males, G_females, Y_males, Y_females = compute_phenotype_vert(new_haps, causal, a, 1 - h2, beta_vert, Y_p[father_indices], Y_m[mother_indices])
elif h2_total>0:
G_males, G_females, Y_males, Y_females = compute_phenotype_indirect(new_haps, old_haps, father_indices, mother_indices, causal,
ab[:, 0], ab[:, 1], 1 - h2_total)
else:
G_males, G_females, Y_males, Y_females = compute_phenotype(new_haps, causal, a, 1 - h2)
print('Sibling genotypic correlation: ' + str(round(np.corrcoef(G_males, G_females)[0, 1], 4)))
print('Sibling phenotypic correlation: ' + str(round(np.corrcoef(Y_males, Y_females)[0, 1], 4)))
# Final offspring generation
V[a_count,:] = np.array([np.var(np.hstack((G_males,G_females))),np.var(np.hstack((Y_males, Y_females)))])
print('Saving output to file')
# Save variance
vcf = args.outprefix+'VCs.txt'
np.savetxt(vcf, V)
print('Variance components saved to '+str(vcf))
print('Saving pedigree')
# Produce pedigree
ped = np.zeros((nfam*2,6),dtype='U30')
for i in range(0,nfam):
ped[(2*i):(2*(i+1)),0] = i
ped[(2 * i):(2 * (i + 1)), 1] = np.array([str(i)+'_0',str(i)+'_1'],dtype=str)
ped[(2 * i):(2 * (i + 1)),2] = 'P_'+str(i)
ped[(2 * i):(2 * (i + 1)), 3] = 'M_' + str(i)
ped[(2 * i):(2 * (i + 1)), 4] = np.array(['0','1'])
ped[(2 * i):(2 * (i + 1)), 5] = np.array([Y_males[i],Y_females[i]],dtype=ped.dtype)
sibpairs = ped[:,1].reshape((int(ped.shape[0]/2),2))
ped = np.vstack((np.array(['FID','IID','FATHER_ID','MOTHER_ID','SEX','PHENO']),ped))
np.savetxt(args.outprefix+'sibs.ped',ped[:,0:4],fmt='%s')
np.savetxt(args.outprefix+'sibs.fam',ped[1:ped.shape[0],:],fmt='%s')
## Save to HDF5 file
print('Saving genotypes to '+args.outprefix+'genotypes.hdf5')
hf = h5py.File(args.outprefix+'genotypes.hdf5','w')
# save pedigree
hf['ped'] = encode_str_array(ped)
# save offspring genotypes
for i in range(len(haps)):
print('Writing genotypes for chromosome '+str(chroms[i]))
bim_i = np.vstack((snp_ids[i], maps[i], positions[i], alleles[i][:,0],alleles[i][:,1])).T
hf['chr_'+str(chroms[i])+'_bim'] = encode_str_array(bim_i)
gts_chr = np.sum(new_haps[i], axis=3, dtype=np.uint8)
hf['chr_'+str(chroms[i])] = gts_chr.reshape((gts_chr.shape[0]*2,gts_chr.shape[2]),order='C')
# # Imputed parental genotypes
if args.impute or args.unphased_impute:
print('Imputing parental genotypes and saving')
freqs = np.mean(gts_chr, axis=(0, 1)) / 2.0
# Phased
if args.impute:
phased_imp = impute_all_fams_phased(new_haps[i],freqs,ibd[i])
hf['imp_chr'+str(chroms[i])] = phased_imp
# Unphased
if args.unphased_impute:
ibd[i] = np.sum(ibd[i],axis=2)
imp = impute_all_fams(gts_chr, freqs, ibd[i])
hf['unphased_imp_chr_'+str(chroms[i])] = imp
# Parental genotypes
if args.save_par_gts:
print('Saving true parental genotypes')
par_gts_chr = np.zeros((old_haps[i].shape[0],2,old_haps[i].shape[2]),dtype=np.uint8)
par_gts_chr[:,0,:] = np.sum(old_haps[i][father_indices,0,:,:],axis=2)
par_gts_chr[:,1,:] = np.sum(old_haps[i][mother_indices,1,:,:],axis=2)
hf['par_chr_'+str(chroms[i])] = par_gts_chr
hf.close()
# Write IBD segments
snp_count = 0
if h2_total>0:
causal_out = np.zeros((ab.shape[0], 5), dtype='U30')
else:
causal_out = np.zeros((a.shape[0],4),dtype='U30')
for i in range(len(haps)):
print('Writing IBD segments for chromosome '+str(chroms[i]))
# Segments
if not args.unphased_impute:
ibd[i] = np.sum(ibd[i],axis=2)
segs = write_segs_from_matrix(ibd[i], sibpairs,
snp_ids[i], positions[i],maps[i],chroms[i],
args.outprefix+'chr_'+str(chroms[i])+'.segments.gz')
# Causal effects
if h2_total==0:
a_chr = a[snp_count:(snp_count + snp_ids[i].shape[0])]
causal_out[snp_count:(snp_count + snp_ids[i].shape[0]),:] = np.vstack((snp_ids[i],alleles[i][:,0],alleles[i][:,1],
a_chr)).T
else:
ab_chr = ab[snp_count:(snp_count + snp_ids[i].shape[0]),:]
causal_out[snp_count:(snp_count + snp_ids[i].shape[0]),:] = np.vstack((snp_ids[i],alleles[i][:,0],alleles[i][:,1],
ab_chr[:,0],ab_chr[:,1])).T
snp_count += snp_ids[i].shape[0]
# count
np.savetxt(args.outprefix+'causal_effects.txt',causal_out,fmt='%s')
if __name__ == "__main__":
args=parser.parse_args()
main(args)