Source code for sibreg.sibreg

import numpy as np
import numpy.ma as ma
from pysnptools.snpreader import Bed, Pheno
from scipy.optimize import fmin_l_bfgs_b
import h5py, code
from bgen_reader import open_bgen

[docs]class model(object): """Define a linear model with within-class correlations. Args: y : :class:`~numpy:numpy.array` 1D array of phenotype observations X : :class:`~numpy:numpy.array` Design matrix for the fixed mean effects. labels : :class:`~numpy:numpy.array` 1D array of sample labels Returns: model : :class:`sibreg.model` """ def __init__(self,y,X,labels, add_intercept = False): if y.shape[0] == X.shape[0] and X.shape[0] == labels.shape[0]: pass else: raise(ValueError('inconsistent sample sizes of response, covariates, and labels')) # Get sample size self.n = X.shape[0] if X.ndim == 1: X = X.reshape((self.n,1)) if add_intercept: X = np.hstack((np.ones((self.n,1),dtype=X.dtype),X)) self.X = X # Label mapping self.label_counts = dict() self.label_indices = dict() for l in range(0,labels.shape[0]): if labels[l] not in self.label_counts: self.label_counts[labels[l]]=1 self.label_indices[labels[l]] = [l] else: self.label_counts[labels[l]]+=1 self.label_indices[labels[l]].append(l) self.y_lab = dict() self.X_lab = dict() for label in self.label_indices.keys(): self.y_lab[label]=y[self.label_indices[label]] self.X_lab[label]=X[self.label_indices[label],:] self.n_labels = len(self.y_lab.keys()) # response self.y=y self.labels=labels
[docs] def alpha_mle(self, tau, sigma2, compute_cov = False, xtx_out = False): """ Compute the MLE of alpha given variance parameters Args: sigma2 : :class:`float` variance of model residuals tau : :class:`float` ratio of variance of model residuals to variance explained by mean differences between classes Returns: alpha : :class:`~numpy:numpy.array` MLE of alpha """ X_T_X = np.zeros((self.X.shape[1],self.X.shape[1]),dtype = np.float64) X_T_y = np.zeros((self.X.shape[1]), dtype = np.float64) for label in self.y_lab.keys(): sigma_u = sigma2/tau Sigma_lab = sigma_u*np.ones((self.label_counts[label],self.label_counts[label])) np.fill_diagonal(Sigma_lab,sigma_u+sigma2) Sigma_lab_inv = np.linalg.inv(Sigma_lab) X_T_X = X_T_X + np.dot(self.X_lab[label].T,Sigma_lab_inv.dot(self.X_lab[label])) X_T_y = X_T_y + np.dot(self.X_lab[label].T, Sigma_lab_inv.dot(self.y_lab[label])) if xtx_out: return [X_T_X,X_T_y.reshape((self.X.shape[1]))] else: alpha = np.linalg.solve(X_T_X,X_T_y) alpha = alpha.reshape((alpha.shape[0],)) if compute_cov: alpha_cov = np.linalg.inv(X_T_X) return [alpha,alpha_cov] else: return alpha
# Compute likelihood of data given beta, alpha
[docs] def likelihood_and_gradient(self, sigma2, tau): """ Compute the loss function, which is -2 times the likelihood along with its gradient Args: sigma2 : :class:`float` variance of model residuals tau : :class:`float` ratio of variance of model residuals to variance explained by mean differences between classes Returns: L, grad : :class:`float` loss function and gradient, divided by sample size """ ## Likelihood alpha = self.alpha_mle(tau, sigma2) resid = self.y - self.X.dot(alpha) RSS = np.sum(np.square(resid)) L = self.n * np.log(sigma2)+RSS/sigma2 ## Gradient with respect to sigma2 grad_sigma2 = self.n/sigma2-RSS/np.square(sigma2) ## Gradient with respect to tau grad_tau = 0 for label in self.y_lab.keys(): resid_label=resid[self.label_indices[label]] resid_sum = np.sum(resid_label) resid_square_sum = np.square(resid_sum) # Add to likelihood L = L - resid_square_sum/(sigma2*(tau+self.label_counts[label]))+np.log(1+self.label_counts[label]/tau) # Add to grad sigma2 grad_sigma2+=resid_square_sum/(np.square(sigma2)*(tau+self.label_counts[label])) # Add to grad tau grad_tau+=(resid_square_sum/sigma2-self.label_counts[label]*(1+self.label_counts[label]/tau))/np.square(tau+self.label_counts[label]) # Overall gradient vector grad = np.hstack((grad_sigma2,grad_tau)) return L/self.n, grad/self.n
[docs] def optimize_model(self,init_params): """ Find the parameters that minimise the loss function for a given regularisation parameter Args: init_param : :class:`array` initial values for residual variance (sigma^2_epsilon) followed by ratio of residual variance to within-class variance (tau) Returns: optim : :class:`dict` dictionary with keys: 'success', whether optimisation was successful (bool); 'warnflag', output of L-BFGS-B algorithm giving warnings; 'sigma2', MLE of residual variance; 'tau', MLE of ratio of residual variance to within-class variance; 'likelihood', maximum of likelihood. """ # Paramtere boundaries parbounds=[(0.00001, None),(0.00001, None)] # Optimize optimized = fmin_l_bfgs_b(func=lik_and_grad,x0=init_params, args=(self.y, self.X, self.labels), bounds = parbounds) # Get MLE optim = {} optim['success'] = True optim['warnflag'] = optimized[2]['warnflag'] if optim['warnflag'] != 0: print('Optimization unsuccessful.') optim['success'] = False optim['sigma2'] = optimized[0][0] optim['tau'] = optimized[0][1] # Get parameter covariance optim['likelihood'] = -0.5 * np.float64(self.n) * (optimized[1] + np.log(2 * np.pi)) return optim
[docs] def sigma_inv_root(self,tau,sigma2): sigma_u = sigma2 / tau sigma2_nsqrt = dict() famsizes = np.unique(list(self.label_counts.values())) sigma2_nsqrt[1] = np.power(sigma_u+sigma2,-0.5) famsizes = famsizes[famsizes>1] for famsize in famsizes: Sigma_lab = sigma_u*np.ones((famsize,famsize)) np.fill_diagonal(Sigma_lab,sigma_u+sigma2) vals, vectors = np.linalg.eigh(Sigma_lab) vals = np.power(vals,0.25) vectors = vectors/vals sigma2_nsqrt[famsize] = vectors.dot(vectors.T) return sigma2_nsqrt
[docs] def predict(self,X): """ Predict new observations based on model regression coefficients Args: X : :class:`array` matrix of covariates to predict from Returns: y : :class:`array` predicted values """ if hasattr(self,'alpha'): return X.dot(self.alpha) else: raise(AttributeError('Model does not have known regression coefficients. Try optimizing model first'))
[docs] def set_alpha(self,alpha): self.alpha = alpha
[docs]def lik_and_grad(pars,*args): # Wrapper for function to pass to L-BFGS-B y, X, labels = args mod = model(y,X,labels) return mod.likelihood_and_gradient(pars[0],pars[1])
[docs]def simulate(n,alpha,sigma2,tau): """Simulate from a linear model with correlated observations within-class. The mean for each class is drawn from a normal distribution. Args: n : :class:`int` sample size alpha : :class:`~numpy:numpy.array` value of regression coefficeints sigma2 : :class:`float` variance of residuals tau : :class:`float` ratio of variance of residuals to variance of distribution of between individual means Returns: model : :class:`regrnd.model` linear model with repeated observations """ c = alpha.shape[0] #X = np.random.randn((n * c)).reshape((n, c)) X_cov = np.ones((c,c)) np.fill_diagonal(X_cov,1.2) X = np.random.multivariate_normal(np.zeros((c)),X_cov,n).reshape((n, c)) labels = np.random.choice(n//10,n) random_effects = np.sqrt(sigma2//tau)*np.random.randn(n) y = X.dot(alpha)+random_effects[labels-1]+np.random.randn(n)*np.sqrt(sigma2) return model(y,X,labels)
[docs]class gtarray(object): """Define a genotype or PGS array that stores individual IDs, family IDs, and SNP information. Args: garray : :class:`~numpy:numpy.array` 2 or 3 dimensional numpy array of genotypes/PGS values. First dimension is individuals. For a 2 dimensional array, the second dimension is SNPs or PGS values. For a 3 dimensional array, the second dimension indexes the individual and his/her relatives' genotypes (for example: proband, paternal, and maternal); and the third dimension is the SNPs. ids : :class:`~numpy:numpy.array` vector of individual IDs sid : :class:`~numpy:numpy.array` vector of SNP ids, equal in length size of last dimension of array alleles : :class:`~numpy:numpy.array` [L x 2] matrix of ref and alt alleles for the SNPs. L must match size of sid pos : :class:`~numpy:numpy.array` vector of SNP positions; must match size of sid chrom : :class:`~numpy:numpy.array` vector of SNP chromosomes; must match size of sid fams : :class:`~numpy:numpy.array` vector of family IDs; must match size of ids par_status : :class:`~numpy:numpy.array' [N x 2] numpy matrix that records whether parents have observed or imputed genotypes/PGS, where N matches size of ids. The first column is for the father of that individual; the second column is for the mother of that individual. If the parent is neither observed nor imputed, the value is -1; if observed, 0; and if imputed, 1. Returns: G : :class:`sibreg.gtarray` """ def __init__(self, garray, ids, sid=None, alleles=None, pos=None, chrom=None, fams=None, par_status=None): if type(garray) == np.ndarray or type(garray)==np.ma.core.MaskedArray: if type(garray) == np.ndarray: self.gts = ma.array(garray,mask=np.isnan(garray)) else: self.gts = garray self.shape = garray.shape self.ndim = garray.ndim self.dtype = garray.dtype self.freqs = None else: raise ValueError('Genotypes must be a numpy ndarray') if garray.shape[0] == ids.shape[0]: self.ids = ids self.id_dict = make_id_dict(ids) else: raise ValueError('Shape of genotypes and ids does not match') if sid is not None: if sid.shape[0] == garray.shape[1]: self.snp_index = 1 self.sid = sid self.sid_dict = make_id_dict(sid) elif sid.shape[0] == garray.shape[2]: self.snp_index = 2 self.sid = sid self.sid_dict = make_id_dict(sid) else: raise ValueError('Shape of SNP ids (sid) does not match shape of genotype array') if alleles is not None: if self.sid is not None: if alleles.shape[0] == self.sid.shape[0]: self.alleles = alleles else: raise ValueError('Size of alleles does not match size of genotypes') else: raise(ValueError('Must provide SNP ids')) else: self.alleles = None if pos is not None: if self.sid is not None: if pos.shape[0] == self.sid.shape[0]: self.pos = pos else: raise ValueError('Size of position vector does not match size of genotypes') else: raise(ValueError('Must provide SNP ids')) else: self.pos = None if chrom is not None: if self.sid is not None: if chrom.shape[0] == self.sid.shape[0]: self.chrom = chrom else: raise ValueError('Size of position vector does not match size of genotypes') else: raise(ValueError('Must provide SNP ids')) else: self.chrom = None if fams is not None: if fams.shape[0] == ids.shape[0] and fams.ndim==1: self.fams = fams else: raise ValueError('Fams not of same length as IDs') else: self.fams = None if par_status is not None: if par_status.shape[0] == ids.shape[0] and par_status.shape[1] == 2: self.par_status = par_status else: raise ValueError('Incompatible par status array') else: self.par_status = None self.mean_normalised = False if np.sum(self.gts.mask)>0: self.has_NAs = True else: self.has_NAs = False self.info = None
[docs] def compute_freqs(self): """ Computes the frequencies of the SNPs. Stored in self.freqs. """ if self.ndim == 2: self.freqs = ma.mean(self.gts,axis=0)/2.0 elif self.ndim == 3: self.freqs = ma.mean(self.gts[:,0,:], axis=0) / 2.0
[docs] def filter(self,filter_pass): if self.freqs is not None: self.freqs = self.freqs[filter_pass] if self.ndim == 2: self.gts = self.gts[:,filter_pass] elif self.ndim == 3: self.gts = self.gts[:,:,filter_pass] self.shape = self.gts.shape if self.sid is not None: self.sid = self.sid[filter_pass] self.sid_dict = make_id_dict(self.sid) if self.pos is not None: self.pos = self.pos[filter_pass] if self.alleles is not None: self.alleles = self.alleles[filter_pass] if self.chrom is not None: self.chrom = self.chrom[filter_pass]
[docs] def filter_maf(self, min_maf = 0.01, verbose=False): """ Filter SNPs based on having minor allele frequency (MAF) greater than min_maf, and have % missing observations less than max_missing. """ if self.freqs is None: self.compute_freqs() freqs_pass = np.logical_and(self.freqs > min_maf, self.freqs < (1 - min_maf)) if verbose: print(str(self.freqs.shape[0] - np.sum(freqs_pass)) + ' SNPs with MAF<' + str(min_maf)) self.filter(freqs_pass)
[docs] def filter_missingness(self, max_missing = 5, verbose=False): if self.ndim == 2: missingness = ma.mean(self.gts.mask,axis=0) elif self.ndim == 3: missingness = ma.mean(self.gts.mask,axis = (0,1)) missingness_pass = 100 * missingness < max_missing if verbose: print(str(self.freqs.shape[0] - np.sum(missingness_pass)) + ' SNPs with missingness >' + str(max_missing) + '%') self.filter(missingness_pass)
[docs] def compute_info(self): if self.freqs is None: self.compute_freqs() if self.ndim == 2: self.variances = np.var(self.gts, axis=0) elif self.ndim==3: self.variances = np.var(self.gts[:,0,:], axis=0) self.info = self.variances/(2.0*self.freqs*(1-self.freqs))
[docs] def filter_info(self, min_info = 0.99, verbose=False): if self.info is None: self.compute_info() info_pass = self.info > min_info if verbose: print(str(self.info.shape[0] - np.sum(info_pass)) + ' SNPs with INFO <' + str(min_info)) self.filter(info_pass)
[docs] def filter_ids(self,keep_ids, verbose=False): """ Keep only individuals with ids given by keep_ids """ in_ids = np.array([x in self.id_dict for x in keep_ids]) n_filtered = np.sum(in_ids) if n_filtered==0: raise(ValueError('No individuals would be left after filtering')) else: if verbose: print('After filtering, '+str(n_filtered)+' individuals remain') indices = np.array([self.id_dict[x] for x in keep_ids[in_ids]]) if self.ndim == 2: self.gts = self.gts[indices, :] elif self.ndim == 3: self.gts = self.gts[indices, :, :] self.ids = self.ids[indices] self.id_dict = make_id_dict(self.ids) self.shape = self.gts.shape if self.fams is not None: self.fams = self.fams[indices]
[docs] def mean_normalise(self): """ This normalises the SNPs/PGS columns to have mean-zero. """ if not self.mean_normalised: if self.gts.ndim == 2: self.gts = self.gts - ma.mean(self.gts,axis=0) elif self.gts.ndim==3: for i in range(0, self.gts.shape[1]): self.gts[:, i, :] = self.gts[:, i, :] - ma.mean(self.gts[:, i, :], axis=0) self.mean_normalised = True
[docs] def scale(self): """ This normalises the SNPs/PGS columns to have variance 1. """ if self.gts.ndim == 2: self.gts = self.gts/ma.std(self.gts, axis=0) elif self.gts.ndim == 3: for i in range(0, self.gts.shape[1]): self.gts[:, i, :] = self.gts[:, i, :]/ma.std(self.gts[:, i, :], axis=0)
[docs] def fill_NAs(self): """ This normalises the SNP columns to have mean-zero, then fills in NA values with zero. """ if not self.mean_normalised: self.mean_normalise() NAs = np.sum(self.gts.mask, axis=0) self.gts[self.gts.mask] = 0 self.gts.mask = False self.has_NAs = False return NAs
[docs] def add(self,garray): """ Adds another gtarray of the same dimension to this array and returns the sum. It matches IDs before summing. """ if type(garray)==gtarray: pass else: raise ValueError('Must add to another gtarray') if not self.gts.ndim == garray.gts.ndim: raise ValueError('Arrays must have same number of dimensions') if self.gts.ndim == 2: if not self.gts.shape[1] == garray.gts.shape[1]: raise ValueError('Arrays must have same dimensions (apart from first)') if self.gts.ndim == 3: if not self.gts.shape[1:3] == garray.gts.shape[1:3]: raise ValueError('Arrays must have same dimensions (apart from first)') # Match IDs common_ids = list(self.id_dict.keys() & garray.id_dict.keys()) if len(common_ids)==0: raise ValueError('No IDs in common') self_index = np.array([self.id_dict[x] for x in common_ids]) other_index = np.array([garray.id_dict[x] for x in common_ids]) # Out if self.ids.ndim == 1: ids_out = self.ids[self_index] else: ids_out = self.ids[self_index,:] if self.gts.ndim ==2: add_gts = self.gts[self_index,:]+garray.gts[other_index,:] else: add_gts = self.gts[self_index, :,:] + garray.gts[other_index, :,:] return gtarray(add_gts,ids_out,self.sid,alleles = self.alleles, fams = self.fams[self_index])
[docs] def diagonalise(self,inv_root): """ This will transform the genotype array based on the inverse square root of the phenotypic covariance matrix from the family based linear mixed model. """ if self.fams is None: raise(ValueError('Family labels needed for diagonalization')) if not self.mean_normalised: self.mean_normalise() if self.has_NAs: self.fill_NAs() unique_fams, famsizes = np.unique(self.fams, return_counts = True) fam_indices = dict() # Transform for fam in unique_fams: fam_indices[fam] = np.where(self.fams == fam)[0] famsize = fam_indices[fam].shape[0] if self.ndim == 2: if famsize == 1: self.gts[fam_indices[fam], :] = inv_root[1]*self.gts[fam_indices[fam],:] else: self.gts[fam_indices[fam],:] = inv_root[famsize].dot(self.gts[fam_indices[fam],:]) elif self.ndim == 3: if famsize == 1: self.gts[fam_indices[fam], : , :] = inv_root[1]*self.gts[fam_indices[fam], : , :] else: for j in range(self.shape[1]): self.gts[fam_indices[fam],j, :] = inv_root[famsize].dot(self.gts[fam_indices[fam],j, :]) self.fam_indices = fam_indices
[docs]class pgs(object): """Define a polygenic score based on a set of SNPs with weights and ref/alt allele pairs. Args: snp_ids : :class:`~numpy:numpy.array` vector of SNP ids snp_ids : :class:`~numpy:numpy.array` vector of weights of equal length to snp_ids alleles : :class:`~numpy:numpy.array` [L x 2] matrix of ref and alt alleles for the SNPs. L must match size of snp_ids Returns: pgs : :class:`sibreg.pgs` """ def __init__(self,snp_ids,weights,alleles): if snp_ids.shape[0] == weights.shape[0] and alleles.shape[0] == weights.shape[0] and alleles.shape[1]==2: self.snp_ids = snp_ids self.snp_dict = make_id_dict(snp_ids) self.weights = weights self.alleles = alleles else: raise ValueError('All inputs must have the same dimension')
[docs] def compute(self,garray, cols = None): """Compute polygenic score values from a given genotype array. Finds the SNPs in the genotype array that have weights in the pgs and matching alleles, and computes the PGS based on these SNPs and the weights after allele-matching. Args: garray : :class:`sbreg.gtarray` genotype array to compute PGS values for cols : :class:`numpy:numpy.array` names to give the columns in the output gtarray Returns: pg : :class:`sibreg.gtarray` 2d gtarray with PGS values. If a 3d gtarray is input, then each column corresponds to the second dimension on the input gtarray (for example, individual, paternal, maternal PGS). If a 2d gtarray is input, then there will be only one column in the output gtarray. The names given in 'cols' are stored in 'sid' attribute of the output. """ if type(garray) == gtarray: garray.fill_NAs() else: raise ValueError('Must be of gtarray class') if garray.alleles is None: raise ValueError('Alleles of genotype matrix must be provided') # Match SNP IDs in_pgs_snps = np.array([x in self.snp_dict for x in garray.sid]) nmatch = np.sum(in_pgs_snps) if nmatch==0: raise ValueError('No overlap between PGS SNPs and genotype SNPs') # Get weights matched_snps = garray.sid[in_pgs_snps] matched_alleles = garray.alleles[in_pgs_snps,:] snp_indices = np.zeros((nmatch),dtype=int) for i in range(0,nmatch): snp_indices[i] = self.snp_dict[matched_snps[i]] weights_compute = self.weights[snp_indices] alleles = self.alleles[snp_indices,:] # Match alleles and adjust weights a_match = np.logical_and(alleles[:,0] == matched_alleles[:, 0], alleles[:,1] == matched_alleles[:, 1]) a_reverse = np.logical_and(alleles[:,0] == matched_alleles[:, 1], alleles[:,1] == matched_alleles[:, 0]) a_nomatch = np.logical_and(np.logical_not(a_match), np.logical_not(a_reverse)) n_nomatch = np.sum(a_nomatch) if n_nomatch > 0: print('Removing ' + str(n_nomatch) + ' SNPs due to allele mismatch between genotypes and PGS alleles') weights_compute[a_nomatch] = 0 weights_compute[a_reverse] = -weights_compute[a_reverse] ### Compute PGS if garray.ndim == 2: pgs_val = garray.gts[:,in_pgs_snps].dot(weights_compute) elif garray.ndim ==3: pgs_val = np.zeros((garray.gts.shape[0],garray.gts.shape[1]),garray.dtype) for i in range(0,garray.gts.shape[1]): pgs_val[:,i] = garray.gts[:,i,in_pgs_snps].dot(weights_compute) return gtarray(pgs_val, garray.ids, sid = cols, fams = garray.fams)
[docs]def make_id_dict(x,col=0): """ Make a dictionary that maps from the values in the given column (col) to their row-index in the input array """ if len(x.shape)>1: x = x[:,col] id_dict = {} for i in range(0,x.shape[0]): id_dict[x[i]] = i return id_dict
[docs]def convert_str_array(x): """ Convert an ascii array to unicode array (UTF-8) """ x_shape = x.shape x = x.flatten() x_out = np.array([y.decode('UTF-8') for y in x]) return x_out.reshape(x_shape)
[docs]def encode_str_array(x): """ Encode a unicode array as an ascii array """ x_shape = x.shape x = x.flatten() x_out = np.array([y.encode('ascii') for y in x]) return x_out.reshape(x_shape)
[docs]def find_individuals_with_sibs(ids,ped,gts_ids, return_ids_only = False): """ Used in get_gts_matrix and get_fam_means to find the individuals in ids that have genotyped siblings. """ # Find genotyped sibships of size > 1 ped_dict = make_id_dict(ped, 1) ids_in_ped = np.array([x in ped_dict for x in gts_ids]) gts_fams = np.zeros((gts_ids.shape[0]),dtype=gts_ids.dtype) gts_fams[ids_in_ped] = np.array([ped[ped_dict[x], 0] for x in gts_ids[ids_in_ped]]) fams, counts = np.unique(gts_fams[ids_in_ped], return_counts=True) sibships = set(fams[counts > 1]) # Find individuals with genotyped siblings ids_in_ped = np.array([x in ped_dict for x in ids]) ids = ids[ids_in_ped] ids_fams = np.array([ped[ped_dict[x], 0] for x in ids]) ids_with_sibs = np.array([x in sibships for x in ids_fams]) ids = ids[ids_with_sibs] ids_fams = ids_fams[ids_with_sibs] if return_ids_only: return ids else: return ids, ids_fams, gts_fams
[docs]def get_fam_means(ids,ped,gts,gts_ids,remove_proband = True, return_famsizes = False): """ Used in get_gts_matrix to find the mean genotype in each sibship (family) for each SNP or for a PGS. The gtarray that is returned is indexed based on the subset of ids provided from sibships of size 2 or greater. If remove_proband=True, then the genotype/PGS of the index individual is removed from the fam_mean given for that individual. """ ids, ids_fams, gts_fams = find_individuals_with_sibs(ids,ped,gts_ids) fams = np.unique(ids_fams) fams_dict = make_id_dict(fams) # Compute sums of genotypes in each family fam_sums = np.zeros((fams.shape[0],gts.shape[1]),dtype=gts.dtype) fam_counts = np.zeros((fams.shape[0]),dtype=int) for i in range(0,fams.shape[0]): fam_indices = np.where(gts_fams==fams[i])[0] fam_sums[i,:] = np.sum(gts[fam_indices,:],axis=0) fam_counts[i] = fam_indices.shape[0] # Place in vector corresponding to IDs if remove_proband: gts_id_dict = make_id_dict(gts_ids) G_sib = np.zeros((ids.shape[0],gts.shape[1]),dtype = np.float32) for i in range(0,ids.shape[0]): fam_index = fams_dict[ids_fams[i]] G_sib[i,:] = fam_sums[fam_index,:] n_i = fam_counts[fam_index] if remove_proband: G_sib[i,:] = G_sib[i,:] - gts[gts_id_dict[ids[i]],:] n_i = n_i-1 G_sib[i,:] = G_sib[i,:]/float(n_i) if return_famsizes: return [gtarray(G_sib, ids),fam_counts,fam_sums] else: return gtarray(G_sib,ids)
[docs]def find_par_gts(pheno_ids, ped, fams, gts_id_dict): """ Used in get_gts_matrix to find whether individuals have imputed or observed parental genotypes, and to find the indices of the observed/imputed parents in the observed/imputed genotype arrays. 'par_status' codes whether an individual has parents that are observed or imputed or neither. 'gt_indices' records the relevant index of the parent in the observed/imputed genotype arrays 'fam_labels' records the family of the individual based on the pedigree """ # Whether mother and father have observed/imputed genotypes par_status = np.zeros((pheno_ids.shape[0],2),dtype=int) par_status[:] = -1 # Indices of obsered/imputed genotypes in relevant arrays gt_indices = np.zeros((pheno_ids.shape[0],3),dtype=int) gt_indices[:] = -1 ## Build dictionaries # Where each individual is in the pedigree ped_dict = make_id_dict(ped,1) # Where the imputed data is for each family fam_dict = make_id_dict(fams) # Store family ID of each individual fam_labels = np.zeros((pheno_ids.shape[0]),dtype=fams.dtype) # Find status and find indices for i in range(0,pheno_ids.shape[0]): # Find index in genotypes if pheno_ids[i] in gts_id_dict: gt_indices[i,0] = gts_id_dict[pheno_ids[i]] # Find index in pedigree if pheno_ids[i] in ped_dict: ped_i = ped[ped_dict[pheno_ids[i]], :] fam_labels[i] = ped_i[0] # Check for observed father if ped_i[2] in gts_id_dict: gt_indices[i,1] = gts_id_dict[ped_i[2]] par_status[i,0] = 0 # Check for observed mother if ped_i[3] in gts_id_dict: gt_indices[i, 2] = gts_id_dict[ped_i[3]] par_status[i,1] = 0 # If parent not observed, look for imputation if ped_i[0] in fam_dict: imp_index = fam_dict[ped_i[0]] # Check if this is imputation of father, or mother, or both if ped_i[4] == 'False' and not par_status[i,0] == 0: gt_indices[i, 1] = imp_index par_status[i, 0] = 1 if ped_i[5] == 'False' and not par_status[i,1] == 0: gt_indices[i, 2] = imp_index par_status[i, 1] = 1 return par_status, gt_indices, fam_labels
[docs]def make_gts_matrix(gts,imp_gts,par_status,gt_indices, parsum = False): """ Used in get_gts_matrix to construct the family based genotype matrix given observed/imputed genotypes. 'gt_indices' has the indices in the observed/imputed genotype arrays; and par_status codes whether the parents are observed (0) or imputed (1). """ if np.min(gt_indices)<0: raise(ValueError('Missing genotype index')) N = gt_indices.shape[0] if parsum: gdim = 2 else: gdim = 3 G = np.zeros((N,gdim,gts.shape[1]),np.float32) # Proband genotypes G[:,0,:] = gts[gt_indices[:,0],:] # Paternal genotypes G[par_status[:, 0] == 0, 1 ,:] = gts[gt_indices[par_status[:, 0] == 0, 1], :] G[par_status[:, 0] == 1, 1, :] = imp_gts[gt_indices[par_status[:, 0] == 1, 1], :] # Maternal genotypes if parsum: G[par_status[:, 1] == 0, 1, :] += gts[gt_indices[par_status[:, 1] == 0, 2], :] G[par_status[:, 1] == 1, 1, :] += imp_gts[gt_indices[par_status[:, 1] == 1, 2], :] else: G[par_status[:, 1] == 0, 2, :] = gts[gt_indices[par_status[:, 1] == 0, 2], :] G[par_status[:, 1] == 1, 2, :] = imp_gts[gt_indices[par_status[:, 1] == 1, 2], :] return G
[docs]def get_gts_matrix(par_gts_f, gts_f, snp_ids = None,ids = None, sib = False, compute_controls = False, parsum = False, start=0, end=None, print_sample_info=False): """Reads observed and imputed genotypes and constructs a family based genotype matrix for the individuals with observed/imputed parental genotypes, and if sib=True, at least one genotyped sibling. Args: par_gts_f : :class:`str` path to HDF5 file with imputed parental genotypes gts_f : :class:`str` path to bed file with observed genotypes snp_ids : :class:`numpy.ndarray` If provided, only obtains the subset of SNPs specificed that are present in both imputed and observed genotypes ids : :class:`numpy.ndarray` If provided, only obtains the ids with observed genotypes and imputed/observed parental genotypes (and observed sibling genotypes if sib=True) sib : :class:`bool` Retrieve genotypes for individuals with at least one genotyped sibling along with the average of their siblings' genotypes and observed/imputed parental genotypes. Default False. compute_controls : :class:`bool` Compute polygenic scores for control families (families with observed parental genotypes set to missing). Default False. parsum : :class:`bool` Return the sum of maternal and paternal observed/imputed genotypes rather than separate maternal/paternal genotypes. Default False. Returns: G : :class:`sibreg.gtarray` Genotype array for the subset of genotyped individuals with complete imputed/obsereved parental genotypes. The array is [N x k x L], where N is the number of individuals; k depends on whether sib=True and whether parsum=True; and L is the number of SNPs. If sib=False and parsum=False, then k=3 and this axis indexes individual's genotypes, individual's father's imputed/observed genotypes, individual's mother's imputed/observed genotypes. If sib=True and parsum=False, then k=4, and this axis indexes the individual, the sibling, the paternal, and maternal genotypes in that order. If parsum=True and sib=False, then k=2, and this axis indexes the individual and sum of paternal and maternal genotypes; etc. If compute_controls=True, then a list is returned, where the first element is as above, and the following elements give equivalent genotyping arrays for control families where the mother has been set to missing, the father has been set to missing, and both parents have been set to missing. """ ####### Find parental status ####### ### Imputed parental file ### par_gts_f = h5py.File(par_gts_f,'r') # Get pedigree ped = convert_str_array(np.array(par_gts_f['pedigree'])) ped = ped[1:ped.shape[0],:] # Remove control families controls = np.array([x[0]=='_' for x in ped[:,0]]) # Compute genotype matrices if gts_f[(len(gts_f)-4):len(gts_f)] == '.bed': G = [get_gts_matrix_given_ped(ped[np.logical_not(controls),:],par_gts_f,gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)] if compute_controls: G.append(get_gts_matrix_given_ped(ped[np.array([x[0:3]=='_p_' for x in ped[:,0]]),],par_gts_f,gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)) G.append( get_gts_matrix_given_ped(ped[np.array([x[0:3] == '_m_' for x in ped[:, 0]]),], par_gts_f, gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)) G.append( get_gts_matrix_given_ped(ped[np.array([x[0:3] == '_o_' for x in ped[:, 0]]),], par_gts_f, gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)) return G else: return G[0] elif gts_f[(len(gts_f)-5):len(gts_f)] == '.bgen': G = [get_gts_matrix_given_ped_bgen(ped[np.logical_not(controls),:],par_gts_f,gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)] if compute_controls: G.append(get_gts_matrix_given_ped_bgen(ped[np.array([x[0:3]=='_p_' for x in ped[:,0]]),],par_gts_f,gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)) G.append( get_gts_matrix_given_ped_bgen(ped[np.array([x[0:3] == '_m_' for x in ped[:, 0]]),], par_gts_f, gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)) G.append( get_gts_matrix_given_ped_bgen(ped[np.array([x[0:3] == '_o_' for x in ped[:, 0]]),], par_gts_f, gts_f, snp_ids=snp_ids, ids=ids, sib=sib, parsum=parsum, start=start, end=end, print_sample_info = print_sample_info)) return G else: return G[0] else: raise(ValueError('Unknown filetype for observed genotypes file: '+str(gts_f)))
[docs]def get_indices_given_ped(ped, fams, gts_ids, ids=None, sib=False, verbose = False): """ Used in get_gts_matrix_given_ped to get the ids of individuals with observed/imputed parental genotypes and, if sib=True, at least one genotyped sibling. It returns those ids along with the indices of the relevant individuals and their first degree relatives in the observed genotypes (observed indices), and the indices of the imputed parental genotypes for those individuals. """ # Made dictionary for observed genotypes gts_id_dict = make_id_dict(gts_ids) # If IDs not provided, use all individuals with observed genotypes if ids is None: ids = gts_ids # Find individuals with siblings if sib: ids = find_individuals_with_sibs(ids, ped, gts_ids, return_ids_only=True) if verbose: print('Found ' + str(ids.shape[0]) + ' individuals with genotyped siblings') ### Find parental status if verbose: print('Checking for observed/imputed parental genotypes') par_status, gt_indices, fam_labels = find_par_gts(ids, ped, fams, gts_id_dict) # Find which individuals can be used none_missing = np.min(gt_indices, axis=1) none_missing = none_missing >= 0 N = np.sum(none_missing) if N == 0: raise ValueError( 'No individuals with phenotype observations and complete observed/imputed genotype observations') if verbose: print(str(N) + ' individuals with phenotype observations and complete observed/imputed genotypes observations') # Take those that can be used gt_indices = gt_indices[none_missing, :] par_status = par_status[none_missing, :] ids = ids[none_missing] # Find indices of individuals and their parents in observed genotypes observed_indices = np.sort(np.unique(np.hstack((gt_indices[:, 0], gt_indices[par_status[:, 0] == 0, 1], gt_indices[par_status[:, 1] == 0, 2])))) # Get indices of imputed parents imp_indices = np.sort(np.unique(np.hstack((gt_indices[par_status[:, 0] == 1, 1], gt_indices[par_status[:, 1] == 1, 2])))) # Return ids with imputed/observed parents return ids, observed_indices, imp_indices
[docs]def match_observed_and_imputed_snps(gts_f, par_gts_f, bim, snp_ids=None, start=0, end=None): """ Used in get_gts_matrix_given_ped to match observed and imputed SNPs and return SNP information on shared SNPs. Removes SNPs that have duplicated SNP ids. in_obs_sid contains the SNPs in the imputed genotypes that are present in the observed SNPs obs_sid_index contains the index in the observed SNPs of the common SNPs """ # Match SNPs from imputed and observed and restrict to those in list if snp_ids is None: snp_ids = gts_f.sid if end is None: end = snp_ids.shape[0] snp_ids = snp_ids[start:end] # Get bim info alleles = np.loadtxt(bim, dtype='U', usecols=(4, 5)) pos = np.loadtxt(bim, dtype=int, usecols=3) chromosome = np.loadtxt(bim, dtype=int, usecols=0) # Remove duplicate ids unique_snps, snp_indices, snp_counts = np.unique(snp_ids, return_index=True, return_counts=True) snp_set = set(snp_ids[snp_indices[snp_counts == 1]]) if len(snp_set) < snp_ids.shape[0]: print(str(snp_ids.shape[0]-len(snp_set))+' SNPs with duplicate IDs removed') # Read and match SNP ids imp_bim = convert_str_array(np.array(par_gts_f['bim_values'])) imp_sid = imp_bim[:, 1] obs_sid = gts_f.sid obs_sid_dict = make_id_dict(obs_sid) in_obs_sid = np.zeros((imp_sid.shape[0]), dtype=bool) obs_sid_index = np.zeros((imp_sid.shape[0]), dtype=int) for i in range(0, imp_sid.shape[0]): if imp_sid[i] in obs_sid_dict and imp_sid[i] in snp_set: in_obs_sid[i] = True obs_sid_index[i] = obs_sid_dict[imp_sid[i]] if np.sum(in_obs_sid) == 0: raise ValueError('No SNPs in common between imputed and observed data') obs_sid_index = obs_sid_index[in_obs_sid] sid = imp_sid[in_obs_sid] alleles = alleles[obs_sid_index, :] chromosome = chromosome[obs_sid_index] pos = pos[obs_sid_index] return chromosome, sid, pos, alleles, in_obs_sid, obs_sid_index
[docs]def get_gts_matrix_given_ped(ped, par_gts_f, gts_f, snp_ids=None, ids=None, sib=False, parsum=False, start=0, end=None, verbose=False, print_sample_info = False): """ Used in get_gts_matrix: see get_gts_matrix for documentation """ ### Genotype file ### bim = gts_f.split('.bed')[0] + '.bim' gts_f = Bed(gts_f,count_A1=True) # get ids of genotypes and make dict gts_ids = gts_f.iid[:, 1] # Get families with imputed parental genotypes fams = convert_str_array(np.array(par_gts_f['families'])) ### Find ids with observed/imputed parents and indices of those in observed/imputed data ids, observed_indices, imp_indices = get_indices_given_ped(ped, fams, gts_ids, ids=ids, sib=sib, verbose=print_sample_info) ### Match observed and imputed SNPs ### if verbose: print('Matching observed and imputed SNPs') chromosome, sid, pos, alleles, in_obs_sid, obs_sid_index = match_observed_and_imputed_snps(gts_f, par_gts_f, bim, snp_ids=snp_ids, start=start, end=end) # Read imputed parental genotypes if verbose: print('Reading imputed parental genotypes') if (imp_indices.shape[0]*in_obs_sid.shape[0]) < (np.sum(in_obs_sid)*fams.shape[0]): imp_gts = np.array(par_gts_f['imputed_par_gts'][imp_indices, :]) imp_gts = imp_gts[:,np.arange(in_obs_sid.shape[0])[in_obs_sid]] else: imp_gts = np.array(par_gts_f['imputed_par_gts'][:,np.arange(in_obs_sid.shape[0])[in_obs_sid]]) imp_gts = imp_gts[imp_indices,:] fams = fams[imp_indices] # Read observed genotypes if verbose: print('Reading observed genotypes') gts = gts_f[observed_indices, obs_sid_index].read().val gts_ids = gts_f.iid[observed_indices,1] gts_id_dict = make_id_dict(gts_ids) # Find indices in reduced data par_status, gt_indices, fam_labels = find_par_gts(ids, ped, fams, gts_id_dict) if verbose: print('Constructing family based genotype matrix') ### Make genotype design matrix if sib: if parsum: G = np.zeros((ids.shape[0], 3, gts.shape[1]), dtype=np.float32) G[:, np.array([0, 2]), :] = make_gts_matrix(gts, imp_gts, par_status, gt_indices, parsum=parsum) else: G = np.zeros((ids.shape[0],4,gts.shape[1]), dtype=np.float32) G[:,np.array([0,2,3]),:] = make_gts_matrix(gts, imp_gts, par_status, gt_indices, parsum=parsum) G[:,1,:] = get_fam_means(ids, ped, gts, gts_ids, remove_proband=True).gts else: G = make_gts_matrix(gts, imp_gts, par_status, gt_indices, parsum=parsum) del gts del imp_gts return gtarray(G, ids, sid, alleles=alleles, pos=pos, chrom=chromosome, fams=fam_labels, par_status=par_status)
[docs]def get_gts_matrix_given_ped_bgen(ped, par_gts_f, gts_f, snp_ids=None, ids=None, sib=False, parsum=False, start=0, end=None, verbose=False, print_sample_info = False): """ Used in get_gts_matrix: see get_gts_matrix for documentation """ ### Genotype file ### gts_f = open_bgen(gts_f, verbose=verbose) # get ids of genotypes and make dict gts_ids = gts_f.samples # Get families with imputed parental genotypes fams = convert_str_array(np.array(par_gts_f['families'])) ### Find ids with observed/imputed parents and indices of those in observed/imputed data ids, observed_indices, imp_indices = get_indices_given_ped(ped, fams, gts_ids, ids=ids, sib=sib, verbose=print_sample_info) ### Match observed and imputed SNPs ### if verbose: print('Matching observed and imputed SNPs') chromosome, sid, pos, alleles, in_obs_sid, obs_sid_index = match_observed_and_imputed_snps_bgen(gts_f, par_gts_f, snp_ids=snp_ids, start=start, end=end) # Read imputed parental genotypes if verbose: print('Reading imputed parental genotypes') if (imp_indices.shape[0]*in_obs_sid.shape[0]) < (np.sum(in_obs_sid)*fams.shape[0]): imp_gts = np.array(par_gts_f['imputed_par_gts'][imp_indices, :]) imp_gts = imp_gts[:,np.arange(in_obs_sid.shape[0])[in_obs_sid]] else: imp_gts = np.array(par_gts_f['imputed_par_gts'][:,np.arange(in_obs_sid.shape[0])[in_obs_sid]]) imp_gts = imp_gts[imp_indices,:] fams = fams[imp_indices] # Read observed genotypes if verbose: print('Reading observed genotypes') gts = np.sum(gts_f.read((observed_indices,obs_sid_index), np.float32)[:,:,np.array([0,2])],axis=2) gts_ids = gts_ids[observed_indices] gts_id_dict = make_id_dict(gts_ids) # Find indices in reduced data par_status, gt_indices, fam_labels = find_par_gts(ids, ped, fams, gts_id_dict) if verbose: print('Constructing family based genotype matrix') ### Make genotype design matrix if sib: if parsum: G = np.zeros((ids.shape[0], 3, gts.shape[1]), dtype=np.float32) G[:, np.array([0, 2]), :] = make_gts_matrix(gts, imp_gts, par_status, gt_indices, parsum=parsum) else: G = np.zeros((ids.shape[0],4,gts.shape[1]), dtype=np.float32) G[:,np.array([0,2,3]),:] = make_gts_matrix(gts, imp_gts, par_status, gt_indices, parsum=parsum) G[:,1,:] = get_fam_means(ids, ped, gts, gts_ids, remove_proband=True).gts else: G = make_gts_matrix(gts, imp_gts, par_status, gt_indices, parsum=parsum) del gts del imp_gts return gtarray(G, ids, sid, alleles=alleles, pos=pos, chrom=chromosome, fams=fam_labels, par_status=par_status)
[docs]def match_observed_and_imputed_snps_bgen(gts_f, par_gts_f, snp_ids=None, start=0, end=None): """ Used in get_gts_matrix_given_ped to match observed and imputed SNPs and return SNP information on shared SNPs. Removes SNPs that have duplicated SNP ids. in_obs_sid contains the SNPs in the imputed genotypes that are present in the observed SNPs obs_sid_index contains the index in the observed SNPs of the common SNPs """ # Match SNPs from imputed and observed and restrict to those in list if snp_ids is None: snp_ids = gts_f.ids if end is None: end = snp_ids.shape[0] snp_ids = snp_ids[start:end] # Get bim info alleles = np.array([x.split(',') for x in gts_f.allele_ids]) pos = np.array(gts_f.positions) chromosome = np.array(gts_f.chromosomes) # Remove duplicate ids unique_snps, snp_indices, snp_counts = np.unique(snp_ids, return_index=True, return_counts=True) snp_set = set(snp_ids[snp_indices[snp_counts == 1]]) if len(snp_set) < snp_ids.shape[0]: print(str(snp_ids.shape[0]-len(snp_set))+' SNPs with duplicate IDs removed') ## Read and match SNP ids imp_bim = convert_str_array(np.array(par_gts_f['bim_values'])) # Find relevant column for SNP ids in imputed data imp_bim_cols = convert_str_array(np.array(par_gts_f['bim_columns'])) if 'rsid' in imp_bim_cols: id_col = np.where('rsid' == imp_bim_cols)[0][0] elif 'id' in imp_bim_cols: id_col = np.where('id' == imp_bim_cols)[0][0] else: raise(ValueError('Could not find SNP ids in imputed parental genotypes')) imp_sid = imp_bim[:, id_col] obs_sid = gts_f.ids if np.unique(obs_sid).shape[0] == 1: obs_sid = gts_f.rsids obs_sid_dict = make_id_dict(obs_sid) in_obs_sid = np.zeros((imp_sid.shape[0]), dtype=bool) obs_sid_index = np.zeros((imp_sid.shape[0]), dtype=int) for i in range(0, imp_sid.shape[0]): if imp_sid[i] in obs_sid_dict and imp_sid[i] in snp_set: in_obs_sid[i] = True obs_sid_index[i] = obs_sid_dict[imp_sid[i]] if np.sum(in_obs_sid) == 0: raise ValueError('No SNPs in common between imputed and observed data') obs_sid_index = obs_sid_index[in_obs_sid] sid = imp_sid[in_obs_sid] alleles = alleles[obs_sid_index, :] if 'Chr' in imp_bim_cols: chr_col = np.where('Chr' == imp_bim_cols)[0][0] else: chr_col = 0 chromosome = imp_bim[in_obs_sid,chr_col] pos = pos[obs_sid_index] return chromosome, sid, pos, alleles, in_obs_sid, obs_sid_index
[docs]def compute_pgs(par_gts_f, gts_f, pgs, sib=False, compute_controls=False): """Compute a polygenic score (PGS) for the individuals with observed genotypes and observed/imputed parental genotypes. Args: par_gts_f : :class:`str` path to HDF5 file with imputed parental genotypes gts_f : :class:`str` path to bed file with observed genotypes pgs : :class:`sibreg.pgs` the PGS, defined by the weights for a set of SNPs and the alleles of those SNPs sib : :class:`bool` Compute the PGS for genotyped individuals with at least one genotyped sibling and observed/imputed parental genotypes. Default False. compute_controls : :class:`bool` Compute polygenic scores for control families (families with observed parental genotypes set to missing). Default False. Returns: pg : :class:`sibreg.gtarray` Return the polygenic score as a genotype array with columns: individual's PGS, mean of their siblings' PGS, observed/imputed paternal PGS, observed/imputed maternal PGS """ G = get_gts_matrix(par_gts_f, gts_f, snp_ids=pgs.snp_ids, sib=sib, compute_controls=compute_controls) if sib: cols = np.array(['proband', 'sibling', 'paternal', 'maternal']) else: cols = np.array(['proband', 'paternal', 'maternal']) if compute_controls: return [pgs.compute(x,cols) for x in G] else: return pgs.compute(G,cols)
[docs]def fit_sibreg_model(y, X, fam_labels, add_intercept=False, tau_init=1, return_model=True, return_vcomps=True, return_fixed=True): """Compute the MLE for the fixed effects in a family-based linear mixed model. Args: y : :class:`~numpy:numpy.array` vector of phenotype values X: :class:`~numpy:numpy.array` regression design matrix for fixed effects fam_labels : :class:`~numpy:numpy.array` vector of family labels: residual correlations in y are modelled between family members (that share a fam_label) add_intercept : :class:`bool` whether to add an intercept to the fixed effect design matrix Returns: model : :class:`sibreg.model` the sibreg model object, if return_model=True vcomps: :class:`float` the MLEs for the variance parameters: sigma2 (residual variance) and tau (ratio between sigma2 and family variance), if return_vcomps=True alpha : :class:`~numpy:numpy.array` MLE of fixed effects, if return_fixed=True alpha_cov : :class:`~numpy:numpy.array` sampling variance-covariance matrix for MLE of fixed effects, if return_fixed=True """ # Optimize model sigma_2_init = np.var(y)*tau_init/(1+tau_init) null_model = model(y, X, fam_labels, add_intercept=add_intercept) null_optim = null_model.optimize_model(np.array([sigma_2_init,tau_init])) # Create return list return_list = [] if return_model: return_list.append(null_model) if return_vcomps: return_list += [null_optim['sigma2'], null_optim['tau']] if return_fixed: return_list += null_model.alpha_mle(null_optim['tau'], null_optim['sigma2'], compute_cov=True) return return_list
[docs]def read_phenotype(phenofile, missing_char = 'NA', phen_index = 1): """Read a phenotype file and remove missing values. Args: phenofile : :class:`str` path to plain text phenotype file with columns FID, IID, phenotype1, phenotype2, ... missing_char : :class:`str` The character that denotes a missing phenotype value; 'NA' by default. phen_index : :class:`int` The index of the phenotype (counting from 1) if multiple phenotype columns present in phenofile Returns: y : :class:`~numpy:numpy.array` vector of non-missing phenotype values from specified column of phenofile pheno_ids: :class:`~numpy:numpy.array` corresponding vector of individual IDs (IID) """ pheno = Pheno(phenofile, missing=missing_char).read() y = np.array(pheno.val) pheno_ids = np.array(pheno.iid)[:,1] if y.ndim == 1: pass elif y.ndim == 2: y = y[:, phen_index - 1] else: raise (ValueError('Incorrect dimensions of phenotype array')) # Remove y NAs y_not_nan = np.logical_not(np.isnan(y)) if np.sum(y_not_nan) < y.shape[0]: y = y[y_not_nan] pheno_ids = pheno_ids[y_not_nan] print('Number of non-missing phenotype observations: ' + str(y.shape[0])) return y, pheno_ids
[docs]def match_phenotype(G,y,pheno_ids): """Match a phenotype to a genotype array by individual IDs. Args: G : :class:`gtarray` genotype array to match phenotype to y : :class:`~numpy:numpy.array` vector of phenotype values pheno_ids: :class:`~numpy:numpy.array` vector of individual IDs corresponding to phenotype vector, y Returns: y : :class:`~numpy:numpy.array` vector of phenotype values matched by individual IDs to the genotype array """ in_G_dict = np.array([x in G.id_dict for x in pheno_ids]) y = y[in_G_dict] pheno_ids = pheno_ids[in_G_dict] pheno_id_dict = make_id_dict(pheno_ids) y = y[[pheno_id_dict[x] for x in G.ids]] return y
[docs]def read_covariates(covar, missing_char = 'NA'): covar = Pheno(covar, missing=missing_char).read() X = np.array(covar.val) X = gtarray(X, ids=np.array(covar.iid)[:,1]) X.fill_NAs() return X