Source code for snipar.scripts.correlate

#!/usr/bin/env python
import numpy as np
import argparse
from snipar.correlate import *
from numba import set_num_threads
from numba import config as numba_config
from snipar.utilities import *

parser = argparse.ArgumentParser()
parser.add_argument('sumstats', type=str, help='Address of sumstats files in SNIPar sumstats.gz text format (without .sumstats.gz suffix). If there is a @ in the address, @ is replaced by the chromosome numbers in chr_range (optional argument)')
parser.add_argument('--chr_range',
                        type=parseNumRange,
                        nargs='*',
                        action=NumRangeAction,
                        help='number of the chromosomes to be imputed. Should be a series of ranges with x-y format or integers.', default=None)
parser.add_argument('out',type=str,help='Prefix for output file(s)')
parser.add_argument('--ldscores',type=str,help='Address of ldscores as output by LDSC',default=None)
parser.add_argument('--bed', type=str,
                    help='Address of observed genotype files in .bed format (without .bed suffix). If there is a # in the address, # is replaced by the chromosome numbers in the range of 1-22.',
                    default=None)
parser.add_argument('--threads',type=int,help='Number of threads to use for IBD inference. Uses all available by default.',default=None)
parser.add_argument('--min_maf',type=float,help='Ignore SNPs with minor allele frequency below min_maf (default 0.05)', default=0.05)
parser.add_argument('--corr_filter',type=float,help='Filter out SNPs with outlying sampling correlations more than corr_filter SDs from mean (default 6)',default=6.0)
parser.add_argument('--n_blocks',type=int,help='Number of blocks to use for block-jacknife variance estimate (default 200)',default=200)
parser.add_argument('--save_delete',action='store_true',help='Save jacknife delete values',default=False)
parser.add_argument('--ld_wind',type=float,help='The window, in cM, within which LD scores are computed (default 1cM)',default=1.0)
parser.add_argument('--ld_out',type=str,help='Output LD scores in LDSC format to this address',default=None)

[docs]def main(args): # Set number of threads if args.threads is not None: if args.threads < numba_config.NUMBA_NUM_THREADS: set_num_threads(args.threads) print('Number of threads: '+str(args.threads)) if args.ldscores is None and args.bed is None: raise(ValueError('Must provide either LD scores or genotypes to compute ld scores')) if args.ldscores is not None and args.bed is not None: raise(ValueError('Both LD scores and genotypes provided. Provided LD scores will be used')) # Find sumstats files sumstats_files, chroms = parse_obsfiles(args.sumstats, obsformat='sumstats.gz', chromosomes=args.chr_range) # Read sumstats s = read_sumstats_files(sumstats_files, chroms) # Filter print('Filtering on missing values') s.filter_NAs() print('Filtering out SNPs with MAF<'+str(args.min_maf)) s.filter_maf(args.min_maf) print('Filtering out SNPs with sampling correlations more than '+str(args.corr_filter)+' SDs from mean') s.filter_corrs(args.corr_filter) # Get LD scores if args.ldscores is not None: ld_files, chroms = parse_obsfiles(args.ldscores, obsformat='l2.ldscore.gz', chromosomes=[x for x in range(1,23)]) s.scores_from_ldsc(ld_files) s.filter_NAs() elif args.bed is not None: bedfiles, chroms = parse_obsfiles(args.bed, obsformat='bed', chromosomes=chroms) s.compute_ld_scores(bedfiles, chroms, args.ld_wind, args.ld_out) s.filter_NAs() # Compute correlations print('Using '+str(s.sid.shape[0])+' SNPs to compute correlations') r_dir_pop, r_dir_pop_SE, r_dir_pop_delete = s.cor_direct_pop(args.n_blocks) print('Correlation between direct and population effects: '+str(round(r_dir_pop,4))+' (S.E. '+str(round(r_dir_pop_SE,4))+')') r_dir_avg_NTC, r_dir_avg_NTC_SE, r_dir_avg_NTC_delete = s.cor_direct_avg_NTC(args.n_blocks) print('Correlation between direct and average NTCs: '+str(round(r_dir_avg_NTC,4))+' (S.E. '+str(round(r_dir_avg_NTC_SE,4))+')') outfile = str(args.out)+'_corrs.txt' print('Saving correlation estimates to '+outfile) first_col = np.array(['r_direct_population','r_direct_avg_NTC']).reshape((2,1)) header = np.array(['correlation','est','SE']).reshape((1,3)) cor_out = np.zeros((2,2)) cor_out[0,:] = np.array([r_dir_pop,r_dir_pop_SE]) cor_out[1,:] = np.array([r_dir_avg_NTC,r_dir_avg_NTC_SE]) outarray = np.vstack((header,np.hstack((first_col,cor_out)))) np.savetxt(outfile,outarray,fmt='%s') if args.save_delete: delete_outfile = args.out+'_delete.txt' print('Saving jacknife delete values to '+str(delete_outfile)) delete_out = np.vstack((r_dir_pop_delete,r_dir_avg_NTC_delete)).T delete_out = np.vstack((np.array(['r_direct_population','r_direct_avg_NTC']).reshape((1,2)),delete_out)) np.savetxt(delete_outfile, delete_out,fmt='%s')
if __name__ == "__main__": args=parser.parse_args() main(args)