Source code for snipar.tests.test_manhattan

import h5py, code
import numpy as np
import pandas as pd
from pysnptools.snpreader import Bed
from scipy.stats import norm
from pandas import DataFrame
from tqdm import tqdm
import statsmodels.api as sm
from scipy.stats import linregress
[docs]def imputation_test(chromosomes, imputed_prefix = '', expected_prefix = "../UKBioRDE_revision/data/tmp/filtered_ukb_chr", start = None, end = None, backup_threshold=0.01, genotyping_error_threshold=0.01, outname_prefix = "", ): #Data files for chromosome i should be named in this fashion: "prefix{i}" import matplotlib.pyplot as plt chromosomes_expected_genes_o = [] chromosomes_expected_genes_pm = [] chromosomes_imputed_genes_o = [] chromosomes_imputed_genes_pm = [] chrom_sizes = [] parent_ratio_backups = [] sib_ratio_backups = [] estimated_genotyping_errors = [] for chromosome in tqdm(chromosomes): with h5py.File(imputed_prefix+str(chromosome)+".hdf5",'r') as f: gts = np.array(f["imputed_par_gts"]).astype(float) fids = np.array(f["families"]).astype(str) ped_array = np.array(f["pedigree"]).astype(str) ped = pd.DataFrame(ped_array[1:], columns = ped_array[0]) non_duplicates = np.array(f["non_duplicates"]) parent_ratio_backup = np.array(f["parent_ratio_backup"]) sib_ratio_backup = np.array(f["sib_ratio_backup"]) estimated_genotyping_error = np.array(f["estimated_genotyping_error"]) rsids = np.array(f["bim_values"])[:,1] if "maf_x" in f: maf_x = np.array(f["maf_x"]) maf_coefs = np.array(f["maf_coefs"]) maf_TSS = np.array(f["maf_TSS"]) maf_RSS1 = np.array(f["maf_RSS1"]) maf_RSS2 = np.array(f["maf_RSS2"]) maf_R2_1 = np.array(f["maf_R2_1"]) maf_R2_2 = np.array(f["maf_R2_2"]) maf_larger1 = np.array(f["maf_larger1"]) maf_less0 = np.array(f["maf_less0"]) #TODO concat these across chroms #Fix backup ratio start end if start is not None: non_duplicates = non_duplicates+start parent_ratio_backups.append(parent_ratio_backup) sib_ratio_backups.append(sib_ratio_backup) estimated_genotyping_errors.append(estimated_genotyping_error) expected = Bed(expected_prefix+str(chromosome)+".bed", count_A1 = True) expected_gts = expected[:, non_duplicates].read().val.astype(float) chrom_sizes.append(gts.shape[1]) expected_ids = expected.iid iid_to_bed_index = {i:index for index, i in enumerate(expected_ids[:,1])} #fids of control families start with _ #this has the predix _*_ index_of_families_in_imputation = {fid:index for index,fid in enumerate(fids)} # no parent control starts with _o_ # only has father control starts with _p_ # only has father control starts with _m_ control_o_families = list({row["FID"][3:] for index, row in ped.iterrows() if row["FID"].startswith("_o_")}) #for each family select id of the parents parent_ids = ped.groupby("FID").agg({ 'FATHER_ID':lambda x: ([a for a in list(x) if a in ped["IID"].tolist()]+[None])[0], 'MOTHER_ID':lambda x: ([a for a in list(x) if a in ped["IID"].tolist()]+[None])[0], }) parents_of_control_o_families = parent_ids.loc[control_o_families] mother_indexes_control_o = [iid_to_bed_index[parents_of_control_o_families.loc[i, "MOTHER_ID"]] for i in control_o_families] father_indexes_control_o = [iid_to_bed_index[parents_of_control_o_families.loc[i, "FATHER_ID"]] for i in control_o_families] expected_parent_gts_control_o = (expected_gts[mother_indexes_control_o,:]+expected_gts[father_indexes_control_o,:])/2 expected_genes_o = expected_parent_gts_control_o index_of_control_families_in_imputation_o = [index_of_families_in_imputation["_o_"+i] for i in control_o_families] imputed_genes_o = gts[index_of_control_families_in_imputation_o,:] control_p = list({row["FID"][3:] for index, row in ped.iterrows() if row["FID"].startswith("_p_")}) control_m = list({row["FID"][3:] for index, row in ped.iterrows() if row["FID"].startswith("_m_")}) control_pm_families = control_p + control_m parent_of_control_m = parent_ids.loc[control_m] parent_of_control_p = parent_ids.loc[control_p] father_indexes_control_m = [iid_to_bed_index[parent_of_control_m.loc[i, "FATHER_ID"]] for i in control_m] mother_indexes_control_p = [iid_to_bed_index[parent_of_control_p.loc[i, "MOTHER_ID"]] for i in control_p] expected_parent_gts_control_pm = expected_gts[mother_indexes_control_p + father_indexes_control_m, :] expected_genes_pm = expected_parent_gts_control_pm index_of_control_families_in_imputation_pm = [index_of_families_in_imputation["_p_" + i] for i in control_p] + [index_of_families_in_imputation["_m_" + i] for i in control_m] imputed_genes_pm = gts[index_of_control_families_in_imputation_pm,:] chromosomes_expected_genes_o.append(expected_genes_o) chromosomes_expected_genes_pm.append(expected_genes_pm) chromosomes_imputed_genes_o.append(imputed_genes_o) chromosomes_imputed_genes_pm.append(imputed_genes_pm) whole_expected_genes_o = np.hstack(chromosomes_expected_genes_o) whole_imputed_genes_o = np.hstack(chromosomes_imputed_genes_o) whole_expected_genes_pm = np.hstack(chromosomes_expected_genes_pm) whole_imputed_genes_pm = np.hstack(chromosomes_imputed_genes_pm) parent_ratio_backup = np.hstack(parent_ratio_backups) sib_ratio_backup = np.hstack(sib_ratio_backups) estimated_genotyping_error = np.hstack(estimated_genotyping_errors) pm_coefs = [] pm_pvals = [] empty = [] results = [] pop_size, nsnps = whole_imputed_genes_pm.shape for i in tqdm(range(nsnps)): x = whole_imputed_genes_pm[:,i] y = whole_expected_genes_pm[:,i] mask = ~(np.isnan(x) | np.isnan(y)) noval = (np.sum(mask) < 10) if noval: pm_coefs.append(-1) pm_pvals.append(-1) results.append(None) else: result = sm.OLS(y[mask], x[mask]).fit() pm_coefs.append(result.params[0]) pm_pvals.append(result.t_test(([1],1)).pvalue) results.append(result) empty.append(noval) pm_pvals_log10 = -np.log10(pm_pvals) numerical = ~(np.array(empty) | np.isnan(pm_pvals_log10) | np.isinf(pm_pvals_log10) | (parent_ratio_backup > backup_threshold) | (estimated_genotyping_error > genotyping_error_threshold)) counter = 0 pm_chrom_sizes = [i for i in chrom_sizes] for i in range(len(chrom_sizes)): tmp = chrom_sizes[i] pm_chrom_sizes[i] = np.sum(numerical[counter:counter+chrom_sizes[i]]) counter += tmp pm_pvals_log10 = pm_pvals_log10[numerical] pm_coefs = np.array(pm_coefs)[numerical] pm_pvals = np.array(pm_pvals)[numerical] pm_pvals_log10_sorted = np.sort(pm_pvals_log10) z = np.arange(len(pm_pvals_log10))/len(pm_pvals_log10) x = -np.log(1-z)/np.log(10) fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(111) ax.plot(x, pm_pvals_log10_sorted, '-b') ax.plot(x, pm_pvals_log10_sorted, '.r') ax.plot([np.min(x), np.max(x)], [np.min(x), np.max(x)], 'y') plt.xlabel("expected(-log10(p-value))") plt.ylabel("observed(-log10(p-value))") plt.title(f"{outname_prefix}\nqqplot") print(f"{imputed_prefix}_{outname_prefix}_pm_qq.png") plt.savefig(f"{imputed_prefix}_{outname_prefix}_pm_qq") plt.clf() # sample data nsnps = len(pm_coefs) all_chrom_names = [] for i in range(len(chromosomes)): all_chrom_names += [f'ch-{chromosomes[i]}']*pm_chrom_sizes[i] df = DataFrame({'gene' : [f'gene-{i}' % i for i in np.arange(nsnps)], 'pm_coef' : pm_coefs, 'pm_pvals' : pm_pvals, 'pm_pvals_log10': pm_pvals_log10, 'chromosome' : all_chrom_names}) for col in ['pm_coef', 'pm_pvals', 'pm_pvals_log10']: df.chromosome = df.chromosome.astype('category') df.chromosome = df.chromosome.cat.set_categories([f'ch-{i}' for i in chromosomes], ordered=True) df = df.sort_values('chromosome') # How to plot gene vs. -log10(pvalue) and colour it by chromosome? df['ind'] = range(len(df)) df_grouped = df.groupby(('chromosome')) # manhattan plot fig = plt.figure(figsize=(40, 8)) # Set the figure size ax = fig.add_subplot(111) colors = ['darkred','darkgreen','darkblue', 'darkslategray'] x_labels = [] x_labels_pos = [] for num, (name, group) in enumerate(df_grouped): # group.plot(kind='scatter', x='ind', y='coef',color=colors[num % len(colors)], ax=ax, size=1) ax.scatter(group["ind"], group[col], color=colors[num % len(colors)], s=1) x_labels.append(name) x_labels_pos.append((group['ind'].iloc[-1] - (group['ind'].iloc[-1] - group['ind'].iloc[0])/2)) ax.set_xticks(x_labels_pos) ax.set_xticklabels(x_labels) # set axis limits ax.set_xlim([0, len(df)]) ax.set_ylim([0, np.ma.masked_invalid(df[col]).max()+0.2]) ax.grid(True) # x axis label ax.set_xlabel('Chromosome') ax.set_title(f"{outname_prefix}\n{col}") plt.savefig(f"{imputed_prefix}_{outname_prefix}_pm_manhattan_{col}") plt.clf() print(f"{imputed_prefix}_{outname_prefix}_pm_manhattan_{col}.png") #------------------------------------------------------------------ o_coefs = [] o_pvals = [] empty = [] pop_size, nsnps = whole_imputed_genes_o.shape for i in tqdm(range(nsnps)): x = whole_imputed_genes_o[:,i] y = whole_expected_genes_o[:,i] mask = ~(np.isnan(x) | np.isnan(y)) noval = (np.sum(mask) < 10) if noval: o_coefs.append(-1) o_pvals.append(-1) else: result = sm.OLS(y[mask], x[mask]).fit() o_coefs.append(result.params[0]) o_pvals.append(result.t_test(([1],1)).pvalue) empty.append(noval) o_pvals_log10 = -np.log10(o_pvals) numerical = ~(np.array(empty) | np.isnan(o_pvals_log10) | np.isinf(o_pvals_log10) | (sib_ratio_backup > backup_threshold) | (estimated_genotyping_error > genotyping_error_threshold)) counter = 0 o_chrom_sizes = [i for i in chrom_sizes] for i in range(len(chrom_sizes)): tmp = chrom_sizes[i] o_chrom_sizes[i] = np.sum(numerical[counter:counter+chrom_sizes[i]]) counter += tmp o_pvals_log10 = o_pvals_log10[numerical] o_coefs = np.array(o_coefs)[numerical] o_pvals = np.array(o_pvals)[numerical] o_pvals_log10_sorted = np.sort(o_pvals_log10) z = np.arange(len(o_pvals_log10))/len(o_pvals_log10) x = -np.log(1-z)/np.log(10) fig = plt.figure(figsize=(10, 10)) ax = fig.add_subplot(111) ax.plot(x, o_pvals_log10_sorted, '-b') ax.plot(x, o_pvals_log10_sorted, '.r') ax.plot([np.min(x), np.max(x)], [np.min(x), np.max(x)], 'y') plt.xlabel("expected(-log10(p-value))") plt.ylabel("observed(-log10(p-value))") plt.title(f"{outname_prefix}\nqqplot") print(f"{imputed_prefix}_{outname_prefix}_o_qq.png") plt.savefig(f"{imputed_prefix}_{outname_prefix}_o_qq") plt.clf() # sample data nsnps = len(o_coefs) all_chrom_names = [] for i in range(len(chromosomes)): all_chrom_names += [f'ch-{chromosomes[i]}']*o_chrom_sizes[i] df = DataFrame({'gene' : [f'gene-{i}' % i for i in np.arange(nsnps)], 'o_coef' : o_coefs, 'o_pvals' : o_pvals, 'o_pvals_log10': o_pvals_log10, 'chromosome' : all_chrom_names}) for col in ['o_coef', 'o_pvals', 'o_pvals_log10']: df.chromosome = df.chromosome.astype('category') df.chromosome = df.chromosome.cat.set_categories([f'ch-{i}' for i in chromosomes], ordered=True) df = df.sort_values('chromosome') # How to plot gene vs. -log10(pvalue) and colour it by chromosome? df['ind'] = range(len(df)) df_grouped = df.groupby(('chromosome')) # manhattan plot fig = plt.figure(figsize=(40, 8)) # Set the figure size ax = fig.add_subplot(111) colors = ['darkred','darkgreen','darkblue', 'darkslategray'] x_labels = [] x_labels_pos = [] for num, (name, group) in enumerate(df_grouped): # group.plot(kind='scatter', x='ind', y='coef',color=colors[num % len(colors)], ax=ax, size=1) ax.scatter(group["ind"], group[col], color=colors[num % len(colors)], s=1) x_labels.append(name) x_labels_pos.append((group['ind'].iloc[-1] - (group['ind'].iloc[-1] - group['ind'].iloc[0])/2)) ax.set_xticks(x_labels_pos) ax.set_xticklabels(x_labels) # set axis limits ax.set_xlim([0, len(df)]) ax.set_ylim([0, np.ma.masked_invalid(df[col]).max()+0.2]) ax.grid(True) # x axis label ax.set_xlabel('Chromosome') ax.set_title(f"{outname_prefix}\n{col}") plt.savefig(f"{imputed_prefix}_{outname_prefix}_o_manhattan_{col}") plt.clf() print(f"{imputed_prefix}_{outname_prefix}_o_manhattan_{col}.png")