Source code for spycone.splicingfactor

from collections import defaultdict
import numpy as np
from Bio import motifs
from Bio.Seq import Seq
import pandas as pd
import os
import pickle
import requests
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce
from scipy.stats import pearsonr
from scipy.stats import mannwhitneyu, fisher_exact, kruskal
from Bio.SeqUtils import GC
from joblib import Parallel, delayed
import gc


from ._util_stat.multipletesting import pvalue_correction

dir_path = os.path.dirname(os.path.realpath(__file__))

[docs]class SF_coexpression(): """ Return the coexpression between Splicing factor and isoforms Parameters ------------ dataset : input dataset object padj_method : Multiple testing method. Default=Bonferonni corr_cutoff : Correlation coefficient cutoff. Dafault=0.7 padj_cutoff : Adjusted p-value cutoff. Default=0.05 method : Method to calculate the correlation value. Return ------- Create an instance for co-expression analysis of splicing factors and transcript abundance. """ def __init__(self, dataset, padj_method="bonf", corr_cutoff=0.7, padj_cutoff = 0.05, method="pearson"): self.dataset=dataset self.padj_method = padj_method self.corr_cutoff = corr_cutoff self.method= method def _get_splicing_factors(self): sfs=[] with open(os.path.join(dir_path, "data/network/splicingfactors.txt"), "r") as f: tmp=f.readlines() for ss in tmp: sfs.append(ss.replace("\n", "")) sfs = pd.DataFrame(sfs, columns=['GeneSymbol']) mapper = dict(zip(self.dataset.symbs, self.dataset.gene_id)) sfs['entrezid'] = list(map(lambda x: mapper[x] if x in self.dataset.symbs else None, sfs['GeneSymbol'])) sfs = sfs[sfs['GeneSymbol'].isin(self.dataset.symbs)] print(str(np.sum(sfs['GeneSymbol'].isin(self.dataset.symbs)))+" of the splicing factor are in the dataset") return sfs def _get_matrices(self, sfs, timelag=1): ##calculate isoform abundances # isoabun = np.empty(shape=(self.dataset.shape[0], 0, self.dataset.shape[2])) transcript_idx = [] geneid = [] for gene, info in self.dataset.isoobj.normdf.items(): if len(info['indices'])>1: transcript_idx.append(info['indices']) isoabun = np.concatenate((isoabun, info['normarr']), axis=1) for _ in range(info['normarr'].shape[1]): geneid.append(gene) transcript_idx = reduce(lambda x,y: x+y, transcript_idx) transcript = self.dataset.transcript_id[transcript_idx] isoabun = np.concatenate(isoabun, axis=1) ##isoform abundance of isoforms shape = (isoforms, time points*replicates) #correlate splicing factor with isoform abundance ###splicing factor expression shape = (splicing factor, time points*replicates) sfs_expression = np.empty(shape=(self.dataset.shape[0], 0, self.dataset.shape[2])) sf_id = [] for e,sf in enumerate(sfs['entrezid']): tmp = self.dataset.isoobj.normdf[sf] sfexp = np.expand_dims(np.sum(tmp['array'], axis=1),axis=1) sfs_expression=np.concatenate([sfs_expression, sfexp], axis=1) sf_id.append(sfs['GeneSymbol'].tolist()[e]) sfs_expression = np.concatenate(sfs_expression, axis=1) ##filter for only IS genes isgenes_idx = np.where(np.isin(geneid, self.dataset.isoobj.is_result['gene']))[0] final_isoabun = isoabun[isgenes_idx,:] final_transcript = transcript[isgenes_idx] final_geneid = [geneid[x] for x in isgenes_idx] ##filter splicing factors variances highvar = np.where(np.var(sfs_expression, axis=1)>1)[0] final_sfsexp =sfs_expression[highvar,:] final_sfid = [sf_id[x] for x in highvar] ## final_mat = np.concatenate([final_isoabun, final_sfsexp]) mat_index = np.append(final_transcript, final_sfid) assert final_mat.shape[0] == mat_index.shape[0] return final_mat, mat_index, final_transcript, final_geneid, final_sfid def _corrcoef_loop(self, matrix): rows, cols = matrix.shape[0], matrix.shape[1] r = np.ones(shape=(rows, rows)) p = np.ones(shape=(rows, rows)) for i in range(rows): for j in range(i+1, rows): if self.method=="pearson": r_, p_ = pearsonr(matrix[i], matrix[j]) if self.method=="softdtw": pass r[i, j] = r[j, i] = r_ p[i, j] = p[j, i] = p_ return r, p def _calculate_coexpression(self, final_mat, final_transcript, final_geneid, final_sfid): ##correlation with p values #######tmp read the results below sf_iso_mat, sf_iso_pval = self._corrcoef_loop(final_mat) ####multiple correction mapper = dict(zip(self.dataset.gene_id, self.dataset.symbs)) sf_iso_07 = sf_iso_mat[final_transcript.shape[0]+1:, 0:final_transcript.shape[0]] sf_iso_dict = {"sf":[], "iso":[], "iso_gene":[], "iso_genesymb":[], "corr":[], "pval":[]} for sf in range(sf_iso_07.shape[0]): for iso in range(sf_iso_07.shape[0]): sf_iso_dict['sf'].append(final_sfid[sf]) sf_iso_dict['iso'].append(final_transcript[iso]) sf_iso_dict['iso_gene'].append(final_geneid[iso]) sf_iso_dict['iso_genesymb'].append(mapper[final_geneid[iso]]) sf_iso_dict['corr'].append(sf_iso_07[sf,iso]) sf_iso_dict['pval'].append(sf_iso_pval[sf][iso]) ##correct for multiple testing sf_iso_df = pd.DataFrame(sf_iso_dict) sf_iso_df = sf_iso_df.sort_values("pval") mp = pvalue_correction(sf_iso_df['pval'], method=self.padj_method) sf_iso_df['padj']=mp.corrected_pvals ##cleaning data, sf_iso_df['cluster'] = list(map(self._map_to_clusters, sf_iso_df['iso_gene'])) sf_iso_df = sf_iso_df[(np.abs(sf_iso_df['corr'])>self.corr_cutoff) & (sf_iso_df['padj']<0.05)].sort_values("iso_gene") sf_iso_df['bin_corr'] = [1 if x > 0 else -1 for x in sf_iso_df['corr']] sf_group = sf_iso_df.groupby(["bin_corr","iso_gene","iso_genesymb", "cluster"]).aggregate(lambda x: tuple(x)) if sf_group.shape[0] == 0: return "No significant correlated SF is found." everycluster = pd.DataFrame(sf_group.groupby(["cluster", "bin_corr"]).aggregate(lambda x: tuple(x))['sf'].apply(lambda x: np.unique(reduce(lambda a,y: a+y, x)))).reset_index() #union_sf : the sf that are only found in one direction ####get list of SF that is positively and negatively correlated to at least one of the isoform in the cluster everycluster['diff_sf']=everycluster.groupby("cluster").aggregate(lambda x: list(x))['sf'].apply(lambda x: np.repeat(set(x[0]).symmetric_difference(set(x[1])),2) if len(x)>1 else [x[0]]).explode('sf') unique_sf = [] union_sf =[] for x,s in everycluster.iterrows(): unique_sf.append(set(s['sf']).intersection(set(s['diff_sf']))) union_sf.append(set(s['sf']).difference(set(s['diff_sf']))) everycluster['unique_sf']=unique_sf everycluster['union_sf']=union_sf return sf_iso_df, everycluster def _map_to_clusters(self, gene): #sf_iso_clu = [] #for gene in sf_iso_df['iso_gene']: match=False for u,v in self.dataset.clusterobj.genelist_clusters.items(): if gene in set(v): match=True break if match: return u #sf_iso_df['cluster'] = sf_iso_clu def coexpression_with_sf(self): sfs = self._get_splicing_factors() final_mat, mat_index, list_transcript, list_geneid, list_sfid = self._get_matrices(sfs) sf_iso_df, cluster_sf = self._calculate_coexpression(final_mat, list_transcript, list_geneid, list_sfid) return sf_iso_df, cluster_sf
def create_motifsobj(): ########this only read the consensus # motifs_df = pd.read_csv(os.path.join(dir, "sfanalysis/motif_info.csv"), sep="\t") # print(motifs_df.head()) # motifs_df['motifs']=motifs_df['motifs'].apply(eval) # instances_pre = motifs_df[motifs_df['motifname']==self.list_SF[0]]['motifs'].to_list() # motif_obj = [] # for instance in instances_pre: # instance = list(map(lambda x: x.replace("N",""), instance)) # print(instance) # instances = list(map(Seq, instance)) # m = motifs.create(instances) # motif_obj.append(m) # return motif_obj motif_obj = defaultdict(list) for mat in os.listdir(os.path.join(dir_path, "data/eCLIP_PWM")): if ".txt" in mat : with open(os.path.join(dir_path, "data/eCLIP_PWM",mat)) as handle: record = motifs.parse(handle, "TRANSFAC") motif_obj[mat.split(".")[1]].append(record) with open(os.path.join(dir_path, "data/network/splicingfactors.txt"),"w") as f: for u in motif_obj.keys(): f.write(u+"\n") f.close() # with open(os.path.join(dir,"spycone_pkg/spycone/data/motif_obj.pkl"), "wb") as f: # pickle.dump(motif_obj, f, pickle.HIGHEST_PROTOCOL) # f = open(os.path.join(dir,"spycone_pkg/spycone/data/motif_obj.pkl"), "rb") # motif_obj = pickle.load(f) motif_pssm = defaultdict(list) for sf, motif_o in motif_obj.items(): for e,bpmotif in enumerate(motif_o): if len(bpmotif)==1: pwm=bpmotif[0].counts.normalize() # background = {"A":(1-gc_ratio)/2,"C":gc_ratio/2,"G":gc_ratio/2,"T":(1-gc_ratio)/2} # pssm=pwm.log_odds(background) motif_pssm[sf].append([pwm]) with open(os.path.join(dir,"spycone_pkg/spycone/data/motif_pwm.pkl"), "wb") as f: pickle.dump(motif_pssm, f, pickle.HIGHEST_PROTOCOL) return motif_obj, motif_pssm
[docs]class SF_motifsearch(): ''' Return the PSSM score of target SF and exons binding. Parameters ------------ list_SF : list List of splicing factors / RBPs. E.g. the SF that is co-expressed to the isoform abundance. list_genes : list List of genes you want to check for SF binding sites. E.g. the cluster of genes that the input SF is co-expressed. gtf_df : str or dataframe gtf file path / dataframe gc_ratio : GC content that makes up the background for PSSM score calculation. Default=0.6. flanking : Flanking region size Return ------- Create an instance for SF motif enrichment analysis. ''' def __init__(self, list_SF, list_genes, dataset, gtf, gc_ratio=0.6, flanking=400): self.list_SF = list_SF self.list_genes = list_genes self.dataset = dataset self.gtf = gtf self.flanking = flanking #self.motif_pssm = pickle.load(open(os.path.join(dir_path,"data/motif_pssm.pkl"), "rb")) self.motif_obj = pickle.load(open(os.path.join(dir_path,"data/motif_obj.pkl"), "rb")) self.motif_pwm = pickle.load(open(os.path.join(dir_path,"data/motif_pwm.pkl"), "rb")) self.motif_thres = pickle.load(open(os.path.join(dir_path,"data/motif_thres.pkl"), "rb")) ###perform motifs search def get_seq(self, exid, site="5p", bg=False): target = self.gtf[self.gtf['exon_id']==exid] if target.shape[0]!=0: targetpos=target[['seqname','start','end','exon_id']] targetpos = targetpos.set_index(['exon_id']) targetpos = targetpos[['seqname', 'start','end']].astype(str) # if bg: # exon_len = int(targetpos['end'].values[0])-int(targetpos['start'].values[0])+1 # random_start = np.random.randint(1,45000000, size=1) # url = f"https://api.genome.ucsc.edu/getData/sequence?genome=hg38;chrom=chr{targetpos['seqname'].values[0]};start={str(int(random_start)-self.flanking)};end={str(int(random_start)+exon_len+self.flanking)}" # response = requests.get(url) # else: if site=="5p": url = f"https://api.genome.ucsc.edu/getData/sequence?genome=hg38;chrom=chr{targetpos['seqname'].values[0]};start={str(int(targetpos['start'].values[0])-self.flanking)};end={str(int(targetpos['start'].values[0])+self.flanking)}" elif site=="3p": url = f"https://api.genome.ucsc.edu/getData/sequence?genome=hg38;chrom=chr{targetpos['seqname'].values[0]};start={str(int(targetpos['end'].values[0])-self.flanking)};end={str(int(targetpos['end'].values[0])+self.flanking)}" response = requests.get(url) if 'dna' in response.json().keys(): return response.json()['dna'] else: return None def _exon_search(self, exons, sf, site): sf_results=defaultdict(list) motifs_pwm = self.motif_pwm[sf] for exon in exons: exon_seq = self.get_seq(exid=exon, site=site) if exon_seq is None: continue else: for e,pwm in enumerate(motifs_pwm): if len(pwm)==1: test_seq = Seq(exon_seq) submotstr = f'{sf}_{e}' gc_ratio = round(GC(test_seq)/100,2) background = {"A":(1-gc_ratio)/2,"C":gc_ratio/2,"G":gc_ratio/2,"T":(1-gc_ratio)/2} pssm=pwm[0].log_odds(background) # distribution = pssm[0].distribution(background=background, precision=10**4) # threshold = distribution.threshold_patser() submotifs = None submotstr = None # for pos, sc in pssm[0].search(test_seq, self.threshold): # dire = np.sign(pos) # #relpos_start = np.abs(pos) - self.flanking # #relpos_end = np.abs(pos) - len(exon_seq) + self.flanking # submotifs = f"{exon};{dire};{pos};{sc}" # submotstr = f'{sf}_{e}' submotstr = f'{sf}_{e}' allscores = pssm.calculate(test_seq) # if self.motif_thres[submotstr] is not None: # allscores = pssm[0].search(test_seq, threshold=self.motif_thres[submotstr]) # for pos, sc in enumerate(allscores): # submotifs = f"{exon};1;{pos};{sc}" # sf_results[submotstr].append(submotifs) #### this only save the scores not the position this_threshold = self.motif_thres[submotstr] sf_results[submotstr] = [sc if not np.isnan(sc) else next if sc > this_threshold else next for sc in allscores] else: continue gc.collect() return sf_results def _parallel_search(self, sf, exon_dicts, site, bg=False): sf_results = [] motif_str = [] #for each SF search in every exons #for gene, exons in self.target_exons_dict.items(): # with concurrent.futures.ProcessPoolExecutor() as executor: # for gene, each_gene_res in zip(exon_dicts.values(), executor.map(self._exon_search, exon_dicts.values(), np.repeat(sf, len(exon_dicts.values())))): # sf_results.append(each_gene_res) sf_results = Parallel()(delayed(self._exon_search)(exons=v, sf=sf, site=site) for v in exon_dicts.values()) # for v in exon_dicts.values(): # res = self._exon_search(v, sf, site=site) # sf_results.append(res) sf_per_results = defaultdict(list) for eachgene in sf_results: for eachmot, eachres in eachgene.items(): if eachmot is not None: for x in eachres: sf_per_results[eachmot].append(x) return sf_per_results def search_motifs(self, exons="loss", site="5p"): """ Calculate PSSM scores on all exons in the input list of genes. Parameters ----------- """ #get exons from IS results target_exons = self.dataset.isoobj.is_result[self.dataset.isoobj.is_result['gene_symb'].isin(self.list_genes)] target_exons_dict = defaultdict(list) n_exons =0 if exons=="loss": for x in target_exons.iterrows(): target_exons_dict[x[1]['gene_symb']]=x[1]['loss_exons'] n_exons+=len(x[1]['loss_exons']) else: for x in target_exons.iterrows(): target_exons_dict[x[1]['gene_symb']]=x[1]['gain_exons'] n_exons+=len(x[1]['gain_exons']) self.target_exons_dict = target_exons_dict sf_results = defaultdict(list) motif_str =defaultdict(list) #get motif objects from list SF # motif_obj, motif_pssm = self.create_motifsobj() # self.motif_obj = motif_obj # self.motif_pssm = motif_pssm # parallelize motif searching ## return a list of result in the following format: ### exonID;direction of the strand;matched position of the seq;score from biopython;relative pos to the start site (including flanking region);relative pos to the end + flanking region; length of the exon; for sf in self.list_SF: each_sf_res = self._parallel_search(sf, target_exons_dict, site=site) sf_results[sf].append(each_sf_res) # iterate the results to create a dictionary where keys are SF and values are the results for each exon ### note that each SF might have multiple motifs, in this case the motifs are denoted as SF_1, SF_2 from the same SF. self.motif_result = sf_results gc.collect() return sf_results, n_exons def search_background_motifs(self, site="5p"): allexons = self.gtf["exon_id"].to_numpy() #get exons from IS results if "self.target_exons_dict" not in locals(): ValueError("Please run search_motifs first.") bg_exons_dict=defaultdict(list) n_exons=0 target_exons = self.dataset.isoobj.is_result[self.dataset.isoobj.is_result['gene_symb'].isin(self.list_genes)] for x in target_exons.iterrows(): bg_exons_dict[x[1]['gene_symb']]=x[1]['constant_exons'] n_exons+=len(x[1]['constant_exons']) self.bg_exons_dict = bg_exons_dict print("get exons bg dict") #get motif objects from list SF sf_results = defaultdict(list) motif_str =defaultdict(list) #get motif objects from list SF # motif_obj, motif_pssm = self.create_motifsobj() # self.motif_obj = motif_obj # self.motif_pssm = motif_pssm # parallelize motif searching ## return a list of result in the following format: ### exonID;direction of the strand;matched position of the seq;score from biopython;relative pos to the start site (including flanking region);relative pos to the end + flanking region; length of the exon; # with concurrent.futures.ProcessPoolExecutor() as executor: # for each_sf_res, sf in zip(self.list_SF, executor.map(self._parallel_search, self.list_SF)): for sf in self.list_SF: each_sf_res = self._parallel_search(sf, bg_exons_dict, site=site) sf_results[sf].append(each_sf_res) # iterate the results to create a dictionary where keys are SF and values are the results for each exon ### note that each SF might have multiple motifs, in this case the motifs are denoted as SF_1, SF_2 from the same SF. self.motif_background = sf_results gc.collect() return sf_results, n_exons def _df_forvis(self, lossres, gainres, bg=None): ''' MannWhitney U test or Fisher exact test ''' def _checkscore(x): if np.isfinite(x): return x else: return 0 vis_sf = [] ldf = {#'allpos':[], 'logscores':[], #'allscores':[], 'sf_motif':[], 'threshold':[]} for sf,v in lossres[0].items(): for eachmot, exons in v[0].items(): for ex in exons: # signs = np.sign(float(ex.split(";")[2])) # signss = 1 if signs == 0 else signs # signsss = "+ strand" if signss==1 else "- strand" #ldf['allpos'].append(np.abs(float(ex.split(";")[2]))-self.flanking) thisscore=float(ex) ldf['logscores'].append(thisscore) # df['alldir'].append(signsss) ldf['sf_motif'].append(eachmot) ldf['threshold'].append(self.motif_thres[eachmot]) ldf = pd.DataFrame(ldf) ldf['rawtype'] = "lost exons" ldf['type'] = f"LE ({lossres[1]})" vis_sf.append(ldf) gdf = {#'allpos':[], 'logscores':[], #'allscores':[], 'sf_motif':[], 'threshold':[]} #countexons=[] for sf,v in gainres[0].items(): for eachmot, exons in v[0].items(): for ex in exons: #countexons.append(ex) # signs = np.sign(float(ex.split(";")[2])) # signss = 1 if signs == 0 else signs # signsss = "+ strand" if signss==1 else "- strand" #gdf['allpos'].append(np.abs(float(ex.split(";")[2]))-self.flanking) thisscore=float(ex) gdf['logscores'].append(thisscore) #gdf['allscores'].append(10**thisscore) # df['alldir'].append(signsss) gdf['sf_motif'].append(eachmot) gdf['threshold'].append(self.motif_thres[eachmot]) gdf = pd.DataFrame(gdf) gdf['rawtype'] = "gained exons" gdf['type'] = f"GE ({gainres[1]})" vis_sf.append(gdf) if bg: bgdf = {#'allpos':[], 'logscores':[], #'allscores':[], 'sf_motif':[], 'threshold':[]} for sf,v in bg[0].items(): for eachmot, exons in v[0].items(): for ex in exons: #countexons.append(ex.split(";")[0]) # signs = np.sign(float(ex.split(";")[2])) # signss = 1 if signs == 0 else signs # signsss = "+ strand" if signss==1 else "- strand" #bgdf['allpos'].append(np.abs(float(ex.split(";")[2]))-self.flanking) thisscore=float(ex) bgdf['logscores'].append(thisscore) #bgdf['allscores'].append(10**thisscore) # bgdf['alldir'].append(signsss) bgdf['sf_motif'].append(eachmot) bgdf['threshold'].append(self.motif_thres[eachmot]) bgdf = pd.DataFrame(bgdf) bgdf['rawtype'] = "background" bgdf['type'] = f"UE ({bg[1]})" vis_sf.append(bgdf) alldf = pd.concat(vis_sf) alldf = alldf[alldf['threshold'].notnull()] alldf=alldf[(np.isfinite(alldf['logscores']))].reset_index() #alldf['titles'] = alldf['sf_motif']+", "+alldf['alldir'] return alldf def _get_pvals(self, alldf, method="mannwhitney"): alldf = alldf[alldf['logscores']>alldf['threshold']] ### cal pvalue if method=="mannwhitney": pvals =defaultdict(list) #for stra in ["- strand", "+ strand"]: #ddf = df[df['alldir']==stra].reset_index() #bgddf = bgdf[bgdf['alldir']==stra].reset_index() for sf in alldf['sf_motif'].unique(): idx=alldf[(alldf['sf_motif']==sf) & (alldf['rawtype']=="gained exons")].index lidx=alldf[(alldf['sf_motif']==sf) & (alldf['rawtype']=="lost exons")].index bgidx=alldf[(alldf['sf_motif']==sf) & (alldf['rawtype']=="background")].index gexons = [alldf['logscores'][x] for x in idx] lexons = [alldf['logscores'][x] for x in lidx] bgs = [alldf['logscores'][x] for x in bgidx] try: pvals[sf].append(mannwhitneyu(lexons, bgs, alternative="greater")[1]) #[lost, gain] except ValueError: pvals[sf].append(1) try: pvals[sf].append(mannwhitneyu(gexons, bgs, alternative="greater")[1]) #[lost, gain] except ValueError: pvals[sf].append(1) try: pvals[sf].append(mannwhitneyu(lexons, gexons, alternative="greater")[1]) #[lost, gain] except ValueError: pvals[sf].append(1) return pvals def vis_motif_density(self, res=None, style="boxplot", bg=None, method="mannwhitney", pval_cutoff=None, merge_motif=False, alldf=None, pvals=None, palette=None, sf_list=None): if alldf is None and pvals is None: if res is None: raise("Please input result from search_motif() as res") alldf, pvals = self._df_pval_forvis(res, bg=bg, method=method) filtered_alldf = alldf[alldf['logscores']>alldf['threshold']] def print_pvalues(g, i, j, hdiff, pvid): for x,ax in enumerate(g.axes_dict.values()): text=pvals[ax.get_title()][pvid] px = [] ph=[] phmin = [] for p in alldf['type'].unique(): tmp = filtered_alldf[(filtered_alldf['type']==p)&(filtered_alldf['sf_motif']==ax.get_title())]['logscores'] if not np.isnan(tmp.max()): ph.append(tmp.max()) ##max of each group phmin.append(filtered_alldf[(filtered_alldf['type']==p)&(filtered_alldf['sf_motif']==ax.get_title())]['logscores'].min()) else: ph.append(0) phmin.append(0) pxx = (i+j)/2 phh = max(ph[i],ph[j]) props = {'connectionstyle':'bar','arrowstyle':'-', 'shrinkA':20,'shrinkB':20,'linewidth':2} ax.plot([i,i,j,j], [phh+hdiff, phh+hdiff+0.2, phh+hdiff+0.2, phh+hdiff], lw=1.5, c="k") ax.text(s=f"p={str('{:.2e}'.format(text))}", x=pxx-0.3, y=phh+hdiff+0.3) ax.set_ylim((np.min(phmin)-1, phh+5)) if sf_list: filtered_alldf = filtered_alldf[filtered_alldf['sf_motif'].isin(sf_list)] if pval_cutoff: sigs_sf = [] for u,v in pvals.items(): checksig = False for vv in v: if vv<=pval_cutoff: checksig=True if checksig == True: sigs_sf.append(u) sigs_df = filtered_alldf[filtered_alldf['sf_motif'].isin(sigs_sf)] palette1 = defaultdict(str) for e,x in enumerate(sigs_df['type'].unique()): palette1[x] = list(palette.values())[e] if style=="boxplot": plt.figure(figsize=[8,9]) g= sns.catplot(data=sigs_df, x="type", y="logscores", col="sf_motif", col_wrap=5, kind="box",linewidth=3, palette=palette1, sharey=False) g.set_titles("{col_name}") print_pvalues(g, 0, 1, 0.5, 2) #loss vs gain print_pvalues(g, 1, 2, 1, 1) #gain vs bg print_pvalues(g, 0, 2, 2, 0) #loss vs bg # for x,ax in enumerate(g.axes_dict.values()): # thistitle = ax.get_title() # pvvv = pvals[thistitle][3] # ax.set_title(thistitle+"\n"+f"kruskal p-value: {str('{:.2e}'.format(pvvv))}", y=0.95) g.set_axis_labels(x_var="Type", y_var="log(Score)") g.set_xticklabels() else: if style=="boxplot": palette1 = defaultdict(str) for e,x in enumerate(filtered_alldf['type'].unique()): palette1[x] = list(palette.values())[e] plt.figure(figsize=[8,9]) g= sns.catplot(data=filtered_alldf, x="type", y="logscores", col="sf_motif", col_wrap=5, kind="box",linewidth=3, palette=palette1, sharey=False) g.set_titles("{col_name}") print_pvalues(g, 0, 1, 0.5, 2) #loss vs gain print_pvalues(g, 1, 2, 1, 1) #gain vs bg print_pvalues(g, 0, 2, 1.5, 0) #loss vs bg # for x,ax in enumerate(g.axes_dict.values()): # thistitle = ax.get_title() # pvvv = pvals[thistitle][3] # ax.set_title(thistitle+"\n"+f"kruskal p-value: {str('{:.2e}'.format(pvvv))}", y=0.95) g.set_axis_labels(x_var="Type", y_var="log(Score)") g.set_xticklabels()