import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.stats import mannwhitneyu
from scipy.stats import chi2
import warnings
from functools import reduce
from itertools import combinations
import pickle, os, sys
warnings.simplefilter("ignore")
from collections import defaultdict
from ._util_stat.multipletesting import pvalue_correction
from ._feature import featuresObj
from ._shuffle import shuffling
# Disable
def _blockPrint():
sys.stdout = open(os.devnull, 'w')
# Restore
def _enablePrint():
sys.stdout = sys.__stdout__
# Absolute directory of this module; used to locate the bundled reference
# data files (data/network/*.pkl) loaded in iso_function.__init__.
dir_path = os.path.dirname(os.path.realpath(__file__))
# NOTE: dead commented-out draft; the maintained version lives in
# iso_function._iso_switch_between_arrays below.
# def _iso_switch_between_arrays(arr1, arr2, orgarr1, orgarr2):
# ab = np.sign(arr1-arr2)
# ##look for switch points
# points=[[x for x in range(ab.shape[1]) if ab[r,x] != np.roll(ab[r], 1)[x] and x!=0] for r in range(ab.shape[0])]
# #points: switch points of all replicates
# ##check if all r have switch points
# #then take the points where all appeared
# def multiple_intersect(points):
# return np.unique(list(reduce(lambda x, y: set(x).intersection(set(y)), points)))
# #calculate differences
# #[(abs(a[x-1] - a[x]) + abs(b[x-1] -b[x]))/2 for x in points], a,b
# if any(points):
# final_sp = multiple_intersect(points)
# if not any(final_sp):
# final_sp = [list(set(x[0]).intersection(set(x[1]))) for x in combinations(points,2)]
# final_sp = np.unique(reduce(lambda x,y: x+y, final_sp))
# if not any(final_sp):
# final_sp = reduce(lambda x,y : x+y, points)
# final_sp = np.sort(final_sp)
# # mean of max difference at the time points between 2 arrays for all replicates
# allsp_diff = [np.mean([np.max([abs(arr1[r,x-1] - arr2[r,x-1]), abs(arr2[r,x] -arr1[r,x])]) for r in range(arr1.shape[0])]) for x in final_sp]
# else:
# final_sp = []
# allsp_diff=[]
# #calculate corr
# cors = [(1-np.corrcoef(orgarr1[r], orgarr2[r])[0,1])/2 for r in range(arr1.shape[0])]
# #for each switch point, the max switch diff before and after
# #allsp : all the before and after switch differences for all switch points
# return len(final_sp)/arr1.shape[1], allsp_diff, final_sp, np.mean(cors)
class basic_pvalue():
    """Permutation-based empirical p-value calculation.

    Shuffles the dataset ``n_permutations`` times, recomputes the fitness
    features each time, compares them against the original features to derive
    per-object empirical p-values, and combines them across feature scores
    with Fisher's method.
    """
    def __init__(self, testobject, object_type, iso_switch = True, shuffle=None, fitness_scores = None, fitness_scores_two_sided = False, n_permutations=1000):
        self.testobject = testobject  ##DataSet
        self.object_type = object_type  #connectivity, clusters, iso_pairs
        self.shuffle = shuffle
        self.fitness_scores = fitness_scores
        self.fitness_scores_two_sided = fitness_scores_two_sided
        self.n_permutations = n_permutations
        self.iso_switch = iso_switch

    def initialize_features_for_permutation(self, features, shuffled):
        # Hook for subclasses; intentionally a no-op here.
        pass

    def calculate_original_object_fitness(self):
        # Hook for subclasses; intentionally a no-op here.
        pass

    def progressbar(self, it, prefix="", size=60, file=sys.stdout):
        """Yield the items of ``it`` while drawing a textual progress bar."""
        count = len(it)
        def show(j):
            x = int(size*j/count)
            file.write("%s[%s%s] %i/%i\r" % (prefix, "#"*x, "."*int((size-x)), j, count))
            file.flush()
        show(0)
        for i, item in enumerate(it):
            yield item
            show(i+1)
        file.write("\n")
        file.flush()

    def empirical_pvalue(self, total_observation, better_observation):
        """Empirical p-value with add-one (Laplace) smoothing.

        Returns 1.0 when no permutations were observed: the original code
        implicitly returned None in that case, which would crash the
        downstream log-based Fisher combination.
        """
        if total_observation > 0:
            return (1 + better_observation) / (1 + total_observation)
        return 1.0

    def combine_pvals(self, pvals):
        """Combine p-values with Fisher's method (chi-square, 2k df)."""
        s = -2 * np.sum(np.log(pvals))
        return chi2.sf(s, 2 * len(pvals))

    def do_calculate(self):
        """Run the permutation test; return one combined p-value per object."""
        if self.object_type == "clusters":
            n_object = self.testobject.clusterobj._final_n_cluster
            timeserieslist = self.testobject.timeserieslist
            #get features of clusters
            print("Calculating features.")
            features_original = featuresObj(self.testobject, timeserieslist, feature_type="clusters")
            #get fitness of clusters
            print("Calculating fitness score.")
            original_fitness_score = features_original.feature_store
            #data structure for storing permuted features #[features][permutation]
            permuted_fitness = np.zeros((original_fitness_score.shape[0], n_object))
        if self.object_type == "iso_pairs":
            n_object = len(self.testobject.isoobj.isopairs['major_transcript'])
            timeserieslist = self.testobject.timeserieslist
            features_original = featuresObj(self.testobject, timeserieslist, feature_type="iso_pairs")
            original_fitness_score = features_original.feature_store
            permuted_fitness = np.zeros((original_fitness_score.shape[0], n_object))
        # Counters comparing shuffled vs. original fitness per [feature][object].
        batch_smaller = np.zeros((original_fitness_score.shape[0], n_object))
        batch_larger = np.zeros((original_fitness_score.shape[0], n_object))
        batch_equal = np.zeros((original_fitness_score.shape[0], n_object))
        batch_comp = np.zeros((original_fitness_score.shape[0], n_object))
        for p in self.progressbar(it=range(self.n_permutations), prefix="Permutation:", size=self.n_permutations/10):
            _blockPrint()
            # NOTE(review): both objects below are constructed for their
            # side effects only; their results are never read here —
            # confirm the constructors are actually needed per permutation.
            shuffleObj = shuffling(timeserieslist=timeserieslist)
            iso = iso_function(self.testobject)
            if self.object_type == "clusters":
                permuted_features = featuresObj(testobject=self.testobject, timeserieslist=timeserieslist, feature_type="clusters", seed=p)
                permuted_fitness = permuted_features.feature_store
            elif self.object_type == "iso_pairs":
                permuted_features = featuresObj(testobject=self.testobject, timeserieslist=timeserieslist, feature_type="shuffle_iso_pairs", seed=p)
                permuted_fitness = permuted_features.feature_store
            for fs in range(original_fitness_score.shape[0]):
                fitness = original_fitness_score[fs]
                shuffledfitness = permuted_fitness[fs]
                #comparison of shuffled vs. original fitness per object
                for o in range(len(fitness)):
                    compare = shuffledfitness[o] - fitness[o]
                    if compare < 0:
                        batch_smaller[fs][o] += 1
                    elif compare > 0:
                        batch_larger[fs][o] += 1
                    else:
                        batch_equal[fs][o] += 1
                    batch_comp[fs][o] += 1
            _enablePrint()
        #end permutation
        print("Done permutation")
        pvals = np.zeros((original_fitness_score.shape[0], n_object))
        for fs in range(original_fitness_score.shape[0]):
            for o in range(n_object):
                total_observation = batch_comp[fs][o]
                if self.fitness_scores_two_sided is True:
                    better_observation = min(batch_larger[fs][o], batch_smaller[fs][o])*2 + batch_equal[fs][o]
                else:
                    better_observation = batch_equal[fs][o] + batch_larger[fs][o]
                pvals[fs][o] = self.empirical_pvalue(total_observation, better_observation)
        ##combine p-values across feature scores for each object
        result_pvalue = np.zeros(n_object)
        for o in range(n_object):
            result_pvalue[o] = self.combine_pvals(pvals[:, o])
        return result_pvalue
class iso_function():
    """
    Isoform level analysis.

    Parameters
    ----------
    dataset :
        DataSet object; its ``species`` attribute selects the bundled Pfam
        and exon reference files to load.

    Notes
    -----
    On construction the time series of each gene's isoforms are grouped into
    a (replicates, isoforms, timepoints) array and normalized to
    per-timepoint relative abundance (stored in ``self.normdf``).
    See ``detect_isoform_switch`` and ``total_isoform_usage`` for the main
    analysis entry points.
    """
    def __init__(self, dataset):
        self.dataset = dataset
        self.species = dataset.species
        # The original code branched on whether transcript IDs are versioned
        # ("." in the ID), but both branches opened identical files, so the
        # branch is collapsed. Files are closed via context managers (they
        # were previously left open).
        with open(os.path.join(dir_path, f"data/network/{dataset.species}_ver_trans_pfam.pkl"), 'rb') as infile:
            self.trans_pfam_dict = pickle.load(infile)
        with open(os.path.join(dir_path, f"data/network/exons_{dataset.species}_ensembl.pkl"), 'rb') as exfile:
            self.trans_exon_dict = pickle.load(exfile)
        # Group isoform time series by gene; 'n' marks unmapped entries.
        groupdict = defaultdict(lambda: defaultdict(list))
        for idx, sym in enumerate(self.dataset.gene_id):
            if sym != 'n':
                groupdict[sym]['trans_id'].append(self.dataset.transcript_id[idx])
                groupdict[sym]['indices'].append(idx)
                groupdict[sym]['array'].append(self.dataset.timeserieslist[:, idx, :])
        # Stack each gene's isoforms into (replicates, isoforms, timepoints)
        # and normalize to relative abundance per timepoint.
        for sym, df in groupdict.items():
            org = df['array'][0]
            for x in range(1, len(df['array'])):
                org = np.hstack([org, df['array'][x]])
            df['array'] = org.reshape(self.dataset.reps1, len(df['array']), self.dataset.timepts)
            tmp = np.array([df['array'][x]/np.sum(df['array'][x], axis=0) for x in range(df['array'].shape[0])])
            # Timepoints where the gene total is 0 yield NaN; set them to 1.
            tmp[np.isnan(tmp)] = 1
            groupdict[sym]['normarr'] = tmp
        self.normdf = groupdict
def _take_interval(self, sp, timepts):
    """For each switch point, the distance to its nearest neighbouring
    boundary (previous switch point or 0 / next switch point or ``timepts``).

    NOTE(review): this overload is shadowed by the later
    ``_take_interval(self, sp)`` definition in this class and is therefore
    dead code.
    """
    newsp = np.insert(sp,0,0)
    newsp = np.append(newsp, timepts)
    intervals = []
    for i in range(len(sp)):
        intervals.append(np.min([newsp[i+1]-newsp[i], newsp[i+2]-newsp[i+1]]))
    return intervals
def _iso_switch_between_arrays(self, arr1, arr2, orgarr1, orgarr2):
    """Find isoform-switch points between two relative-abundance series.

    Parameters
    ----------
    arr1, arr2 : np.ndarray, shape (replicates, timepoints)
        Normalized relative abundances of the two isoforms.
    orgarr1, orgarr2 : np.ndarray
        Raw expression series, used only for the correlation score.

    Returns
    -------
    tuple
        (switch ratio, mean before/after differences per switch point,
        switch points, mean anti-correlation score).
    """
    # Sign of the pairwise difference; a sign change between consecutive
    # timepoints marks a candidate switch point.
    ab = np.sign(arr1-arr2)
    if self.dataset.reps1 ==1:
        # Single replicate: take every sign flip (index 0 excluded because
        # np.roll wraps the last value around to the front).
        points = [x for x in range(1, ab.shape[1]) if (ab[0][x] != np.roll(ab[0], 1)[x])]
        final_sp=points
        allsp_diff =[np.mean([abs(arr1[:,x-1] - arr2[:,x-1]), abs(arr2[:,x] -arr1[:,x])]) for x in final_sp]
        cors = (1-np.corrcoef(orgarr1, orgarr2)[0,1])/2
        # NOTE(review): normalizes by arr1.shape[0] (replicate count == 1)
        # whereas the multi-replicate branch divides by arr1.shape[1]
        # (timepoints) — confirm this asymmetry is intentional.
        iso_ratio=len(final_sp)/arr1.shape[0]
    else:
        ##look for switch points in every replicate
        points=[[x for x in range(ab.shape[1]) if ab[r,x] != np.roll(ab[r], 1)[x] and x!=0] for r in range(ab.shape[0])]
        #points: switch points of all replicates
        ##check if all replicates have switch points,
        #then take the points where all appeared
        def multiple_intersect(points):
            return np.unique(list(reduce(lambda x, y: set(x).intersection(set(y)), points)))
        # required number of agreeing replicates (60%, rounded)
        rep_agree = round(self.dataset.reps1*0.6)
        if any(points):
            final_sp = multiple_intersect(points)
            if not any(final_sp):
                # NOTE(review): only the first two members of each
                # rep_agree-sized combination are intersected below; this
                # is a true "rep_agree replicates agree" test only when
                # rep_agree == 2 — verify for higher replicate counts.
                final_sp = [list(set(x[0]).intersection(set(x[1]))) for x in combinations(points,rep_agree)]
                final_sp = np.unique(reduce(lambda x,y: x+y, final_sp))
                if not any(final_sp):
                    final_sp = []
            final_sp = np.sort(final_sp)
            # mean of max difference at the time points between 2 arrays for all replicates
            allsp_diff = [np.mean([np.max([abs(arr1[r,x-1] - arr2[r,x-1]), abs(arr2[r,x] -arr1[r,x])]) for r in range(arr1.shape[0])]) for x in final_sp]
        else:
            final_sp = []
            allsp_diff=[]
        #calculate correlation per replicate (0 = identical, 1 = anti-correlated)
        cors = [(1-np.corrcoef(orgarr1[r], orgarr2[r])[0,1])/2 for r in range(arr1.shape[0])]
        iso_ratio = len(final_sp)/arr1.shape[1]
    #for each switch point, the max switch diff before and after
    #allsp_diff : all the before/after switch differences for all switch points
    return iso_ratio, allsp_diff, final_sp, np.mean(cors)
def _event_enrichness(self, normdf, sp, arr1, arr2):
##penalize low expressed genes
penal1 = np.mean([arr1[r]/np.max(normdf[r],axis=0) for r in range(normdf.shape[0])],axis=0)
penal2 = np.mean([arr2[r]/np.max(normdf[r],axis=0) for r in range(normdf.shape[0])],axis=0)
return np.mean(np.append(penal1[sp-1:sp+1],penal2[sp-1:sp+1]))
def _remove_low_expressed(self):
#return isoform indices
timedf = pd.concat([pd.DataFrame(self.timeserieslist[x]) for x in range(self.timeserieslist.shape[0])], axis=1)
geneexp = pd.concat([pd.Series(self.symbs, name='symbol'), timedf],axis=1)
#filter out those higher than 2 by default (can change)
lowgenes_idx = np.where(geneexp.groupby('symbol').agg('sum').mean(axis=1) < 2)[0]
#get the gene names lower than cutoff
lowgenes = geneexp.iloc[lowgenes_idx, 0]
return lowgenes
def _remove_low_abundance(self, pre_normdf):
#return isoform indices
#calculate fold change of major isoform with respect to the minor isoforms
lowexp = np.mean(np.max(pre_normdf, axis=2),axis=0)
#filter out those higher than 2 by default (can change)
lowgenes = np.where(lowexp<=0.1)[0]
filtered_normdf = np.delete(pre_normdf, lowgenes, axis=1)
return filtered_normdf, lowgenes
def _combine_pvals(self, pvals):
##no used
chisqprob = lambda chisq, df: chi2.sf(chisq, df)
s = -2 * np.sum(np.log(pvals))
return chisqprob(s, 2 * len(pvals))
def _take_interval(self, sp):
#check interval between switch points
newsp = np.insert(sp,0,0)
newsp = np.append(newsp, self.dataset.timepts)
intervals = []
for i in range(len(sp)):
intervals.append(np.min([newsp[i+1]-newsp[i], newsp[i+2]-newsp[i+1]]))
return intervals
def _diff_before_after_switch(self, normdf, arr1, arr2, allsp_diff, switch_points):
    """Score candidate switch points and pick the best one.

    With multiple replicates, the best point minimizes the Mann-Whitney
    p-value of either isoform's before/after segments while maximizing the
    event importance. With a single replicate, the point with the largest
    abundance difference is taken and p-values default to 1.

    Returns
    -------
    tuple
        (difference, isoform-1 p-value, isoform-2 p-value,
        best switch point, event importance, switch probability).
    """
    # Pad the switch points with the series boundaries (0 and timepts).
    tmp_sw = np.insert(switch_points,0,0)
    tmp_sw = np.append(tmp_sw, self.dataset.timepts)
    if self.dataset.reps1>1:
        #get the intervals between time points
        if any(switch_points):
            #the best sp has the highest mean differences before and after switch points between the two time series
            bestpval1=0
            bestpval2=0
            allbestp=1
            finaldiff = 0
            finalenrich = 0
            best_switch_point=0
            this_prob = 0
            thispt=0
            for bp in range(1, len(tmp_sw)-1):
                # Mann-Whitney U between the segments before and after this
                # candidate point, separately for each isoform.
                iso1_pval = mannwhitneyu(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0], arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0])
                iso2_pval = mannwhitneyu(arr2[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0], arr2[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0])
                thisdiff = np.mean([abs(np.mean(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]]) - np.mean(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]])), abs(np.mean(arr2[:,tmp_sw[bp-1]:tmp_sw[bp]]) - np.mean(arr2[:,tmp_sw[bp]:tmp_sw[bp+1]]))])
                thisenrich = self._event_enrichness(normdf, tmp_sw[bp], arr1, arr2)
                # Fraction of observations where isoform 1 dominates before
                # the point plus the fraction where isoform 2 dominates after.
                iso_prob = ((np.sum(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]>arr2[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]))/arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0].shape[0]) + ((np.sum(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]<arr2[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]))/(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0].shape[0]))
                thispt=tmp_sw[bp]
                if (iso1_pval[1] < allbestp) & (thisenrich > finalenrich):
                    bestpval1, bestpval2=iso1_pval[1], iso2_pval[1]
                    allbestp=iso1_pval[1]
                    finaldiff = thisdiff
                    finalenrich = thisenrich
                    best_switch_point=tmp_sw[bp]
                    this_prob = iso_prob/2
                elif (iso2_pval[1] < allbestp) & (thisenrich > finalenrich):
                    bestpval1, bestpval2=iso1_pval[1], iso2_pval[1]
                    allbestp=iso2_pval[1]
                    finaldiff = thisdiff
                    finalenrich = thisenrich
                    best_switch_point=tmp_sw[bp]
                    this_prob = iso_prob/2
                else:
                    continue
            # NOTE(review): if no candidate ever wins, this returns the
            # initial bestpval1/bestpval2 of 0 (not 1) — confirm intended.
            return finaldiff, bestpval1, bestpval2, best_switch_point, finalenrich, this_prob
        else:
            return 0, 1, 1, 0, 0, 0
    else:
        #the best sp has the highest mean differences before and after switch points between the two time series
        if any(allsp_diff):
            bp = np.argmax(allsp_diff)
            # NOTE(review): ``bp`` indexes allsp_diff but is also used as an
            # index into tmp_sw and as the enrichness timepoint — confirm
            # this indexing is intentional.
            finalenrich = self._event_enrichness(normdf, bp, arr1, arr2)
            iso_prob = ((np.sum(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]>arr2[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]))/arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0].shape[0]) + ((np.sum(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]<arr2[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]))/(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0].shape[0]))
            return allsp_diff[bp], 1, 1, switch_points[bp], finalenrich, iso_prob
        else:
            return 0,1,1,0,0,0
def _isoform_differential_usage(self, thisnormdf, thisexp):
    """Pairwise switch analysis between all isoforms of one gene.

    Parameters
    ----------
    thisnormdf : np.ndarray, shape (replicates, isoforms, timepoints)
        Relative-abundance time series of the gene's isoforms.
    thisexp : np.ndarray, same shape
        Raw expression series (used for the correlation score).

    Returns
    -------
    tuple of per-pair matrices/lists:
        (switch ratio, isoform-1 p-values, isoform-2 p-values, differences,
        best switch points, correlations, event importances,
        all switch points, switch probabilities).
    """
    # One (isoforms x isoforms) matrix per statistic.
    iso_ratio = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    iso_diff_value = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    maj_pval = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    min_pval = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    best_switch_point = [[] for _ in range(thisnormdf.shape[1])]
    corr_list = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    enrichness = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    iso_prob = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    all_switch_point = [[] for _ in range(thisnormdf.shape[1])]
    #check for all pairs
    for maj in range(thisnormdf.shape[1]):
        orgarr1 = thisexp[:,maj,:]
        arr1 = thisnormdf[:,maj,:]
        for x in range(thisnormdf.shape[1]): #each minor transcript
            if x == maj:
                # Self-pair: fill neutral values and skip.
                iso_ratio[maj, x], iso_diff_value[maj, x] = 0, 0
                best_switch_point[maj].append(0)
                all_switch_point[maj].append(0)
                corr_list[maj, x] = 0
                continue
            arr2 = thisnormdf[:,x,:]
            orgarr2 = thisexp[:,x,:]
            ##finding switch points, correlation
            iso_ratio[maj, x], allsp, final_sp, corr_list[maj, x] = self._iso_switch_between_arrays(arr1, arr2, orgarr1, orgarr2)
            ##calculate diff. value, p-values
            iso_diff_value[maj, x], maj_pval[maj, x], min_pval[maj, x], bs, enrichness[maj,x], iso_prob[maj, x] = self._diff_before_after_switch(thisnormdf, arr1, arr2, allsp, final_sp)
            best_switch_point[maj].append(bs)
            all_switch_point[maj].append(final_sp)
    return iso_ratio, maj_pval, min_pval, iso_diff_value, best_switch_point, corr_list, enrichness, all_switch_point, iso_prob
def _correlation_coef(self, thisnormdf, maj):
m_normdf = np.median(thisnormdf, axis=0)
corr_mat = (1-np.corrcoef(m_normdf))/2
return corr_mat
def _exclusive_domain(self, iso1, iso2):
if iso1 in self.trans_pfam_dict.keys():
pfams1 = self.trans_pfam_dict[iso1]
else:
pfams1 = []
if iso2 in self.trans_pfam_dict.keys():
pfams2 = self.trans_pfam_dict[iso2]
else:
pfams2 = []
diff = set(pfams1).symmetric_difference(set(pfams2))
diff = [x for x in list(diff) if not type(x)==float]
return diff
def _diff_exons(self, iso1, iso2, final_sp, isoid):
if iso1 in self.trans_exon_dict.keys():
exons1 = self.trans_exon_dict[iso1]
else:
exons1 = []
if iso2 in self.trans_exon_dict.keys():
exons2 = self.trans_exon_dict[iso2]
else:
exons2 = []
##major or minor isoforms
##define isoform_from
iso1_id = np.where(np.isin(self.normdf[isoid]['trans_id'], iso1))[0][0]
iso2_id = np.where(np.isin(self.normdf[isoid]['trans_id'], iso2))[0][0]
iso1_values = np.median(self.normdf[isoid]['array'][:,iso1_id, final_sp-1])
iso2_values = np.median(self.normdf[isoid]['array'][:,iso2_id, final_sp-1])
if iso1_values > iso2_values:
majoriso, minoriso = iso1, iso2
majoriso_exons, minoriso_exons = exons1, exons2
else:
majoriso, minoriso = iso2, iso1
majoriso_exons, minoriso_exons = exons2, exons1
gain_exons = set(majoriso_exons).difference(set(minoriso_exons))
loss_exons = set(minoriso_exons).difference(set(majoriso_exons))
nondiff = set(exons1).intersection(set(exons2))
gain_exons = [x for x in list(gain_exons) if not type(x)==float]
loss_exons = [x for x in list(loss_exons) if not type(x)==float]
nondiff = [x for x in list(nondiff) if not type(x)==float]
return majoriso, minoriso, gain_exons, loss_exons, nondiff
def detect_isoform_switch(self, filtering = True, prob_cutoff=0.5, corr_cutoff=0.5, event_im_cutoff = 0.1, min_diff = 0.05, p_val_cutoff = 0.05, n_permutations=100, adjustp='fdr_bh', diff_domain=True):
    """
    Detect isoform switching events along time series.

    Parameters
    ----------
    filtering : bool, default=True
        If True, low-expression genes are counted and reported (they are
        not removed from the analysis here — only the count is printed).
    prob_cutoff : float, default=0.5
        Minimum switch probability kept in the result.
    corr_cutoff : float, default=0.5
        Minimum anti-correlation score of the isoform pair.
    event_im_cutoff : float, default=0.1
        Minimum event importance kept.
    min_diff : float, default=0.05
        Minimum abundance difference before/after the switch.
    p_val_cutoff : float, default=0.05
        Maximum adjusted p-value kept.
    n_permutations : int, default=100
        Number of permutations (used only for single-replicate data).
    adjustp : {'fdr_bh', 'holm_bonf', 'bonf'}
        Multiple-testing correction method.
    diff_domain : bool, default=True
        If True, also report Pfam domains exclusive to one isoform.

    Return
    ------
    DataFrame
    """
    #calculate probability and correlation for switching
    self.min_diff = min_diff
    self.corr_cutoff = corr_cutoff
    self.event_im_cutoff = event_im_cutoff
    self.timeserieslist = self.dataset.timeserieslist
    self.transcript_id = self.dataset.transcript_id
    self.symbs = self.dataset.symbs
    self.gene_id = self.dataset.gene_id
    if filtering==True:
        lowgenes = self._remove_low_expressed()
    else:
        lowgenes=set()
    print(str(len(set(lowgenes))) + " of genes with low expression are filtered out!!.")
    print(f"calculating for {len(self.normdf.keys())} genes...")
    iso_pairs_id = defaultdict(list)
    iso_dict_info = defaultdict(tuple)
    for gene, values in self.normdf.items():
        idxs = values['indices']
        thisnormdf = values['normarr']
        thisexp = values['array']
        #pairwise switch statistics for all isoforms of this gene
        res_maj = self._isoform_differential_usage(thisnormdf, thisexp)
        #return major transcript and transcript that switch
        if self.dataset.reps1 > 1:
            for ids1, iso_ratio0 in enumerate(res_maj[0]):
                for ids2, iso_ratio in enumerate(iso_ratio0):
                    if iso_ratio >0 :
                        majoriso, minoriso, gain_exons, loss_exons, nondiff = self._diff_exons(self.transcript_id[idxs[ids1]], self.transcript_id[idxs[ids2]], res_maj[4][ids1][ids2], str(gene))
                        iso_pairs_id['gene'].append(str(gene))
                        iso_pairs_id['gene_symb'].append(self.symbs[idxs[ids1]])
                        iso_pairs_id['major_transcript'].append(majoriso)
                        iso_pairs_id['minor_transcript'].append(minoriso)
                        iso_pairs_id['switch_prob'].append(res_maj[8][ids1][ids2])
                        iso_pairs_id['diff'].append(res_maj[3][ids1, ids2])
                        iso_pairs_id['maj_pval'].append(res_maj[1][ids1, ids2])
                        iso_pairs_id['min_pval'].append(res_maj[2][ids1, ids2])
                        # Fisher-combine the two per-isoform p-values.
                        iso_pairs_id['p_value'].append(self._combine_pvals([res_maj[1][ids1,ids2], res_maj[2][ids1, ids2]]))
                        iso_pairs_id['corr'].append(res_maj[5][ids1, ids2])
                        iso_pairs_id['best_switch_point'].append(res_maj[4][ids1][ids2])
                        iso_pairs_id['event_importance'].append(res_maj[6][ids1][ids2])
                        iso_pairs_id['gain_exons'].append(gain_exons)
                        iso_pairs_id['loss_exons'].append(loss_exons)
                        iso_pairs_id['constant_exons'].append(nondiff)
                        iso_pairs_id['all_switch_points'].append(res_maj[7][ids1][ids2])
                        iso_dict_info[self.transcript_id[idxs[ids1]]] = (gene, ids1)
                        iso_dict_info[self.transcript_id[idxs[ids2]]] = (gene, ids2)
                        if diff_domain == True:
                            iso_pairs_id['exclusive_domains'].append(self._exclusive_domain(self.transcript_id[idxs[ids1]],self.transcript_id[idxs[ids2]]))
        else:
            #single replicate: p-values are computed by permutation below
            for ids1, iso_ratio0 in enumerate(res_maj[0]):
                for ids2, iso_ratio in enumerate(iso_ratio0):
                    if iso_ratio >0 :
                        majoriso, minoriso, gain_exons, loss_exons, nondiff = self._diff_exons(self.transcript_id[idxs[ids1]], self.transcript_id[idxs[ids2]], res_maj[4][ids1][ids2], str(gene))
                        iso_pairs_id['gene'].append(str(gene))
                        iso_pairs_id['gene_symb'].append(self.symbs[idxs[ids1]])
                        iso_pairs_id['major_transcript'].append(majoriso)
                        iso_pairs_id['minor_transcript'].append(minoriso)
                        iso_pairs_id['switch_prob'].append(res_maj[8][ids1][ids2])
                        iso_pairs_id['diff'].append(res_maj[3][ids1, ids2])
                        iso_pairs_id['corr'].append(res_maj[5][ids1, ids2])
                        iso_pairs_id['best_switch_point'].append(res_maj[4][ids1][ids2])
                        iso_pairs_id['event_importance'].append(res_maj[6][ids1][ids2])
                        iso_pairs_id['gain_exons'].append(gain_exons)
                        iso_pairs_id['loss_exons'].append(loss_exons)
                        iso_pairs_id['constant_exons'].append(nondiff)
                        iso_pairs_id['all_switch_points'].append(res_maj[7][ids1][ids2])
                        iso_dict_info[self.transcript_id[idxs[ids1]]] = (gene, ids1)
                        iso_dict_info[self.transcript_id[idxs[ids2]]] = (gene, ids2)
                        if diff_domain == True:
                            iso_pairs_id['exclusive_domains'].append(self._exclusive_domain(self.transcript_id[idxs[ids1]],self.transcript_id[idxs[ids2]]))
    self.isopairs = iso_pairs_id
    self.iso_dict_info = iso_dict_info
    self.dataset.isoobj = self
    if self.dataset.reps1 == 1:
        # Permutation test provides p-values for single-replicate data.
        cal = basic_pvalue(testobject=self.dataset, object_type="iso_pairs", n_permutations = n_permutations)
        pval = cal.do_calculate() #self iso_function object
        iso_pairs_id['p_value'] = pval
        self.isopairs = iso_pairs_id
    res = pd.DataFrame(iso_pairs_id).sort_values('p_value')
    mct = pvalue_correction(res['p_value'], method=adjustp)
    res['adj_pval'] = mct.corrected_pvals
    #apply all cutoffs, then keep one row per isoform pair
    res = res[(res['switch_prob']>prob_cutoff) & (res['diff']>min_diff) & (res['adj_pval']<p_val_cutoff) & (res['corr']>corr_cutoff) & (res['event_importance']>event_im_cutoff)]
    res = res.sort_values("switch_prob", ascending=False).drop_duplicates(['major_transcript', 'minor_transcript'])
    print("----Result statistics----")
    print(f"Total genes with IS genes: {res['gene'].unique().shape[0]}")
    print(f"Events found: {res.shape[0]}")
    print(f"Events with affecting domains: {np.sum([1 for x in res[ 'exclusive_domains'] if len(x)>0])}")
    print(f"Mean event importance: {res['event_importance'].mean()}")
    print(f"Mean difference before and after switch: {res['diff'].mean()}")
    print("--------------------------")
    print("DONE")
    self.is_result = res
    self.dataset.isoobj = self
    return res.reset_index(drop=True)[['gene', 'gene_symb', 'major_transcript', 'minor_transcript', 'switch_prob', 'corr', 'diff', 'event_importance', 'exclusive_domains', 'p_value', 'adj_pval']]
def _isoform_usage(self, eachgene, norm):
#eachgene 3D array [replicates, isoforms, timepoints]
result = np.zeros((eachgene.shape[0], eachgene.shape[1], eachgene.shape[2]-1))
if norm:
#calculate the size factors
sizefactor = np.sum(eachgene, axis=1)
sizefactor[sizefactor==0] = 1
#normalize to transcript abundance for each replicates
normdf = np.array([eachgene[x,:,:] / sizefactor[x] for x in range(eachgene.shape[0])])
else:
normdf = eachgene
#subtract usage from next time point
for i in range(eachgene.shape[2]-1):
result[:,:,i] = np.abs(normdf[:,:,i]-normdf[:,:,i+1])
#sum up and take median for one gene across all replicates
result = np.median(np.sum(result, axis=1),axis=0)
return result
def total_isoform_usage(self, isd_result=None, norm=True, gene_level=True):
    '''
    Calculate total isoform usage. Only genes with more than 1 isoform are
    computed; results are stored on the DataSet as ``tiu_*`` attributes.

    Parameters
    ----------
    isd_result
        the result dataframe of isoform switch detection; when given, only
        its genes are kept, otherwise all genes are used.
    norm : `boolean`, default=True
        if True, it normalizes time series matrix to relative abundance to gene expression.
    gene_level : `boolean`, default=True
        if True, it calculates total isoform usage for each gene, otherwise individual isoform usage for each isoform

    Return
    -------
    New DataSet Object (the same DataSet instance, annotated in place)
    '''
    isodf = pd.DataFrame({'symbol':self.dataset.gene_id, 'isoform_id':self.dataset.transcript_id})
    # gene -> positional indices of its isoforms
    groupdict = isodf.groupby(['symbol']).indices
    symbolmap = dict(zip(self.dataset.gene_id, self.dataset.symbs))
    tp = self.dataset.timepts
    result_df = pd.DataFrame(columns=['entrez','gene'] + ['tp{}'.format(x) for x in range(self.dataset.timepts-1)])
    if isd_result is not None:
        keepgenes = isd_result['gene'].unique().tolist()
    else:
        keepgenes = np.unique(self.dataset.gene_id)
    if gene_level is True:
        for gene, iso in groupdict.items():
            if len(iso) > 1 and gene in set(keepgenes):
                eachgene = np.array(self.dataset.timeserieslist[:,iso,:], dtype="double")
                #calculate isoform usage for each gene
                res = self._isoform_usage(eachgene, norm)
                result_df.loc[result_df.shape[0]] = [gene, symbolmap[gene]] + res.tolist()
        #make changes in DataSet and return new dataset
        self.dataset.tiu_ts = np.array(result_df.iloc[:,2:])
        self.dataset.tiu_timeserieslist = np.expand_dims(np.array(result_df.iloc[:,2:]), axis=0)
        self.dataset.tiu_gene_id = result_df['entrez']
        self.dataset.tiu_symbs = result_df['gene']
        self.dataset.tiu_timepts = tp-1
    elif isd_result is not None and gene_level is False:
        keepgenes = isd_result['gene'].unique().tolist()
        # per-isoform usage: stack the normalized arrays of the kept genes
        keep_genes_arrays = np.hstack([self.normdf[x]['normarr'] for x in keepgenes])
        result = np.zeros((keep_genes_arrays.shape[0], keep_genes_arrays.shape[1], keep_genes_arrays.shape[2]-1))
        keep_trans_indices = reduce(lambda x,y: x+y, [self.normdf[x]['indices'] for x in keepgenes])
        result_trans = self.dataset.transcript_id[keep_trans_indices]
        result_entrez = self.dataset.gene_id[keep_trans_indices]
        result_sym = self.dataset.symbs[keep_trans_indices]
        # absolute change between consecutive timepoints per isoform
        for i in range(keep_genes_arrays.shape[2]-1):
            result[:,:,i] = np.abs(keep_genes_arrays[:,:,i]-keep_genes_arrays[:,:,i+1])
        self.dataset.tiu_ts = result
        self.dataset.tiu_timeserieslist = result
        self.dataset.tiu_gene_id = result_entrez
        self.dataset.tiu_symbs = result_sym
        self.dataset.tiu_transcript_id = result_trans
        self.dataset.tiu_timepts = tp-1
    return self.dataset
def isoform_abundance(self, is_result=None):
    """
    Get isoform relative abundance based on isoform switched genes.

    Parameters
    ----------
    is_result : DataFrame, optional
        Result of isoform switch detection. If given, only its genes are
        kept; otherwise all genes of the dataset are used.

    Return
    -------
    A copy of the DataSet restricted to the kept isoforms, carrying the
    relative-abundance time series.
    """
    # Bug fix: the original tested ``if is_result is None`` and then
    # subscripted is_result — an unconditional TypeError for the default
    # call. The branches were inverted.
    if is_result is not None:
        keepgenes = is_result['gene'].unique().tolist()
    else:
        # Unique gene IDs, consistent with total_isoform_usage(); the raw
        # per-isoform list would stack each gene's array once per isoform.
        keepgenes = np.unique(self.dataset.gene_id)
    result = np.hstack([self.normdf[x]['normarr'] for x in keepgenes])
    keep_trans_indices = reduce(lambda x, y: x+y, [self.normdf[x]['indices'] for x in keepgenes])
    result_trans = self.dataset.transcript_id[keep_trans_indices]
    result_entrez = self.dataset.gene_id[keep_trans_indices]
    tiuds = self.dataset.__copy__()
    tiuds.ts[0] = result
    tiuds.timeserieslist = result
    tiuds.gene_id = result_entrez
    tiuds.transcript_id = result_trans
    tiuds.timepts = self.dataset.timepts
    tiuds.reps1 = self.dataset.reps1
    return tiuds