import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.stats import mannwhitneyu
from scipy.stats import chi2
import warnings
from functools import reduce
from itertools import combinations
import pickle, os, sys
warnings.simplefilter("ignore")
from collections import defaultdict
from ._util_stat.multipletesting import pvalue_correction
from ._feature import featuresObj
from ._shuffle import shuffling
# Disable
def _blockPrint():
sys.stdout = open(os.devnull, 'w')
# Restore
def _enablePrint():
sys.stdout = sys.__stdout__
# Absolute directory of this module; used to locate the bundled reference
# data files (data/network/*.pkl) loaded in iso_function.__init__.
dir_path = os.path.dirname(os.path.realpath(__file__))
# NOTE: dead commented-out draft; the maintained version lives in
# iso_function._iso_switch_between_arrays below.
# def _iso_switch_between_arrays(arr1, arr2, orgarr1, orgarr2):
# ab = np.sign(arr1-arr2)
# ##look for switch points
# points=[[x for x in range(ab.shape[1]) if ab[r,x] != np.roll(ab[r], 1)[x] and x!=0] for r in range(ab.shape[0])]
# #points: switch points of all replicates
# ##check if all r have switch points
# #then take the points where all appeared
# def multiple_intersect(points):
# return np.unique(list(reduce(lambda x, y: set(x).intersection(set(y)), points)))
# #calculate differences
# #[(abs(a[x-1] - a[x]) + abs(b[x-1] -b[x]))/2 for x in points], a,b
# if any(points):
# final_sp = multiple_intersect(points)
# if not any(final_sp):
# final_sp = [list(set(x[0]).intersection(set(x[1]))) for x in combinations(points,2)]
# final_sp = np.unique(reduce(lambda x,y: x+y, final_sp))
# if not any(final_sp):
# final_sp = reduce(lambda x,y : x+y, points)
# final_sp = np.sort(final_sp)
# # mean of max difference at the time points between 2 arrays for all replicates
# allsp_diff = [np.mean([np.max([abs(arr1[r,x-1] - arr2[r,x-1]), abs(arr2[r,x] -arr1[r,x])]) for r in range(arr1.shape[0])]) for x in final_sp]
# else:
# final_sp = []
# allsp_diff=[]
# #calculate corr
# cors = [(1-np.corrcoef(orgarr1[r], orgarr2[r])[0,1])/2 for r in range(arr1.shape[0])]
# #for each switch point, the max switch diff before and after
# #allsp : all the before and after switch differences for all switch points
# return len(final_sp)/arr1.shape[1], allsp_diff, final_sp, np.mean(cors)
class basic_pvalue():
    """Permutation-based empirical p-value calculation.

    Shuffles the dataset ``n_permutations`` times, recomputes the fitness
    features each time, compares them against the original features to derive
    per-object empirical p-values, and combines them across feature scores
    with Fisher's method.
    """
    def __init__(self, testobject, object_type, iso_switch = True, shuffle=None, fitness_scores = None, fitness_scores_two_sided = False, n_permutations=1000):
        self.testobject = testobject  ##DataSet
        self.object_type = object_type  #connectivity, clusters, iso_pairs
        self.shuffle = shuffle
        self.fitness_scores = fitness_scores
        self.fitness_scores_two_sided = fitness_scores_two_sided
        self.n_permutations = n_permutations
        self.iso_switch = iso_switch

    def initialize_features_for_permutation(self, features, shuffled):
        # Hook for subclasses; intentionally a no-op here.
        pass

    def calculate_original_object_fitness(self):
        # Hook for subclasses; intentionally a no-op here.
        pass

    def progressbar(self, it, prefix="", size=60, file=sys.stdout):
        """Yield the items of ``it`` while drawing a textual progress bar."""
        count = len(it)
        def show(j):
            x = int(size*j/count)
            file.write("%s[%s%s] %i/%i\r" % (prefix, "#"*x, "."*int((size-x)), j, count))
            file.flush()
        show(0)
        for i, item in enumerate(it):
            yield item
            show(i+1)
        file.write("\n")
        file.flush()

    def empirical_pvalue(self, total_observation, better_observation):
        """Empirical p-value with add-one (Laplace) smoothing.

        Returns 1.0 when no permutations were observed: the original code
        implicitly returned None in that case, which would crash the
        downstream log-based Fisher combination.
        """
        if total_observation > 0:
            return (1 + better_observation) / (1 + total_observation)
        return 1.0

    def combine_pvals(self, pvals):
        """Combine p-values with Fisher's method (chi-square, 2k df)."""
        s = -2 * np.sum(np.log(pvals))
        return chi2.sf(s, 2 * len(pvals))

    def do_calculate(self):
        """Run the permutation test; return one combined p-value per object."""
        if self.object_type == "clusters":
            n_object = self.testobject.clusterobj._final_n_cluster
            timeserieslist = self.testobject.timeserieslist
            #get features of clusters
            print("Calculating features.")
            features_original = featuresObj(self.testobject, timeserieslist, feature_type="clusters")
            #get fitness of clusters
            print("Calculating fitness score.")
            original_fitness_score = features_original.feature_store
            #data structure for storing permuted features #[features][permutation]
            permuted_fitness = np.zeros((original_fitness_score.shape[0], n_object))
        if self.object_type == "iso_pairs":
            n_object = len(self.testobject.isoobj.isopairs['major_transcript'])
            timeserieslist = self.testobject.timeserieslist
            features_original = featuresObj(self.testobject, timeserieslist, feature_type="iso_pairs")
            original_fitness_score = features_original.feature_store
            permuted_fitness = np.zeros((original_fitness_score.shape[0], n_object))
        # Counters comparing shuffled vs. original fitness per [feature][object].
        batch_smaller = np.zeros((original_fitness_score.shape[0], n_object))
        batch_larger = np.zeros((original_fitness_score.shape[0], n_object))
        batch_equal = np.zeros((original_fitness_score.shape[0], n_object))
        batch_comp = np.zeros((original_fitness_score.shape[0], n_object))
        for p in self.progressbar(it=range(self.n_permutations), prefix="Permutation:", size=self.n_permutations/10):
            _blockPrint()
            # NOTE(review): both objects below are constructed for their
            # side effects only; their results are never read here —
            # confirm the constructors are actually needed per permutation.
            shuffleObj = shuffling(timeserieslist=timeserieslist)
            iso = iso_function(self.testobject)
            if self.object_type == "clusters":
                permuted_features = featuresObj(testobject=self.testobject, timeserieslist=timeserieslist, feature_type="clusters", seed=p)
                permuted_fitness = permuted_features.feature_store
            elif self.object_type == "iso_pairs":
                permuted_features = featuresObj(testobject=self.testobject, timeserieslist=timeserieslist, feature_type="shuffle_iso_pairs", seed=p)
                permuted_fitness = permuted_features.feature_store
            for fs in range(original_fitness_score.shape[0]):
                fitness = original_fitness_score[fs]
                shuffledfitness = permuted_fitness[fs]
                #comparison of shuffled vs. original fitness per object
                for o in range(len(fitness)):
                    compare = shuffledfitness[o] - fitness[o]
                    if compare < 0:
                        batch_smaller[fs][o] += 1
                    elif compare > 0:
                        batch_larger[fs][o] += 1
                    else:
                        batch_equal[fs][o] += 1
                    batch_comp[fs][o] += 1
            _enablePrint()
        #end permutation
        print("Done permutation")
        pvals = np.zeros((original_fitness_score.shape[0], n_object))
        for fs in range(original_fitness_score.shape[0]):
            for o in range(n_object):
                total_observation = batch_comp[fs][o]
                if self.fitness_scores_two_sided is True:
                    better_observation = min(batch_larger[fs][o], batch_smaller[fs][o])*2 + batch_equal[fs][o]
                else:
                    better_observation = batch_equal[fs][o] + batch_larger[fs][o]
                pvals[fs][o] = self.empirical_pvalue(total_observation, better_observation)
        ##combine p-values across feature scores for each object
        result_pvalue = np.zeros(n_object)
        for o in range(n_object):
            result_pvalue[o] = self.combine_pvals(pvals[:, o])
        return result_pvalue
class iso_function():
    """
    Isoform level analysis.

    Parameters
    ----------
    dataset :
        DataSet object; its ``species`` attribute selects the bundled Pfam
        and exon reference files to load.

    Notes
    -----
    On construction the time series of each gene's isoforms are grouped into
    a (replicates, isoforms, timepoints) array and normalized to
    per-timepoint relative abundance (stored in ``self.normdf``).
    See ``detect_isoform_switch`` and ``total_isoform_usage`` for the main
    analysis entry points.
    """
    def __init__(self, dataset):
        self.dataset = dataset
        self.species = dataset.species
        # The original code branched on whether transcript IDs are versioned
        # ("." in the ID), but both branches opened identical files, so the
        # branch is collapsed. Files are closed via context managers (they
        # were previously left open).
        with open(os.path.join(dir_path, f"data/network/{dataset.species}_ver_trans_pfam.pkl"), 'rb') as infile:
            self.trans_pfam_dict = pickle.load(infile)
        with open(os.path.join(dir_path, f"data/network/exons_{dataset.species}_ensembl.pkl"), 'rb') as exfile:
            self.trans_exon_dict = pickle.load(exfile)
        # Group isoform time series by gene; 'n' marks unmapped entries.
        groupdict = defaultdict(lambda: defaultdict(list))
        for idx, sym in enumerate(self.dataset.gene_id):
            if sym != 'n':
                groupdict[sym]['trans_id'].append(self.dataset.transcript_id[idx])
                groupdict[sym]['indices'].append(idx)
                groupdict[sym]['array'].append(self.dataset.timeserieslist[:, idx, :])
        # Stack each gene's isoforms into (replicates, isoforms, timepoints)
        # and normalize to relative abundance per timepoint.
        for sym, df in groupdict.items():
            org = df['array'][0]
            for x in range(1, len(df['array'])):
                org = np.hstack([org, df['array'][x]])
            df['array'] = org.reshape(self.dataset.reps1, len(df['array']), self.dataset.timepts)
            tmp = np.array([df['array'][x]/np.sum(df['array'][x], axis=0) for x in range(df['array'].shape[0])])
            # Timepoints where the gene total is 0 yield NaN; set them to 1.
            tmp[np.isnan(tmp)] = 1
            groupdict[sym]['normarr'] = tmp
        self.normdf = groupdict
def _take_interval(self, sp, timepts):
    """For each switch point, the distance to its nearest neighbouring
    boundary (previous switch point or 0 / next switch point or ``timepts``).

    NOTE(review): this overload is shadowed by the later
    ``_take_interval(self, sp)`` definition in this class and is therefore
    dead code.
    """
    newsp = np.insert(sp,0,0)
    newsp = np.append(newsp, timepts)
    intervals = []
    for i in range(len(sp)):
        intervals.append(np.min([newsp[i+1]-newsp[i], newsp[i+2]-newsp[i+1]]))
    return intervals
def _iso_switch_between_arrays(self, arr1, arr2, orgarr1, orgarr2):
    """Find isoform-switch points between two relative-abundance series.

    Parameters
    ----------
    arr1, arr2 : np.ndarray, shape (replicates, timepoints)
        Normalized relative abundances of the two isoforms.
    orgarr1, orgarr2 : np.ndarray
        Raw expression series, used only for the correlation score.

    Returns
    -------
    tuple
        (switch ratio, mean before/after differences per switch point,
        switch points, mean anti-correlation score).
    """
    # Sign of the pairwise difference; a sign change between consecutive
    # timepoints marks a candidate switch point.
    ab = np.sign(arr1-arr2)
    if self.dataset.reps1 ==1:
        # Single replicate: take every sign flip (index 0 excluded because
        # np.roll wraps the last value around to the front).
        points = [x for x in range(1, ab.shape[1]) if (ab[0][x] != np.roll(ab[0], 1)[x])]
        final_sp=points
        allsp_diff =[np.mean([abs(arr1[:,x-1] - arr2[:,x-1]), abs(arr2[:,x] -arr1[:,x])]) for x in final_sp]
        cors = (1-np.corrcoef(orgarr1, orgarr2)[0,1])/2
        # NOTE(review): normalizes by arr1.shape[0] (replicate count == 1)
        # whereas the multi-replicate branch divides by arr1.shape[1]
        # (timepoints) — confirm this asymmetry is intentional.
        iso_ratio=len(final_sp)/arr1.shape[0]
    else:
        ##look for switch points in every replicate
        points=[[x for x in range(ab.shape[1]) if ab[r,x] != np.roll(ab[r], 1)[x] and x!=0] for r in range(ab.shape[0])]
        #points: switch points of all replicates
        ##check if all replicates have switch points,
        #then take the points where all appeared
        def multiple_intersect(points):
            return np.unique(list(reduce(lambda x, y: set(x).intersection(set(y)), points)))
        # required number of agreeing replicates (60%, rounded)
        rep_agree = round(self.dataset.reps1*0.6)
        if any(points):
            final_sp = multiple_intersect(points)
            if not any(final_sp):
                # NOTE(review): only the first two members of each
                # rep_agree-sized combination are intersected below; this
                # is a true "rep_agree replicates agree" test only when
                # rep_agree == 2 — verify for higher replicate counts.
                final_sp = [list(set(x[0]).intersection(set(x[1]))) for x in combinations(points,rep_agree)]
                final_sp = np.unique(reduce(lambda x,y: x+y, final_sp))
                if not any(final_sp):
                    final_sp = []
            final_sp = np.sort(final_sp)
            # mean of max difference at the time points between 2 arrays for all replicates
            allsp_diff = [np.mean([np.max([abs(arr1[r,x-1] - arr2[r,x-1]), abs(arr2[r,x] -arr1[r,x])]) for r in range(arr1.shape[0])]) for x in final_sp]
        else:
            final_sp = []
            allsp_diff=[]
        #calculate correlation per replicate (0 = identical, 1 = anti-correlated)
        cors = [(1-np.corrcoef(orgarr1[r], orgarr2[r])[0,1])/2 for r in range(arr1.shape[0])]
        iso_ratio = len(final_sp)/arr1.shape[1]
    #for each switch point, the max switch diff before and after
    #allsp_diff : all the before/after switch differences for all switch points
    return iso_ratio, allsp_diff, final_sp, np.mean(cors)
def _event_enrichness(self, normdf, sp, arr1, arr2):
##penalize low expressed genes
penal1 = np.mean([arr1[r]/np.max(normdf[r],axis=0) for r in range(normdf.shape[0])],axis=0)
penal2 = np.mean([arr2[r]/np.max(normdf[r],axis=0) for r in range(normdf.shape[0])],axis=0)
return np.mean(np.append(penal1[sp-1:sp+1],penal2[sp-1:sp+1]))
def _remove_low_expressed(self):
#return isoform indices
timedf = pd.concat([pd.DataFrame(self.timeserieslist[x]) for x in range(self.timeserieslist.shape[0])], axis=1)
geneexp = pd.concat([pd.Series(self.symbs, name='symbol'), timedf],axis=1)
#filter out those higher than 2 by default (can change)
lowgenes_idx = np.where(geneexp.groupby('symbol').agg('sum').mean(axis=1) < 2)[0]
#get the gene names lower than cutoff
lowgenes = geneexp.iloc[lowgenes_idx, 0]
return lowgenes
def _remove_low_abundance(self, pre_normdf):
#return isoform indices
#calculate fold change of major isoform with respect to the minor isoforms
lowexp = np.mean(np.max(pre_normdf, axis=2),axis=0)
#filter out those higher than 2 by default (can change)
lowgenes = np.where(lowexp<=0.1)[0]
filtered_normdf = np.delete(pre_normdf, lowgenes, axis=1)
return filtered_normdf, lowgenes
def _combine_pvals(self, pvals):
##no used
chisqprob = lambda chisq, df: chi2.sf(chisq, df)
s = -2 * np.sum(np.log(pvals))
return chisqprob(s, 2 * len(pvals))
def _take_interval(self, sp):
#check interval between switch points
newsp = np.insert(sp,0,0)
newsp = np.append(newsp, self.dataset.timepts)
intervals = []
for i in range(len(sp)):
intervals.append(np.min([newsp[i+1]-newsp[i], newsp[i+2]-newsp[i+1]]))
return intervals
def _diff_before_after_switch(self, normdf, arr1, arr2, allsp_diff, switch_points):
    """Score candidate switch points and pick the best one.

    With multiple replicates, the best point minimizes the Mann-Whitney
    p-value of either isoform's before/after segments while maximizing the
    event importance. With a single replicate, the point with the largest
    abundance difference is taken and p-values default to 1.

    Returns
    -------
    tuple
        (difference, isoform-1 p-value, isoform-2 p-value,
        best switch point, event importance, switch probability).
    """
    # Pad the switch points with the series boundaries (0 and timepts).
    tmp_sw = np.insert(switch_points,0,0)
    tmp_sw = np.append(tmp_sw, self.dataset.timepts)
    if self.dataset.reps1>1:
        #get the intervals between time points
        if any(switch_points):
            #the best sp has the highest mean differences before and after switch points between the two time series
            bestpval1=0
            bestpval2=0
            allbestp=1
            finaldiff = 0
            finalenrich = 0
            best_switch_point=0
            this_prob = 0
            thispt=0
            for bp in range(1, len(tmp_sw)-1):
                # Mann-Whitney U between the segments before and after this
                # candidate point, separately for each isoform.
                iso1_pval = mannwhitneyu(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0], arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0])
                iso2_pval = mannwhitneyu(arr2[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0], arr2[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0])
                thisdiff = np.mean([abs(np.mean(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]]) - np.mean(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]])), abs(np.mean(arr2[:,tmp_sw[bp-1]:tmp_sw[bp]]) - np.mean(arr2[:,tmp_sw[bp]:tmp_sw[bp+1]]))])
                thisenrich = self._event_enrichness(normdf, tmp_sw[bp], arr1, arr2)
                # Fraction of observations where isoform 1 dominates before
                # the point plus the fraction where isoform 2 dominates after.
                iso_prob = ((np.sum(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]>arr2[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]))/arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0].shape[0]) + ((np.sum(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]<arr2[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]))/(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0].shape[0]))
                thispt=tmp_sw[bp]
                if (iso1_pval[1] < allbestp) & (thisenrich > finalenrich):
                    bestpval1, bestpval2=iso1_pval[1], iso2_pval[1]
                    allbestp=iso1_pval[1]
                    finaldiff = thisdiff
                    finalenrich = thisenrich
                    best_switch_point=tmp_sw[bp]
                    this_prob = iso_prob/2
                elif (iso2_pval[1] < allbestp) & (thisenrich > finalenrich):
                    bestpval1, bestpval2=iso1_pval[1], iso2_pval[1]
                    allbestp=iso2_pval[1]
                    finaldiff = thisdiff
                    finalenrich = thisenrich
                    best_switch_point=tmp_sw[bp]
                    this_prob = iso_prob/2
                else:
                    continue
            # NOTE(review): if no candidate ever wins, this returns the
            # initial bestpval1/bestpval2 of 0 (not 1) — confirm intended.
            return finaldiff, bestpval1, bestpval2, best_switch_point, finalenrich, this_prob
        else:
            return 0, 1, 1, 0, 0, 0
    else:
        #the best sp has the highest mean differences before and after switch points between the two time series
        if any(allsp_diff):
            bp = np.argmax(allsp_diff)
            # NOTE(review): ``bp`` indexes allsp_diff but is also used as an
            # index into tmp_sw and as the enrichness timepoint — confirm
            # this indexing is intentional.
            finalenrich = self._event_enrichness(normdf, bp, arr1, arr2)
            iso_prob = ((np.sum(arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]>arr2[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0]))/arr1[:,tmp_sw[bp-1]:tmp_sw[bp]].reshape(1,-1)[0].shape[0]) + ((np.sum(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]<arr2[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0]))/(arr1[:,tmp_sw[bp]:tmp_sw[bp+1]].reshape(1,-1)[0].shape[0]))
            return allsp_diff[bp], 1, 1, switch_points[bp], finalenrich, iso_prob
        else:
            return 0,1,1,0,0,0
def _isoform_differential_usage(self, thisnormdf, thisexp):
    """Pairwise switch analysis between all isoforms of one gene.

    Parameters
    ----------
    thisnormdf : np.ndarray, shape (replicates, isoforms, timepoints)
        Relative-abundance time series of the gene's isoforms.
    thisexp : np.ndarray, same shape
        Raw expression series (used for the correlation score).

    Returns
    -------
    tuple of per-pair matrices/lists:
        (switch ratio, isoform-1 p-values, isoform-2 p-values, differences,
        best switch points, correlations, event importances,
        all switch points, switch probabilities).
    """
    # One (isoforms x isoforms) matrix per statistic.
    iso_ratio = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    iso_diff_value = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    maj_pval = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    min_pval = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    best_switch_point = [[] for _ in range(thisnormdf.shape[1])]
    corr_list = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    enrichness = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    iso_prob = np.zeros((thisnormdf.shape[1],thisnormdf.shape[1]))
    all_switch_point = [[] for _ in range(thisnormdf.shape[1])]
    #check for all pairs
    for maj in range(thisnormdf.shape[1]):
        orgarr1 = thisexp[:,maj,:]
        arr1 = thisnormdf[:,maj,:]
        for x in range(thisnormdf.shape[1]): #each minor transcript
            if x == maj:
                # Self-pair: fill neutral values and skip.
                iso_ratio[maj, x], iso_diff_value[maj, x] = 0, 0
                best_switch_point[maj].append(0)
                all_switch_point[maj].append(0)
                corr_list[maj, x] = 0
                continue
            arr2 = thisnormdf[:,x,:]
            orgarr2 = thisexp[:,x,:]
            ##finding switch points, correlation
            iso_ratio[maj, x], allsp, final_sp, corr_list[maj, x] = self._iso_switch_between_arrays(arr1, arr2, orgarr1, orgarr2)
            ##calculate diff. value, p-values
            iso_diff_value[maj, x], maj_pval[maj, x], min_pval[maj, x], bs, enrichness[maj,x], iso_prob[maj, x] = self._diff_before_after_switch(thisnormdf, arr1, arr2, allsp, final_sp)
            best_switch_point[maj].append(bs)
            all_switch_point[maj].append(final_sp)
    return iso_ratio, maj_pval, min_pval, iso_diff_value, best_switch_point, corr_list, enrichness, all_switch_point, iso_prob
def _correlation_coef(self, thisnormdf, maj):
m_normdf = np.median(thisnormdf, axis=0)
corr_mat = (1-np.corrcoef(m_normdf))/2
return corr_mat
def _exclusive_domain(self, iso1, iso2):
if iso1 in self.trans_pfam_dict.keys():
pfams1 = self.trans_pfam_dict[iso1]
else:
pfams1 = []
if iso2 in self.trans_pfam_dict.keys():
pfams2 = self.trans_pfam_dict[iso2]
else:
pfams2 = []
diff = set(pfams1).symmetric_difference(set(pfams2))
diff = [x for x in list(diff) if not type(x)==float]
return diff
def _diff_exons(self, iso1, iso2, final_sp, isoid):
if iso1 in self.trans_exon_dict.keys():
exons1 = self.trans_exon_dict[iso1]
else:
exons1 = []
if iso2 in self.trans_exon_dict.keys():
exons2 = self.trans_exon_dict[iso2]
else:
exons2 = []
##major or minor isoforms
##define isoform_from
iso1_id = np.where(np.isin(self.normdf[isoid]['trans_id'], iso1))[0][0]
iso2_id = np.where(np.isin(self.normdf[isoid]['trans_id'], iso2))[0][0]
iso1_values = np.median(self.normdf[isoid]['array'][:,iso1_id, final_sp-1])
iso2_values = np.median(self.normdf[isoid]['array'][:,iso2_id, final_sp-1])
if iso1_values > iso2_values:
majoriso, minoriso = iso1, iso2
majoriso_exons, minoriso_exons = exons1, exons2
else:
majoriso, minoriso = iso2, iso1
majoriso_exons, minoriso_exons = exons2, exons1
gain_exons = set(majoriso_exons).difference(set(minoriso_exons))
loss_exons = set(minoriso_exons).difference(set(majoriso_exons))
nondiff = set(exons1).intersection(set(exons2))
gain_exons = [x for x in list(gain_exons) if not type(x)==float]
loss_exons = [x for x in list(loss_exons) if not type(x)==float]
nondiff = [x for x in list(nondiff) if not type(x)==float]
return majoriso, minoriso, gain_exons, loss_exons, nondiff
def detect_isoform_switch(self, filtering = True, prob_cutoff=0.5, corr_cutoff=0.5, event_im_cutoff = 0.1, min_diff = 0.05, p_val_cutoff = 0.05, n_permutations=100, adjustp='fdr_bh', diff_domain=True):
    """
    Detect isoform switching events along time series.

    Parameters
    ----------
    filtering : bool, default=True
        If True, low-expression genes are counted and reported (they are
        not removed from the analysis here — only the count is printed).
    prob_cutoff : float, default=0.5
        Minimum switch probability kept in the result.
    corr_cutoff : float, default=0.5
        Minimum anti-correlation score of the isoform pair.
    event_im_cutoff : float, default=0.1
        Minimum event importance kept.
    min_diff : float, default=0.05
        Minimum abundance difference before/after the switch.
    p_val_cutoff : float, default=0.05
        Maximum adjusted p-value kept.
    n_permutations : int, default=100
        Number of permutations (used only for single-replicate data).
    adjustp : {'fdr_bh', 'holm_bonf', 'bonf'}
        Multiple-testing correction method.
    diff_domain : bool, default=True
        If True, also report Pfam domains exclusive to one isoform.

    Return
    ------
    DataFrame
    """
    #calculate probability and correlation for switching
    self.min_diff = min_diff
    self.corr_cutoff = corr_cutoff
    self.event_im_cutoff = event_im_cutoff
    self.timeserieslist = self.dataset.timeserieslist
    self.transcript_id = self.dataset.transcript_id
    self.symbs = self.dataset.symbs
    self.gene_id = self.dataset.gene_id
    if filtering==True:
        lowgenes = self._remove_low_expressed()
    else:
        lowgenes=set()
    print(str(len(set(lowgenes))) + " of genes with low expression are filtered out!!.")
    print(f"calculating for {len(self.normdf.keys())} genes...")
    iso_pairs_id = defaultdict(list)
    iso_dict_info = defaultdict(tuple)
    for gene, values in self.normdf.items():
        idxs = values['indices']
        thisnormdf = values['normarr']
        thisexp = values['array']
        #pairwise switch statistics for all isoforms of this gene
        res_maj = self._isoform_differential_usage(thisnormdf, thisexp)
        #return major transcript and transcript that switch
        if self.dataset.reps1 > 1:
            for ids1, iso_ratio0 in enumerate(res_maj[0]):
                for ids2, iso_ratio in enumerate(iso_ratio0):
                    if iso_ratio >0 :
                        majoriso, minoriso, gain_exons, loss_exons, nondiff = self._diff_exons(self.transcript_id[idxs[ids1]], self.transcript_id[idxs[ids2]], res_maj[4][ids1][ids2], str(gene))
                        iso_pairs_id['gene'].append(str(gene))
                        iso_pairs_id['gene_symb'].append(self.symbs[idxs[ids1]])
                        iso_pairs_id['major_transcript'].append(majoriso)
                        iso_pairs_id['minor_transcript'].append(minoriso)
                        iso_pairs_id['switch_prob'].append(res_maj[8][ids1][ids2])
                        iso_pairs_id['diff'].append(res_maj[3][ids1, ids2])
                        iso_pairs_id['maj_pval'].append(res_maj[1][ids1, ids2])
                        iso_pairs_id['min_pval'].append(res_maj[2][ids1, ids2])
                        # Fisher-combine the two per-isoform p-values.
                        iso_pairs_id['p_value'].append(self._combine_pvals([res_maj[1][ids1,ids2], res_maj[2][ids1, ids2]]))
                        iso_pairs_id['corr'].append(res_maj[5][ids1, ids2])
                        iso_pairs_id['best_switch_point'].append(res_maj[4][ids1][ids2])
                        iso_pairs_id['event_importance'].append(res_maj[6][ids1][ids2])
                        iso_pairs_id['gain_exons'].append(gain_exons)
                        iso_pairs_id['loss_exons'].append(loss_exons)
                        iso_pairs_id['constant_exons'].append(nondiff)
                        iso_pairs_id['all_switch_points'].append(res_maj[7][ids1][ids2])
                        iso_dict_info[self.transcript_id[idxs[ids1]]] = (gene, ids1)
                        iso_dict_info[self.transcript_id[idxs[ids2]]] = (gene, ids2)
                        if diff_domain == True:
                            iso_pairs_id['exclusive_domains'].append(self._exclusive_domain(self.transcript_id[idxs[ids1]],self.transcript_id[idxs[ids2]]))
        else:
            #single replicate: p-values are computed by permutation below
            for ids1, iso_ratio0 in enumerate(res_maj[0]):
                for ids2, iso_ratio in enumerate(iso_ratio0):
                    if iso_ratio >0 :
                        majoriso, minoriso, gain_exons, loss_exons, nondiff = self._diff_exons(self.transcript_id[idxs[ids1]], self.transcript_id[idxs[ids2]], res_maj[4][ids1][ids2], str(gene))
                        iso_pairs_id['gene'].append(str(gene))
                        iso_pairs_id['gene_symb'].append(self.symbs[idxs[ids1]])
                        iso_pairs_id['major_transcript'].append(majoriso)
                        iso_pairs_id['minor_transcript'].append(minoriso)
                        iso_pairs_id['switch_prob'].append(res_maj[8][ids1][ids2])
                        iso_pairs_id['diff'].append(res_maj[3][ids1, ids2])
                        iso_pairs_id['corr'].append(res_maj[5][ids1, ids2])
                        iso_pairs_id['best_switch_point'].append(res_maj[4][ids1][ids2])
                        iso_pairs_id['event_importance'].append(res_maj[6][ids1][ids2])
                        iso_pairs_id['gain_exons'].append(gain_exons)
                        iso_pairs_id['loss_exons'].append(loss_exons)
                        iso_pairs_id['constant_exons'].append(nondiff)
                        iso_pairs_id['all_switch_points'].append(res_maj[7][ids1][ids2])
                        iso_dict_info[self.transcript_id[idxs[ids1]]] = (gene, ids1)
                        iso_dict_info[self.transcript_id[idxs[ids2]]] = (gene, ids2)
                        if diff_domain == True:
                            iso_pairs_id['exclusive_domains'].append(self._exclusive_domain(self.transcript_id[idxs[ids1]],self.transcript_id[idxs[ids2]]))
    self.isopairs = iso_pairs_id
    self.iso_dict_info = iso_dict_info
    self.dataset.isoobj = self
    if self.dataset.reps1 == 1:
        # Permutation test provides p-values for single-replicate data.
        cal = basic_pvalue(testobject=self.dataset, object_type="iso_pairs", n_permutations = n_permutations)
        pval = cal.do_calculate() #self iso_function object
        iso_pairs_id['p_value'] = pval
        self.isopairs = iso_pairs_id
    res = pd.DataFrame(iso_pairs_id).sort_values('p_value')
    mct = pvalue_correction(res['p_value'], method=adjustp)
    res['adj_pval'] = mct.corrected_pvals
    #apply all cutoffs, then keep one row per isoform pair
    res = res[(res['switch_prob']>prob_cutoff) & (res['diff']>min_diff) & (res['adj_pval']<p_val_cutoff) & (res['corr']>corr_cutoff) & (res['event_importance']>event_im_cutoff)]
    res = res.sort_values("switch_prob", ascending=False).drop_duplicates(['major_transcript', 'minor_transcript'])
    print("----Result statistics----")
    print(f"Total genes with IS genes: {res['gene'].unique().shape[0]}")
    print(f"Events found: {res.shape[0]}")
    print(f"Events with affecting domains: {np.sum([1 for x in res[ 'exclusive_domains'] if len(x)>0])}")
    print(f"Mean event importance: {res['event_importance'].mean()}")
    print(f"Mean difference before and after switch: {res['diff'].mean()}")
    print("--------------------------")
    print("DONE")
    self.is_result = res
    self.dataset.isoobj = self
    return res.reset_index(drop=True)[['gene', 'gene_symb', 'major_transcript', 'minor_transcript', 'switch_prob', 'corr', 'diff', 'event_importance', 'exclusive_domains', 'p_value', 'adj_pval']]
def _isoform_usage(self, eachgene, norm):
#eachgene 3D array [replicates, isoforms, timepoints]
result = np.zeros((eachgene.shape[0], eachgene.shape[1], eachgene.shape[2]-1))
if norm:
#calculate the size factors
sizefactor = np.sum(eachgene, axis=1)
sizefactor[sizefactor==0] = 1
#normalize to transcript abundance for each replicates
normdf = np.array([eachgene[x,:,:] / sizefactor[x] for x in range(eachgene.shape[0])])
else:
normdf = eachgene
#subtract usage from next time point
for i in range(eachgene.shape[2]-1):
result[:,:,i] = np.abs(normdf[:,:,i]-normdf[:,:,i+1])
#sum up and take median for one gene across all replicates
result = np.median(np.sum(result, axis=1),axis=0)
return result
def total_isoform_usage(self, isd_result=None, norm=True, gene_level=True):
    '''
    Calculate total isoform usage. Only genes with more than 1 isoform are
    computed; results are stored on the DataSet as ``tiu_*`` attributes.

    Parameters
    ----------
    isd_result
        the result dataframe of isoform switch detection; when given, only
        its genes are kept, otherwise all genes are used.
    norm : `boolean`, default=True
        if True, it normalizes time series matrix to relative abundance to gene expression.
    gene_level : `boolean`, default=True
        if True, it calculates total isoform usage for each gene, otherwise individual isoform usage for each isoform

    Return
    -------
    New DataSet Object (the same DataSet instance, annotated in place)
    '''
    isodf = pd.DataFrame({'symbol':self.dataset.gene_id, 'isoform_id':self.dataset.transcript_id})
    # gene -> positional indices of its isoforms
    groupdict = isodf.groupby(['symbol']).indices
    symbolmap = dict(zip(self.dataset.gene_id, self.dataset.symbs))
    tp = self.dataset.timepts
    result_df = pd.DataFrame(columns=['entrez','gene'] + ['tp{}'.format(x) for x in range(self.dataset.timepts-1)])
    if isd_result is not None:
        keepgenes = isd_result['gene'].unique().tolist()
    else:
        keepgenes = np.unique(self.dataset.gene_id)
    if gene_level is True:
        for gene, iso in groupdict.items():
            if len(iso) > 1 and gene in set(keepgenes):
                eachgene = np.array(self.dataset.timeserieslist[:,iso,:], dtype="double")
                #calculate isoform usage for each gene
                res = self._isoform_usage(eachgene, norm)
                result_df.loc[result_df.shape[0]] = [gene, symbolmap[gene]] + res.tolist()
        #make changes in DataSet and return new dataset
        self.dataset.tiu_ts = np.array(result_df.iloc[:,2:])
        self.dataset.tiu_timeserieslist = np.expand_dims(np.array(result_df.iloc[:,2:]), axis=0)
        self.dataset.tiu_gene_id = result_df['entrez']
        self.dataset.tiu_symbs = result_df['gene']
        self.dataset.tiu_timepts = tp-1
    elif isd_result is not None and gene_level is False:
        keepgenes = isd_result['gene'].unique().tolist()
        # per-isoform usage: stack the normalized arrays of the kept genes
        keep_genes_arrays = np.hstack([self.normdf[x]['normarr'] for x in keepgenes])
        result = np.zeros((keep_genes_arrays.shape[0], keep_genes_arrays.shape[1], keep_genes_arrays.shape[2]-1))
        keep_trans_indices = reduce(lambda x,y: x+y, [self.normdf[x]['indices'] for x in keepgenes])
        result_trans = self.dataset.transcript_id[keep_trans_indices]
        result_entrez = self.dataset.gene_id[keep_trans_indices]
        result_sym = self.dataset.symbs[keep_trans_indices]
        # absolute change between consecutive timepoints per isoform
        for i in range(keep_genes_arrays.shape[2]-1):
            result[:,:,i] = np.abs(keep_genes_arrays[:,:,i]-keep_genes_arrays[:,:,i+1])
        self.dataset.tiu_ts = result
        self.dataset.tiu_timeserieslist = result
        self.dataset.tiu_gene_id = result_entrez
        self.dataset.tiu_symbs = result_sym
        self.dataset.tiu_transcript_id = result_trans
        self.dataset.tiu_timepts = tp-1
    return self.dataset
def isoform_abundance(self, is_result=None):
    """
    Get isoform relative abundance based on isoform switched genes.

    Parameters
    ----------
    is_result : DataFrame, optional
        Result of isoform switch detection. If given, only its genes are
        kept; otherwise all genes of the dataset are used.

    Return
    -------
    A copy of the DataSet restricted to the kept isoforms, carrying the
    relative-abundance time series.
    """
    # Bug fix: the original tested ``if is_result is None`` and then
    # subscripted is_result — an unconditional TypeError for the default
    # call. The branches were inverted.
    if is_result is not None:
        keepgenes = is_result['gene'].unique().tolist()
    else:
        # Unique gene IDs, consistent with total_isoform_usage(); the raw
        # per-isoform list would stack each gene's array once per isoform.
        keepgenes = np.unique(self.dataset.gene_id)
    result = np.hstack([self.normdf[x]['normarr'] for x in keepgenes])
    keep_trans_indices = reduce(lambda x, y: x+y, [self.normdf[x]['indices'] for x in keepgenes])
    result_trans = self.dataset.transcript_id[keep_trans_indices]
    result_entrez = self.dataset.gene_id[keep_trans_indices]
    tiuds = self.dataset.__copy__()
    tiuds.ts[0] = result
    tiuds.timeserieslist = result
    tiuds.gene_id = result_entrez
    tiuds.transcript_id = result_trans
    tiuds.timepts = self.dataset.timepts
    tiuds.reps1 = self.dataset.reps1
    return tiuds