Source code for spycone.DataSet

import numpy as np
import pandas as pd
from collections import defaultdict
import os 
dir_path = os.path.dirname(os.path.realpath(__file__))
from ._preprocessing._discretization import discretization_with_steps, discretize_timeseries, discretize_replicates

_KEYMAP = {
    'entrezgeneid': 'NCBI gene (formerly Entrezgene) ID',
    'ensemblgeneid': 'Gene stable ID',
    'ensembltransid': 'Transcript stable ID',
    'ensembltransidver': 'Transcript stable ID version',
}


def _map_to_gene_name(keys, keytype, species, transid_type=None):
    """Map gene IDs to gene symbols using the bundled per-species annotation table.

    IDs that cannot be mapped are returned as the placeholder 'n'.
    """
    ann = pd.read_csv(
        os.path.join(dir_path, "data/annotation/{}_ann.csv".format(species)),
        index_col=_KEYMAP[keytype],
        dtype=str,
    )

    # normalise numeric index values (e.g. 7157.0) to plain ID strings ("7157");
    # non-numeric or missing entries are kept as strings
    def _normalise(x):
        try:
            return str(int(float(x)))
        except (TypeError, ValueError):
            return str(x)

    ann.index = [_normalise(x) for x in ann.index]
    ann_dict = ann.to_dict()

    symbs = []
    for k in keys:
        try:
            symbs.append(ann_dict['Gene name'][str(k)])
        except KeyError:
            symbs.append('n')

    return symbs

def _map_trans_to_gene(keys, keytype, species, transid_type=None):
    """Map transcript IDs to gene IDs of the requested `keytype` using the
    bundled per-species annotation table. Unmapped IDs are returned as 'n'.
    """
    ann = pd.read_csv(
        os.path.join(dir_path, "data/annotation/{}_ann.csv".format(species)),
        index_col=_KEYMAP[transid_type],
        dtype=str,
    )
    ann_dict = ann.to_dict()

    a = []
    for k in keys:
        try:
            a.append(str(ann_dict[_KEYMAP[keytype]][k]))
        except KeyError:
            a.append('n')

    return a
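
# A minimal sketch (not part of the original module) of the lookup pattern the
# two mapping helpers above rely on: the per-species annotation CSV under
# data/annotation/ is indexed by the ID column named in _KEYMAP, turned into a
# dict of dicts, and queried per key with 'n' as the fallback for unmapped IDs.
# The column names come from _KEYMAP and 'Gene name'; the example IDs below
# are hypothetical.
def _demo_annotation_lookup():
    ann = pd.DataFrame({
        "Transcript stable ID": ["ENST00000000001", "ENST00000000002"],
        "NCBI gene (formerly Entrezgene) ID": ["1111", "2222"],
        "Gene name": ["GENEA", "GENEB"],
    }).set_index("Transcript stable ID")
    ann_dict = ann.to_dict()
    # same access pattern as _map_trans_to_gene: column name -> key -> value
    return [ann_dict["NCBI gene (formerly Entrezgene) ID"].get(k, "n")
            for k in ["ENST00000000001", "ENST00000000003"]]  # -> ['1111', 'n']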

##dataset and settings
class dataset():
    """
    Input dataset.

    Parameters
    ----------
    ts
        Matrix or dataframe of the time series dataset.
    gene_id
        List of gene IDs (IDs that match the biological network; default
        network: Entrez ID). If gene_id is not given, Entrez gene IDs will
        be mapped from the transcript IDs.
    transcript_id
        List of transcript IDs if the expression matrix is on the
        transcript level.
    keytype
        Type of ID in the gene list ('entrezgeneid', 'ensemblgeneid'). If
        only transcript IDs are given, this is the key type that the
        transcript IDs will be mapped to for gene_id.
    species
        Species ID used for annotation (human: 9606, mouse: 10090).
    reps1
        Number of replicates.
    timepts
        Number of time points.
    gtf : (optional)
        Corresponding GTF file for mapping gene names.
    symbs : (optional)
        List of gene symbols or gene names (can be mapped automatically for
        human and mouse).
    discretization_steps : (optional)
        Discretize the expression matrix according to the number of steps
        given.

    Attributes
    ----------
    timeserieslist
        Automatically generated 3D array (replicates x genes x time points)
        used for the analysis.

    Methods
    -------
    __copy__
        Copy function for the dataset object.
    remove_objects
        Remove the given indices from the object.
    """

    SPECIES = [9606, 10090]
    KEYTYPE = ['entrezgeneid', 'ensemblgeneid', 'ensembltransid', 'ensembltransidver']

    def __init__(
            self,
            ts,
            species,
            keytype,
            reps1,
            timepts,
            gtf=None,
            gene_id=None,
            transcript_id=None,
            timeserieslist=None,
            symbs=None,
            discretization_steps=None):
        """
        ts : matrix of the time series data.
        gene_id : list of the gene IDs.
        reps1 : number of replicates for ts.
        timepts : number of time points.
        """
        self.ts = [ts]
        self.reps1 = reps1
        self.gene_id = gene_id
        self.species = species
        self.keytype = keytype
        self.timepts = timepts
        self.timeserieslist = []
        self.symbs = symbs
        self.transcript_id = transcript_id
        self.discretization_steps = discretization_steps

        # check attributes
        if not hasattr(self.ts, "shape"):
            self.ts[0] = np.array(self.ts[0], dtype="double")

        if self.timepts * self.reps1 != self.ts[0].shape[1]:
            raise ValueError("Number of columns does not equal number of time points times number of replicates.")

        if self.species not in self.SPECIES:
            raise ValueError("Please provide a supported species ID.")

        if self.keytype not in self.KEYTYPE:
            raise ValueError("Please provide a supported key type.")

        ## gene list check
        if self.gene_id is None and self.transcript_id is None:
            raise ValueError("Please provide at least gene IDs or transcript IDs.")

        TRANS_ID_TYPE = None
        if self.transcript_id is not None:
            if "." in self.transcript_id[0]:
                TRANS_ID_TYPE = "ensembltransidver"
            else:
                TRANS_ID_TYPE = "ensembltransid"

        if self.transcript_id is not None and self.gene_id is None:
            self.gene_id = _map_trans_to_gene(self.transcript_id, self.keytype, self.species, transid_type=TRANS_ID_TYPE)

        if self.symbs is None:
            self.symbs = _map_to_gene_name(self.gene_id, self.keytype, self.species, transid_type=TRANS_ID_TYPE)

        if not isinstance(self.gene_id, list):
            self.gene_id = [str(x) for x in self.gene_id]

        if not isinstance(self.symbs, np.ndarray):
            self.symbs = np.array(self.symbs)

        if pd.isna(np.array(self.gene_id)).all():
            raise ValueError("The gene ID list contains only NaN values.")

        if pd.isna(self.ts).all():
            raise ValueError("The count matrix contains only NaN values.")

        # reshape the (genes x reps*timepts) matrix into a
        # (reps x genes x timepts) 3D array
        tmp = []
        if self.reps1 > 1:
            tp = self.timepts
            ts = self.ts[0]
            rep = int(self.reps1)
            for r in range(rep):
                start = tp * r
                end = start + tp
                tmp.append(ts[:, start:end])
            self.timeserieslist = np.array(tmp, dtype="double")
        else:
            self.timeserieslist = np.expand_dims(np.array(self.ts[0], dtype="double"), axis=0)

        ## discretization
        if self.discretization_steps is not None:
            data_min = np.min(self.timeserieslist)
            data_max = np.max(self.timeserieslist)
            neg = self.discretization_steps[0]
            pos = self.discretization_steps[1]
            discretizeprototype = discretization_with_steps(data_min, data_max, neg, pos)

            for i in range(self.timeserieslist.shape[1]):
                tmp_ts = self.timeserieslist[:, i, :]
                discretizepattern = discretize_replicates(tmp_ts, discretizeprototype)
                # store the discretized pattern for this gene/transcript
                self.timeserieslist[:, i, :] = discretizepattern

        if self.transcript_id is not None:
            self._get_gene_level()

    @property
    def shape(self):
        return self.timeserieslist.shape
    def __copy__(self):
        return dataset(
            ts=self.ts[0],
            gene_id=self.gene_id,
            reps1=self.reps1,
            timepts=self.timepts,
            species=self.species,
            keytype=self.keytype,
            transcript_id=self.transcript_id,
            timeserieslist=self.timeserieslist,
            symbs=self.symbs,
        )
    def _get_gene_level(self):
        """Aggregate transcript-level expression to gene level by summing all
        transcripts that map to the same gene ID."""
        genes = defaultdict(list)
        for a in range(len(self.gene_id)):
            genes[self.gene_id[a]].append(a)

        self.genelevel_expression = np.empty(shape=(self.reps1, 0, self.timepts))
        self.genelevel_id = []
        self.genelevel_symb = []
        for u, v in genes.items():
            tmp = np.expand_dims(np.sum(self.timeserieslist[:, v, :], axis=1), axis=1)
            self.genelevel_expression = np.concatenate([self.genelevel_expression, tmp], axis=1)
            self.genelevel_id.append(u)
            self.genelevel_symb.append(self.symbs[v[0]])

    def _remove_objects(self, objects_to_remove):
        """Remove the given row indices from all stored data structures."""
        if not isinstance(objects_to_remove, list):
            objects_to_remove = list(objects_to_remove)

        # remove objects from entries
        if self.transcript_id is not None:
            self.transcript_id = np.delete(np.array(self.transcript_id), objects_to_remove)

        self.ts[0] = np.delete(np.array(self.ts[0]), objects_to_remove, axis=0)

        tmp_genelist = np.delete(np.array(self.gene_id), objects_to_remove)
        self.gene_id = tmp_genelist

        tmp_symbs = np.delete(np.array(self.symbs), objects_to_remove)
        self.symbs = tmp_symbs

        tmp_timeseries_id = np.delete(
            np.arange(np.array(self.timeserieslist).shape[1]), objects_to_remove, axis=0)
        tmp_timeseries = [
            [self.timeserieslist[x][y] for y in tmp_timeseries_id]
            for x in range(np.array(self.timeserieslist).shape[0])]
        self.timeserieslist = np.array(tmp_timeseries, dtype="double")

        # recalculate gene level expression
        self._get_gene_level()

    def _add_clustering_object(self, clusterobj):
        self.clusterobj = clusterobj

    def _add_iso_switch_obj(self, isoobj):
        self.isoobj = isoobj
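
# A minimal usage sketch (not part of the original module), assuming a small
# gene-level expression matrix with 2 replicates and 3 time points; the gene
# IDs and symbols below are illustrative only. Because gene_id and symbs are
# supplied, no annotation lookup is performed.
if __name__ == "__main__":
    import copy

    counts = np.random.rand(4, 6)  # 4 genes x (2 replicates * 3 time points)
    dset = dataset(
        ts=counts,
        species=9606,
        keytype="entrezgeneid",
        reps1=2,
        timepts=3,
        gene_id=["7157", "672", "675", "5290"],
        symbs=["TP53", "BRCA1", "BRCA2", "PIK3CA"],
    )
    # the input matrix is reshaped into (replicates, genes, time points)
    print(dset.shape)  # (2, 4, 3)

    # __copy__ rebuilds the object from the stored inputs
    dset2 = copy.copy(dset)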