Source code for tdads.machine_learning


# machine learning of persistence diagrams
# NOTE: the star imports stay first so the explicit imports below keep
# precedence over any same-named symbol the star imports bring in.
from tdads.distance import *
from tdads.kernel import *
from multiprocessing import cpu_count
from sklearn.manifold import MDS
from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC, SVR
from numpy import concatenate, array, ndarray

# multidimensional scaling
class diagram_mds:
    '''Multidimensional scaling with persistence diagrams.'''

    def __init__(self, n_components:int = 2, random_state:int = None, precomputed:bool = False, dim:int = 0, metric:str = 'W', p:float = 2, sigma:float = None, n_cores:int = max(cpu_count() - 1, 1)):
        '''Multidimensional scaling with persistence diagrams.

        Parameters
        ----------
        `n_components` : int
            The number of dimensions in which to disperse the distance values, default 2.
        `random_state` : int
            Determines the random number generator used for reproducibility, default None.
        `precomputed` : bool
            Determines whether a precomputed distance matrix of persistence diagrams (`True`)
            or a list of persistence diagrams (`False`, the default) will be passed to the
            fit method.
        `dim` : int
            The non-negative homological dimension in which distances will be computed (default 0).
        `metric` : str
            One of "W" (default) or "FIM" for the wasserstein/bottleneck and Fisher information
            metric functions respectively.
        `p` : float
            The power parameter for the wasserstein metric, must be at least 1 (default 2).
        `sigma` : float
            The scale parameter for the Fisher information metric, default None but must be
            supplied when `metric` is "FIM".
        `n_cores` : int
            The number of CPU cores to use for parallel computation of distance matrices.
            Default is the number of available cores minus 1, but never less than 1.

        Attributes
        ----------
        `distance` : tdads.distance.distance
            The object used to compute the distance matrix of persistence diagrams.
        `MDS` : sklearn.manifold._mds.MDS
            The sklearn.manifold.MDS object used for embedding the distance matrix.
        `precomputed` : bool
            The input `precomputed` parameter.

        Raises
        ------
        TypeError
            If `precomputed` is not a bool.
        '''
        # validate before building the (potentially expensive) helper objects
        if not isinstance(precomputed, bool):
            raise TypeError('precomputed must be True or False.')
        self.distance = distance(dim = dim, metric = metric, p = p, sigma = sigma, n_cores = n_cores)
        # metric = False requests non-metric MDS; the embedding is always fit
        # on a distance matrix, hence dissimilarity = 'precomputed'
        self.MDS = MDS(n_components = n_components, metric = False, n_jobs = n_cores, random_state = random_state, dissimilarity = 'precomputed')
        self.precomputed = precomputed

    def __str__(self):
        '''Describe a persistence diagram multidimensional scaling object via its distance metric.'''
        # str(...) actually invokes the distance object's __str__; concatenating
        # the bound method itself would raise a TypeError
        return 'Non-metric multidimensional scaling of persistence diagrams. Distance metric used: ' + str(self.distance)

    def fit_transform(self, X, y:any = None):
        '''Fit the data in X and compute the position of the persistence diagrams in the embedding space.

        Parameters
        ----------
        `X` : {array-like of shape `(n_diagrams, n_diagrams)`} or {list of length `n_diagrams`}
            Either a precomputed distance matrix of `n_diagrams` many persistence diagrams
            (if `precomputed` was set to `True`) or a list of `n_diagrams` many persistence
            diagrams (otherwise).
        `y` : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        `X_new` : ndarray of shape `(n_diagrams, n_components)`
            `X` transformed in the new space.

        Raises
        ------
        TypeError
            If `X` is an ndarray while `precomputed` is `False`, or a list while
            `precomputed` is `True`.

        Examples
        --------
        >>> from tdads.machine_learning import diagram_mds
        >>> from tdads.distance import distance
        >>> from ripser import ripser
        >>> import numpy as np
        >>> # create 2 datasets
        >>> data1 = np.random.random((100,2))
        >>> data2 = np.random.random((100,2))
        >>> # compute persistence diagrams with ripser
        >>> diagram1 = ripser(data1)
        >>> diagram2 = ripser(data2)
        >>> # project into 2D with the 2-wasserstein distance
        >>> mds = diagram_mds()
        >>> mds.fit_transform([diagram1, diagram2])
        >>> # can also fit with a precomputed distance matrix
        >>> d_wass = distance()
        >>> dist_mat = d_wass.compute_matrix([diagram1, diagram2])
        >>> mds_precomp = diagram_mds(precomputed = True)
        >>> mds_precomp.fit_transform(dist_mat)
        '''
        if self.precomputed:
            if isinstance(X, list):
                raise TypeError('When precomputed is True, X must be a ndarray distance matrix of persistence diagrams.')
        else:
            if isinstance(X, ndarray):
                raise TypeError('When precomputed is False, X must be a list of persistence diagrams.')
            # turn the list of diagrams into the pairwise distance matrix MDS expects
            X = self.distance.compute_matrix(X)
        return self.MDS.fit_transform(X, y)
# kernel PCA
class diagram_kpca:
    '''Kernel PCA with persistence diagrams.'''

    def __init__(self, n_components:int = 2, random_state:int = None, precomputed:bool = False, diagrams:list = None, dim:int = 0, sigma:float = 1.0, t:float = 1.0, n_cores:int = max(cpu_count() - 1, 1)):
        '''Kernel PCA with persistence diagrams.

        Parameters
        ----------
        `n_components` : int
            The number of dimensions in which to disperse the kernel values, default 2.
        `random_state` : int
            Determines the random number generator used for reproducibility, default None.
        `precomputed` : bool
            Determines whether a precomputed Gram matrix of persistence diagrams (`True`)
            or a list of persistence diagrams (`False`, the default) will be passed to the
            fit method.
        `diagrams` : list
            An optional list of persistence diagrams, default None. If `precomputed` is `False`
            then `diagrams` will take the value of the list of diagrams passed to the `fit`
            method. Otherwise, `diagrams` must be supplied in order to perform inference
            (i.e. prediction).
        `dim` : int
            The non-negative homological dimension in which distances will be computed (default 0).
        `sigma` : float
            The scale parameter for the Fisher information metric, default 1.
        `t` : float
            The positive dispersion parameter for the persistence Fisher kernel, default 1.
        `n_cores` : int
            The number of CPU cores to use for parallel computation of distance matrices.
            Default is the number of available cores minus 1, but never less than 1.

        Attributes
        ----------
        `kernel` : tdads.kernel.kernel
            The object used to compute the (cross) Gram matrices of persistence diagrams.
        `kPCA` : sklearn.decomposition._kernel_pca.KernelPCA
            The kernel PCA object used for embedding the persistence diagrams.
        `precomputed` : bool
            The input `precomputed` parameter.
        `diagrams` : None or list of length `n_diagrams`
            The input `diagrams` parameter for inference.

        Raises
        ------
        TypeError
            If `precomputed` is not a bool, or `diagrams` is neither None nor a list.
        '''
        # validate before building the helper objects
        if not isinstance(precomputed, bool):
            raise TypeError('precomputed must be True or False.')
        # diagrams is optional: None (the default) must be accepted, otherwise
        # the class cannot be constructed with its own default arguments
        if diagrams is not None and not isinstance(diagrams, list):
            raise TypeError('diagrams must be a list of diagrams.')
        self.kernel = kernel(dim = dim, sigma = sigma, t = t, n_cores = n_cores)
        # the estimator is always fit on a Gram matrix, hence kernel = 'precomputed'
        self.kPCA = KernelPCA(n_components = n_components, n_jobs = n_cores, random_state = random_state, kernel = 'precomputed')
        self.precomputed = precomputed
        self.diagrams = diagrams

    def __str__(self):
        '''Describe a persistence diagram kernel principal components analysis object via its kernel function.'''
        # str(...) actually invokes the kernel object's __str__; concatenating
        # the bound method itself would raise a TypeError
        return 'Kernel PCA of persistence diagrams. Kernel used: ' + str(self.kernel)

    def fit(self, X, y:any = None):
        '''Fit the model from data in X.

        Parameters
        ----------
        `X` : {array-like of shape `(n_diagrams, n_diagrams)`} or {list of length `n_diagrams`}
            Either a precomputed Gram matrix of `n_diagrams` many persistence diagrams
            (if `precomputed` was set to `True`) or a list of `n_diagrams` many persistence
            diagrams (otherwise).
        `y` : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        `self` : object
            Returns the instance itself.

        Raises
        ------
        TypeError
            If `X` is an ndarray while `precomputed` is `False`, or a list while
            `precomputed` is `True`.

        Examples
        --------
        >>> from tdads.machine_learning import diagram_kpca
        >>> from tdads.kernel import kernel
        >>> from ripser import ripser
        >>> import numpy as np
        >>> # create 2 datasets
        >>> data1 = np.random.random((100,2))
        >>> data2 = np.random.random((100,2))
        >>> # compute persistence diagrams with ripser
        >>> diagram1 = ripser(data1)
        >>> diagram2 = ripser(data2)
        >>> # fit model with the persistence Fisher kernel (sigma = t = 1)
        >>> kpca = diagram_kpca()
        >>> kpca_fitted = kpca.fit([diagram1, diagram2])
        >>> # can also fit with a precomputed Gram matrix
        >>> pfk = kernel()
        >>> gram_mat = pfk.compute_matrix([diagram1, diagram2])
        >>> kpca_precomp = diagram_kpca(precomputed = True)
        >>> kpca_precomp_fitted = kpca_precomp.fit(gram_mat)
        '''
        if self.precomputed:
            if isinstance(X, list):
                raise TypeError('When precomputed is True, X must be a ndarray Gram matrix of persistence diagrams.')
        else:
            if isinstance(X, ndarray):
                raise TypeError('When precomputed is False, X must be a list of persistence diagrams.')
            # remember the training diagrams: transform needs them to build
            # the cross Gram matrix for new diagrams
            self.diagrams = X
            X = self.kernel.compute_matrix(X)
        self.kPCA = self.kPCA.fit(X, y)
        return self

    def transform(self, X):
        '''Project new persistence diagrams into the embedding space.

        Parameters
        ----------
        `X` : {array-like of shape `(n_new_diagrams, n_diagrams)`} or {list of length `n_new_diagrams`}
            Either a precomputed (cross) Gram matrix of shape `(n_new_diagrams, n_diagrams)`
            (between the new persistence diagrams and the training set diagrams, if
            `precomputed` was set to `True`) or a list of `n_new_diagrams` many persistence
            diagrams (otherwise).

        Returns
        -------
        `X_new` : ndarray
            The embedding of the new persistence diagrams.

        Raises
        ------
        Exception
            If `precomputed` is `False` and no training diagrams are stored, i.e. the
            model has not been fit yet.

        Examples
        --------
        >>> from tdads.machine_learning import diagram_kpca
        >>> from tdads.kernel import kernel
        >>> from ripser import ripser
        >>> import numpy as np
        >>> # create 2 datasets
        >>> data1 = np.random.random((100,2))
        >>> data2 = np.random.random((100,2))
        >>> # compute persistence diagrams with ripser
        >>> diagram1 = ripser(data1)
        >>> diagram2 = ripser(data2)
        >>> # fit models (regular and precomputed) with the
        >>> # persistence Fisher kernel (sigma = t = 1)
        >>> kpca = diagram_kpca()
        >>> kpca_fitted = kpca.fit([diagram1, diagram2]) # or
        >>> pfk = kernel()
        >>> gram_mat = pfk.compute_matrix([diagram1, diagram2])
        >>> kpca_precomp = diagram_kpca(precomputed = True)
        >>> kpca_precomp_fitted = kpca_precomp.fit(gram_mat)
        >>> # create 2 new datasets and their diagrams
        >>> data3 = np.random.random((100,2))
        >>> data4 = np.random.random((100,2))
        >>> diagram3 = ripser(data3)
        >>> diagram4 = ripser(data4)
        >>> # project new data into 2D space
        >>> kpca_fitted.transform([diagram3, diagram4]) # or
        >>> cross_gram = pfk.compute_matrix([diagram3, diagram4], [diagram1, diagram2])
        >>> kpca_precomp_fitted.transform(cross_gram)
        '''
        if not self.precomputed:
            if self.diagrams is None:
                raise Exception('The model must be fit before new persistence diagrams can be transformed.')
            # cross Gram matrix: new diagrams (rows) against training diagrams (columns)
            X = self.kernel.compute_matrix(X, self.diagrams)
        return self.kPCA.transform(X)

    def fit_transform(self, X, y:any = None):
        '''Fit the data in X and compute the position of the persistence diagrams in the embedding space.

        Parameters
        ----------
        `X` : {array-like of shape `(n_diagrams, n_diagrams)`} or {list of length `n_diagrams`}
            Either a precomputed Gram matrix of `n_diagrams` many persistence diagrams
            (if `precomputed` was set to `True`) or a list of `n_diagrams` many persistence
            diagrams (otherwise).
        `y` : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        `X_new` : ndarray
            `X` transformed in the new space.

        Examples
        --------
        >>> from tdads.machine_learning import diagram_kpca
        >>> from tdads.kernel import kernel
        >>> from ripser import ripser
        >>> import numpy as np
        >>> # create 2 datasets
        >>> data1 = np.random.random((100,2))
        >>> data2 = np.random.random((100,2))
        >>> # compute persistence diagrams with ripser
        >>> diagram1 = ripser(data1)
        >>> diagram2 = ripser(data2)
        >>> # fit models (regular and precomputed) with the
        >>> # persistence Fisher kernel (sigma = t = 1) and
        >>> # project into 2D space
        >>> kpca = diagram_kpca()
        >>> kpca.fit_transform([diagram1, diagram2]) # or
        >>> pfk = kernel()
        >>> gram_mat = pfk.compute_matrix([diagram1, diagram2])
        >>> kpca_precomp = diagram_kpca(precomputed = True)
        >>> kpca_precomp.fit_transform(gram_mat)
        '''
        # fit returns this same instance, so there is no need to rebind self
        self.fit(X, y)
        return self.transform(X)
# class diagram_svm(): # '''Support vector machine for persistence diagrams.''' # def __init__(self, diagrams:list = None, cv:int = 1, dims:list = [0], sigmas:list = [1.0], ts:list = [1.0], precomputed:bool = False, Cs:list = [1.0], epsilons:list = [0.1], n_cores:int = cpu_count() - 1): # '''Support vector machines for persistence diagrams. # Parameters # ---------- # `diagrams` : list of persistence diagrams, default None # When `precomputed = True`, `diagrams` must be supplied in order to call the predict method. # `cv` : int, default 1 # The number of folds for cross validattion. The default is no cross-validation. # `dims` : list of int, default [0] # The homological dimensions in which to fit SVM models. # `sigmas` : list of float, default [1.0] # The values of `sigma` for the persistence Fisher kernel. # `ts` : list of float, default [1.0] # The values of `t` for the persistence Fisher kernel. # `precomputed` : bool, default False # If `True` then the `fit` method will expect precomputed Gram matrices for training, otherwise # a list of persistence diagrams. # `Cs` : list of float, default [1.0] # A list of regularization parameters. The strength of the regularization is inversely proportional to C. # Must be strictly positive. The penalty is a squared l2. # `epsilons` : list of float, default [0.1] # A list of epsilons in the epsilon-SVR model. If performing classification set `epsilon = None`. # `epsilons` specifies the epsilon-tubes within which no penalty is associated # in the training loss function with points predicted within a distance epsilon from the actual value. # Must be non-negative. # `n_cores` : int # The number of CPU cores to use for parallel computation of distance matrices. Default is the # number of available cores minus 1. # Attributes # ---------- # `precomputed` : bool # The input `precomputed` parameter. # `cv` : int # The input `cv` parameter. 
# `diagrams` : list of length `n_diagrams` # The input `diagrams` parameter for inference when `precomputed` is `False`. # `n_cores` : int # The input `n_cores` parameter. # `parameter_grid` : ndarray either of shape `(num_param_combos, 5)` or `(num_param_combos, 4)` # The cartesian product of all possible model parameter combinations (the number of which is `num_param_combos`). # The columns give the values of `dims`, `sigmas`, `ts`, `Cs` and `epsilons` (for regression) in that order, resulting # in five columns for regression and four for classification. # `models` : list of sklearn.svm._classes.SVR or sklearn.svm._classes.SVC of length `num_param_combos` # One model for each row of `parameter_grid` (to be fit with those parameters). # `final_model` : None or {sklearn.svm._classes.SVR or sklearn.svm._classes.SVC} # Initially None but becomes the optimal model object based on cross-validation results # once the `fit` method has been called. # `final_model_kernel` : None or tdads.kernel.kernel # Initially None but becomes the kernel object with parameters determined by `final_model` # once the `fit` method has been called. 
# ''' # if isinstance(precomputed, type(True)) == False: # raise Exception('precomputed must be True or False.') # self.precomputed = precomputed # if isinstance(cv, type(1)) == False: # raise Exception('cv must be an integer.') # if cv < 1: # raise Exception('cv must be at least 1.') # self.cv = cv # if isinstance(n_cores, type(1)) == False: # raise Exception('n_cores must be an integer.') # if n_cores < 1: # raise Exception('n_cores must be at least 1.') # self.n_cores = n_cores # if set([type(d) for d in dims]) != set([type(1)]): # raise Exception('Each dimension in dims must be an integer.') # if min(dims) < 0: # raise Exception('Each dimension in dims must be non-negative.') # if set([x in set([type(1), type(1.0)]) for x in set([type(s) for s in sigmas])]) != set([True]): # raise Exception('Each sigma value must be a number.') # if min(sigmas) <= 0: # raise Exception('Each sigma value must be positive.') # if set([x in set([type(1), type(1.0)]) for x in set([type(t) for t in ts])]) != set([True]): # raise Exception('Each t value must be a number.') # if min(ts) <= 0: # raise Exception('Each t value must be positive.') # if isinstance(Cs, type([0,1])) == False: # raise Exception('Cs must be a list.') # if isinstance(epsilons, type([0,1])) == False: # raise Exception('epsilons must be a list.') # if epsilons != None: # parameter_grid = product(dims, sigmas, ts, Cs, epsilons) # else: # parameter_grid = product(dims, sigmas, ts, Cs) # parameter_grid = concatenate([[array(x)] for x in parameter_grid]) # if epsilons == None: # self.models = [SVC(C=parameter_grid[i,3]) for i in range(len(parameter_grid))] # else: # self.models = [SVR(C=parameter_grid[i,3],epsilon=parameter_grid[i,4]) for i in range(len(parameter_grid))] # self.final_model = None # self.parameter_grid = parameter_grid # self.final_model_kernel = None # def __str__(self): # if self.parameter_grid.shape[1] == 4: # task = 'classification' # else: # task = 'regression' # if self.final_model == None: # 
fit_str = 'Model has not yet been fit.' # else: # fit_str = 'Model has been fit.' # return 'Support vector ' + task + ' object. ' + fit_str # def fit(self, X, y): # '''Fit the SVM model according to the training data. # Parameters # ---------- # `X` : {array-like of shape `(n_diagrams, n_diagrams)`} or {list of length `n_diagrams`} # Either a precomputed Gram matrix of `n_diagrams` many persistence diagrams (if `precomputed` was set to `True`) or a list of `n_diagrams` many persistence diagrams (otherwise). # `y` : array-like of shape `(n_diagrams,)` # Target values (class labels in classification, real numbers in regression). # Returns # ------- # `self` : object # The fitted estimator. # Examples # -------- # # DO! # ''' # # make row memberships for cv # if self.parameter_grid.shape[1] == 5: # 1 # else: # 1