Source code for graphtools.api

import numpy as np
import warnings
from scipy import sparse
import pickle
import pygsp
import tasklogger

from . import base, graphs

_logger = tasklogger.get_tasklogger("graphtools")


[docs]def Graph( data, n_pca=None, rank_threshold=None, knn=5, decay=40, bandwidth=None, bandwidth_scale=1.0, knn_max=None, anisotropy=0, distance="euclidean", thresh=1e-4, kernel_symm="+", theta=None, precomputed=None, beta=1, sample_idx=None, adaptive_k=None, n_landmark=None, n_svd=100, n_jobs=-1, verbose=False, random_state=None, graphtype="auto", use_pygsp=False, initialize=True, **kwargs ): """Create a graph built on data. Automatically selects the appropriate DataGraph subclass based on chosen parameters. Selection criteria: - if `graphtype` is given, this will be respected - otherwise: -- if `sample_idx` is given, an MNNGraph will be created -- if `precomputed` is not given, and either `decay` is `None` or `thresh` is given, a kNNGraph will be created - otherwise, a TraditionalGraph will be created. Incompatibilities: - MNNGraph and kNNGraph cannot be precomputed - kNNGraph and TraditionalGraph do not accept sample indices Parameters ---------- data : array-like, shape=[n_samples,n_features] accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`. TODO: accept pandas dataframes' n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`) number of PC dimensions to retain for graph building. If n_pca in `[None, False, 0]`, uses the original data. If 'auto' or `True` then estimate using a singular value threshold Note: if data is sparse, uses SVD instead of PCA TODO: should we subtract and store the mean? rank_threshold : `float`, 'auto', optional (default: 'auto') threshold to use when estimating rank for `n_pca in [True, 'auto']`. If 'auto', this threshold is s_max * eps * max(n_samples, n_features) where s_max is the maximum singular value of the data matrix and eps is numerical precision. [press2007]_. knn : `int`, optional (default: 5) Number of nearest neighbors (including self) to use to build the graph decay : `int` or `None`, optional (default: 40) Rate of alpha decay to use. If `None`, alpha decay is not used and a vanilla k-Nearest Neighbors graph is returned. bandwidth : `float`, list-like,`callable`, or `None`, optional (default: `None`) Fixed bandwidth to use. If given, overrides `knn`. Can be a single bandwidth, list-like (shape=[n_samples]) of bandwidths for each sample, or a `callable` that takes in an `n x n` distance matrix and returns a a single value or list-like of length n (shape=[n_samples]) bandwidth_scale : `float`, optional (default : 1.0) Rescaling factor for bandwidth. knn_max : `int` or `None`, optional (default : `None`) Maximum number of neighbors with nonzero affinity anisotropy : float, optional (default: 0) Level of anisotropy between 0 and 1 (alpha in Coifman & Lafon, 2006) distance : `str`, optional (default: `'euclidean'`) Any metric from `scipy.spatial.distance` can be used distance metric for building kNN graph. TODO: actually sklearn.neighbors has even more choices thresh : `float`, optional (default: `1e-4`) Threshold above which to calculate alpha decay kernel. All affinities below `thresh` will be set to zero in order to save on time and memory constraints. kernel_symm : string, optional (default: '+') Defines method of kernel symmetrization. '+' : additive '*' : multiplicative 'mnn' : min-max MNN symmetrization 'none' : no symmetrization theta: float (default: None) Min-max symmetrization constant or matrix. Only used if kernel_symm='mnn'. K = `theta * min(K, K.T) + (1 - theta) * max(K, K.T)` precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional (default: `None`) If the graph is precomputed, this variable denotes which graph matrix is provided as `data`. Only one of `precomputed` and `n_pca` can be set. beta: float, optional(default: 1) Multiply between - batch connections by beta sample_idx: array-like Batch index for MNN kernel adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: None) Weights MNN kernel adaptively using the number of cells in each sample according to the selected method. n_landmark : `int`, optional (default: 2000) number of landmarks to use n_svd : `int`, optional (default: 100) number of SVD components to use for spectral clustering random_state : `int` or `None`, optional (default: `None`) Random state for random PCA verbose : `bool`, optional (default: `True`) Verbosity. TODO: should this be an integer instead to allow multiple levels of verbosity? n_jobs : `int`, optional (default : 1) The number of jobs to use for the computation. If -1 all CPUs are used. If 1 is given, no parallel computing code is used at all, which is useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for n_jobs = -2, all CPUs but one are used graphtype : {'exact', 'knn', 'mnn', 'auto'} (Default: 'auto') Manually selects graph type. Only recommended for expert users use_pygsp : `bool` (Default: `False`) If true, inherits from `pygsp.graphs.Graph`. initialize : `bool` (Default: `True`) If True, initialize the kernel matrix on instantiation **kwargs : extra arguments for `pygsp.graphs.Graph` Returns ------- G : `DataGraph` Raises ------ ValueError : if selected parameters are incompatible. References ---------- .. [press2007] W. Press, S. Teukolsky, W. Vetterling and B. Flannery, “Numerical Recipes (3rd edition)”, Cambridge University Press, 2007, page 795. """ _logger.set_level(verbose) if sample_idx is not None and len(np.unique(sample_idx)) == 1: warnings.warn("Only one unique sample. Not using MNNGraph") sample_idx = None if graphtype == "mnn": graphtype = "auto" if graphtype == "auto": # automatic graph selection if sample_idx is not None: # only mnn does batch correction graphtype = "mnn" elif precomputed is not None: # precomputed requires exact graph graphtype = "exact" elif decay is None: # knn kernel graphtype = "knn" elif (thresh == 0 and knn_max is None) or callable(bandwidth): # compute full distance matrix graphtype = "exact" else: # decay kernel with nonzero threshold - knn is more efficient graphtype = "knn" # set base graph type if graphtype == "knn": basegraph = graphs.kNNGraph if precomputed is not None: raise ValueError( "kNNGraph does not support precomputed " "values. Use `graphtype='exact'` or " "`precomputed=None`" ) if sample_idx is not None: raise ValueError( "kNNGraph does not support batch " "correction. Use `graphtype='mnn'` or " "`sample_idx=None`" ) elif graphtype == "mnn": basegraph = graphs.MNNGraph if precomputed is not None: raise ValueError( "MNNGraph does not support precomputed " "values. Use `graphtype='exact'` and " "`sample_idx=None` or `precomputed=None`" ) elif graphtype == "exact": basegraph = graphs.TraditionalGraph if sample_idx is not None: raise ValueError( "TraditionalGraph does not support batch " "correction. Use `graphtype='mnn'` or " "`sample_idx=None`" ) else: raise ValueError( "graphtype '{}' not recognized. Choose from " "['knn', 'mnn', 'exact', 'auto']".format(graphtype) ) # set add landmarks if necessary parent_classes = [basegraph] msg = "Building {} graph".format(graphtype) if n_landmark is not None: parent_classes.append(graphs.LandmarkGraph) msg = msg + " with landmarks" if use_pygsp: parent_classes.append(base.PyGSPGraph) if len(parent_classes) > 2: msg = msg + " with PyGSP inheritance" else: msg = msg + " and PyGSP inheritance" _logger.debug(msg) class_names = [p.__name__.replace("Graph", "") for p in parent_classes] try: Graph = eval("graphs." + "".join(class_names) + "Graph") except NameError: raise RuntimeError("unknown graph classes {}".format(parent_classes)) params = kwargs for parent_class in parent_classes: for param in parent_class._get_param_names(): try: params[param] = eval(param) except NameError: # keyword argument not specified above - no problem pass # build graph and return _logger.debug( "Initializing {} with arguments {}".format( parent_classes, ", ".join( [ "{}='{}'".format(key, value) for key, value in params.items() if key != "data" ] ), ) ) return Graph(**params)
[docs]def from_igraph(G, attribute="weight", **kwargs): """Convert an igraph.Graph to a graphtools.Graph Creates a graphtools.graphs.TraditionalGraph with a precomputed adjacency matrix Parameters ---------- G : igraph.Graph Graph to be converted attribute : str, optional (default: "weight") attribute containing edge weights, if any. If None, unweighted graph is built kwargs keyword arguments for graphtools.Graph Returns ------- G : graphtools.graphs.TraditionalGraph """ if "precomputed" in kwargs: if kwargs["precomputed"] != "adjacency": warnings.warn( "Cannot build graph from igraph with precomputed={}. " "Use 'adjacency' instead.".format(kwargs["precomputed"]), UserWarning, ) del kwargs["precomputed"] try: K = G.get_adjacency(attribute=attribute).data except ValueError as e: if str(e) == "Attribute does not exist": warnings.warn( "Edge attribute {} not found. " "Returning unweighted graph".format(attribute), UserWarning, ) K = G.get_adjacency(attribute=None).data return Graph(sparse.coo_matrix(K), precomputed="adjacency", **kwargs)
[docs]def read_pickle(path): """Load pickled Graphtools object (or any object) from file. Parameters ---------- path : str File path where the pickled object will be loaded. """ with open(path, "rb") as f: G = pickle.load(f) if not isinstance(G, base.BaseGraph): warnings.warn("Returning object that is not a graphtools.base.BaseGraph") elif isinstance(G, base.PyGSPGraph) and isinstance(G.logger, str): G.logger = pygsp.utils.build_logger(G.logger) return G