import numpy as np
import warnings
from scipy import sparse
import pickle
import pygsp
import tasklogger
from . import base, graphs
_logger = tasklogger.get_tasklogger("graphtools")
def Graph(
    data,
    n_pca=None,
    rank_threshold=None,
    knn=5,
    decay=40,
    bandwidth=None,
    bandwidth_scale=1.0,
    knn_max=None,
    anisotropy=0,
    distance="euclidean",
    thresh=1e-4,
    kernel_symm="+",
    theta=None,
    precomputed=None,
    beta=1,
    sample_idx=None,
    adaptive_k=None,
    n_landmark=None,
    n_svd=100,
    n_jobs=-1,
    verbose=False,
    random_state=None,
    graphtype="auto",
    use_pygsp=False,
    initialize=True,
    **kwargs
):
    """Create a graph built on data.

    Automatically selects the appropriate DataGraph subclass based on
    chosen parameters.

    Selection criteria:

    - if `graphtype` is given, this will be respected
    - otherwise:

      - if `sample_idx` is given, an MNNGraph will be created
      - if `precomputed` is not given, and either `decay` is `None` or
        `thresh` is given, a kNNGraph will be created
      - otherwise, a TraditionalGraph will be created.

    Incompatibilities:

    - MNNGraph and kNNGraph cannot be precomputed
    - kNNGraph and TraditionalGraph do not accept sample indices

    Parameters
    ----------
    data : array-like, shape=[n_samples,n_features]
        accepted types: `numpy.ndarray`, `scipy.sparse.spmatrix`.
        TODO: accept pandas dataframes
    n_pca : {`int`, `None`, `bool`, 'auto'}, optional (default: `None`)
        number of PC dimensions to retain for graph building.
        If n_pca in `[None, False, 0]`, uses the original data.
        If 'auto' or `True` then estimate using a singular value threshold
        Note: if data is sparse, uses SVD instead of PCA
        TODO: should we subtract and store the mean?
    rank_threshold : `float`, 'auto', optional (default: 'auto')
        threshold to use when estimating rank for
        `n_pca in [True, 'auto']`.
        If 'auto', this threshold is
        s_max * eps * max(n_samples, n_features)
        where s_max is the maximum singular value of the data matrix
        and eps is numerical precision. [press2007]_.
    knn : `int`, optional (default: 5)
        Number of nearest neighbors (including self) to use to build the graph
    decay : `int` or `None`, optional (default: 40)
        Rate of alpha decay to use. If `None`, alpha decay is not used and a
        vanilla k-Nearest Neighbors graph is returned.
    bandwidth : `float`, list-like, `callable`, or `None`, optional (default: `None`)
        Fixed bandwidth to use. If given, overrides `knn`. Can be a single
        bandwidth, list-like (shape=[n_samples]) of bandwidths for each
        sample, or a `callable` that takes in an `n x n` distance matrix and
        returns a single value or list-like of length n (shape=[n_samples])
    bandwidth_scale : `float`, optional (default : 1.0)
        Rescaling factor for bandwidth.
    knn_max : `int` or `None`, optional (default : `None`)
        Maximum number of neighbors with nonzero affinity
    anisotropy : float, optional (default: 0)
        Level of anisotropy between 0 and 1
        (alpha in Coifman & Lafon, 2006)
    distance : `str`, optional (default: `'euclidean'`)
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
        TODO: actually sklearn.neighbors has even more choices
    thresh : `float`, optional (default: `1e-4`)
        Threshold above which to calculate alpha decay kernel.
        All affinities below `thresh` will be set to zero in order to save
        on time and memory constraints.
    kernel_symm : string, optional (default: '+')
        Defines method of kernel symmetrization.
        '+' : additive
        '*' : multiplicative
        'mnn' : min-max MNN symmetrization
        'none' : no symmetrization
    theta : float (default: None)
        Min-max symmetrization constant or matrix. Only used if
        kernel_symm='mnn'.
        K = `theta * min(K, K.T) + (1 - theta) * max(K, K.T)`
    precomputed : {'distance', 'affinity', 'adjacency', `None`}, optional (default: `None`)
        If the graph is precomputed, this variable denotes which graph
        matrix is provided as `data`.
        Only one of `precomputed` and `n_pca` can be set.
    beta : float, optional (default: 1)
        Multiply between-batch connections by beta
    sample_idx : array-like
        Batch index for MNN kernel
    adaptive_k : `{'min', 'mean', 'sqrt', 'none'}` (default: None)
        Weights MNN kernel adaptively using the number of cells in
        each sample according to the selected method.
    n_landmark : `int` or `None`, optional (default: `None`)
        number of landmarks to use. If `None`, landmarks are not used.
    n_svd : `int`, optional (default: 100)
        number of SVD components to use for spectral clustering
    random_state : `int` or `None`, optional (default: `None`)
        Random state for random PCA
    verbose : `bool`, optional (default: `False`)
        Verbosity.
        TODO: should this be an integer instead to allow multiple
        levels of verbosity?
    n_jobs : `int`, optional (default : -1)
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    graphtype : {'exact', 'knn', 'mnn', 'auto'} (Default: 'auto')
        Manually selects graph type. Only recommended for expert users
    use_pygsp : `bool` (Default: `False`)
        If true, inherits from `pygsp.graphs.Graph`.
    initialize : `bool` (Default: `True`)
        If True, initialize the kernel matrix on instantiation
    **kwargs : extra arguments for `pygsp.graphs.Graph`

    Returns
    -------
    G : `DataGraph`

    Raises
    ------
    ValueError : if selected parameters are incompatible.

    References
    ----------
    .. [press2007] W. Press, S. Teukolsky, W. Vetterling and B. Flannery,
        "Numerical Recipes (3rd edition)",
        Cambridge University Press, 2007, page 795.
    """
    _logger.set_level(verbose)
    if sample_idx is not None and len(np.unique(sample_idx)) == 1:
        # a single batch cannot be MNN-corrected; fall back to a plain graph
        warnings.warn("Only one unique sample. Not using MNNGraph")
        sample_idx = None
        if graphtype == "mnn":
            graphtype = "auto"
    if graphtype == "auto":
        # automatic graph selection
        if sample_idx is not None:
            # only mnn does batch correction
            graphtype = "mnn"
        elif precomputed is not None:
            # precomputed requires exact graph
            graphtype = "exact"
        elif decay is None:
            # knn kernel
            graphtype = "knn"
        elif (thresh == 0 and knn_max is None) or callable(bandwidth):
            # compute full distance matrix
            graphtype = "exact"
        else:
            # decay kernel with nonzero threshold - knn is more efficient
            graphtype = "knn"
    # set base graph type
    if graphtype == "knn":
        basegraph = graphs.kNNGraph
        if precomputed is not None:
            raise ValueError(
                "kNNGraph does not support precomputed "
                "values. Use `graphtype='exact'` or "
                "`precomputed=None`"
            )
        if sample_idx is not None:
            raise ValueError(
                "kNNGraph does not support batch "
                "correction. Use `graphtype='mnn'` or "
                "`sample_idx=None`"
            )
    elif graphtype == "mnn":
        basegraph = graphs.MNNGraph
        if precomputed is not None:
            raise ValueError(
                "MNNGraph does not support precomputed "
                "values. Use `graphtype='exact'` and "
                "`sample_idx=None` or `precomputed=None`"
            )
    elif graphtype == "exact":
        basegraph = graphs.TraditionalGraph
        if sample_idx is not None:
            raise ValueError(
                "TraditionalGraph does not support batch "
                "correction. Use `graphtype='mnn'` or "
                "`sample_idx=None`"
            )
    else:
        raise ValueError(
            "graphtype '{}' not recognized. Choose from "
            "['knn', 'mnn', 'exact', 'auto']".format(graphtype)
        )
    # add landmark / PyGSP mixins if necessary
    parent_classes = [basegraph]
    msg = "Building {} graph".format(graphtype)
    if n_landmark is not None:
        parent_classes.append(graphs.LandmarkGraph)
        msg = msg + " with landmarks"
    if use_pygsp:
        parent_classes.append(base.PyGSPGraph)
        if len(parent_classes) > 2:
            msg = msg + " with PyGSP inheritance"
        else:
            msg = msg + " and PyGSP inheritance"
    _logger.debug(msg)
    # resolve the concrete class by name, e.g. graphs.kNNLandmarkPyGSPGraph.
    # NOTE: getattr raises AttributeError for a missing attribute; the
    # previous eval() + `except NameError` guard could never catch it.
    class_names = [p.__name__.replace("Graph", "") for p in parent_classes]
    try:
        graph_class = getattr(graphs, "".join(class_names) + "Graph")
    except AttributeError:
        raise RuntimeError("unknown graph classes {}".format(parent_classes))
    params = kwargs
    # snapshot the arguments bound above; forward to the graph class only
    # those its parents declare. A plain dict lookup (unlike eval) cannot
    # accidentally pick up module globals or builtins.
    argument_values = locals()
    for parent_class in parent_classes:
        for param in parent_class._get_param_names():
            if param in argument_values:
                params[param] = argument_values[param]
            # else: keyword argument not specified above - no problem
    # build graph and return
    _logger.debug(
        "Initializing {} with arguments {}".format(
            parent_classes,
            ", ".join(
                [
                    "{}='{}'".format(key, value)
                    for key, value in params.items()
                    if key != "data"
                ]
            ),
        )
    )
    return graph_class(**params)
def from_igraph(G, attribute="weight", **kwargs):
    """Convert an igraph.Graph to a graphtools.Graph.

    Creates a graphtools.graphs.TraditionalGraph with a
    precomputed adjacency matrix.

    Parameters
    ----------
    G : igraph.Graph
        Graph to be converted
    attribute : str, optional (default: "weight")
        attribute containing edge weights, if any.
        If None, unweighted graph is built
    kwargs
        keyword arguments for graphtools.Graph

    Returns
    -------
    G : graphtools.graphs.TraditionalGraph
    """
    if "precomputed" in kwargs:
        # an igraph adjacency matrix can only be used as 'adjacency';
        # warn about and discard any conflicting user setting
        if kwargs["precomputed"] != "adjacency":
            warnings.warn(
                "Cannot build graph from igraph with precomputed={}. "
                "Use 'adjacency' instead.".format(kwargs["precomputed"]),
                UserWarning,
            )
        del kwargs["precomputed"]
    try:
        K = G.get_adjacency(attribute=attribute).data
    except ValueError as e:
        if str(e) != "Attribute does not exist":
            # unrelated failure: don't mask it (previously this fell
            # through and crashed with an UnboundLocalError on K)
            raise
        warnings.warn(
            "Edge attribute {} not found. "
            "Returning unweighted graph".format(attribute),
            UserWarning,
        )
        K = G.get_adjacency(attribute=None).data
    return Graph(sparse.coo_matrix(K), precomputed="adjacency", **kwargs)
def read_pickle(path):
    """Load a pickled graphtools object (or any pickled object) from a file.

    Parameters
    ----------
    path : str
        File path from which the pickled object will be loaded.

    Returns
    -------
    The unpickled object, normally a `graphtools.base.BaseGraph`.
    """
    # SECURITY NOTE: pickle.load can execute arbitrary code from the file;
    # only unpickle files from trusted sources.
    with open(path, "rb") as handle:
        obj = pickle.load(handle)
    if isinstance(obj, base.BaseGraph):
        # a pickled PyGSP graph stores its logger as a plain string name;
        # restore it to a real logger object
        if isinstance(obj, base.PyGSPGraph) and isinstance(obj.logger, str):
            obj.logger = pygsp.utils.build_logger(obj.logger)
    else:
        warnings.warn("Returning object that is not a graphtools.base.BaseGraph")
    return obj