Source code for holoviews.operation.stats

import numpy as np
import param

from ..core import Dataset, Dimension, NdOverlay
from ..core.operation import Operation
from ..core.util import cartesian_product, isfinite
from ..element import Area, Bivariate, Contours, Curve, Distribution, Image, Polygons
from .element import contours


def _kde_support(bin_range, bw, gridsize, cut, clip):
    """Establish support for a kernel density estimate."""
    kmin, kmax = bin_range[0] - bw * cut, bin_range[1] + bw * cut
    if isfinite(clip[0]):
        kmin = max(kmin, clip[0])
    if isfinite(clip[1]):
        kmax = min(kmax, clip[1])
    return np.linspace(kmin, kmax, gridsize)


[docs]class univariate_kde(Operation): """ Computes a 1D kernel density estimate (KDE) along the supplied dimension. Kernel density estimation is a non-parametric way to estimate the probability density function of a random variable. The KDE works by placing a Gaussian kernel at each sample with the supplied bandwidth. These kernels are then summed to produce the density estimate. By default a good bandwidth is determined using the bw_method but it may be overridden by an explicit value. """ bw_method = param.ObjectSelector(default='scott', objects=['scott', 'silverman'], doc=""" Method of automatically determining KDE bandwidth""") bandwidth = param.Number(default=None, doc=""" Allows supplying explicit bandwidth value rather than relying on scott or silverman method.""") cut = param.Number(default=3, doc=""" Draw the estimate to cut * bw from the extreme data points.""") bin_range = param.NumericTuple(default=None, length=2, doc=""" Specifies the range within which to compute the KDE.""") dimension = param.String(default=None, doc=""" Along which dimension of the Element to compute the KDE.""") filled = param.Boolean(default=True, doc=""" Controls whether to return filled or unfilled KDE.""") n_samples = param.Integer(default=100, doc=""" Number of samples to compute the KDE over.""") groupby = param.ClassSelector(default=None, class_=(str, Dimension), doc=""" Defines a dimension to group the Histogram returning an NdOverlay of Histograms.""") _per_element = True def _process(self, element, key=None): if self.p.groupby: if not isinstance(element, Dataset): raise ValueError('Cannot use histogram groupby on non-Dataset Element') grouped = element.groupby(self.p.groupby, group_type=Dataset, container_type=NdOverlay) self.p.groupby = None return grouped.map(self._process, Dataset) try: from scipy import stats from scipy.linalg import LinAlgError except ImportError: raise ImportError(f'{type(self).__name__} operation requires SciPy to be installed.') from None params = {} if isinstance(element, Distribution): selected_dim = element.kdims[0] if element.group != type(element).__name__: params['group'] = element.group params['label'] = element.label vdim = element.vdims[0] vdim_name = f'{selected_dim.name}_density' vdims = [vdim.clone(vdim_name, label='Density') if vdim.name == 'Density' else vdim] else: if self.p.dimension: selected_dim = element.get_dimension(self.p.dimension) else: dimensions = element.vdims+element.kdims if not dimensions: raise ValueError("%s element does not declare any dimensions " "to compute the kernel density estimate on." % type(element).__name__) selected_dim = dimensions[0] vdim_name = f'{selected_dim.name}_density' vdims = [Dimension(vdim_name, label='Density')] data = element.dimension_values(selected_dim) bin_range = self.p.bin_range or element.range(selected_dim) if bin_range == (0, 0) or any(not isfinite(r) for r in bin_range): bin_range = (0, 1) elif bin_range[0] == bin_range[1]: bin_range = (bin_range[0]-0.5, bin_range[1]+0.5) element_type = Area if self.p.filled else Curve data = data[isfinite(data)] if len(data) else [] if len(data) > 1: try: kde = stats.gaussian_kde(data) except LinAlgError: return element_type([], selected_dim, vdims, **params) if self.p.bandwidth: kde.set_bandwidth(self.p.bandwidth) bw = kde.scotts_factor() * data.std(ddof=1) if self.p.bin_range: xs = np.linspace(bin_range[0], bin_range[1], self.p.n_samples) else: xs = _kde_support(bin_range, bw, self.p.n_samples, self.p.cut, selected_dim.range) ys = kde.evaluate(xs) else: xs = np.linspace(bin_range[0], bin_range[1], self.p.n_samples) ys = np.full_like(xs, 0) return element_type((xs, ys), kdims=[selected_dim], vdims=vdims, **params)
[docs]class bivariate_kde(Operation): """ Computes a 2D kernel density estimate (KDE) of the first two dimensions in the input data. Kernel density estimation is a non-parametric way to estimate the probability density function of a random variable. The KDE works by placing 2D Gaussian kernel at each sample with the supplied bandwidth. These kernels are then summed to produce the density estimate. By default a good bandwidth is determined using the bw_method but it may be overridden by an explicit value. """ contours = param.Boolean(default=True, doc=""" Whether to compute contours from the KDE, determines whether to return an Image or Contours/Polygons.""") bw_method = param.ObjectSelector(default='scott', objects=['scott', 'silverman'], doc=""" Method of automatically determining KDE bandwidth""") bandwidth = param.Number(default=None, doc=""" Allows supplying explicit bandwidth value rather than relying on scott or silverman method.""") cut = param.Number(default=3, doc=""" Draw the estimate to cut * bw from the extreme data points.""") filled = param.Boolean(default=False, doc=""" Controls whether to return filled or unfilled contours.""") levels = param.ClassSelector(default=10, class_=(list, int), doc=""" A list of scalar values used to specify the contour levels.""") n_samples = param.Integer(default=100, doc=""" Number of samples to compute the KDE over.""") x_range = param.NumericTuple(default=None, length=2, doc=""" The x_range as a tuple of min and max x-value. Auto-ranges if set to None.""") y_range = param.NumericTuple(default=None, length=2, doc=""" The x_range as a tuple of min and max y-value. Auto-ranges if set to None.""") _per_element = True def _process(self, element, key=None): try: from scipy import stats except ImportError: raise ImportError(f'{type(self).__name__} operation requires SciPy to be installed.') from None if len(element.dimensions()) < 2: raise ValueError("bivariate_kde can only be computed on elements " "declaring at least two dimensions.") xdim, ydim = element.dimensions()[:2] params = {} if isinstance(element, Bivariate): if element.group != type(element).__name__: params['group'] = element.group params['label'] = element.label vdim = element.vdims[0] else: vdim = 'Density' data = element.array([0, 1]).T xmin, xmax = self.p.x_range or element.range(0) ymin, ymax = self.p.y_range or element.range(1) if any(not isfinite(v) for v in (xmin, xmax)): xmin, xmax = -0.5, 0.5 elif xmin == xmax: xmin, xmax = xmin-0.5, xmax+0.5 if any(not isfinite(v) for v in (ymin, ymax)): ymin, ymax = -0.5, 0.5 elif ymin == ymax: ymin, ymax = ymin-0.5, ymax+0.5 data = data[:, isfinite(data).min(axis=0)] if data.shape[1] > 1 else np.empty((2, 0)) if data.shape[1] > 1: kde = stats.gaussian_kde(data) if self.p.bandwidth: kde.set_bandwidth(self.p.bandwidth) bw = kde.scotts_factor() * data.std(ddof=1) if self.p.x_range: xs = np.linspace(xmin, xmax, self.p.n_samples) else: xs = _kde_support((xmin, xmax), bw, self.p.n_samples, self.p.cut, xdim.range) if self.p.y_range: ys = np.linspace(ymin, ymax, self.p.n_samples) else: ys = _kde_support((ymin, ymax), bw, self.p.n_samples, self.p.cut, ydim.range) xx, yy = cartesian_product([xs, ys], False) positions = np.vstack([xx.ravel(), yy.ravel()]) f = np.reshape(kde(positions).T, xx.shape) elif self.p.contours: eltype = Polygons if self.p.filled else Contours return eltype([], kdims=[xdim, ydim], vdims=[vdim]) else: xs = np.linspace(xmin, xmax, self.p.n_samples) ys = np.linspace(ymin, ymax, self.p.n_samples) f = np.zeros((self.p.n_samples, self.p.n_samples)) img = Image((xs, ys, f.T), kdims=element.dimensions()[:2], vdims=[vdim], **params) if self.p.contours: cntr = contours(img, filled=self.p.filled, levels=self.p.levels) return cntr.clone(cntr.data[1:], **params) return img