Source code for dcor.homogeneity

"""
Functions for testing homogeneity of several distributions.

The functions in this module provide methods for testing if
the samples generated from two random vectors have the same
distribution.
"""
from __future__ import annotations

from typing import Callable, Sequence, TypeVar

import numpy as np

from . import distances as _distances
from ._energy import (
    EstimationStatistic,
    EstimationStatisticLike,
    _check_valid_energy_exponent,
    _energy_distance_from_distance_matrices,
    energy_distance,
)
from ._hypothesis import HypothesisTest, _permutation_test_with_sym_matrix
from ._utils import ArrayType, RandomLike, _transform_to_2d, array_namespace

Array = TypeVar("Array", bound=ArrayType)


def _energy_test_statistic_coefficient(
    n: int,
    m: int,
) -> float:
    """Coefficient of the test statistic."""
    return n * m / (n + m)


def _energy_test_statistic_from_distance_matrices(
    distance_xx: Array,
    distance_yy: Array,
    distance_xy: Array,
    n: int,
    m: int,
    average: Callable[[Array], Array] | None = None,
    estimation_stat: EstimationStatisticLike = EstimationStatistic.V_STATISTIC,
) -> Array:
    """Test statistic with precomputed distance matrices."""
    energy_distance = _energy_distance_from_distance_matrices(
        distance_xx=distance_xx,
        distance_yy=distance_yy,
        distance_xy=distance_xy,
        average=average,
        estimation_stat=estimation_stat,
    )

    return _energy_test_statistic_coefficient(n, m) * energy_distance


[docs]def energy_test_statistic(
    x: Array,
    y: Array,
    *,
    exponent: float = 1,
    average: Callable[[Array], Array] | None = None,
    estimation_stat: EstimationStatisticLike = EstimationStatistic.V_STATISTIC,
) -> Array:
    """
    Homogeneity statistic.

    Computes the statistic for homogeneity based on the energy distance, for
    random vectors corresponding to :math:`x` and :math:`y`.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        exponent: Exponent of the Euclidean distance, in the range
            :math:`(0, 2)`.
        average: A function that will be used to calculate an average of
            distances. This defaults to np.mean.
        estimation_stat: If EstimationStatistic.U_STATISTIC, calculate energy
            distance using Hoeffding's unbiased U-statistics. Otherwise, use
            von Mises's biased V-statistics.
            If this is provided as a string, it will first be converted to
            an EstimationStatistic enum instance.

    Returns:
        Value of the statistic for homogeneity based on the energy distance.

    See Also:
        energy_distance

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1]])
        >>> dcor.homogeneity.energy_test_statistic(a, a)
        0.0
        >>> dcor.homogeneity.energy_test_statistic(a, b) # doctest: +ELLIPSIS
        35.2766732...
        >>> dcor.homogeneity.energy_test_statistic(b, b)
        0.0

    """
    x, y = _transform_to_2d(x, y)

    n = x.shape[0]
    m = y.shape[0]

    coefficient = _energy_test_statistic_coefficient(n, m)

    return coefficient * energy_distance(
        x,
        y,
        exponent=exponent,
        average=average,
        estimation_stat=estimation_stat,
    )


def _energy_test_statistic_multivariate_from_distance_matrix(
    distance: Array,
    indexes: Sequence[int],
    sizes: Sequence[int],
    average: Callable[[Array], Array] | None = None,
    estimation_stat: EstimationStatisticLike = EstimationStatistic.V_STATISTIC,
) -> Array:
    """Statistic for several random vectors given the distance matrix."""
    first_iter = True

    for i, _ in enumerate(indexes):
        for j in range(i + 1, len(indexes)):
            slice_i = slice(indexes[i], indexes[i] + sizes[i])
            slice_j = slice(indexes[j], indexes[j] + sizes[j])

            n = sizes[i]
            m = sizes[j]

            distance_xx = distance[slice_i, slice_i]
            distance_yy = distance[slice_j, slice_j]
            distance_xy = distance[slice_i, slice_j]

            pairwise_energy = _energy_test_statistic_from_distance_matrices(
                distance_xx=distance_xx,
                distance_yy=distance_yy,
                distance_xy=distance_xy,
                n=n,
                m=m,
                average=average,
                estimation_stat=estimation_stat,
            )

            if first_iter:
                energy = pairwise_energy
                first_iter = False
            else:
                energy += pairwise_energy

    return energy


[docs]def energy_test(
    *args: T,
    num_resamples: int = 0,
    exponent: float = 1,
    random_state: RandomLike = None,
    average: Callable[[Array], Array] | None = None,
    estimation_stat: EstimationStatisticLike = EstimationStatistic.V_STATISTIC,
    n_jobs: int | None = 1,
) -> HypothesisTest[Array]:
    """
    Test of homogeneity based on the energy distance.

    Compute the test of homogeneity based on the energy distance, for
    an arbitrary number of random vectors.

    The test is a permutation test where the null hypothesis is that all
    random vectors have the same distribution.

    Args:
        args: Random vectors. The columns correspond with the individual random
            variables while the rows are individual instances of the random
            vector.
        num_resamples: Number of permutations resamples to take in the
            permutation test.
        exponent: Exponent of the Euclidean distance, in the range
            :math:`(0, 2)`.
        random_state: Random state to generate the permutations.
        average: A function that will be used to calculate an average of
            distances. This defaults to np.mean.
        estimation_stat: If EstimationStatistic.U_STATISTIC, calculate energy
            distance using Hoeffding's unbiased U-statistics. Otherwise, use
            von Mises's biased V-statistics. If this is provided as a string,
            it will first be converted to an EstimationStatistic enum instance.
        n_jobs: Number of jobs executed in parallel by Joblib.

    Returns:
        Results of the hypothesis test.

    See Also:
        energy_distance

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1]])
        >>> c = np.array([[1000, 0, 0, 1000],
        ...               [0, 1000, 1000, 1000],
        ...               [1000, 1000, 1000, 1000]])
        >>> dcor.homogeneity.energy_test(a, a)
        HypothesisTest(pvalue=1.0, statistic=0.0)
        >>> dcor.homogeneity.energy_test(a, b) # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=35.2766732...)
        >>> dcor.homogeneity.energy_test(b, b)
        HypothesisTest(pvalue=1.0, statistic=0.0)
        >>> dcor.homogeneity.energy_test(a, b, num_resamples=5, random_state=0)
        HypothesisTest(pvalue=0.1666666..., statistic=35.2766732...)
        >>> dcor.homogeneity.energy_test(a, b, num_resamples=5, random_state=6)
        HypothesisTest(pvalue=0.3333333..., statistic=35.2766732...)
        >>> dcor.homogeneity.energy_test(a, c, num_resamples=7, random_state=0)
        HypothesisTest(pvalue=0.125, statistic=4233.8935035...)

        A different exponent for the Euclidean distance in the range
        :math:`(0, 2)` can be used:

        >>> dcor.homogeneity.energy_test(a, b, exponent=1.5)
        ...                                               # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=171.0623923...)

    """
    samples = list(_transform_to_2d(*args))

    num_samples = len(samples)

    _check_valid_energy_exponent(exponent)

    sample_sizes = tuple(a.shape[0] for a in samples)

    xp = array_namespace(*samples)

    # NumPy namespace has no concat function yet
    try:
        concat = xp.concat
    except AttributeError:
        concat = np.concatenate
    pooled_samples = concat(samples)

    sample_indexes_array = np.zeros(num_samples, dtype=int)
    sample_indexes_array[1:] = np.cumsum(sample_sizes)[:-1]
    sample_indexes = tuple(sample_indexes_array)

    # Compute the distance matrix once
    sample_distances = _distances.pairwise_distances(
        pooled_samples,
        exponent=exponent,
    )

    # Use the energy statistic with appropiate values
    def statistic_function(distance_matrix: Array) -> Array:
        return _energy_test_statistic_multivariate_from_distance_matrix(
            distance=distance_matrix,
            indexes=sample_indexes,
            sizes=sample_sizes,
            average=average,
            estimation_stat=estimation_stat,
        )

    return _permutation_test_with_sym_matrix(
        sample_distances,
        statistic_function=statistic_function,
        num_resamples=num_resamples,
        random_state=random_state,
        n_jobs=n_jobs,
    )