# Source code for dcor.independence

"""
Functions for testing independence of several distributions.

The functions in this module provide methods for testing if
the samples generated from two random vectors are independent.
"""
from __future__ import annotations

from typing import TypeVar

import numpy as np
import scipy.stats

from ._dcor import u_distance_correlation_sqr
from ._dcor_internals import (
    _check_same_n_elements,
    _distance_matrix_generic,
    _u_distance_matrix,
    double_centered,
    mean_product,
    u_complementary_projection,
    u_product,
)
from ._hypothesis import HypothesisTest, _permutation_test_with_sym_matrix
from ._utils import (
    ArrayType,
    RandomLike,
    _random_state_init,
    _sqrt,
    _transform_to_2d,
)

Array = TypeVar("Array", bound=ArrayType)


def distance_covariance_test(
    x: Array,
    y: Array,
    *,
    num_resamples: int = 0,
    exponent: float = 1,
    random_state: RandomLike = None,
    n_jobs: int | None = 1,
) -> HypothesisTest[Array]:
    """
    Test of distance covariance independence.

    Compute the test of independence based on the distance
    covariance, for two random vectors.

    The test is a permutation test where the null hypothesis is that the two
    random vectors are independent. The statistic is the number of samples
    times the mean product of the double-centered distance matrices of
    ``x`` and ``y``.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        num_resamples: Number of permutations resamples to take in the
            permutation test.
        exponent: Exponent of the Euclidean distance, in the range
            :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of
            fractional Brownian motion.
        random_state: Random state to generate the permutations.
        n_jobs: Number of jobs executed in parallel by Joblib.

    Returns:
        Results of the hypothesis test.

    See Also:
        distance_covariance

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> dcor.independence.distance_covariance_test(a, a)
        HypothesisTest(pvalue=1.0, statistic=208.0)
        >>> dcor.independence.distance_covariance_test(a, b)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=11.75323056...)
        >>> dcor.independence.distance_covariance_test(b, b)
        HypothesisTest(pvalue=1.0, statistic=1.3604610...)
        >>> dcor.independence.distance_covariance_test(a, b,
        ... num_resamples=5, random_state=0)
        HypothesisTest(pvalue=0.8333333333333334, statistic=11.7532305...)
        >>> dcor.independence.distance_covariance_test(a, b,
        ... num_resamples=5, random_state=13)
        HypothesisTest(pvalue=0.5..., statistic=11.7532305...)
        >>> dcor.independence.distance_covariance_test(a, a,
        ... num_resamples=7, random_state=0)
        HypothesisTest(pvalue=0.125, statistic=208.0)

    """
    x, y = _transform_to_2d(x, y)

    _check_same_n_elements(x, y)

    random_state = _random_state_init(random_state)

    # Compute the double-centered distance matrices. Note that this test
    # uses double centering, not the U-centering used by the partial test.
    centered_x = _distance_matrix_generic(
        x,
        centering=double_centered,
        exponent=exponent,
    )
    centered_y = _distance_matrix_generic(
        y,
        centering=double_centered,
        exponent=exponent,
    )

    # The sample count is invariant across permutations, so read it once.
    n_samples = centered_x.shape[0]

    # Use the dcov statistic: n times the mean product with the fixed,
    # centered matrix of ``y``.
    def statistic_function(distance_matrix: Array) -> Array:
        return n_samples * mean_product(
            distance_matrix,
            centered_y,
        )

    return _permutation_test_with_sym_matrix(
        centered_x,
        statistic_function=statistic_function,
        num_resamples=num_resamples,
        random_state=random_state,
        n_jobs=n_jobs,
    )


def partial_distance_covariance_test(
    x: Array,
    y: Array,
    z: Array,
    *,
    num_resamples: int = 0,
    exponent: float = 1,
    random_state: RandomLike = None,
    n_jobs: int | None = 1,
) -> HypothesisTest[Array]:
    """
    Permutation test of independence using partial distance covariance.

    The U-centered distance matrices of ``x`` and ``y`` are each passed
    through the complementary projection built from the U-centered distance
    matrix of ``z``. The statistic is the number of samples times the
    U-product of the two projected matrices.

    The null hypothesis of the permutation test is that the first two
    random vectors are independent given the third one.

    Args:
        x: First random vector. Rows are individual instances of the
            random vector and columns are the individual random variables.
        y: Second random vector. Rows are individual instances of the
            random vector and columns are the individual random variables.
        z: Observed random vector whose effect is removed. Rows are
            individual instances of the random vector and columns are the
            individual random variables.
        exponent: Exponent of the Euclidean distance, in the range
            :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of
            fractional Brownian motion.
        num_resamples: Number of permutations resamples to take in the
            permutation test.
        random_state: Random state to generate the permutations.
        n_jobs: Number of jobs executed in parallel by Joblib.

    Returns:
        Results of the hypothesis test.

    See Also:
        partial_distance_covariance

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> c = np.array([[1000, 0, 0, 1000],
        ...               [0, 1000, 1000, 1000],
        ...               [1000, 1000, 1000, 1000],
        ...               [1000, 1000, 0, 1000]])
        >>> dcor.independence.partial_distance_covariance_test(a, a, b)
        ...                                       # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=142.6664416...)
        >>> dcor.independence.partial_distance_covariance_test(a, b, c)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=7.0791037...e-15)
        >>> dcor.independence.partial_distance_covariance_test(b, b, c)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=6.3170502...e-31)
        >>> dcor.independence.partial_distance_covariance_test(a, b, c,
        ... num_resamples=5, random_state=0)
        HypothesisTest(pvalue=0.1666666..., statistic=7.0791037...e-15)
        >>> dcor.independence.partial_distance_covariance_test(a, b, c,
        ... num_resamples=5, random_state=13)
        HypothesisTest(pvalue=0.1666666..., statistic=7.0791037...e-15)
        >>> dcor.independence.partial_distance_covariance_test(a, c, b,
        ... num_resamples=7, random_state=0)
        HypothesisTest(pvalue=1.0, statistic=-7.5701764...e-12)

    """
    random_state = _random_state_init(random_state)

    # U-centered distance matrices of the three samples, in x, y, z order.
    u_x, u_y, u_z = (
        _u_distance_matrix(sample, exponent=exponent)
        for sample in (x, y, z)
    )

    # Project x and y with the complementary projection derived from z.
    remove_z = u_complementary_projection(u_z)
    projected_x = remove_z(u_x)
    projected_y = remove_z(u_y)

    n_samples = u_x.shape[0]

    # Scaled U-product against the fixed projection of y.
    def scaled_u_product(distance_matrix: Array) -> Array:
        return n_samples * u_product(distance_matrix, projected_y)

    return _permutation_test_with_sym_matrix(
        projected_x,
        statistic_function=scaled_u_product,
        num_resamples=num_resamples,
        random_state=random_state,
        n_jobs=n_jobs,
    )


def distance_correlation_t_statistic(
    x: Array,
    y: Array,
) -> Array:
    """
    Statistic used in :func:`distance_correlation_t_test`.

    With :math:`n` the number of rows of ``x``, :math:`v = n (n - 3) / 2`
    and :math:`R^*` the value returned by
    :func:`u_distance_correlation_sqr`, the statistic is
    :math:`\\sqrt{v - 1} \\, R^* / \\sqrt{1 - (R^*)^2}`.

    The factor :math:`\\sqrt{v - 1}` is only real for :math:`n \\geq 4`.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector. Its ``shape[0]`` is used as the sample size.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.

    Returns:
        T statistic.

    See Also:
        distance_correlation_t_test

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_statistic(a, a)
        inf
        >>> dcor.independence.distance_correlation_t_statistic(a, b)
        ...                                      # doctest: +ELLIPSIS
        -0.4430164...
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_statistic(b, b)
        inf

    """
    bcdcor = u_distance_correlation_sqr(x, y)

    n = x.shape[0]
    v = n * (n - 3) / 2

    # When bcdcor is exactly 1 the denominator is 0 and the result is inf,
    # as shown in the examples above.
    return np.sqrt(v - 1) * bcdcor / _sqrt(1 - bcdcor**2)


def distance_correlation_t_test(
    x: Array,
    y: Array,
) -> HypothesisTest[Array]:
    """
    Test of independence for high dimension.

    It is based on convergence to a Student t distribution.
    The null hypothesis is that the two random vectors are
    independent.

    The p-value is the upper-tail probability of the statistic returned by
    :func:`distance_correlation_t_statistic` under a Student t distribution
    with :math:`n (n - 3) / 2 - 1` degrees of freedom, where :math:`n` is
    the number of rows of ``x``.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector. Its ``shape[0]`` is used as the sample size.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.

    Returns:
        Results of the hypothesis test.

    See Also:
        distance_correlation_t_statistic

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_test(a, a)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=0.0, statistic=inf)
        >>> dcor.independence.distance_correlation_t_test(a, b)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=0.6327451..., statistic=-0.4430164...)
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_test(b, b)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=0.0, statistic=inf)

    """
    t_test = distance_correlation_t_statistic(x, y)

    n = x.shape[0]
    v = n * (n - 3) / 2
    df = v - 1

    # Use the survival function rather than ``1 - cdf``: the subtraction
    # loses all precision once the cdf rounds to 1, turning very small
    # p-values into exactly 0.
    p_value = scipy.stats.t.sf(t_test, df=df)

    return HypothesisTest(pvalue=p_value, statistic=t_test)