"""
Functions for testing independence of several distributions.
The functions in this module provide methods for testing if
the samples generated from two random vectors are independent.
"""
from __future__ import annotations
from typing import TypeVar
import numpy as np
import scipy.stats
from ._dcor import u_distance_correlation_sqr
from ._dcor_internals import (
_check_same_n_elements,
_distance_matrix_generic,
_u_distance_matrix,
double_centered,
mean_product,
u_complementary_projection,
u_product,
)
from ._hypothesis import HypothesisTest, _permutation_test_with_sym_matrix
from ._utils import (
ArrayType,
RandomLike,
_random_state_init,
_sqrt,
_transform_to_2d,
)
Array = TypeVar("Array", bound=ArrayType)
def distance_covariance_test(
    x: Array,
    y: Array,
    *,
    num_resamples: int = 0,
    exponent: float = 1,
    random_state: RandomLike = None,
    n_jobs: int = 1,
) -> HypothesisTest[Array]:
    """
    Permutation test of independence based on the distance covariance.

    The null hypothesis is that the two random vectors are independent.
    The statistic is the number of observations multiplied by the
    ``mean_product`` of the double-centered distance matrices of ``x``
    and ``y``.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        num_resamples: Number of permutations resamples to take in the
            permutation test.
        exponent: Exponent of the Euclidean distance, in the range
            :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of
            fractional Brownian motion.
        random_state: Random state to generate the permutations.
        n_jobs: Number of jobs executed in parallel by Joblib.

    Returns:
        Results of the hypothesis test.

    See Also:
        distance_covariance

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> dcor.independence.distance_covariance_test(a, a)
        HypothesisTest(pvalue=1.0, statistic=208.0)
        >>> dcor.independence.distance_covariance_test(a, b)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=11.75323056...)
        >>> dcor.independence.distance_covariance_test(b, b)
        HypothesisTest(pvalue=1.0, statistic=1.3604610...)
        >>> dcor.independence.distance_covariance_test(a, b,
        ...     num_resamples=5, random_state=0)
        HypothesisTest(pvalue=0.8333333333333334, statistic=11.7532305...)
        >>> dcor.independence.distance_covariance_test(a, b,
        ...     num_resamples=5, random_state=13)
        HypothesisTest(pvalue=0.5..., statistic=11.7532305...)
        >>> dcor.independence.distance_covariance_test(a, a,
        ...     num_resamples=7, random_state=0)
        HypothesisTest(pvalue=0.125, statistic=208.0)

    """
    x, y = _transform_to_2d(x, y)
    _check_same_n_elements(x, y)

    rng = _random_state_init(random_state)

    # Double-centered distance matrices of both samples (same centering
    # and exponent for each).
    centered_x, centered_y = (
        _distance_matrix_generic(
            sample,
            centering=double_centered,
            exponent=exponent,
        )
        for sample in (x, y)
    )

    n_obs = centered_x.shape[0]

    def _scaled_mean_product(distance_matrix: Array) -> Array:
        # dcov-based statistic: only the x-side matrix is permuted, the
        # y-side matrix stays fixed.
        return n_obs * mean_product(distance_matrix, centered_y)

    return _permutation_test_with_sym_matrix(
        centered_x,
        statistic_function=_scaled_mean_product,
        num_resamples=num_resamples,
        random_state=rng,
        n_jobs=n_jobs,
    )
def partial_distance_covariance_test(
    x: Array,
    y: Array,
    z: Array,
    *,
    num_resamples: int = 0,
    exponent: float = 1,
    random_state: RandomLike = None,
    n_jobs: int | None = 1,
) -> HypothesisTest[Array]:
    """
    Permutation test of independence based on partial distance covariance.

    The null hypothesis is that the first two random vectors are
    independent given the third one. The statistic is the number of
    observations multiplied by the ``u_product`` of the U-centered distance
    matrices of ``x`` and ``y`` after both have been passed through the
    complementary projection built from the U-centered matrix of ``z``.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        z: Observed random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        num_resamples: Number of permutations resamples to take in the
            permutation test.
        exponent: Exponent of the Euclidean distance, in the range
            :math:`(0, 2)`. Equivalently, it is twice the Hurst parameter of
            fractional Brownian motion.
        random_state: Random state to generate the permutations.
        n_jobs: Number of jobs executed in parallel by Joblib.

    Returns:
        Results of the hypothesis test.

    See Also:
        partial_distance_covariance

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> c = np.array([[1000, 0, 0, 1000],
        ...               [0, 1000, 1000, 1000],
        ...               [1000, 1000, 1000, 1000],
        ...               [1000, 1000, 0, 1000]])
        >>> dcor.independence.partial_distance_covariance_test(a, a, b)
        ...                                       # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=142.6664416...)
        >>> dcor.independence.partial_distance_covariance_test(a, b, c)
        ...                                       # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=7.0791037...e-15)
        >>> dcor.independence.partial_distance_covariance_test(b, b, c)
        ...                                       # doctest: +ELLIPSIS
        HypothesisTest(pvalue=1.0, statistic=6.3170502...e-31)
        >>> dcor.independence.partial_distance_covariance_test(a, b, c,
        ...     num_resamples=5, random_state=0)
        HypothesisTest(pvalue=0.1666666..., statistic=7.0791037...e-15)
        >>> dcor.independence.partial_distance_covariance_test(a, b, c,
        ...     num_resamples=5, random_state=13)
        HypothesisTest(pvalue=0.1666666..., statistic=7.0791037...e-15)
        >>> dcor.independence.partial_distance_covariance_test(a, c, b,
        ...     num_resamples=7, random_state=0)
        HypothesisTest(pvalue=1.0, statistic=-7.5701764...e-12)

    """
    rng = _random_state_init(random_state)

    # U-centered distance matrices of the three samples, in x, y, z order.
    u_x, u_y, u_z = (
        _u_distance_matrix(sample, exponent=exponent)
        for sample in (x, y, z)
    )

    # Apply the complementary projection derived from z to x and y.
    project_out_z = u_complementary_projection(u_z)
    projected_x = project_out_z(u_x)
    projected_y = project_out_z(u_y)

    n_obs = u_x.shape[0]

    def _scaled_u_product(distance_matrix: Array) -> Array:
        # Partial-dcov-based statistic: only the projected x matrix is
        # permuted, the projected y matrix stays fixed.
        return n_obs * u_product(distance_matrix, projected_y)

    return _permutation_test_with_sym_matrix(
        projected_x,
        statistic_function=_scaled_u_product,
        num_resamples=num_resamples,
        random_state=rng,
        n_jobs=n_jobs,
    )
def distance_correlation_t_statistic(
    x: Array,
    y: Array,
) -> Array:
    """
    Compute the statistic used in :func:`distance_correlation_t_test`.

    With ``n`` the number of rows of ``x``, ``v = n * (n - 3) / 2`` and
    ``r`` the value returned by ``u_distance_correlation_sqr(x, y)``, the
    statistic is ``sqrt(v - 1) * r / sqrt(1 - r**2)``.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.

    Returns:
        T statistic.

    See Also:
        distance_correlation_t_test

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_statistic(a, a)
        inf
        >>> dcor.independence.distance_correlation_t_statistic(a, b)
        ...                                      # doctest: +ELLIPSIS
        -0.4430164...
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_statistic(b, b)
        inf

    """
    corr = u_distance_correlation_sqr(x, y)

    n_obs = x.shape[0]
    degrees = n_obs * (n_obs - 3) / 2 - 1

    # A value of corr equal to 1 makes the denominator zero, which is why
    # the examples above silence the division warning and obtain ``inf``.
    numerator = np.sqrt(degrees) * corr
    denominator = _sqrt(1 - corr**2)

    return numerator / denominator
def distance_correlation_t_test(
    x: Array,
    y: Array,
) -> HypothesisTest[Array]:
    """
    Test of independence for high dimension.

    It is based on convergence to a Student t distribution.
    The null hypothesis is that the two random vectors are
    independent.

    The p-value is the upper-tail probability of the statistic returned by
    :func:`distance_correlation_t_statistic` under a Student t distribution
    with ``n * (n - 3) / 2 - 1`` degrees of freedom, where ``n`` is the
    number of rows of ``x``.

    Args:
        x: First random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.
        y: Second random vector. The columns correspond with the individual
            random variables while the rows are individual instances of the
            random vector.

    Returns:
        Results of the hypothesis test.

    See Also:
        distance_correlation_t_statistic

    Examples:
        >>> import numpy as np
        >>> import dcor
        >>> a = np.array([[1, 2, 3, 4],
        ...               [5, 6, 7, 8],
        ...               [9, 10, 11, 12],
        ...               [13, 14, 15, 16]])
        >>> b = np.array([[1, 0, 0, 1],
        ...               [0, 1, 1, 1],
        ...               [1, 1, 1, 1],
        ...               [1, 1, 0, 1]])
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_test(a, a)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=0.0, statistic=inf)
        >>> dcor.independence.distance_correlation_t_test(a, b)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=0.6327451..., statistic=-0.4430164...)
        >>> with np.errstate(divide='ignore'):
        ...     dcor.independence.distance_correlation_t_test(b, b)
        ...                                      # doctest: +ELLIPSIS
        HypothesisTest(pvalue=0.0, statistic=inf)

    """
    t_test = distance_correlation_t_statistic(x, y)

    n = x.shape[0]
    v = n * (n - 3) / 2
    df = v - 1

    # Use the survival function instead of ``1 - cdf``: both are the same
    # upper-tail probability, but ``1 - cdf`` suffers from cancellation
    # when the cdf is close to 1 and rounds small p-values to exactly 0.
    p_value = scipy.stats.t.sf(t_test, df=df)

    return HypothesisTest(pvalue=p_value, statistic=t_test)