import numpy as np
import pandas as pd
from collections import namedtuple
from pingouin.utils import remove_na, _postprocess_dataframe
__all__ = ["multivariate_normality", "multivariate_ttest", "box_m"]
[docs]
def multivariate_normality(X, alpha=0.05):
"""Henze-Zirkler multivariate normality test.
Parameters
----------
X : np.array
Data matrix of shape (n_samples, n_features).
alpha : float
Significance level.
Returns
-------
hz : float
The Henze-Zirkler test statistic.
pval : float
P-value.
normal : boolean
True if X comes from a multivariate normal distribution.
See Also
--------
normality : Test the univariate normality of one or more variables.
homoscedasticity : Test equality of variance.
sphericity : Mauchly's test for sphericity.
Notes
-----
The Henze-Zirkler test [1]_ has a good overall power against alternatives
to normality and works for any dimension and sample size.
Adapted to Python from a Matlab code [2]_ by Antonio Trujillo-Ortiz and
tested against the
`MVN <https://cran.r-project.org/web/packages/MVN/MVN.pdf>`_ R package.
Rows with missing values are automatically removed.
References
----------
.. [1] Henze, N., & Zirkler, B. (1990). A class of invariant consistent
tests for multivariate normality. Communications in Statistics-Theory
and Methods, 19(10), 3595-3617.
.. [2] Trujillo-Ortiz, A., R. Hernandez-Walls, K. Barba-Rojo and L.
Cupul-Magana. (2007). HZmvntest: Henze-Zirkler's Multivariate
Normality Test. A MATLAB file.
Examples
--------
>>> import pingouin as pg
>>> data = pg.read_dataset('multivariate')
>>> X = data[['Fever', 'Pressure', 'Aches']]
>>> pg.multivariate_normality(X, alpha=.05)
HZResults(hz=0.540086101851555, pval=0.7173686509622386, normal=True)
"""
from scipy.stats import lognorm
# Check input and remove missing values
X = np.asarray(X)
assert X.ndim == 2, "X must be of shape (n_samples, n_features)."
X = X[~np.isnan(X).any(axis=1)]
n, p = X.shape
assert n >= 3, "X must have at least 3 rows."
assert p >= 2, "X must have at least two columns."
# Covariance matrix
S = np.cov(X, rowvar=False, bias=True)
S_inv = np.linalg.pinv(S, hermitian=True).astype(X.dtype) # Preserving original dtype
difT = X - X.mean(0)
# Squared-Mahalanobis distances
Dj = np.diag(np.linalg.multi_dot([difT, S_inv, difT.T]))
Y = np.linalg.multi_dot([X, S_inv, X.T])
Y_diag = np.diag(Y)
Djk = -2 * Y.T + Y_diag + Y_diag[..., None]
# Smoothing parameter
b = 1 / (np.sqrt(2)) * ((2 * p + 1) / 4) ** (1 / (p + 4)) * (n ** (1 / (p + 4)))
# Is matrix full-rank (columns are linearly independent)?
if np.linalg.matrix_rank(S) == p:
hz = n * (
1 / (n**2) * np.sum(np.sum(np.exp(-(b**2) / 2 * Djk)))
- 2
* ((1 + (b**2)) ** (-p / 2))
* (1 / n)
* (np.sum(np.exp(-((b**2) / (2 * (1 + (b**2)))) * Dj)))
+ ((1 + (2 * (b**2))) ** (-p / 2))
)
else:
hz = n * 4
wb = (1 + b**2) * (1 + 3 * b**2)
a = 1 + 2 * b**2
# Mean and variance
mu = 1 - a ** (-p / 2) * (1 + p * b**2 / a + (p * (p + 2) * (b**4)) / (2 * a**2))
si2 = (
2 * (1 + 4 * b**2) ** (-p / 2)
+ 2 * a ** (-p) * (1 + (2 * p * b**4) / a**2 + (3 * p * (p + 2) * b**8) / (4 * a**4))
- 4 * wb ** (-p / 2) * (1 + (3 * p * b**4) / (2 * wb) + (p * (p + 2) * b**8) / (2 * wb**2))
)
# Lognormal mean and variance
pmu = np.log(np.sqrt(mu**4 / (si2 + mu**2)))
psi = np.sqrt(np.log1p(si2 / mu**2))
# P-value
pval = lognorm.sf(hz, psi, scale=np.exp(pmu))
normal = True if pval > alpha else False
HZResults = namedtuple("HZResults", ["hz", "pval", "normal"])
return HZResults(hz=hz, pval=pval, normal=normal)
[docs]
def multivariate_ttest(X, Y=None, paired=False):
"""Hotelling T-squared test (= multivariate T-test)
Parameters
----------
X : np.array
First data matrix of shape (n_samples, n_features).
Y : np.array or None
Second data matrix of shape (n_samples, n_features). If ``Y`` is a 1D
array of shape (n_features), a one-sample test is performed where the
null hypothesis is defined in ``Y``. If ``Y`` is None, a one-sample
is performed against np.zeros(n_features).
paired : boolean
Specify whether the two observations are related (i.e. repeated
measures) or independent. If ``paired`` is True, ``X`` and ``Y`` must
have exactly the same shape.
Returns
-------
stats : :py:class:`pandas.DataFrame`
* ``'T2'``: T-squared value
* ``'F'``: F-value
* ``'df1'``: first degree of freedom
* ``'df2'``: second degree of freedom
* ``'p-val'``: p-value
See Also
--------
multivariate_normality : Multivariate normality test.
ttest : Univariate T-test.
Notes
-----
The Hotelling 's T-squared test [1]_ is the multivariate counterpart of
the T-test.
Rows with missing values are automatically removed using the
:py:func:`remove_na` function.
Tested against the `Hotelling
<https://cran.r-project.org/web/packages/Hotelling/Hotelling.pdf>`_ R
package.
References
----------
.. [1] Hotelling, H. The Generalization of Student's Ratio. Ann. Math.
Statist. 2 (1931), no. 3, 360--378.
See also http://www.real-statistics.com/multivariate-statistics/
Examples
--------
Two-sample independent Hotelling T-squared test
>>> import pingouin as pg
>>> data = pg.read_dataset('multivariate')
>>> dvs = ['Fever', 'Pressure', 'Aches']
>>> X = data[data['Condition'] == 'Drug'][dvs]
>>> Y = data[data['Condition'] == 'Placebo'][dvs]
>>> pg.multivariate_ttest(X, Y)
T2 F df1 df2 pval
hotelling 4.228679 1.326644 3 32 0.282898
Two-sample paired Hotelling T-squared test
>>> pg.multivariate_ttest(X, Y, paired=True)
T2 F df1 df2 pval
hotelling 4.468456 1.314252 3 15 0.306542
One-sample Hotelling T-squared test with a specified null hypothesis
>>> null_hypothesis_means = [37.5, 70, 5]
>>> pg.multivariate_ttest(X, Y=null_hypothesis_means)
T2 F df1 df2 pval
hotelling 253.230991 74.479703 3 15 3.081281e-09
"""
from scipy.stats import f
x = np.asarray(X)
assert x.ndim == 2, "x must be of shape (n_samples, n_features)"
if Y is None:
y = np.zeros(x.shape[1])
# Remove rows with missing values in x
x = x[~np.isnan(x).any(axis=1)]
else:
nx, kx = x.shape
y = np.asarray(Y)
assert y.ndim in [1, 2], "Y must be 1D or 2D."
if y.ndim == 1:
# One sample with specified null
assert y.size == kx
else:
# Two-sample
err = "X and Y must have the same number of features (= columns)."
assert y.shape[1] == kx, err
if paired:
err = "X and Y must have the same number of rows if paired."
assert y.shape[0] == nx, err
# Remove rows with missing values in both x and y
x, y = remove_na(x, y, paired=paired, axis="rows")
# Shape of arrays
nx, k = x.shape
ny = y.shape[0]
assert nx >= 5, "At least five samples are required."
if y.ndim == 1 or paired is True:
n = nx
if y.ndim == 1:
# One sample test
cov = np.cov(x, rowvar=False)
diff = x.mean(0) - y
else:
# Paired two sample
cov = np.cov(x - y, rowvar=False)
diff = x.mean(0) - y.mean(0)
inv_cov = np.linalg.pinv(cov, hermitian=True)
t2 = (diff @ inv_cov) @ diff * n
else:
n = nx + ny - 1
x_cov = np.cov(x, rowvar=False)
y_cov = np.cov(y, rowvar=False)
pooled_cov = ((nx - 1) * x_cov + (ny - 1) * y_cov) / (n - 1)
inv_cov = np.linalg.pinv((1 / nx + 1 / ny) * pooled_cov, hermitian=True)
diff = x.mean(0) - y.mean(0)
t2 = (diff @ inv_cov) @ diff
# F-value, degrees of freedom and p-value
fval = t2 * (n - k) / (k * (n - 1))
df1 = k
df2 = n - k
pval = f.sf(fval, df1, df2)
# Create output dictionnary
stats = {"T2": t2, "F": fval, "df1": df1, "df2": df2, "pval": pval}
stats = pd.DataFrame(stats, index=["hotelling"])
return _postprocess_dataframe(stats)
[docs]
def box_m(data, dvs, group, alpha=0.001):
"""Test equality of covariance matrices using the Box's M test.
Parameters
----------
data : :py:class:`pandas.DataFrame`
Long-format dataframe.
dvs : list
Dependent variables.
group : str
Grouping variable.
alpha : float
Significance level. Default is 0.001 as recommended in [2]_. A
non-significant p-value (higher than alpha) indicates that the
covariance matrices are homogenous (= equal).
Returns
-------
stats : :py:class:`pandas.DataFrame`
* ``'Chi2'``: Test statistic
* ``'pval'``: p-value
* ``'df'``: The Chi-Square statistic's degree of freedom
* ``'equal_cov'``: True if ``data`` has equal covariance
Notes
-----
.. warning:: Box's M test is susceptible to errors if the data does not
meet the assumption of multivariate normality or if the sample size is
too large or small [3]_.
Pingouin uses :py:meth:`pandas.DataFrameGroupBy.cov` to calculate the
variance-covariance matrix of each group. Missing values are automatically
excluded from the calculation by Pandas.
Mathematical expressions can be found in [1]_.
This function has been tested against the boxM package of the `biotools`
R package [4]_.
References
----------
.. [1] Rencher, A. C. (2003). Methods of multivariate analysis (Vol. 492).
John Wiley & Sons.
.. [2] Hahs-Vaughn, D. (2016). Applied Multivariate Statistical Concepts.
Taylor & Francis.
.. [3] https://en.wikipedia.org/wiki/Box%27s_M_test
.. [4] https://cran.r-project.org/web/packages/biotools/index.html
Examples
--------
1. Box M test with 3 dependent variables of 4 groups (equal sample size)
>>> import pandas as pd
>>> import pingouin as pg
>>> from scipy.stats import multivariate_normal as mvn
>>> data = pd.DataFrame(mvn.rvs(size=(100, 3), random_state=42),
... columns=['A', 'B', 'C'])
>>> data['group'] = [1] * 25 + [2] * 25 + [3] * 25 + [4] * 25
>>> data.head()
A B C group
0 0.496714 -0.138264 0.647689 1
1 1.523030 -0.234153 -0.234137 1
2 1.579213 0.767435 -0.469474 1
3 0.542560 -0.463418 -0.465730 1
4 0.241962 -1.913280 -1.724918 1
>>> pg.box_m(data, dvs=['A', 'B', 'C'], group='group')
Chi2 df pval equal_cov
box 11.634185 18.0 0.865537 True
2. Box M test with 3 dependent variables of 2 groups (unequal sample size)
>>> data = pd.DataFrame(mvn.rvs(size=(30, 2), random_state=42),
... columns=['A', 'B'])
>>> data['group'] = [1] * 20 + [2] * 10
>>> pg.box_m(data, dvs=['A', 'B'], group='group')
Chi2 df pval equal_cov
box 0.706709 3.0 0.871625 True
"""
# Safety checks
from scipy.stats import chi2
assert isinstance(data, pd.DataFrame), "data must be a pandas dataframe."
assert group in data.columns, "The grouping variable is not in data."
assert set(dvs).issubset(data.columns), "The DVs are not in data."
grp = data.groupby(group, observed=True)[dvs]
assert grp.ngroups > 1, "Data must have at least two columns."
# Calculate covariance matrix and descriptive statistics
# - n_covs is the number of covariance matrices
# - n_dvs is the number of variables
# - n_samp is the number of samples in each covariance matrix
# - nobs is the total number of observations
covs = grp.cov(numeric_only=True)
n_covs, n_dvs = covs.index.levshape
n_samp = grp.count().iloc[:, 0].to_numpy() # NaN are excluded by .count
nobs = n_samp.sum()
v = n_samp - 1
# Calculate pooled covariance matrix (S) and M statistics
covs = covs.to_numpy().reshape(n_covs, n_dvs, -1)
S = (covs * v[..., None, None]).sum(axis=0) / (nobs - n_covs)
# The following lines might raise an error if the covariance matrices are
# not invertible (e.g. missing values in input).
S_det = np.linalg.det(S)
M = ((np.linalg.det(covs) / S_det) ** (v / 2)).prod()
# Calculate C in reference [1] (page 257-259)
if len(np.unique(n_samp)) == 1:
# All groups have same number of samples
c = ((n_covs + 1) * (2 * n_dvs**2 + 3 * n_dvs - 1)) / (
6 * n_covs * (n_dvs + 1) * (nobs / n_covs - 1)
)
else:
# Unequal sample size
c = (2 * n_dvs**2 + 3 * n_dvs - 1) / (6 * (n_dvs + 1) * (n_covs - 1))
c *= (1 / v).sum() - 1 / v.sum()
# Calculate U statistics and degree of fredom
u = -2 * (1 - c) * np.log(M)
df = 0.5 * n_dvs * (n_dvs + 1) * (n_covs - 1)
p = chi2.sf(u, df)
equal_cov = True if p > alpha else False
stats = pd.DataFrame(
index=["box"], data={"Chi2": [u], "df": [df], "pval": [p], "equal_cov": [equal_cov]}
)
return _postprocess_dataframe(stats)