# Author: Raphael Vallat <raphaelvallat9@gmail.com>
# Date: May 2018
import scipy
import numpy as np
import pandas as pd
from pingouin import remove_na, _check_dataframe, _postprocess_dataframe
__all__ = [
"mad",
"madmedianrule",
"mwu",
"wilcoxon",
"kruskal",
"friedman",
"cochran",
"harrelldavis",
]
def mad(a, normalize=True, axis=0):
"""
Median Absolute Deviation (MAD) along given axis of an array.
Parameters
----------
a : array-like
Input array.
normalize : boolean
If True, scale by a normalization constant :math:`c \\approx 0.67`
to ensure consistency with the standard deviation for normally
distributed data.
axis : int or None, optional
Axis along which the MAD is computed. Default is 0.
Can also be None to compute the MAD over the entire array.
Returns
-------
mad : float
mad = median(abs(a - median(a))) / c
See also
--------
madmedianrule, numpy.std
Notes
-----
The `median absolute deviation
<https://en.wikipedia.org/wiki/Median_absolute_deviation>`_ (MAD) computes
the median over the absolute deviations from the median. It is a measure of
dispersion similar to the standard deviation, but is more robust to
outliers.
SciPy 1.3 and higher includes a similar function:
:py:func:`scipy.stats.median_abs_deviation`.
Please note that missing values are automatically removed.
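As a quick sanity check, the definition above can be reproduced by hand with NumPy and
SciPy (a minimal sketch; it matches the first example below):
>>> import numpy as np
>>> from scipy.stats import norm
>>> a = np.array([1.2, 5.4, 3.2, 7.8, 2.5])
>>> float(np.median(np.abs(a - np.median(a))) / norm.ppf(0.75))
2.965204437011204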
Examples
--------
>>> from pingouin import mad
>>> a = [1.2, 5.4, 3.2, 7.8, 2.5]
>>> mad(a)
2.965204437011204
>>> mad(a, normalize=False)
2.0
2D arrays with missing values (axis handling example)
>>> import numpy as np
>>> np.random.seed(123)
>>> w = np.random.normal(size=(5, 10))
>>> w[3, 2] = np.nan
>>> mad(w) # Axis = 0 (default) = iterate over the columns
array([0.60304023, 2.35057834, 0.90350696, 1.28599837, 1.16024152,
0.38653752, 1.92564066, 1.2480913 , 0.42580373, 1.69814622])
>>> mad(w, axis=1) # Axis = 1 = iterate over the rows
array([1.32639022, 1.19295036, 1.41198672, 0.78020689, 1.01531254])
>>> mad(w, axis=None) # Axis = None = over the entire array
1.1607762457644006
Compare with SciPy >= 1.3
>>> from scipy.stats import median_abs_deviation
>>> median_abs_deviation(w, scale='normal', axis=None, nan_policy='omit')
1.1607762457644006
"""
a = np.asarray(a)
if axis is None:
# Calculate the MAD over the entire array
a = np.ravel(a)
axis = 0
c = scipy.stats.norm.ppf(3 / 4.0) if normalize else 1
center = np.apply_over_axes(np.nanmedian, a, axis)
return np.nanmedian((np.fabs(a - center)) / c, axis=axis)
def mwu(x, y, alternative="two-sided", **kwargs):
"""Mann-Whitney U Test (= Wilcoxon rank-sum test). It is the non-parametric
version of the independent T-test.
Parameters
----------
x, y : array_like
First and second set of observations. ``x`` and ``y`` must be
independent.
alternative : string
Defines the alternative hypothesis, or tail of the test. Must be one of
"two-sided" (default), "greater" or "less". See :py:func:`scipy.stats.mannwhitneyu` for
more details.
**kwargs : dict
Additional keywords arguments that are passed to :py:func:`scipy.stats.mannwhitneyu`.
Returns
-------
stats : :py:class:`pandas.DataFrame`
* ``'U-val'``: U-value corresponding to sample x
* ``'alternative'``: tail of the test
* ``'p-val'``: p-value
* ``'RBC'`` : rank-biserial correlation
* ``'CLES'`` : common language effect size
See also
--------
scipy.stats.mannwhitneyu, wilcoxon, ttest
Notes
-----
The Mann–Whitney U test [1]_ (also called Wilcoxon rank-sum test) is a
non-parametric test of the null hypothesis that it is equally likely that
a randomly selected value from one sample will be less than or greater
than a randomly selected value from a second sample. The test assumes
that the two samples are independent. This test corrects for ties and by
default uses a continuity correction (see :py:func:`scipy.stats.mannwhitneyu` for details).
The rank biserial correlation [2]_ is the difference between
the proportion of favorable evidence minus the proportion of unfavorable
evidence. Values range from -1 to 1, with negative values indicating that `y > x`, and
positive values indicating `x > y`.
The common language effect size is the proportion of pairs where ``x`` is
higher than ``y``. It was first introduced by McGraw and Wong (1992) [3]_.
Pingouin uses a brute-force version of the formula given by Vargha and
Delaney 2000 [4]_:
.. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)
The advantages of this method are twofold. First, the brute-force
approach pairs each observation of ``x`` to its ``y`` counterpart, and
therefore does not require normally distributed data. Second, the formula
takes ties into account and therefore works with ordinal data.
When ``alternative`` is ``'less'``, the CLES is set to :math:`1 - \\text{CL}`,
which gives the proportion of pairs where ``x`` is *lower* than ``y``.
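A rough illustration of the brute-force CLES with small hypothetical arrays (not part of
the example data below); ties in the pairwise difference matrix count as 0.5:
>>> import numpy as np
>>> x = np.array([1, 2, 3, 4])
>>> y = np.array([2, 2, 5, 1])
>>> diff = x[:, None] - y
>>> float(np.where(diff == 0, 0.5, diff > 0).mean())
0.53125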
References
----------
.. [1] Mann, H. B., & Whitney, D. R. (1947). On a test of whether one of
two random variables is stochastically larger than the other.
The annals of mathematical statistics, 50-60.
.. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
teaching nonparametric correlation. Comprehensive Psychology,
3, 11-IT.
.. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
statistic. Psychological bulletin, 111(2), 361.
.. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
the “CL” Common Language Effect Size Statistics of McGraw and Wong.
Journal of Educational and Behavioral Statistics: A Quarterly
Publication Sponsored by the American Educational Research
Association and the American Statistical Association, 25(2),
101–132. https://doi.org/10.2307/1165329
Examples
--------
>>> import numpy as np
>>> import pingouin as pg
>>> np.random.seed(123)
>>> x = np.random.uniform(low=0, high=1, size=20)
>>> y = np.random.uniform(low=0.2, high=1.2, size=20)
>>> pg.mwu(x, y, alternative='two-sided')
U-val alternative p-val RBC CLES
MWU 97.0 two-sided 0.00556 -0.515 0.2425
Compare with SciPy
>>> import scipy
>>> scipy.stats.mannwhitneyu(x, y, use_continuity=True, alternative='two-sided')
MannwhitneyuResult(statistic=97.0, pvalue=0.0055604599321374135)
One-sided test
>>> pg.mwu(x, y, alternative='greater')
U-val alternative p-val RBC CLES
MWU 97.0 greater 0.997442 -0.515 0.2425
>>> pg.mwu(x, y, alternative='less')
U-val alternative p-val RBC CLES
MWU 97.0 less 0.00278 -0.515 0.7575
Passing keyword arguments to :py:func:`scipy.stats.mannwhitneyu`:
>>> pg.mwu(x, y, alternative='two-sided', method='exact')
U-val alternative p-val RBC CLES
MWU 97.0 two-sided 0.004681 -0.515 0.2425
Reversing the order of `x` and `y`.
>>> pg.mwu(y, x)
U-val alternative p-val RBC CLES
MWU 303.0 two-sided 0.00556 0.515 0.7575
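For reference, the rank-biserial correlation reported above can be recovered from the
U statistic and the two sample sizes (a minimal sketch mirroring the internal computation):
>>> u_x, n_x, n_y = 97.0, 20, 20
>>> u_y = n_x * n_y - u_x
>>> round(1 - (2 * u_y) / (n_x * n_y), 3)
-0.515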
"""
x = np.asarray(x)
y = np.asarray(y)
# Remove NA
x, y = remove_na(x, y, paired=False)
# Check tails
assert alternative in [
"two-sided",
"greater",
"less",
], "Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
if "tail" in kwargs:
raise ValueError(
"Since Pingouin 0.4.0, the 'tail' argument has been renamed to 'alternative'."
)
uval_x, pval = scipy.stats.mannwhitneyu(x, y, alternative=alternative, **kwargs)
# Effect size 1: Common Language Effect Size
# CLES is tail-specific and calculated according to the formula given in
# Vargha and Delaney 2000 which works with ordinal data.
diff = x[:, None] - y
# cles = max((diff < 0).sum(), (diff > 0).sum()) / diff.size
# Tail = 'greater', with ties set to 0.5
# Note that tail = 'two-sided' gives same output as tail = 'greater'
cles = np.where(diff == 0, 0.5, diff > 0).mean()
cles = 1 - cles if alternative == "less" else cles
# Effect size 2: rank biserial correlation (Wendt 1972)
uval_y = x.shape[0] * y.shape[0] - uval_x
rbc = 1 - (2 * uval_y) / diff.size # diff.size = x.size * y.size
# Fill output DataFrame
stats = pd.DataFrame(
{"U-val": uval_x, "alternative": alternative, "p-val": pval, "RBC": rbc, "CLES": cles},
index=["MWU"],
)
return _postprocess_dataframe(stats)
def wilcoxon(x, y=None, alternative="two-sided", **kwargs):
"""
Wilcoxon signed-rank test. It is the non-parametric version of the paired T-test.
Parameters
----------
x : array_like
Either the first set of measurements
(in which case y is the second set of measurements),
or the differences between two sets of measurements
(in which case y is not to be specified.) Must be one-dimensional.
y : array_like
Either the second set of measurements (if x is the first set of
measurements), or not specified (if x is the differences between
two sets of measurements.) Must be one-dimensional.
alternative : string
Defines the alternative hypothesis, or tail of the test. Must be one of
"two-sided" (default), "greater" or "less". See :py:func:`scipy.stats.wilcoxon` for
more details.
**kwargs : dict
Additional keywords arguments that are passed to :py:func:`scipy.stats.wilcoxon`.
Returns
-------
stats : :py:class:`pandas.DataFrame`
* ``'W-val'``: W-value
* ``'alternative'``: tail of the test
* ``'p-val'``: p-value
* ``'RBC'`` : matched pairs rank-biserial correlation (effect size)
* ``'CLES'`` : common language effect size
See also
--------
scipy.stats.wilcoxon, mwu
Notes
-----
The Wilcoxon signed-rank test [1]_ tests the null hypothesis that two
related paired samples come from the same distribution. In particular,
it tests whether the distribution of the differences x - y is symmetric
about zero.
.. important:: Pingouin automatically applies a continuity correction.
Therefore, the p-values will be slightly different than
:py:func:`scipy.stats.wilcoxon` unless ``correction=True`` is
explicitly passed to the latter.
In addition to the test statistic and p-values, Pingouin also computes two
measures of effect size. The matched pairs rank biserial correlation [2]_
is the simple difference between the proportion of favorable and
unfavorable evidence; in the case of the Wilcoxon signed-rank test,
the evidence consists of rank sums (Kerby 2014):
.. math:: r = f - u
The common language effect size is the proportion of pairs where ``x`` is
higher than ``y``. It was first introduced by McGraw and Wong (1992) [3]_.
Pingouin uses a brute-force version of the formula given by Vargha and
Delaney 2000 [4]_:
.. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)
The advantages of this method are twofold. First, the brute-force
approach pairs each observation of ``x`` to its ``y`` counterpart, and
therefore does not require normally distributed data. Second, the formula
takes ties into account and therefore works with ordinal data.
When ``alternative`` is ``'less'``, the CLES is set to :math:`1 - \\text{CL}`,
which gives the proportion of pairs where ``x`` is *lower* than ``y``.
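A rough illustration of the matched-pairs rank-biserial correlation with hypothetical
paired differences (``f`` and ``u`` are the favorable and unfavorable rank-sum proportions):
>>> import numpy as np
>>> from scipy.stats import rankdata
>>> d = np.array([3, -1, 4, -2, 5])
>>> r = rankdata(np.abs(d))
>>> f, u = r[d > 0].sum() / r.sum(), r[d < 0].sum() / r.sum()
>>> round(float(f - u), 3)
0.6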
References
----------
.. [1] Wilcoxon, F. (1945). Individual comparisons by ranking methods.
Biometrics bulletin, 1(6), 80-83.
.. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
teaching nonparametric correlation. Comprehensive Psychology,
3, 11-IT.
.. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
statistic. Psychological bulletin, 111(2), 361.
.. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
the “CL” Common Language Effect Size Statistics of McGraw and Wong.
Journal of Educational and Behavioral Statistics: A Quarterly
Publication Sponsored by the American Educational Research
Association and the American Statistical Association, 25(2),
101–132. https://doi.org/10.2307/1165329
Examples
--------
Wilcoxon test on two related samples.
>>> import numpy as np
>>> import pingouin as pg
>>> x = np.array([20, 22, 19, 20, 22, 18, 24, 20, 19, 24, 26, 13])
>>> y = np.array([38, 37, 33, 29, 14, 12, 20, 22, 17, 25, 26, 16])
>>> pg.wilcoxon(x, y, alternative='two-sided')
W-val alternative p-val RBC CLES
Wilcoxon 20.5 two-sided 0.285765 -0.378788 0.395833
Same but using pre-computed differences. However, the CLES effect size
cannot be computed as it requires the raw data.
>>> pg.wilcoxon(x - y)
W-val alternative p-val RBC CLES
Wilcoxon 20.5 two-sided 0.285765 -0.378788 NaN
Compare with SciPy
>>> import scipy
>>> scipy.stats.wilcoxon(x, y)
WilcoxonResult(statistic=20.5, pvalue=0.2661660677806492)
The p-value is not exactly the same as Pingouin's. This is because Pingouin automatically
applies a continuity correction. Disabling it gives the same p-value as SciPy:
>>> pg.wilcoxon(x, y, alternative='two-sided', correction=False)
W-val alternative p-val RBC CLES
Wilcoxon 20.5 two-sided 0.266166 -0.378788 0.395833
One-sided test
>>> pg.wilcoxon(x, y, alternative='greater')
W-val alternative p-val RBC CLES
Wilcoxon 20.5 greater 0.876244 -0.378788 0.395833
>>> pg.wilcoxon(x, y, alternative='less')
W-val alternative p-val RBC CLES
Wilcoxon 20.5 less 0.142883 -0.378788 0.604167
"""
x = np.asarray(x)
if y is not None:
y = np.asarray(y)
x, y = remove_na(x, y, paired=True) # Remove NA
else:
x = x[~np.isnan(x)]
# Check tails
assert alternative in [
"two-sided",
"greater",
"less",
], "Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
if "tail" in kwargs:
raise ValueError(
"Since Pingouin 0.4.0, the 'tail' argument has been renamed to 'alternative'."
)
# Compute test
if "correction" not in kwargs:
kwargs["correction"] = True
wval, pval = scipy.stats.wilcoxon(x=x, y=y, alternative=alternative, **kwargs)
# Effect size 1: Common Language Effect Size
# Since Pingouin v0.3.5, CLES is tail-specific and calculated
# according to the formula given in Vargha and Delaney 2000 which
# works with ordinal data.
if y is not None:
diff = x[:, None] - y
# cles = max((diff < 0).sum(), (diff > 0).sum()) / diff.size
# alternative = 'greater', with ties set to 0.5
# Note that alternative = 'two-sided' gives same output as alternative = 'greater'
cles = np.where(diff == 0, 0.5, diff > 0).mean()
cles = 1 - cles if alternative == "less" else cles
else:
# CLES cannot be computed if y is None
cles = np.nan
# Effect size 2: matched-pairs rank biserial correlation (Kerby 2014)
if y is not None:
d = x - y
d = d[d != 0]
else:
d = x[x != 0]
r = scipy.stats.rankdata(abs(d))
rsum = r.sum()
r_plus = np.sum((d > 0) * r)
r_minus = np.sum((d < 0) * r)
rbc = r_plus / rsum - r_minus / rsum
# Fill output DataFrame
stats = pd.DataFrame(
{"W-val": wval, "alternative": alternative, "p-val": pval, "RBC": rbc, "CLES": cles},
index=["Wilcoxon"],
)
return _postprocess_dataframe(stats)
def kruskal(data=None, dv=None, between=None, detailed=False):
"""Kruskal-Wallis H-test for independent samples.
Parameters
----------
data : :py:class:`pandas.DataFrame`
DataFrame
dv : string
Name of column containing the dependent variable.
between : string
Name of column containing the between factor.
Returns
-------
stats : :py:class:`pandas.DataFrame`
* ``'H'``: The Kruskal-Wallis H statistic, corrected for ties
* ``'p-unc'``: Uncorrected p-value
* ``'dof'``: degrees of freedom
Notes
-----
The Kruskal-Wallis H-test tests the null hypothesis that the population
medians of all the groups are equal. It is a non-parametric version of
ANOVA. The test works on 2 or more independent samples, which may have
different sizes.
Due to the assumption that H has a chi-square distribution, the number of
samples in each group must not be too small. A typical rule is that each
sample must have at least 5 measurements.
NaN values are automatically removed.
Examples
--------
Compute the Kruskal-Wallis H-test for independent samples.
>>> from pingouin import kruskal, read_dataset
>>> df = read_dataset('anova')
>>> kruskal(data=df, dv='Pain threshold', between='Hair color')
Source ddof1 H p-unc
Kruskal Hair color 3 10.58863 0.014172
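For cross-checking, :py:func:`scipy.stats.kruskal` computes the same tie-corrected H
statistic on the groups extracted from the dataframe (a minimal sketch, rounded for brevity):
>>> import scipy
>>> groups = [g['Pain threshold'].to_numpy() for _, g in df.groupby('Hair color')]
>>> H, p = scipy.stats.kruskal(*groups)
>>> round(float(H), 2), bool(p < 0.05)
(10.59, True)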
"""
# Check data
data = _check_dataframe(dv=dv, between=between, data=data, effects="between")
# Remove NaN values
data = data[[dv, between]].dropna()
# Reset index (avoid duplicate axis error)
data = data.reset_index(drop=True)
# Extract number of groups and total sample size
n_groups = data[between].nunique()
n = data[dv].size
# Rank data, dealing with ties appropriately
data["rank"] = scipy.stats.rankdata(data[dv])
# Find the total of rank per groups
grp = data.groupby(between, observed=True)["rank"]
sum_rk_grp = grp.sum().to_numpy()
n_per_grp = grp.count().to_numpy()
# Calculate chi-square statistic (H)
H = (12 / (n * (n + 1)) * np.sum(sum_rk_grp**2 / n_per_grp)) - 3 * (n + 1)
# Correct for ties
H /= scipy.stats.tiecorrect(data["rank"].to_numpy())
# Calculate DOF and p-value
ddof1 = n_groups - 1
p_unc = scipy.stats.chi2.sf(H, ddof1)
# Create output dataframe
stats = pd.DataFrame(
{
"Source": between,
"ddof1": ddof1,
"H": H,
"p-unc": p_unc,
},
index=["Kruskal"],
)
return _postprocess_dataframe(stats)
def friedman(data=None, dv=None, within=None, subject=None, method="chisq"):
"""Friedman test for repeated measurements.
Parameters
----------
data : :py:class:`pandas.DataFrame`
DataFrame. Both wide and long-format dataframes are supported for this test.
dv : string
Name of column containing the dependent variable (only required if ``data`` is in
long format).
within : string
Name of column containing the within-subject factor (only required if ``data`` is in
long format). Two or more within-factors are not currently supported.
subject : string
Name of column containing the subject/rater identifier (only required if ``data`` is in
long format).
method : string
Statistical test to perform. Must be ``'chisq'`` (chi-square test) or ``'f'`` (F test).
See notes below for explanation.
Returns
-------
stats : :py:class:`pandas.DataFrame`
* ``'W'``: Kendall's coefficient of concordance, corrected for ties
If ``method='chisq'``
* ``'Q'``: The Friedman chi-square statistic, corrected for ties
* ``'dof'``: degrees of freedom
* ``'p-unc'``: Uncorrected p-value of the chi squared test
If ``method='f'``
* ``'F'``: The Friedman F statistic, corrected for ties
* ``'dof1'``: degrees of freedom of the numerator
* ``'dof2'``: degrees of freedom of the denominator
* ``'p-unc'``: Uncorrected p-value of the F test
Notes
-----
The Friedman test is used for non-parametric (rank-based) one-way repeated measures ANOVA.
It is equivalent to the test of significance of Kendall's
coefficient of concordance (Kendall's W). Most commonly, a Q statistic,
which has an asymptotic chi-squared distribution, is computed and used for
testing. However, the chi-squared test tends to be overly conservative for small numbers
of samples and/or repeated measures, in which case an F-test is more appropriate [1]_.
Data can be in wide or long format. Missing values are automatically removed using a
strict listwise approach (= complete-case analysis). In other words, any subject with one or
more missing value(s) is completely removed from the dataframe prior to running the
test.
References
----------
.. [1] Marozzi, M. (2014). Testing for concordance between several
criteria. Journal of Statistical Computation and Simulation,
84(9), 1843–1850. https://doi.org/10.1080/00949655.2013.766189
.. [2] https://www.real-statistics.com/anova-repeated-measures/friedman-test/
Examples
--------
Compute the Friedman test for repeated measurements, using a wide-format dataframe
>>> import pandas as pd
>>> import pingouin as pg
>>> df = pd.DataFrame({
... 'white': {0: 10, 1: 8, 2: 7, 3: 9, 4: 7, 5: 4, 6: 5, 7: 6, 8: 5, 9: 10, 10: 4, 11: 7},
... 'red': {0: 7, 1: 5, 2: 8, 3: 6, 4: 5, 5: 7, 6: 9, 7: 6, 8: 4, 9: 6, 10: 7, 11: 3},
... 'rose': {0: 8, 1: 5, 2: 6, 3: 4, 4: 7, 5: 5, 6: 3, 7: 7, 8: 6, 9: 4, 10: 4, 11: 3}})
>>> pg.friedman(df)
Source W ddof1 Q p-unc
Friedman Within 0.083333 2 2.0 0.367879
Compare with SciPy
>>> from scipy.stats import friedmanchisquare
>>> friedmanchisquare(*df.to_numpy().T)
FriedmanchisquareResult(statistic=1.9999999999999893, pvalue=0.3678794411714444)
Using a long-format dataframe
>>> df_long = df.melt(ignore_index=False).reset_index()
>>> pg.friedman(data=df_long, dv="value", within="variable", subject="index")
Source W ddof1 Q p-unc
Friedman variable 0.083333 2 2.0 0.367879
Using the F-test method
>>> pg.friedman(df, method="f")
Source W ddof1 ddof2 F p-unc
Friedman Within 0.083333 1.833333 20.166667 1.0 0.378959
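The Q and F statistics above can be recovered from Kendall's W (a minimal sketch mirroring
the internal computation, with the rounded values from the wide-format example above):
>>> n, k, W = 12, 3, 0.083333
>>> round(n * (k - 1) * W, 2)
2.0
>>> round(W * (n - 1) / (1 - W), 2)
1.0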
"""
# Convert from wide to long-format, if needed
if all([v is None for v in [dv, within, subject]]):
assert isinstance(data, pd.DataFrame)
data = data._get_numeric_data().dropna() # Listwise deletion of missing values
assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
assert data.shape[1] > 1, "Data must contain at least two columns."
data["Subj"] = np.arange(data.shape[0])
data = data.melt(id_vars="Subj", var_name="Within", value_name="DV")
subject, within, dv = "Subj", "Within", "DV"
# Check dataframe
data = _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects="within")
assert not data[within].isnull().any(), "Cannot have missing values in `within`."
assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."
# Pivot the table to a wide-format dataframe. This has several effects:
# 1) Force missing values to be explicit (a NaN cell is created)
# 2) Automatic collapsing to the mean if multiple within factors are present
# 3) If using dropna, remove rows with missing values (listwise deletion).
# The latter is the same behavior as JASP (= strict complete-case analysis).
data_piv = data.pivot_table(index=subject, columns=within, values=dv, observed=True)
data_piv = data_piv.dropna()
# Extract data in numpy array and calculate ranks
X = data_piv.to_numpy()
n, k = X.shape
ranked = scipy.stats.rankdata(X, axis=1)
ssbn = (ranked.sum(axis=0) ** 2).sum()
# Correction for ties
ties = 0
for i in range(n):
replist, repnum = scipy.stats.find_repeats(X[i])
for t in repnum:
ties += t * (t * t - 1)
# Compute Kendall's W corrected for ties
W = (12 * ssbn - 3 * n**2 * k * (k + 1) ** 2) / (n**2 * k * (k - 1) * (k + 1) - n * ties)
if method == "chisq":
# Compute the Q statistic
Q = n * (k - 1) * W
# Approximate the p-value
ddof1 = k - 1
p_unc = scipy.stats.chi2.sf(Q, ddof1)
# Create output dataframe
stats = pd.DataFrame(
{"Source": within, "W": W, "ddof1": ddof1, "Q": Q, "p-unc": p_unc}, index=["Friedman"]
)
elif method == "f":
# Compute the F statistic
F = W * (n - 1) / (1 - W)
# Approximate the p-value
ddof1 = k - 1 - 2 / n
ddof2 = (n - 1) * ddof1
p_unc = scipy.stats.f.sf(F, ddof1, ddof2)
# Create output dataframe
stats = pd.DataFrame(
{"Source": within, "W": W, "ddof1": ddof1, "ddof2": ddof2, "F": F, "p-unc": p_unc},
index=["Friedman"],
)
return _postprocess_dataframe(stats)
def cochran(data=None, dv=None, within=None, subject=None):
"""Cochran Q test. A special case of the Friedman test when the dependent
variable is binary.
Parameters
----------
data : :py:class:`pandas.DataFrame`
DataFrame. Both wide and long-format dataframes are supported for this test.
dv : string
Name of column containing the dependent variable (only required if ``data`` is in
long format).
within : string
Name of column containing the within-subject factor (only required if ``data`` is in
long format). Two or more within-factors are not currently supported.
subject : string
Name of column containing the subject/rater identifier (only required if ``data`` is in
long format).
Returns
-------
stats : :py:class:`pandas.DataFrame`
* ``'Q'``: The Cochran Q statistic
* ``'p-unc'``: Uncorrected p-value
* ``'dof'``: degrees of freedom
Notes
-----
The Cochran Q test [1]_ is a non-parametric test for ANOVA with repeated
measures where the dependent variable is binary.
The Q statistic is defined as:
.. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}
where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
where :math:`r` is the number of repeated measures, :math:`i=1,...,n` where
:math:`n` is the number of observations per condition.
The p-value is then approximated using a chi-square distribution with
:math:`r-1` degrees of freedom:
.. math:: Q \\sim \\chi^2(r-1)
Data can be in wide or long format. Missing values are automatically removed using a
strict listwise approach (= complete-case analysis). In other words, any subject with one or
more missing value(s) is completely removed from the dataframe prior to running the
test.
References
----------
.. [1] Cochran, W.G., 1950. The comparison of percentages in matched
samples. Biometrika 37, 256–266.
https://doi.org/10.1093/biomet/37.3-4.256
Examples
--------
Compute the Cochran Q test for repeated measurements.
>>> from pingouin import cochran, read_dataset
>>> df = read_dataset('cochran')
>>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
Source dof Q p-unc
cochran Time 2 6.705882 0.034981
Same but using a wide-format dataframe
>>> df_wide = df.pivot_table(index="Subject", columns="Time", values="Energetic")
>>> cochran(df_wide)
Source dof Q p-unc
cochran Within 2 6.705882 0.034981
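The Q statistic and its chi-square p-value can also be reproduced from the column and row
totals of the wide-format table, following the formula in the Notes (a minimal sketch):
>>> import numpy as np
>>> from scipy.stats import chi2
>>> X = df_wide.to_numpy()
>>> r, N = X.shape[1], X.sum()
>>> col_tot, row_tot = X.sum(axis=0), X.sum(axis=1)
>>> q = (r - 1) * (r * np.sum(col_tot**2) - N**2) / (r * N - np.sum(row_tot**2))
>>> round(float(q), 6), round(float(chi2.sf(q, r - 1)), 6)
(6.705882, 0.034981)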
"""
# Convert from wide to long-format, if needed
if all([v is None for v in [dv, within, subject]]):
assert isinstance(data, pd.DataFrame)
data = data._get_numeric_data().dropna() # Listwise deletion of missing values
assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
assert data.shape[1] > 1, "Data must contain at least two columns."
data["Subj"] = np.arange(data.shape[0])
data = data.melt(id_vars="Subj", var_name="Within", value_name="DV")
subject, within, dv = "Subj", "Within", "DV"
# Check data
data = _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects="within")
assert not data[within].isnull().any(), "Cannot have missing values in `within`."
assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."
# Pivot and melt the table. This has several effects:
# 1) Force missing values to be explicit (a NaN cell is created)
# 2) Automatic collapsing to the mean if multiple within factors are present
# 3) If using dropna, remove rows with missing values (listwise deletion).
# The latter is the same behavior as JASP (= strict complete-case analysis).
data_piv = data.pivot_table(index=subject, columns=within, values=dv, observed=True)
data_piv = data_piv.dropna()
data = data_piv.melt(ignore_index=False, value_name=dv).reset_index()
# Groupby and extract size
grp = data.groupby(within, observed=True)[dv]
grp_s = data.groupby(subject, observed=True)[dv]
k = data[within].nunique()
dof = k - 1
# n = grp.count().unique()[0]
# Q statistic and p-value
q = (dof * (k * np.sum(grp.sum() ** 2) - grp.sum().sum() ** 2)) / (
k * grp.sum().sum() - np.sum(grp_s.sum() ** 2)
)
p_unc = scipy.stats.chi2.sf(q, dof)
# Create output dataframe
stats = pd.DataFrame({"Source": within, "dof": dof, "Q": q, "p-unc": p_unc}, index=["cochran"])
return _postprocess_dataframe(stats)
def harrelldavis(x, quantile=0.5, axis=-1):
"""Harrell-Davis robust estimate of the :math:`q^{th}` quantile(s) of the
data.
.. versionadded:: 0.2.9
Parameters
----------
x : array_like
Data, must be a one or two-dimensional vector.
quantile : float or array_like
Quantile or sequence of quantiles to compute, must be between 0 and 1.
Default is ``0.5``.
axis : int
Axis along which the quantile(s) are computed. Default is the last axis (-1).
Can be either 0, 1 or -1.
Returns
-------
y : float or array_like
The estimated quantile(s). If ``quantile`` is a single quantile, will
return a float, otherwise will compute each quantile separately and
return an array of floats.
Notes
-----
The Harrell-Davis method [1]_ estimates the :math:`q^{th}` quantile by a
linear combination of the order statistics. Results have been tested
against a Matlab implementation [2]_. Note that this method is also
used to measure the confidence intervals of the difference between
quantiles of two groups, as implemented in the shift function [3]_.
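A minimal sketch of the weighting scheme for the median (``quantile=0.5``): the weights are
differences of a beta CDF evaluated on a regular grid and applied to the sorted data. This
mirrors the implementation and reproduces the first example below:
>>> import numpy as np
>>> from scipy.stats import beta
>>> np.random.seed(123)
>>> x = np.sort(np.random.normal(0, 1, 100))
>>> n, q = x.size, 0.5
>>> m1, m2 = (n + 1) * q, (n + 1) * (1 - q)
>>> w = beta.cdf(np.arange(1, n + 1) / n, m1, m2) - beta.cdf(np.arange(n) / n, m1, m2)
>>> round(float((w * x).sum()), 4)
-0.0499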
See Also
--------
plot_shift
References
----------
.. [1] Frank E. Harrell, C. E. Davis, A new distribution-free quantile
estimator, Biometrika, Volume 69, Issue 3, December 1982, Pages
635–640, https://doi.org/10.1093/biomet/69.3.635
.. [2] https://github.com/GRousselet/matlab_stats/blob/master/hd.m
.. [3] Rousselet, G. A., Pernet, C. R. and Wilcox, R. R. (2017). Beyond
differences in means: robust graphical methods to compare two groups
in neuroscience. Eur J Neurosci, 46: 1738-1748.
https://doi.org/doi:10.1111/ejn.13610
Examples
--------
Estimate the 0.5 quantile (i.e. the median) of 100 observations drawn from a
normal distribution with zero mean and unit variance.
>>> import numpy as np
>>> import pingouin as pg
>>> np.random.seed(123)
>>> x = np.random.normal(0, 1, 100)
>>> round(pg.harrelldavis(x, quantile=0.5), 4)
-0.0499
Several quantiles at once
>>> pg.harrelldavis(x, quantile=[0.25, 0.5, 0.75])
array([-0.84133224, -0.04991657, 0.95897233])
On the last axis of a 2D vector (default)
>>> np.random.seed(123)
>>> x = np.random.normal(0, 1, (10, 100))
>>> pg.harrelldavis(x, quantile=[0.25, 0.5, 0.75])
array([[-0.84133224, -0.52346777, -0.81801193, -0.74611216, -0.64928321,
-0.48565262, -0.64332799, -0.8178394 , -0.70058282, -0.73088088],
[-0.04991657, 0.02932655, -0.08905073, -0.1860034 , 0.06970415,
0.15129817, 0.00430958, -0.13784786, -0.08648077, -0.14407123],
[ 0.95897233, 0.49543002, 0.57712236, 0.48620599, 0.85899005,
0.7903462 , 0.76558585, 0.62528436, 0.60421847, 0.52620286]])
On the first axis
>>> pg.harrelldavis(x, quantile=[0.5], axis=0).shape
(100,)
"""
x = np.asarray(x)
assert x.ndim <= 2, "Only 1D or 2D array are supported for this function."
assert axis in [0, 1, -1], "Axis must be 0, 1 or -1."
# Sort the input array
x = np.sort(x, axis=axis)
n = x.shape[axis]
vec = np.arange(n)
if isinstance(quantile, float):
quantile = [quantile]
y = []
for q in quantile:
# Harrell-Davis estimate of the qth quantile
m1 = (n + 1) * q
m2 = (n + 1) * (1 - q)
w = scipy.stats.beta.cdf((vec + 1) / n, m1, m2) - scipy.stats.beta.cdf((vec) / n, m1, m2)
if axis != 0:
y.append((w * x).sum(axis))
else:
y.append((w[..., None] * x).sum(axis)) # Store results
if len(y) == 1:
y = y[0] # Return a float instead of a list if n quantile is 1
else:
y = np.array(y)
return y