Source code for pingouin.nonparametric

# Author: Raphael Vallat <raphaelvallat9@gmail.com>
# Date: May 2018
import scipy
import numpy as np
import pandas as pd
from pingouin import remove_na, _check_dataframe, _postprocess_dataframe

__all__ = [
    "mad",
    "madmedianrule",
    "mwu",
    "wilcoxon",
    "kruskal",
    "friedman",
    "cochran",
    "harrelldavis",
]


def mad(a, normalize=True, axis=0):
    """
    Median Absolute Deviation (MAD) along given axis of an array.

    Parameters
    ----------
    a : array-like
        Input array.
    normalize : boolean
        If True, scale by a normalization constant :math:`c \\approx 0.67`
        to ensure consistency with the standard deviation for normally
        distributed data.
    axis : int or None, optional
        Axis along which the MAD is computed. Default is 0.
        Can also be None to compute the MAD over the entire array.

    Returns
    -------
    mad : float
        mad = median(abs(a - median(a))) / c

    See also
    --------
    madmedianrule, numpy.std

    Notes
    -----
    The `median absolute deviation
    <https://en.wikipedia.org/wiki/Median_absolute_deviation>`_ (MAD)
    computes the median over the absolute deviations from the median. It is
    a measure of dispersion similar to the standard deviation, but is more
    robust to outliers.

    SciPy 1.3 and higher includes a similar function:
    :py:func:`scipy.stats.median_abs_deviation`.

    Please note that missing values are automatically removed.

    Examples
    --------
    >>> from pingouin import mad
    >>> a = [1.2, 5.4, 3.2, 7.8, 2.5]
    >>> mad(a)
    2.965204437011204

    >>> mad(a, normalize=False)
    2.0

    2D arrays with missing values (axis handling example)

    >>> import numpy as np
    >>> np.random.seed(123)
    >>> w = np.random.normal(size=(5, 10))
    >>> w[3, 2] = np.nan
    >>> mad(w)  # Axis = 0 (default) = iterate over the columns
    array([0.60304023, 2.35057834, 0.90350696, 1.28599837, 1.16024152,
           0.38653752, 1.92564066, 1.2480913 , 0.42580373, 1.69814622])

    >>> mad(w, axis=1)  # Axis = 1 = iterate over the rows
    array([1.32639022, 1.19295036, 1.41198672, 0.78020689, 1.01531254])

    >>> mad(w, axis=None)  # Axis = None = over the entire array
    1.1607762457644006

    Compare with SciPy >= 1.3

    >>> from scipy.stats import median_abs_deviation
    >>> median_abs_deviation(w, scale='normal', axis=None, nan_policy='omit')
    1.1607762457644006
    """
    a = np.asarray(a)
    if axis is None:
        # Calculate the MAD over the entire array
        a = np.ravel(a)
        axis = 0
    c = scipy.stats.norm.ppf(3 / 4.0) if normalize else 1
    center = np.apply_over_axes(np.nanmedian, a, axis)
    return np.nanmedian((np.fabs(a - center)) / c, axis=axis)
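
# Illustrative sketch, not part of the pingouin API: the one-dimensional MAD
# written out explicitly, following the formula in the docstring above
# (mad = median(abs(a - median(a))) / c). The helper name `_mad_sketch` is
# hypothetical and used for exposition only.
def _mad_sketch(a):
    a = np.asarray(a, dtype=float)
    c = scipy.stats.norm.ppf(0.75)  # normalization constant, ~0.6745
    return np.median(np.abs(a - np.median(a))) / c
# _mad_sketch([1.2, 5.4, 3.2, 7.8, 2.5]) should match mad(a) above (~2.9652).
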
def madmedianrule(a):
    """Robust outlier detection based on the MAD-median rule.

    Parameters
    ----------
    a : array-like
        Input array. Must be one-dimensional.

    Returns
    -------
    outliers: boolean (same shape as a)
        Boolean array indicating whether each sample is an outlier (True)
        or not (False).

    See also
    --------
    mad

    Notes
    -----
    The MAD-median rule ([1]_, [2]_) declares :math:`X_i` an outlier if

    .. math::

        \\frac{\\left | X_i - M \\right |}{\\text{MAD}_{\\text{norm}}} > K,

    where :math:`M` is the median of :math:`X`,
    :math:`\\text{MAD}_{\\text{norm}}` the normalized median absolute
    deviation of :math:`X`, and :math:`K` is the square root of the .975
    quantile of a :math:`\\chi^2` distribution with one degree of freedom,
    which is roughly equal to 2.24.

    References
    ----------
    .. [1] Hall, P., Welsh, A.H., 1985. Limit theorems for the median
       deviation. Ann. Inst. Stat. Math. 37, 27–36.
       https://doi.org/10.1007/BF02481078

    .. [2] Wilcox, R. R. Introduction to Robust Estimation and Hypothesis
       Testing. (Academic Press, 2011).

    Examples
    --------
    >>> import pingouin as pg
    >>> a = [-1.09, 1., 0.28, -1.51, -0.58, 6.61, -2.43, -0.43]
    >>> pg.madmedianrule(a)
    array([False, False, False, False, False,  True, False, False])
    """
    a = np.asarray(a)
    assert a.ndim == 1, "Only 1D array / list are supported for this function."
    k = np.sqrt(scipy.stats.chi2.ppf(0.975, 1))
    return (np.fabs(a - np.median(a)) / mad(a)) > k
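
# Usage sketch, not part of the pingouin API: dropping the samples flagged by
# the MAD-median rule before further analysis. `_drop_outliers` is a
# hypothetical helper shown for exposition only.
def _drop_outliers(a):
    a = np.asarray(a, dtype=float)
    return a[~madmedianrule(a)]  # keep only the samples not flagged as outliers
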
def mwu(x, y, alternative="two-sided", **kwargs):
    """Mann-Whitney U Test (= Wilcoxon rank-sum test). It is the
    non-parametric version of the independent T-test.

    Parameters
    ----------
    x, y : array_like
        First and second set of observations. ``x`` and ``y`` must be
        independent.
    alternative : string
        Defines the alternative hypothesis, or tail of the test. Must be one
        of "two-sided" (default), "greater" or "less". See
        :py:func:`scipy.stats.mannwhitneyu` for more details.
    **kwargs : dict
        Additional keywords arguments that are passed to
        :py:func:`scipy.stats.mannwhitneyu`.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'U-val'``: U-value corresponding with sample x
        * ``'alternative'``: tail of the test
        * ``'p-val'``: p-value
        * ``'RBC'`` : rank-biserial correlation
        * ``'CLES'`` : common language effect size

    See also
    --------
    scipy.stats.mannwhitneyu, wilcoxon, ttest

    Notes
    -----
    The Mann–Whitney U test [1]_ (also called Wilcoxon rank-sum test) is a
    non-parametric test of the null hypothesis that it is equally likely that
    a randomly selected value from one sample will be less than or greater
    than a randomly selected value from a second sample. The test assumes
    that the two samples are independent. This test corrects for ties and by
    default uses a continuity correction
    (see :py:func:`scipy.stats.mannwhitneyu` for details).

    The rank biserial correlation [2]_ is the difference between the
    proportion of favorable and unfavorable evidence. Values range from -1
    to 1, with negative values indicating that `y > x`, and positive values
    indicating `x > y`.

    The common language effect size is the proportion of pairs where ``x``
    is higher than ``y``. It was first introduced by McGraw and Wong
    (1992) [3]_. Pingouin uses a brute-force version of the formula given by
    Vargha and Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    The advantages of this method are twofold. First, the brute-force
    approach pairs each observation of ``x`` to its ``y`` counterpart, and
    therefore does not require normally distributed data. Second, the
    formula takes ties into account and therefore works with ordinal data.

    When ``alternative='less'``, the CLES is set to :math:`1 - \\text{CL}`,
    which gives the proportion of pairs where ``x`` is *lower* than ``y``.

    References
    ----------
    .. [1] Mann, H. B., & Whitney, D. R. (1947). On a test of whether one of
       two random variables is stochastically larger than the other.
       The annals of mathematical statistics, 50-60.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
       teaching nonparametric correlation. Comprehensive Psychology, 3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
       statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
       the "CL" Common Language Effect Size Statistics of McGraw and Wong.
       Journal of Educational and Behavioral Statistics, 25(2), 101–132.
       https://doi.org/10.2307/1165329

    Examples
    --------
    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.uniform(low=0, high=1, size=20)
    >>> y = np.random.uniform(low=0.2, high=1.2, size=20)
    >>> pg.mwu(x, y, alternative='two-sided')
         U-val alternative    p-val    RBC    CLES
    MWU   97.0   two-sided  0.00556 -0.515  0.2425

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.mannwhitneyu(x, y, use_continuity=True, alternative='two-sided')
    MannwhitneyuResult(statistic=97.0, pvalue=0.0055604599321374135)

    One-sided test

    >>> pg.mwu(x, y, alternative='greater')
         U-val alternative     p-val    RBC    CLES
    MWU   97.0     greater  0.997442 -0.515  0.2425

    >>> pg.mwu(x, y, alternative='less')
         U-val alternative    p-val    RBC    CLES
    MWU   97.0        less  0.00278 -0.515  0.7575

    Passing keyword arguments to :py:func:`scipy.stats.mannwhitneyu`:

    >>> pg.mwu(x, y, alternative='two-sided', method='exact')
         U-val alternative     p-val    RBC    CLES
    MWU   97.0   two-sided  0.004681 -0.515  0.2425

    Reversing the order of `x` and `y`.

    >>> pg.mwu(y, x)
         U-val alternative    p-val    RBC    CLES
    MWU  303.0   two-sided  0.00556  0.515  0.7575
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Remove NA
    x, y = remove_na(x, y, paired=False)

    # Check tails
    assert alternative in [
        "two-sided",
        "greater",
        "less",
    ], "Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
    if "tail" in kwargs:
        raise ValueError(
            "Since Pingouin 0.4.0, the 'tail' argument has been renamed to 'alternative'."
        )

    uval_x, pval = scipy.stats.mannwhitneyu(x, y, alternative=alternative, **kwargs)

    # Effect size 1: Common Language Effect Size
    # CLES is tail-specific and calculated according to the formula given in
    # Vargha and Delaney 2000 which works with ordinal data.
    diff = x[:, None] - y
    # cles = max((diff < 0).sum(), (diff > 0).sum()) / diff.size
    # Tail = 'greater', with ties set to 0.5
    # Note that tail = 'two-sided' gives same output as tail = 'greater'
    cles = np.where(diff == 0, 0.5, diff > 0).mean()
    cles = 1 - cles if alternative == "less" else cles

    # Effect size 2: rank biserial correlation (Wendt 1972)
    uval_y = x.shape[0] * y.shape[0] - uval_x
    rbc = 1 - (2 * uval_y) / diff.size  # diff.size = x.size * y.size

    # Fill output DataFrame
    stats = pd.DataFrame(
        {"U-val": uval_x, "alternative": alternative, "p-val": pval, "RBC": rbc, "CLES": cles},
        index=["MWU"],
    )
    return _postprocess_dataframe(stats)
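
# Worked sketch, not part of the pingouin API: the brute-force common
# language effect size CL = P(X > Y) + 0.5 * P(X = Y), computed over all
# n_x * n_y pairs exactly as inside mwu() above. `_cles_sketch` is a
# hypothetical name used for exposition only.
def _cles_sketch(x, y):
    x, y = np.asarray(x), np.asarray(y)
    diff = x[:, None] - y  # all pairwise differences, shape (n_x, n_y)
    return ((diff > 0) + 0.5 * (diff == 0)).mean()  # ties count as 0.5
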
def wilcoxon(x, y=None, alternative="two-sided", **kwargs):
    """
    Wilcoxon signed-rank test. It is the non-parametric version of the
    paired T-test.

    Parameters
    ----------
    x : array_like
        Either the first set of measurements
        (in which case y is the second set of measurements),
        or the differences between two sets of measurements
        (in which case y is not to be specified.) Must be one-dimensional.
    y : array_like
        Either the second set of measurements (if x is the first set of
        measurements), or not specified (if x is the differences between
        two sets of measurements.) Must be one-dimensional.
    alternative : string
        Defines the alternative hypothesis, or tail of the test. Must be one
        of "two-sided" (default), "greater" or "less". See
        :py:func:`scipy.stats.wilcoxon` for more details.
    **kwargs : dict
        Additional keywords arguments that are passed to
        :py:func:`scipy.stats.wilcoxon`.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W-val'``: W-value
        * ``'alternative'``: tail of the test
        * ``'p-val'``: p-value
        * ``'RBC'`` : matched pairs rank-biserial correlation (effect size)
        * ``'CLES'`` : common language effect size

    See also
    --------
    scipy.stats.wilcoxon, mwu

    Notes
    -----
    The Wilcoxon signed-rank test [1]_ tests the null hypothesis that two
    related paired samples come from the same distribution. In particular,
    it tests whether the distribution of the differences x - y is symmetric
    about zero.

    .. important:: Pingouin automatically applies a continuity correction.
        Therefore, the p-values will be slightly different than
        :py:func:`scipy.stats.wilcoxon` unless ``correction=True`` is
        explicitly passed to the latter.

    In addition to the test statistic and p-values, Pingouin also computes
    two measures of effect size. The matched pairs rank biserial
    correlation [2]_ is the simple difference between the proportion of
    favorable and unfavorable evidence; in the case of the Wilcoxon
    signed-rank test, the evidence consists of rank sums (Kerby 2014):

    .. math:: r = f - u

    The common language effect size is the proportion of pairs where ``x``
    is higher than ``y``. It was first introduced by McGraw and Wong
    (1992) [3]_. Pingouin uses a brute-force version of the formula given by
    Vargha and Delaney 2000 [4]_:

    .. math:: \\text{CL} = P(X > Y) + .5 \\times P(X = Y)

    The advantages of this method are twofold. First, the brute-force
    approach pairs each observation of ``x`` to its ``y`` counterpart, and
    therefore does not require normally distributed data. Second, the
    formula takes ties into account and therefore works with ordinal data.

    When ``alternative='less'``, the CLES is set to :math:`1 - \\text{CL}`,
    which gives the proportion of pairs where ``x`` is *lower* than ``y``.

    References
    ----------
    .. [1] Wilcoxon, F. (1945). Individual comparisons by ranking methods.
       Biometrics bulletin, 1(6), 80-83.

    .. [2] Kerby, D. S. (2014). The simple difference formula: An approach to
       teaching nonparametric correlation. Comprehensive Psychology, 3, 11-IT.

    .. [3] McGraw, K. O., & Wong, S. P. (1992). A common language effect size
       statistic. Psychological bulletin, 111(2), 361.

    .. [4] Vargha, A., & Delaney, H. D. (2000). A Critique and Improvement of
       the "CL" Common Language Effect Size Statistics of McGraw and Wong.
       Journal of Educational and Behavioral Statistics, 25(2), 101–132.
       https://doi.org/10.2307/1165329

    Examples
    --------
    Wilcoxon test on two related samples.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> x = np.array([20, 22, 19, 20, 22, 18, 24, 20, 19, 24, 26, 13])
    >>> y = np.array([38, 37, 33, 29, 14, 12, 20, 22, 17, 25, 26, 16])
    >>> pg.wilcoxon(x, y, alternative='two-sided')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5   two-sided  0.285765 -0.378788  0.395833

    Same but using pre-computed differences. However, the CLES effect size
    cannot be computed as it requires the raw data.

    >>> pg.wilcoxon(x - y)
              W-val alternative     p-val       RBC  CLES
    Wilcoxon   20.5   two-sided  0.285765 -0.378788   NaN

    Compare with SciPy

    >>> import scipy
    >>> scipy.stats.wilcoxon(x, y)
    WilcoxonResult(statistic=20.5, pvalue=0.2661660677806492)

    The p-value is not exactly the same as Pingouin's because Pingouin
    automatically applies a continuity correction. Disabling it gives the
    same p-value as scipy:

    >>> pg.wilcoxon(x, y, alternative='two-sided', correction=False)
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5   two-sided  0.266166 -0.378788  0.395833

    One-sided test

    >>> pg.wilcoxon(x, y, alternative='greater')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5     greater  0.876244 -0.378788  0.395833

    >>> pg.wilcoxon(x, y, alternative='less')
              W-val alternative     p-val       RBC      CLES
    Wilcoxon   20.5        less  0.142883 -0.378788  0.604167
    """
    x = np.asarray(x)
    if y is not None:
        y = np.asarray(y)
        x, y = remove_na(x, y, paired=True)  # Remove NA
    else:
        x = x[~np.isnan(x)]

    # Check tails
    assert alternative in [
        "two-sided",
        "greater",
        "less",
    ], "Alternative must be one of 'two-sided' (default), 'greater' or 'less'."
    if "tail" in kwargs:
        raise ValueError(
            "Since Pingouin 0.4.0, the 'tail' argument has been renamed to 'alternative'."
        )

    # Compute test
    if "correction" not in kwargs:
        kwargs["correction"] = True
    wval, pval = scipy.stats.wilcoxon(x=x, y=y, alternative=alternative, **kwargs)

    # Effect size 1: Common Language Effect Size
    # Since Pingouin v0.3.5, CLES is tail-specific and calculated
    # according to the formula given in Vargha and Delaney 2000 which
    # works with ordinal data.
    if y is not None:
        diff = x[:, None] - y
        # cles = max((diff < 0).sum(), (diff > 0).sum()) / diff.size
        # alternative = 'greater', with ties set to 0.5
        # Note that alternative = 'two-sided' gives same output as alternative = 'greater'
        cles = np.where(diff == 0, 0.5, diff > 0).mean()
        cles = 1 - cles if alternative == "less" else cles
    else:
        # CLES cannot be computed if y is None
        cles = np.nan

    # Effect size 2: matched-pairs rank biserial correlation (Kerby 2014)
    if y is not None:
        d = x - y
        d = d[d != 0]
    else:
        d = x[x != 0]
    r = scipy.stats.rankdata(abs(d))
    rsum = r.sum()
    r_plus = np.sum((d > 0) * r)
    r_minus = np.sum((d < 0) * r)
    rbc = r_plus / rsum - r_minus / rsum

    # Fill output DataFrame
    stats = pd.DataFrame(
        {"W-val": wval, "alternative": alternative, "p-val": pval, "RBC": rbc, "CLES": cles},
        index=["Wilcoxon"],
    )
    return _postprocess_dataframe(stats)
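
# Worked sketch, not part of the pingouin API: the matched-pairs rank
# biserial correlation r = f - u (Kerby 2014), i.e. the proportion of the
# total rank sum carried by positive differences minus the proportion
# carried by negative differences. `_rbc_sketch` is a hypothetical name
# used for exposition only.
def _rbc_sketch(x, y):
    d = np.asarray(x) - np.asarray(y)
    d = d[d != 0]  # zero differences are discarded, as in the test itself
    r = scipy.stats.rankdata(np.abs(d))
    return (r[d > 0].sum() - r[d < 0].sum()) / r.sum()
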
def kruskal(data=None, dv=None, between=None, detailed=False):
    """Kruskal-Wallis H-test for independent samples.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame
    dv : string
        Name of column containing the dependent variable.
    between : string
        Name of column containing the between factor.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'H'``: The Kruskal-Wallis H statistic, corrected for ties
        * ``'p-unc'``: Uncorrected p-value
        * ``'ddof1'``: degrees of freedom

    Notes
    -----
    The Kruskal-Wallis H-test tests the null hypothesis that the population
    medians of all of the groups are equal. It is a non-parametric version of
    ANOVA. The test works on 2 or more independent samples, which may have
    different sizes.

    Due to the assumption that H has a chi square distribution, the number of
    samples in each group must not be too small. A typical rule is that each
    sample must have at least 5 measurements.

    NaN values are automatically removed.

    Examples
    --------
    Compute the Kruskal-Wallis H-test for independent samples.

    >>> from pingouin import kruskal, read_dataset
    >>> df = read_dataset('anova')
    >>> kruskal(data=df, dv='Pain threshold', between='Hair color')
                 Source  ddof1         H     p-unc
    Kruskal  Hair color      3  10.58863  0.014172
    """
    # Check data
    data = _check_dataframe(dv=dv, between=between, data=data, effects="between")

    # Remove NaN values
    data = data[[dv, between]].dropna()

    # Reset index (avoid duplicate axis error)
    data = data.reset_index(drop=True)

    # Extract number of groups and total sample size
    n_groups = data[between].nunique()
    n = data[dv].size

    # Rank data, dealing with ties appropriately
    data["rank"] = scipy.stats.rankdata(data[dv])

    # Find the total of rank per groups
    grp = data.groupby(between, observed=True)["rank"]
    sum_rk_grp = grp.sum().to_numpy()
    n_per_grp = grp.count().to_numpy()

    # Calculate chi-square statistic (H)
    H = (12 / (n * (n + 1)) * np.sum(sum_rk_grp**2 / n_per_grp)) - 3 * (n + 1)

    # Correct for ties
    H /= scipy.stats.tiecorrect(data["rank"].to_numpy())

    # Calculate DOF and p-value
    ddof1 = n_groups - 1
    p_unc = scipy.stats.chi2.sf(H, ddof1)

    # Create output dataframe
    stats = pd.DataFrame(
        {
            "Source": between,
            "ddof1": ddof1,
            "H": H,
            "p-unc": p_unc,
        },
        index=["Kruskal"],
    )
    return _postprocess_dataframe(stats)
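
# Worked sketch, not part of the pingouin API: the uncorrected H statistic
# H = 12 / (n * (n + 1)) * sum(R_j**2 / n_j) - 3 * (n + 1), computed from a
# list of group samples ranked together (the tie correction applied in
# kruskal() above is omitted for brevity). `_kruskal_h_sketch` is a
# hypothetical name used for exposition only.
def _kruskal_h_sketch(*groups):
    all_vals = np.concatenate(groups)
    n = all_vals.size
    ranks = scipy.stats.rankdata(all_vals)  # ranks over the pooled sample
    h, start = 0.0, 0
    for g in groups:
        rk = ranks[start : start + len(g)]  # ranks belonging to this group
        h += rk.sum() ** 2 / len(g)
        start += len(g)
    return 12 / (n * (n + 1)) * h - 3 * (n + 1)
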
def friedman(data=None, dv=None, within=None, subject=None, method="chisq"):
    """Friedman test for repeated measurements.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Both wide and long-format dataframe are supported for
        this test.
    dv : string
        Name of column containing the dependent variable (only required if
        ``data`` is in long format).
    within : string
        Name of column containing the within-subject factor (only required
        if ``data`` is in long format). Two or more within-factor are not
        currently supported.
    subject : string
        Name of column containing the subject/rater identifier (only
        required if ``data`` is in long format).
    method : string
        Statistical test to perform. Must be ``'chisq'`` (chi-square test)
        or ``'f'`` (F test). See notes below for explanation.

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'W'``: Kendall's coefficient of concordance, corrected for ties

        If ``method='chisq'``

        * ``'Q'``: The Friedman chi-square statistic, corrected for ties
        * ``'ddof1'``: degrees of freedom
        * ``'p-unc'``: Uncorrected p-value of the chi squared test

        If ``method='f'``

        * ``'F'``: The Friedman F statistic, corrected for ties
        * ``'ddof1'``: degrees of freedom of the numerator
        * ``'ddof2'``: degrees of freedom of the denominator
        * ``'p-unc'``: Uncorrected p-value of the F test

    Notes
    -----
    The Friedman test is used for non-parametric (rank-based) one-way
    repeated measures ANOVA.

    It is equivalent to the test of significance of Kendall's coefficient of
    concordance (Kendall's W). Most commonly a Q statistic, which has an
    asymptotic chi-squared distribution, is computed and used for testing.
    However, the chi-squared test tends to be overly conservative for small
    numbers of samples and/or repeated measures, in which case an F-test is
    more appropriate [1]_.

    Data can be in wide or long format. Missing values are automatically
    removed using a strict listwise approach (= complete-case analysis). In
    other words, any subject with one or more missing value(s) is completely
    removed from the dataframe prior to running the test.

    References
    ----------
    .. [1] Marozzi, M. (2014). Testing for concordance between several
       criteria. Journal of Statistical Computation and Simulation, 84(9),
       1843–1850. https://doi.org/10.1080/00949655.2013.766189

    .. [2] https://www.real-statistics.com/anova-repeated-measures/friedman-test/

    Examples
    --------
    Compute the Friedman test for repeated measurements, using a wide-format
    dataframe

    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pd.DataFrame({
    ...     'white': {0: 10, 1: 8, 2: 7, 3: 9, 4: 7, 5: 4, 6: 5, 7: 6, 8: 5, 9: 10, 10: 4, 11: 7},
    ...     'red': {0: 7, 1: 5, 2: 8, 3: 6, 4: 5, 5: 7, 6: 9, 7: 6, 8: 4, 9: 6, 10: 7, 11: 3},
    ...     'rose': {0: 8, 1: 5, 2: 6, 3: 4, 4: 7, 5: 5, 6: 3, 7: 7, 8: 6, 9: 4, 10: 4, 11: 3}})
    >>> pg.friedman(df)
              Source         W  ddof1    Q     p-unc
    Friedman  Within  0.083333      2  2.0  0.367879

    Compare with SciPy

    >>> from scipy.stats import friedmanchisquare
    >>> friedmanchisquare(*df.to_numpy().T)
    FriedmanchisquareResult(statistic=1.9999999999999893, pvalue=0.3678794411714444)

    Using a long-format dataframe

    >>> df_long = df.melt(ignore_index=False).reset_index()
    >>> pg.friedman(data=df_long, dv="value", within="variable", subject="index")
                Source         W  ddof1    Q     p-unc
    Friedman  variable  0.083333      2  2.0  0.367879

    Using the F-test method

    >>> pg.friedman(df, method="f")
              Source         W     ddof1      ddof2    F     p-unc
    Friedman  Within  0.083333  1.833333  20.166667  1.0  0.378959
    """
    # Convert from wide to long-format, if needed
    if all([v is None for v in [dv, within, subject]]):
        assert isinstance(data, pd.DataFrame)
        data = data._get_numeric_data().dropna()  # Listwise deletion of missing values
        assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
        assert data.shape[1] > 1, "Data must contain at least two columns."
        data["Subj"] = np.arange(data.shape[0])
        data = data.melt(id_vars="Subj", var_name="Within", value_name="DV")
        subject, within, dv = "Subj", "Within", "DV"

    # Check dataframe
    data = _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects="within")
    assert not data[within].isnull().any(), "Cannot have missing values in `within`."
    assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."

    # Pivot the table to a wide-format dataframe. This has several effects:
    # 1) Force missing values to be explicit (a NaN cell is created)
    # 2) Automatic collapsing to the mean if multiple within factors are present
    # 3) If using dropna, remove rows with missing values (listwise deletion).
    #    The latter is the same behavior as JASP (= strict complete-case analysis).
    data_piv = data.pivot_table(index=subject, columns=within, values=dv, observed=True)
    data_piv = data_piv.dropna()

    # Extract data in numpy array and calculate ranks
    X = data_piv.to_numpy()
    n, k = X.shape
    ranked = scipy.stats.rankdata(X, axis=1)
    ssbn = (ranked.sum(axis=0) ** 2).sum()

    # Correction for ties
    ties = 0
    for i in range(n):
        replist, repnum = scipy.stats.find_repeats(X[i])
        for t in repnum:
            ties += t * (t * t - 1)

    # Compute Kendall's W corrected for ties
    W = (12 * ssbn - 3 * n**2 * k * (k + 1) ** 2) / (n**2 * k * (k - 1) * (k + 1) - n * ties)

    if method == "chisq":
        # Compute the Q statistic
        Q = n * (k - 1) * W
        # Approximate the p-value
        ddof1 = k - 1
        p_unc = scipy.stats.chi2.sf(Q, ddof1)
        # Create output dataframe
        stats = pd.DataFrame(
            {"Source": within, "W": W, "ddof1": ddof1, "Q": Q, "p-unc": p_unc}, index=["Friedman"]
        )
    elif method == "f":
        # Compute the F statistic
        F = W * (n - 1) / (1 - W)
        # Approximate the p-value
        ddof1 = k - 1 - 2 / n
        ddof2 = (n - 1) * ddof1
        p_unc = scipy.stats.f.sf(F, ddof1, ddof2)
        # Create output dataframe
        stats = pd.DataFrame(
            {"Source": within, "W": W, "ddof1": ddof1, "ddof2": ddof2, "F": F, "p-unc": p_unc},
            index=["Friedman"],
        )
    return _postprocess_dataframe(stats)
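
# Worked sketch, not part of the pingouin API: how the two test statistics
# follow from Kendall's W for n subjects and k conditions, per the Notes
# above: Q = n * (k - 1) * W (chi-squared approximation) and
# F = W * (n - 1) / (1 - W) (F approximation, Marozzi 2014).
# `_friedman_stats_sketch` is a hypothetical name used for exposition only.
def _friedman_stats_sketch(W, n, k):
    Q = n * (k - 1) * W
    F = W * (n - 1) / (1 - W)  # undefined when W == 1 (perfect concordance)
    return Q, F  # p-values come from chi2(k - 1) and F(k - 1 - 2 / n, (n - 1) * (k - 1 - 2 / n))
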
def cochran(data=None, dv=None, within=None, subject=None):
    """Cochran Q test. A special case of the Friedman test when the
    dependent variable is binary.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        DataFrame. Both wide and long-format dataframe are supported for
        this test.
    dv : string
        Name of column containing the dependent variable (only required if
        ``data`` is in long format).
    within : string
        Name of column containing the within-subject factor (only required
        if ``data`` is in long format). Two or more within-factor are not
        currently supported.
    subject : string
        Name of column containing the subject/rater identifier (only
        required if ``data`` is in long format).

    Returns
    -------
    stats : :py:class:`pandas.DataFrame`

        * ``'Q'``: The Cochran Q statistic
        * ``'p-unc'``: Uncorrected p-value
        * ``'dof'``: degrees of freedom

    Notes
    -----
    The Cochran Q test [1]_ is a non-parametric test for ANOVA with repeated
    measures where the dependent variable is binary.

    The Q statistic is defined as:

    .. math:: Q = \\frac{(r-1)(r\\sum_j^rx_j^2-N^2)}{rN-\\sum_i^nx_i^2}

    where :math:`N` is the total sum of all observations, :math:`j=1,...,r`
    where :math:`r` is the number of repeated measures, :math:`i=1,...,n`
    where :math:`n` is the number of observations per condition.

    The p-value is then approximated using a chi-square distribution with
    :math:`r-1` degrees of freedom:

    .. math:: Q \\sim \\chi^2(r-1)

    Data can be in wide or long format. Missing values are automatically
    removed using a strict listwise approach (= complete-case analysis). In
    other words, any subject with one or more missing value(s) is completely
    removed from the dataframe prior to running the test.

    References
    ----------
    .. [1] Cochran, W.G., 1950. The comparison of percentages in matched
       samples. Biometrika 37, 256–266.
       https://doi.org/10.1093/biomet/37.3-4.256

    Examples
    --------
    Compute the Cochran Q test for repeated measurements.

    >>> from pingouin import cochran, read_dataset
    >>> df = read_dataset('cochran')
    >>> cochran(data=df, dv='Energetic', within='Time', subject='Subject')
            Source  dof         Q     p-unc
    cochran   Time    2  6.705882  0.034981

    Same but using a wide-format dataframe

    >>> df_wide = df.pivot_table(index="Subject", columns="Time", values="Energetic")
    >>> cochran(df_wide)
            Source  dof         Q     p-unc
    cochran Within    2  6.705882  0.034981
    """
    # Convert from wide to long-format, if needed
    if all([v is None for v in [dv, within, subject]]):
        assert isinstance(data, pd.DataFrame)
        data = data._get_numeric_data().dropna()  # Listwise deletion of missing values
        assert data.shape[0] > 2, "Data must have at least 3 non-missing rows."
        assert data.shape[1] > 1, "Data must contain at least two columns."
        data["Subj"] = np.arange(data.shape[0])
        data = data.melt(id_vars="Subj", var_name="Within", value_name="DV")
        subject, within, dv = "Subj", "Within", "DV"

    # Check data
    data = _check_dataframe(dv=dv, within=within, data=data, subject=subject, effects="within")
    assert not data[within].isnull().any(), "Cannot have missing values in `within`."
    assert not data[subject].isnull().any(), "Cannot have missing values in `subject`."

    # Pivot and melt the table. This has several effects:
    # 1) Force missing values to be explicit (a NaN cell is created)
    # 2) Automatic collapsing to the mean if multiple within factors are present
    # 3) If using dropna, remove rows with missing values (listwise deletion).
    #    The latter is the same behavior as JASP (= strict complete-case analysis).
    data_piv = data.pivot_table(index=subject, columns=within, values=dv, observed=True)
    data_piv = data_piv.dropna()
    data = data_piv.melt(ignore_index=False, value_name=dv).reset_index()

    # Groupby and extract size
    grp = data.groupby(within, observed=True)[dv]
    grp_s = data.groupby(subject, observed=True)[dv]
    k = data[within].nunique()
    dof = k - 1
    # n = grp.count().unique()[0]

    # Q statistic and p-value
    q = (dof * (k * np.sum(grp.sum() ** 2) - grp.sum().sum() ** 2)) / (
        k * grp.sum().sum() - np.sum(grp_s.sum() ** 2)
    )
    p_unc = scipy.stats.chi2.sf(q, dof)

    # Create output dataframe
    stats = pd.DataFrame({"Source": within, "dof": dof, "Q": q, "p-unc": p_unc}, index=["cochran"])
    return _postprocess_dataframe(stats)
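
# Worked sketch, not part of the pingouin API: Cochran's Q computed directly
# from a binary (n subjects x r conditions) array, following the formula in
# the docstring above. `_cochran_q_sketch` is a hypothetical name used for
# exposition only.
def _cochran_q_sketch(X):
    X = np.asarray(X)  # binary 0/1 data, shape (n, r)
    r = X.shape[1]
    col_sums = X.sum(axis=0)  # x_j, per-condition totals
    row_sums = X.sum(axis=1)  # x_i, per-subject totals
    N = X.sum()  # total sum of all observations
    return (r - 1) * (r * np.sum(col_sums**2) - N**2) / (r * N - np.sum(row_sums**2))
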
def harrelldavis(x, quantile=0.5, axis=-1):
    """Harrell-Davis robust estimate of the :math:`q^{th}` quantile(s) of the
    data.

    .. versionadded:: 0.2.9

    Parameters
    ----------
    x : array_like
        Data, must be one- or two-dimensional.
    quantile : float or array_like
        Quantile or sequence of quantiles to compute, must be between 0 and
        1. Default is ``0.5``.
    axis : int
        Axis along which the quantile(s) is/are computed. Default is the last
        axis (-1). Can be either 0, 1 or -1.

    Returns
    -------
    y : float or array_like
        The estimated quantile(s). If ``quantile`` is a single quantile, will
        return a float, otherwise will compute each quantile separately and
        return an array of floats.

    Notes
    -----
    The Harrell-Davis method [1]_ estimates the :math:`q^{th}` quantile by a
    linear combination of the order statistics. Results have been tested
    against a Matlab implementation [2]_. Note that this method is also used
    to measure the confidence intervals of the difference between quantiles
    of two groups, as implemented in the shift function [3]_.

    See Also
    --------
    plot_shift

    References
    ----------
    .. [1] Frank E. Harrell, C. E. Davis, A new distribution-free quantile
       estimator, Biometrika, Volume 69, Issue 3, December 1982, Pages
       635–640, https://doi.org/10.1093/biomet/69.3.635

    .. [2] https://github.com/GRousselet/matlab_stats/blob/master/hd.m

    .. [3] Rousselet, G. A., Pernet, C. R. and Wilcox, R. R. (2017). Beyond
       differences in means: robust graphical methods to compare two groups
       in neuroscience. Eur J Neurosci, 46: 1738-1748.
       https://doi.org/10.1111/ejn.13610

    Examples
    --------
    Estimate the 0.5 quantile (i.e. median) of 100 observation picked from a
    normal distribution with zero mean and unit variance.

    >>> import numpy as np
    >>> import pingouin as pg
    >>> np.random.seed(123)
    >>> x = np.random.normal(0, 1, 100)
    >>> round(pg.harrelldavis(x, quantile=0.5), 4)
    -0.0499

    Several quantiles at once

    >>> pg.harrelldavis(x, quantile=[0.25, 0.5, 0.75])
    array([-0.84133224, -0.04991657,  0.95897233])

    On the last axis of a 2D vector (default)

    >>> np.random.seed(123)
    >>> x = np.random.normal(0, 1, (10, 100))
    >>> pg.harrelldavis(x, quantile=[0.25, 0.5, 0.75])
    array([[-0.84133224, -0.52346777, -0.81801193, -0.74611216, -0.64928321,
            -0.48565262, -0.64332799, -0.8178394 , -0.70058282, -0.73088088],
           [-0.04991657,  0.02932655, -0.08905073, -0.1860034 ,  0.06970415,
             0.15129817,  0.00430958, -0.13784786, -0.08648077, -0.14407123],
           [ 0.95897233,  0.49543002,  0.57712236,  0.48620599,  0.85899005,
             0.7903462 ,  0.76558585,  0.62528436,  0.60421847,  0.52620286]])

    On the first axis

    >>> pg.harrelldavis(x, quantile=[0.5], axis=0).shape
    (100,)
    """
    x = np.asarray(x)
    assert x.ndim <= 2, "Only 1D or 2D array are supported for this function."
    assert axis in [0, 1, -1], "Axis must be 0, 1 or -1."

    # Sort the input array
    x = np.sort(x, axis=axis)

    n = x.shape[axis]
    vec = np.arange(n)
    if isinstance(quantile, float):
        quantile = [quantile]

    y = []
    for q in quantile:
        # Harrell-Davis estimate of the qth quantile
        m1 = (n + 1) * q
        m2 = (n + 1) * (1 - q)
        w = scipy.stats.beta.cdf((vec + 1) / n, m1, m2) - scipy.stats.beta.cdf((vec) / n, m1, m2)
        if axis != 0:
            y.append((w * x).sum(axis))
        else:
            y.append((w[..., None] * x).sum(axis))

    # Store results
    if len(y) == 1:
        y = y[0]  # Return a float instead of a list if only one quantile was requested
    else:
        y = np.array(y)
    return y
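
# Worked sketch, not part of the pingouin API: the Harrell-Davis weights for
# the q-th quantile of a sorted sample of size n are increments of a
# Beta((n + 1) * q, (n + 1) * (1 - q)) CDF evaluated on the grid 0/n, ...,
# n/n, as in the loop above. `_hd_weights_sketch` is a hypothetical name
# used for exposition only.
def _hd_weights_sketch(n, q):
    m1, m2 = (n + 1) * q, (n + 1) * (1 - q)
    grid = np.arange(n + 1) / n  # bin edges 0/n, 1/n, ..., n/n
    return np.diff(scipy.stats.beta.cdf(grid, m1, m2))  # n weights summing to 1
# The estimate for a 1D sample x is then (_hd_weights_sketch(len(x), q) * np.sort(x)).sum().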