Source code for pingouin.contingency

# Date: May 2019
import warnings
import numpy as np
import pandas as pd

from scipy.stats.contingency import expected_freq
from scipy.stats import power_divergence, binom, chi2 as sp_chi2

from pingouin import power_chi2, _postprocess_dataframe


__all__ = ["chi2_independence", "chi2_mcnemar", "dichotomous_crosstab"]


###############################################################################
# CHI-SQUARED TESTS
###############################################################################


[docs] def chi2_independence(data, x, y, correction=True): """ Chi-squared independence tests between two categorical variables. The test is computed for different values of :math:`\\lambda`: 1, 2/3, 0, -1/2, -1 and -2 (Cressie and Read, 1984). Parameters ---------- data : :py:class:`pandas.DataFrame` The dataframe containing the ocurrences for the test. x, y : string The variables names for the Chi-squared test. Must be names of columns in ``data``. correction : bool Whether to apply Yates' correction when the degree of freedom of the observed contingency table is 1 (Yates 1934). Returns ------- expected : :py:class:`pandas.DataFrame` The expected contingency table of frequencies. observed : :py:class:`pandas.DataFrame` The (corrected or not) observed contingency table of frequencies. stats : :py:class:`pandas.DataFrame` The test summary, containing four columns: * ``'test'``: The statistic name * ``'lambda'``: The :math:`\\lambda` value used for the power\ divergence statistic * ``'chi2'``: The test statistic * ``'pval'``: The p-value of the test * ``'cramer'``: The Cramer's V effect size * ``'power'``: The statistical power of the test Notes ----- From Wikipedia: *The chi-squared test is used to determine whether there is a significant difference between the expected frequencies and the observed frequencies in one or more categories.* As application examples, this test can be used to *i*) evaluate the quality of a categorical variable in a classification problem or to *ii*) check the similarity between two categorical variables. In the first example, a good categorical predictor and the class column should present high :math:`\\chi^2` and low p-value. In the second example, similar categorical variables should present low :math:`\\chi^2` and high p-value. This function is a wrapper around the :py:func:`scipy.stats.power_divergence` function. .. warning :: As a general guideline for the consistency of this test, the observed and the expected contingency tables should not have cells with frequencies lower than 5. References ---------- * Cressie, N., & Read, T. R. (1984). Multinomial goodness‐of‐fit tests. Journal of the Royal Statistical Society: Series B (Methodological), 46(3), 440-464. * Yates, F. (1934). Contingency Tables Involving Small Numbers and the :math:`\\chi^2` Test. Supplement to the Journal of the Royal Statistical Society, 1, 217-235. Examples -------- Let's see if gender is a good categorical predictor for the presence of heart disease. >>> import pingouin as pg >>> data = pg.read_dataset('chi2_independence') >>> data['sex'].value_counts(ascending=True) sex 0 96 1 207 Name: count, dtype: int64 If gender is not a good predictor for heart disease, we should expect the same 96:207 ratio across the target classes. >>> expected, observed, stats = pg.chi2_independence(data, x='sex', ... y='target') >>> expected target 0 1 sex 0 43.722772 52.277228 1 94.277228 112.722772 Let's see what the data tells us. >>> observed target 0 1 sex 0 24.5 71.5 1 113.5 93.5 The proportion is lower on the class 0 and higher on the class 1. The tests should be sensitive to this difference. >>> stats.round(3) test lambda chi2 dof pval cramer power 0 pearson 1.000 22.717 1.0 0.0 0.274 0.997 1 cressie-read 0.667 22.931 1.0 0.0 0.275 0.998 2 log-likelihood 0.000 23.557 1.0 0.0 0.279 0.998 3 freeman-tukey -0.500 24.220 1.0 0.0 0.283 0.998 4 mod-log-likelihood -1.000 25.071 1.0 0.0 0.288 0.999 5 neyman -2.000 27.458 1.0 0.0 0.301 0.999 Very low p-values indeed. The gender qualifies as a good predictor for the presence of heart disease on this dataset. """ # Python code inspired by SciPy's chi2_contingency assert isinstance(data, pd.DataFrame), "data must be a pandas DataFrame." assert isinstance(x, (str, int)), "x must be a string or int." assert isinstance(y, (str, int)), "y must be a string or int." assert all(col in data.columns for col in (x, y)), "columns are not in dataframe." assert isinstance(correction, bool), "correction must be a boolean." observed = pd.crosstab(data[x], data[y]) if observed.size == 0: raise ValueError("No data; observed has size 0.") expected = pd.DataFrame(expected_freq(observed), index=observed.index, columns=observed.columns) # All count frequencies should be at least 5 for df, name in zip([observed, expected], ["observed", "expected"]): if (df < 5).any(axis=None): warnings.warn(f"Low count on {name} frequencies.") dof = float(expected.size - sum(expected.shape) + expected.ndim - 1) if dof == 1 and correction: # Adjust `observed` according to Yates' correction for continuity. observed = observed + 0.5 * np.sign(expected - observed) ddof = observed.size - 1 - dof n = data.shape[0] stats = [] names = [ "pearson", "cressie-read", "log-likelihood", "freeman-tukey", "mod-log-likelihood", "neyman", ] for name, lambda_ in zip(names, [1.0, 2 / 3, 0.0, -1 / 2, -1.0, -2.0]): if dof == 0: chi2, p, cramer, power = 0.0, 1.0, np.nan, np.nan else: chi2, p = power_divergence(observed, expected, ddof=ddof, axis=None, lambda_=lambda_) dof_cramer = min(expected.shape) - 1 cramer = np.sqrt(chi2 / (n * dof_cramer)) power = power_chi2(dof=dof, w=cramer, n=n, alpha=0.05) stats.append( { "test": name, "lambda": lambda_, "chi2": chi2, "dof": dof, "pval": p, "cramer": cramer, "power": power, } ) stats = pd.DataFrame(stats)[["test", "lambda", "chi2", "dof", "pval", "cramer", "power"]] return expected, observed, _postprocess_dataframe(stats)
[docs] def chi2_mcnemar(data, x, y, correction=True): """ Performs the exact and approximated versions of McNemar's test. Parameters ---------- data : :py:class:`pandas.DataFrame` The dataframe containing the ocurrences for the test. Each row must represent either a subject or a pair of subjects. x, y : string The variables names for the McNemar's test. Must be names of columns in ``data``. If each row of ``data`` represents a subject, then ``x`` and ``y`` must be columns containing dichotomous measurements in two different contexts. For instance: the presence of pain before and after a certain treatment. If each row of ``data`` represents a pair of subjects, then ``x`` and ``y`` must be columns containing dichotomous measurements for each of the subjects. For instance: a positive response to a certain drug in the control group and in the test group, supposing that each pair contains a subject in each group. The 2x2 crosstab is created using the :py:func:`pingouin.dichotomous_crosstab` function. .. warning:: Missing values are not allowed. correction : bool Whether to apply the correction for continuity (Edwards, A. 1948). Returns ------- observed : :py:class:`pandas.DataFrame` The observed contingency table of frequencies. stats : :py:class:`pandas.DataFrame` The test summary: * ``'chi2'``: The test statistic * ``'dof'``: The degree of freedom * ``'p-approx'``: The approximated p-value * ``'p-exact'``: The exact p-value Notes ----- The McNemar's test is compatible with dichotomous paired data, generally used to assert the effectiveness of a certain procedure, such as a treatment or the use of a drug. "Dichotomous" means that the values of the measurements are binary. "Paired data" means that each measurement is done twice, either on the same subject in two different moments or in two similar (paired) subjects from different groups (e.g.: control/test). In order to better understand the idea behind McNemar's test, let's illustrate it with an example. Suppose that we wanted to compare the effectiveness of two different treatments (X and Y) for athlete's foot on a certain group of `n` people. To achieve this, we measured their responses to such treatments on each foot. The observed data summary was: * Number of people with good responses to X and Y: `a` * Number of people with good response to X and bad response to Y: `b` * Number of people with bad response to X and good response to Y: `c` * Number of people with bad responses to X and Y: `d` Now consider the two groups: 1. The group of people who had good response to X (`a` + `b` subjects) 2. The group of people who had good response to Y (`a` + `c` subjects) If the treatments have the same effectiveness, we should expect the probabilities of having good responses to be the same, regardless of the treatment. Mathematically, such statement can be translated into the following equation: .. math:: \\frac{a+b}{n} = \\frac{a+c}{n} \\Rightarrow b = c Thus, this test should indicate higher statistical significances for higher distances between `b` and `c` (McNemar, Q. 1947): .. math:: \\chi^2 = \\frac{(b - c)^2}{b + c} References ---------- * Edwards, A. L. (1948). Note on the "correction for continuity" in testing the significance of the difference between correlated proportions. Psychometrika, 13(3), 185-187. * McNemar, Q. (1947). Note on the sampling error of the difference between correlated proportions or percentages. Psychometrika, 12(2), 153-157. Examples -------- >>> import pingouin as pg >>> data = pg.read_dataset('chi2_mcnemar') >>> observed, stats = pg.chi2_mcnemar(data, 'treatment_X', 'treatment_Y') >>> observed treatment_Y 0 1 treatment_X 0 20 40 1 8 12 In this case, `c` (40) seems to be a significantly greater than `b` (8). The McNemar test should be sensitive to this. >>> stats chi2 dof p-approx p-exact mcnemar 20.020833 1 0.000008 0.000003 """ # Python code initially inspired by statsmodel's mcnemar assert isinstance(data, pd.DataFrame), "data must be a pandas DataFrame." assert all( isinstance(column, (str, int)) for column in (x, y) ), "column names must be string or int." assert all(column in data.columns for column in (x, y)), "columns are not in dataframe." for column in (x, y): if data[column].isna().any(): raise ValueError("Null values are not allowed.") observed = dichotomous_crosstab(data, x, y) # Careful, the order of b and c is inverted compared to wikipedia # because the colums / rows of the crosstab is [0, 1] and not [1, 0]. c, b = observed.at[0, 1], observed.at[1, 0] n_discordants = b + c if (b, c) == (0, 0): raise ValueError( "McNemar's test does not work if the secondary " + "diagonal of the observed data summary does not " + "have values different from 0." ) chi2 = (abs(b - c) - int(correction)) ** 2 / n_discordants pexact = min(1, 2 * binom.cdf(min(b, c), n_discordants, 0.5)) stats = { "chi2": chi2, "dof": 1, "p-approx": sp_chi2.sf(chi2, 1), "p-exact": pexact, # 'p-mid': pexact - binom.pmf(b, n_discordants, 0.5) } stats = pd.DataFrame(stats, index=["mcnemar"]) return observed, _postprocess_dataframe(stats)
############################################################################### # DICHOTOMOUS CONTINGENCY TABLES ############################################################################### def _dichotomize_series(data, column): """Converts the values of a pd.DataFrame column into 0 or 1""" series = data[column] if series.dtype == bool: return series.astype(int) def convert_elem(elem): if isinstance(elem, (int, float)) and elem in (0, 1): return int(elem) if isinstance(elem, str): lower = elem.lower() if lower in ("n", "no", "absent", "false", "f", "negative"): return 0 elif lower in ("y", "yes", "present", "true", "t", "positive", "p"): return 1 raise ValueError( "Invalid value to build a 2x2 contingency " "table on column {}: {}".format(column, elem) ) return series.apply(convert_elem)
[docs] def dichotomous_crosstab(data, x, y): """ Generates a 2x2 contingency table from a :py:class:`pandas.DataFrame` that contains only dichotomous entries, which are converted to 0 or 1. Parameters ---------- data : :py:class:`pandas.DataFrame` Pandas dataframe x, y : string Column names in ``data``. Currently, Pingouin recognizes the following values as dichotomous measurements: * ``0``, ``0.0``, ``False``, ``'No'``, ``'N'``, ``'Absent'``,\ ``'False'``, ``'F'`` or ``'Negative'`` for negative cases; * ``1``, ``1.0``, ``True``, ``'Yes'``, ``'Y'``, ``'Present'``,\ ``'True'``, ``'T'``, ``'Positive'`` or ``'P'``, for positive cases; If strings are used, Pingouin will recognize them regardless of their uppercase/lowercase combinations. Returns ------- crosstab : :py:class:`pandas.DataFrame` The 2x2 crosstab. See :py:func:`pandas.crosstab` for more details. Examples -------- >>> import pandas as pd >>> import pingouin as pg >>> df = pd.DataFrame({'A': ['Yes', 'No', 'No'], 'B': [0., 1., 0.]}) >>> pg.dichotomous_crosstab(data=df, x='A', y='B') B 0 1 A 0 1 1 1 1 0 """ crosstab = pd.crosstab(_dichotomize_series(data, x), _dichotomize_series(data, y)) shape = crosstab.shape if shape != (2, 2): if shape == (2, 1): crosstab.loc[:, int(not bool(crosstab.columns[0]))] = [0, 0] elif shape == (1, 2): crosstab.loc[int(not bool(crosstab.index[0])), :] = [0, 0] else: # shape = (1, 1) or shape = (>2, >2) raise ValueError( "Both series contain only one unique value. " "Cannot build 2x2 contingency table." ) crosstab = crosstab.sort_index(axis=0).sort_index(axis=1) return crosstab