Source code for pingouin.contingency

# Date: May 2019
import warnings
import numpy as np
import pandas as pd

from scipy.stats.contingency import expected_freq
from scipy.stats import power_divergence, binom, chi2 as sp_chi2

from pingouin import power_chi2, _postprocess_dataframe


__all__ = ["chi2_independence", "chi2_mcnemar", "dichotomous_crosstab"]


###############################################################################
# CHI-SQUARED TESTS
###############################################################################



[docs]
def chi2_independence(data, x, y, correction=True):
    """
    Chi-squared independence tests between two categorical variables.

    The test is computed for different values of :math:`\\lambda`: 1, 2/3, 0,
    -1/2, -1 and -2 (Cressie and Read, 1984).

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the ocurrences for the test.
    x, y : string
        The variables names for the Chi-squared test. Must be names of columns
        in ``data``.
    correction : bool
        Whether to apply Yates' correction when the degree of freedom of the
        observed contingency table is 1 (Yates 1934).

    Returns
    -------
    expected : :py:class:`pandas.DataFrame`
        The expected contingency table of frequencies.
    observed : :py:class:`pandas.DataFrame`
        The (corrected or not) observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary, containing four columns:

        * ``'test'``: The statistic name
        * ``'lambda'``: The :math:`\\lambda` value used for the power\
                        divergence statistic
        * ``'chi2'``: The test statistic
        * ``'pval'``: The p-value of the test
        * ``'cramer'``: The Cramer's V effect size
        * ``'power'``: The statistical power of the test

    Notes
    -----
    From Wikipedia:

    *The chi-squared test is used to determine whether there is a significant
    difference between the expected frequencies and the observed frequencies
    in one or more categories.*

    As application examples, this test can be used to *i*) evaluate the
    quality of a categorical variable in a classification problem or to *ii*)
    check the similarity between two categorical variables. In the first
    example, a good categorical predictor and the class column should present
    high :math:`\\chi^2` and low p-value. In the second example, similar
    categorical variables should present low :math:`\\chi^2` and high p-value.

    This function is a wrapper around the
    :py:func:`scipy.stats.power_divergence` function.

    .. warning :: As a general guideline for the consistency of this test, the
        observed and the expected contingency tables should not have cells
        with frequencies lower than 5.

    References
    ----------
    * Cressie, N., & Read, T. R. (1984). Multinomial goodness‐of‐fit
      tests. Journal of the Royal Statistical Society: Series B
      (Methodological), 46(3), 440-464.

    * Yates, F. (1934). Contingency Tables Involving Small Numbers and the
      :math:`\\chi^2` Test. Supplement to the Journal of the Royal
      Statistical Society, 1, 217-235.

    Examples
    --------
    Let's see if gender is a good categorical predictor for the presence of
    heart disease.

    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_independence')
    >>> data['sex'].value_counts(ascending=True)
    sex
    0     96
    1    207
    Name: count, dtype: int64

    If gender is not a good predictor for heart disease, we should expect the
    same 96:207 ratio across the target classes.

    >>> expected, observed, stats = pg.chi2_independence(data, x='sex',
    ...                                                  y='target')
    >>> expected
    target          0           1
    sex
    0       43.722772   52.277228
    1       94.277228  112.722772

    Let's see what the data tells us.

    >>> observed
    target      0     1
    sex
    0        24.5  71.5
    1       113.5  93.5

    The proportion is lower on the class 0 and higher on the class 1. The
    tests should be sensitive to this difference.

    >>> stats.round(3)
                     test  lambda    chi2  dof  pval  cramer  power
    0             pearson   1.000  22.717  1.0   0.0   0.274  0.997
    1        cressie-read   0.667  22.931  1.0   0.0   0.275  0.998
    2      log-likelihood   0.000  23.557  1.0   0.0   0.279  0.998
    3       freeman-tukey  -0.500  24.220  1.0   0.0   0.283  0.998
    4  mod-log-likelihood  -1.000  25.071  1.0   0.0   0.288  0.999
    5              neyman  -2.000  27.458  1.0   0.0   0.301  0.999

    Very low p-values indeed. The gender qualifies as a good predictor for the
    presence of heart disease on this dataset.
    """
    # Python code inspired by SciPy's chi2_contingency
    assert isinstance(data, pd.DataFrame), "data must be a pandas DataFrame."
    assert isinstance(x, (str, int)), "x must be a string or int."
    assert isinstance(y, (str, int)), "y must be a string or int."
    assert all(col in data.columns for col in (x, y)), "columns are not in dataframe."
    assert isinstance(correction, bool), "correction must be a boolean."

    observed = pd.crosstab(data[x], data[y])

    if observed.size == 0:
        raise ValueError("No data; observed has size 0.")

    expected = pd.DataFrame(expected_freq(observed), index=observed.index, columns=observed.columns)

    # All count frequencies should be at least 5
    for df, name in zip([observed, expected], ["observed", "expected"]):
        if (df < 5).any(axis=None):
            warnings.warn(f"Low count on {name} frequencies.")

    dof = float(expected.size - sum(expected.shape) + expected.ndim - 1)

    if dof == 1 and correction:
        # Adjust `observed` according to Yates' correction for continuity.
        observed = observed + 0.5 * np.sign(expected - observed)

    ddof = observed.size - 1 - dof
    n = data.shape[0]
    stats = []
    names = [
        "pearson",
        "cressie-read",
        "log-likelihood",
        "freeman-tukey",
        "mod-log-likelihood",
        "neyman",
    ]

    for name, lambda_ in zip(names, [1.0, 2 / 3, 0.0, -1 / 2, -1.0, -2.0]):
        if dof == 0:
            chi2, p, cramer, power = 0.0, 1.0, np.nan, np.nan
        else:
            chi2, p = power_divergence(observed, expected, ddof=ddof, axis=None, lambda_=lambda_)
            dof_cramer = min(expected.shape) - 1
            cramer = np.sqrt(chi2 / (n * dof_cramer))
            power = power_chi2(dof=dof, w=cramer, n=n, alpha=0.05)

        stats.append(
            {
                "test": name,
                "lambda": lambda_,
                "chi2": chi2,
                "dof": dof,
                "pval": p,
                "cramer": cramer,
                "power": power,
            }
        )

    stats = pd.DataFrame(stats)[["test", "lambda", "chi2", "dof", "pval", "cramer", "power"]]
    return expected, observed, _postprocess_dataframe(stats)




[docs]
def chi2_mcnemar(data, x, y, correction=True):
    """
    Performs the exact and approximated versions of McNemar's test.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        The dataframe containing the ocurrences for the test. Each row must
        represent either a subject or a pair of subjects.
    x, y : string
        The variables names for the McNemar's test. Must be names of columns
        in ``data``.

        If each row of ``data`` represents a subject, then ``x`` and ``y`` must
        be columns containing dichotomous measurements in two different
        contexts. For instance: the presence of pain before and after a certain
        treatment.

        If each row of ``data`` represents a pair of subjects, then ``x`` and
        ``y`` must be columns containing dichotomous measurements for each of
        the subjects. For instance: a positive response to a certain drug in
        the control group and in the test group, supposing that each pair
        contains a subject in each group.

        The 2x2 crosstab is created using the
        :py:func:`pingouin.dichotomous_crosstab` function.

        .. warning:: Missing values are not allowed.

    correction : bool
        Whether to apply the correction for continuity (Edwards, A. 1948).

    Returns
    -------
    observed : :py:class:`pandas.DataFrame`
        The observed contingency table of frequencies.
    stats : :py:class:`pandas.DataFrame`
        The test summary:

        * ``'chi2'``: The test statistic
        * ``'dof'``: The degree of freedom
        * ``'p-approx'``: The approximated p-value
        * ``'p-exact'``: The exact p-value

    Notes
    -----
    The McNemar's test is compatible with dichotomous paired data, generally
    used to assert the effectiveness of a certain procedure, such as a
    treatment or the use of a drug. "Dichotomous" means that the values of the
    measurements are binary. "Paired data" means that each measurement is done
    twice, either on the same subject in two different moments or in two
    similar (paired) subjects from different groups (e.g.: control/test). In
    order to better understand the idea behind McNemar's test, let's illustrate
    it with an example.

    Suppose that we wanted to compare the effectiveness of two different
    treatments (X and Y) for athlete's foot on a certain group of `n` people.
    To achieve this, we measured their responses to such treatments on each
    foot. The observed data summary was:

    * Number of people with good responses to X and Y: `a`
    * Number of people with good response to X and bad response to Y: `b`
    * Number of people with bad response to X and good response to Y: `c`
    * Number of people with bad responses to X and Y: `d`

    Now consider the two groups:

    1. The group of people who had good response to X (`a` + `b` subjects)
    2. The group of people who had good response to Y (`a` + `c` subjects)

    If the treatments have the same effectiveness, we should expect the
    probabilities of having good responses to be the same, regardless of the
    treatment. Mathematically, such statement can be translated into the
    following equation:

    .. math::

        \\frac{a+b}{n} = \\frac{a+c}{n} \\Rightarrow b = c

    Thus, this test should indicate higher statistical significances for higher
    distances between `b` and `c` (McNemar, Q. 1947):

    .. math::

        \\chi^2 = \\frac{(b - c)^2}{b + c}

    References
    ----------
    * Edwards, A. L. (1948). Note on the "correction for continuity" in
      testing the significance of the difference between correlated
      proportions. Psychometrika, 13(3), 185-187.

    * McNemar, Q. (1947). Note on the sampling error of the difference
      between correlated proportions or percentages. Psychometrika, 12(2),
      153-157.

    Examples
    --------
    >>> import pingouin as pg
    >>> data = pg.read_dataset('chi2_mcnemar')
    >>> observed, stats = pg.chi2_mcnemar(data, 'treatment_X', 'treatment_Y')
    >>> observed
    treatment_Y   0   1
    treatment_X
    0            20  40
    1             8  12

    In this case, `c` (40) seems to be a significantly greater than `b` (8).
    The McNemar test should be sensitive to this.

    >>> stats
                chi2  dof  p-approx   p-exact
    mcnemar  20.020833    1  0.000008  0.000003
    """
    # Python code initially inspired by statsmodel's mcnemar
    assert isinstance(data, pd.DataFrame), "data must be a pandas DataFrame."
    assert all(
        isinstance(column, (str, int)) for column in (x, y)
    ), "column names must be string or int."
    assert all(column in data.columns for column in (x, y)), "columns are not in dataframe."

    for column in (x, y):
        if data[column].isna().any():
            raise ValueError("Null values are not allowed.")

    observed = dichotomous_crosstab(data, x, y)
    # Careful, the order of b and c is inverted compared to wikipedia
    # because the colums / rows of the crosstab is [0, 1] and not [1, 0].
    c, b = observed.at[0, 1], observed.at[1, 0]
    n_discordants = b + c

    if (b, c) == (0, 0):
        raise ValueError(
            "McNemar's test does not work if the secondary "
            + "diagonal of the observed data summary does not "
            + "have values different from 0."
        )

    chi2 = (abs(b - c) - int(correction)) ** 2 / n_discordants
    pexact = min(1, 2 * binom.cdf(min(b, c), n_discordants, 0.5))
    stats = {
        "chi2": chi2,
        "dof": 1,
        "p-approx": sp_chi2.sf(chi2, 1),
        "p-exact": pexact,
        # 'p-mid': pexact - binom.pmf(b, n_discordants, 0.5)
    }

    stats = pd.DataFrame(stats, index=["mcnemar"])

    return observed, _postprocess_dataframe(stats)



###############################################################################
# DICHOTOMOUS CONTINGENCY TABLES
###############################################################################


def _dichotomize_series(data, column):
    """Converts the values of a pd.DataFrame column into 0 or 1"""
    series = data[column]
    if series.dtype == bool:
        return series.astype(int)

    def convert_elem(elem):
        if isinstance(elem, (int, float)) and elem in (0, 1):
            return int(elem)
        if isinstance(elem, str):
            lower = elem.lower()
            if lower in ("n", "no", "absent", "false", "f", "negative"):
                return 0
            elif lower in ("y", "yes", "present", "true", "t", "positive", "p"):
                return 1
        raise ValueError(
            "Invalid value to build a 2x2 contingency "
            "table on column {}: {}".format(column, elem)
        )

    return series.apply(convert_elem)



[docs]
def dichotomous_crosstab(data, x, y):
    """
    Generates a 2x2 contingency table from a :py:class:`pandas.DataFrame` that
    contains only dichotomous entries, which are converted to 0 or 1.

    Parameters
    ----------
    data : :py:class:`pandas.DataFrame`
        Pandas dataframe
    x, y : string
        Column names in ``data``.

        Currently, Pingouin recognizes the following values as dichotomous
        measurements:

        * ``0``, ``0.0``, ``False``, ``'No'``, ``'N'``, ``'Absent'``,\
        ``'False'``, ``'F'`` or ``'Negative'`` for negative cases;

        * ``1``, ``1.0``, ``True``, ``'Yes'``, ``'Y'``, ``'Present'``,\
        ``'True'``, ``'T'``, ``'Positive'`` or ``'P'``,  for positive cases;

        If strings are used, Pingouin will recognize them regardless of their
        uppercase/lowercase combinations.

    Returns
    -------
    crosstab : :py:class:`pandas.DataFrame`
        The 2x2 crosstab. See :py:func:`pandas.crosstab` for more details.

    Examples
    --------
    >>> import pandas as pd
    >>> import pingouin as pg
    >>> df = pd.DataFrame({'A': ['Yes', 'No', 'No'], 'B': [0., 1., 0.]})
    >>> pg.dichotomous_crosstab(data=df, x='A', y='B')
    B  0  1
    A
    0  1  1
    1  1  0
    """
    crosstab = pd.crosstab(_dichotomize_series(data, x), _dichotomize_series(data, y))
    shape = crosstab.shape
    if shape != (2, 2):
        if shape == (2, 1):
            crosstab.loc[:, int(not bool(crosstab.columns[0]))] = [0, 0]
        elif shape == (1, 2):
            crosstab.loc[int(not bool(crosstab.index[0])), :] = [0, 0]
        else:  # shape = (1, 1) or shape = (>2, >2)
            raise ValueError(
                "Both series contain only one unique value. " "Cannot build 2x2 contingency table."
            )
    crosstab = crosstab.sort_index(axis=0).sort_index(axis=1)
    return crosstab