"""Plotting functions.
Authors
- Raphael Vallat <raphaelvallat9@gmail.com>
- Nicolas Legrand <legrand@cyceron.fr>
"""
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib.transforms as transforms
# Set default Seaborn preferences (disabled Pingouin >= 0.3.4)
# See https://github.com/raphaelvallat/pingouin/issues/85
# sns.set(style='ticks', context='notebook')
__all__ = [
"plot_blandaltman",
"qqplot",
"plot_paired",
"plot_shift",
"plot_rm_corr",
"plot_circmean",
]
[docs]
def plot_blandaltman(
x, y, agreement=1.96, xaxis="mean", confidence=0.95, annotate=True, ax=None, **kwargs
):
"""
Generate a Bland-Altman plot to compare two sets of measurements.
Parameters
----------
x, y : pd.Series, np.array, or list
First and second measurements.
agreement : float
Multiple of the standard deviation to plot agreement limits.
The defaults is 1.96, which corresponds to 95% confidence interval if
the differences are normally distributed.
xaxis : str
Define which measurements should be used as the reference (x-axis).
Default is to use the average of x and y ("mean"). Accepted values are
"mean", "x" or "y".
confidence : float
If not None, plot the specified percentage confidence interval of
the mean and limits of agreement. The CIs of the mean difference and
agreement limits describe a possible error in the
estimate due to a sampling error. The greater the sample size,
the narrower the CIs will be.
annotate : bool
If True (default), annotate the values for the mean difference
and agreement limits.
ax : matplotlib axes
Axis on which to draw the plot.
**kwargs : optional
Optional argument(s) passed to :py:func:`matplotlib.pyplot.scatter`.
Returns
-------
ax : Matplotlib Axes instance
Returns the Axes object with the plot for further tweaking.
Notes
-----
Bland-Altman plots [1]_ are extensively used to evaluate the agreement
among two different instruments or two measurements techniques.
They allow identification of any systematic difference between the
measurements (i.e., fixed bias) or possible outliers.
The mean difference (= x - y) is the estimated bias, and the SD of the
differences measures the random fluctuations around this mean.
If the mean value of the difference differs significantly from 0 on the
basis of a 1-sample t-test, this indicates the presence of fixed bias.
If there is a consistent bias, it can be adjusted for by subtracting the
mean difference from the new method.
It is common to compute 95% limits of agreement for each comparison
(average difference ± 1.96 standard deviation of the difference), which
tells us how far apart measurements by 2 methods were more likely to be
for most individuals. If the differences within mean ± 1.96 SD are not
clinically important, the two methods may be used interchangeably.
The 95% limits of agreement can be unreliable estimates of the population
parameters especially for small sample sizes so, when comparing methods
or assessing repeatability, it is important to calculate confidence
intervals for the 95% limits of agreement.
The code is an adaptation of the
`PyCompare <https://github.com/jaketmp/pyCompare>`_ package. The present
implementation is a simplified version; please refer to the original
package for more advanced functionalities.
References
----------
.. [1] Bland, J. M., & Altman, D. (1986). Statistical methods for assessing
agreement between two methods of clinical measurement. The lancet,
327(8476), 307-310.
.. [2] Giavarina, D. (2015). Understanding bland altman analysis.
Biochemia medica, 25(2), 141-151.
Examples
--------
Bland-Altman plot (example data from [2]_)
.. plot::
>>> import pingouin as pg
>>> df = pg.read_dataset("blandaltman")
>>> ax = pg.plot_blandaltman(df['A'], df['B'])
>>> plt.tight_layout()
"""
# Safety check
assert xaxis in ["mean", "x", "y"]
# Get names before converting to NumPy array
xname = x.name if isinstance(x, pd.Series) else "x"
yname = y.name if isinstance(y, pd.Series) else "y"
x = np.asarray(x)
y = np.asarray(y)
assert x.ndim == 1 and y.ndim == 1
assert x.size == y.size
assert not np.isnan(x).any(), "Missing values in x or y are not supported."
assert not np.isnan(y).any(), "Missing values in x or y are not supported."
# Update default kwargs with specified inputs
_scatter_kwargs = {"color": "tab:blue", "alpha": 0.8}
_scatter_kwargs.update(kwargs)
# Calculate mean, STD and SEM of x - y
n = x.size
dof = n - 1
diff = x - y
mean_diff = np.mean(diff)
std_diff = np.std(diff, ddof=1)
mean_diff_se = np.sqrt(std_diff**2 / n)
# Limits of agreements
high = mean_diff + agreement * std_diff
low = mean_diff - agreement * std_diff
high_low_se = np.sqrt(3 * std_diff**2 / n)
# Define x-axis
if xaxis == "mean":
xval = np.vstack((x, y)).mean(0)
xlabel = f"Mean of {xname} and {yname}"
elif xaxis == "x":
xval = x
xlabel = xname
else:
xval = y
xlabel = yname
# Start the plot
if ax is None:
ax = plt.gca()
# Plot the mean diff, limits of agreement and scatter
ax.scatter(xval, diff, **_scatter_kwargs)
ax.axhline(mean_diff, color="k", linestyle="-", lw=2)
ax.axhline(high, color="k", linestyle=":", lw=1.5)
ax.axhline(low, color="k", linestyle=":", lw=1.5)
# Annotate values
if annotate:
loa_range = high - low
offset = (loa_range / 100.0) * 1.5
trans = transforms.blended_transform_factory(ax.transAxes, ax.transData)
xloc = 0.98
ax.text(xloc, mean_diff + offset, "Mean", ha="right", va="bottom", transform=trans)
ax.text(xloc, mean_diff - offset, "%.2f" % mean_diff, ha="right", va="top", transform=trans)
ax.text(
xloc, high + offset, "+%.2f SD" % agreement, ha="right", va="bottom", transform=trans
)
ax.text(xloc, high - offset, "%.2f" % high, ha="right", va="top", transform=trans)
ax.text(xloc, low - offset, "-%.2f SD" % agreement, ha="right", va="top", transform=trans)
ax.text(xloc, low + offset, "%.2f" % low, ha="right", va="bottom", transform=trans)
# Add 95% confidence intervals for mean bias and limits of agreement
if confidence is not None:
assert 0 < confidence < 1
ci = dict()
ci["mean"] = stats.t.interval(confidence, dof, loc=mean_diff, scale=mean_diff_se)
ci["high"] = stats.t.interval(confidence, dof, loc=high, scale=high_low_se)
ci["low"] = stats.t.interval(confidence, dof, loc=low, scale=high_low_se)
ax.axhspan(ci["mean"][0], ci["mean"][1], facecolor="tab:grey", alpha=0.2)
ax.axhspan(ci["high"][0], ci["high"][1], facecolor=_scatter_kwargs["color"], alpha=0.2)
ax.axhspan(ci["low"][0], ci["low"][1], facecolor=_scatter_kwargs["color"], alpha=0.2)
# Labels
ax.set_ylabel(f"{xname} - {yname}")
ax.set_xlabel(xlabel)
return ax
def _ppoints(n, a=0.5):
"""
Ordinates For Probability Plotting.
Numpy analogue or `R`'s `ppoints` function.
Parameters
----------
n : int
Number of points generated
a : float
Offset fraction (typically between 0 and 1)
Returns
-------
p : array
Sequence of probabilities at which to evaluate the inverse
distribution.
"""
a = 3 / 8 if n <= 10 else 0.5
return (np.arange(n) + 1 - a) / (n + 1 - 2 * a)
[docs]
def qqplot(x, dist="norm", sparams=(), confidence=0.95, square=True, ax=None, **kwargs):
"""Quantile-Quantile plot.
Parameters
----------
x : array_like
Sample data.
dist : str or stats.distributions instance, optional
Distribution or distribution function name. The default is `'norm'`
for a normal probability plot.
sparams : tuple, optional
Distribution-specific shape parameters (shape parameters, location,
and scale). See :py:func:`scipy.stats.probplot` for more details.
confidence : float
Confidence level (.95 = 95%) for point-wise confidence envelope.
Can be disabled by passing False.
square: bool
If True (default), ensure equal aspect ratio between X and Y axes.
ax : matplotlib axes
Axis on which to draw the plot
**kwargs : optional
Optional argument(s) passed to :py:func:`matplotlib.pyplot.scatter`.
Returns
-------
ax : Matplotlib Axes instance
Returns the Axes object with the plot for further tweaking.
Raises
------
ValueError
If ``sparams`` does not contain the required parameters for ``dist``.
(e.g. :py:class:`scipy.stats.t` has a mandatory degrees of
freedom parameter *df*.)
Notes
-----
This function returns a scatter plot of the quantile of the sample data
``x`` against the theoretical quantiles of the distribution given in
``dist`` (default = *'norm'*).
The points plotted in a Q–Q plot are always non-decreasing when viewed
from left to right. If the two distributions being compared are identical,
the Q–Q plot follows the 45° line y = x. If the two distributions agree
after linearly transforming the values in one of the distributions,
then the Q–Q plot follows some line, but not necessarily the line y = x.
If the general trend of the Q–Q plot is flatter than the line y = x,
the distribution plotted on the horizontal axis is more dispersed than
the distribution plotted on the vertical axis. Conversely, if the general
trend of the Q–Q plot is steeper than the line y = x, the distribution
plotted on the vertical axis is more dispersed than the distribution
plotted on the horizontal axis. Q–Q plots are often arced, or "S" shaped,
indicating that one of the distributions is more skewed than the other,
or that one of the distributions has heavier tails than the other.
In addition, the function also plots a best-fit line (linear regression)
for the data and annotates the plot with the coefficient of
determination :math:`R^2`. Note that the intercept and slope of the
linear regression between the quantiles gives a measure of the relative
location and relative scale of the samples.
.. warning:: Be extra careful when using fancier distributions with several
parameters. Always double-check your results with another
software or package.
References
----------
* https://github.com/cran/car/blob/master/R/qqPlot.R
* Fox, J. (2008), Applied Regression Analysis and Generalized Linear
Models, 2nd Ed., Sage Publications, Inc.
Examples
--------
Q-Q plot using a normal theoretical distribution:
.. plot::
>>> import numpy as np
>>> import pingouin as pg
>>> np.random.seed(123)
>>> x = np.random.normal(size=50)
>>> ax = pg.qqplot(x, dist='norm')
Two Q-Q plots using two separate axes:
.. plot::
>>> import numpy as np
>>> import pingouin as pg
>>> import matplotlib.pyplot as plt
>>> np.random.seed(123)
>>> x = np.random.normal(size=50)
>>> x_exp = np.random.exponential(size=50)
>>> fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4))
>>> ax1 = pg.qqplot(x, dist='norm', ax=ax1, confidence=False)
>>> ax2 = pg.qqplot(x_exp, dist='expon', ax=ax2)
Using custom location / scale parameters as well as another Seaborn style
.. plot::
>>> import numpy as np
>>> import seaborn as sns
>>> import pingouin as pg
>>> import matplotlib.pyplot as plt
>>> np.random.seed(123)
>>> x = np.random.normal(size=50)
>>> mean, std = 0, 0.8
>>> sns.set_style('darkgrid')
>>> ax = pg.qqplot(x, dist='norm', sparams=(mean, std))
"""
# Update default kwargs with specified inputs
_scatter_kwargs = {"marker": "o", "color": "blue"}
_scatter_kwargs.update(kwargs)
if isinstance(dist, str):
dist = getattr(stats, dist)
x = np.asarray(x)
x = x[~np.isnan(x)] # NaN are automatically removed
# Check sparams: if single parameter, tuple becomes int
if not isinstance(sparams, (tuple, list)):
sparams = (sparams,)
# For fancier distributions, check that the required parameters are passed
if len(sparams) < dist.numargs:
raise ValueError(
"The following sparams are required for this "
"distribution: %s. See scipy.stats.%s for details." % (dist.shapes, dist.name)
)
# Extract quantiles and regression
quantiles = stats.probplot(x, sparams=sparams, dist=dist, fit=False)
theor, observed = quantiles[0], quantiles[1]
fit_params = dist.fit(x)
loc = fit_params[-2]
scale = fit_params[-1]
shape = fit_params[:-2] if len(fit_params) > 2 else None
# Observed values to observed quantiles
if loc != 0 and scale != 1:
observed = (np.sort(observed) - fit_params[-2]) / fit_params[-1]
# Linear regression
slope, intercept, r, _, _ = stats.linregress(theor, observed)
# Start the plot
if ax is None:
ax = plt.gca()
ax.scatter(theor, observed, **_scatter_kwargs)
ax.set_xlabel("Theoretical quantiles")
ax.set_ylabel("Ordered quantiles")
# Add diagonal line
end_pts = [ax.get_xlim(), ax.get_ylim()]
end_pts[0] = min(end_pts[0])
end_pts[1] = max(end_pts[1])
ax.plot(end_pts, end_pts, color="slategrey", lw=1.5)
ax.set_xlim(end_pts)
ax.set_ylim(end_pts)
# Add regression line and annotate R2
fit_val = slope * theor + intercept
ax.plot(theor, fit_val, "r-", lw=2)
posx = end_pts[0] + 0.60 * (end_pts[1] - end_pts[0])
posy = end_pts[0] + 0.10 * (end_pts[1] - end_pts[0])
ax.text(posx, posy, "$R^2=%.3f$" % r**2)
if confidence is not False:
# Confidence envelope
n = x.size
P = _ppoints(n)
crit = stats.norm.ppf(1 - (1 - confidence) / 2)
pdf = dist.pdf(theor) if shape is None else dist.pdf(theor, *shape)
se = (slope / pdf) * np.sqrt(P * (1 - P) / n)
upper = fit_val + crit * se
lower = fit_val - crit * se
ax.plot(theor, upper, "r--", lw=1.25)
ax.plot(theor, lower, "r--", lw=1.25)
# Make square
if square:
ax.set_aspect("equal")
return ax
[docs]
def plot_paired(
data=None,
dv=None,
within=None,
subject=None,
order=None,
boxplot=True,
boxplot_in_front=False,
orient="v",
ax=None,
colors=["green", "grey", "indianred"],
pointplot_kwargs={"scale": 0.6, "marker": "."},
boxplot_kwargs={"color": "lightslategrey", "width": 0.2},
):
"""
Paired plot.
Parameters
----------
data : :py:class:`pandas.DataFrame`
Long-format dataFrame.
dv : string
Name of column containing the dependent variable.
within : string
Name of column containing the within-subject factor.
subject : string
Name of column containing the subject identifier.
order : list of str
List of values in ``within`` that define the order of elements on the
x-axis of the plot. If None, uses alphabetical order.
boxplot : boolean
If True, add a boxplot to the paired lines using the
:py:func:`seaborn.boxplot` function.
boxplot_in_front : boolean
If True, the boxplot is plotted on the foreground (i.e. above the
individual lines) and with a slight transparency. This makes the
overall plot more readable when plotting a large numbers of subjects.
.. versionadded:: 0.3.8
orient : string
Plot the boxplots vertically and the subjects on the x-axis if
``orient='v'`` (default). Set to ``orient='h'`` to rotate the plot by
by 90 degrees.
.. versionadded:: 0.3.9
ax : matplotlib axes
Axis on which to draw the plot.
colors : list of str
Line colors names. Default is green when value increases from A to B,
indianred when value decreases from A to B and grey when the value is
the same in both measurements.
pointplot_kwargs : dict
Dictionnary of optional arguments that are passed to the
:py:func:`seaborn.pointplot` function.
boxplot_kwargs : dict
Dictionnary of optional arguments that are passed to the
:py:func:`seaborn.boxplot` function.
Returns
-------
ax : Matplotlib Axes instance
Returns the Axes object with the plot for further tweaking.
Notes
-----
Data must be a long-format pandas DataFrame. Missing values are automatically removed using a
strict listwise approach (= complete-case analysis).
Examples
--------
Default paired plot:
.. plot::
>>> import pingouin as pg
>>> df = pg.read_dataset('mixed_anova').query("Time != 'January'")
>>> df = df.query("Group == 'Meditation' and Subject > 40")
>>> ax = pg.plot_paired(data=df, dv='Scores', within='Time', subject='Subject')
Paired plot on an existing axis (no boxplot and uniform color):
.. plot::
>>> import pingouin as pg
>>> import matplotlib.pyplot as plt
>>> df = pg.read_dataset('mixed_anova').query("Time != 'January'")
>>> df = df.query("Group == 'Meditation' and Subject > 40")
>>> fig, ax1 = plt.subplots(1, 1, figsize=(5, 4))
>>> pg.plot_paired(data=df, dv='Scores', within='Time',
... subject='Subject', ax=ax1, boxplot=False,
... colors=['grey', 'grey', 'grey']) # doctest: +SKIP
Horizontal paired plot with three unique within-levels:
.. plot::
>>> import pingouin as pg
>>> import matplotlib.pyplot as plt
>>> df = pg.read_dataset('mixed_anova').query("Group == 'Meditation'")
>>> # df = df.query("Group == 'Meditation' and Subject > 40")
>>> pg.plot_paired(data=df, dv='Scores', within='Time',
... subject='Subject', orient='h') # doctest: +SKIP
With the boxplot on the foreground:
.. plot::
>>> import pingouin as pg
>>> df = pg.read_dataset('mixed_anova').query("Time != 'January'")
>>> df = df.query("Group == 'Control'")
>>> ax = pg.plot_paired(data=df, dv='Scores', within='Time',
... subject='Subject', boxplot_in_front=True)
"""
from pingouin.utils import _check_dataframe
# Update default kwargs with specified inputs
_pointplot_kwargs = {"scale": 0.6, "marker": "."}
_pointplot_kwargs.update(pointplot_kwargs)
_boxplot_kwargs = {"color": "lightslategrey", "width": 0.2}
_boxplot_kwargs.update(boxplot_kwargs)
# Extract pointplot alpha, if set
pp_alpha = _pointplot_kwargs.pop("alpha", 1.0)
# Calculate size of the plot elements by scale as in Seaborn pointplot
scale = _pointplot_kwargs.pop("scale")
lw = plt.rcParams["lines.linewidth"] * 1.8 * scale # get the linewidth
mew = lw * 0.75 # get the markeredgewidth
markersize = np.pi * np.square(lw) * 2 # get the markersize
# Set boxplot in front of Line2D plot (zorder=2 for both) and add alpha
if boxplot_in_front:
_boxplot_kwargs.update(
{
"boxprops": {"zorder": 3}, # Boxplot on top
"whiskerprops": {"zorder": 3},
"zorder": 3,
}
)
else:
_boxplot_kwargs.update(
{
"boxprops": {"zorder": 1}, # Boxplot behind
"whiskerprops": {"zorder": 1},
"zorder": 1,
}
)
# Validate args
data = _check_dataframe(data=data, dv=dv, within=within, subject=subject, effects="within")
# Pivot and melt the table. This has several effects:
# 1) Force missing values to be explicit (a NaN cell is created)
# 2) Automatic collapsing to the mean if multiple within factors are present
# 3) If using dropna, remove rows with missing values (listwise deletion).
# The latter is the same behavior as JASP (= strict complete-case analysis).
data_piv = data.pivot_table(index=subject, columns=within, values=dv, observed=True)
data_piv = data_piv.dropna()
data = data_piv.melt(ignore_index=False, value_name=dv).reset_index()
# Extract within-subject level (alphabetical order)
x_cat = np.unique(data[within])
if order is None:
order = x_cat
else:
assert len(order) == len(
x_cat
), "Order must have the same number of elements as the number of levels in `within`."
# Substitue within by integer order of the ordered columns to allow for
# changing the order of numeric withins.
data["wthn"] = data[within].replace({_ordr: i for i, _ordr in enumerate(order)})
order_num = range(len(order)) # Make numeric order
# Start the plot
if ax is None:
ax = plt.gca()
# Set x and y depending on orientation using the num. replacement within
_x = "wthn" if orient == "v" else dv
_y = dv if orient == "v" else "wthn"
for cat in range(len(x_cat) - 1):
_order = (order_num[cat], order_num[cat + 1])
# Extract data of the current subject-combination
data_now = data.loc[data["wthn"].isin(_order), [dv, "wthn", subject]]
# Select colors for all lines between the current subjects
y1 = data_now.loc[data_now["wthn"] == _order[0], dv].to_numpy()
y2 = data_now.loc[data_now["wthn"] == _order[1], dv].to_numpy()
# Line and scatter colors depending on subject dv trend
_colors = np.where(y1 < y2, colors[0], np.where(y1 > y2, colors[2], colors[1]))
# Line and scatter colors as hue-indexed dictionary
_colors = {subj: clr for subj, clr in zip(data_now[subject].unique(), _colors)}
# Plot individual lines using Seaborn
sns.lineplot(
data=data_now,
x=_x,
y=_y,
hue=subject,
palette=_colors,
ls="-",
lw=lw,
legend=False,
ax=ax,
)
# Plot individual markers using Seaborn
sns.scatterplot(
data=data_now,
x=_x,
y=_y,
hue=subject,
palette=_colors,
edgecolor="face",
lw=mew,
sizes=[markersize] * data_now.shape[0],
legend=False,
ax=ax,
**_pointplot_kwargs,
)
# Set zorder and alpha of pointplot markers and lines
_ = plt.setp(ax.collections, alpha=pp_alpha, zorder=2) # Set marker alpha
_ = plt.setp(ax.lines, alpha=pp_alpha, zorder=2) # Set line alpha
if boxplot:
# Set boxplot x and y depending on orientation
_xbp = within if orient == "v" else dv
_ybp = dv if orient == "v" else within
sns.boxplot(data=data, x=_xbp, y=_ybp, order=order, ax=ax, orient=orient, **_boxplot_kwargs)
# Set alpha to patch of boxplot but not to whiskers
for patch in ax.artists:
r, g, b, a = patch.get_facecolor()
patch.set_facecolor((r, g, b, 0.75))
else:
# If no boxplot, axis needs manual styling as in Seaborn pointplot
if orient == "v":
xlabel, ylabel = within, dv
ax.set_xticks(np.arange(len(x_cat)))
ax.set_xticklabels(order)
ax.xaxis.grid(False)
ax.set_xlim(-0.5, len(x_cat) - 0.5, auto=None)
else:
xlabel, ylabel = dv, within
ax.set_yticks(np.arange(len(x_cat)))
ax.set_yticklabels(order)
ax.yaxis.grid(False)
ax.set_ylim(-0.5, len(x_cat) - 0.5, auto=None)
ax.invert_yaxis()
ax.set_xlabel(xlabel)
ax.set_ylabel(ylabel)
return ax
[docs]
def plot_shift(
x,
y,
paired=False,
n_boot=1000,
percentiles=np.arange(10, 100, 10),
confidence=0.95,
seed=None,
show_median=True,
violin=True,
):
"""Shift plot.
Parameters
----------
x, y : array_like
First and second set of observations.
paired : bool
Specify whether ``x`` and ``y`` are related (i.e. repeated measures) or independent.
.. versionadded:: 0.3.0
n_boot : int
Number of bootstrap iterations. The higher, the better, the slower.
percentiles: array_like
Sequence of percentiles to compute, which must be between 0 and 100 inclusive.
Default set to [10, 20, 30, 40, 50, 60, 70, 80, 90].
confidence : float
Confidence level (0.95 = 95%) for the confidence intervals.
seed : int or None
Random seed for generating bootstrap samples, can be integer or None for no seed (default).
show_median: boolean
If True (default), show the median with black lines.
violin: boolean
If True (default), plot the density of X and Y distributions. Defaut set to True.
Returns
-------
fig : matplotlib Figure instance
Matplotlib Figure. To get the individual axes, use fig.axes.
See also
--------
harrelldavis
Notes
-----
The shift plot is described in [1]_. It computes a shift function [2]_ for two (in)dependent
groups using the robust Harrell-Davis quantile estimator in conjunction with bias-corrected
bootstrap confidence intervals.
References
----------
.. [1] Rousselet, G. A., Pernet, C. R. and Wilcox, R. R. (2017). Beyond
differences in means: robust graphical methods to compare two groups
in neuroscience. Eur J Neurosci, 46: 1738-1748.
doi:10.1111/ejn.13610
.. [2] https://garstats.wordpress.com/2016/07/12/shift-function/
Examples
--------
Default shift plot
.. plot::
>>> import numpy as np
>>> import pingouin as pg
>>> np.random.seed(42)
>>> x = np.random.normal(5.5, 2, 50)
>>> y = np.random.normal(6, 1.5, 50)
>>> fig = pg.plot_shift(x, y)
With different options, and custom axes labels
.. plot::
>>> import pingouin as pg
>>> import matplotlib.pyplot as plt
>>> data = pg.read_dataset("pairwise_corr")
>>> fig = pg.plot_shift(data["Neuroticism"], data["Conscientiousness"], paired=True,
... n_boot=2000, percentiles=[25, 50, 75], show_median=False, seed=456,
... violin=False)
>>> fig.axes[0].set_xlabel("Groups")
>>> fig.axes[0].set_ylabel("Values", size=15)
>>> fig.axes[0].set_title("Comparing Neuroticism and Conscientiousness", size=15)
>>> fig.axes[1].set_xlabel("Neuroticism quantiles", size=12)
>>> plt.tight_layout()
"""
from pingouin.regression import _bias_corrected_ci
from pingouin.nonparametric import harrelldavis as hd
# Safety check
x = np.asarray(x)
y = np.asarray(y)
percentiles = np.asarray(percentiles) / 100 # Convert to 0 - 1 range
assert x.ndim == 1, "x must be 1D."
assert y.ndim == 1, "y must be 1D."
nx, ny = x.size, y.size
assert not np.isnan(x).any(), "Missing values are not allowed."
assert not np.isnan(y).any(), "Missing values are not allowed."
assert nx >= 10, "x must have at least 10 samples."
assert ny >= 10, "y must have at least 10 samples."
assert 0 < confidence < 1, "confidence must be between 0 and 1."
if paired:
assert nx == ny, "x and y must have the same size when paired=True."
# Robust percentile
x_per = hd(x, percentiles)
y_per = hd(y, percentiles)
delta = y_per - x_per
# Compute bootstrap distribution of differences
rng = np.random.RandomState(seed)
if paired:
bootsam = rng.choice(np.arange(nx), size=(nx, n_boot), replace=True)
bootstat = hd(y[bootsam], percentiles, axis=0) - hd(x[bootsam], percentiles, axis=0)
else:
x_list = rng.choice(x, size=(nx, n_boot), replace=True)
y_list = rng.choice(y, size=(ny, n_boot), replace=True)
bootstat = hd(y_list, percentiles, axis=0) - hd(x_list, percentiles, axis=0)
# Find upper and lower confidence interval for each quantiles
# Bias-corrected bootstrapped confidence interval
lower, median_per, upper = [], [], []
for i, d in enumerate(delta):
ci = _bias_corrected_ci(bootstat[i, :], d, alpha=(1 - confidence))
median_per.append(_bias_corrected_ci(bootstat[i, :], d, alpha=1)[0])
lower.append(ci[0])
upper.append(ci[1])
lower = np.asarray(lower)
median_per = np.asarray(median_per)
upper = np.asarray(upper)
# Create long-format dataFrame for use with Seaborn
data = pd.DataFrame({"value": np.concatenate([x, y]), "variable": ["X"] * nx + ["Y"] * ny})
#############################
# Plots X and Y distributions
#############################
fig = plt.figure(figsize=(8, 5))
ax1 = plt.subplot2grid((3, 3), (0, 0), rowspan=2, colspan=3)
# Boxplot X & Y
def adjacent_values(vals, q1, q3):
upper_adjacent_value = q3 + (q3 - q1) * 1.5
upper_adjacent_value = np.clip(upper_adjacent_value, q3, vals[-1])
lower_adjacent_value = q1 - (q3 - q1) * 1.5
lower_adjacent_value = np.clip(lower_adjacent_value, vals[0], q1)
return lower_adjacent_value, upper_adjacent_value
for dis, pos in zip([x, y], [1.2, -0.2]):
qrt1, medians, qrt3 = np.percentile(dis, [25, 50, 75])
whiskers = adjacent_values(np.sort(dis), qrt1, qrt3)
ax1.plot(medians, pos, marker="o", color="white", zorder=10)
ax1.hlines(pos, qrt1, qrt3, color="k", linestyle="-", lw=7, zorder=9)
ax1.hlines(pos, whiskers[0], whiskers[1], color="k", linestyle="-", lw=2, zorder=9)
ax1 = sns.stripplot(
data=data,
x="value",
y="variable",
orient="h",
order=["Y", "X"],
palette=["#88bedc", "#cfcfcf"],
)
if violin:
vl = plt.violinplot([y, x], showextrema=False, vert=False, widths=1)
# Upper plot
paths = vl["bodies"][0].get_paths()[0]
paths.vertices[:, 1][paths.vertices[:, 1] >= 1] = 1
paths.vertices[:, 1] = paths.vertices[:, 1] - 1.2
vl["bodies"][0].set_edgecolor("k")
vl["bodies"][0].set_facecolor("#88bedc")
vl["bodies"][0].set_alpha(0.8)
# Lower plot
paths = vl["bodies"][1].get_paths()[0]
paths.vertices[:, 1][paths.vertices[:, 1] <= 2] = 2
paths.vertices[:, 1] = paths.vertices[:, 1] - 0.8
vl["bodies"][1].set_edgecolor("k")
vl["bodies"][1].set_facecolor("#cfcfcf")
vl["bodies"][1].set_alpha(0.8)
# Rescale ylim
ax1.set_ylim(2, -1)
for i in range(len(percentiles)):
# Connection between quantiles
if upper[i] < 0:
col = "#4c72b0"
elif lower[i] > 0:
col = "#c34e52"
else:
col = "darkgray"
plt.plot([y_per[i], x_per[i]], [0.2, 0.8], marker="o", color=col, zorder=10)
# X quantiles
plt.plot([x_per[i], x_per[i]], [0.8, 1.2], "k--", zorder=9)
# Y quantiles
plt.plot([y_per[i], y_per[i]], [-0.2, 0.2], "k--", zorder=9)
if show_median:
x_med, y_med = np.median(x), np.median(y)
plt.plot([x_med, x_med], [0.8, 1.2], "k-")
plt.plot([y_med, y_med], [-0.2, 0.2], "k-")
plt.xlabel("Scores (a.u.)", size=15)
ax1.set_yticklabels(["Y", "X"], size=15)
ax1.set_ylabel("")
#######################
# Plots quantiles shift
#######################
ax2 = plt.subplot2grid((3, 3), (2, 0), rowspan=1, colspan=3)
for i, per in enumerate(x_per):
if upper[i] < 0:
col = "#4c72b0"
elif lower[i] > 0:
col = "#c34e52"
else:
col = "darkgray"
plt.plot([per, per], [upper[i], lower[i]], lw=3, color=col, zorder=10)
plt.plot(per, median_per[i], marker="o", ms=10, color=col, zorder=10)
plt.axhline(y=0, ls="--", lw=2, color="gray")
ax2.set_xlabel("X quantiles", size=15)
ax2.set_ylabel("Y - X quantiles \n differences (a.u.)", size=10)
plt.tight_layout()
return fig
[docs]
def plot_rm_corr(
data=None,
x=None,
y=None,
subject=None,
legend=False,
kwargs_facetgrid=dict(height=4, aspect=1),
kwargs_line=dict(ls="solid"),
kwargs_scatter=dict(marker="o"),
):
"""Plot a repeated measures correlation.
Parameters
----------
data : :py:class:`pandas.DataFrame`
Dataframe.
x, y : string
Name of columns in ``data`` containing the two dependent variables.
subject : string
Name of column in ``data`` containing the subject indicator.
legend : boolean
If True, add legend to plot. Legend will show all the unique values in
``subject``.
kwargs_facetgrid : dict
Optional keyword arguments passed to :py:class:`seaborn.FacetGrid`
kwargs_line : dict
Optional keyword arguments passed to :py:class:`matplotlib.pyplot.plot`
kwargs_scatter : dict
Optional keyword arguments passed to :py:class:`matplotlib.pyplot.scatter`
Returns
-------
g : :py:class:`seaborn.FacetGrid`
Seaborn FacetGrid.
See also
--------
rm_corr
Notes
-----
Repeated measures correlation [1]_ (rmcorr) is a statistical technique
for determining the common within-individual association for paired
measures assessed on two or more occasions for multiple individuals.
Results have been tested against the
`rmcorr <https://github.com/cran/rmcorr>` R package. Note that this
function requires `statsmodels
<https://www.statsmodels.org/stable/index.html>`_.
Missing values are automatically removed from the ``data``
(listwise deletion).
References
----------
.. [1] Bakdash, J.Z., Marusich, L.R., 2017. Repeated Measures Correlation.
Front. Psychol. 8, 456. https://doi.org/10.3389/fpsyg.2017.00456
Examples
--------
Default repeated mesures correlation plot
.. plot::
>>> import pingouin as pg
>>> df = pg.read_dataset('rm_corr')
>>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2', subject='Subject')
With some tweakings
.. plot::
>>> import pingouin as pg
>>> import seaborn as sns
>>> df = pg.read_dataset('rm_corr')
>>> sns.set(style='darkgrid', font_scale=1.2)
>>> g = pg.plot_rm_corr(data=df, x='pH', y='PacO2',
... subject='Subject', legend=True,
... kwargs_facetgrid=dict(height=4.5, aspect=1.5,
... palette='Spectral'))
"""
# Check that stasmodels is installed
from pingouin.utils import _is_statsmodels_installed
_is_statsmodels_installed(raise_error=True)
from statsmodels.formula.api import ols
# Safety check (duplicated from pingouin.rm_corr)
assert isinstance(data, pd.DataFrame), "Data must be a DataFrame"
assert x in data.columns, "The %s column is not in data." % x
assert y in data.columns, "The %s column is not in data." % y
assert data[x].dtype.kind in "bfiu", "%s must be numeric." % x
assert data[y].dtype.kind in "bfiu", "%s must be numeric." % y
assert subject in data.columns, "The %s column is not in data." % subject
if data[subject].nunique() < 3:
raise ValueError("rm_corr requires at least 3 unique subjects.")
# Remove missing values
data = data[[x, y, subject]].dropna(axis=0)
# Calculate rm_corr
# rmc = pg.rm_corr(data=data, x=x, y=y, subject=subject)
# Fit ANCOVA model
# https://patsy.readthedocs.io/en/latest/builtins-reference.html
# C marks the data as categorical
# Q allows to quote variable that do not meet Python variable name rule
# e.g. if variable is "weight.in.kg" or "2A"
assert x not in ["C", "Q"], "`x` must not be 'C' or 'Q'."
assert y not in ["C", "Q"], "`y` must not be 'C' or 'Q'."
assert subject not in ["C", "Q"], "`subject` must not be 'C' or 'Q'."
formula = f"Q('{y}') ~ C(Q('{subject}')) + Q('{x}')"
model = ols(formula, data=data).fit()
# Fitted values
data["pred"] = model.fittedvalues
# Define color palette
if "palette" not in kwargs_facetgrid:
kwargs_facetgrid["palette"] = sns.hls_palette(data[subject].nunique())
# Start plot
g = sns.FacetGrid(data, hue=subject, **kwargs_facetgrid)
g = g.map(sns.regplot, x, "pred", scatter=False, ci=None, truncate=True, line_kws=kwargs_line)
g = g.map(sns.scatterplot, x, y, **kwargs_scatter)
if legend:
g.add_legend()
return g
[docs]
def plot_circmean(
angles,
square=True,
ax=None,
kwargs_markers=dict(color="tab:blue", marker="o", mfc="none", ms=10),
kwargs_arrow=dict(width=0.01, head_width=0.1, head_length=0.1, fc="tab:red", ec="tab:red"),
):
"""Plot the circular mean and vector length of a set of angles
on the unit circle.
.. versionadded:: 0.3.3
Parameters
----------
angles : array or list
Angles (expressed in radians). Only 1D array are supported here.
square: bool
If True (default), ensure equal aspect ratio between X and Y axes.
ax : matplotlib axes
Axis on which to draw the plot.
kwargs_markers : dict
Optional keywords arguments that are passed to
:obj:`matplotlib.axes.Axes.plot`
to control the markers aesthetics.
kwargs_arrow : dict
Optional keywords arguments that are passed to
:obj:`matplotlib.axes.Axes.arrow`
to control the arrow aesthetics.
Returns
-------
ax : Matplotlib Axes instance
Returns the Axes object with the plot for further tweaking.
Examples
--------
Default plot
.. plot::
>>> import pingouin as pg
>>> ax = pg.plot_circmean([0.05, -0.8, 1.2, 0.8, 0.5, -0.3, 0.3, 0.7])
Changing some aesthetics parameters
.. plot::
>>> import pingouin as pg
>>> import matplotlib.pyplot as plt
>>> _, ax = plt.subplots(1, 1, figsize=(3, 3))
>>> ax = pg.plot_circmean([0.05, -0.8, 1.2, 0.8, 0.5, -0.3, 0.3, 0.7],
... kwargs_markers=dict(color='k', mfc='k'),
... kwargs_arrow=dict(ec='k', fc='k'), ax=ax)
.. plot::
>>> import pingouin as pg
>>> import seaborn as sns
>>> sns.set(font_scale=1.5, style='white')
>>> ax = pg.plot_circmean([0.8, 1.5, 3.14, 5.2, 6.1, 2.8, 2.6, 3.2],
... kwargs_markers=dict(marker="None"))
"""
from matplotlib.patches import Circle
from .circular import circ_r, circ_mean
# Sanity checks
angles = np.asarray(angles)
assert angles.ndim == 1, "angles must be a one-dimensional array."
assert angles.size > 1, "angles must have at least 2 values."
assert isinstance(kwargs_markers, dict), "kwargs_markers must be a dict."
assert isinstance(kwargs_arrow, dict), "kwargs_arrow must be a dict."
# Fill missing values in dict
if "color" not in kwargs_markers.keys():
kwargs_markers["color"] = "tab:blue"
if "marker" not in kwargs_markers.keys():
kwargs_markers["marker"] = "o"
if "mfc" not in kwargs_markers.keys():
kwargs_markers["mfc"] = "none"
if "ms" not in kwargs_markers.keys():
kwargs_markers["ms"] = 10
if "width" not in kwargs_arrow.keys():
kwargs_arrow["width"] = 0.01
if "head_width" not in kwargs_arrow.keys():
kwargs_arrow["head_width"] = 0.1
if "head_length" not in kwargs_arrow.keys():
kwargs_arrow["head_length"] = 0.1
if "fc" not in kwargs_arrow.keys():
kwargs_arrow["fc"] = "tab:red"
if "ec" not in kwargs_arrow.keys():
kwargs_arrow["ec"] = "tab:red"
# Convert angles to unit vector
z = np.exp(1j * angles)
r = circ_r(angles) # Resulting vector length
phi = circ_mean(angles) # Circular mean
zm = r * np.exp(1j * phi)
# Plot unit circle
if ax is None:
ax = plt.gca()
circle = Circle((0, 0), 1, edgecolor="k", facecolor="none", linewidth=2)
ax.add_patch(circle)
ax.axvline(0, lw=1, ls=":", color="slategrey")
ax.axhline(0, lw=1, ls=":", color="slategrey")
ax.plot(np.real(z), np.imag(z), ls="None", **kwargs_markers)
# Plot mean resultant vector
ax.arrow(0, 0, np.real(zm), np.imag(zm), **kwargs_arrow)
# X and Y ticks in radians
ax.set_xticks([])
ax.set_yticks([])
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["bottom"].set_visible(False)
ax.text(1.2, 0, "0", verticalalignment="center")
ax.text(-1.3, 0, r"$\pi$", verticalalignment="center")
ax.text(0, 1.2, r"$+\pi/2$", horizontalalignment="center")
ax.text(0, -1.3, r"$-\pi/2$", horizontalalignment="center")
# Make square
if square:
ax.set_aspect("equal")
return ax