Source code for pints.residuals_diagnostics

#
# Functions for analysing the residuals and evaluating noise models
#
# This file is part of PINTS (https://github.com/pints-team/pints/) which is
# released under the BSD 3-clause license. See accompanying LICENSE.md for
# copyright notice and full license details.
#


[docs]def plot_residuals_binned_autocorrelation(parameters, problem, thinning=None, n_bins=25): r""" Plot the autocorrelation of the residuals within bins (i.e. discrete time windows across the series). Given a time series with observed residuals .. math:: e_i = y_i - f(t_i; \theta) This method divides the vector of residuals into some number of equally sized bins. The lag 1 autocorrelation is calculated for the residuals within each bin. The plot shows the lag 1 autocorrelation in each bin over time. This diagnostic is useful for diagnosing time series with noise whose autocorrelation varies over time. When passing an array of parameters (from an MCMC sampler), this method plots the autocorrelations of the posterior median residual values. Typically, this diagnostic is called after obtaining the residuals of an IID fit, in order to determine whether the IID fit is satisfactory or a more complex noise model is needed. This function returns a ``matplotlib`` figure. Parameters ---------- parameters The parameter values with shape ``(n_samples, n_parameters)``. When passing a single best fit parameter vector, ``n_samples`` will be 1. problem The problem given by a :class:`pints.SingleOutputProblem` or :class:`pints.MultiOutputProblem`, with ``n_parameters`` greater than or equal to the ``n_parameters`` of the ``parameters``. Extra parameters not found in the problem are ignored. thinning Optional int value (greater than zero). If thinning is set to ``n``, only every nth sample in parameters will be used. If set to ``None`` (default), some thinning will be applied so that about 200 samples will be used. n_bins Optional int value (greater than zero) giving the number of bins into which to divide the time series. By default, it is fixed to 25. """ def lag_one_autocorr_resids(data): return acorr(data, 1)[-1] return _plot_residuals_binned(parameters, problem, thinning=thinning, n_bins=n_bins, calculate=lag_one_autocorr_resids, label='Lag 1 autocorrelation', ylim=(-1, 1), draw_horizontal=True)
[docs]def plot_residuals_binned_std(parameters, problem, thinning=None, n_bins=25): r""" Plot the standard deviation of the residuals within bins (i.e. discrete time windows across the series). Given a time series with observed residuals .. math:: e_i = y_i - f(t_i; \theta) This method divides the vector of residuals into some number of equally sized bins. The standard deviation is calculated for the residuals within each bin. The plot shows the standard deviation in each bin over time. This diagnostic is particularly useful for diagnosing time series whose noise exhibits a change in variance over time. When passing an array of parameters (from an MCMC sampler), this method will plot the standard deviation of the posterior median residual values. Typically, this diagnostic can be called after obtaining the residuals of an IID fit, in order to determine whether the IID fit is satisfactory or a more complex noise model is needed. This function returns a ``matplotlib`` figure. Parameters ---------- parameters The parameter values with shape ``(n_samples, n_parameters)``. When passing a single best fit parameter vector, ``n_samples`` will be 1. problem The problem given by a :class:`pints.SingleOutputProblem` or :class:`pints.MultiOutputProblem`, with ``n_parameters`` greater than or equal to the ``n_parameters`` of the ``parameters``. Extra parameters not found in the problem are ignored. thinning Optional int value (greater than zero). If thinning is set to ``n``, only every nth sample in parameters will be used. If set to ``None`` (default), some thinning will be applied so that about 200 samples will be used. n_bins Optional int value (greater than zero) giving the number of bins into which to divide the time series. By default, it is fixed to 25. """ import numpy as np return _plot_residuals_binned(parameters, problem, thinning=thinning, n_bins=n_bins, calculate=np.std, label='Standard deviation')
def _plot_residuals_binned(parameters, problem, thinning=None, n_bins=25, calculate=None, label='Standard deviation', ylim=None, draw_horizontal=False): """ Make a matplotlib plot of some function of the binned residuals. This is a general function which divides the residuals into bins, performs some calculation from the residuals in each bin, and plots the results. It supports both lag 1 autocorrelation and standard deviation. Parameters ---------- parameters The parameter values with shape ``(n_samples, n_parameters)``. When passing a single best fit parameter vector, ``n_samples`` will be 1. problem The problem given by a :class:`pints.SingleOutputProblem` or :class:`pints.MultiOutputProblem`, with ``n_parameters`` greater than or equal to the ``n_parameters`` of the ``parameters``. Extra parameters not found in the problem are ignored. thinning Optional int value (greater than zero). If thinning is set to ``n``, only every nth sample in parameters will be used. If set to ``None`` (default), some thinning will be applied so that about 200 samples will be used. n_bins Optional int value (greater than zero) giving the number of bins into which to divide the time series. By default, it is fixed to 25. calculate : function What value to calculate within each bin. This function should take as input a numpy array of residuals within a bin and return a scalar value which will be added to the plot. label : str A label to put on the y axis of the plot, describing what function of the binned residuals is being plotted. ylim : tuple Optional bounds for the y-axis of the plot. draw_horizontal : bool Whether or not to draw a horizontal line at y=0 on the plot. This line may be desired when zero is an important reference point for the value being calculated in each bin. """ import numpy as np import matplotlib.pyplot as plt times = problem.times() # Get the number of problem outputs n_outputs = problem.n_outputs() # Get the matrix of residuals values residuals = calculate_residuals(parameters, problem, thinning=thinning) # Get the posterior median residuals residuals = np.median(residuals, axis=0) n_bins = int(n_bins) if n_bins < 1: raise ValueError('n_bins must be an integer value greater than zero.') if n_bins > len(times): raise ValueError('n_bins must not exceed the number of time points') # Make the figure, with one axes for each output fig = plt.figure() for output in range(n_outputs): ax = fig.add_subplot(n_outputs, 1, output + 1) # Get the residuals along this output e = residuals[output, :] # Divide the residuals into bins binned_data = np.array_split(e, n_bins) binned_times = np.array_split(times, n_bins) # Calculate lag 1 autocorrelation and time in each bin bin_values = [] bin_times = [] for data, t in zip(binned_data, binned_times): bin_values.append(calculate(data)) bin_times.append(np.mean(t)) # Plot the binned data ax.plot(bin_times, bin_values, 'o-', color='red') if ylim is not None: ax.set_ylim(*ylim) ax.set_xlabel('Time') ax.set_ylabel(label) ax.set_title('Output %d' % (output + 1)) if draw_horizontal: # Draw a horizontal line at 0 autocorrelation ax.axhline(0, color='C0', zorder=-10) return fig
[docs]def plot_residuals_distance(parameters, problem, thinning=None): r""" Plot a distance matrix of the residuals. Given a time series with observed residuals .. math:: e_i = y_i - f(t_i; \theta) this function generates and plots the distance matrix :math:`D` whose entries are defined by .. math:: D_{i, j} = |e_i - e_j| The plot of this matrix may be helpful for identifying a time series with correlated noise. When the noise terms are correlated, the distance matrix :math:`D` is likely to have a banded appearance. For problems with multiple outputs, one distance matrix is generated for each output. When passing an array of parameters (from an MCMC sampler), this method will plot the distance matrix of the posterior median residual values. Typically, this diagnostic is called after obtaining the residuals of an IID fit, in order to determine whether the IID fit is satisfactory or a more complex noise model is needed. This function returns a ``matplotlib`` figure. Parameters ---------- parameters The parameter values with shape ``(n_samples, n_parameters)``. When passing a single best fit parameter vector, ``n_samples`` will be 1. problem The problem given by a :class:`pints.SingleOutputProblem` or :class:`pints.MultiOutputProblem`, with ``n_parameters`` greater than or equal to the ``n_parameters`` of the ``parameters``. Extra parameters not found in the problem are ignored. thinning Optional int value (greater than zero). If thinning is set to ``n``, only every nth sample in parameters will be used. If set to ``None`` (default), some thinning will be applied so that about 200 samples will be used. """ import numpy as np import matplotlib.pyplot as plt times = problem.times() # Get the number of problem outputs n_outputs = problem.n_outputs() # Get the matrix of residuals values residuals = calculate_residuals(parameters, problem, thinning=thinning) # Get the posterior median residuals residuals = np.median(residuals, axis=0) # Make the figure, with one axes for each output fig = plt.figure() for output in range(n_outputs): ax = fig.add_subplot(n_outputs, 1, output + 1) # Get the residuals along this output e = residuals[output, :] # Calculate the distance matrix D = np.abs(e[:, np.newaxis] - e) D = np.flip(D, axis=0) im = ax.imshow(D, cmap='Greys_r', extent=[min(times), max(times), min(times), max(times)]) ax.set_xlabel('Time') ax.set_ylabel('Time') ax.set_title('Output %d' % (output + 1)) fig.colorbar(im) return fig
[docs]def plot_residuals_autocorrelation(parameters, problem, max_lag=10, thinning=None, significance_level=0.05, posterior_interval=0.95): r""" Generate an autocorrelation plot of the residuals. This function can be used to analyse the results of either optimisation or MCMC Bayesian inference. When multiple samples of the residuals are present (corresponding to multiple MCMC samples), the plot illustrates the distribution of autocorrelations across the MCMC samples. At each lag, a point is drawn at the median autocorrelation, and a line is drawn giving the percentile range of the posterior interval specified as an argument (by default, the 2.5th to the 97.5th percentile). When multiple outputs are present, one residuals plot will be generated for each output. When a significance level is provided, confidence bounds for the sample autocorrelations under the assumption of IID residuals are drawn on the plot. Many of the observed residuals autocorrelations falling outside these bounds could imply evidence against the residuals being IID. Under the assumption that the residuals of length :math:`n` are IID with mean 0 and variance :math:`\sigma^2`, for large :math:`n` the residuals sample autocorrelations are approximately IID Normal(mean=0, variance=1/n). This result is proved in [1]_ (see Theorem 7.2.2 and Example 7.2.1). Therefore, the confidence bounds can be calculated by :math:`\pm z^* n^{-1/2}` for the appropriate critical value :math:`z^*`. This function returns a ``matplotlib`` figure. Parameters ---------- parameters The parameter values with shape ``(n_samples, n_parameters)``. When passing a single best fit parameter vector, ``n_samples`` will be 1. problem The problem given by a :class:`pints.SingleOutputProblem` or :class:`pints.MultiOutputProblem`, with ``n_parameters`` greater than or equal to the ``n_parameters`` of the ``parameters``. Extra parameters not found in the problem are ignored. max_lag Optional int value (default 10). The highest lag to plot. thinning Optional int value (greater than zero). If thinning is set to ``n``, only every nth sample in parameters will be used. If set to ``None`` (default), some thinning will be applied so that about 200 samples will be used. significance_level ``None`` or float value (default 0.05). When a significance level is provided, dashed lines for the confidence interval corresponding to that significance level are drawn on the plot. When ``None``, no lines are drawn. posterior_interval Float value (default 0.95). When multiple samples of the parameter values are provided, this gives the size of the credible region of the posterior to plot. References ---------- .. [1] Brockwell, P. J., & Davis, R. A. (1991). Time series: Theory and methods (2nd ed.). New York: Springer. """ import numpy as np import scipy.special import matplotlib.pyplot as plt # Get the number of problem outputs n_outputs = problem.n_outputs() # Get the matrix of residuals values residuals = calculate_residuals(parameters, problem, thinning=thinning) # Get the number of samples n_samples = residuals.shape[0] # If there are multiple samples of the parameters, calculate the # percentiles of the posterior to plot if n_samples > 1: if posterior_interval > 1 or posterior_interval < 0: raise ValueError('posterior interval must fall between 0 and 1') upper_pctle = (0.5 + posterior_interval / 2) * 100 lower_pctle = (0.5 - posterior_interval / 2) * 100 # Find the location of the confidence interval dotted lines if significance_level is not None: if significance_level > 1 or significance_level < 0: raise ValueError('significance level must fall between 0 and ' '1') threshold = scipy.special.ndtri(1 - significance_level / 2) threshold /= np.sqrt(residuals.shape[2]) # Set up one axes for each output fig, axes = plt.subplots(n_outputs, 1, sharex=True, figsize=(6, 4 * n_outputs)) # If there is only a single axes, place it in a list anyway. if n_outputs == 1: axes = [axes] # Iterate over each problem output. Each output gets its own residuals # autocorrelation plot for output_idx in range(n_outputs): if n_samples == 1: c = acorr(residuals[0, output_idx, :], max_lag) median_acorr = c[max_lag:] yerr = None else: # In this case, there are multiple samples, and a point plot of the # distribution of autocorrelations is desired. Start by # instantiating an empty array to hold autocorrelations for each # sample. cs = np.zeros((n_samples, max_lag + 1)) # For each residual vector, get the autocorrelations for sample_idx in range(n_samples): c = acorr(residuals[sample_idx, output_idx, :], max_lag) cs[sample_idx, :] = c[max_lag:] # Calculate the necessary percentiles of the sample of # autocorrelations median_acorr = np.median(cs, axis=0) lower_acorr = np.percentile(cs, lower_pctle, axis=0) upper_acorr = np.percentile(cs, upper_pctle, axis=0) # Calculate the length of each bar in the point plot yerr = np.vstack((median_acorr - lower_acorr, upper_acorr - median_acorr)) # Plot the autocorrelation points and distributions. matplotlib # errorbar is used to handle the distribution lines axes[output_idx].errorbar(np.arange(0, max_lag + 1), median_acorr, yerr=yerr, color='red', fmt='o-') # Draw the dashed lines showing the confidence interval if significance_level is not None: axes[output_idx].axhline(threshold, ls='--', c='k', zorder=-10) axes[output_idx].axhline(-threshold, ls='--', c='k', zorder=-10) # Draw a horizontal line at 0 autocorrelation axes[output_idx].axhline(0, color='C0', zorder=-10) # Add y-label and adjust limits axes[output_idx].set_ylabel('Output %d\nresiduals autocorrelation' % (output_idx + 1)) axes[output_idx].set_xlim(-0.5, max_lag + 0.5) # Add x-label (common to all outputs) axes[-1].set_xlabel('Lag') return fig
[docs]def plot_residuals_vs_output(parameters, problem, thinning=None): """Draw a plot of the magnitude of residuals versus the solution output. This plot is useful to detect any dependence between the error model and the magnitude of the solution. For example, it may help to detect multiplicative Gaussian noise, in which the standard deviation of the error scales with the output. When multiple samples of the parameters are provided (from an MCMC chain), the residuals are calculated and plotted relative to the posterior median of the solution outputs. This function returns a ``matplotlib`` figure. Parameters ---------- ``parameters`` The parameter values with shape ``(n_samples, n_parameters)``. When passing a single best fit parameter vector, ``n_samples`` will be 1. ``problem`` The problem given by a :class:`pints.SingleOutputProblem` or :class:`pints.MultiOutputProblem`, with ``n_parameters`` greater than or equal to the ``n_parameters`` of the ``parameters``. Extra parameters not found in the problem are ignored. ``thinning`` Optional, integer value (greater than zero). If thinning is set to ``n``, only every nth sample in parameters will be used. If set to ``None`` (default), some thinning will be applied so that about 200 samples will be used. """ import numpy as np import matplotlib.pyplot as plt # Make sure that the parameters argument has the correct shape try: n_samples, n_params = parameters.shape except ValueError: raise ValueError('`parameters` must be of shape (n_samples,' + ' n_parameters).') n_outputs = problem.n_outputs() # Get the thinning level if thinning is None: thinning = max(1, int(n_samples / 200)) else: thinning = int(thinning) if thinning < 1: raise ValueError( 'Thinning rate must be `None` or an integer greater than' ' zero.') # Solve the model for each parameter predicted_values = [] for params in parameters[::thinning, :n_params]: predicted_values.append(problem.evaluate(params)) # Get the posterior median solution posterior_median_values = np.median(predicted_values, axis=0) # Calculate the residuals relative to this posterior median residuals = problem.values() - posterior_median_values # If there is only one output, add the output dimension manually if n_outputs == 1: residuals = residuals[np.newaxis, :] posterior_median_values = posterior_median_values[np.newaxis, :] # Set up one axes for each output fig, axes = plt.subplots(n_outputs, 1, figsize=(6, 4 * n_outputs)) # If there is only a single axes, place it in a list anyway if n_outputs == 1: axes = [axes] # Plot the calculated residuals for output_idx in range(n_outputs): ax = axes[output_idx] ax.scatter(posterior_median_values[output_idx], np.abs(residuals[output_idx]), alpha=0.4) ax.set_xlabel('solution magnitude') ax.set_ylabel('residuals (absolute value)') return fig
[docs]def acorr(x, max_lag): """ Calculate the normalised autocorrelation for a given data series. This function uses the same procedure as ``matplotlib.pyplot.acorr``, but it just calculates the autocorrelation without plotting anything. Returns the autocorrelation as a NumPy array. Parameters ---------- x A 1d NumPy array containing the time series for which to calculate autocorrelation. max_lag An int specifying the highest lag to consider. """ import numpy as np c = np.correlate(x, x, mode='full') # Normalise c /= np.dot(x, x) # Truncate at max_lag in each direction T = len(x) c = c[T - 1 - max_lag:T + max_lag] return c
[docs]def calculate_residuals(parameters, problem, thinning=None): """ Calculate the residuals (difference between actual data and the fit). Either a single set of parameters or a chain of MCMC samples can be provided. The residuals are returned as a 3-dimensional NumPy array with shape ``(n_samples, n_outputs, n_times)``. Parameters ---------- parameters The parameter values with shape ``(n_samples, n_parameters)``. When passing a single best fit parameter vector, ``n_samples`` will be 1. problem The problem given by a :class:`pints.SingleOutputProblem` or :class:`pints.MultiOutputProblem`, with ``n_parameters`` greater than or equal to the ``n_parameters`` of the ``parameters``. Extra parameters not found in the problem are ignored. thinning Optional, integer value (greater than zero). If thinning is set to ``n``, only every nth sample in parameters will be used. If set to ``None`` (default), some thinning will be applied so that about 200 samples will be used. """ import numpy as np # Make sure that the parameters argument has the correct shape try: n_samples, n_params = parameters.shape except ValueError: raise ValueError('`parameters` must be of shape (n_samples,' + ' n_parameters).') # Get the number of problem parameters n_params = problem.n_parameters() # Get the thinning level if thinning is None: thinning = max(1, int(n_samples / 200)) else: thinning = int(thinning) if thinning < 1: raise ValueError( 'Thinning rate must be `None` or an integer greater than' ' zero.') # Solve the model for each parameter setting i = 0 predicted_values = [] for params in parameters[::thinning, :n_params]: predicted_values.append(problem.evaluate(params)) i += 1 predicted_values = np.array(predicted_values) # Calculate the residuals residuals = problem.values() - predicted_values # Arrange the residuals into the correct format # (n_samples, n_outputs, n_times) if residuals.ndim == 2: residuals = residuals[:, np.newaxis, :] else: residuals = np.swapaxes(residuals, 2, 1) return residuals