Source code for pints._optimisers._cmaes_bare

#
# Bare-bones re-implementation of CMA-ES.
#
# This file is part of PINTS (https://github.com/pints-team/pints/) which is
# released under the BSD 3-clause license. See accompanying LICENSE.md for
# copyright notice and full license details.
#
import numpy as np
import pints
import warnings

from numpy.linalg import norm
from scipy.special import gamma



[docs]
class BareCMAES(pints.PopulationBasedOptimiser):
    """
    Finds the best parameters using the CMA-ES method described in [1, 2],
    using a bare bones re-implementation.

    For general use, we recommend the :class:`pints.CMAES` optimiser, which
    wraps around the ``cma`` module provided by the authors of CMA-ES. The
    ``cma`` module provides a battle-tested version of the optimiser.

    The role of this class, is to provide a simpler implementation of only the
    core algorithm of CMA-ES, which is easier to read and analyse, and which
    can be used to compare with bare implementations of other methods.

    Extends :class:`PopulationBasedOptimiser`.

    References
    ----------
    .. [1] The CMA Evolution Strategy: A Tutorial
           Nikolaus Hanse, arxiv
           https://arxiv.org/abs/1604.00772

    .. [2] Hansen, Mueller, Koumoutsakos (2003) "Reducing the time complexity
           of the derandomized evolution strategy with covariance matrix
           adaptation (CMA-ES)". Evolutionary Computation
           https://doi.org/10.1162/106365603321828970

    """

    def __init__(self, x0, sigma0=0.1, boundaries=None):
        super(BareCMAES, self).__init__(x0, sigma0, boundaries)

        # Set initial state
        self._running = False
        self._ready_for_tell = False

        # Best solution found
        self._x_best = pints.vector(x0)
        self._f_best = np.inf

        # Number of iterations run
        self._iterations = 0

        # Mean of the proposal distribution
        self._mu = np.copy(self._x0)

        # Approximate value at self._mu
        self._f_guessed = np.inf

        # Step size
        self._eta = np.min(self._sigma0)

        # Covariance matrix C and decomposition in rotation R and scaling S
        # A decomposition C = R S S R.T can be made, such that R is the matrix
        # of eigenvectors of C, and S is a diagonal matrix containing the
        # square roots of the eigenvalues of C.
        # Here, R and S can be interpreted as a rotation and a scaling matrix
        # respectively.
        # Note that only C is updated directly, while R and S are simply
        # recalculated at every step.
        self._C = np.identity(self._n_parameters)
        self._R = np.identity(self._n_parameters)
        self._S = np.identity(self._n_parameters)

        # Constant used in tell()
        self._e = (
            np.sqrt(2)
            * gamma((self._n_parameters + 1) / 2)
            / gamma(self._n_parameters / 2)
        )


[docs]
    def ask(self):
        """ See :meth:`Optimiser.ask()`. """
        # Initialise on first call
        if not self._running:
            self._initialise()

        # Ready for tell now
        self._ready_for_tell = True

        # Create new samples
        # Normalised samples: centered at zero and no rotation or scaling
        self._zs = np.array([np.random.normal(0, 1, self._n_parameters)
                             for i in range(self._population_size)])

        # Centered samples: centered at zero, with rotation and scaling
        self._ys = np.array([self._R.dot(self._S).dot(z) for z in self._zs])

        # Samples from N(mu, eta**2 * C)
        self._xs = np.array([self._mu + self._eta * y for y in self._ys])

        # Boundaries? Then only pass user xs that are within bounds
        if self._boundaries is not None:
            self._user_ids = np.nonzero(
                [self._boundaries.check(x) for x in self._xs])
            self._user_xs = self._xs[self._user_ids]
            if len(self._user_xs) == 0:     # pragma: no cover
                warnings.warn('All points requested by BareCMAES are outside'
                              ' the boundaries.')
        else:
            self._user_xs = self._xs

        # Set as read-only and return
        self._user_xs.setflags(write=False)
        return self._user_xs



[docs]
    def cov(self, decomposed=False):
        """
        Returns the current covariance matrix ``C`` of the proposal
        distribution.

        If the optional argument ``decomposed`` is set to ``True``, a tuple
        ``(R, S)`` will be returned such that ``R`` contains the eigenvectors
        of ``C`` while ``S`` is a diagonal matrix containing the squares of the
        eigenvalues of ``C``, such that ``C = R S S R.T``.
        """
        if decomposed:
            return self._R, self._S
        else:
            return np.copy(self._C)



[docs]
    def f_best(self):
        """ See :meth:`Optimiser.f_best()`. """
        return self._f_best



[docs]
    def f_guessed(self):
        """ See :meth:`Optimiser.f_guessed()`. """
        return self._f_guessed



[docs]
    def mean(self):
        """
        Returns the current mean of the proposal distribution.
        """
        return self.x_guessed()


    def _initialise(self):
        """
        Initialises the optimiser for the first iteration.
        """
        assert (not self._running)

        # Parent generation population size
        # The parameter parent_pop_size is the mu in the papers. It represents
        # the size of a parent population used to update our paramters.
        self._parent_pop_size = self._population_size // 2

        # Weights, all set equal for the moment
        # Sum of all positive weights should be 1
        self._W = 1 + np.arange(self._population_size)
        self._W = np.log(0.5 * (self._population_size + 1)) - np.log(self._W)

        # Inverse of the sum of the first parent weights squared (variance
        # effective selection mass)
        self._muEff = (
            np.sum(self._W[:self._parent_pop_size]) ** 2
            / np.sum(np.square(self._W[:self._parent_pop_size]))
        )

        # Inverse of the Sum of the last weights squared (variance effective
        # selection mass)
        self._muEffMinus = (
            np.sum(self._W[self._parent_pop_size:]) ** 2
            / np.sum(np.square(self._W[self._parent_pop_size:]))
        )

        # cummulation, evolution paths, used to update Cov matrix and sigma)
        self._pc = np.zeros(self._n_parameters)
        self._psig = np.zeros(self._n_parameters)

        # learning rate for the mean
        self._cm = 1

        # Decay rate of the evolution path for C
        self._ccov = (4 + self._muEff / self._n_parameters) / (
            self._n_parameters + 4 + 2 * self._muEff / self._n_parameters)

        # Decay rate of the evolution path for sigma
        self._csig = (2 + self._muEff) / (self._n_parameters + 5 + self._muEff)

        # See rank-1 vs rank-mu updates
        # Learning rate for rank-1 update
        self._c1 = 2 / ((self._n_parameters + 1.3) ** 2 + self._muEff)

        # Learning rate for rank-mu update
        self._cmu = min(
            2 * (self._muEff - 2 + 1 / self._muEff)
            / ((self._n_parameters + 2) ** 2 + self._muEff),
            1 - self._c1
        )

        # Damping of the step-size (sigma0) update
        self._dsig = 1 + self._csig + 2 * max(
            0, np.sqrt((self._muEff - 1) / (self._n_parameters + 1)) - 1)

        # Parameters from the Table 1 of [1]
        alpha_mu = 1 + self._c1 / self._cmu
        alpha_mueff = 1 + 2 * self._muEffMinus / (self._muEff + 2)
        alpha_pos_def = \
            (1 - self._c1 - self._cmu) / (self._n_parameters * self._cmu)

        # Rescale the weights
        sum_pos = np.sum(self._W[self._W > 0])
        sum_neg = np.sum(self._W[self._W < 0])
        scale_pos = 1 / sum_pos
        scale_neg = min(alpha_mu, alpha_mueff, alpha_pos_def) / -sum_neg
        self._W[self._W > 0] *= scale_pos
        self._W[self._W < 0] *= scale_neg

        # Update optimiser state
        self._running = True


[docs]
    def name(self):
        """ See :meth:`Optimiser.name()`. """
        return 'Bare-bones CMA-ES'



[docs]
    def running(self):
        """ See :meth:`Optimiser.running()`. """
        return self._running



[docs]
    def stop(self):
        """ See :meth:`Optimiser.stop()`. """

        # We use the condition number defined in the pycma code at
        # cma/evolution_strategy.py#L2965.
        cond = np.diagonal(self._S)
        cond = (np.max(cond) / np.min(cond)) ** 2
        if cond > 1e14:     # pragma: no cover
            return 'Ill-conditioned covariance matrix'
        return False


    def _suggested_population_size(self):
        """
        See :meth:`PopulationBasedOptimiser._suggested_population_size().
        """
        return 4 + int(3 * np.log(self._n_parameters))


[docs]
    def tell(self, fx):
        """ See :meth:`Optimiser.tell()`. """

        # Check ask-tell pattern
        if not self._ready_for_tell:
            raise Exception('ask() not called before tell()')
        self._ready_for_tell = False

        # Update iteration count
        self._iterations += 1

        # Some aliases for readability
        n = self._n_parameters
        npo = self._population_size
        npa = self._parent_pop_size

        # Boundaries? Then reconstruct full fx vector
        if self._boundaries is not None and len(fx) < npo:
            user_fx = fx
            fx = np.ones((npo, )) * np.inf
            fx[self._user_ids] = user_fx

        # Order the points from best to worst score
        order = np.argsort(fx)
        xs = self._xs[order]
        zs = self._zs[order]
        ys = self._ys[order]

        # Update the mean
        self._mu += self._cm * np.sum(
            ((xs[:npa] - self._mu).T * self._W[:npa]).T, axis=0)

        # Get the weighted means of y and z
        zmeans = np.sum((zs[:npa].T * self._W[:npa]).T, 0)
        ymeans = np.sum((ys[:npa].T * self._W[:npa]).T, 0)

        # Evolution path of sigma (the step size)
        # Note that self._R.dot(zmeans) =
        #     self._R.dot(np.linalg.inv(self._S)).dot(self._R.T).dot(ymeans)
        # Normalizing constants for the evolution path udpate
        c = np.sqrt(self._csig * (2 - self._csig) * self._muEff)
        self._psig = (1 - self._csig) * self._psig + c * self._R.dot(zmeans)

        # In cma/sigma_adaptation.py#L71 they are NOT using exp_size_N0I, but
        # instead use a term based on n_parameters.
        # Heaviside function helps to stall the of pc if norm ||psig|| is too
        # large. This helps to prevent too fast increases in the axes of C when
        # the step size is too small.
        cond = (
            norm(self._psig)
            / np.sqrt(1 - (1 - self._csig) ** (2 * (self._iterations + 1)))
            < (1.4 + 2 / (n + 1)) * self._e
        )
        h_sig = 1 if cond else 0
        delta_sig = (1 - h_sig) * self._ccov * (2 - self._ccov)

        # Evolution path for the rank-1 update
        c = np.sqrt(self._ccov * (2 - self._ccov) * self._muEff)
        self._pc = (1 - self._ccov) * self._pc + h_sig * c * ymeans

        # Weight changes taken from the tutorial (no explanation is given for
        # the change) these weights are used for the rank mu update only.
        # They allow to keep positive definiteness according to
        # cma/purecma.py#L419
        temp_weights = np.copy(self._W)
        for i, w in enumerate(self._W):
            if w < 0:
                temp_weights[i] = w * n / (norm(self._R * zs[i]) ** 2)

        # Update the Covariance matrix:

        # Calculate the rank 1 update using the evolution path
        rank1 = self._c1 * np.outer(self._pc, self._pc)

        # Calculate the rank-mu update
        yy = np.array([np.outer(y, y) for y in ys]).T
        rankmu = self._cmu * np.sum((yy * temp_weights).T, 0)

        # Update C
        self._C = rank1 + rankmu + self._C * (
            1 + delta_sig * self._c1 - self._c1 - self._cmu * sum(self._W))

        # Avoid numerical issues by forcing C to be symmetric
        self._C = np.triu(self._C) + np.triu(self._C, 1).T

        # Update the step size
        # Here we are simply looking at the ratio of the length of the
        #  evolution path vs its expected lenght ( E|Gaussian(0,I)|)
        # We use the difference of the ratio with 1 and scale using the
        #  learning rate and the damping parameters.
        self._eta *= np.exp(
            self._csig / self._dsig * (norm(self._psig) / self._e - 1))

        # Update eigenvectors and eigenvalues of C
        eig = np.linalg.eigh(self._C)
        self._S = np.sqrt(np.diag(eig[0]))
        self._R = eig[1]

        # Update f_guessed on the assumption that the lowest value in our
        # sample approximates f(mu)
        self._f_guessed = fx[order[0]]

        # Update x_best and f_best
        if self._f_guessed < self._f_best:
            self._f_best = self._f_guessed
            self._x_best = np.array(xs[0], copy=True)



[docs]
    def x_best(self):
        """ See :meth:`Optimiser.x_best()`. """
        return self._x_best



[docs]
    def x_guessed(self):
        """ See :meth:`Optimiser.x_guessed()`. """
        return np.array(self._mu, copy=True)
Source code for pints._optimisers._cmaes_bare

Pints

Navigation

Related Topics