Source code for pints.toy._german_credit_hierarchical

#
# German credit toy hierarchical log pdf.
#
# This file is part of PINTS (https://github.com/pints-team/pints/) which is
# released under the BSD 3-clause license. See accompanying LICENSE.md for
# copyright notice and full license details.
#
import io
import numpy as np
import scipy
import urllib.request

from . import ToyLogPDF



[docs]
class GermanCreditHierarchicalLogPDF(ToyLogPDF):
    r"""
    Toy distribution based on a hierarchical logistic regression model, which
    takes the form,

    .. math::

        f(z, y|\beta) \propto \text{exp}(-\sum_{i=1}^{N} \text{log}(1 +
        \text{exp}(-y_i z_i.\beta)) - \beta.\beta/2\sigma^2 -
        N/2 \text{log }\sigma^2 - \lambda \sigma^2)

    The data :math:`(z, y)` are a matrix of individual predictors (with 1s in
    the first column) and responses (1 if the individual should receive credit
    and -1 if not) respectively; :math:`\beta` is a 325x1 vector of
    coefficients and :math:`N=1000`; :math:`z` is the design matrix formed
    by creating all interactions between individual variables and themselves
    as defined in [2]_.

    Extends :class:`pints.LogPDF`.

    Parameters
    ----------
    theta : float
        vector of coefficients of length 326 (first dimension is sigma; other
        entries make up beta)

    References
    ----------
    .. [1] `"UCI machine learning repository"
      <https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)>`_,
      2010. A. Frank and A. Asuncion.
    .. [2] "The No-U-Turn Sampler:  Adaptively Setting Path Lengths in
           Hamiltonian Monte Carlo", 2014, M.D. Hoffman and A. Gelman.
    """
    def __init__(self, x=None, y=None, download=False):
        if x is None or y is None:
            if download is False:
                raise ValueError('No data supplied. Consider setting download'
                                 ' to True to download data.')
            x, y = self._download_data()
            dims = x.shape[1]
        else:
            if download is True:
                raise ValueError(
                    'Either supply no data or set download to True to download'
                    ' data, but not both.')
            dims = x.shape[1]
            if dims != 25:
                raise ValueError('x must have 25 predictor columns.')
            if max(y) != 1 or min(y) != -1:
                raise ValueError('Output must be either 1 or -1.')

        # make design matrix
        self._x = np.copy(x)
        x = x[:, 1:]
        z = np.zeros((1000, 325))
        zz = np.zeros((z.shape[0], 300))
        k = 0
        for i in range(x.shape[1]):
            for j in range(i, x.shape[1]):
                zz[:, k] = np.transpose(x[:, i] * x[:, j])
                k += 1
        zz = np.column_stack([x, zz])
        z[:, 0] = np.ones(1000)
        z[:, 1:] = zz

        self._y = y
        self._z = z
        self._n_parameters = 326
        self._N = len(y)
        self._lambda = 0.01

    def __call__(self, theta):
        sigma = theta[0]
        beta = theta[1::]
        sigma_sq = sigma**2
        log_prob = sum(-np.log(1 + np.exp(-self._y * np.dot(self._z, beta))))
        log_prob += -1 / (2 * sigma_sq) * np.dot(beta, beta) \
                    - self._N / 2 * np.log(sigma) \
                    - self._lambda * sigma_sq
        return log_prob


[docs]
    def data(self):
        """ Returns data used to fit model: `x`, `y` and `z`."""
        return self._x, self._y, self._z


    def _download_data(self):
        """ Downloads data from [1]. """
        url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
               'statlog/german/german.data-numeric')
        url = urllib.request.urlopen(url)
        try:
            raw_data = url.read()
        finally:
            url.close()
        a = np.genfromtxt(io.BytesIO(raw_data), delimiter=4)[:, :25]

        # get output
        y = a[:, -1]
        y[y == 1] = -1
        y[y == 2] = 1

        # get inputs and standardise
        x = a[:, :-1]
        x = scipy.stats.zscore(x)
        x1 = np.zeros((x.shape[0], x.shape[1] + 1))
        x1[:, 0] = np.ones(x.shape[0])
        x1[:, 1:] = x
        x = np.copy(x1)
        return x, y


[docs]
    def evaluateS1(self, theta):
        """ See :meth:`LogPDF.evaluateS1()`. """
        sigma = theta[0]
        sigma_sq = sigma**2

        beta = theta[1::]
        log_prob = 0.0
        grad_log_prob = np.zeros(self.n_parameters())
        for i in range(self._N):
            exp_yxb = np.exp(-self._y[i] * np.dot(self._z[i], beta))
            log_prob += -np.log(1 + exp_yxb)
            grad_log_prob[1:] += (
                self._z[i] * self._y[i] * exp_yxb / (1 + exp_yxb))

        scale = -1 / (2 * sigma_sq)
        log_prob += scale * np.dot(beta, beta)
        grad_log_prob[1:] += scale * 2 * beta

        log_prob += -self._N / 2 * np.log(sigma_sq)
        grad_log_prob[0] += -self._N / sigma

        log_prob += -self._lambda * sigma_sq
        grad_log_prob[0] += -self._lambda * 2 * sigma

        return log_prob, grad_log_prob



[docs]
    def n_parameters(self):
        return self._n_parameters



[docs]
    def suggested_bounds(self):
        """ See :meth:`ToyLogPDF.suggested_bounds()`. """
        magnitude = 100
        bounds = np.tile([-magnitude, magnitude], (self._n_parameters, 1))
        return np.transpose(bounds).tolist()
Source code for pints.toy._german_credit_hierarchical

Pints

Navigation

Related Topics