Source code for pints.toy._german_credit_hierarchical

#
# German credit toy hierarchical log pdf.
#
# This file is part of PINTS (https://github.com/pints-team/pints/) which is
# released under the BSD 3-clause license. See accompanying LICENSE.md for
# copyright notice and full license details.
#
import io
import numpy as np
import scipy
import urllib.request

from . import ToyLogPDF


[docs] class GermanCreditHierarchicalLogPDF(ToyLogPDF): r""" Toy distribution based on a hierarchical logistic regression model, which takes the form, .. math:: f(z, y|\beta) \propto \text{exp}(-\sum_{i=1}^{N} \text{log}(1 + \text{exp}(-y_i z_i.\beta)) - \beta.\beta/2\sigma^2 - N/2 \text{log }\sigma^2 - \lambda \sigma^2) The data :math:`(z, y)` are a matrix of individual predictors (with 1s in the first column) and responses (1 if the individual should receive credit and -1 if not) respectively; :math:`\beta` is a 325x1 vector of coefficients and :math:`N=1000`; :math:`z` is the design matrix formed by creating all interactions between individual variables and themselves as defined in [2]_. Extends :class:`pints.LogPDF`. Parameters ---------- theta : float vector of coefficients of length 326 (first dimension is sigma; other entries make up beta) References ---------- .. [1] `"UCI machine learning repository" <https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)>`_, 2010. A. Frank and A. Asuncion. .. [2] "The No-U-Turn Sampler: Adaptively Setting Path Lengths in Hamiltonian Monte Carlo", 2014, M.D. Hoffman and A. Gelman. """ def __init__(self, x=None, y=None, download=False): if x is None or y is None: if download is False: raise ValueError('No data supplied. Consider setting download' ' to True to download data.') x, y = self._download_data() dims = x.shape[1] else: if download is True: raise ValueError( 'Either supply no data or set download to True to download' ' data, but not both.') dims = x.shape[1] if dims != 25: raise ValueError('x must have 25 predictor columns.') if max(y) != 1 or min(y) != -1: raise ValueError('Output must be either 1 or -1.') # make design matrix self._x = np.copy(x) x = x[:, 1:] z = np.zeros((1000, 325)) zz = np.zeros((z.shape[0], 300)) k = 0 for i in range(x.shape[1]): for j in range(i, x.shape[1]): zz[:, k] = np.transpose(x[:, i] * x[:, j]) k += 1 zz = np.column_stack([x, zz]) z[:, 0] = np.ones(1000) z[:, 1:] = zz self._y = y self._z = z self._n_parameters = 326 self._N = len(y) self._lambda = 0.01 def __call__(self, theta): sigma = theta[0] beta = theta[1::] sigma_sq = sigma**2 log_prob = sum(-np.log(1 + np.exp(-self._y * np.dot(self._z, beta)))) log_prob += -1 / (2 * sigma_sq) * np.dot(beta, beta) \ - self._N / 2 * np.log(sigma) \ - self._lambda * sigma_sq return log_prob
[docs] def data(self): """ Returns data used to fit model: `x`, `y` and `z`.""" return self._x, self._y, self._z
def _download_data(self): """ Downloads data from [1]. """ url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/' 'statlog/german/german.data-numeric') url = urllib.request.urlopen(url) try: raw_data = url.read() finally: url.close() a = np.genfromtxt(io.BytesIO(raw_data), delimiter=4)[:, :25] # get output y = a[:, -1] y[y == 1] = -1 y[y == 2] = 1 # get inputs and standardise x = a[:, :-1] x = scipy.stats.zscore(x) x1 = np.zeros((x.shape[0], x.shape[1] + 1)) x1[:, 0] = np.ones(x.shape[0]) x1[:, 1:] = x x = np.copy(x1) return x, y
[docs] def evaluateS1(self, theta): """ See :meth:`LogPDF.evaluateS1()`. """ sigma = theta[0] sigma_sq = sigma**2 beta = theta[1::] log_prob = 0.0 grad_log_prob = np.zeros(self.n_parameters()) for i in range(self._N): exp_yxb = np.exp(-self._y[i] * np.dot(self._z[i], beta)) log_prob += -np.log(1 + exp_yxb) grad_log_prob[1:] += ( self._z[i] * self._y[i] * exp_yxb / (1 + exp_yxb)) scale = -1 / (2 * sigma_sq) log_prob += scale * np.dot(beta, beta) grad_log_prob[1:] += scale * 2 * beta log_prob += -self._N / 2 * np.log(sigma_sq) grad_log_prob[0] += -self._N / sigma log_prob += -self._lambda * sigma_sq grad_log_prob[0] += -self._lambda * 2 * sigma return log_prob, grad_log_prob
[docs] def n_parameters(self): return self._n_parameters
[docs] def suggested_bounds(self): """ See :meth:`ToyLogPDF.suggested_bounds()`. """ magnitude = 100 bounds = np.tile([-magnitude, magnitude], (self._n_parameters, 1)) return np.transpose(bounds).tolist()