Source code for GPy.models.ibp_lfm

# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import numpy as np

from ..core.sparse_gp_mpi import SparseGP_MPI
from .. import kern
from ..util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, pdinv
from ..util import diag
from ..core.parameterization import Param
from ..likelihoods import Gaussian
from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
from ..inference.latent_function_inference.posterior import Posterior
from GPy.core.parameterization.variational import VariationalPrior
from ..core.parameterization.parameterized import Parameterized
from paramz.transformations import Logexp, Logistic, __fixed__
log_2_pi = np.log(2*np.pi)

[docs]class VarDTC_minibatch_IBPLFM(VarDTC_minibatch):
    '''
    Modifications of VarDTC_minibatch for IBP LFM
    '''
    
    def __init__(self, batchsize=None, limit=3, mpi_comm=None):
        super(VarDTC_minibatch_IBPLFM, self).__init__(batchsize, limit, mpi_comm)

[docs]    def gatherPsiStat(self, kern, X, Z, Y, beta, Zp):

        het_noise = beta.size > 1
        
        assert beta.size == 1
    
        trYYT = self.get_trYYT(Y)
        if self.Y_speedup and not het_noise:
            Y = self.get_YYTfactor(Y)
    
        num_inducing = Z.shape[0]
        num_data, output_dim = Y.shape
        batchsize = num_data if self.batchsize is None else self.batchsize
    
        psi2_full = np.zeros((num_inducing, num_inducing)) # MxM
        psi1Y_full = np.zeros((output_dim, num_inducing)) # DxM
        psi0_full = 0.
        YRY_full = 0.
    
        for n_start in range(0, num_data, batchsize):
            n_end = min(batchsize+n_start, num_data)
            if batchsize == num_data:
                Y_slice = Y
                X_slice = X
            else:
                Y_slice = Y[n_start:n_end]
                X_slice = X[n_start:n_end]
            
            if het_noise:
                b = beta[n_start]
                YRY_full += np.inner(Y_slice, Y_slice)*b
            else:
                b = beta
    
            psi0 = kern._Kdiag(X_slice) #Kff^q
            psi1 = kern.K(X_slice, Z) #Kfu

            indX = X_slice.values
            indX = np.int_(np.round(indX[:, -1]))

            Zp = Zp.gamma.values
            # Extend Zp across columns
            indZ = Z.values
            indZ = np.int_(np.round(indZ[:, -1])) - Zp.shape[0]
            Zpq = Zp[:, indZ]

            for d in np.unique(indX):
                indd = indX == d
                psi1d = psi1[indd, :]
                Zpd = Zp[d, :]
                Zp2 = Zpd[:, None]*Zpd[None, :] - np.diag(np.power(Zpd, 2)) + np.diag(Zpd)
                psi2_full += (np.dot(psi1d.T, psi1d)*Zp2[np.ix_(indZ, indZ)])*b #Zp2*Kufd*Kfud*beta

            psi0_full += np.sum(psi0*Zp[indX, :])*b
            psi1Y_full += np.dot(Y_slice.T, psi1*Zpq[indX, :])*b
    
        if not het_noise:
            YRY_full = trYYT*beta
    
        if self.mpi_comm is not None:
            from mpi4py import MPI
            psi0_all = np.array(psi0_full)
            psi1Y_all = psi1Y_full.copy()
            psi2_all = psi2_full.copy()
            YRY_all = np.array(YRY_full)
            self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE])
            self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE])
            self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE])
            self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE])
            return psi0_all, psi1Y_all, psi2_all, YRY_all
    
        return psi0_full, psi1Y_full, psi2_full, YRY_full


[docs]    def inference_likelihood(self, kern, X, Z, likelihood, Y, Zp):
        """
        The first phase of inference:
        Compute: log-likelihood, dL_dKmm

        Cached intermediate results: Kmm, KmmInv,
        """
        
        num_data, output_dim = Y.shape
        input_dim = Z.shape[0]
        if self.mpi_comm is not None:
            from mpi4py import MPI
            num_data_all = np.array(num_data,dtype=np.int32)
            self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT])
            num_data = num_data_all

        #see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        het_noise = beta.size > 1
        if het_noise:
            self.batchsize = 1

        psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, Zp)

        #======================================================================
        # Compute Common Components
        #======================================================================

        Kmm = kern.K(Z).copy()
        diag.add(Kmm, self.const_jitter)
        if not np.isfinite(Kmm).all():
            print(Kmm)
        Lm = jitchol(Kmm)
        LmInv = dtrtri(Lm)

        LmInvPsi2LmInvT = np.dot(LmInv, np.dot(psi2_full, LmInv.T))
        Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
        LL = jitchol(Lambda)
        LLInv = dtrtri(LL)
        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
        LmLLInv = np.dot(LLInv, LmInv)
        
        b = np.dot(psi1Y_full, LmLLInv.T)
        bbt = np.sum(np.square(b))
        v = np.dot(b, LmLLInv).T
        LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)

        tmp = -np.dot(np.dot(LLInv.T, LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim)), LLInv)
        dL_dpsi2R = .5*np.dot(np.dot(LmInv.T, tmp + output_dim*np.eye(input_dim)), LmInv)

        # Cache intermediate results
        self.midRes['dL_dpsi2R'] = dL_dpsi2R
        self.midRes['v'] = v

        #======================================================================
        # Compute log-likelihood
        #======================================================================
        if het_noise:
            logL_R = -np.sum(np.log(beta))
        else:
            logL_R = -num_data*np.log(beta)
        logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)*.5 - output_dim*logdet_L*.5

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = dL_dpsi2R - .5*output_dim*np.dot(np.dot(LmInv.T, LmInvPsi2LmInvT), LmInv)

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        if not self.Y_speedup or het_noise:
            wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
            post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
        else:
            post = None

        #======================================================================
        # Compute dL_dthetaL for uncertian input and non-heter noise
        #======================================================================

        if not het_noise:
            dL_dthetaL = .5*(YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta) - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
            self.midRes['dL_dthetaL'] = dL_dthetaL

        return logL, dL_dKmm, post
        
[docs]    def inference_minibatch(self, kern, X, Z, likelihood, Y, Zp):
        """
        The second phase of inference: Computing the derivatives over a minibatch of Y
        Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
        return a flag showing whether it reached the end of Y (isEnd)
        """

        num_data, output_dim = Y.shape

        #see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        het_noise = beta.size > 1
        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
        #self.YYTfactor = beta*self.get_YYTfactor(Y)
        if self.Y_speedup and not het_noise:
            YYT_factor = self.get_YYTfactor(Y)
        else:
            YYT_factor = Y

        n_start = self.batch_pos
        batchsize = num_data if self.batchsize is None else self.batchsize
        n_end = min(batchsize+n_start, num_data)
        if n_end == num_data:
            isEnd = True
            self.batch_pos = 0
        else:
            isEnd = False
            self.batch_pos = n_end

        if batchsize == num_data:
            Y_slice = YYT_factor
            X_slice = X
        else:
            Y_slice = YYT_factor[n_start:n_end]
            X_slice = X[n_start:n_end]

        psi0 = kern._Kdiag(X_slice) #Kffdiag
        psi1 = kern.K(X_slice, Z) #Kfu
        betapsi1 = np.einsum('n,nm->nm', beta, psi1)

        X_slice = X_slice.values
        Z = Z.values

        Zp = Zp.gamma.values
        indX = np.int_(X_slice[:, -1])
        indZ = np.int_(Z[:, -1]) - Zp.shape[0]

        betaY = beta*Y_slice

        #======================================================================
        # Load Intermediate Results
        #======================================================================

        dL_dpsi2R = self.midRes['dL_dpsi2R']
        v = self.midRes['v']

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -.5*output_dim*(beta * Zp[indX, :]) #XxQ #TODO: Check this gradient

        dL_dpsi1 = np.dot(betaY, v.T)
        dL_dEZp  = psi1*dL_dpsi1
        dL_dpsi1 = Zp[np.ix_(indX, indZ)]*dL_dpsi1
        dL_dgamma = np.zeros(Zp.shape)
        for d in np.unique(indX):
            indd = indX == d
            betapsi1d = betapsi1[indd, :]
            psi1d = psi1[indd, :]
            Zpd = Zp[d, :]
            Zp2 = Zpd[:, None]*Zpd[None, :] - np.diag(np.power(Zpd, 2)) + np.diag(Zpd)
            dL_dpsi1[indd, :] += np.dot(betapsi1d, Zp2[np.ix_(indZ, indZ)] * dL_dpsi2R)*2.

            dL_EZp2 = dL_dpsi2R * (np.dot(psi1d.T, psi1d) * beta)*2.  # Zpd*Kufd*Kfud*beta
            #Gradient of Likelihood wrt gamma is calculated here
            EZ = Zp[d, indZ]
            for q in range(Zp.shape[1]):
                EZt = EZ.copy()
                indq = indZ == q
                EZt[indq] = .5
                dL_dgamma[d, q] = np.sum(dL_dEZp[np.ix_(indd, indq)]) + np.sum(dL_EZp2[:, indq]*EZt[:, None]) -\
                    .5*beta*(np.sum(psi0[indd, q]))

        #======================================================================
        # Compute dL_dthetaL
        #======================================================================
        if isEnd:
            dL_dthetaL = self.midRes['dL_dthetaL']
        else:
            dL_dthetaL = 0.

        grad_dict = {'dL_dKdiag': dL_dpsi0,
                     'dL_dKnm': dL_dpsi1,
                     'dL_dthetaL': dL_dthetaL,
                     'dL_dgamma': dL_dgamma}

        return isEnd, (n_start, n_end), grad_dict


[docs]def update_gradients(model, mpi_comm=None):
    if mpi_comm is None:
        Y = model.Y
        X = model.X
    else:
        Y = model.Y_local
        X = model.X[model.N_range[0]:model.N_range[1]]

    model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y, model.Zp)

    het_noise = model.likelihood.variance.size > 1

    if het_noise:
        dL_dthetaL = np.empty((model.Y.shape[0],))
    else:
        dL_dthetaL = np.float64(0.)

    kern_grad = model.kern.gradient.copy()
    kern_grad[:] = 0.
    model.Z.gradient = 0.
    gamma_gradient = model.Zp.gamma.copy()
    gamma_gradient[:] = 0.

    isEnd = False
    while not isEnd:
        isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y, model.Zp)

        if (n_range[1]-n_range[0]) == X.shape[0]:
            X_slice = X
        elif mpi_comm is None:
            X_slice = model.X[n_range[0]:n_range[1]]
        else:
            X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]

        #gradients w.r.t. kernel
        model.kern.update_gradients_diag(grad_dict['dL_dKdiag'], X_slice)
        kern_grad += model.kern.gradient

        model.kern.update_gradients_full(grad_dict['dL_dKnm'], X_slice, model.Z)
        kern_grad += model.kern.gradient

        #gradients w.r.t. Z
        model.Z.gradient += model.kern.gradients_X(grad_dict['dL_dKnm'].T, model.Z, X_slice)

        #gradients w.r.t. posterior parameters of Zp
        gamma_gradient += grad_dict['dL_dgamma']

        if het_noise:
            dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
        else:
            dL_dthetaL += grad_dict['dL_dthetaL']

    # Gather the gradients from multiple MPI nodes
    if mpi_comm is not None:
        from mpi4py import MPI
        if het_noise:
            raise "het_noise not implemented!"
        kern_grad_all = kern_grad.copy()
        Z_grad_all = model.Z.gradient.copy()
        gamma_grad_all = gamma_gradient.copy()
        mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
        mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
        mpi_comm.Allreduce([gamma_gradient, MPI.DOUBLE], [gamma_grad_all, MPI.DOUBLE])
        kern_grad = kern_grad_all
        model.Z.gradient = Z_grad_all
        gamma_gradient = gamma_grad_all

    #gradients w.r.t. kernel
    model.kern.update_gradients_full(dL_dKmm, model.Z, None)
    model.kern.gradient += kern_grad

    #gradients w.r.t. Z
    model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)

    #gradient w.r.t. gamma
    model.Zp.gamma.gradient = gamma_gradient

    # Update Log-likelihood
    KL_div = model.variational_prior.KL_divergence(model.Zp)
    # update for the KL divergence
    model.variational_prior.update_gradients_KL(model.Zp)

    model._log_marginal_likelihood += KL_div

    # dL_dthetaL
    model.likelihood.update_gradients(dL_dthetaL)


[docs]class IBPPosterior(Parameterized):
    '''
    The IBP distribution for variational approximations.
    '''
    def __init__(self, binary_prob, tau=None, name='Sensitivity space', *a, **kw):
        """
        binary_prob : the probability of including a latent function over an output.
        """
        super(IBPPosterior, self).__init__(name=name, *a, **kw)
        self.gamma = Param("binary_prob", binary_prob, Logistic(1e-10, 1. - 1e-10))
        self.link_parameter(self.gamma)
        if tau is not None:
            assert tau.size == 2*self.gamma_.shape[1]
            self.tau = Param("tau", tau, Logexp())
        else:
            self.tau = Param("tau", np.ones((2, self.gamma.shape[1])), Logexp())
        self.link_parameter(self.tau)

[docs]    def set_gradients(self, grad):
        self.gamma.gradient, self.tau.gradient = grad

    def __getitem__(self, s):
        pass
    #     if isinstance(s, (int, slice, tuple, list, np.ndarray)):
    #         import copy
    #         n = self.__new__(self.__class__, self.name)
    #         dc = self.__dict__.copy()
    #         dc['binary_prob'] = self.binary_prob[s]
    #         dc['tau'] = self.tau
    #         dc['parameters'] = copy.copy(self.parameters)
    #         n.__dict__.update(dc)
    #         n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
    #         n.parameters[dc['tau']._parent_index_] = dc['tau']
    #         n._gradient_array_ = None
    #         oversize = self.size - self.gamma.size - self.tau.size
    #         n.size = n.gamma.size + n.tau.size + oversize
    #         return n
    #     else:
    #         return super(IBPPosterior, self).__getitem__(s)

[docs]class IBPPrior(VariationalPrior):
    def __init__(self, rank, alpha=2., name='IBPPrior', **kw):
        super(IBPPrior, self).__init__(name=name, **kw)
        from paramz.transformations import __fixed__  
        self.rank = rank
        self.alpha = Param('alpha', alpha, __fixed__)
        self.link_parameter(self.alpha)

[docs]    def KL_divergence(self, variational_posterior):
        from scipy.special import gamma, psi
        
        eta, tau = variational_posterior.gamma.values, variational_posterior.tau.values
        
        sum_eta = np.sum(eta, axis=0) #sum_d gamma(d,q)
        D_seta = eta.shape[0] - sum_eta
        ad = self.alpha/eta.shape[1]
        psitau1 = psi(tau[0, :])
        psitau2 = psi(tau[1, :])
        sumtau = np.sum(tau, axis=0)
        psitau = psi(sumtau)
        # E[log p(z)]
        part1 = np.sum(sum_eta*psitau1 + D_seta*psitau2 - eta.shape[0]*psitau)
            
        # E[log p(pi)]
        part1 += (ad - 1.)*np.sum(psitau1 - psitau) + eta.shape[1]*np.log(ad)
        
        #H(z)
        part2 = np.sum(-(1.-eta)*np.log(1.-eta) - eta*np.log(eta))
        #H(pi)
        part2 += np.sum(np.log(gamma(tau[0, :])*gamma(tau[1, :])/gamma(sumtau))-(tau[0, :]-1.)*psitau1-(tau[1, :]-1.)*psitau2\
                 + (sumtau-2.)*psitau)
        
        return part1+part2

[docs]    def update_gradients_KL(self, variational_posterior):
        eta, tau = variational_posterior.gamma.values, variational_posterior.tau.values

        from scipy.special import psi, polygamma
        dgamma = np.log(1. - eta) - np.log(eta) + psi(tau[0, :]) - psi(tau[1, :])
        variational_posterior.gamma.gradient += dgamma
        ad = self.alpha/self.rank
        sumeta = np.sum(eta, axis=0)
        sumtau = np.sum(tau, axis=0)
        common = (-eta.shape[0] - (ad - 1.) + (sumtau - 2.))*polygamma(1, sumtau)
        variational_posterior.tau.gradient[0, :] = (sumeta + ad - tau[0, :])*polygamma(1, tau[0, :]) + common
        variational_posterior.tau.gradient[1, :] = ((eta.shape[0] - sumeta) - (tau[1, :] - 1.))*polygamma(1, tau[1, :])\
                                                   + common


[docs]class IBPLFM(SparseGP_MPI):
    """
    Indian Buffet Process for Latent Force Models

    :param Y: observed data (np.ndarray) or GPy.likelihood
    :type Y: np.ndarray| GPy.likelihood instance
    :param X: input data (np.ndarray) [X:values, X:index], index refers to the number of the output
    :type X: np.ndarray
    :param input_dim: latent dimensionality
    :type input_dim: int
    : param rank: number of latent functions

    """
    def __init__(self, X, Y, input_dim=2, output_dim=1, rank=1, Gamma=None, num_inducing=10,
                 Z=None, kernel=None, inference_method=None, likelihood=None, name='IBP for LFM', alpha=2., beta=2., connM=None, tau=None, mpi_comm=None, normalizer=False, variational_prior=None,**kwargs):

        if kernel is None:
            kernel = kern.EQ_ODE2(input_dim, output_dim, rank)
        
        if Gamma is None:
            gamma = np.empty((output_dim, rank)) # The posterior probabilities of the binary variable in the variational approximation
            gamma[:] = 0.5 + 0.1 * np.random.randn(output_dim, rank)
            gamma[gamma>1.-1e-9] = 1.-1e-9
            gamma[gamma<1e-9] = 1e-9
        else:
            gamma = Gamma.copy()
        
        #TODO: create a vector of inducing points
        if Z is None:
            Z = np.random.permutation(X.copy())[:num_inducing]
        assert Z.shape[1] == X.shape[1]
        
        if likelihood is None:
            likelihood = Gaussian()

        if inference_method is None:
            inference_method = VarDTC_minibatch_IBPLFM(mpi_comm=mpi_comm)
        
        #Definition of variational terms
        self.variational_prior = IBPPrior(rank=rank, alpha=alpha) if variational_prior is None else variational_prior
        self.Zp = IBPPosterior(gamma, tau=tau)

        super(IBPLFM, self).__init__(X, Y, Z, kernel, likelihood, variational_prior=self.variational_prior, inference_method=inference_method, name=name, mpi_comm=mpi_comm, normalizer=normalizer, **kwargs)
        self.link_parameter(self.Zp, index=0)
        
[docs]    def set_Zp_gradients(self, Zp, Zp_grad):
        """Set the gradients of the posterior distribution of Zp in its specific form."""
        Zp.gamma.gradient = Zp_grad
    
[docs]    def get_Zp_gradients(self, Zp):
        """Get the gradients of the posterior distribution of Zp in its specific form."""
        return Zp.gamma.gradient

    def _propogate_Zp_val(self):
        pass

[docs]    def parameters_changed(self):
        #super(IBPLFM,self).parameters_changed()
        if isinstance(self.inference_method, VarDTC_minibatch_IBPLFM):
            update_gradients(self,  mpi_comm=self.mpi_comm)
            return

        # Add the KL divergence term
        self._log_marginal_likelihood += self.variational_prior.KL_divergence(self.Zp)
        #TODO Change the following according to this variational distribution
        #self.Zp.gamma.gradient = self.

        # update for the KL divergence
        self.variational_prior.update_gradients_KL(self.Zp)