This lecture is about the overlap between gradient descent and Markov chain Monte Carlo. It consists of five sections, each of which is largely based on the following material:

  • Gradient flows and Rethinking SGD noise
  • Bayesian Learning via Stochastic Gradient Langevin Dynamics
  • A Complete Recipe for Stochastic Gradient MCMC
  • Bridging the Gap between SG-MCMC and Stochastic Optimization

This tutorial additionally refers to

  • Stochastic quasi-Newton Langevin Monte Carlo paper
  • A tutorial introduction to Monte Carlo methods, Markov chain Monte Carlo and particle filtering
  • MCMC using Hamiltonian dynamics
  • MCMC demo
  • hamiltorch library

Each section contains an interactive demo or code snippet. We will have short breaks after each section. You are encouraged to play around with these during breaks.

Let's get started by importing the necessary libraries. You need to install hamiltorch in addition to the standard libraries.

In [1]:
%load_ext autoreload
%autoreload 2

import importlib, pickle, os, sys, copy, numpy as np

import matplotlib.pyplot as plt
from matplotlib import animation
from IPython.display import HTML
os.environ['KMP_DUPLICATE_LIB_OK']='True'
# %matplotlib notebook
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": ["Palatino"],
})

import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import hamiltorch

from anim_utils import plot_opt_animation, plot_mcmc_animation, plot_mcmc_surface

1. Ordinary Differential Equations (ODEs) and Gradient Descent (GD)¶

1.1. Ordinary Differential Equations (ODEs)¶

Ordinary differential equations relate an independent variable, functions of that variable, and derivatives of those functions. Formally,

\begin{equation} \dot{x}(t) = \frac{dx(t)}{dt} = \lim_{\Delta t \rightarrow 0} \frac{ x(t + \Delta t) - x(t)}{\Delta t} = f(x(t),t), \end{equation}

where

  • $t$ denotes time (or any other independent variable)
  • $x(t) \in \mathcal{X} \subseteq \mathbb{R}^d$ is the state vector at time $t$ (thus the dependent variable)
  • $\dot{x}(t) \in \dot{\mathcal{X}} \subseteq \mathbb{R}^d$ is the first-order time derivative of $x(t)$
  • $f : \mathcal{X} \times \mathbb{R}_+ \rightarrow \dot{\mathcal{X}}$ is the vector-valued, continuous differential function describing the system's evolution over time, with $\mathbb{R}_+$ denoting the non-negative real numbers.

We often refer to $f$ as the vector field or the right-hand side. Informally speaking, $f$ tells us "how much the state $x(t)$ would change with an infinitesimal change in $t$".

An "ODE state solution" $x(t)$ at time $t\in \mathbb{R}_+$ is given by \begin{equation} x(t) = x_0 + \int_0^t f(x_\tau)~d\tau, \end{equation} where $x_0$ denotes the initial value and $\tau \in \mathbb{R}_+$ is an auxiliary time variable. Given an initial value $x_0$ and a set of time points $\{t_0,t_1,\ldots,t_N\}$, we are often interested in state solutions $x_{0:N}\equiv\{x(t_0),x(t_1),\ldots,x(t_N)\}$

Numerical Integration¶

The above integral has a closed form only for rather simple differential functions (recall the integration rules from high school). This is why we almost always resort to numerical simulation.

More formally, we consider a first-order expansion of $x$ around $t$: \begin{equation} x(t+\delta) = x(t) + \delta f(x(t)) + o(\delta). \end{equation}

Assuming we can live with the (higher-order) error term, we obtain the following recursive (and discrete) algorithm, known as the forward Euler method, for computing state solutions: \begin{equation} x(t+\delta) = x(t) + \delta f(x(t)). \end{equation}
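
Below is a minimal sketch of this recursion applied to the ODE $\dot{x}(t) = -x(t)$, whose exact solution $x(t) = x_0 e^{-t}$ lets us check the discretization error; the step size and horizon are illustrative choices:

In [ ]:
import numpy as np

def euler_step(x, f, dt):
    # one forward-Euler step: x(t + dt) is approximated by x(t) + dt * f(x(t))
    return x + dt * f(x)

f = lambda x: -x                  # vector field of dx/dt = -x
x, dt, n_steps = 1.0, 0.1, 20     # initial value, step size, number of steps
for _ in range(n_steps):
    x = euler_step(x, f, dt)
print(x, np.exp(-dt * n_steps))   # Euler estimate vs. the exact solution at t = 2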

1.2. Gradient Descent (GD)¶

Gradient descent (GD) is by far the most commonly used optimization technique in the deep learning era. Here, we restrict ourselves to maximum a-posteriori (MAP) estimation problems. Let's first set up the notation:

  • $\theta$ denotes a parameter vector
  • $p(\theta)$ is our prior belief on the parameters
  • $p(x\mid\theta)$ denotes the likelihood
  • $X \equiv \{ x_i \}_{i=1}^N$ is an iid dataset

The posterior distribution is proportional to the product of the likelihood and the prior: $$ p(\theta \mid X) \propto \prod_i p(x_i \mid \theta)\, p(\theta).$$

In practice, the MAP estimation task translates to minimizing the negative log posterior density: $$ \min_\theta~~ g(\theta) \equiv -\log p(\theta \mid X) = -\log p(\theta) - \sum_i \log p(x_i \mid \theta)$$

Gradient descent (GD) aims to minimize a function by taking small steps in the direction of steepest descent, which is given by the negative gradient. More concretely, it iterates the following update equation:

\begin{equation} \theta_{n+1} = \theta_{n} - \delta \nabla g(\theta_{n}), \end{equation}

where $n$ denotes the current step, $\theta_{n}$ is the current iterate, and $\delta>0$ is the learning rate.

From GD to ODEs¶

Assuming that each GD update step takes "$\delta$ time", i.e. $\theta_{n} \equiv \theta(\delta n)$, and defining $t \equiv \delta n$, we obtain the following:

\begin{equation} \theta(t+\delta) = \theta(t) - \delta \nabla g(\theta(t)). \end{equation}

Comparing this with the last equation in the "Numerical Integration" section, we conclude that

GD is a forward-Euler discretization of an ODE whose vector field is the negative gradient of the optimized function.

Below we contrast gradient descent and ODE flow.

In [34]:
# the loss function
m    = torch.tensor([0.0,0.0])
cov  = torch.tensor([[1.0,0.81],[0.81,1.0]]) # covariance matrix
dist = torch.distributions.MultivariateNormal(m,cov)

def forward_simulate(f, x0, ts):
    X  = [x0]
    ode_steps = len(ts)-1
    for i in range(ode_steps):
        h  = ts[i+1]-ts[i]
        t  = ts[i]
        x  = X[i]
        x_next = x + h*f(t,x)
        X.append(x_next)
    X = torch.stack(X) # T,d
    return X

def loss_fnc(x):
    return -dist.log_prob(x)

# GD and ODE flow simulations
def solve_systems(gd_lr=0.33, num_iter=30, num_euler=50):
    x0_  = torch.tensor([0.,5.]).to(torch.float32)
    xpar = torch.nn.Parameter(x0_.clone()) # 1,P

    # gradient descent solution
    opt = torch.optim.SGD([xpar],lr=gd_lr)
    gd_loss_trace = []
    for i in range(num_iter):
        gd_loss_trace.append(xpar.detach().clone())
        opt.zero_grad()
        loss = loss_fnc(xpar)
        loss.backward()
        opt.step()
    gd_loss_trace = torch.stack(gd_loss_trace).numpy()

    # ode solution
    dt = gd_lr / num_euler
    ts = torch.arange(num_euler*num_iter) * dt
    x0 = torch.nn.Parameter(x0_.clone())
    def odef(t,x):
        f    = -loss_fnc(x)
        grad = torch.autograd.grad(f,x)[0]
        return grad
    ode_loss_trace = forward_simulate(odef, x0, ts) # T,P
    ode_loss_trace = ode_loss_trace.detach().numpy()
    
    return gd_loss_trace,ode_loss_trace


gd_loss_trace, ode_loss_trace = solve_systems()

anim = plot_opt_animation(loss_fnc, gd_loss_trace, ode_loss_trace)
HTML(anim.to_jshtml())
Out[34]:
[Output: animation contrasting the GD iterates with the ODE flow]

2. Are Stochastic Gradient Descent (SGD) and Stochastic Differential Equations (SDEs) Likewise Related?¶

So far, we have seen that GD is essentially a discretized ODE. Here, we naturally ask whether SDEs and SGD are related in the same way. Before discussing this in detail, we describe stochastic gradient descent (SGD), which is nothing other than GD in which the likelihood is evaluated on a minibatch.

Say $\Omega_t \subset \{1,\ldots,N\}$ is a random data subsample at iteration $t$. The optimization task at this iteration becomes

$$ \min_{\theta_t}~~ -\log p(\theta_t) - \frac{N}{|\Omega_t|} \sum_{n \in \Omega_t} \log p(x_n \mid \theta_t).$$

It simply implies the following parameter update equation:

$$ \theta_{t+1} = \theta_t + \frac{\epsilon_t}{2} \left( \nabla \log p(\theta_t) + \frac{N}{|\Omega_t|} \sum_{n \in \Omega_t} \nabla \log p(x_n \mid \theta_t) \right), $$

where $\epsilon_t$ is the stepsize at iteration $t$ which satisfies

$$ \sum_{t=1}^\infty \epsilon_t = \infty \qquad\qquad \sum_{t=1}^\infty \epsilon_t^2 < \infty $$

ensuring that (i) the parameters will reach the high-probability regions no matter how far away they were initialized, and (ii) the parameters will converge to the mode instead of just bouncing around it.
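
A decay schedule commonly used to satisfy both conditions is polynomial, $\epsilon_t = a\,(b+t)^{-\gamma}$ with $\gamma \in (0.5, 1]$; the constants below are illustrative choices rather than values from the referenced papers:

In [ ]:
# a minimal sketch of a step-size schedule satisfying the two conditions above:
# sum_t eps_t diverges while sum_t eps_t^2 stays finite for 0.5 < gamma <= 1
a, b, gamma = 1e-2, 10.0, 0.55

def step_size(t):
    return a * (b + t) ** (-gamma)

print([step_size(t) for t in (1, 100, 10000)])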

2.1. A Naive Attempt¶

So okay, obviously the above update equation heavily resembles gradient descent. Can we then show that SDEs and SGD are related? Since we know that SDEs involve a noise process, what if we simply add white noise to the above equation? In particular, let's consider the following update equation:

$$ \theta_{t+1} = \theta_t + \frac{\epsilon_t}{2} \left( \nabla \log p(\theta_t) + \frac{N}{|\Omega_t|} \sum_{n \in \Omega_t} \nabla \log p(x_n \mid \theta_t) \right) + \epsilon_t \xi_t, \qquad \xi_t \sim \mathcal{N}(\xi_t; 0, I). $$

Technically, this equation reduces to standard GD in the limit $\epsilon_t \rightarrow 0$. Informally speaking, this is because the variance of the noise term is proportional to $\epsilon_t^2$ whereas the gradient update term scales with $\epsilon_t$. You can read the end of the blog post (https://francisbach.com/gradient-flows/) for a more rigorous argument.

2.2. Second Attempt¶

Okay, obviously the noise variance decayed too quickly with the step size. What if we rescale the step size of the noise process? How about the following:

$$ \theta_{t+1} = \theta_t + \frac{\epsilon_t}{2} \left( \nabla \log p(\theta_t) + \frac{N}{|\Omega_t|} \sum_{n \in \Omega_t} \nabla \log p(x_n \mid \theta_t) \right) + \sqrt{\epsilon_t} \xi_t, \qquad \xi_t \sim \mathcal{N}(\xi_t; 0,I). $$

Inspecting it carefully, this looks very much like the Euler-Maruyama discretization of an SDE. To simplify the analysis, let's consider a fixed step size $\epsilon_t \equiv \epsilon$ and rewrite the above equation:

$$ \theta_{t+1} = \theta_t - \frac{\epsilon}{2} \nabla \left( - \log p(\theta \mid X ; \Omega)\right) + \xi_t, \qquad \xi_t \sim \mathcal{N}(\xi_t; 0, {\epsilon}I). $$

Above equation is nothing but a discretization of the Langevin equation

$$ d\theta = - \frac{1}{2} \nabla U(\theta)dt + dB(t), \qquad U(\theta) \equiv - \log p(\theta \mid X ; \Omega)$$

where $B(t)$ is the standard Brownian motion. As covered previously, this SDE has the following invariant (stationary) distribution:

$$ p(\theta) \propto \exp(- U(\theta)) = p(\theta \mid X ; \Omega) $$

So, our second attempt indeed brought us to an SDE with the desired invariant measure. That means simulating this process allows us to sample from the posterior distribution. In the literature, this method is called Stochastic Gradient Langevin Dynamics (SGLD), introduced by Welling and Teh.

Note: The resulting continuous-time process has a discontinuous drift term due to minibatches. This complicates the analysis, which we will examine in Section 4.

MH correction¶

One detail that we have omitted is that discretizing the continuous-time process incurs error. As a result, the resulting sequence of samples $\{\theta_t \}_{t=1}^\infty$ would not exactly follow the target distribution $\propto \exp(- U(\theta))$. This can be addressed by performing a so-called Metropolis-Hastings (MH) correction, which requires computing the posterior density on the entire dataset.

Similar to the SGD regime, the authors propose to decrease the step size $\epsilon \rightarrow 0$ as $t \rightarrow \infty$. Consequently, the discretization error diminishes, the MH rejection probability approaches 0, and we may simply skip the correction step.
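
For completeness, here is a rough sketch of what the omitted correction looks like for a Langevin proposal (i.e., a Metropolis-adjusted Langevin step); `log_post` is a placeholder for the full-data log posterior, which is exactly the expensive quantity that the decreasing-step-size argument lets us avoid evaluating:

In [ ]:
import torch

def mala_step(theta, log_post, eps):
    # one Metropolis-adjusted Langevin step; log_post must be the full-data log posterior
    def grad(th):
        th = th.detach().requires_grad_(True)
        return torch.autograd.grad(log_post(th), th)[0]
    def log_q(to, frm):
        # log density (up to a constant) of the proposal N(to; frm + (eps/2) * grad(frm), eps * I)
        mean = frm + 0.5 * eps * grad(frm)
        return -((to - mean) ** 2).sum() / (2 * eps)
    prop = theta + 0.5 * eps * grad(theta) + eps ** 0.5 * torch.randn_like(theta)
    log_alpha = log_post(prop) - log_post(theta) + log_q(theta, prop) - log_q(prop, theta)
    return prop if torch.log(torch.rand(())) < log_alpha else theta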

2.3. Simple Gaussian Experiment¶

Below is a code snippet that implements the above idea. The optimization problem is based on Stochastic Quasi-Newton Langevin Monte Carlo paper. Here is the generative process:

\begin{align} \theta &\sim \mathcal{N}(0,S) \\ A_i &= a + \mathbb{U}[0,b], \quad \forall i \in [1,N] \\ x_i &\sim \mathcal{N}(A_i^\top \theta, \sigma), \quad \forall i \in [1,N] \end{align}

where $\mathbb{U}$ is the uniform distribution and we set

\begin{align} N &= 500 \\ S &= I \\ a &= [1,1]^\top \\ b &= 0.1 \\ \sigma &= 0.1 \end{align}

We assume known $A_i$ and $x_i$, and infer $\theta$. You can play around with it by changing, e.g., (the skewness of) the loss function, step size, minibatch size, turning the algorithm into optimization, etc.

In [4]:
# dataset generation
N = 500  # num data points
d = 2    # data dim
sig = 0.1 # emission parameter
S = torch.eye(d) # d,d

A = torch.ones(N, d)/d + 0.1*torch.randn(N,d) # N,d
L = torch.linalg.cholesky(S) # d,d
theta = L @ torch.randn(d,1) # d,1
x     = A @ theta + np.sqrt(sig)*torch.randn(N,1)

Next, we optimize and visualize:

In [5]:
# optimization variables
NSTEPS = 15000
theta0 = torch.Tensor([[2.,4.]]).T # initial parameters

prior_dist = torch.distributions.Normal(torch.zeros(d),torch.ones(d)) 
th = torch.nn.Parameter(theta0) # initial parameters

# stochastic (minibatch) estimate of the unnormalized log posterior
def map_estimate(th, frac=0.1):
    # get a minibatch 
    # th is [d,M]
    N_  = int(N*frac)
    idx = torch.randperm(N)[:N_]
    x_,A_ = x[idx],A[idx] # [N_,1], [N_,d]
    # log-posterior computation on the current minibatch
    xhat  = A_ @ th # [N_,M]
    lhood = torch.distributions.Normal(xhat,sig).log_prob(x_) # [N_,M]
    lhood = lhood.sum(0) # [M]
    prior = prior_dist.log_prob(th.T) # [M,d]
    prior = prior.sum(1) # [M]
    return prior + 1/frac*lhood

# optimization loop
thetas,losses = [],[]
eta = 2.5e-5 # step size
for i in range(NSTEPS):
    thetas.append(th.detach().numpy().copy())
    MAP   = map_estimate(th, 0.1)
    grad  = torch.autograd.grad(MAP,th)[0]
    noise = torch.randn(d,1) * np.sqrt(eta)
    th.data = th.data + eta*grad/2 + noise
    losses.append(MAP.item())
thetas = np.array(thetas).squeeze(-1) # NSTEPS,d


###################################################
# visualizing the loss surface
Ngr = 100
w   = 1
xnp = np.linspace(theta[0]-w,theta[0]+w, Ngr)
ynp = np.linspace(theta[1]-w,theta[1]+w, Ngr)
X,Y = np.meshgrid(xnp, ynp)
XY  = torch.tensor(np.array([X.T.flatten(), Y.T.flatten()])).to(torch.float32)
map_XY = map_estimate(XY).reshape(Ngr,Ngr)
plt.figure(1,(8,6))
plt.contourf(X, Y, -map_XY, levels=10)
plt.colorbar()

# visualizing samples
plt.plot(thetas[:,0], thetas[:,1], color='r', label='samples')
plt.plot(theta[0,0].item(),theta[1,0].item(), 'y*', markersize=20, label='true th')
plt.xlim([theta[0]-w,theta[0]+w])
plt.ylim([theta[1]-w,theta[1]+w])
plt.title('Parameter space', fontsize=25)
plt.xlabel('dimension 1', fontsize=20)
plt.ylabel('dimension 2', fontsize=20)
plt.legend()
Out[5]:
<matplotlib.legend.Legend at 0x7f79f0c6cc70>
[Output: contour plot of the log posterior with the SGLD sample trace]

2.4. Completing the Picture with a Detour: SGD without Diffusion Process¶

So far, we have set aside an important aspect of our stochastic optimization setup: we use randomly sampled minibatches for optimization, which inherently brings stochasticity into the system. The Rethinking SGD noise blog post gives a very detailed analysis of SGD dynamics in over/under-parameterized NN regimes. Here, we repeat the main finding.

Given a dataset $X \equiv \{ x_i \}_{i=1}^N$ and loss function $h(x;\theta)^2$, we define the "batch loss" by \begin{equation} R(\theta) \equiv \frac{1}{N} \sum_i h(x_i;\theta)^2 \end{equation}

Note that this formulation could easily involve class labels as well, which we omit for ease of notation. Given a minibatch $\Omega_n \subset \{1,\ldots,N\}$ at iteration $n$, the SGD update equation is

\begin{align} \theta_{n+1} &= \theta_{n} - \delta \left( \frac{1}{|\Omega_n|} \sum_{i \in \Omega_n} \nabla h(x_i;\theta_n)^2\right). \end{align}

Let's write this update equation using the batch gradient as follows:

\begin{align} \theta_{n+1} &= \theta_{n} - \delta \nabla R(\theta_n) + \delta a_n(\theta_n), \qquad a_n(\theta_n) = \nabla R(\theta_n) - \frac{1}{|\Omega_n|} \sum_{i \in \Omega_n} \nabla h(x_i;\theta_n)^2. \end{align}

Note that the above update equation includes a deterministic term $\nabla R(\theta_n)$ as well as a zero-mean stochastic term $a_n(\theta_n)$. It is shown that, in the limit of a small learning rate $\delta$, the above SGD dynamics converges to the solution of the following SDE

\begin{equation} d\theta_t = b(\theta_t,t)\,dt + \sigma(\theta_t,t)\,dB(t) \end{equation}

where

  • $B(t)$ is the standard Brownian motion
  • Drift matches the gradient: $b(\theta_t,t) = - \nabla R(\theta_t)$
  • Diffusion matches the gradient noise: $\sigma(\theta_t,t) \sigma(\theta_t,t)^T = \delta \mathbb{E} [a_n(\theta_n) a_n(\theta_n)^T ] $

Again, please see the blog post for more details.
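
To make the diffusion term concrete, one can estimate the gradient-noise covariance $\mathbb{E}[a_n(\theta_n) a_n(\theta_n)^\top]$ at a fixed $\theta$ by repeatedly drawing minibatches. A rough sketch, where `per_sample_grads` is an illustrative placeholder for the $N \times d$ matrix whose rows are the per-sample gradients $\nabla h(x_i;\theta)^2$:

In [ ]:
import torch

def gradient_noise_covariance(per_sample_grads, batch_size, num_draws=1000):
    # per_sample_grads: [N, d] per-sample gradients evaluated at a fixed theta
    N, d = per_sample_grads.shape
    full_grad = per_sample_grads.mean(0)               # the batch gradient, nabla R(theta)
    noise = []
    for _ in range(num_draws):
        idx = torch.randperm(N)[:batch_size]
        minibatch_grad = per_sample_grads[idx].mean(0) # one minibatch estimate of nabla R(theta)
        noise.append(full_grad - minibatch_grad)       # one realization of a_n(theta)
    A = torch.stack(noise)                             # [num_draws, d]
    return A.T @ A / num_draws                         # empirical estimate of E[a_n a_n^T]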

3. A Quick Recap of Markov Chain Monte Carlo¶

In Markov Chain Monte Carlo methods, we sample from a Markov chain with transition density (or kernel) $K(\theta_n \mid \theta_{n-1})$ so that the sampling process $\{ \theta_n \}_{n=1}^\infty$ converges to a distribution of interest $\pi(\theta)$. The transition rule is more explicitly given by

\begin{equation} p_n(\theta^\text{next}) ~=~ \int K(\theta^\text{next} \mid \theta) p_{n-1}(\theta) d\theta \end{equation}

Obviously, the process would not converge to $\pi(\theta)$ for all kernel choices. In the following, we describe two elegant ways of designing a Markov chain for a given target distribution.

3.1. Metropolis-Hastings Algorithm¶

For continuous Markov processes, an extremely simple procedure was defined by Metropolis and later extended by Hastings. The algorithm simply generates a candidate next state using a proposal density $q(\theta^\text{next} \mid \theta)$ that depends on the current sample $\theta$. For instance, "random-walk" MH is defined as follows

$$q(\theta^\text{next} \mid \theta) \equiv \mathcal{N}( \theta^\text{next}; \theta, \sigma I).$$

The candidate is accepted with probability

\begin{equation} \alpha (\theta \rightarrow \theta^\text{next}) = \min \{ 1, \frac{q(\theta\mid \theta^\text{next}) \pi(\theta^\text{next})} {q(\theta^\text{next} \mid \theta)\pi(\theta)} \} \end{equation}

If the sample is not accepted, the chain stays put at the old location $\theta$. The algorithm below summarizes our description so far; it is kindly taken from these lecture notes.

[Figure: pseudocode of the Metropolis-Hastings algorithm, from the lecture notes]
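
A minimal random-walk MH sketch (working with unnormalized log densities for numerical stability; `log_pi` is a placeholder for the log of the target density):

In [ ]:
import torch

def random_walk_mh(log_pi, theta0, num_samples=1000, sigma=0.5):
    # random-walk Metropolis-Hastings; the Gaussian proposal is symmetric, so the q terms cancel
    samples, theta = [], theta0.clone()
    for _ in range(num_samples):
        prop = theta + sigma * torch.randn_like(theta)
        log_alpha = log_pi(prop) - log_pi(theta)      # log acceptance ratio
        if torch.log(torch.rand(())) < log_alpha:
            theta = prop                              # accept the candidate
        samples.append(theta.clone())                 # on rejection, the old state is repeated
    return torch.stack(samples)

For instance, random_walk_mh(dist.log_prob, torch.zeros(2)) would target the correlated Gaussian defined in Section 1.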

3.1.1. Ergodicity¶

Valid MCMC samplers converge to a unique invariant distribution regardless of the initial state. This only happens if the chain is ergodic, i.e., if it eventually visits the entire state space. Ergodicity requires the following:

  • irreducibility, meaning that every state can be reached from every other state (i.e., the state transition diagram should be strongly connected)
  • aperiodicity, meaning that each state can be visited at any time n larger than some fixed number.

Please see the lecture notes for more details.

3.1.2. Detailed balance¶

The stationary distribution of the Markov chain should satisfy

\begin{equation} \pi(\theta^\text{next}) = \int K(\theta^\text{next} \mid \theta) \pi(\theta) d\theta. \end{equation}

MH transitions satisfy a stronger condition known as detailed balance

\begin{equation} K(\theta \mid \theta^\text{next} ) \pi(\theta^\text{next}) = K(\theta^\text{next} \mid \theta) \pi(\theta). \end{equation}

We can prove that the MH kernel indeed targets the above stationary distribution by showing that (i) detailed balance implies the above stationarity condition, and (ii) the MH acceptance ratio ensures the detailed balance condition.
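
For step (i), integrating both sides of the detailed balance condition over $\theta$ immediately gives stationarity:

\begin{equation} \int K(\theta^\text{next} \mid \theta)\, \pi(\theta)\, d\theta = \int K(\theta \mid \theta^\text{next})\, \pi(\theta^\text{next})\, d\theta = \pi(\theta^\text{next}) \underbrace{\int K(\theta \mid \theta^\text{next})\, d\theta}_{=\,1} = \pi(\theta^\text{next}). \end{equation}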

We note that there are Markov processes with stationary distributions that do not have detailed balance. Please, again, refer to the lecture notes for a derivation of the detailed balance condition.

3.1.3. Random-walk MH Demo¶

Let's check this demo to see how "random-walk" MH proceeds: https://chi-feng.github.io/mcmc-demo/app.html?algorithm=RandomWalkMH&target=banana

3.2. Hamiltonian (Hybrid) Monte Carlo (HMC)¶

The main idea in HMC is to use Hamiltonian dynamics to produce distant proposals for the Metropolis-Hastings algorithm. By using the physical properties of the underlying system, we hope to generate transitions that traverse large distances in the state space and therefore avoid the diffusive behaviour of random-walk proposals.

3.2.1. Hamiltonian dynamics¶

Hamiltonian dynamics is a dynamical system governed by Hamilton's equations, which is commonly used to describe physical systems such as a planetary system or an electron in an electromagnetic field. As a simple example, we consider a ball that slides on a surface of varying height. The state of this system consists of the position of the ball, given by a variable $\theta \in \mathbb{R}^2$, and the momentum of the ball, given by $p \in \mathbb{R}^2$. The potential energy $U(\theta)$ of the ball is proportional to the height of the surface at its current position, and its kinetic energy is $K(p)=\frac{|p|^2}{2m}$, where $m$ is the mass of the ball. On a flat surface, the ball moves at a constant velocity, equal to $\frac{p}{m}$. If it encounters a rising slope, the ball's momentum allows it to continue, with its kinetic energy decreasing and its potential energy increasing.

3.2.2. Hamiltonian Monte Carlo¶

For Hamiltonian Monte Carlo, we usually use Hamiltonian functions written as follows:

\begin{align} H(\theta, p) = K(p) + U(\theta) \end{align}

Here, $K(p): \mathbb{R}^d \mapsto \mathbb{R}$ is called the kinetic energy, and is usually defined as

\begin{align} K(p) \equiv \frac{p^T M^{-1} p}{2} = -\log \mathcal{N}(p;0,M) + c, \end{align}

where $M$ is a symmetric, positive-definite “mass matrix”, which is typically diagonal, and is often a scalar multiple of the identity matrix. This form for $K(p)$ corresponds to minus the log probability density (plus a constant) of the zero-mean Gaussian distribution with covariance matrix $M$.

$U(\theta): \mathbb{R}^d \mapsto \mathbb{R}$ is called the potential energy, and will be defined to be minus the log probability density of the distribution for $\theta$ that we wish to sample from:

\begin{align} U(\theta) \equiv -\log p(\theta \mid X). \end{align}

The partial derivatives of the Hamiltonian determine how $\theta$ and $p$ change over time according to Hamilton’s equations:

\begin{align} \frac{d\theta_i}{dt} = \frac{\partial H}{\partial p_i} = [M^{-1}p]_i, \qquad \frac{dp_i}{dt} = -\frac{\partial H}{\partial \theta_i} = -\frac{\partial U}{\partial \theta_i} \end{align}

3.2.3. Practical Algorithm¶

  • Step-1: New values for the momentum variables are randomly drawn from their Gaussian distribution

$$p \sim \mathcal{N}(p ; 0,M)$$

  • Step-2: Hamiltonian dynamics are simulated to generate an MH proposal. For the simulation, a reversible and volume-preserving numerical integrator (such as the leap-frog method; a minimal sketch follows these steps) is used. A proposed state $[\theta^\text{new},p^\text{new}]$ is accepted with the following probability:

$$ \min \left\{1, \exp\left(-H(\theta^\text{new},p^\text{new}) + H(\theta^\text{old},p^\text{old})\right)\right\} = \min \left\{1,\frac{p(\theta^\text{new} \mid X)\, \mathcal{N}(p^\text{new};0,M)} {p(\theta^\text{old} \mid X)\, \mathcal{N}(p^\text{old};0,M) }\right\} $$
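
Here is a minimal sketch of the leapfrog integrator used in Step-2, assuming an identity mass matrix $M=I$; `grad_U` is a placeholder for $\nabla U$:

In [ ]:
import torch

def leapfrog(theta, p, grad_U, eps, num_steps):
    # leapfrog integration of Hamiltonian dynamics with M = I (reversible and volume preserving)
    theta, p = theta.clone(), p.clone()
    p = p - 0.5 * eps * grad_U(theta)              # initial half step for the momentum
    for step in range(num_steps):
        theta = theta + eps * p                    # full step for the position (dtheta/dt = M^{-1} p = p)
        if step < num_steps - 1:
            p = p - eps * grad_U(theta)            # full step for the momentum
    p = p - 0.5 * eps * grad_U(theta)              # final half step for the momentum
    return theta, -p                               # negating the momentum makes the proposal reversible

The returned pair is the proposal $[\theta^\text{new},p^\text{new}]$, which is then accepted or rejected with the probability above.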

3.2.4. Notes¶

1. Hamiltonian dynamics are reversible, i.e., the mapping from a state $[\theta(t),p(t)]$ to another $[\theta(t+s),p(t+s)]$ is one-to-one and hence has an inverse. This is important for satisfying the detailed balance condition defined above, and it thereby shows that HMC updates leave the desired distribution invariant.

2. By construction, Hamiltonian dynamics have zero divergence, implying volume preservation. Thanks to this, we do not need to account for any change in volume in the acceptance probability for Metropolis updates. If we proposed new states using some arbitrary, non-Hamiltonian, dynamics, we would need to compute the determinant of the Jacobian matrix for the mapping the dynamics defines, which might well be infeasible.

3.3. Sampling an MLP via HMC¶

This example is taken from the hamiltorch library. In the following, we show how a simple MLP $f: \mathbb{R} \mapsto \mathbb{R}$ can be inferred using HMC. Our demonstration follows a simple 1D regression example. Below is the dataset:

In [6]:
N_tr = 6
N_val = 300

x_val = torch.linspace(-5,5,N_val).view(-1,1)
y_val = torch.sin(x_val).view(-1,1)

x_train = torch.linspace(-3.14,3.14,N_tr).view(-1,1)
y_train = torch.sin(x_train).view(-1,1) + torch.randn_like(x_train)*0.1

plt.figure(figsize=(6,3))
plt.plot(x_train.numpy(),y_train.numpy(),'.',markersize=20, label='x train')
plt.plot(x_val.numpy(),y_val.numpy(),'.',markersize=5, label='x test')

plt.legend(fontsize=10)
plt.show()

x_train = x_train.to(device)
y_train = y_train.to(device)

x_val = x_val.to(device)
y_val = y_val.to(device)
[Output: scatter plot of the training and test data]

Next, we implement our MLP:

In [7]:
class MLP(nn.Module):
    def __init__(self, nin=1, nout=1, H=50):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(nin, H)
        self.relu   = nn.ReLU()
        self.layer2 = nn.Linear(H, nout)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

Then we start sampling.

In [8]:
hamiltorch.set_random_seed(123)
mlp = MLP()
params_init = hamiltorch.util.flatten(mlp).to(device).clone()
step_size = 0.005
num_samples = 1000
num_steps_per_sample = 20
tau_out = 100.
params_hmc = hamiltorch.sample_model(mlp, x_train, y_train, model_loss='regression', params_init=params_init, \
                                     num_samples=num_samples, step_size=step_size, \
                                     num_steps_per_sample=num_steps_per_sample, tau_out=tau_out)

%time pred_list, log_prob_list = hamiltorch.predict_model(mlp, x=x_val, y=y_val, model_loss='regression', \
                                                          samples=params_hmc[:], tau_out=tau_out, \
                                                          tau_list=torch.ones(len(list(mlp.parameters()))))

print('\nExpected validation log probability: {:.2f}'.format(torch.stack(log_prob_list).mean()))
print('\nExpected MSE: {:.2f}'.format(((pred_list.mean(0) - y_val)**2).mean()))
Sampling (Sampler.HMC; Integrator.IMPLICIT)
Time spent  | Time remain.| Progress             | Samples   | Samples/sec
0d:00:00:13 | 0d:00:00:00 | #################### | 1000/1000 | 72.43       
Acceptance Rate 0.79
CPU times: user 257 ms, sys: 4.5 ms, total: 262 ms
Wall time: 267 ms

Expected validation log probability: -3207.61

Expected MSE: 0.09

Once we are done, we plot the sampled functions. Note that the first half of the chain is considered as "burn-in" and hence discarded.

In [9]:
burn = num_samples//2
pred_list_ = pred_list[burn:]

plt.figure(figsize=(10,5))
plt.plot(x_val.cpu().numpy(), pred_list_.cpu().numpy().squeeze().T, 'C0', alpha=0.051)
plt.plot(x_val.cpu().numpy(),pred_list_.mean(0).cpu().numpy().squeeze().T, 'C1', alpha=0.9, \
         linewidth=3, label='mean pred')
# plt.fill_between(x_val.cpu().numpy().squeeze(), 
#                 pred_list_.mean(0).cpu().numpy().squeeze().T - 2*pred_list_.std(0).cpu().numpy().squeeze().T,
#                 pred_list_.mean(0).cpu().numpy().squeeze().T + 2*pred_list_.std(0).cpu().numpy().squeeze().T,
#                 alpha=.9, color='C1'
# )
plt.plot(x_train.cpu().numpy(),y_train.cpu().numpy(),'.C3',markersize=30, label='x train')


plt.legend(fontsize=20)
plt.ylim([-5,5])
plt.show()
[Output: posterior function samples and their mean prediction]

Related Paper¶

If you would like to know more about Bayesian neural network posteriors via HMC, read this great paper. They note that producing a single sample with HMC on CIFAR datasets with ResNet-20-FRN architecture takes roughly one hour on their hardware.


3.4. Demo¶

Let's check this demo to see how HMC draws samples on the same "banana" distribution that we sampled from using random-walk MH: https://chi-feng.github.io/mcmc-demo/app.html?algorithm=HamiltonianMC&target=banana

4. A Complete Recipe for Stochastic Gradient MCMC¶

Naturally, any proposal distribution $q(x^\text{new} \mid x)$ implies a different MCMC algorithm. Choosing a suitable MCMC algorithm and tuning its hyper-parameters can be a tedious task. Furthermore, the target distribution $\pi(x)$ is typically the posterior density, whose computation involves a computationally demanding sum over the data points in large datasets. Consequently, we would like to build an MCMC method that can

  • be applied to any problem off-the-shelf
  • handle big data.

This is precisely what is done in the "Complete Recipe" paper. There, the goal is to translate the task of sampling from the posterior distribution into simulating a continuous dynamical system. This dynamical system defines a Markov transition kernel: more concretely, a differential equation maps the state $z(t)$ at time $t$ to a "next" state $z(t+h)$. One can then discuss the evolution of the distribution $p(z, t)$ under the dynamics (whether it converges or not, how fast it converges, etc.). The invariant distribution of the dynamics can be analyzed via the Fokker-Planck equation for stochastic dynamics and via the Liouville equation for deterministic dynamics.

4.1. Formal Framework¶

Now we turn to SDEs as in the previous section, this time more rigorously. Let's consider the following SDE

\begin{equation} dz = f(z)dt + \sqrt{2D(z)}dW(t) \end{equation}

where

  • $f(z): \mathbb{R}^d \mapsto \mathbb{R}^d$ denotes the deterministic drift.
  • $D(z)$ is a positive semidefinite diffusion matrix.
  • $W(t)$ is a $d$-dimensional Wiener process.
  • $z$ could be the parameter of interest $z=\theta$ as in SGLD, or it could be an augmented variable $z=(\theta,p)$ as in HMC.
  • similarly, Hamiltonian is either $H(z) = U(\theta)$ or $H(\theta, p) = K(p) + U(\theta)$.
  • we wish to sample from $p(\theta) \propto \exp(- U(\theta))$.

For this, we write $f(z)$ in terms of the target distribution:

\begin{equation} f(z) = -[D(z) + Q(z)] \nabla H(z) + \Gamma(z), \qquad \Gamma_i(z) = \sum_{j=1}^d \frac{\partial}{\partial z_j} (D_{ij}(z) + Q_{ij}(z)) \end{equation} where $Q(z)$ is a skew-symmetric matrix, i.e., $Q^\top = -Q$.

Notes¶

1. This holds for all positive semi-definite $D(z)$ and any skew-symmetric $Q(z)$. Specific choices of these matrices determine convergence speed.

2. The authors later show (in Theorem 2) that for any continuous Markov process with the desired stationary distribution $p(\theta) \propto \exp(- U(\theta))$, there exists an SDE as defined above.

4.2. How about Stochastic Gradients?¶

Discretizing the above SDE with a step size $\epsilon_t$ (an Euler-Maruyama step) gives

\begin{equation} z_{t+1} = z_t + \epsilon_t \left[ -(D(z_t) + Q(z_t)) \nabla H(z_t) + \Gamma(z_t) \right] + \mathcal{N}(0, 2\epsilon_t D(z_t)). \end{equation}

Similar to Section 2.4, let's write the stochastic gradient $\nabla \tilde{H}(z_t)$ as the sum of the deterministic batch gradient and a stochastic noise term:

\begin{equation} \nabla \tilde{H}(z_t) = \nabla H(z_t) + \mathcal{N}(0, V(z_t)) \end{equation}

Plugging this into the above discretization (i.e., replacing the full gradient with its minibatch estimate), we obtain

\begin{equation} z_{t+1} ~~=~~ z_t ~+~ \epsilon_t \underbrace{\left[ -(D(z_t) + Q(z_t)) \nabla H(z_t) + \Gamma(z_t) \right]}_{\text{exact drift}} ~+~ \epsilon_t \underbrace{[D(z_t) + Q(z_t)]\,\mathcal{N}(0, V(z_t))}_{\text{noise due to the stochastic gradient}} ~+~ \underbrace{\mathcal{N}(0, 2\epsilon_t D(z_t))}_{\text{diffusion}}. \end{equation}

Assuming we have an estimate of the variance of this stochastic-gradient noise,

$$ B_t \approx [D(z_t) + Q(z_t)]\, V(z_t)\, [D(z_t) + Q(z_t)]^\top, $$

we can simulate the following equation, which compensates for the extra noise:

\begin{equation} z_{t+1} = z_t + \epsilon_t \left[ -(D(z_t) + Q(z_t)) \nabla \tilde{H}(z_t) + \Gamma(z_t) \right] + \mathcal{N}(0, \epsilon_t (2D(z_t)-\epsilon_t B_t)). \end{equation}

Obviously, in the limit of infinitesimally small step sizes $\epsilon_t$, the new variance will approach the old one: $\mathcal{N}(0, \epsilon_t (2D(z_t)-\epsilon_t B_t)) \rightarrow \mathcal{N}(0, 2\epsilon_t D(z_t))$ as $\epsilon_t \rightarrow 0$. Thus, we would still target the correct invariant distribution. With this, we would also avoid the MH correction step.
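
A minimal sketch (not the paper's algorithm verbatim) of one such update for the simple case of state-independent $D$ and $Q$, so that $\Gamma(z)=0$; `grad_H_tilde`, `D`, `Q`, and `B` are placeholders for the stochastic gradient $\nabla\tilde H$, the two matrices, and the noise-variance estimate $B_t$:

In [ ]:
import torch

def sg_mcmc_step(z, grad_H_tilde, D, Q, B, eps):
    # one discretized update of the recipe with constant PSD D and skew-symmetric Q (Gamma = 0)
    drift = -(D + Q) @ grad_H_tilde(z)          # drift computed on a minibatch
    cov = eps * (2.0 * D - eps * B)             # corrected covariance of the injected noise
    # assumes cov is positive definite, e.g. D = I with a sufficiently small eps
    noise = torch.distributions.MultivariateNormal(torch.zeros(len(z)), cov).sample()
    return z + eps * drift + noise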

4.3. Some Instances of This Formulation¶

4.3.1. Stochastic Gradient Langevin Dynamics¶

If we define

\begin{align} H(z) &= - \log p(z \mid X ; \Omega) \\ D(z) &= I \\ Q(z) &= 0 \\ B_t &= 0 \end{align}

then we obtain the SGLD update equation:

\begin{align} z_{t+1} = z_t + \epsilon_t \nabla \log p(z_t \mid X ; \Omega) + \mathcal{N}(0,2\epsilon_tI) \end{align}

As can be noticed, the developed theory further improves SGLD by introducing a mechanism (the $B_t$ term) to handle the minibatching noise.

4.3.2. Hamiltonian Monte Carlo¶

If we define

\begin{align} z &= (\theta,p) \\ H(z) &= -\log p(\theta \mid X) + \frac{p^\top M^{-1} p}{2} \\ D(\theta,p) &= 0 \\ Q(\theta,p) &= \begin{bmatrix} 0 & -I \\ I & 0 \end{bmatrix}, \end{align}

then we obtain the HMC update equation:

\begin{align} \begin{bmatrix} \theta_{t+1} \\ p_{t+1} \end{bmatrix} = \begin{bmatrix} \theta_{t} \\ p_{t} \end{bmatrix} + \epsilon_t \begin{bmatrix} \partial H / \partial p \\ -\partial H / \partial \theta \end{bmatrix} \end{align}
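
To see that this choice indeed recovers Hamilton's equations, note that with $D=0$ we have $\Gamma(z)=0$, so the drift of the recipe reduces to

\begin{align} f(z) = -Q(z) \nabla H(z) = -\begin{bmatrix} 0 & -I \\ I & 0 \end{bmatrix} \begin{bmatrix} \partial H / \partial \theta \\ \partial H / \partial p \end{bmatrix} = \begin{bmatrix} \partial H / \partial p \\ -\partial H / \partial \theta \end{bmatrix} = \begin{bmatrix} M^{-1}p \\ -\nabla U(\theta) \end{bmatrix}, \end{align}

which matches Hamilton's equations from Section 3.2.2.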

4.3.3. Riemannian Manifold Hamiltonian Monte Carlo (RMHMC)¶

How can we further improve these methods? A natural idea is to use geometric information while traversing the state space. For this, Girolami et al. suggest adjusting Langevin dynamics and HMC to operate on a Riemannian manifold. In particular, they exploit the Fisher information matrix (FIM), which defines the local curvature of the distribution space when the KL-divergence is used as the metric. Denoting the FIM by $G(\theta)$, the following choices define RMHMC:

\begin{align} z &= (\theta,p) \\ H(z) &= -\log p(\theta \mid X) + \frac{p^\top p}{2} \\ D(\theta,p) &= \begin{bmatrix} 0 & 0 \\ 0 & G(\theta)^{-1} \end{bmatrix} \\ Q(\theta,p) &= \begin{bmatrix} 0 & -G(\theta)^{-1/2} \\ G(\theta)^{-1/2} & 0 \end{bmatrix}. \end{align}

This almost recovers the update equations of Girolami et al, which read:

\begin{align} \theta_{t+1} &= \theta_{t} + \epsilon_t G(\theta_t)^{-1/2}p_t \\ p_{t+1} &= p_{t} - \epsilon_t G(\theta_t)^{-1/2} \nabla \tilde{U}(\theta_t) - \epsilon_t G(\theta_t)^{-1}p_t + \mathcal{N}(0, \epsilon_t (2G(\theta_t)^{-1}-\epsilon_t B_t)) \end{align}

The complete recipe paper shows that the above chain, as written, is not a valid instance of the framework: an additional correction term involving $\nabla_\theta G(\theta)^{-1/2}$ (an instance of the $\Gamma(z)$ term) must be included. The paper shows that including this term (gSGRHMC) leads to much better convergence to the posterior than omitting it (the naive variant), as done by Girolami et al:

[Figure from the complete recipe paper: convergence of gSGRHMC with and without the correction term]

4.4. RMHMC Illustration¶

This example is taken from the hamiltorch library. Here, to demonstrate the merits of RMHMC, we attempt to sample from a complicated distribution called the funnel distribution. This distribution was introduced by Radford Neal to demonstrate the deficiencies of MCMC methods. Let's use the vector $\mathbf{w} \equiv [v,x_1,\ldots,x_D]$ to denote this $D+1$ dimensional random variable with the following pdf:

$$ p(\mathbf{w}) ~ \propto ~ \mathcal{N}(v;0,9) ~ \prod_{d=1}^D \mathcal{N}(x_d;0,\exp(-v)) $$

That is, we first draw a random variable $v$ that is independent of all other variables. Then, conditioned on the value of $v$, we draw the remaining variables. The figure below, kindly taken from this tutorial, illustrates random samples (left) and the log likelihood (right) of two of the variables of Neal's funnel, $x_0$ and $v$:

[Figure: random samples (left) and log likelihood (right) of Neal's funnel]

Next, we define our target distribution:

In [8]:
D = 15

def funnel_ll(w, dim=D):
    v_dist = torch.distributions.Normal(0,3)
    ll = v_dist.log_prob(w[0])
    x_dist = torch.distributions.Normal(0, torch.exp(-w[0])**0.5)
    ll += x_dist.log_prob(w[1:]).sum()
    return ll

Then we sample using HMC:

In [ ]:
hamiltorch.set_random_seed(123)
params_init = torch.ones(D + 1)
params_init[0] = 0.
step_size = 0.2
num_samples = 1000 # For results in plot num_samples = 10000
L = 25

params_hmc = hamiltorch.sample(log_prob_func=funnel_ll, params_init=params_init, num_samples=num_samples,
                               step_size=step_size, num_steps_per_sample=L)

coords_hmc = torch.cat(params_hmc).reshape(len(params_hmc),-1).numpy()

... and RMHMC:

In [ ]:
# Explicit RMHMC with SOFTABS
hamiltorch.set_random_seed(123)
params_init = torch.ones(D + 1)
params_init[0] = 0.
step_size = 0.14 
num_samples = 1000
L = 25
omega=10
softabs_const=10**6
jitter=0.001

params_e_rmhmc = hamiltorch.sample(log_prob_func=funnel_ll, params_init=params_init, num_samples=num_samples,
                                   sampler=hamiltorch.Sampler.RMHMC, integrator=hamiltorch.Integrator.EXPLICIT,
                                   metric=hamiltorch.Metric.SOFTABS, jitter=jitter,
                                   num_steps_per_sample=L, step_size=step_size, explicit_binding_const=omega, 
                                   softabs_const=softabs_const)


coords_e_rmhmc = torch.cat(params_e_rmhmc).reshape(len(params_e_rmhmc),-1).numpy()

The above cells take $>30$ minutes to execute. Therefore, we simply load previously collected samples and visualize them:

In [2]:
coords_hmc,coords_e_rmhmc = torch.load('etc/funnel_samples.pth')

xlim = [-4,4]
ylim = [0,7]#[-2,9]
text_x = -1.5
text_y = 8
font_size_text = 20
fs = 17
vxx = torch.linspace(xlim[0],xlim[1],300)
p = torch.distributions.Normal(0,3)
v_pdf = torch.exp(p.log_prob(vxx))

fig, axs = plt.subplots(1, 2, figsize=(10,5), sharey=True)
axs[0].scatter(coords_hmc[:,1], coords_hmc[:,0], s=5, alpha=0.3, rasterized=True, \
               color='C0', label='HMC')
l = axs[0].legend(loc=0,fontsize=fs)
l.legendHandles[0]._sizes = [100]
axs[0].grid()
axs[0].set_xlim(xlim)
axs[0].set_ylim(ylim)
axs[0].tick_params(axis='both', labelsize=fs)
axs[0].set_xlabel(r'$x_1$',fontsize=font_size_text)
axs[0].set_ylabel(r'$v$',fontsize=font_size_text,rotation=0,labelpad=30)

axs[1].scatter(coords_e_rmhmc[:,1], coords_e_rmhmc[:,0], s=5, alpha=0.3, rasterized=True, \
               color='C2', label='Explicit\nRMHMC')
l = axs[1].legend(loc=0,fontsize=fs)
l.legendHandles[0]._sizes = [100]
axs[1].grid()
axs[1].set_xlim(xlim)
axs[1].set_ylim(ylim)
axs[1].tick_params(axis='both', labelsize=fs)
axs[1].set_xlabel(r'$x_1$',fontsize=font_size_text)

plt.tight_layout()
[Output: funnel samples from HMC (left) and explicit RMHMC (right)]

5. Bridging the Gap between SG-MCMC and Stochastic Optimization¶

So far, we have seen

  • gradient descent is a discretized ordinary differential equation
  • adding the correct amount of noise to gradient descent iterations leads to MCMC sampling.
  • for any continuous Markov process with a desired stationary distribution, there exists an SDE with a particular form.

Finally, we will see how stochastic gradient MCMC and stochastic optimization methods are related. To demonstrate the basic ideas, we present the SANTA paper (Bridging the Gap between SG-MCMC and Stochastic Optimization).

5.1. Simulated Annealing¶

Our exposition starts with a simple yet effective technique for finding the global optimum of optimization problems with multiple modes. Given a small-enough learning rate ensuring convergence, blindly following the gradient leads to the nearest mode. In order to reach the global optimum, an algorithm should (1) first explore the state space, and (2) then converge to the optimum. This is precisely the main idea of simulated annealing. Originally proposed by Kirkpatrick et al. in 1983, it aims to smoothly modify the optimization surface so that all regions of the state space can be explored.

To introduce a trivial way of achieving annealing, we note that the following SDE

$$ d\theta_t = - \nabla U(\theta_t)dt + \sqrt{2/\beta} dB(t)$$

has the following invariant distribution

$$ p(\theta) \propto \exp(-\beta U(\theta)). $$

In the following cell, let's see how $\beta$ affects a probability surface. For a simple illustration, we consider a mixture of Gaussians.

In [20]:
m1 = torch.tensor([0.,0.])
s1 = torch.tensor([[2.,1.],[1.,2.]])
d1 = torch.distributions.MultivariateNormal(m1,s1)

m2 = torch.tensor([4.,4.])
s2 = torch.tensor([[2.,-1.],[-1.,2.]])
d2 = torch.distributions.MultivariateNormal(m2,s2)

def mixture_pdf(X):
    return d1.log_prob(X).exp() + 2*d2.log_prob(X).exp() 

def p(X, beta):
    return (mixture_pdf(X).log()*beta).exp()

original_density = lambda X: p(X, 1.0)
flat_density     = lambda X: p(X, 0.01)
peaky_density    = lambda X: p(X, 10)

plot_mcmc_surface(original_density, flat_density, peaky_density, betas=[1,0.01,10])
[Output: the mixture density at $\beta=1$, $\beta=0.01$, and $\beta=10$]

As implied by the above picture, increasing $\beta \rightarrow \infty$ leads to a density that concentrates around the global maximum. This is the very bridge that connects MCMC and optimization. In the following, we simulate

  • gradient descent, which directly converges to the nearest minimum.
  • Langevin dynamics with $\beta=1$, which oscillates (usually) around the nearest minimum.
  • Langevin dynamics with $\beta$ increasing from $0.025$ to $1000$, which first traverses the state space and then converges.
In [37]:
def mixture_density(X):
    return d1.log_prob(X).exp() + 2*d2.log_prob(X).exp() 

def __U(X):
    return -mixture_density(X).log()

def run_optimization(th0, is_mcmc=False, is_annealed=False):
    thetas, losses, betas = [],[],[]
    th    = torch.nn.Parameter(th0) # initial parameters
    d     = 2
    eta   = 5e-3 # step size
    Niter = 1000
    beta  = 1.0
    beta0 = 0.025
    beta1 = 1000
    p     = np.power(beta1/beta0,1/Niter)
    for i in range(Niter):
        thetas.append(th.detach().numpy().copy())
        U     = __U(th)
        grad  = torch.autograd.grad(U,th)[0]
        th.data = th.data - eta*grad
        if is_mcmc: 
            if is_annealed:
                beta = beta0 * np.power(p,i)
            else:
                beta = 1.0
            noise = torch.randn(1,d) * np.sqrt(2*eta/beta)
            th.data = th.data + noise
        betas.append(beta)
        losses.append(U.item())
    thetas = np.array(thetas).squeeze(1) # Niter,d
    anim = plot_mcmc_animation(mixture_density, thetas, betas, num_frames=25)
    return anim

# anim = run_optimization(torch.tensor([[0.,2.]]), is_mcmc=False)
# anim = run_optimization(torch.tensor([[0.,2.]]), is_mcmc=True, is_annealed=False)
anim = run_optimization(torch.tensor([[0.,2.]]), is_mcmc=True, is_annealed=True)
HTML(anim.to_jshtml())
Out[37]:
[Output: animation of annealed Langevin dynamics on the mixture density]

5.2. Details of SANTA¶

We conclude this section by listing the details of SANTA.

5.2.1. Thermostats¶

Based on this work, the authors propose to include "thermostat" variables $\xi$. These variables "control the temperature", which can abruptly increase or decrease due to stochastic drift or a bad estimate of the variance of the noise term $B_t$. In a nutshell, the update equations are as follows:

\begin{align} d\theta &= p\, dt \\ dp &= -\nabla \tilde{U}(\theta)\, dt - \xi p\, dt + \mathcal{N}(0, \epsilon_t (2I-\epsilon_t B_t)) \\ d\xi &= \left( \frac{1}{d} p^\top p -1 \right) dt \end{align}

Intuitively, if the mean kinetic energy is higher than $1$, then $\xi$ grows and $p$ experiences more friction. On the other hand, if the mean kinetic energy is lower, then $\xi$ shrinks and $p$ experiences less friction.
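
A minimal Euler-discretized sketch of these updates (step size $\epsilon$; `grad_U_tilde` is a placeholder for the stochastic gradient $\nabla\tilde U$, and the noise-variance estimate $B_t$ is set to zero for simplicity):

In [ ]:
import torch

def thermostat_step(theta, p, xi, grad_U_tilde, eps):
    # one Euler-discretized update with a scalar thermostat xi controlling the friction on p
    d = theta.numel()
    theta = theta + eps * p
    p = p - eps * grad_U_tilde(theta) - eps * xi * p \
        + (2.0 * eps) ** 0.5 * torch.randn_like(p)      # injected noise with the B_t estimate set to 0
    xi = xi + eps * ((p * p).sum() / d - 1.0)           # raise friction when the mean kinetic energy exceeds 1
    return theta, p, xi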

5.2.2. Riemannian Geometry¶

Similar to RMHMC, the authors include preconditioning matrices to take the local geometry into account. Yet, since computing Fisher information matrices (FIM) is usually expensive, they adopt the preconditioner from RMSprop and Adam, which approximates the diagonal of the FIM.
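
A sketch of the RMSprop-style diagonal preconditioner they build on (the smoothing constants below are illustrative choices, not the paper's values):

In [ ]:
import torch

def update_preconditioner(v, grad, sigma=0.99, lam=1e-8):
    # exponential moving average of elementwise squared gradients, as in RMSprop
    v = sigma * v + (1.0 - sigma) * grad ** 2
    # its inverse square root acts as a cheap diagonal approximation to a FIM-based preconditioner
    precond = 1.0 / (lam + torch.sqrt(v))
    return v, precond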

5.2.3. Symmetric Splitting Scheme¶

The authors also replace the Euler discretization with the so-called symmetric splitting scheme (SSS). In a nutshell, they split an SDE that is infeasible to simulate directly into several sub-SDEs, each of which is analytically solvable. This resembles the leap-frog integration used in HMC.

Overall, the method consists of an exploration stage and a refinement stage: it first explores the state space and then performs gradient descent. The refinement stage uses both adaptive gradient and adaptive momentum updates. The figure below shows the behaviour of the optimizer on a toy problem:

[Figure from the SANTA paper: behaviour of the optimizer on a toy problem]

Related Paper¶

This paper describes an asynchronous stochastic quasi-Newton MCMC method for non-convex optimization. The authors first describe an MCMC method that samples from a target density; using the temperature trick above, the sampler is then turned into an optimization routine. Unlike the Riemannian manifold approaches presented above, they use an approximation to the Hessian to model the local curvature.
