In this tutorial, we introduce neural ordinary differential equations (NODEs) [chen2018neural]. A one-sentence summary of this model family is

an ODE system in which the differential function is a neural network (NN).

We start this tutorial with a discussion of ODEs. Instead of presenting technical details, we give a practical introduction to ODEs. Next, we formally describe NODEs and show three standard use cases: classification, normalizing flows, and latent dynamics learning. The lecture closes with works that study different aspects of the vanilla NODE.

Organization of the Lecture¶

  Introduction (10min)

  1. Formal descriptions of ODEs (20min)
    1.1. Computing ODE solutions
    1.2. Example: Van der Pol oscillator
    1.3. Break: VDP & ODE integration parameters

  2. Neural ODEs (20min)
    2.1. Problem formulation
    2.2. Maximum likelihood estimation
    2.3. Example: learning VDP sequences with NODE
    2.4. Break: Adjoints

  3. Latent Bayesian Neural ODEs (20min)
    3.1. Variational Inference
    3.2. Evidence Lower-bound
    3.3. Example: Rotating MNIST
    3.4. Implementation

    Long Break (15min)

  4. ResNets are Discretized ODEs (20min)
    4.1. Classification Objective
    4.2. Implementation
    4.3. Training
    4.4. Break: ODE solver parameters

  5. Continuous-time Normalizing Flows (20min)
    5.1. Normalizing Flows
    5.2. Continuous-time Normalizing Flows
    5.3. Implementation
    5.4. Training
    5.5. Break: Wrap-up

  6. Related Studies (15min)
    6.1. ODE-RNN [rubanova2019latent]
    6.2. ODE$^2$VAE [yildiz2019deep]
    6.3. Augmented NODEs [dupont2019augmented]
    6.4. Regularized NODEs [finlay2020train]
    6.5. ACA [zhuang2020adaptive]
    6.6. ODE-RL [yildiz2021continuous]
    6.7. NSDEs [tzen2019neural], [xu2022infinitely]
    6.8. GP-ODEs [hegde2022variational]

  Summary & Q&A (5+25min)

NOTE: Most of the code snippets in this tutorial, as well as the figures, are from the original neural ODE paper and the corresponding GitHub repo.

Practicalities¶

  • Each section ends with a 5-10-min break in which you can read the provided material and/or code snippets, ask questions, or just take a rest. Feel free to arrange your breaks in accordance with your needs.
  • In addition to mathematical descriptions of the techniques, we provide short code snippets for the model definitions, training and visualization. Training could be too time-consuming for this session, so make sure to load the pre-trained models if you would like to visualize the fits.
  • Most of the implementation in this notebook depends on the provided utility files, some of which might be too involved to grasp immediately. If you're interested, go ahead and check them out.

The following cell imports all the required libraries.

In [53]:
%load_ext autoreload
%autoreload 2
!pip install torch torchvision torchdiffeq numpy scipy matplotlib pillow scikit-learn

import numpy as np
from IPython import display
import time
from sklearn.datasets import make_circles

import torch
import torch.nn as nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from torchdiffeq  import odeint
from bnn          import BNN
from vae_utils    import MNIST_Encoder, MNIST_Decoder
from plot_utils   import plot_vdp_trajectories, plot_ode, plot_vdp_animation, plot_cnf_animation, \
    plot_mnist_sequences, plot_mnist_predictions, plot_cnf_data
from utils       import get_minibatch, mnist_loaders, inf_generator, mnist_accuracy, \
    count_parameters, conv3x3, group_norm, Flatten, load_rotating_mnist

1. Ordinary Differential Equations (ODEs)¶

Ordinary differential equations relate functions of an independent variable to the derivatives of those functions. Formally,

\begin{equation} \dot{\mathbf{x}}(t) = \frac{d\mathbf{x}(t)}{dt} = \lim_{\Delta t \rightarrow 0} \frac{ \mathbf{x}(t + \Delta t) - \mathbf{x}(t)}{\Delta t} = \mathbf{f}(\mathbf{x}(t),\mathbf{u}(t),t), \end{equation}

where

  • $t$ denotes time (or any other independent variable)
  • $\mathbf{x}(t) \in \mathcal{X} \subseteq \mathbb{R}^d$ is the state vector at time $t$ (the dependent variable)
  • $\mathbf{u}(t) \in \mathcal{A} \subseteq \mathbb{R}^m$ is the external control signal
  • $\dot{\mathbf{x}}(t) \in \dot{\mathcal{X}} \subseteq \mathbb{R}^d$ is the first-order time derivative of $\mathbf{x}(t)$
  • $\mathbf{f} : \mathcal{X} \times \mathcal{A} \times \mathbb{R}_+ \rightarrow \dot{\mathcal{X}}$ is the vector-valued, continuous-in-time differential function describing the system's evolution over time, with $\mathbb{R}_+$ denoting the non-negative real numbers.

Informally speaking, $\mathbf{f}$ tells us "how much the state $\mathbf{x}(t)$ changes with an infinitesimal change in $t$". More formally, the following equation holds in the limit $\Delta t \rightarrow 0$: \begin{equation} \mathbf{x}(t+\Delta t) = \mathbf{x}(t) + \Delta t \cdot \mathbf{f}(\mathbf{x}(t),\mathbf{u}(t),t). \end{equation}

Note-1: We often refer to $\mathbf{f}$ as the vector field or the right-hand side.
Note-2: The above problem, together with an initial value $\mathbf{x}(0)=\mathbf{x}_0$, is known as an initial value problem.
Note-3: Throughout this tutorial, we focus on differential functions $\mathbf{f}(\mathbf{x}(t))$ that are independent of control signals and not explicitly parameterized by time.

1.1. Computing ODE Solutions¶

An "ODE state solution" $\mathbf{x}(t)$ at time $t\in \mathbb{R}_+$ is given by \begin{equation} \mathbf{x}(t) = \mathbf{x}_0 + \int_0^t \mathbf{f}(\mathbf{x}_\tau)~d\tau, \end{equation} where $\mathbf{x}_0$ denotes the initial value and $\tau \in \mathbb{R}_+$ is an auxiliary time variable.

Note-1: Given an initial value $\mathbf{x}_0$ and a set of time points $\{t_0,t_1,\ldots,t_N\}$, we are often interested in the state solutions $\mathbf{x}_{0:N}\equiv\{\mathbf{x}(t_0),\mathbf{x}(t_1),\ldots,\mathbf{x}(t_N)\}$.
Note-2: We occasionally denote $\mathbf{x}_n \equiv \mathbf{x}(t_n)$.
Note-3: The above integral has a closed form only for rather simple differential functions (recall the integration rules from high school). Therefore, we almost always resort to numerical solvers.
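For instance (an illustrative aside), the linear ODE $\dot{x}(t) = a x(t)$ with initial value $x(0)=x_0$ admits the closed-form solution \begin{equation} x(t) = x_0 + \int_0^t a\,x(\tau)~d\tau = x_0 e^{at}, \end{equation} whereas even the two-dimensional Van der Pol system introduced below has no known closed-form solution.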

Numerical solvers: TL;DR: A state solution $\mathbf{x}(t)$ can be numerically computed up to a tolerable error.

The celebrated Picard existence and uniqueness theorem states that an initial value problem has a unique solution if the differential function satisfies a Lipschitz condition. Despite the uniqueness guarantee, there is no general recipe for computing the solution analytically; therefore, we often resort to numerical methods. The simplest and least accurate numerical method is Euler's method, which repeatedly applies the update equation above with a fixed step size (see the sketch below). More advanced methods such as Heun's method and the Runge-Kutta family of solvers compute averaged slopes by evaluating $\mathbf{f}(\mathbf{x}(t))$ at multiple locations (a speed vs. accuracy trade-off). Even more advanced adaptive-step solvers set the step size $\Delta t$ dynamically.
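Below is a minimal fixed-step Euler integrator (an illustrative sketch only; the tutorial itself relies on torchdiffeq). The differential function f follows the f(t, x) signature used throughout this notebook:

import torch

def euler_integrate(f, x0, ts):
    ''' Fixed-step Euler method (illustrative sketch).
        f  - callable f(t, x) returning dx/dt of shape [N,d]
        x0 - [N,d] initial value
        ts - [T]   1D tensor of time points
        Returns the states at the requested time points, shape [T,N,d].
    '''
    xs = [x0]
    for t0, t1 in zip(ts[:-1], ts[1:]):
        x = xs[-1]
        xs.append(x + (t1 - t0) * f(t0, x))  # x(t+dt) ≈ x(t) + dt * f(t, x(t))
    return torch.stack(xs)

For instance, euler_integrate(vdp, x0, ts) with the VDP system defined in the cells below would yield trajectories comparable to odeint, just less accurate on coarse time grids.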

In this tutorial, we use the torchdiffeq library, which also implements the adjoint method for gradient estimation.

1.2. Example: Van der Pol Oscillator¶

As an example, we examine the Van der Pol (VDP) oscillator, a parametric $2D$ time-invariant ODE system that evolves according to the following: \begin{equation} \label{eq:vdp} \frac{d}{dt} \begin{bmatrix} x_1 \\ x_2 \end{bmatrix} = \begin{bmatrix} x_2 \\ \mu(1-x_1^2)x_2-x_1 \end{bmatrix}. \end{equation}

Our VDP implementation below follows the two requirements of torchdiffeq:

  • The integrated function must be an instance of nn.Module.
  • Its forward() function must take a (time, state) pair as input.
In [2]:
# define the differential function
class VDP(nn.Module):
    
    def __init__(self,mu):
        ''' mu is the only parameter in VDP oscillator '''
        super().__init__()
        self.mu = mu
        
    def forward(self, t, x):
        ''' Implements the right hand side
            Inputs
                t - []     time
                x - [N,d]  state(s)
            Output
                \dot{x} - [N,d], time derivative
        '''
        d1 = x[...,1:2]
        d2 = self.mu*(1-x[...,0:1]**2)*x[...,1:2]-x[...,0:1]
        return torch.cat([d1,d2],-1)

Next, we instantiate the three ingredients (differential function $\mathbf{f}$, initial value $\mathbf{x}_0$, integration time points $t$), forward integrate, and visualize how integration proceeds.

In [3]:
# create the differential function, needs to be a nn.Module
vdp = VDP(1.0).to(device)

# initial value, of shape [N,n]
x0 = torch.tensor([[1.0,0.0]]).float().to(device)

# integration time points, of shape [T]
ts = torch.linspace(0., 15., 500).to(device)

# forward integration
with torch.no_grad():
    X = odeint(vdp, x0, ts) # [T,N,n]

# animation
anim = plot_vdp_animation(ts,X,vdp)
display.HTML(anim.to_jshtml())

1.3. Break: VDP & ODE Parameters¶

The Van der Pol oscillator has a single parameter, which is set to $\mu=1$ above. The cell below implements the same illustration, except that we plot instead of animate. Use this break to play around with the parameter $\mu$ and the initial value $\mathbf{x}_0$ to see how even the tiniest change affects the whole trajectory. Note that below we visualize two trajectories, as $\mathbf{x}_0$ contains two initial values.

In [4]:
# feel free to modify the parameter
vdp = VDP(5.0).to(device)

# feel free to try out different initial values
x0 = torch.tensor(
    [[-2.0,-3.0],[-2.0,3.0]]
).float().to(device)

# integration time points, of shape [T]
ts = torch.linspace(0., 15., 500).to(device)

# forward integration
with torch.no_grad():
    X = odeint(vdp, x0, ts) # [T,N,D]

plot_ode(ts,X,vdp)

2. Neural ODE (NODE)¶

To motivate neural ODEs, imagine that we observe a sequence $\mathbf{y}_{0:N}$ generated by a continuous-time system (the sequence could be measurements from a physical system, the motion of objects, the flow of electric current, substance rates in a chemical reaction, etc.). How can we find the time evolution of such observed systems?

  • If we know the underlying ODE (such as the VDP system), we can use statistics and optimization tools to estimate its parameters (e.g., $\mu$).
  • What if we know neither the functional form nor the parameters of the ODE system? Then we estimate the time evolution function with a function approximator (in practice, one could define GP-ODEs, linear-regression ODEs, kernel-regression ODEs, etc.).

2.1. Problem Formulation¶

In more concrete terms, let's say our dataset contains a noisy observed sequence $\mathbf{y}_{0:N}$

\begin{align} \mathbf{y}_n &= \mathbf{x}_n + \boldsymbol{\epsilon}_n, \qquad \boldsymbol{\epsilon}_n\sim\mathcal{N}(\mathbf{0},\sigma^2\mathbf{I}), \end{align}

where each observation is a perturbation of an unknown state $\mathbf{x}_n$ generated by an unknown underlying vector field $\mathbf{f}_\text{true}$

\begin{align} \mathbf{x}_n &= \mathbf{x}_0 + \int_0^{t_n} \mathbf{f}_\text{true}(\mathbf{x}_\tau)~d\tau. \end{align}

Our goal is to learn a neural network $\mathbf{f}_\mathbf{w}$ with parameters $\mathbf{w}$ that matches the unknown dynamics:

$$\mathbf{f}_\mathbf{w} \approx \mathbf{f}_\text{true}.$$

Let's start by implementing a NODE system. We use a simple multi-layer perceptron with two hidden layers. Since vector fields are smooth, we opt for the smooth ELU activation instead of ReLU.

In [55]:
class NODE(nn.Module):
    def __init__(self, d):
        ''' d - ODE dimensionality '''
        super().__init__()
        self._f = nn.Sequential(nn.Linear(d,200), 
                                nn.ELU(), 
                                nn.Linear(200,200), 
                                nn.ELU(), 
                                nn.Linear(200,d))
    
    def ode_rhs(self, t, x):
        ''' differential function = f(x)'''
        return self._f(x)
    
    def forward(self, ts, x0, method='dopri5'):
        ''' Forward integrates the NODE system and returns state solutions
            Input
                ts - [T]   time points
                x0 - [N,d] initial value
            Returns
                X  - [T,N,d] forward simulated states
        '''
        return odeint(self.ode_rhs, x0, ts, method=method)

Now, let's see what the forward trajectory $\mathbf{x}_{0:N}$ looks like when the differential function $\mathbf{f}_\mathbf{w}$ is an NN with randomly initialized weights $w_i \sim \mathbb{U}(-k,k)$. Here, $\mathbb{U}$ denotes the uniform distribution and $k=1/\sqrt{d_{\text{in}}}$, with $d_{\text{in}}$ the number of input features (the PyTorch default for nn.Linear). As you will see below, small random weights typically translate into smooth and small function outputs, making the initial trajectory smooth as well.

In [59]:
node = NODE(2).to(device)

# let's compute the integral of our neural net!
x0 = torch.tensor([[1.0,0.0]]).float().to(device)
ts = torch.linspace(0., 20., 1000).to(device)

X = node(ts,x0)
plot_ode(ts, X, node.ode_rhs)

2.2. Maximum Likelihood Estimation¶

The simplest approach to approximate the unknown vector field $\mathbf{f}_\text{true}$ is the maximum-likelihood estimation. Since we do not have access to the vector field $\mathbf{f}_\text{true}$, we propose to match the forward simulated states with the observations:

\begin{align} \min_\mathbf{w} ~~ \mathcal{L} = \frac{1}{2} \sum_n ||\mathbf{y}_n-\mathbf{x}_n||_2^2 \qquad \text{s.t.} \qquad \mathbf{x}_n = \mathbf{x}_0 + \int_0^{t_n} \mathbf{f}_\mathbf{w}(\mathbf{x}_\tau)~d\tau. \end{align}

Observe that forward simulated states $\mathbf{x}(t)$ are functions of NN parameters $\mathbf{w}$. In the following, we show the dependency explicitly by using $\mathbf{x}(t_n;\mathbf{w})$ instead of $\mathbf{x}_n$. The gradient of the loss wrt $\mathbf{w}$ can be computed by chain rule:

\begin{align} \frac{d\mathcal{L}}{d\mathbf{w}} = \sum_n (\mathbf{x}(t_n;\mathbf{w})-\mathbf{y}_n) \frac{d\mathbf{x}(t_n;\mathbf{w})}{d\mathbf{w}} \end{align}

The second term is the derivative of the forward simulated state $\frac{d\mathbf{x}(t_n;\mathbf{w})}{d\mathbf{w}}$ with respect to the vector field parameters $\mathbf{w}$. In other words, we need to differentiate through the ODE solver, which is not a straightforward task. This can be done with forward sensitivity or adjoint equations; both techniques compute the gradient by solving a second ODE system. Due to its lower memory footprint, the torchdiffeq library implements the latter.
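As a minimal sketch (with a hypothetical toy differential function, not the NODE class above), switching to adjoint-based gradients in torchdiffeq amounts to calling odeint_adjoint, which additionally requires the differential function to be an nn.Module so that its parameters can be collected:

from torchdiffeq import odeint_adjoint

class ODEFunc(nn.Module):
    ''' Toy differential function with the (t, x) signature expected by the solver. '''
    def __init__(self, d):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(d, 64), nn.ELU(), nn.Linear(64, d))
    def forward(self, t, x):
        return self.net(x)

f_w = ODEFunc(2).to(device)
x0  = torch.tensor([[1.0, 0.0]]).to(device)
ts  = torch.linspace(0., 5., 100).to(device)

# Gradients are computed by solving a second (adjoint) ODE backward in time,
# so the solver's intermediate states need not be stored.
X    = odeint_adjoint(f_w, x0, ts)   # [T,N,d]
loss = (X**2).mean()
loss.backward()                      # populates gradients of f_w.parameters()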

2.3. Example: Learning VDP Sequences with NODE¶

Next, we test our NODE system on noisy VDP sequences. To generate data, we randomly pick 10 initial values and forward integrate all trajectories concurrently. Luckily, this only requires setting the initial values; the rest of the implementation stays the same.

In [57]:
# lets first generate data
vdp = VDP(1.0).to(device)
x0 = 6*torch.rand([10,2]).to(device) - 3 # 10 random initial values in [-3,3]
tvdp = torch.linspace(0., 10., 50).to(device)
with torch.no_grad():
    Xvdp = odeint(vdp, x0, tvdp)
    Yvdp = Xvdp + torch.randn_like(Xvdp)*0.1 # noisy observations with std 0.1

plot_vdp_trajectories(tvdp, Yvdp, vdp)
Plotting the first 3 data sequences.

We now train the model on the observed sequences. To speed up training, we optimize over random subsequences instead of the whole sequences (see the get_minibatch function in utils.py, sketched below).
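The provided utility lives in utils.py; a hypothetical minimal version of subsequence sampling (names and behavior are illustrative assumptions, not the provided implementation) could look like this:

def get_minibatch_sketch(ts, Y, tsub):
    ''' Illustrative subsequence sampler (NOT the provided get_minibatch).
        ts   - [T]     time points
        Y    - [T,N,d] observed sequences
        tsub - int     subsequence length
    '''
    T  = ts.shape[0]
    i0 = torch.randint(0, T - tsub + 1, ()).item()  # random start index
    # the differential function is time-invariant, so the absolute start time does not matter
    return ts[i0:i0+tsub], Y[i0:i0+tsub]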

In [60]:
# optimization loop
Niter  = 1000 # number of optimization iterations
tsub   = 11   # subsequence length in each minibatch

optimizer = torch.optim.Adam(node.parameters(),1e-3)
for i in range(Niter):
    optimizer.zero_grad()
    t_,Y_ = get_minibatch(tvdp, Yvdp, tsub=tsub)
    Xhat = node(t_, Y_[0]) # forward simulation
    loss = ((Xhat-Y_)**2).mean() # MSE
    loss.backward()
    optimizer.step()
    if i%50==0:
        Xhat = node(tvdp, Yvdp[0]) # forward simulation
        display.clear_output(wait=True)
        plot_ode(tvdp, Yvdp, node.ode_rhs, Xhat.detach())
KeyboardInterrupt (the training cell was interrupted manually; a pre-trained model is loaded below)

Finally, let's load and visualize a trained model.

In [61]:
state_dict = torch.load('etc/trained_node.pkl')
node.load_state_dict(state_dict)
node.eval()

Xhat = node(tvdp, Yvdp[0]) # forward simulation from the initial observations
plot_ode(tvdp, Yvdp, node.ode_rhs, Xhat.detach())

2.4. Break: NN Differential Function and/or Adjoints¶

For this break, we have two suggestions to look into:

  • If you would like to play around with the differential function, go ahead and try out shallower/deeper nets, other activations, smaller/larger weight initializations, etc.
  • If you are more into theory, take a look at adjoints, i.e., the ODEs that give us the gradients of an ODE system. You can read Section 2.1 of this tutorial or Sections 1 and 3 of this technical report for a derivation of adjoints; the key equations are summarized below.
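For reference, the adjoint method of the original NODE paper introduces the adjoint state $\mathbf{a}(t) = \partial \mathcal{L} / \partial \mathbf{x}(t)$ and obtains the gradients by solving a second ODE backward in time:

\begin{align} \frac{d\mathbf{a}(t)}{dt} = -\mathbf{a}(t)^\top \frac{\partial \mathbf{f}_\mathbf{w}(\mathbf{x}(t))}{\partial \mathbf{x}(t)}, \qquad \frac{d\mathcal{L}}{d\mathbf{w}} = -\int_{t_N}^{t_0} \mathbf{a}(t)^\top \frac{\partial \mathbf{f}_\mathbf{w}(\mathbf{x}(t))}{\partial \mathbf{w}} ~dt, \end{align}

so the memory cost of backpropagation does not grow with the number of solver steps.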

3. Latent Bayesian Neural ODEs (ODEVAE)¶

All the ODE systems we have investigated so far are defined in data space, i.e., the data and the differential equation live in the same space. As an example where this modeling choice breaks down, consider the video of a flying ball. The motion of the ball can surely be explained by an ODE; however, the observations themselves (pixels) do not follow any simple ODE. To handle such cases, a reasonable modeling choice is to simultaneously learn an embedding of the videos into a latent space and a latent ODE system that explains the motion.

A suitable generative model for a given high-dimensional observed sequence $\mathbf{y}_{0:N}$ could be as follows:

\begin{align} \mathbf{z}_0 &\sim p(\mathbf{z}_0) \\ \mathbf{z}_n &= \mathbf{z}_0 + \int_0^{t_n} \mathbf{f}_\text{true}(\mathbf{z}_\tau) d\tau \\ \mathbf{y}_n &\sim p(\mathbf{y}_n | \mathbf{z}_n), \quad \forall n \in [0,N] \end{align}

where $\mathbf{z}_n$ corresponds to the latent embedding of $\mathbf{y}_n$. The unknowns are

  • the initial value for each sequence
  • the latent dynamics
  • the observation mapping.

[Figure: the latent NODE (ODE-VAE) generative model]

3.1. Variational Inference¶

As before, we propose to infer the unknown dynamics $\mathbf{f}_\text{true}$ by a NODE system $\mathbf{f}_\mathbf{w}$. This time, our goal is to maintain uncertainty estimates over both the initial value and ODE dynamics. For this, we resort to VI with the following approximations:

  • amortized inference (encoder) to approximate the initial value distribution $q(\mathbf{z}_0|\mathbf{y}_{0:N})$ for an input sequence $\mathbf{y}_{0:N}$
  • mean-field inference $q(\mathbf{w})$ for the dynamics parameters
  • a decoder $\mathbf{d}(\mathbf{z}_n)$ that gives the parameters of the observation mapping $p(\mathbf{y}_n | \mathbf{z}_n)$.

In turn, the resulting formulation becomes a hybrid ODE-VAE model. Our variational posterior factorizes as follows:

$$ q(\mathbf{z}_0,\mathbf{w}|\mathbf{y}_{0:N}) = q(\mathbf{z}_0|\mathbf{y}_{0:N}) ~ q(\mathbf{w}),$$

where both distributions are assumed to be Gaussian with diagonal covariance.
Remark-1: Extensions to multiple sequences would require variational posteriors for all initial values $\{\mathbf{z}_{0}^{(r)}\} _{r=1}^R$.

Remark-2: Our variational formulation corresponds to having a Bayesian NN differential function, i.e., a BNODE. The stochasticity of BNNs (each evaluation of a BNN on the same input would give a different output) violates the ODE definition (which requires the differential function to be continuous). Therefore, our framework first draws a sample of the differential function, and then uses that function draw to solve the ODE system.

3.2. Evidence Lower-bound¶

Following the standard ELBO derivations, we end up at the following bound:

\begin{align} \log p(\mathbf{y}_{0:N}) \geq \sum_n \mathbb{E}_{q(\mathbf{z}_0,\mathbf{w}|\mathbf{y}_{0:N})}[\log p(\mathbf{y}_n|\mathbf{z}_0,\mathbf{w})] - \text{KL}(q(\mathbf{z}_0 | \mathbf{y}_{0:N}) || p(\mathbf{z}_0)) - \text{KL}(q(\mathbf{w}) || p(\mathbf{w})). \end{align}

Thanks to Gaussian posteriors, KL terms are tractable. The intractable expected log-likelihood is approximated by Monte Carlo sampling:

\begin{align} \mathbb{E}_{q(\mathbf{z}_0,\mathbf{w}|\mathbf{y}_{0:N})}[\log p(\mathbf{y}_{0:N} |\mathbf{z}_0,\mathbf{w})] \approx \frac{1}{L} \sum_{l=1}^L \sum_{n=0}^N \log p(\mathbf{y}_n|\mathbf{z}_0^{(l)},\mathbf{w}^{(l)}). \end{align}

The following procedure specifies how to compute the likelihood given the samples $\mathbf{z}_0^{(l)}$ and $\mathbf{w}^{(l)}$:

  1. Drawing an initial value and a vector field sample \begin{align} \mathbf{z}_0^{(l)} &\sim q(\mathbf{z}_0|\mathbf{y}_{0:N}) \\ \mathbf{w}^{(l)} &\sim q(\mathbf{w}) \end{align}

  2. Forward simulating \begin{align} \mathbf{z}_n^{(l)} = \mathbf{z}_0^{(l)} + \int_0^{t_n} \mathbf{f}_{\mathbf{w}^{(l)}}(\mathbf{z}_\tau)~d\tau \end{align}

  3. Decoding \begin{align} \mathbf{x}_n^{(l)} &\equiv \mathbf{d}(\mathbf{z}_n^{(l)}), \quad \forall n \in [0,N]. \end{align}

Remarks:

  1. We consider a mean-field approximation for differential function parameters.
  2. Initial value distribution $q(\mathbf{z}_0|\mathbf{y}_{0:N})$ is also a diagonal Gaussian whose mean and variance parameters are given by the encoder NN.
  3. The ELBO is jointly optimized wrt encoder, bnode and decoder parameters.

3.3. Example Dataset: Rotating MNIST¶

In the following example, our dataset consists of rotating images of the MNIST digit 3. Since each pixel value is restricted to $[0,1]$, we opt for a Bernoulli observation model instead of a Gaussian:

$$\log p(\mathbf{y} | \mathbf{x}) = \sum_n y_n\log x_n + (1-y_n)\log(1-x_n), \qquad \mathbf{x}=\mathbf{d}(\mathbf{z}),$$

where index $n$ denotes the observation dimensions (not the time index).
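As a small sanity check (a sketch, not part of the original notebook), this Bernoulli log-likelihood is exactly the negative of PyTorch's sum-reduced binary cross-entropy, which is what the ODEVAE class below uses via nn.BCELoss(reduction='sum'):

# Hypothetical check that sum-reduced BCE equals the negative Bernoulli log-likelihood
y = torch.rand(4, 28*28)                        # "observations" in [0,1]
x = torch.rand(4, 28*28).clamp(1e-6, 1 - 1e-6)  # decoder outputs in (0,1)
loglik = (y*torch.log(x) + (1 - y)*torch.log(1 - x)).sum()
bce    = nn.BCELoss(reduction='sum')(x, y)
print(torch.allclose(loglik, -bce))             # True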

The following cell reads the dataset.

In [62]:
# we read 1042 sequences of length 16, where each observation is a 28x28 grey-scale image
Ymnist_tr, Ymnist_test = load_rotating_mnist(device) # [T,N,1,28,28]
plot_mnist_sequences(Ymnist_tr)

# let's create artificial time points corresponding to rotation angles <===> T=16
tmnist = 0.1*torch.arange(16).to(device) 
Plotting 5 rotating MNIST sequences.

3.4. Implementation¶

We now implement our ODEVAE class. If you would like to learn more about the encoder and decoder implementation details, check out vae_utils.py.

In [63]:
from torch.distributions import Normal, kl_divergence

class ODEVAE(nn.Module):
    def __init__(self, q, n_filt=16):
        ''' Inputs:
                q      - latent dimensionaliy
                n_filt - number of filters in the first CNN layer
        '''
        super().__init__()
        self.encoder  = MNIST_Encoder(q, n_filt)
        self.bnode    = BNN(n_in=q, n_out=q, n_hid_layers=2, n_hidden=100, act='elu')
        self.decoder  = MNIST_Decoder(q, n_filt)
        self.obs_loss = nn.BCELoss(reduction='sum')
        self.q        = q
        
    def forward(self, ts, Y, method='dopri5'):
        ''' Performs encoding, latent forward integration and decoding.
            Note that we always draw a single sample from the encoder to improve the readability of our code.
            Inputs:
                ts - [T]           observation time points
                Y  - [T,N,1,28,28] input sequences
            Returns:
                q_z0_mu  - [N,q]           initial value means
                q_z0_sig - [N,q]           initial value std
                zt       - [T,N,q]       latent trajectory
                Xhat     - [T,N,1,28,28] reconstructions
        '''
        [T,N,nc,d,d] = Y.shape
        # encode mean and variance
        q_z0_mu, q_z0_sig = self.encoder(Y) # N,q & N,q
        # sample differential function
        f = self.bnode.draw_f()
        ode_rhs = lambda t,x: f(x)
        # sample initial values
        z0 = q_z0_mu + q_z0_sig*torch.randn_like(q_z0_sig)
        # forward integrate
        zt = odeint(ode_rhs, z0, ts, method=method) # T,N,q
        # decode
        Xhat = self.decoder(zt) # T,N,nc,d,d
        return q_z0_mu, q_z0_sig, zt, Xhat

odevae = ODEVAE(q=8).to(device)

Now let's implement the ELBO.

In [64]:
def compute_elbo(odevae, ts, Y):
    ''' Computes the ELBO.
        Note that we always draw a single sample from the encoder to improve the readability of our code.
        Inputs:
            ts - [T] observation time points
            Y  - [T,N,1,28,28] input sequences
        Returns:
            rec    - [] expected log likelihood
            kl_enc - [] the KL term due to z_0 
            kl_bnn - [] the KL term due to bnn weights w
    '''
    q_z0_mu, q_z0_sig, zt, Xhat = odevae(ts, Y)
    # reconstruction
    rec = -odevae.obs_loss(Xhat,Y)
    # KL divergence on z_0
    q_z0_mu, q_z0_sig = q_z0_mu.reshape(-1), q_z0_sig.reshape(-1)
    q = Normal(q_z0_mu,q_z0_sig)
    N = Normal(torch.zeros_like(q_z0_mu),torch.ones_like(q_z0_sig))
    kl_enc = kl_divergence(q,N).sum()
    # KL divergence on bnn weights
    kl_bnn = odevae.bnode.kl()
    return rec, kl_enc, kl_bnn

We finally train the model.

In [65]:
Nsub  = 25  # number of sequences in each minibatch
C     = Ymnist_tr.shape[1] / Nsub # scaling factor (total number of sequences / minibatch size)
Niter = 2000

optimizer = torch.optim.Adam(odevae.parameters(), 1e-3)

for i in range(Niter):
    optimizer.zero_grad()
    t_,Y_ = get_minibatch(tmnist, Ymnist_tr, Nsub=Nsub)
    rec, kl_enc, kl_bnn = compute_elbo(odevae, t_, Y_)
    rec  = rec*C 
    kl   = kl_enc*C + kl_bnn
    loss = -rec + kl
    loss.backward()
    optimizer.step()
    if i%25==0:
        with torch.no_grad():
            t_,Y_ = get_minibatch(tmnist, Ymnist_tr, Nsub=5)
            q_z0_mu, q_z0_sig, zt, Xhat = odevae(t_,Y_)
            display.clear_output(wait=True)
            plot_mnist_predictions(Y_, zt, Xhat)
Plotting 5 rotating MNIST sequences (top rows) and corresponding predictions (bottom).
KeyboardInterrupt (the training cell was interrupted manually; a pre-trained model is loaded below)

The following cells import a trained model and then plot the training and test predictions. We first visualize PCA embeddings of the latent trajectories $\mathbf{z}_{0:N}$, where each color corresponds to the embedding of one sequence. Note that a single sample is drawn from the encoder and BNN, i.e., $L=1$. We then visualize five sequences and corresponding predictions.

In [30]:
# load a trained model
state_dict = torch.load('etc/trained_odevae.pkl')
odevae.load_state_dict(state_dict)
odevae.eval();

t_,Y_ = get_minibatch(tmnist, Ymnist_tr, Nsub=5)
q_z0_mu, q_z0_sig, zt, Xhat = odevae(t_,Y_)
plot_mnist_predictions(Y_, zt, Xhat)
Plotting 5 rotating MNIST sequences (top rows) and corresponding predictions (bottom).
In [31]:
t_,Y_ = get_minibatch(tmnist, Ymnist_test, Nsub=5)
q_z0_mu, q_z0_sig, zt, Xhat = odevae(t_,Y_)
plot_mnist_predictions(Y_, zt, Xhat)
Plotting 5 rotating MNIST sequences (top rows) and corresponding predictions (bottom).

15-MIN BREAK¶

My fav online radio: https://radyobozcaada.com/player/index.html

4. ResNets are Discretized ODEs¶

So far, we have examined NODEs from a dynamical systems standpoint. We showed that a NODE is an instance of ODE models in which the differential function is a neural network. Thanks to the universal approximation property of neural networks, NODEs can in principle approximate any ODE system.

Our presentation is orthogonal to that of the original NODE paper, which describes the model starting from Residual Networks (ResNets). ResNet is among the first "very deep" architectures for classification problems. In a nutshell, ResNets consist of layers with skip connections, leading to the following transformation of the hidden state $\mathbf{x}_n$ at layer $n$: $$ \mathbf{x}_{n+1} = \mathbf{x}_n + \mathbf{f}(\mathbf{x}_n;\theta_n),$$ where $\theta_n$ denotes the parameters at layer $n$. As we showed previously, this update equation is equivalent to computing an ODE solution with Euler's method and a fixed time increment $\Delta t = 1$: $$\mathbf{x}_{n+1} = \mathbf{x}_{n} + \Delta t \cdot \mathbf{f}(\mathbf{x}_{n},t_n;\theta).$$ Therefore, we can interpret a ResNet as a rough approximation of a NODE with fixed time increments (see the small check below). In the following, we show how ResNets can be trivially replaced by their ODE counterpart, dubbed "ODE Networks" (ODENets). Since we use adaptive-step ODE solvers, which can evaluate the state at any point in time, ODENets can be interpreted as infinitely deep.
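As a quick numerical check (a sketch with a toy layer, not part of the original notebook), a single residual update with a shared differential function coincides with one Euler step of step size $\Delta t=1$:

# Hypothetical toy check: the residual update x + f(x) equals one Euler step of dx/dt = f(x)
f = nn.Linear(2, 2).to(device)               # toy shared "residual branch"
x = torch.randn(1, 2).to(device)

x_res   = x + f(x)                           # one residual block
x_euler = odeint(lambda t, s: f(s), x,       # one Euler step over [0, 1]
                 torch.tensor([0., 1.]).to(device),
                 method='euler', options={'step_size': 1.0})[-1]
print(torch.allclose(x_res, x_euler))        # True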

[Figure: ResNet vs. ODE network, from the original NODE paper]

Note: ResNets typically have different parameters $\theta_n$ at each layer $n$. A similar parameterization for NODEs can be achieved by explicitly parameterizing the differential function by time $t$ (as the ConcatConv2d layers below do).

4.1. Classification Objective¶

Now we formulate the classification objective. Given a dataset of images and labels $\{(\mathbf{x}_n,\mathbf{y}_n)\}_{n=1}^N$, we apply the following chain of transformations:

  • downsampling (to extract relevant features)
  • feature transformation (implemented by ResNets/ODENets)
  • fully connected layers (to map transformed features into class labels)

\begin{align} \min_\mathbf{w} ~~ \sum_n \texttt{cross_entropy}(\mathbf{y}_n,\hat{\mathbf{y}}_n) \qquad \text{s.t.} \qquad \hat{\mathbf{y}}_n = \mathbf{f}_{\text{fc}}(\mathbf{f}_{\text{trans}}(\mathbf{f}_{\text{down}}(\mathbf{x}_n))). \end{align}

4.2. Implementation¶

We start our implementation with residual networks.

In [66]:
class ResNet(nn.Module):
    def __init__(self, num_blocks, inplanes, planes, stride=1):
        super(ResNet, self).__init__()
        self.blocks = nn.Sequential(*[ResNetBlock(inplanes, planes) for _ in range(num_blocks)])
    
    def forward(self,x):
        return self.blocks(x)

class ResNetBlock(nn.Module):
    def __init__(self, inplanes, planes, stride=1):
        super(ResNetBlock, self).__init__()
        self.net = nn.Sequential(group_norm(inplanes), 
                                nn.ReLU(inplace=True), 
                                conv3x3(inplanes, planes, stride), 
                                group_norm(planes), 
                                nn.ReLU(inplace=True),
                                conv3x3(planes, planes))

    def forward(self, x):
        shortcut = x
        net_out  = self.net(x)
        return net_out + shortcut

Next, we implement the neural ODE block. Similar to the previous section, we only implement the differential function and forward integrate. Notable differences from the time-series fitting example:

  • We are only interested in the final state of the ODE system (intermediate states are not important).
  • The integration time points are arbitrary (here fixed to $[0,1]$).
  • Inside the differential function, we concatenate the states with the current time stamp. This way, we learn a time-dependent and hence more powerful differential function (since the differential function evaluated at two different time points can differ).
In [67]:
class NODE(nn.Module):
    def __init__(self, dim):
        super(NODE, self).__init__()
        self.norm1 = group_norm(dim)
        self.relu  = nn.ReLU(inplace=True)
        self.conv1 = ConcatConv2d(dim, dim, 3, 1, 1)
        self.norm2 = group_norm(dim)
        self.conv2 = ConcatConv2d(dim, dim, 3, 1, 1)
        self.norm3 = group_norm(dim)
        self.integration_time = torch.tensor([0, 1]).float()

    def ode_rhs(self, t, x):
        out = self.norm1(x)
        out = self.relu(out)
        out = self.conv1(t, out)
        out = self.norm2(out)
        out = self.relu(out)
        out = self.conv2(t, out)
        out = self.norm3(out)
        return out

    def forward(self, x, method='dopri5'):
        ''' Forward integrates the NODE system and returns state solutions
            Input
                x   - [N, num_filt, w, c] initial value
            Returns
                out - [N, num_filt, w, c] the final state of the ODE system
        '''
        self.integration_time = self.integration_time.type_as(x)
        # we solve the ODE system with looser tolerances (larger permissible error) for faster computation
        out = odeint(self.ode_rhs, x, self.integration_time, method=method, rtol=1e-3, atol=1e-6)
        return out[-1]

    
class ConcatConv2d(nn.Module):
    ''' Convolutional layers that use current time stamp information '''

    def __init__(self, dim_in, dim_out, ksize=3, stride=1, padding=0, dilation=1, groups=1, bias=True):
        super(ConcatConv2d, self).__init__()
        self._layer = nn.Conv2d(dim_in + 1, dim_out, kernel_size=ksize, stride=stride, padding=padding, 
                                dilation=dilation, groups=groups, bias=bias)

    def forward(self, t, x):
        tt  = torch.ones_like(x[:, :1, :, :]) * t
        ttx = torch.cat([tt, x], 1)  
        return self._layer(ttx)

4.3. Training¶

Next, we create the downsampling, feature transformation and final classification layers.

In [68]:
trans_layer = 'odenet' # can be replaced with 'resnet'
num_filt    = 16

# downsampling
downsampling_layers = [
    nn.Conv2d(1, num_filt, 3, 1),
    group_norm(num_filt),
    nn.ReLU(inplace=True),
    nn.Conv2d(num_filt, num_filt, 4, 2, 1),
    group_norm(num_filt),
    nn.ReLU(inplace=True),
    nn.Conv2d(num_filt, num_filt, 4, 2, 1),
]

# feature transformation
if trans_layer=='odenet':
    feature_layers = NODE(num_filt)
else:
    feature_layers = ResNet(6, num_filt, num_filt)

    
# fully connected layer
fc_layers = [group_norm(num_filt), nn.ReLU(inplace=True), nn.AdaptiveAvgPool2d((1, 1)), Flatten(), nn.Linear(num_filt, 10)]

model = nn.Sequential(*downsampling_layers, feature_layers, *fc_layers).to(device)
print('Number of parameters: {}'.format(count_parameters(model)))
Number of parameters: 13674

We finally load the data and start training.

In [69]:
lr       = 0.1
niters   = 1000
batch_size  = 100
print_every = 10
test_every  = 100

train_loader, test_loader, train_eval_loader = mnist_loaders(batch_size)
data_gen = inf_generator(train_loader)

optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
criterion = nn.CrossEntropyLoss().to(device)

start_time = time.time()

for itr in range(1,niters):
    optimizer.zero_grad()
    x, y = data_gen.__next__()
    x = x.to(device)
    y = y.to(device)
    logits = model(x)
    loss   = criterion(logits, y)
    loss.backward()
    optimizer.step()
    
    # print the train trace
    if itr % print_every == 0:
        end_time = time.time()
        print("Iter {:04d} | Time {:.3f} | loss {:.4f}".format(itr, end_time-start_time, loss.item()))
        start_time = time.time()
        
    # print the test trace
    if itr % test_every == 0:
        with torch.no_grad():
            val_acc   = mnist_accuracy(model, device, test_loader)
            train_acc = mnist_accuracy(model, device, train_eval_loader)
            print("Iter {:04d} | Train Acc {:.4f} | Test Acc {:.4f}".format(itr, train_acc, val_acc))
            start_time = time.time()
Iter 0010 | Time 6.542 | loss 2.3096
Iter 0020 | Time 4.249 | loss 2.2972
Iter 0030 | Time 3.997 | loss 2.3011
Iter 0040 | Time 3.802 | loss 2.2980
Iter 0050 | Time 4.125 | loss 2.2560
Iter 0060 | Time 3.900 | loss 2.2598
Iter 0070 | Time 4.135 | loss 2.2707
Iter 0080 | Time 3.838 | loss 2.2191
Iter 0090 | Time 3.869 | loss 2.2175
Iter 0100 | Time 4.226 | loss 2.1295
Iter 0100 | Train Acc 0.2241 | Test Acc 0.2304
Iter 0110 | Time 4.251 | loss 2.1413
Iter 0120 | Time 4.707 | loss 2.0850
Iter 0130 | Time 4.607 | loss 1.9517
Iter 0140 | Time 4.918 | loss 1.9446
Iter 0150 | Time 4.886 | loss 1.8859
Iter 0160 | Time 5.123 | loss 1.7849
Iter 0170 | Time 4.808 | loss 1.8127
Iter 0180 | Time 3.504 | loss 1.6171
Iter 0190 | Time 5.056 | loss 1.4815
Iter 0200 | Time 4.927 | loss 1.3827
Iter 0200 | Train Acc 0.5300 | Test Acc 0.4915
Iter 0210 | Time 4.652 | loss 1.4271
Iter 0220 | Time 5.089 | loss 1.2138
Iter 0230 | Time 5.435 | loss 1.4058
Iter 0240 | Time 4.788 | loss 1.3113
Iter 0250 | Time 5.520 | loss 1.1210
Iter 0260 | Time 4.495 | loss 1.2944
Iter 0270 | Time 4.125 | loss 1.0757
Iter 0280 | Time 4.546 | loss 0.9276
Iter 0290 | Time 4.931 | loss 0.8121
Iter 0300 | Time 4.986 | loss 0.8438
Iter 0300 | Train Acc 0.7711 | Test Acc 0.7578
Iter 0310 | Time 5.618 | loss 0.6255
Iter 0320 | Time 5.369 | loss 0.6043
Iter 0330 | Time 4.268 | loss 0.9336
Iter 0340 | Time 4.155 | loss 0.6923
Iter 0350 | Time 3.723 | loss 0.5035
Iter 0360 | Time 4.180 | loss 0.5041
Iter 0370 | Time 4.250 | loss 0.5891
Iter 0380 | Time 4.120 | loss 0.5354
Iter 0390 | Time 4.771 | loss 0.4401
Iter 0400 | Time 3.975 | loss 0.3725
Iter 0400 | Train Acc 0.9137 | Test Acc 0.8963
Iter 0410 | Time 5.455 | loss 0.4050
Iter 0420 | Time 5.021 | loss 0.4614
Iter 0430 | Time 5.138 | loss 0.3602
Iter 0440 | Time 5.415 | loss 0.3874
Iter 0450 | Time 4.886 | loss 0.3347
Iter 0460 | Time 5.736 | loss 0.3265
Iter 0470 | Time 5.647 | loss 0.3185
Iter 0480 | Time 5.115 | loss 0.3498
Iter 0490 | Time 5.542 | loss 0.4387
Iter 0500 | Time 5.116 | loss 0.2702
Iter 0500 | Train Acc 0.9478 | Test Acc 0.9344
Iter 0510 | Time 5.081 | loss 0.2277
Iter 0520 | Time 4.992 | loss 0.2838
Iter 0530 | Time 5.295 | loss 0.1582
Iter 0540 | Time 4.998 | loss 0.1799
Iter 0550 | Time 5.261 | loss 0.1326
Iter 0560 | Time 4.771 | loss 0.1627
Iter 0570 | Time 5.035 | loss 0.2268
Iter 0580 | Time 4.707 | loss 0.1679
Iter 0590 | Time 5.040 | loss 0.2313
Iter 0600 | Time 4.861 | loss 0.3063
Iter 0600 | Train Acc 0.9611 | Test Acc 0.9530
Iter 0610 | Time 6.264 | loss 0.1455
Iter 0620 | Time 5.131 | loss 0.0863
Iter 0630 | Time 5.089 | loss 0.3072
Iter 0640 | Time 4.739 | loss 0.2373
Iter 0650 | Time 4.729 | loss 0.2867
Iter 0660 | Time 3.876 | loss 0.1067
Iter 0670 | Time 4.914 | loss 0.2129
Iter 0680 | Time 4.827 | loss 0.1498
Iter 0690 | Time 4.755 | loss 0.1544
Iter 0700 | Time 5.183 | loss 0.1400
Iter 0700 | Train Acc 0.9578 | Test Acc 0.9385
Iter 0710 | Time 4.593 | loss 0.1683
Iter 0720 | Time 4.276 | loss 0.1603
Iter 0730 | Time 4.959 | loss 0.1810
Iter 0740 | Time 5.126 | loss 0.1254
Iter 0750 | Time 5.013 | loss 0.1094
Iter 0760 | Time 4.967 | loss 0.1577
Iter 0770 | Time 4.986 | loss 0.1577
Iter 0780 | Time 4.987 | loss 0.1373
Iter 0790 | Time 5.078 | loss 0.1809
Iter 0800 | Time 4.979 | loss 0.2024
Iter 0800 | Train Acc 0.9681 | Test Acc 0.9556
Iter 0810 | Time 4.789 | loss 0.1155
Iter 0820 | Time 4.343 | loss 0.1994
Iter 0830 | Time 4.870 | loss 0.0731
Iter 0840 | Time 4.715 | loss 0.1313
Iter 0850 | Time 4.789 | loss 0.1689
Iter 0860 | Time 5.046 | loss 0.1357
Iter 0870 | Time 5.221 | loss 0.1015
Iter 0880 | Time 4.790 | loss 0.2174
Iter 0890 | Time 4.182 | loss 0.0859
Iter 0900 | Time 3.657 | loss 0.1786
Iter 0900 | Train Acc 0.9715 | Test Acc 0.9670
Iter 0910 | Time 3.574 | loss 0.1121
Iter 0920 | Time 4.230 | loss 0.1827
Iter 0930 | Time 5.000 | loss 0.2199
Iter 0940 | Time 4.898 | loss 0.0507
Iter 0950 | Time 4.963 | loss 0.0881
Iter 0960 | Time 4.880 | loss 0.1290
Iter 0970 | Time 5.087 | loss 0.1222
Iter 0980 | Time 4.660 | loss 0.1613
Iter 0990 | Time 5.193 | loss 0.0820

4.4. Break: ODE Solver Parameters¶

Our continuous-time classification algorithm relies on solving an intermediate ODE system. To solve the ODE system, we use an adaptive-step ODE solver named dopri5 (Dormand-Prince, also known as RK45). Just like Euler's method, dopri5 takes a finite number of steps to compute the state solutions, but this time the step size $\Delta t$ adaptively changes at every step. Notice that taking small steps (= small $\Delta t$) leads to more accurate solutions, at the expense of taking more steps (= higher execution time).

Adaptive-step solvers control $\Delta t$ based on the local discretization error. As you can see in NODE.forward(), we pass tolerance values (rtol and atol) to the ODE solver. Roughly speaking, these values control "how much error we can live with". In this break, you can study how changing these values affects the execution time and overall performance (please do not forget to re-run all three cells above for testing); a small timing sketch is given below. You can also check out the scipy RK45 function to learn more about the tolerances.
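As a starting point (a sketch that assumes a VDP differential function vdp, an initial value x0, and time points ts from earlier cells are still in memory), the following loop times odeint under different tolerances:

import time

# Hypothetical timing sketch: looser tolerances => fewer solver steps => faster solves
for rtol in [1e-3, 1e-6, 1e-9]:
    tic = time.time()
    with torch.no_grad():
        X = odeint(vdp, x0, ts, method='dopri5', rtol=rtol, atol=rtol*1e-3)
    print('rtol={:.0e}  wall time={:.3f}s'.format(rtol, time.time()-tic))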

5. Continuous-time Normalizing Flows¶

5.1. Normalizing Flows¶

Next, we turn to the second application of NODEs, namely continuous-time normalizing flows (CNFs). Normalizing flows (NFs) build complex distributions by transforming a simple probability distribution through a fixed number of invertible mappings. In the context of NFs, the change-of-variables theorem describes the change in the density of a random variable $z_0$ under a deterministic, invertible transformation $f(z_0)$: $$ z_1 = f(z_0), \qquad \log p(z_1) = \log p(z_0) - \log \left| \text{det} \frac{\partial f}{\partial z_0} \right|. $$

To increase the expressiveness of the transformations, we chain several transformations: $$ z_K = f_{K-1} \circ f_{K-2} \circ \ldots \circ f_0 (z_0), \qquad \log p(z_K) = \log p(z_0) - \sum_{k=0}^{K-1} \log \left| \text{det} \frac{\partial f_k}{\partial z_{k}} \right|. $$

Here, we transform samples $z_0 \sim p(z_0)$ from a simple base distribution (such as standard Gaussian) into a more complex distribution $p(z_K)$. Once the transformations are known, we can compute any expectation $\mathbb{E}_{p(z_K)}[h(z_K)]$ as follows:

$$\mathbb{E}_{p(z_K)}[h(z_K)] = \mathbb{E}_{p(z_0)}[h(f_{K-1} \circ f_{K-2} \circ \ldots \circ f_0 (z_0))]$$
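As a tiny illustration (a sketch, not from the original notebook), for a single affine transformation $z_1 = a z_0 + b$ the formula reduces to $\log p(z_1) = \log p(z_0) - \log|a|$, which we can verify against torch.distributions:

# Hypothetical one-step flow: an affine transform of a standard normal
a, b = 2.0, 1.0
base = torch.distributions.Normal(0., 1.)
z0 = base.sample((5,))
z1 = a*z0 + b
logp_z1 = base.log_prob(z0) - torch.log(torch.tensor(abs(a)))
# analytically z1 ~ N(b, a^2), so we can check against the exact density
print(torch.allclose(logp_z1, torch.distributions.Normal(b, abs(a)).log_prob(z1)))  # True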

5.2. Continuous-time Normalizing Flows¶

Similar to ODENets, CNFs are the continuous-time counterpart of standard NFs. That is, a simple base distribution $p(z(t_0))$ is transformed into a more complex $p(z(t_1))$ via an ODE flow:

[Figure: a discrete normalizing flow vs. a continuous-time normalizing flow]

Since we replaced the finite set of transformations with an ODE, the above formula for the change in density no longer applies. Here, the so-called instantaneous change of variables theorem kicks in. Given a random variable $z(t)$ whose density $p(z(t))$ depends on time, the change in log density due to a continuous-time transformation $\frac{dz}{dt} = f(z(t),t)$ has the following expression: $$ \frac{\partial \log p(z(t))}{\partial t} = -\text{tr} \left( \frac{df}{dz(t)} \right), $$ where $\text{tr}$ refers to the trace operator. Then we have

$$ \log p(z(t_1)) = \log p(z(t_0)) - \int_{t_0}^{t_1} \text{tr} \left( \frac{df}{dz(\tau)} \right) d\tau $$

for the following ODE system:

$$ z(t_1) = z(t_0) + \int_{t_0}^{t_1} f(z(\tau),\tau)~d\tau$$

Below is the implementation of the trace operator:

In [70]:
def trace_df_dz(f, z):
    """Calculates the trace of the Jacobian df/dz.
    Stolen from: https://github.com/rtqichen/ffjord/blob/master/lib/layers/odefunc.py#L13
    Input:
        f - function output [N,d]
        z - current state [N,d]
    Returns:
        tr(df/dz) - [N]
    """
    sum_diag = 0.
    for i in range(z.shape[1]):
        sum_diag += torch.autograd.grad(f[:, i].sum(), z, create_graph=True)[0].contiguous()[:, i].contiguous()
    return sum_diag.contiguous()

5.3. Implementation¶

Now we implement the CNF. Our implementation closely resembles the previous NODE implementation. However, this time we concurrently compute the ODE state solutions and the log-density change, both of which are needed to optimize the flow. Consequently, the differential function takes the current state and density as input and computes both time derivatives (the differential function plus the trace term).

In our implementation, we consider the following time-dependent flow: $$ \frac{dz(t)}{dt} = f(t,z(t)) = U_t ~\text{h}(W_t z(t) + b_t),$$ where h is a non-linear function and the parameters $(U_t, W_t, b_t)$ are given by a neural network. Please see hyper_net.py for the implementation.

Remark-1: Any parameterized function can replace self.f below. We choose to use a HyperNetwork as in the original NODE GitHub repo; a simpler alternative is sketched below.
Remark-2: Unlike NFs, CNFs do not require $f$ to be bijective, since we can integrate the ODE system backward in time.
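For instance, a plain time-conditioned MLP (a hypothetical alternative, not the HyperNetwork used below) would be an equally valid self.f:

class SimpleFlowRHS(nn.Module):
    ''' Hypothetical alternative differential function for the CNF:
        a plain MLP applied to the concatenation [z, t]. '''
    def __init__(self, dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(dim + 1, hidden), nn.Tanh(),
                                 nn.Linear(hidden, dim))
    def forward(self, t, z):
        tt = torch.ones_like(z[:, :1]) * t          # broadcast time to [N,1]
        return self.net(torch.cat([z, tt], dim=1))  # [N,dim]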

In [71]:
from hyper_net import HyperNetwork

class CNF(nn.Module):
    """Adapted from the NumPy implementation at:
    https://gist.github.com/rtqichen/91924063aa4cc95e7ef30b3a5491cc52
    """
    def __init__(self, in_out_dim, hidden_dim, width):
        super().__init__()
        self.f = HyperNetwork(in_out_dim, hidden_dim, width)

    def ode_rhs(self, t, states):
        ''' Differential function implementation. states is (x1,logp_diff_t1) where
                x1 - [N,d] initial values for ODE states
                logp_diff_t1 - [N,1] initial values for density changes
        '''
        z,logp_z = states # [N,d], [N,1]
        N = z.shape[0]
        with torch.set_grad_enabled(True):
            z.requires_grad_(True)
            dz_dt      = self.f(t,z) # [N,d] 
            dlogp_z_dt = -trace_df_dz(dz_dt, z).view(N, 1)
        return (dz_dt, dlogp_z_dt)
    
    def forward(self, ts, z0, logp_diff_t0, method='dopri5'):
        ''' Forward integrates the CNF system. Returns state and density change solutions.
            Input
                ts - [T]   time points
                z0 - [N,d] initial values for ODE states
                logp_diff_t0 - [N,1] initial values for density changes
            Returns:
                zt -     [T,N,...]  state trajectory computed at t
                logp_t - [T,N,1]    density change computed over time
        '''
        zt, logp_t = odeint(self.ode_rhs, (z0, logp_diff_t0), ts, method=method)
        return zt, logp_t 

5.4. Training¶

Next, we visualize the dataset (samples from the target density) and train the model.

In [72]:
# data generation
Ntrain = 10000

def get_batch(num_samples):
    points, _ = make_circles(n_samples=num_samples, noise=0.06, factor=0.5)
    return torch.tensor(points).type(torch.float32).to(device) # N,2
tr_data = get_batch(Ntrain)

plot_cnf_data(tr_data)
In [73]:
# model and flow parameters
hidden_dim = 32
width      = 64
t0 = 0  # flow start time
t1 = 1  # flow end time

# optimization parameters
lr     = 3e-3
niters = 1000
Nsamp  = 100
print_every = 25

# model
cnf  = CNF(in_out_dim=2, hidden_dim=hidden_dim, width=width).to(device)
ts   = torch.tensor([t1, t0]).type(torch.float32).to(device) # for training, we flow the samples backward (in time) 
p_z0 = torch.distributions.MultivariateNormal(
    loc=torch.tensor([0.0, 0.0]).to(device),
    covariance_matrix=torch.tensor([[0.1, 0.0], [0.0, 0.1]]).to(device)
)

optimizer = torch.optim.Adam(cnf.parameters(), lr=lr)
for itr in range(1, niters+1):
    optimizer.zero_grad()

    # get a random sample minibatch
    idx = torch.randperm(Ntrain)[:Nsamp]
    x1  = tr_data[idx] # Nsamp,2
    
    # initialize initial densities
    logp_diff_t1 = torch.zeros(Nsamp, 1).type(torch.float32).to(device)
    
    # compute the backward solutions
    z_t,  logp_diff_t  = cnf(ts, x1, logp_diff_t1) # outputs time first
    z_t0, logp_diff_t0 = z_t[-1], logp_diff_t[-1]
    
    # compute the density of each sample
    logp_x = p_z0.log_prob(z_t0).to(device) - logp_diff_t0.view(-1)
    loss   = -logp_x.mean(0)
    loss.backward()
    optimizer.step()
    
    if itr%print_every==0:
        print('Iter: {}, loss: {:.4f}'.format(itr, loss.item()))

print('Training complete after {} iters.'.format(itr))
Iter: 25, loss: 1.6009
Iter: 50, loss: 1.5427
Iter: 75, loss: 1.5508
Iter: 100, loss: 1.5496
Iter: 125, loss: 1.5721
Iter: 150, loss: 1.5380
Iter: 175, loss: 1.6878
Iter: 200, loss: 1.5414
KeyboardInterrupt (the training cell was interrupted manually; a pre-trained model is loaded below)

Let's visualize how the flow behaves over time:

In [41]:
# load the pre-trained model
state_dict = torch.load('etc/trained_cnf.pkl')
cnf.load_state_dict(state_dict)
cnf.eval()

# samples
viz_samples   = 30000
viz_timesteps = 41
target_sample = get_batch(viz_samples)

# simulate the flow
with torch.no_grad():
    # Generate evolution of samples
    z_t0 = p_z0.sample([viz_samples]).to(device)
    logp_diff_t0 = torch.zeros(viz_samples, 1).type(torch.float32).to(device)

    ts = torch.tensor(np.linspace(t0, t1, viz_timesteps)).to(device)
    z_t_samples, _  = cnf(ts, z_t0, logp_diff_t0)

    # Generate evolution of density
    x = np.linspace(-1.5, 1.5, 100)
    y = np.linspace(-1.5, 1.5, 100)
    points = np.vstack(np.meshgrid(x, y)).reshape([2, -1]).T
    
    z_t1 = torch.tensor(points).type(torch.float32).to(device)
    logp_diff_t1 = torch.zeros(z_t1.shape[0], 1).type(torch.float32).to(device)
    ts = torch.tensor(np.linspace(t1, t0, viz_timesteps)).to(device)
    z_t_density, logp_diff_t = cnf(ts, z_t1, logp_diff_t1)

anim = plot_cnf_animation(target_sample, t0, t1, viz_timesteps, p_z0, z_t1, z_t_samples, z_t_density, logp_diff_t)
display.HTML(anim.to_jshtml())
Out[41]:
(Animation: evolution of the CNF samples and density between the base Gaussian and the target two-circles distribution.)

5.5. Break: Wrap-off¶

This is our last break in this tutorial. Use this time to go through the entire lecture material to get ready for the upcoming Q&A session. Alternatively, you can dive deeper into CNFs, e.g., by changing the differential function, integration length, etc.
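For instance, one quick (purely illustrative) experiment is to integrate the trained flow beyond the training end time t1 and watch how the samples keep deforming. A minimal sketch, reusing cnf, p_z0, t0, t1 and device from above; the factor 2.0 is arbitrary:

In [ ]:
# Extrapolate the trained flow beyond the training horizon t1 (illustrative experiment).
with torch.no_grad():
    z0_demo = p_z0.sample([2000]).to(device)                  # base samples, [2000,2]
    lp0     = torch.zeros(2000, 1, device=device)             # initial density changes
    ts_long = torch.linspace(t0, 2.0 * t1, 50).to(device)     # integrate past t1
    z_traj, _ = cnf(ts_long, z0_demo, lp0)                    # [50,2000,2]
# z_traj[k] holds the samples at time ts_long[k]; plotting a few slices shows the flow.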

6. Related Studies¶

6.1. ODE-RNN [rubanova2019latent]¶

In the figure below, vertical lines mark observation times. Standard RNNs have constant or undefined hidden states between observations. The states of a Neural ODE follow a complex trajectory but are fully determined by the initial state. The ODE-RNN model has hidden states that obey an ODE between observations and are additionally updated at each observation.

(Figure from [rubanova2019latent]: hidden-state trajectories of a standard RNN, a Neural ODE, and ODE-RNN; vertical lines mark observation times.)
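As a rough illustration (not the authors' implementation), an ODE-RNN cell evolves its hidden state with a learned ODE between observation times and applies a standard RNN update at each observation. The sketch below reuses odeint and nn imported above; f_hidden and the GRU cell are hypothetical components:

In [ ]:
# Minimal ODE-RNN sketch: ODE evolution between observations, RNN update at observations.
class ODERNNSketch(nn.Module):
    def __init__(self, obs_dim, hid_dim):
        super().__init__()
        self.f_hidden = nn.Sequential(nn.Linear(hid_dim, 64), nn.Tanh(), nn.Linear(64, hid_dim))
        self.rnn_cell = nn.GRUCell(obs_dim, hid_dim)
    def forward(self, ts, xs):                     # ts: [T], xs: [T,N,obs_dim]
        h, hs = torch.zeros(xs.shape[1], self.rnn_cell.hidden_size, device=xs.device), []
        for i in range(len(ts)):
            if i > 0:                              # evolve h continuously from t_{i-1} to t_i
                h = odeint(lambda t, h: self.f_hidden(h), h, ts[i-1:i+1])[-1]
            h = self.rnn_cell(xs[i], h)            # discrete update at the observation
            hs.append(h)
        return torch.stack(hs)                     # hidden states at all observation times

model = ODERNNSketch(obs_dim=3, hid_dim=16)
hs = model(torch.linspace(0., 1., 5), torch.randn(5, 7, 3))    # [5,7,16]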

6.2. ODE$^2$VAE [yildiz2019deep]¶

The model explicitly decomposes the latent space into position and momentum components and solves a second-order ODE system. The latent ODE dynamics are parameterized by deep Bayesian neural networks.

(Figure from [yildiz2019deep]: ODE$^2$VAE overview with position and momentum latent components.)
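A second-order ODE $\ddot{s} = f_{\text{acc}}(s, \dot{s})$ can be rewritten as a first-order system over a position-velocity pair, which is the form the latent dynamics take. A minimal sketch; the MLP f_acc stands in for the paper's deep Bayesian neural network:

In [ ]:
# Second-order latent dynamics as a first-order ODE over (position s, velocity v).
latent_dim = 4
f_acc = nn.Sequential(nn.Linear(2 * latent_dim, 64), nn.Tanh(), nn.Linear(64, latent_dim))

def second_order_rhs(t, state):                   # state: [N, 2*latent_dim] = concat(s, v)
    s, v = state.chunk(2, dim=-1)
    ds_dt = v                                     # position changes with velocity
    dv_dt = f_acc(state)                          # velocity changes with the learned acceleration
    return torch.cat([ds_dt, dv_dt], dim=-1)

state0 = torch.randn(8, 2 * latent_dim)           # initial (s0, v0), e.g. from an encoder
traj = odeint(second_order_rhs, state0, torch.linspace(0., 1., 10))   # [10, 8, 2*latent_dim]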

6.3. Augmented NODEs [dupont2019augmented]¶

NODE flows are homeomorphisms, so the learned features preserve the topology of the input space. This implies that NODEs can only continuously deform the input space and cannot, for example, tear a connected region apart. The paper introduces Augmented Neural ODEs (ANODEs), which augment the space on which the ODE is solved, allowing the model to use the additional dimensions to learn more complex functions with simpler flows.

(Figures from [dupont2019augmented]: flows learned by NODEs vs. Augmented NODEs.)
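The augmentation itself is a small change: concatenate a few zero-valued dimensions to the input, solve the ODE in the higher-dimensional space, and drop (or further process) the extra coordinates afterwards. A minimal sketch with a hypothetical differential function f_aug:

In [ ]:
# Minimal Augmented NODE sketch: solve the ODE in (d + aug_dim) dimensions.
d, aug_dim = 2, 3
f_aug = nn.Sequential(nn.Linear(d + aug_dim, 64), nn.Tanh(), nn.Linear(64, d + aug_dim))

x     = torch.randn(16, d)                                            # original inputs
x_aug = torch.cat([x, torch.zeros(x.shape[0], aug_dim)], dim=-1)      # append zero dimensions
out   = odeint(lambda t, z: f_aug(z), x_aug, torch.tensor([0., 1.]))[-1]
features = out[:, :d]       # keep the original dimensions (or feed all of them to a classifier)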

6.4. Regularized NODEs [finlay2020train]¶

Adaptive numerical ODE solvers may take very small steps, which in practice leads to dynamics equivalent to many hundreds of layers. Leveraging connections with optimal transport, the paper shows that regularizing the Jacobian norm and the kinetic energy of the differential function leads to simpler vector fields that can be trained much faster while achieving the same accuracy.

(Figure from [finlay2020train]: vector fields learned with and without regularization.)
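Concretely, two penalties are added to the training loss along the trajectory: the kinetic energy $\int \|f(z(t),t)\|^2 \mathrm{d}t$ and a stochastic estimate of $\int \|\partial f/\partial z\|_F^2 \mathrm{d}t$. The sketch below is only illustrative (f_reg, the weights, and resampling the Hutchinson probe at every evaluation are simplifications); it augments the ODE state with running integrals of the two quantities:

In [ ]:
# Illustrative regularizers of [finlay2020train]: accumulate ||f||^2 and an
# estimate of ||df/dz||_F^2 as extra ODE states, then penalize their end values.
f_reg = nn.Sequential(nn.Linear(2, 64), nn.Tanh(), nn.Linear(64, 2))

def regularized_rhs(t, state):
    z, _, _ = state
    with torch.set_grad_enabled(True):
        z  = z.requires_grad_(True)
        dz = f_reg(z)                                               # [N,2]
        e  = torch.randn_like(z)                                    # Hutchinson probe
        eJ = torch.autograd.grad(dz, z, e, create_graph=True)[0]    # vector-Jacobian product e^T (df/dz)
    kinetic   = (dz ** 2).sum(-1, keepdim=True)                     # ||f||^2
    frobenius = (eJ ** 2).sum(-1, keepdim=True)                     # unbiased estimate of ||df/dz||_F^2
    return (dz, kinetic, frobenius)

z0, zeros = torch.randn(32, 2), torch.zeros(32, 1)
zT, kin, frob = odeint(regularized_rhs, (z0, zeros, zeros), torch.tensor([0., 1.]))
reg_loss = 0.01 * kin[-1].mean() + 0.01 * frob[-1].mean()           # added to the task loss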

6.5. ACA [zhuang2020adaptive]¶

The adjoint method suffers from numerical errors in reverse-mode integration (used for gradient computation), as illustrated in the figure below. The paper presents the Adaptive Checkpoint Adjoint (ACA) method: during automatic differentiation, ACA applies a trajectory checkpointing strategy that records the forward trajectory and reuses it as the reverse trajectory, guaranteeing accuracy.

(Figure from [zhuang2020adaptive]: numerical error of the adjoint's reverse-mode integration.)
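For context, torchdiffeq exposes both gradient modes discussed here: odeint backpropagates through the solver's internal operations (exact gradients of the numerical solution, but memory grows with the number of steps), while odeint_adjoint integrates an adjoint ODE backward (constant memory, but the reverse trajectory can deviate from the forward one, which is the error ACA's checkpointing removes). A minimal comparison with a hypothetical differential function:

In [ ]:
# Gradients via backprop-through-the-solver vs. the continuous adjoint.
from torchdiffeq import odeint_adjoint

class DemoRHS(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(2, 32), nn.Tanh(), nn.Linear(32, 2))
    def forward(self, t, z):
        return self.net(z)

f_demo  = DemoRHS()
z0      = torch.randn(8, 2)
ts_demo = torch.tensor([0., 1.])

loss_bp  = odeint(f_demo, z0, ts_demo)[-1].pow(2).sum()            # backprop through solver ops
loss_adj = odeint_adjoint(f_demo, z0, ts_demo)[-1].pow(2).sum()    # continuous adjoint
g_bp  = torch.autograd.grad(loss_bp,  list(f_demo.parameters()))
g_adj = torch.autograd.grad(loss_adj, list(f_demo.parameters()))
# The two estimates agree only up to solver tolerances; ACA instead checkpoints and
# reuses the forward trajectory in the backward pass (not shown here).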

6.6. ODE-RL [yildiz2021continuous]¶

Model-based reinforcement learning (MBRL) approaches typically rely on discrete-time state transition models, whereas physical systems and the vast majority of control tasks operate in continuous time. This work presents a new perspective on RL in which the dynamics are approximated by an ensemble of NODEs (sketched below). The authors also introduce a novel actor-critic algorithm for policy learning that addresses the fact that Q-functions vanish in continuous time.

(Figure from [yildiz2021continuous]: overview of the continuous-time model-based RL framework.)
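To give a flavor of the dynamics model only (the actor-critic part is not sketched, and control inputs are omitted for brevity), one can average the drifts of several independently initialized differential functions; the code below is purely illustrative:

In [ ]:
# Illustrative ensemble of differential functions; the mean drift is integrated and
# the spread across members can serve as an epistemic-uncertainty proxy.
state_dim, E = 3, 5
ensemble = nn.ModuleList([
    nn.Sequential(nn.Linear(state_dim, 64), nn.Tanh(), nn.Linear(64, state_dim)) for _ in range(E)
])

def ensemble_rhs(t, s):                                   # s: [N, state_dim]
    drifts = torch.stack([f(s) for f in ensemble])        # [E, N, state_dim]
    return drifts.mean(0)                                 # average over ensemble members

s0 = torch.randn(4, state_dim)
pred = odeint(ensemble_rhs, s0, torch.linspace(0., 1., 20))   # [20, 4, state_dim]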

6.7. NSDEs [tzen2019neural], [xu2022infinitely]¶

Deep latent Gaussian models combine deterministic transformations of random variables with small independent Gaussian perturbations. [tzen2019neural] shows that, as the number of layers tends to infinity, the limiting latent object is an Itô diffusion that solves a stochastic differential equation (SDE), and develops a variational inference framework for these neural SDEs via stochastic automatic differentiation in Wiener space. Later, [xu2022infinitely] define a stochastic process (an NSDE) on the weights of a BNN, leading to so-called "infinitely deep BNNs".

(Figure from [tzen2019neural] and [xu2022infinitely]: neural SDEs as infinite-depth limits of deep latent Gaussian models and infinitely deep BNNs.)
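To make the limiting object concrete: a neural SDE adds a diffusion term to the NODE drift, $\mathrm{d}z = f(z)\,\mathrm{d}t + g(z)\,\mathrm{d}W_t$. Below is a minimal hand-rolled Euler-Maruyama simulation with hypothetical drift and (diagonal) diffusion networks; dedicated SDE solvers (e.g., from the torchsde package) would normally be used instead:

In [ ]:
# Minimal Euler-Maruyama simulation of a neural SDE dz = f(z) dt + g(z) dW.
d = 2
f_drift = nn.Sequential(nn.Linear(d, 32), nn.Tanh(), nn.Linear(32, d))
g_diff  = nn.Sequential(nn.Linear(d, 32), nn.Tanh(), nn.Linear(32, d), nn.Softplus())

def simulate_nsde(z0, T=1.0, n_steps=100):
    dt = T / n_steps
    z, path = z0, [z0]
    for _ in range(n_steps):
        dW = torch.randn_like(z) * dt ** 0.5              # Brownian increment ~ N(0, dt)
        z  = z + f_drift(z) * dt + g_diff(z) * dW         # Euler-Maruyama step
        path.append(z)
    return torch.stack(path)                              # [n_steps+1, N, d]

paths = simulate_nsde(torch.randn(16, d))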

6.8. GP-ODEs [hegde2022variational]¶

This work proposes a Bayesian nonparametric model that uses Gaussian processes to infer posteriors over unknown ODE systems. The method combines sparse variational inference with decoupled functional sampling to represent vector-field posteriors.

(Figure from [hegde2022variational]: posterior over the ODE vector field inferred with Gaussian processes.)

References¶


[chen2018neural] Chen, R. T. Q., Rubanova, Y., Bettencourt, J., & Duvenaud, D. (2018). Neural ordinary differential equations. NeurIPS.

[rubanova2019latent] Rubanova, Y., Chen, R. T. Q., & Duvenaud, D. (2019). Latent ODEs for irregularly-sampled time series. arXiv preprint arXiv:1907.03907.

[yildiz2019deep] Yildiz, C., Heinonen, M., & Lähdesmäki, H. (2019). ODE$^2$VAE: Deep generative second order ODEs with Bayesian neural networks. NeurIPS.

[tzen2019neural] Tzen, B., & Raginsky, M. (2019). Neural stochastic differential equations: Deep latent Gaussian models in the diffusion limit. arXiv preprint arXiv:1905.09883.

[dupont2019augmented] Dupont, E., Doucet, A., & Teh, Y. W. (2019). Augmented Neural ODEs. NeurIPS.

[finlay2020train] Finlay, C., Jacobsen, J.-H., Nurbekyan, L., & Oberman, A. (2020). How to train your neural ODE: The world of Jacobian and kinetic regularization. ICML.

[zhuang2020adaptive] Zhuang, J., Dvornek, N., Li, X., Tatikonda, S., Papademetris, X., & Duncan, J. (2020). Adaptive checkpoint adjoint method for gradient estimation in neural ODE. ICML.

[yildiz2021continuous] Yildiz, C., Heinonen, M., & Lähdesmäki, H. (2021). Continuous-time model-based reinforcement learning. ICML.

[xu2022infinitely] Xu, W., Chen, R. T. Q., Li, X., & Duvenaud, D. (2022). Infinitely deep Bayesian neural networks with stochastic differential equations. AISTATS.

[hegde2022variational] Hegde, P., Yildiz, C., Lähdesmäki, H., Kaski, S., & Heinonen, M. (2022). Variational multiple shooting for Bayesian ODEs with Gaussian processes. UAI.