import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as datasets
mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=None)
plot_digits(mnist_trainset.data.reshape(-1, 28*28), w=28)  # plot_digits is defined further below
This Jupyter notebook can be found here: https://github.com/cagatayyildiz/pca-ae-vae-diffmodels.
0. References
This presentation contains many ideas, re-phrased sentences and screenshots from the following resources:
- Chris Bishop's Pattern Recognition and Machine Learning book (PCA-1)
- Kevin Murphy's Probabilistic Machine Learning: An Introduction book (PCA-2, AE)
- Python Data Science Handbook (PCA digit example)
- Probabilistic Non-linear Principal Component Analysis with Gaussian Process Latent Variable Models (PCA-3)
$\newcommand{\N}{\mathcal{N}}$ $\newcommand{\L}{\mathcal{L}}$ $\newcommand{\R}{\mathbb{R}}$ $\newcommand{\U}{\mathcal{U}}$
1. Principal component analysis
Imagine a dataset $X \in \R^{N\times D}$ with $N$ data points, each in a $D$-dimensional space. Often, $D$ is so large that visualizing the data points is difficult. Moreover, the data points often contain redundant features that we do not need to visualize. We would therefore like to "summarize" or "explain" the data in a lower-dimensional space.
Let $x_n \in \R^D$ denote the $n$'th data point (row) in our data matrix. Let's assume the data are centered:
$$ \bar{x} = \frac{1}{N} \sum_n x_n = 0$$
We would like to approximate each $x_n$ by a low-dimensional representation, $z_n \in \R^q$. We will start with linear projections of the data points into the lower-dimensional space, which leads to an algorithm called principal component analysis. We describe PCA from three different perspectives.
1.1. Maximum variance perspective
One may describe the goal of dimensionality reduction as maximizing the variance of the $z$'s, as this retains the most information. To begin with, consider the projection onto a one-dimensional space ($q=1$). Let's define the direction of this space by a vector $w_1 \in \R^D$. Of course, we are only interested in the direction, hence $\|w_1\|^2 = w_1^\top w_1 = 1$. Then $w_1$ simply projects each data point $x_n$ onto the point $w_1^\top x_n$. Let's write the variance of the projected points:
\begin{align} \max_{w_1} ~~ \frac{1}{N} \sum_n \left( w_1^\top x_n - w_1^\top \bar{x} \right)^2 &= \frac{1}{N} \sum_n w_1^\top \left( x_n - \bar{x} \right) \left( x_n - \bar{x} \right)^\top w_1 \\ &= w_1^\top \underbrace{\left(\frac{1}{N} \sum_n \left( x_n - \bar{x} \right) \left( x_n - \bar{x} \right)^\top \right)}_{S} w_1 \\ &= w_1^\top S w_1, \end{align}
where $S$ is simply the data covariance matrix. We also have the constraint that $w_1$ is a unit vector (this is needed, otherwise $w_1^\top S w_1 \to \infty$). We add it as a constraint with a Lagrange multiplier $\lambda_1$ and solve the following optimization problem:
\begin{align} \max_{w_1} ~~ &\L = w_1^\top S w_1 + \lambda_1 \left( 1-w_1^\top w_1 \right) \\ \frac{\partial \L}{\partial w_1} &= S w_1 - \lambda_1 w_1 = 0 \\ S w_1 &= \lambda_1 w_1 \quad\Rightarrow\quad w_1^\top S w_1 = \lambda_1, \end{align}
which says that $w_1$ must be an eigenvector of $S$, and $\lambda_1$ is the corresponding eigenvalue. This eigenvector is the first principal component. Other principal components can similarly be found by choosing each new direction to be that which maximizes the projected variance amongst all possible directions orthogonal to those already considered.
Let's generate a simple dataset to illustrate PCA:
\begin{align} w_1, w_2 &\sim S^1 \\ z_n &\sim \N(0,I) \\ \epsilon_n &\sim \N(0,0.1^2) \\ x_n &= Wz_n + \epsilon_n \end{align}
plt.figure(1,(5,5))
N = 100
# x_n = U @ z_n, z_n ~ N(0,I)
U = np.random.rand(2, 2)*2 - 1
U = U / ((U**2).sum(1,keepdims=True)**(0.5))
Z = np.random.randn(2, N)
X = (U @ Z).T # N,2
# X = X + np.random.randn(N,2)/10
plt.scatter(X[:, 0], X[:, 1])
plt.axis('equal')
plt.grid()
X_mean = X.mean(0) # shape (2,), the mean over the N points
Xc = X - X_mean # mean centered data
S = Xc.T @ Xc / N # covar matrix
V,U = np.linalg.eig(S) # eigenvectors are in the columns
# sort
idx = np.argsort(V)[::-1]
V,U = V[idx],U[:,idx]
u1,u2 = U[:,0],U[:,1]
print('Explained variance', V/V.sum())
print('First PC', u1)
# plot Nplot data points
Nplot = 100
plt_idx = np.random.permutation(np.arange(N))[:Nplot]
plt.scatter(X[:, 0], X[:, 1], color='blue', alpha=0.1)
plt.scatter(X[plt_idx, 0], X[plt_idx, 1], color='blue')
# plot the PC
x_grid = np.linspace(X[:,0].min(), X[:,0].max(), 100)
y_grid = u1[1] * x_grid / u1[0]
plt.plot(x_grid, y_grid, 'r', lw=1)
# plot the corresponding embeddings
z1 = X[plt_idx] @ u1
plt.plot(z1*u1[0], z1*u1[1], 'or')
# plot the mappings
for n,idx in enumerate(plt_idx):
plt.plot([X[idx,0],z1[n]*u1[0]], [X[idx,1],z1[n]*u1[1]], '-k', lw=.5)
plt.grid()
plt.axis('equal');
Explained variance [0.97576116 0.02423884]
First PC [-0.8968371 0.47128248]
1.2. Minimum error formulation
Now we approach the problem from another perspective. Our goal is again to obtain $q$-dimensional latent or "hidden" values $z_n \in \R^q$ using a basis $w_1,\ldots,w_q$, where $w_i \in \R^D$. In other words, we wish
\begin{align} x_n \approx \tilde{x}_n = \sum_{i=1}^q z_{i,n} w_i \end{align}
If we collect all basis vectors in a $[D,q]$ matrix $W=\{w_1,\ldots,w_q \}$ and the latents in an $[N,q]$ matrix $Z=\{z_1,\ldots,z_N \}$, our objective becomes
\begin{align} \min ~\L(W,Z) &= \frac{1}{N} \| X - \tilde{X} \|_F^2 = \frac{1}{N} \sum_n \| x_n - Wz_n \|_2^2 \end{align}
We start by considering a single basis $w_1$.
\begin{align} \min ~ \L &= \frac{1}{N} \sum_n (x_n - w_1z_{n1})^\top (x_n - w_1z_{n1}) \\ &= \frac{1}{N} \sum_n x_n^\top x_n - 2x_n^\top w_1 z_{n1} + z_{n1}^\top w_1^\top w_1 z_{n1} \\ &= \frac{1}{N} \sum_n x_n^\top x_n - 2x_n^\top w_1 z_{n1} + z_{n1}^\top z_{n1} \end{align}
We first minimize wrt $z_{n1}$:
\begin{align} \frac{\partial \L}{\partial z_{n1}} &= - 2x_n^\top w_1 + 2z_{n1} = 0 \\ z_{n1} &= x_n^\top w_1 \end{align}
Then we plug this into the loss:
\begin{align} \min_{w_1} ~ \L &= \frac{1}{N} \sum_n x_n^\top x_n - 2z_{n1}^\top z_{n1} + z_{n1}^\top z_{n1} \\ &= \frac{1}{N} \sum_n x_n^\top x_n - z_{n1}^\top z_{n1} \\ &= \text{const} - \frac{1}{N} \sum_n w_1^\top x_n x_n^\top w_1 \\ &= \text{const} - w_1^\top \left( \frac{1}{N} \sum_n x_n x_n^\top \right) w_1 \\ &= \text{const} - w_1^\top S w_1, \end{align}
which is equivalent to the optimization problem above (minimizing $-w_1^\top S w_1$ is the same as maximizing $w_1^\top S w_1$). In a recursive manner, we then solve the following problem:
\begin{align} \min ~ \L &= \frac{1}{N} \sum_n (x_n - w_1z_{n1} - w_2z_{n2})^\top (x_n - w_1z_{n1} - w_2z_{n2}), \end{align}
this time with $X$, $w_1$, and $z_{n1}$ given, as well as the orthogonality constraint $w_1^\top w_2=0$.
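To make the minimum-error view concrete, here is a small numpy sketch (on freshly generated toy data, not the dataset below) that keeps the top-$q$ eigenvectors and checks that the average reconstruction error equals the sum of the discarded eigenvalues:
np.random.seed(0)
N, D, q = 500, 5, 2
X_toy = np.random.randn(N, D) @ np.random.randn(D, D)  # correlated toy data
Xc_toy = X_toy - X_toy.mean(0)                          # center the data

S_toy = Xc_toy.T @ Xc_toy / N                           # covariance matrix
eigvals, eigvecs = np.linalg.eigh(S_toy)                # ascending eigenvalues
eigvals, eigvecs = eigvals[::-1], eigvecs[:, ::-1]      # sort descending

W = eigvecs[:, :q]                 # [D,q] basis of the top-q directions
Z_toy = Xc_toy @ W                 # [N,q] latents, z_n = W^T x_n
X_tilde = Z_toy @ W.T              # reconstructions, x~_n = W z_n

rec_err = ((Xc_toy - X_tilde)**2).sum(1).mean()
print(rec_err, eigvals[q:].sum())  # the two numbers coincide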
Now, let's see a simple higher-dimensional PCA example. We start by loading an image dataset.
digits = load_digits()
X = digits.data
print('Data shape', X.shape)
def plot_digits(data, w=8):
fig, axes = plt.subplots(4, 10, figsize=(10, 4),
subplot_kw={'xticks':[], 'yticks':[]},
gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
ax.imshow(data[i].reshape(w, w),
cmap='binary', interpolation='nearest',
clim=(0, 16))
plot_digits(digits.data)
Data shape (1797, 64)
Next, let's embed the data into the 2D space by PCA.
pca = PCA(2) # project from 64 to 2 dimensions
Z = pca.fit_transform(X)
print('Data shape ', X.shape)
print('Latent shape', Z.shape)
plt.figure(1,(8,6))
plt.scatter(Z[:, 0], Z[:, 1],
c=digits.target, edgecolor='none', alpha=0.5,
cmap=plt.cm.get_cmap('hot', 10))
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.colorbar();
Data shape  (1797, 64)
Latent shape (1797, 2)
pca = PCA().fit(digits.data)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of PCA components')
plt.ylabel('Cumulative explained variance')
plt.grid();
$\newcommand{\d}{\text{d}}$
1.3. Probabilistic PCA
Finally, we express PCA from a probabilistic perspective. We start by considering low dimensional, Gaussian distributed latent variables:
\begin{align} z_n &\sim \N(0,I) \end{align}
As before, we assume that the data points $x_n$ are linear transformations of the hidden variables:
\begin{align} x_n &= Wz_n + \mu + \epsilon_n \end{align}
where $\epsilon_n$ is unobserved noise:
\begin{align} \epsilon_n &\sim \N(0,\sigma^2 I) \end{align}
Since Gaussians are closed under linear operations, we obtain the following likelihood:
\begin{align} x_n | z_n \sim \N(Wz_n + \mu, \sigma^2 I) \end{align}
Note that we are interested in computing the marginal likelihood:
\begin{align} p(x_n) &= \int p(x_n,z_n) \d z_n \\ &= \int p(x_n|z_n) p(z_n) \d z_n \\ &= \int \N(x_n ; Wz_n + \mu, \sigma^2 I) \N(z_n;0,I) \d z_n \\ &= \N(x_n; \mu, WW^\top + \sigma^2 I) \end{align}
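As a quick sanity check of this closed form (a small sketch with arbitrarily chosen $W$, $\mu$, and $\sigma$), we can sample from the generative model and compare the empirical covariance of $x_n$ with $WW^\top + \sigma^2 I$:
np.random.seed(0)
D, q, sigma, N = 3, 2, 0.5, 200000
W_true = np.random.randn(D, q)
mu_true = np.random.randn(D)

Z_s = np.random.randn(N, q)                      # z_n ~ N(0, I)
eps = sigma * np.random.randn(N, D)              # eps_n ~ N(0, sigma^2 I)
X_s = Z_s @ W_true.T + mu_true + eps             # x_n = W z_n + mu + eps_n

print(np.cov(X_s.T, bias=True))                  # empirical covariance
print(W_true @ W_true.T + sigma**2 * np.eye(D))  # WW^T + sigma^2 I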
def eval_density(mean,S):
    # mean - [2]
    # S - [2,2]
    # returns the grid and an unnormalized Gaussian-shaped density - [Ng,Ng]
    Ng = 100
    Xg,Yg = np.meshgrid(np.linspace(0,6,Ng),np.linspace(-2,4,Ng))
    XY = np.stack([Xg.reshape(-1),Yg.reshape(-1)]).T # Ng*Ng,2
    C = ((XY-mean) @ np.linalg.inv(S) * (XY-mean)).sum(-1) # quadratic form per grid point
    lhood = np.exp(-C).reshape(Ng,Ng)
    return Xg,Yg,lhood
W = np.array([1.0,0.1])
mu = np.array([3,1])
S = 0.5 * np.eye(2)
fig,axs = plt.subplots(1,3,figsize=(18,6))
# the prior
xs = np.linspace(-3,3,100)
pr_lhoods = np.exp(-xs**2)
axs[0].plot(xs,pr_lhoods)
axs[0].set_xticks([])
axs[0].set_yticks([])
n,n2 = 70,50
z,z2 = xs[n],xs[n2]
axs[0].plot(z,pr_lhoods[n],'*',markersize=20)
axs[0].plot(z2,pr_lhoods[n2],'r*',markersize=20)
axs[0].legend(['Prior dist.', r'$z^*$'],fontsize=15)
axs[0].set_title('Prior distribution',fontsize=20)
# the likelihood of a latent sample
mean_z = W*z + mu
sig_z = S
Xg_z,Yg_z,lhood_z = eval_density(mean_z,sig_z)
# plot the mean
axs[1].plot(mu[0],mu[1],'r*',ms=20)
# plot w
x_grid = np.linspace(0, 6, 100)
y_grid = W[1] * x_grid / W[0]
y_grid = y_grid - (y_grid[50]-1)
axs[1].plot(x_grid, y_grid, 'r', lw=1)
axs[1].plot(mean_z[0],mean_z[1],'*', color='tab:orange', markersize=20)
axs[1].legend([r'$\mu$', r'$W$', r'$Wz^*+\mu$'],fontsize=18)
axs[1].contour(Xg_z,Yg_z,lhood_z)
axs[1].set_title(r'Conditional $p(x|z^*)$',fontsize=20)
# the marginal
mean_marg = mu
sig_marg = W.reshape(-1,1) @ W.reshape(1,-1) + S # WW^T + sigma^2 I
Xg,Yg,lhood = eval_density(mean_marg,sig_marg)
axs[2].plot(mu[0],mu[1],'r*',ms=20)
axs[2].plot(x_grid, y_grid, 'r', lw=1)
axs[2].contour(Xg,Yg,lhood)
axs[2].set_title(r'Marginal $\int p(x|z)p(z)$d$z$',fontsize=20);
$\newcommand{\tr}{\text{tr}}$
Our objective is to maximize the log of the marginal likelihood wrt the matrix $W$:
\begin{align} \log p(X) &= \sum_n \log p(x_n) \\ &= \sum_n \log \N(x_n ; \mu, WW^\top + \sigma^2 I) \\ &= \text{const} -\frac{N}{2} \left( \log |C| + \tr(C^{-1}S) \right) \end{align}
with
\begin{align} C &= WW^\top + \sigma^2 I \\ S &= \frac{1}{N} \sum_n \left( x_n - \mu \right) \left( x_n - \mu \right)^\top \end{align}
Computing the partial derivative, we have
\begin{align} \frac{\partial \log p(X)}{\partial W} = 0 \quad\Rightarrow\quad SC^{-1}W=W \end{align}
So either of the following must hold:
\begin{align} W &= 0 \\ S &= C \\ W^* &= U (\Lambda - \sigma^2 I)^{1/2} R \end{align}
For the latter case,
- the $q$ column vectors in the $D \times q$ matrix $U$ are the principal eigenvectors of $S$ (those with the largest eigenvalues; solutions built from other eigenvectors are saddle points),
- $\Lambda = \text{diag}(\lambda_1,\ldots,\lambda_q)$ contains the corresponding eigenvalues
- $R$ is an arbitrary $q\times q$ orthogonal rotation matrix.
If there is no noise and we set $R=I$, then we obtain \begin{align} W^* &= U \Lambda^{1/2} \end{align}
Finally, maximizing the marginal likelihood wrt $\sigma$ would give \begin{align} \sigma^2 = \frac{1}{D-q} \sum_{i=1}^{D-q} \lambda_{i+q}, \end{align} which is the average distortion associated with the discarded dimensions.
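These formulas can be turned into a few lines of numpy. Below is a minimal sketch (with $R=I$ and freshly generated toy data) that computes $W^*$ and $\sigma^2$ from the eigendecomposition of $S$; the comparison with sklearn's noise_variance_ is only approximate because sklearn normalizes the covariance by $N-1$ rather than $N$.
np.random.seed(1)
N, D, q = 2000, 6, 2
X_toy = np.random.randn(N, q) @ np.random.randn(q, D) + 0.3*np.random.randn(N, D)

mu_hat = X_toy.mean(0)
S_hat = (X_toy - mu_hat).T @ (X_toy - mu_hat) / N       # sample covariance
lam, U = np.linalg.eigh(S_hat)                          # ascending eigenvalues
lam, U = lam[::-1], U[:, ::-1]                          # sort descending

sigma2 = lam[q:].mean()                                 # average discarded eigenvalue
W_ml = U[:, :q] @ np.diag(np.sqrt(lam[:q] - sigma2))    # W* = U (Lambda - sigma^2 I)^{1/2}

print('sigma^2 (PPCA) ', sigma2)
print('sklearn        ', PCA(n_components=q).fit(X_toy).noise_variance_)  # ~same, up to N vs N-1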
2. From PCA to AE (auto-encoders)
Another interpretation of PCA is as an encode-decode procedure (sketched in code right after the list) in which we
- first encode the data by a linear transformation $$z_n = \texttt{encode}(x_n;W) = W^\top x_n$$
- then decode using the same projection $$\tilde{x}_n = \texttt{decode}(z_n;W) = Wz_n$$
- finally minimize the error $$\|x_n - \tilde{x}_n \|_2^2$$
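Here is a minimal numpy sketch of this encode-decode view, written to mirror the encoder/decoder interface used for the auto-encoders below; the class and its names are ours, introduced only for illustration.
class LinearPCAEncoderDecoder:
    """PCA written in the encode/decode form above (illustrative sketch)."""
    def __init__(self, q):
        self.q = q
    def fit(self, X):
        self.mean = X.mean(0)
        S = (X - self.mean).T @ (X - self.mean) / len(X)
        eigvals, eigvecs = np.linalg.eigh(S)        # ascending eigenvalues
        self.W = eigvecs[:, ::-1][:, :self.q]       # [D,q] top-q eigenvectors
        return self
    def encode(self, X):
        return (X - self.mean) @ self.W             # z_n = W^T (x_n - mean)
    def decode(self, Z):
        return Z @ self.W.T + self.mean             # x~_n = W z_n + mean

np.random.seed(2)
X_toy = np.random.randn(300, 3) @ np.random.randn(3, 10)  # rank-3 toy data
pca_ae = LinearPCAEncoderDecoder(q=3).fit(X_toy)
X_rec = pca_ae.decode(pca_ae.encode(X_toy))
print(np.mean((X_toy - X_rec)**2))                  # ~0: the data is rank-3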
An auto-encoder is obtained by replacing this linear transformation with non-linear mappings (neural networks). As in PCA, the lower dimensional latent representations act as a "bottleneck" layer.
The optimization relies on backpropagation, or simply the chain rule. Say our encoder and decoder consist of several layers:
\begin{align} \texttt{encoder}(x) &= z = f_n(...(f_2(f_1(x)))) \\ \texttt{decoder}(z) &= \tilde{x} = g_n(...(g_2(g_1(z)))) \end{align}
Then, the optimization objective can be written as follows:
\begin{align} \L &= \| x - g_n(...(g_2(g_1(f_n(...(f_2(f_1(x)))))))) \|_2^2 \end{align}
For gradient computation, we write the above expression verbosely:
\begin{align} h_0 &= x \\ h_i &= f_i(h_{i-1};\theta_i), ~~ \forall i=1,...n \\ z &= h_n \\ m_0 &= z \\ m_i &= g_i(m_{i-1};\psi_i), ~~ \forall i=1,...n \\ \tilde{x} &= m_n \\ \L &= \| x - m_n \|_2^2 \\ \frac{\partial \L}{\partial \psi_n} &= \frac{\partial \L}{\partial m_n} \frac{\partial m_n}{\partial \psi_n} = -2(x - m_n) \frac{\partial m_n }{\partial \psi_n} \\ \frac{\partial \L}{\partial \psi_{n-1}} &= \frac{\partial \L}{\partial m_n} \frac{\partial m_n}{\partial m_{n-1}} \frac{\partial m_{n-1}}{\partial \psi_{n-1}} = -2(x - m_n) \frac{\partial g_n(m_{n-1};\psi_n)}{\partial m_{n-1}} \frac{\partial m_{n-1}}{\partial \psi_{n-1}} \\ \end{align}
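As a sanity check of the last gradient expression, here is a small sketch with a single linear encoder layer and a single linear decoder layer (toy weights, not the networks trained below), comparing the hand-derived $-2(x - m_n)\,\partial m_n/\partial \psi_n$ with what autograd computes:
torch.manual_seed(0)
x = torch.randn(5)                             # a single data point, D=5
W_enc = torch.randn(2, 5, requires_grad=True)  # one linear "encoder" layer
W_dec = torch.randn(5, 2, requires_grad=True)  # one linear "decoder" layer

z = W_enc @ x                                  # z = f_1(x)
x_tilde = W_dec @ z                            # m_1 = g_1(z)
loss = ((x - x_tilde)**2).sum()
loss.backward()

# hand-derived gradient of the squared error wrt the decoder weights:
# dL/dW_dec = -2 (x - x_tilde) z^T
manual = -2 * torch.outer((x - x_tilde).detach(), z.detach())
print(torch.allclose(W_dec.grad, manual))      # True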
Now, let's implement an auto-encoder and train it on an image dataset. First, the dataset.
mnist_trainset = datasets.MNIST(root='./data', train=True, download=True, transform=None)
mnist_testset = datasets.MNIST(root='./data', train=False, download=True, transform=None)
X = mnist_trainset.data.reshape(-1,28*28)
print('MNIST dataset shape', X.shape)
plot_digits(X, w=28)
MNIST dataset shape torch.Size([60000, 784])
Two example auto-encoder implementations:
class MLP_AE(nn.Module):
def __init__(self):
super().__init__()
# encoder
self.enc1 = nn.Linear(in_features=784, out_features=256)
self.enc2 = nn.Linear(in_features=256, out_features=128)
self.enc3 = nn.Linear(in_features=128, out_features=64)
self.enc4 = nn.Linear(in_features=64, out_features=32)
self.enc5 = nn.Linear(in_features=32, out_features=16)
# decoder
self.dec1 = nn.Linear(in_features=16, out_features=32)
self.dec2 = nn.Linear(in_features=32, out_features=64)
self.dec3 = nn.Linear(in_features=64, out_features=128)
self.dec4 = nn.Linear(in_features=128, out_features=256)
self.dec5 = nn.Linear(in_features=256, out_features=784)
def forward(self, x):
x = F.relu(self.enc1(x))
x = F.relu(self.enc2(x))
x = F.relu(self.enc3(x))
x = F.relu(self.enc4(x))
x = F.relu(self.enc5(x))
x = F.relu(self.dec1(x))
x = F.relu(self.dec2(x))
x = F.relu(self.dec3(x))
x = F.relu(self.dec4(x))
x = F.relu(self.dec5(x))
return x
class CNN_AE(nn.Module):
def __init__(self, q=10):
super().__init__()
# encoder
self.encoder = nn.Sequential(
nn.Conv2d(1, 8, 3, stride=2, padding=1),
nn.ReLU(True),
nn.Conv2d(8, 16, 3, stride=2, padding=1),
nn.BatchNorm2d(16),
nn.ReLU(True),
nn.Conv2d(16, 32, 3, stride=2, padding=0),
nn.ReLU(True),
nn.Flatten(start_dim=1),
nn.Linear(3 * 3 * 32, 128),
nn.ReLU(True),
nn.Linear(128, q)
)
self.decoder = nn.Sequential(
nn.Linear(q, 128),
nn.ReLU(True),
nn.Linear(128, 3 * 3 * 32),
nn.ReLU(True),
nn.Unflatten(dim=1, unflattened_size=(32, 3, 3)),
nn.ConvTranspose2d(32, 16, 3, stride=2, output_padding=0),
nn.BatchNorm2d(16),
nn.ReLU(True),
nn.ConvTranspose2d(16, 8, 3, stride=2, padding=1, output_padding=1),
nn.BatchNorm2d(8),
nn.ReLU(True),
nn.ConvTranspose2d(8, 1, 3, stride=2, padding=1, output_padding=1)
)
def forward(self, x):
z = self.encoder(x)
x_tilde = self.decoder(z)
return x_tilde,z
ae = CNN_AE()
print(ae)
CNN_AE(
  (encoder): Sequential(
    (0): Conv2d(1, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): ReLU(inplace=True)
    (5): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2))
    (6): ReLU(inplace=True)
    (7): Flatten(start_dim=1, end_dim=-1)
    (8): Linear(in_features=288, out_features=128, bias=True)
    (9): ReLU(inplace=True)
    (10): Linear(in_features=128, out_features=10, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=10, out_features=128, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=128, out_features=288, bias=True)
    (3): ReLU(inplace=True)
    (4): Unflatten(dim=1, unflattened_size=(32, 3, 3))
    (5): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(2, 2))
    (6): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): ReLU(inplace=True)
    (8): ConvTranspose2d(16, 8, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
    (9): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): ConvTranspose2d(8, 1, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
  )
)
Let's finally train the model:
device = torch.device('cpu')
optimizer = torch.optim.Adam(ae.parameters(), lr=5e-3)
X_torch = X.float()  # X is already a torch tensor; just cast to float
N = X_torch.shape[0]
batch_size = 100
for epoch in range(10):
running_loss = 0.0
random_idx = np.random.permutation(np.arange(N))
X_mixed = X_torch[random_idx]
X_mixed = X_mixed.reshape(-1,batch_size,1,28,28)
for i,minibatch in enumerate(X_mixed):
        minibatch = minibatch.to(device)
        optimizer.zero_grad()
predictions = ae(minibatch)[0]
error = (predictions-minibatch)**2
loss = error.mean()
loss.backward()
optimizer.step()
if i%100==0:
print('{:d}-{:.4f}'.format(i,loss))
0-7628.7217 100-5646.8472 200-3003.4219 300-1948.8629 400-1575.8505 500-1530.5507 0-1426.0337 100-1447.8894 200-1293.7833 300-1361.0818 400-1228.8120 500-1261.3850 0-1229.4353 100-1187.4138 200-1221.0444 300-1265.6017 400-1183.0792 500-1159.9485 0-1147.2880 100-1205.6241 200-1135.1644 300-1163.8470 400-1211.2354 500-1103.9478 0-1098.4603 100-1174.6066 200-1068.8463 300-1192.5735 400-1115.0831 500-1018.5432 0-1096.0424 100-994.0508 200-1096.9930 300-1118.5250 400-1076.0570 500-1066.5718 0-1018.2728 100-1058.3573 200-978.0988 300-1051.1539 400-953.9379 500-982.2615 0-991.5519 100-1027.5380 200-965.4827 300-1011.8900 400-959.9594 500-1046.8967 0-927.5007 100-894.6713 200-1034.3939 300-1020.4165 400-1088.9467 500-976.6492 0-952.8253 100-989.9016 200-1000.7675 300-888.2878 400-1060.8571 500-944.5131
plot_digits(minibatch[:40], w=28)
plot_digits(predictions[:40].detach(), w=28)
np.random.seed(123)
Xnoisy = np.random.normal(X, 4)
plot_digits(Xnoisy, 28)
# pca = PCA(0.5).fit(Xnoisy)
# print('Number of components ', pca.n_components_)
# print('Explained variance ', pca.explained_variance_ratio_.sum())
# Z = pca.transform(Xnoisy)
# Xtilde = pca.inverse_transform(Z)
# plot_digits(Xtilde, 28)
Xtilde = ae(torch.tensor(Xnoisy[:40].reshape(40,1,28,28), dtype=torch.float32))[0]
plot_digits(Xtilde.detach(), 28)
3. Variational auto-encoders (VAE)
So far, we have seen that a non-linear version of PCA corresponds to an AE. Let's analyze the same for PPCA by considering the following generative model with a non-linear mapping $f_\phi(\cdot)$ with parameters $\phi$:
\begin{align} z_n &\sim \N(0,I) \\ \epsilon_n &\sim \N(0,\sigma^2 I) \\ x_n | z_n &= f_\phi(z_n) + \epsilon_n \\ &\sim \N(f_\phi(z_n), \sigma^2 I). \end{align}
As before, we are interested in the marginal distribution:
\begin{align} p(x_n) &= \int p(x_n,z_n) \d z_n \\ &= \int p(x_n|z_n) p(z_n) \d z_n \\ &= \int \N(x_n; f_\phi(z_n), \sigma^2 I) \N(z_n;0,I) \d z_n, \end{align} where this time the integral cannot be computed in closed form due to the non-linear mapping $f_\phi(z_n)$. One can, however, view the above expression as an infinite weighted sum (i.e., an infinite mixture) of Gaussians with different means, where the weights are $p(z_n)$ and the means are $f_\phi(z_n)$.
Discrete mixture of Gaussians:
Continuous mixture of Gaussians:
def eval_density(means,sig):
# means - [N,2]
# returns - [Ng,Ng]
if means.ndim==1:
means = means.reshape([1,-1])
Ng = 100
Xg,Yg = np.meshgrid(np.linspace(-5,5,Ng),np.linspace(-5,5,Ng))
XYg = np.stack([Xg.reshape(-1),Yg.reshape(-1)]).T
euc_dist = ((np.expand_dims(means,0) - np.expand_dims(XYg,1))**2 / sig**2).sum(-1) # Ngrid,N
lhood = np.exp(-euc_dist).mean(-1).reshape(Ng,Ng)
return lhood
sigmoid = lambda x : 1/(1+np.exp(-x/1)) + 1/(1+np.exp(-x/2)) + 1/(1+np.exp(-x/3))
def f(z):
theta = np.pi*sigmoid(z)
return 3*np.stack([np.sin(theta),np.cos(theta)]).T # N,2
sig = 1.0
fig,axs = plt.subplots(1,3,figsize=(18,6))
# the prior
xs = np.linspace(-3,3,100)
pr_lhoods = np.exp(-xs**2)
axs[0].plot(xs,pr_lhoods)
axs[0].set_xticks([])
axs[0].set_yticks([])
n = 60
z = xs[n]
axs[0].plot(z,pr_lhoods[n],'*',markersize=20)
axs[0].legend(['Prior dist.', r'$z^*$'],fontsize=15)
axs[0].set_title('Prior distribution',fontsize=20)
# the likelihood of a latent sample
mean_z = f(z)
lhood_z = eval_density(mean_z,sig)
axs[1].imshow(lhood_z)
axs[1].set_xticks([])
axs[1].set_yticks([])
axs[1].set_title(r'Conditional $p(x|z^*)$',fontsize=20)
# the marginal
zs = np.random.randn(500)
means = f(zs)
lhood = eval_density(means,sig)
axs[2].imshow(lhood)
axs[2].set_xticks([])
axs[2].set_yticks([])
axs[2].set_title(r'Marginal $\int p(x|z)p(z)$d$z$',fontsize=20);
$\newcommand{\E}{\mathbb{E}}$
3.1. Evidence lower bound (ELBO)
Next, we define a lower bound on the marginal log likelihood. This is going to be a function that is always less than or equal to the log likelihood. We hope that maximizing the lower bound with respect to the parameters of $f$ would also increase the marginal likelihood. To define this lower bound, we need Jensen’s inequality.
Jensen's inequality
Jensen’s inequality simply says that a concave function $g(\cdot)$ of the expectation of a random variable $x$ is greater than or equal to the expectation of the function $g(x)$ of the variable: $$ g(\E{x}) \geq \E{g(x)}. $$
In our case, this concave function is $\log$: $$ \log(\E{x}) \geq \E{\log(x)}.$$
If we write everything explicitly, then we have $$ \log \left( \int x p(x) \d x \right) \geq \int \log x ~p(x) \d x. $$
If we consider a deterministic transformation $h(x)$ of $x$, the following also holds: $$ \log \left( \int h(x) p(x) \d x \right) \geq \int \log h(x) ~p(x) \d x. $$ This is because $h(x)$ is simply another random variable; Jensen's inequality holds for any random variable, so it applies to $h(x)$ as well.
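A quick numerical illustration of this last inequality (a sketch with $p(x)=\N(0,1)$ and an arbitrary positive function $h$, both chosen just for the example), estimated by Monte Carlo:
np.random.seed(0)
x_samples = np.random.randn(100000)       # samples from p(x) = N(0,1)
h = lambda t: np.exp(-t) + 0.5            # any positive function of x

print(np.log(np.mean(h(x_samples))))      # log E[h(x)]
print(np.mean(np.log(h(x_samples))))      # E[log h(x)]  -- smaller, as Jensen predicts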
To use Jensen's inequality, we introduce a new distribution $q_\theta(z_n)$ over the latents with parameters $\theta$. Then we re-write the marginal likelihood:
\begin{align} \log p(x_n) &= \log \int p(x_n,z_n) \d z_n \\ &= \log \int \frac{q_\theta(z_n)}{q_\theta(z_n)} p(x_n,z_n) \d z_n \\ &= \log \int q_\theta(z_n) \frac{p(x_n,z_n)}{q_\theta(z_n)} \d z_n \\ &\geq \int q_\theta(z_n) \log\frac{p(x_n,z_n)}{q_\theta(z_n)} \d z_n \\ \end{align}
Next, we first show that the bound is tight if $q_\theta(z_n)$ matches the posterior:
\begin{align} \int q_\theta(z_n) \log\frac{p(x_n,z_n)}{q_\theta(z_n)} \d z_n &= \int q_\theta(z_n) \log\frac{p(z_n|x_n)p(x_n)}{q_\theta(z_n)} \d z_n \\ &= \int q_\theta(z_n) \log p(x_n) \d z_n + \int q_\theta(z_n) \log\frac{p(z_n|x_n)}{q_\theta(z_n)} \d z_n \\ &= \log p(x_n) - \texttt{KL}(q_\theta(z_n) \| p(z_n|x_n)). \end{align}
Finally, we consider the other factorization of the joint $p(z_n,x_n)$:
\begin{align} \int q_\theta(z_n) \log\frac{p(x_n,z_n)}{q_\theta(z_n)} \d z_n &= \int q_\theta(z_n) \log\frac{p(x_n|z_n)p(z_n)}{q_\theta(z_n)} \d z_n \\ &= \int q_\theta(z_n) \log p(x_n|z_n) \d z_n + \int q_\theta(z_n) \log\frac{p(z_n)}{q_\theta(z_n)} \d z_n \\ &= \E[q_\theta(z_n)]{\log p(x_n|z_n)} - \texttt{KL}(q_\theta(z_n) \| p(z_n)). \end{align}
Note that our goal is to maximize this expression. Hence, the first, "reconstruction" term encourages $q_\theta$ distributions that lead to a high (expected) likelihood $p(x_n|z_n)$. The second, KL term (which enters with a minus sign, so we effectively minimize it) encourages $q_\theta$ to stay simple (close to the prior); hence it acts as a regularizer.
Specifying the variational distribution
In the VAE algorithm, $q_\theta(z_n)$ is parameterized by neural networks as a function of the input data $x_n$:
$$ q_\theta(z_n | x_n) = \N \left( \texttt{NN}_1(x_n), \texttt{NN}_2(x_n) \right) $$
In practice, the second (KL) term is available in closed form when both $q_\theta(z_n|x_n)$ and $p(z_n)$ are Gaussians. However, the first integral is intractable since $p(x_n|z_n)$ depends on $z_n$ through the non-linear decoder network. Since it is an expectation, we can always approximate it by sampling:
\begin{align} \E[q_\theta(z_n|x_n)]{\log p(x_n|z_n)} \approx \frac{1}{L} \sum_{\ell=1}^L \log p(x_n|z_n^{(\ell)}), \quad \text{where} \quad z_n^{(\ell)} \sim q_\theta(z_n|x_n), \end{align} which is also known as Monte Carlo averaging.
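For concreteness, here is a small sketch of both ELBO terms for a diagonal Gaussian $q_\theta(z_n|x_n)$: the KL term in closed form (checked against torch.distributions), and a Monte Carlo estimate of the reconstruction term. The unit-variance Gaussian likelihood and the made-up encoder outputs are assumptions for the example only.
from torch.distributions import Normal, kl_divergence

torch.manual_seed(0)
x_n = torch.randn(784)                                 # one data point
mu_z, sig_z = torch.randn(16), torch.rand(16) + 0.1    # made-up q(z_n|x_n) parameters
decoder = nn.Linear(16, 784)                           # stand-in for the decoder network

# closed-form KL( N(mu_z, diag(sig_z^2)) || N(0, I) )
kl = 0.5 * (sig_z**2 + mu_z**2 - 1 - torch.log(sig_z**2)).sum()
print(kl, kl_divergence(Normal(mu_z, sig_z), Normal(0., 1.)).sum())  # the two agree

# Monte Carlo estimate of E_q[log p(x_n|z_n)], assuming p(x_n|z_n) = N(x_n; decoder(z_n), I)
L = 8
z_samples = Normal(mu_z, sig_z).sample((L,))           # L samples from q, shape [L,16]
log_lik = Normal(decoder(z_samples), 1.).log_prob(x_n).sum(-1)  # [L]
print(log_lik.mean())                                  # Monte Carlo average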
Reparameterization trick
Finally, we address a rather subtle difficulty: we need the partial derivative $\frac{\partial \log p(x_n|z_n^{(\ell)})}{\partial \theta}$ of the log-likelihood with respect to $\theta$, which requires differentiating through the samples $z_n^{(\ell)}$ drawn from the Gaussian $q_\theta(z_n|x_n)$. Yet we have not specified how to compute partial derivatives of samples from a Gaussian distribution. For this, we use the reparameterization trick, which simply says that the following two expressions are equivalent:
\begin{align} \E[q_\theta(z_n|x_n)]{\log p(x_n|z_n)} &= \E[\epsilon \sim \N(0,I)]{\log p(x_n|\mu_{z_n}+\Sigma_{z_n}^{1/2}\epsilon)} \\ \E[\N \left( \texttt{NN}_1(x_n), \texttt{NN}_2(x_n) \right)]{\log p(x_n|z_n)} &= \E[\epsilon \sim \N(0,I)]{\log p(x_n|\texttt{NN}_1(x_n)+\texttt{NN}_2(x_n)^{1/2}\epsilon)} \\ \end{align}
or, more explicitly,
\begin{align} \log p(x_n|z_n^{(\ell)}), &\quad z_n^{(\ell)} \sim \N \left( \texttt{NN}_1(x_n), \texttt{NN}_2(x_n) \right) \\ \log p(x_n|z_n^{(\ell)}), &\quad z_n^{(\ell)} = \texttt{NN}_1(x_n) + (\texttt{NN}_2(x_n))^{1/2} \epsilon , \quad \epsilon \sim \N(0,I). \end{align}
That is, we simply reparameterize the distribution.
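A minimal sketch of why this matters for gradients (using torch.distributions, where sample() corresponds to drawing $z$ directly and rsample() to the reparameterized version):
from torch.distributions import Normal

mu_q = torch.tensor([0.5], requires_grad=True)
sig_q = torch.tensor([2.0], requires_grad=True)
q_z = Normal(mu_q, sig_q)

z_plain = q_z.sample()            # drawn "directly": not connected to mu_q, sig_q
print(z_plain.requires_grad)      # False -> no gradient path back to the parameters

z_reparam = q_z.rsample()         # reparameterized: z = mu_q + sig_q * eps internally
z_reparam.sum().backward()
print(mu_q.grad, sig_q.grad)      # gradients flow back to mu_q and sig_q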
VAE algorithm
Overall, we obtain the following algorithm:
\begin{align} \text{Input:} ~~ &x \\ \mu_{z} &= \texttt{NN}_1(x) \\ \Sigma_{z} &= \texttt{NN}_2(x) \\ \epsilon &\sim \N(0,I) \\ z &= \mu_{z}+\Sigma_{z}^{1/2}\epsilon \\ \text{Output:} ~~\hat{x} &= \texttt{NN}_3(z) \\ \max ~~& \log p(x|\hat{x}) - \texttt{KL}(\N(\mu_{z},\Sigma_{z}) \| \N(0,I)) \end{align}
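Putting the pieces together, here is a minimal PyTorch sketch of this algorithm for flattened MNIST-sized inputs. It is an illustrative MLP version with a unit-variance Gaussian likelihood; the layer sizes, latent dimension, and the single training step on random data are choices made only for the example.
class VAE(nn.Module):
    def __init__(self, D=784, q=16, H=256):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(D, H), nn.ReLU(), nn.Linear(H, 2*q))  # outputs [mu_z, log Sigma_z]
        self.dec = nn.Sequential(nn.Linear(q, H), nn.ReLU(), nn.Linear(H, D))    # NN_3

    def forward(self, x):
        mu_z, logvar_z = self.enc(x).chunk(2, dim=-1)   # NN_1(x) and log of NN_2(x)
        eps = torch.randn_like(mu_z)                    # eps ~ N(0, I)
        z = mu_z + torch.exp(0.5*logvar_z) * eps        # reparameterization: mu_z + Sigma_z^{1/2} eps
        x_hat = self.dec(z)                             # NN_3(z)
        # ELBO with p(x|z) = N(x; x_hat, I), up to an additive constant
        rec = -0.5 * ((x - x_hat)**2).sum(-1)
        kl = 0.5 * (logvar_z.exp() + mu_z**2 - 1 - logvar_z).sum(-1)
        return (rec - kl).mean()                        # average ELBO over the batch

# one training step on a random batch of flattened 28x28 "images"
vae = VAE()
vae_optim = torch.optim.Adam(vae.parameters(), lr=1e-3)
x_batch = torch.rand(100, 784)
loss = -vae(x_batch)             # maximize the ELBO = minimize its negative
vae_optim.zero_grad(); loss.backward(); vae_optim.step()
print(float(loss))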