This notebook presents some experimentation I did in 2018 with Convolutional Autoencoders.
All the source code of the experiments (working and broken) is available in the Github project.
There is not much more text in this notebook except for a few words at the end, as the source code and comments should be enough to explain how and why things work.
import torch
import torchvision
from torch import nn, optim
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms, utils
from torchvision import datasets
from torchvision.utils import save_image
# import skimage
import math
# import io
# import requests
# from PIL import Image
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
import sys
import os
class CAEEncoder(nn.Module):
"""
The Encoder = Q(z|X) for the Network
"""
def __init__(self, w,h, channels=3, hid_dim=500, code_dim=200, kernel_size=3, first_feature_count=16):
super(CAEEncoder, self).__init__()
self.indices = []
padding = math.floor(kernel_size/2)
l1_feat = first_feature_count
l2_feat = l1_feat * 2
self.layer1 = nn.Sequential(
nn.Conv2d(channels, l1_feat, kernel_size=kernel_size, padding=padding),
# nn.BatchNorm2d(l1_feat),
nn.ReLU(),
# nn.Conv2d(l1_feat, l1_feat, kernel_size=kernel_size, padding=padding),
# # nn.BatchNorm2d(l1_feat),
# nn.ReLU(),
# nn.Conv2d(l1_feat, l1_feat, kernel_size=kernel_size, padding=padding),
# # nn.BatchNorm2d(l1_feat),
# nn.ReLU(),
nn.Conv2d(l1_feat, l1_feat, kernel_size=kernel_size, padding=padding),
# nn.BatchNorm2d(l1_feat),
nn.ReLU(),
torch.nn.MaxPool2d(2, stride=2, return_indices=True)
)
self.layer2 = nn.Sequential(
nn.Conv2d(l1_feat, l2_feat, kernel_size=kernel_size, padding=padding),
# nn.BatchNorm2d(l2_feat),
nn.ReLU(),
# nn.Conv2d(l2_feat, l2_feat, kernel_size=kernel_size, padding=padding),
# # nn.BatchNorm2d(l2_feat),
# nn.ReLU(),
# nn.Conv2d(l2_feat, l2_feat, kernel_size=kernel_size, padding=padding),
# # nn.BatchNorm2d(l2_feat),
# nn.ReLU(),
nn.Conv2d(l2_feat, l2_feat, kernel_size=kernel_size, padding=padding),
# nn.BatchNorm2d(l2_feat),
nn.ReLU(),
torch.nn.MaxPool2d(2, stride=2, return_indices=True)
)
self.conv_dim = int(((w*h)/16) * l2_feat)
#self.conv_dim = int( channels * (w/4) * l2_feat)
self.fc1 = nn.Linear(self.conv_dim, hid_dim)
self.fc2 = nn.Linear(hid_dim, code_dim)
# self.fc1 = nn.Linear(576, hid_dim)
def get_conv_layer_indices(self):
return [0, 2, 5, 7, 10] # without BatchNorm2d
#return [0, 3, 7, 10, 14] # with BatchNorm2d
def forward(self, x):
self.indices = []
# print("encoding conv l1")
out, idx = self.layer1(x)
self.indices.append(idx)
# print("encoding conv l2")
out, idx = self.layer2(out)
self.indices.append(idx)
# print(out.size(), self.conv_dim)
# print("view for FC l1")
out = out.view(out.size(0), -1)
# print(out.size())
# print("encoding FC1 ")
out = self.fc1(out)
# print("encoding FC2 ")
out = self.fc2(out)
return out
class CAEDecoder(torch.nn.Module):
"""
The Decoder = P(X|z) for the Network
"""
def __init__(self, encoder, width, height, channels=3, hid_dim=500, code_dim=200, kernel_size=3, first_feature_count=16):
super(CAEDecoder, self).__init__()
padding = math.floor(kernel_size/2)
# self. width = width
# self.height = height
# self.channels = channels
self.encoder = encoder
self.w_conv_dim = int(width/4)
self.h_conv_dim = int(height/4)
self.l1_feat = first_feature_count
self.l2_feat = self.l1_feat * 2
self.conv_dim = int(((width*height)/16) * self.l2_feat)
#self.conv_dim = int(channels * (width/4) * self.l2_feat)
self.layer1 = torch.nn.Linear(code_dim, hid_dim)
self.layer2 = torch.nn.Linear(hid_dim, self.conv_dim)
self.unpool_1 = nn.MaxUnpool2d(2, stride=2)
self.deconv_layer_1 = torch.nn.Sequential(
nn.ConvTranspose2d(self.l2_feat, self.l2_feat, kernel_size=kernel_size, padding=padding),
nn.ReLU(),
# nn.ConvTranspose2d(self.l2_feat, self.l2_feat, kernel_size=kernel_size, padding=padding),
# nn.ReLU(),
# nn.ConvTranspose2d(self.l2_feat, self.l2_feat, kernel_size=kernel_size, padding=padding),
# nn.ReLU(),
nn.ConvTranspose2d(self.l2_feat, self.l1_feat, kernel_size=kernel_size, padding=padding),
nn.ReLU()
)
self.unpool_2 = nn.MaxUnpool2d(2, stride=2)
self.deconv_layer_2 = torch.nn.Sequential(
nn.ConvTranspose2d(self.l1_feat, self.l1_feat, kernel_size=kernel_size, padding=padding),
nn.ReLU(),
# nn.ConvTranspose2d(self.l1_feat, self.l1_feat, kernel_size=kernel_size, padding=padding),
# nn.ReLU(),
# nn.ConvTranspose2d(self.l1_feat, self.l1_feat, kernel_size=kernel_size, padding=padding),
# nn.ReLU(),
nn.ConvTranspose2d(self.l1_feat, channels, kernel_size=kernel_size, padding=padding),
nn.Tanh()
)
def forward(self, x):
out = x
# print("decoding l1")
out = F.relu(self.layer1(x))
# print("decoding l2")
out = F.relu(self.layer2(out))
# print(out.size(), self.conv_dim)
# print("changing tensor shape to be an image")
out = out.view(out.size(0), self.l2_feat, self.w_conv_dim, self.h_conv_dim)
out = self.unpool_1(out, self.encoder.indices[-1])
# print(out.size())
# print("decoding c1")
out = self.deconv_layer_1(out)
# print("decoding c2")
out = self.unpool_2(out, self.encoder.indices[-2])
out = self.deconv_layer_2(out)
# print("returning decoder response")
return out
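The decoder keeps a reference to the encoder only to reuse the pooling indices stored during the forward pass: nn.MaxUnpool2d needs the indices recorded by the matching nn.MaxPool2d in order to put each value back where it was pooled from. A tiny standalone illustration of that pairing (my own sketch, not part of the original experiments):
# Minimal illustration of the MaxPool2d / MaxUnpool2d pairing used by the CAE:
# the unpool layer needs the indices recorded by the pool layer.
pool = nn.MaxPool2d(2, stride=2, return_indices=True)
unpool = nn.MaxUnpool2d(2, stride=2)
x = torch.randn(1, 1, 4, 4)
pooled, indices = pool(x)           # 4x4 -> 2x2, remembering where each max came from
restored = unpool(pooled, indices)  # 2x2 -> 4x4, zeros everywhere except the max positions
print(restored.size())              # torch.Size([1, 1, 4, 4])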
class CAE(nn.Module):
def __init__(self, width, height, channels, hid_dim=500, code_dim=200, conv_layer_feat=16):
super(CAE, self).__init__()
self.width = width
self.height = height
self.channels = channels
self.encoder = CAEEncoder(width, height, channels, hid_dim, code_dim, 3, conv_layer_feat)
self.decoder = CAEDecoder(self.encoder, width, height, channels, hid_dim, code_dim, 3, conv_layer_feat)
def forward(self, x):
out = self.encoder(x)
out = self.decoder(out)
return out
def save_model(self, name, path):
torch.save(self.encoder, os.path.join(path, "cae_encoder_"+name+".pth"))
torch.save(self.decoder, os.path.join(path, "cae_decoder_"+name+".pth"))
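As a quick sanity check of the dimensions (a minimal sketch of mine, not part of the original notebook): for a 12x12 input the two pooling stages reduce the feature maps to 6x6 and then 3x3, so with 64 feature maps the flattened size is 3*3*64 = 576, which is exactly what fc1 expects. The names sanity_model and dummy below are illustrative only.
# Hypothetical shape check: push a random batch of 12x12 RGB patches through the
# autoencoder and confirm the code and reconstruction sizes.
sanity_model = CAE(12, 12, 3, hid_dim=500, code_dim=200, conv_layer_feat=32)
dummy = torch.randn(4, 3, 12, 12)    # 4 fake 12x12 RGB patches
code = sanity_model.encoder(dummy)   # 12x12 -> 6x6 -> 3x3, flattened to 3*3*64 = 576
print(code.size())                   # torch.Size([4, 200])
recon = sanity_model.decoder(code)   # unpooled twice back up to 12x12
print(recon.size())                  # torch.Size([4, 3, 12, 12])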
#definitions of the operations for the full image autoencoder
normalize = transforms.Normalize(
mean=[0.485, 0.456, 0.406], # from example here https://github.com/pytorch/examples/blob/409a7262dcfa7906a92aeac25ee7d413baa88b67/imagenet/main.py#L94-L95
std=[0.229, 0.224, 0.225]
# mean=[0.5, 0.5, 0.5], # from example here http://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# std=[0.5, 0.5, 0.5]
)
#the whole image gets resized to a small image that can be quickly analyzed to get important points
def fullimage_preprocess(w=48,h=48):
return transforms.Compose([
transforms.Resize((w,h)), #this should be used ONLY if the image is bigger than this size
transforms.ToTensor(),
normalize
])
#the full-resolution fovea is just a small 12x12 patch
full_resolution_crop = transforms.Compose([
transforms.RandomCrop(12),
transforms.ToTensor(),
normalize
])
def downsampleTensor(crop_size, final_size=16):
sample = transforms.Compose([
transforms.RandomCrop(crop_size),
transforms.Resize(final_size),
transforms.ToTensor(),
normalize
])
return sample
def get_loaders(batch_size, transformation, dataset = datasets.CIFAR100, cuda=True):
kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
train_loader = torch.utils.data.DataLoader(
dataset('../data', train=True, download=True,
transform=transformation),
batch_size=batch_size, shuffle=True, **kwargs)
test_loader = torch.utils.data.DataLoader(
dataset('../data', train=False, transform=transformation),
batch_size=batch_size, shuffle=True, **kwargs)
return train_loader, test_loader
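downsampleTensor is not used in the run below, but it is meant to produce the low-resolution peripheral view; a hypothetical way to wire it into the same pipeline (the variable names and the 32/16 sizes are only illustrative, and get_loaders defaults to CIFAR-100) would be:
# Hypothetical usage: crop a 32x32 region, downsample it to 16x16, and build
# CIFAR-100 loaders for this low-resolution "peripheral" view of the image.
periphery_transform = downsampleTensor(crop_size=32, final_size=16)
periphery_train, periphery_test = get_loaders(128, periphery_transform)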
# Hyper Parameters
# num_epochs = 5
# batch_size = 100
# learning_rate = 0.001
num_epochs = 100
batch_size = 128
learning_rate = 0.0001
model = CAE(12,12,3,500,200,32).cuda()
criterion = nn.MSELoss()
#criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
model.parameters
<bound method Module.parameters of CAE(
  (encoder): CAEEncoder(
    (layer1): Sequential(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (layer2): Sequential(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (fc1): Linear(in_features=576, out_features=500, bias=True)
    (fc2): Linear(in_features=500, out_features=200, bias=True)
  )
  (decoder): CAEDecoder(
    (encoder): CAEEncoder(
      (layer1): Sequential(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU()
        (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU()
        (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      )
      (layer2): Sequential(
        (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU()
        (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU()
        (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      )
      (fc1): Linear(in_features=576, out_features=500, bias=True)
      (fc2): Linear(in_features=500, out_features=200, bias=True)
    )
    (layer1): Linear(in_features=200, out_features=500, bias=True)
    (layer2): Linear(in_features=500, out_features=576, bias=True)
    (unpool_1): MaxUnpool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0))
    (deconv_layer_1): Sequential(
      (0): ConvTranspose2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU()
    )
    (unpool_2): MaxUnpool2d(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0))
    (deconv_layer_2): Sequential(
      (0): ConvTranspose2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU()
      (2): ConvTranspose2d(32, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): Tanh()
    )
  )
)>
def to_img(x):
    # undo the [-1, 1] range of the Tanh output: map values back to [0, 1]
    x = 0.5 * (x + 1)
    x = x.clamp(0, 1)
    # reshape the batch back into 3-channel 12x12 images for save_image
    x = x.view(x.size(0), 3, 12, 12)
    return x
transformation = full_resolution_crop
train_loader, test_loader = get_loaders(batch_size, transformation)
Files already downloaded and verified
%%time
for epoch in range(num_epochs):
for i, (img, labels) in enumerate(train_loader):
img = Variable(img).cuda()
# ===================forward=====================
# print("encoding batch of images")
output = model(img)
# print("computing loss")
loss = criterion(output, img)
# ===================backward====================
# print("Backward ")
optimizer.zero_grad()
loss.backward()
optimizer.step()
# ===================log========================
print('epoch [{}/{}], loss:{:.4f}'.format(epoch+1, num_epochs, loss.data))
if epoch % 10 == 0:
pic = to_img(output.cpu().data)
in_pic = to_img(img.cpu().data)
save_image(pic, './cae_results/2x2-2xfc-out_image_{}.png'.format(epoch))
save_image(in_pic, './cae_results/2x2-2xfc-in_image_{}.png'.format(epoch))
if loss.data < 0.15: #arbitrary number because I saw that it works well enough
break
epoch [1/100], loss:0.4092 epoch [2/100], loss:0.3377 epoch [3/100], loss:0.3058 epoch [4/100], loss:0.3483 epoch [5/100], loss:0.2722 epoch [6/100], loss:0.2946 epoch [7/100], loss:0.2499 epoch [8/100], loss:0.2494 epoch [9/100], loss:0.3001 epoch [10/100], loss:0.1993 epoch [11/100], loss:0.2132 epoch [12/100], loss:0.2049 epoch [13/100], loss:0.2972 epoch [14/100], loss:0.1959 epoch [15/100], loss:0.2462 epoch [16/100], loss:0.2011 epoch [17/100], loss:0.2434 epoch [18/100], loss:0.2658 epoch [19/100], loss:0.2246 epoch [20/100], loss:0.2300 epoch [21/100], loss:0.2028 epoch [22/100], loss:0.2029 epoch [23/100], loss:0.2460 epoch [24/100], loss:0.2545 epoch [25/100], loss:0.2325 epoch [26/100], loss:0.2252 epoch [27/100], loss:0.2551 epoch [28/100], loss:0.2338 epoch [29/100], loss:0.1642 epoch [30/100], loss:0.2235 epoch [31/100], loss:0.1950 epoch [32/100], loss:0.2379 epoch [33/100], loss:0.2440 epoch [34/100], loss:0.2408 epoch [35/100], loss:0.2713 epoch [36/100], loss:0.2313 epoch [37/100], loss:0.2363 epoch [38/100], loss:0.2542 epoch [39/100], loss:0.2782 epoch [40/100], loss:0.1701 epoch [41/100], loss:0.1879 epoch [42/100], loss:0.2589 epoch [43/100], loss:0.2518 epoch [44/100], loss:0.2565 epoch [45/100], loss:0.2314 epoch [46/100], loss:0.2601 epoch [47/100], loss:0.2513 epoch [48/100], loss:0.2133 epoch [49/100], loss:0.2205 epoch [50/100], loss:0.2386 epoch [51/100], loss:0.2227 epoch [52/100], loss:0.2001 epoch [53/100], loss:0.1826 epoch [54/100], loss:0.2644 epoch [55/100], loss:0.2641 epoch [56/100], loss:0.2712 epoch [57/100], loss:0.2261 epoch [58/100], loss:0.2290 epoch [59/100], loss:0.2018 epoch [60/100], loss:0.2029 epoch [61/100], loss:0.2601 epoch [62/100], loss:0.1618 epoch [63/100], loss:0.2245 epoch [64/100], loss:0.2053 epoch [65/100], loss:0.2228 epoch [66/100], loss:0.1712 epoch [67/100], loss:0.2746 epoch [68/100], loss:0.2654 epoch [69/100], loss:0.1959 epoch [70/100], loss:0.2104 epoch [71/100], loss:0.2078 epoch [72/100], loss:0.2283 epoch [73/100], loss:0.2238 epoch [74/100], loss:0.2457 epoch [75/100], loss:0.2209 epoch [76/100], loss:0.1890 epoch [77/100], loss:0.2088 epoch [78/100], loss:0.2030 epoch [79/100], loss:0.2173 epoch [80/100], loss:0.2101 epoch [81/100], loss:0.2130 epoch [82/100], loss:0.1821 epoch [83/100], loss:0.2299 epoch [84/100], loss:0.3318 epoch [85/100], loss:0.2389 epoch [86/100], loss:0.2200 epoch [87/100], loss:0.2253 epoch [88/100], loss:0.2084 epoch [89/100], loss:0.2013 epoch [90/100], loss:0.2318 epoch [91/100], loss:0.2426 epoch [92/100], loss:0.2342 epoch [93/100], loss:0.2158 epoch [94/100], loss:0.2128 epoch [95/100], loss:0.2286 epoch [96/100], loss:0.2585 epoch [97/100], loss:0.2448 epoch [98/100], loss:0.2380 epoch [99/100], loss:0.1772 epoch [100/100], loss:0.1301 CPU times: user 4min 7s, sys: 10.5 s, total: 4min 18s Wall time: 14min 1s
model.save_model("2x2-2xfc-layer", "CAE")
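The test_loader built above is never used during training; a minimal evaluation sketch of my own (the file names are illustrative) that measures the average reconstruction error on the test split and saves one batch of input/reconstruction pairs:
# Sketch: average MSE reconstruction loss over the test split, plus one saved
# batch of inputs and reconstructions for visual comparison.
model.eval()
total_loss, batches = 0.0, 0
with torch.no_grad():
    for img, _ in test_loader:
        img = img.cuda()
        output = model(img)
        total_loss += criterion(output, img).item()
        batches += 1
print('test reconstruction loss: {:.4f}'.format(total_loss / batches))
save_image(to_img(output.cpu()), './cae_results/test-out_image.png')
save_image(to_img(img.cpu()), './cae_results/test-in_image.png')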
Input and Output for the first epoch
Input and Output for the 90th epoch
Observations from experimenting with different configurations:
Two layers with four conv stages each do not give the same results as two layers with two conv stages each (the extra stages are the commented-out Conv2d lines in CAEEncoder; a sketch of that larger variant follows below).
The smaller network not only converges MUCH faster and produces smaller models, but the final reconstruction is actually better as well.
The same holds for batch normalization: without BatchNorm2d the network converges faster and the model is smaller.
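For reference, the larger configuration mentioned above is the one hidden in the commented-out lines of CAEEncoder. A sketch of what its first block would look like with four convolution stages and BatchNorm2d enabled (illustrative, reconstructed from those comments rather than a separately tested implementation):
# Sketch of the larger layer1 variant: four Conv2d stages with BatchNorm2d,
# matching the commented-out lines in CAEEncoder above.
layer1_large = nn.Sequential(
    nn.Conv2d(3, 32, kernel_size=3, padding=1),
    nn.BatchNorm2d(32),
    nn.ReLU(),
    nn.Conv2d(32, 32, kernel_size=3, padding=1),
    nn.BatchNorm2d(32),
    nn.ReLU(),
    nn.Conv2d(32, 32, kernel_size=3, padding=1),
    nn.BatchNorm2d(32),
    nn.ReLU(),
    nn.Conv2d(32, 32, kernel_size=3, padding=1),
    nn.BatchNorm2d(32),
    nn.ReLU(),
    nn.MaxPool2d(2, stride=2, return_indices=True)
)
In my runs this deeper, normalized block trained more slowly and did not reconstruct better than the two-stage block actually used above.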