6 minute read

Tags: , ,

What are convolutions and pooling?

Implementing convolutional layers

# Dense-only baseline: flatten the input, one 128-unit ReLU hidden layer,
# then a 10-unit softmax output layer.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))
model = tf.keras.models.Sequential([
   # 64 filters, each 3x3; the first layer also declares the input shape.
   tf.keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(28, 28, 1)),
   # Max pooling: keep only the largest value in each 2x2 window, e.g.
   #  _________
   #  | 3 | 0 |
   #  |___|___|  => picks 8
   #  | 8 | 1 |
   #  |___|___|
   tf.keras.layers.MaxPooling2D(2, 2),
   # A second convolution + pooling stage.
   tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
   tf.keras.layers.MaxPooling2D(2, 2),

   # Flatten the feature maps and classify with the same dense head as before.
   tf.keras.layers.Flatten(),
   tf.keras.layers.Dense(128, activation='relu'),
   tf.keras.layers.Dense(10, activation='softmax')
])
Layer(type)                   Output Shape             Param #
===============================================================
                              # A 3x3 convolution on the 28x28 input drops a 1-pixel border on each side: 26x26
conv2d_12(Conv2D)             (None, 26, 26, 64)        640
----------------------------------------------------------------
                              # 2x2 max pooling halves each dimension: 13x13
max_pooling2d_12(MaxPooling)  (None, 13, 13, 64)         0
----------------------------------------------------------------
conv2d_13(Conv2D)             (None, 11, 11, 64)        36928
----------------------------------------------------------------
max_pooling2d_13(MaxPooling)  (None, 5,  5,  64)         0
----------------------------------------------------------------
flatten_5(Flatten)            (None, 1600)               0
----------------------------------------------------------------
dense_10(Dense)               (None, 128)               204928
----------------------------------------------------------------
dense_11(Dense)               (None, 10)                1290
===============================================================

Improving the Fashion classifier with convolutions

  • For convenience, here’s the entire code again. Run it and take a note of the test accuracy that is printed out at the end.
import tensorflow as tf

# Load Fashion-MNIST and divide the pixel values by 255.0.
mnist = tf.keras.datasets.fashion_mnist
(training_images, training_labels), (test_images, test_labels) = mnist.load_data()
training_images, test_images = training_images / 255.0, test_images / 255.0

# Dense-only baseline: flatten, 128-unit ReLU hidden layer, 10-unit softmax output.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(training_images, training_labels, epochs=5)

# Evaluate on the held-out test set; Keras prints the loss and accuracy.
test_loss = model.evaluate(test_images, test_labels)
"""
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
32768/29515 [=================================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
26427392/26421880 [==============================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
8192/5148 [===============================================] - 0s 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
4423680/4422102 [==============================] - 0s 0us/step
Epoch 1/5
1875/1875 [==============================] - 3s 2ms/step - loss: 0.4947 - accuracy: 0.8254
Epoch 2/5
1875/1875 [==============================] - 3s 2ms/step - loss: 0.3747 - accuracy: 0.8647
Epoch 3/5
1875/1875 [==============================] - 3s 2ms/step - loss: 0.3362 - accuracy: 0.8777
Epoch 4/5
1875/1875 [==============================] - 3s 2ms/step - loss: 0.3123 - accuracy: 0.8859
Epoch 5/5
1875/1875 [==============================] - 4s 2ms/step - loss: 0.2959 - accuracy: 0.8905
313/313 [==============================] - 0s 1ms/step - loss: 0.3456 - accuracy: 0.8769
"""
  • Run the below code – this is the same neural network as earlier, but this time with Convolutional layers added first. It will take longer, but look at the impact on the accuracy:
import tensorflow as tf
print(tf.__version__)

# Load Fashion-MNIST, reshape to add an explicit single-channel axis (the first
# Conv2D below is declared with input_shape=(28, 28, 1)), and divide by 255.0.
mnist = tf.keras.datasets.fashion_mnist
(training_images, training_labels), (test_images, test_labels) = mnist.load_data()
training_images = training_images.reshape(60000, 28, 28, 1)
training_images = training_images / 255.0
test_images = test_images.reshape(10000, 28, 28, 1)
test_images = test_images / 255.0

# Two convolution + max-pooling stages in front of the same dense head
# used by the non-convolutional model.
model = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()
# The keyword is `epochs` (plural); `epoch=5` is rejected by `fit`.
model.fit(training_images, training_labels, epochs=5)
# The method is `evaluate`; with metrics compiled in, it returns the loss
# followed by the metric values.
test_loss = model.evaluate(test_images, test_labels)

"""
2.3.0
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d (Conv2D)              (None, 26, 26, 64)        640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 13, 13, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 11, 11, 64)        36928     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               204928    
_________________________________________________________________
dense_3 (Dense)              (None, 10)                1290      
=================================================================
Total params: 243,786
Trainable params: 243,786
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
1875/1875 [==============================] - 81s 43ms/step - loss: 0.4454 - accuracy: 0.8369
Epoch 2/5
1875/1875 [==============================] - 80s 43ms/step - loss: 0.2913 - accuracy: 0.8929
Epoch 3/5
1875/1875 [==============================] - 80s 43ms/step - loss: 0.2476 - accuracy: 0.9078
Epoch 4/5
1765/1875 [===========================>..] - ETA: 4s - loss: 0.2178 - accuracy: 0.9181
"""
  • Step 1 is to gather the data. You’ll notice that there’s a bit of a change here in that the training data needed to be reshaped. That’s because the first convolution expects a single tensor containing everything, so instead of 60,000 28x28x1 items in a list, we have a single 4D list that is 60,000x28x28x1, and the same for the test images. If you don’t do this, you’ll get an error when training as the Convolutions do not recognize the shape.

     import tensorflow as tf
     mnist = tf.keras.datasets.fashion_mnist
     (training_images, training_labels), (test_images, test_labels) = mnist.load_data()
     # Reshape each set into a single 4D array with a trailing channel axis,
     # then divide the pixel values by 255.0.
     training_images = training_images.reshape(60000, 28, 28, 1) / 255.0
     test_images = test_images.reshape(10000, 28, 28, 1) / 255.0
    
  • Next is to define your model. Now instead of the input layer at the top, you’re going to add a Convolution. The parameters are:

    1. The number of convolutions you want to generate. Purely arbitrary, but good to start with something in the order of 32
    2. The size of the Convolution, in this case a 3x3 grid
    3. The activation function to use – in this case we’ll use relu, which you might recall is the equivalent of returning x when x>0, else returning 0
    4. In the first layer, the shape of the input data.

You’ll follow the Convolution with a MaxPooling layer, which is designed to compress the image while maintaining the content of the features that were highlighted by the convolution. By specifying (2,2) for the MaxPooling, the effect is to quarter the size of the image. Without going into too much detail here, the idea is that it creates a 2x2 array of pixels, and picks the biggest one, thus turning 4 pixels into 1. It repeats this across the image, and in so doing halves the number of horizontal pixels and halves the number of vertical pixels, effectively reducing the image to 25% of its original size.

You can call model.summary() to see the size and shape of the network, and you’ll notice that after every MaxPooling layer, the image size is reduced in this way.

model = tf.keras.models.Sequential([
  tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(28, 28, 1)),
  tf.keras.layers.MaxPooling2D(2, 2),

Add another convolution

  # Second convolution + pooling stage. The trailing comma is required because
  # more layers follow inside the same list literal.
  tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
  tf.keras.layers.MaxPooling2D(2,2),

Now flatten the output. After this you’ll just have the same DNN structure as the non convolutional version

  tf.keras.layers.Flatten(),

The same dense layer with 128 neurons and output layer with 10 neurons as in the pre-convolution example:

  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(10, activation='softmax')
])

Now compile the model, call the fit method to do the training, and evaluate the loss and accuracy from the test set.

# Compile, train for five epochs, then report the accuracy on the test set.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(training_images, training_labels, epochs=5)
test_loss, test_acc = model.evaluate(test_images, test_labels)
print(test_acc)
  • Visualizing the Convolutions and Pooling

This code will show us the convolutions graphically. The print(test_labels[:100]) call shows us the first 100 labels in the test set, and you can see that the ones at index 0, index 23 and index 28 are all the same value (9). They’re all shoes. Let’s take a look at the result of running the convolution on each, and you’ll begin to see common features between them emerge. Now, when the DNN is training on that data, it’s working with a lot less data, and it’s perhaps finding a commonality between shoes based on this convolution/pooling combination.

print(test_labels[:100])
"""
[9 2 1 1 6 1 4 6 5 7 4 5 7 3 4 1 2 4 8 0 2 5 7 9 1 4 6 0 9 3 8 8 3 3 8 0 7
 5 7 9 6 1 3 7 6 7 2 1 2 2 4 4 5 8 2 2 8 4 8 0 7 7 8 5 1 1 2 3 9 8 7 0 2 6
 2 3 1 2 8 4 1 8 5 9 5 0 3 2 0 6 5 3 6 7 1 8 0 1 4 2]
"""
import matplotlib.pyplot as plt
from tensorflow.keras import models

# Test-set indices to compare. In the label printout, indices 0, 23 and 28
# all carry label 9, so the three rows show the same class.
FIRST_IMAGE = 0
SECOND_IMAGE = 23
THIRD_IMAGE = 28
# Which channel (filter output) of each layer to display.
CONVOLUTION_NUMBER = 1

# One row per image, one column per layer (the first four layers are the
# two convolution + pooling stages).
f, axarr = plt.subplots(3, 4)

# A model that returns the output of every layer of `model` for a given input.
layer_outputs = [layer.output for layer in model.layers]
activation_model = models.Model(inputs=model.input, outputs=layer_outputs)

for row, image_index in enumerate((FIRST_IMAGE, SECOND_IMAGE, THIRD_IMAGE)):
    # Reshape the single image into a batch of one before predicting; run the
    # prediction once per image and reuse it for all four columns.
    activations = activation_model.predict(
        test_images[image_index].reshape(1, 28, 28, 1)
    )
    for x in range(0, 4):
        feature_map = activations[x]
        # Index as [batch, rows, cols, channel] to pick a single channel.
        axarr[row, x].imshow(feature_map[0, :, :, CONVOLUTION_NUMBER], cmap='inferno')
        axarr[row, x].grid(False)