
Text Classification: Classify movie reviews as positive or negative (preprocessed text) [TensorFlow + Python]

The problem

Classify 25,000 of the 50,000 movie reviews from the Internet Movie Database (IMDB) as positive or negative.

Python
import tensorflow as tf
from tensorflow import keras

import tensorflow_datasets as tfds
tfds.disable_progress_bar()

import numpy as np

print(tf.__version__)

Our dataset

The IMDB movie reviews dataset comes packaged in tfds.

It has already been preprocessed so that the reviews (sequences of words) have been converted to sequences of integers, where each integer represents a specific word in a dictionary.

The following code downloads the IMDB dataset to your machine (or uses a cached copy if you've already downloaded it):

Python
(train_data, test_data), info = tfds.load(
    # Use the version pre-encoded with an ~8k vocabulary.
    'imdb_reviews/subwords8k', 
    # Return the train/test datasets as a tuple.
    split = (tfds.Split.TRAIN, tfds.Split.TEST),
    # Return (example, label) pairs from the dataset (instead of a dictionary).
    as_supervised=True,
    # Also return the `info` structure. 
    with_info=True)
WARNING:absl:TFDS datasets with text encoding are deprecated and will be removed in a future version.
Instead, you should use the plain text version and tokenize the text using `tensorflow_text`
(See: https://www.tensorflow.org/tutorials/tensorflow_text/intro#tfdata_example)

The encoder

The dataset info includes a text encoder (tfds.features.text.SubwordTextEncoder).

Python
encoder = info.features['text'].encoder
print ('Vocabulary size: {}'.format(encoder.vocab_size)) #8185

This encoder can encode/decode any string:

Python
sample_string = 'Hello TensorFlow.'
# you can encode strings
encoded_string = encoder.encode(sample_string)
print ('Encoded string is {}'.format(encoded_string))
# [4025, 222, 6307, 2327, 4043, 2120, 7975]

# you can decode encoded strings
original_string = encoder.decode(encoded_string)
print ('The original string: "{}"'.format(original_string))
# "Hello TensorFlow."

# check that original string is equal to the sample string
assert original_string == sample_string

The encoder encodes a string by breaking it into subwords, or into individual characters if a word is not in its dictionary: the more a string resembles the dataset, the shorter its encoded representation will be.

Python
for ts in encoded_string:
  print ('{} ----> {}'.format(ts, encoder.decode([ts])))

'''
4025 ----> Hell
222 ----> o 
6307 ----> Ten
2327 ----> sor
4043 ----> Fl
2120 ----> ow
7975 ----> .
'''

Here's what the first review looks like:

Python
for train_example, train_label in train_data.take(1):
  print('Encoded text:', train_example[:10].numpy())
  print('Label:', train_label.numpy())
  
# Encoded text: [  62   18   41  604  927   65    3  644 7968   21]
# Label: 0

padded_batch combines consecutive elements of this dataset into padded batches. Its main arguments (illustrated in the chunk below) are:

  • batch_size: the number of consecutive elements to combine in each batch.
  • padded_shapes = None: determines the resulting shape for each dimension of each component in an output element:
      • If the dimension is a constant, the component will be padded out to that length in that dimension.
      • If the dimension is unknown, the component will be padded out to the maximum length of all elements in that dimension.
  • padding_values = None: the value used to fill the padded positions (you can set it to 0, "" or another value).
  • drop_remainder = False: whether to drop the last batch if it has fewer than batch_size elements.

Python
#########################################################
#
# This chunk of code is for understanding padded_batch!
# It is not mandatory in your script!
#
#########################################################

"""!" https://www.tensorflow.org/api_docs/python/tf/data/Dataset#padded_batch "!"""

A = (tf.data.Dataset
     .range(1, 5, output_type=tf.int32)
     .map(lambda x: tf.fill([x], x)))
# Pad to the smallest per-batch size that fits all elements.
B = A.padded_batch(2)
for element in B.as_numpy_iterator():
  print(element)
"""
[[1 0]
 [2 2]]
[[3 3 3 0]
 [4 4 4 4]]
"""

# Pad to a fixed size.
C = A.padded_batch(2, padded_shapes=5)
for element in C.as_numpy_iterator():
  print(element)
"""
[[1 0 0 0 0]
 [2 2 0 0 0]]
[[3 3 3 0 0]
 [4 4 4 4 0]]
"""

# Pad with a custom value.
D = A.padded_batch(2, padded_shapes=5, padding_values=-1)
for element in D.as_numpy_iterator():
  print(element)
"""
[[1 -1 -1 -1 -1]
 [2  2 -1 -1 -1]]
[[3  3  3 -1 -1]
 [4  4  4  4 -1]]
"""

# Components of nested elements can be padded independently.
elements = [([1, 2, 3], [10]),
            ([4, 5], [11, 12])]
dataset = tf.data.Dataset.from_generator(
    lambda: iter(elements), (tf.int32, tf.int32))
# Pad the first component of the tuple to length 4, and the second
# component to the smallest size that fits.
dataset = dataset.padded_batch(2,
    padded_shapes=([4], [None]),
    padding_values=(-1, 100))
list(dataset.as_numpy_iterator())
"""
[(
  array([[ 1,  2,  3,  -1],
         [ 4,  5, -1, -1]], dtype=int32),
  array([[ 10, 100],
         [ 11,  12]], dtype=int32)
)]
"""

Prepare the data for Training

While batching, we pad the sequences with the default value of 0.

Python
BUFFER_SIZE = 1000 # shuffle buffer of 1000 samples

train_batches = (
    train_data
    .shuffle(BUFFER_SIZE) # shuffle train_data with a buffer of 1000 samples
    .padded_batch(32)) # split into padded batches of 32 samples

test_batches = (
    test_data
    .padded_batch(32)) # take the test_data and divide it into batches of 32 samples

# check batch dimensions: each batch has its own shape,
# since sequences have different lengths. Batches will have
# (batch_size, longest_sequence_in_batch) dims
for example_batch, label_batch in train_batches.take(2):
  print("Batch shape:", example_batch.shape)
  print("label shape:", label_batch.shape)

"""
Batch shape: (32, 974) # the longest review in this batch has 974 tokens;
                       # all shorter reviews are padded with 0s
label shape: (32,)     # each review has exactly one label, hence the single dimension

Batch shape: (32, 1352)
label shape: (32,)
"""

Build the model

The neural network's input is a vector of word indices. The predicted output is 0 or 1, so for this problem we will build a continuous bag-of-words style model.

Caution: this model doesn't use masking, so the zero-padding is used as part of the input and the padding length may affect the output.

To fix this, see the masking and padding guide:

https://www.tensorflow.org/guide/keras/masking_and_padding
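
As a side note (not used in the rest of this tutorial), masking can be enabled by passing mask_zero=True to the Embedding layer, so padded positions are excluded from the pooling average. A minimal sketch, assuming the same encoder and layer sizes as the model below:

Python
# Sketch of a masked variant (assumption: same vocabulary and embedding size as below).
# mask_zero=True makes the Embedding layer emit a mask for the 0-padded positions,
# and GlobalAveragePooling1D uses that mask to exclude them from the average.
masked_model = keras.Sequential([
  keras.layers.Embedding(encoder.vocab_size, 16, mask_zero=True),
  keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dense(1)])

Here is the (unmasked) model used in this tutorial: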
Python
model = keras.Sequential([
  keras.layers.Embedding(encoder.vocab_size, 16),
  keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dense(1)])

model.summary()

The first layer is an Embedding layer. It looks up the embedding vector for each word index, so the resulting dimensions are (batch, sequence, embedding); a small shape check is sketched after the list below. To learn more about embeddings, see the word embedding tutorial.

keras.layers.Embedding()

  • Turns positive integers (indexes) into dense vectors of fixed size.
    This layer can only be used as the first layer in a model.
e.g. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
  • Input shape 2D tensor with shape: (batch_size, input_length)
  • Output shape 3D tensor with shape: (batch_size, input_length, output_dim)
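
To illustrate the shapes listed above, here's a minimal sketch (the layer sizes and the sample batch are arbitrary, not part of the tutorial's model):

Python
# Hypothetical shape check for an Embedding layer (arbitrary sizes).
emb = keras.layers.Embedding(input_dim=1000, output_dim=16)
batch = tf.constant([[4, 20, 7],
                     [1,  2, 3]])    # (batch_size=2, input_length=3)
print(emb(batch).shape)              # (2, 3, 16) = (batch_size, input_length, output_dim)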

Next, a GlobalAveragePooling1D layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible (a small sketch follows the list below).

tf.keras.layers.GlobalAveragePooling1D()

  • Global average pooling operation for temporal data
  • inputs: A 3D tensor; mask: Binary tensor of shape (batch_size, steps) indicating whether a given step should be masked (excluded from the average)
  • Input shape:
    If data_format='channels_last' : 3D tensor with shape: (batch_size, steps, features)
    If data_format='channels_first' : 3D tensor with shape: (batch_size, features, steps)
  • Output shape: 2D tensor with shape (batch_size, features)
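
A minimal sketch of the averaging over the steps dimension, using an arbitrary tensor (not part of the tutorial's pipeline):

Python
# Hypothetical example: average a (batch_size=1, steps=2, features=3) tensor over steps.
gap = keras.layers.GlobalAveragePooling1D()
x = tf.constant([[[1., 2., 3.],
                  [3., 4., 5.]]])    # shape (1, 2, 3)
print(gap(x).numpy())                # [[2. 3. 4.]] -> shape (1, 3) = (batch_size, features)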

This fixed-length output vector is piped through a fully-connected (Dense) layer with a single output node.

This last layer uses the default linear activation, so it outputs logits; paired with a loss that expects logits (see the compile step below), this is more numerically stable than outputting probabilities directly.

Another option is to use the sigmoid activation function, which returns a float between 0 and 1, representing a probability or confidence level.
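
Since the model outputs logits, probabilities can also be recovered at inference time by applying a sigmoid. A minimal sketch (to be run after training, not part of the tutorial's training code):

Python
# Hypothetical post-training step: turn the model's logits into probabilities.
logits = model.predict(test_batches.take(1))   # raw scores, one per review in the batch
probs = tf.sigmoid(logits)                     # values in (0, 1): estimated P(positive)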

With more neurons (nodes or units) and/or more layers, the network can learn more complex representations.

However, too many neurons make the network more computationally expensive and may lead to learning unwanted patterns - patterns that improve performance on training data but not on the test data. This is called overfitting.

Compile The Model

Python
model.compile(optimizer='adam',
              loss=tf.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

Train The Model

Python
history = model.fit(train_batches,
                    epochs=10,                  # 10 passes over the training set
                    validation_data=test_batches,
                    validation_steps=30)        # evaluate on 30 test batches after each epoch

Evaluate The Model

Python
loss, accuracy = model.evaluate(test_batches)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Create a graph of accuracy and loss over time

model.fit() returns a History object that contains a dictionary with everything that happened during training:

Python
history_dict = history.history
history_dict.keys()
# dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

There are four entries: one for each monitored metric during training and validation.
We can compare them on a plot:

Training and Validation Loss

Python
import matplotlib.pyplot as plt

acc = history_dict['accuracy']
val_acc = history_dict['val_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)

# "bo" is for "blue dot"
plt.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()
(Figure: training and validation loss plot)

Training and Validation Accuracy

Python
plt.clf()   # clear figure

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

plt.show()
(Figure: training and validation accuracy plot)

Python
#
# Copyright (c) 2017 François Chollet
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.