%load_ext autoreload
%autoreload 2
%matplotlib inline
from lgm import *
# Optional sanity check of which GPU is in use:
# torch.cuda.get_device_name(torch.cuda.current_device())
# `untar_data` and `URLs` are not defined in this notebook, so they presumably
# come from `from lgm import *`; presumably this fetches/extracts the IMDB
# dataset and returns its local folder — TODO confirm in lgm.
path = untar_data(URLs.IMDB)
# path.ls()
We reload the IMDB labeled data. In the next notebook, we will pretrain the AWD-LSTM model on Wikipedia, but the (much smaller) IMDB dataset is sufficient to introduce the model.
import pickle
# Reload the pickled labeled list from the dataset folder. The context manager
# closes the file handle as soon as loading is done.
# NOTE(review): unpickling can execute arbitrary code — only load a cache file
# that you created yourself.
with open(path/'labeled_list_lm.pkl', 'rb') as f:
    labeled_list = pickle.load(f)
# The vocabulary is stored on the last processor of the training inputs.
vocab = labeled_list.train.processor_x[-1].vocab
batch_size = 64
# presumably the backprop-through-time sequence length — TODO confirm against lm_databunchify
bptt = 70
data = lm_databunchify(labeled_list, batch_size, bptt)
We want to use the AWD-LSTM from Stephen Merity et al.
We will need four different kinds of dropouts.
Dropout consists of replacing some coefficients by $0$ with probability $p$ (hence, we keep coefficients with probability $1-p$).
- We draw a `mask` that tells us which elements to zero out.
- Applying dropout to `x` is done by `x = x * mask`.

We need to create our own dropout mask and cannot rely on pytorch's dropout:
Inside an RNN, a tensor $x$ has three dimensions: batch_size, seq_len, hidden_dim. We want to consistently apply the dropout mask across the seq_len dimension, so:
# def dropout_mask(x, size, prob):
# """
# We pass size in so that we get broadcasting along the sequence dimension
# in RNNDropout.
# """
# return x.new(*size).bernoulli_(1-prob).div_(1-prob)
# class RNNDropout(nn.Module):
# """
# Note the way size is passed in the forward function: the mask gets a
# singleton dimension in between the first and last dimensions of the minibatch:
# (x.size(0), 1, x.size(2)).
# The middle dimension is the sequence dimension, so the mask is broadcast
# along it and the zeroed-out positions stay the same throughout the bptt sequence.
# """
# def __init__(self, prob=0.5):
# super().__init__()
# self.prob = prob
# def forward(self, x):
# if not self.training or self.prob == 0.:
# return x
# mask = dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.prob)
# return x * mask
# Demo: build the layer with drop probability 0.6 and apply it to a random tensor.
# NOTE(review): a freshly constructed nn.Module is in training mode, so the
# mask should be applied here — confirm RNNDropout in lgm matches the sketch above.
rnn_dropout = RNNDropout(0.6)
test_input = torch.randn(2, 3, 4) # (batch_size, seq_len, hidden_dim)
# Print the input and the output one after the other to compare which coordinates were zeroed.
print(test_input, '\n\n', rnn_dropout(test_input))
This dropout is used in two places in the encoder:
- `input_prob`: we forget random pieces of the embedding matrix, the same ones in the sequence dimension.
- `hidden_prob`: applied to the output of one of the layers of the RNN before it’s used as input to the next layer (again, the same coordinates are zeroed in the sequence dimension); we don't apply it to the last output, which gets its own dropout in the decoder.

With probability `weight_prob`, we zero out weights of the hidden-to-hidden matrix inside the RNN.
# class WeightDropout(nn.Module):
# """
# Dropout to the weights (not activations!) of the inner LSTM hidden to
# hidden matrix. We want to preserve the CuDNN speed and not reimplement
# the cell from scratch, so in __init__, we add a parameter that will
# contain the raw weights f'{layer}_raw'. We then replace the weight matrix
# in the LSTM in forward when we call self._setweights()
# """
# def __init__(self, inner_module, weight_prob=[0.], layer_names=['weight_hh_l0']):
# super().__init__()
# self.inner_module = inner_module
# self.weight_prob = weight_prob
# self.layer_names = layer_names
# for layer in self.layer_names:
# # we make a copy of the weights of the selected layers
# weights = getattr(self.inner_module, layer)
# self.register_parameter(f'{layer}_raw', nn.Parameter(weights.data))
# # F.dropout with training=False leaves the values unchanged, so nothing
# # is dropped here; dropout is applied to the raw weights in _setweights(),
# # which forward() calls on every pass
# self.inner_module._parameters[layer] = F.dropout(weights,
# p=self.weight_prob,
# training=False)
# def _setweights(self):
# "Apply dropout to raw_weights and set them as the layer weights."
# for layer in self.layer_names:
# raw_weights = getattr(self, f'{layer}_raw')
# self.inner_module._parameters[layer] = F.dropout(raw_weights,
# p=self.weight_prob,
# training=self.training)
# def forward(self, *args):
# self._setweights()
# with warnings.catch_warnings():
# #To avoid the warning that comes because the weights aren't flattened.
# warnings.simplefilter("ignore")
# return self.inner_module.forward(*args)
Here's an example. Let's initialize an LSTM module:
# LSTM with 5 input features and 3 hidden units, taking batch-first inputs.
lstm_module = nn.LSTM(input_size=5, hidden_size=3, batch_first=True)
# All parameters registered on the module, keyed by name.
lstm_module._parameters
# The hidden-to-hidden weight matrix of layer 0.
lstm_module.weight_hh_l0
Let's add weight dropout. The inner module weights stay the same:
# Wrap the LSTM; 0.6 is the wrapper's second positional argument
# (weight_prob in the sketch above).
dropout_module = WeightDropout(lstm_module, 0.6)
# Display the wrapped LSTM's hidden-to-hidden weights right after wrapping.
getattr(dropout_module.inner_module, 'weight_hh_l0')
And they get copied to the raw weights in the outer (dropout) module:
# Per the sketch above, the wrapper registers its copy under the name f'{layer}_raw'.
getattr(dropout_module, 'weight_hh_l0_raw')
It's at the beginning of a forward pass that the dropout is applied to the weights.
# Forward pass on a random batch. Per the sketch above, WeightDropout.forward
# first calls _setweights(), which overwrites the inner LSTM's weight_hh_l0
# with a dropped-out version of the raw weights.
test_batch = torch.randn(4, 2, 5) # (batch_size, seq_len, input_dim)
# The call returns the outputs plus a (hidden, cell) state pair.
output, (h, c) = dropout_module(test_batch)
# print(output.shape)
# print(output)
# print(h.shape)
# print(h)
# Inspect the inner module's weights after the pass, and their .grad
# (no backward pass has been run at this point).
getattr(dropout_module.inner_module, 'weight_hh_l0')
print(getattr(dropout_module.inner_module, 'weight_hh_l0').grad)
But we still have the weights saved in raw_weights:
# The wrapper's raw parameter and its .grad, still before any backward pass.
getattr(dropout_module, 'weight_hh_l0_raw')
print(getattr(dropout_module, 'weight_hh_l0_raw').grad)
Now let's see how the gradients are backpropagated:
# Mean squared error of the LSTM output against an all-zero target.
# The inner parentheses matter: `**` binds tighter than `-`, so without them
# only the zeros would be squared and the "loss" would just be output.mean().
test_loss = ((output - torch.zeros(output.shape)) ** 2).mean()
test_loss.backward()
# Compare where the gradient ends up: on the tensor currently held by the
# inner LSTM versus on the wrapper's raw parameter.
print(getattr(dropout_module.inner_module, 'weight_hh_l0').grad)
print(getattr(dropout_module, 'weight_hh_l0_raw').grad)
With probability embed_prob, we zero out lines of the embedding matrix when we look up the ids of our tokens inside the embedding matrix.
# class EmbeddingDropout(nn.Module):
# """
# Applies dropout in the embedding layer by zeroing out some elements of
# the embedding vector. Dropout is applied to full rows of the embedding
# matrix: we drop out entire words and not components of a word's dense
# embedding.
# """
# def __init__(self, emb, embed_prob):
# super().__init__()
# self.emb = emb
# self.embed_prob = embed_prob
# self.pad_idx = self.emb.padding_idx
# if self.pad_idx is None:
# self.pad_idx = -1
# def forward(self, words, scale=None):
# if self.training and self.embed_prob != 0:
# size = (self.emb.weight.size(0),1)
# mask = dropout_mask(self.emb.weight.data, size, self.embed_prob)
# masked_embed = self.emb.weight * mask
# else:
# masked_embed = self.emb.weight
# if scale:
# masked_embed.mul_(scale)
# return F.embedding(words, masked_embed, self.pad_idx, self.emb.max_norm,
# self.emb.norm_type, self.emb.scale_grad_by_freq,
# self.emb.sparse)
Here's an example:
emb = nn.Embedding(10, 4, padding_idx=1) # (vocab_size, embedding_dim)
emb.weight
# Draw 3 random token ids in [0, 10) and look them up without dropout.
vocab_items = torch.randint(10, (3,))
print(vocab_items)
emb(vocab_items)
# Same lookup through the dropout wrapper, with drop probability 0.6.
emb_dropout = EmbeddingDropout(emb, 0.6)
emb_dropout(vocab_items)
# Reproduce by hand what the EmbeddingDropout.forward sketch above does:
# one mask entry per embedding row (size (vocab_size, 1)), which broadcasts
# across the embedding dimension when multiplied with the weight matrix.
size = (emb.weight.size(0), 1)
mask = dropout_mask(emb.weight.data, size, 0.6)
mask
emb.weight
emb.weight * mask
The encoder:
# def to_detach(h):
# "Detaches h from its gradient history."
# return h.detach() if type(h) == torch.Tensor else tuple(to_detach(v) for v in h)
# class AWD_LSTM(nn.Module):
# "AWD-LSTM inspired by https://arxiv.org/abs/1708.02182."
# initrange=0.1
# def __init__(self, vocab_size, emb_size, n_hid, n_layers, pad_token,
# hidden_prob=0.2, input_prob=0.6, embed_prob=0.1, weight_prob=0.5):
# super().__init__()
# self.batch_size = 1
# self.emb_size = emb_size
# self.n_hid = n_hid
# self.n_layers = n_layers
# self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=pad_token)
# self.emb_dropout = EmbeddingDropout(self.emb, embed_prob)
# # we create n_layers of LSTMs below
# self.rnns = [nn.LSTM(emb_size if l == 0 else n_hid,
# (n_hid if l != n_layers-1 else emb_size),
# 1, batch_first=True)
# for l in range(n_layers)]
# # we add dropout to the LSTM layers
# self.rnns = nn.ModuleList([WeightDropout(rnn, weight_prob)
# for rnn in self.rnns])
# self.emb.weight.data.uniform_(-self.initrange, self.initrange)
# self.input_dropout = RNNDropout(input_prob)
# self.hidden_dropouts = nn.ModuleList([RNNDropout(hidden_prob)
# for l in range(n_layers)])
# def forward(self, input):
# batch_size, seq_len = input.size()
# if batch_size != self.batch_size:
# self.batch_size = batch_size
# self.reset()
# raw_output = self.input_dropout(self.emb_dropout(input))
# new_hidden, raw_outputs, outputs = [], [], []
# # we loop through the LSTM layers (plus the hidden dropout layers)
# for l, (rnn, hid_dropout) in enumerate(zip(self.rnns, self.hidden_dropouts)):
# raw_output, new_h = rnn(raw_output, self.hidden[l])
# new_hidden.append(new_h)
# raw_outputs.append(raw_output)
# # we do hidden dropout for all layers but the last one
# if l != self.n_layers - 1: raw_output = hid_dropout(raw_output)
# outputs.append(raw_output)
# self.hidden = to_detach(new_hidden)
# return raw_outputs, outputs
# def _one_hidden(self, l):
# "Return one hidden state."
# nh = self.n_hid if l != self.n_layers - 1 else self.emb_size
# return next(self.parameters()).new(1, self.batch_size, nh).zero_()
# def reset(self):
# "Reset the hidden states."
# self.hidden = [(self._one_hidden(l), self._one_hidden(l))
# for l in range(self.n_layers)]
We now add a linear decoder on top of the encoder.
# class LinearDecoder(nn.Module):
# """
# We add a top layer to the AWD LSTM. This is a linear model with dropout.
# """
# def __init__(self, n_out, n_hid, output_prob, tie_encoder=None, bias=True):
# super().__init__()
# self.output_dropout = RNNDropout(output_prob)
# self.decoder = nn.Linear(n_hid, n_out, bias=bias)
# if bias:
# self.decoder.bias.data.zero_()
# if tie_encoder:
# self.decoder.weight = tie_encoder.weight
# else:
# init.kaiming_uniform_(self.decoder.weight)
# def forward(self, input):
# raw_outputs, outputs = input
# # we call dropout first
# output = self.output_dropout(outputs[-1]).contiguous()
# # we call the linear model
# decoded = self.decoder(output.view(output.size(0)*output.size(1),
# output.size(2)))
# return decoded, raw_outputs, outputs
# class SequentialRNN(nn.Sequential):
# "A sequential module that passes the reset call to its children."
# def reset(self):
# for child in self.children():
# if hasattr(child, 'reset'): child.reset()
We stack the encoder and decoder together:
# def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token,
# output_prob=0.4, hidden_prob=0.2, input_prob=0.6,
# embed_prob=0.1, weight_prob=0.5, tie_weights=True, bias=True):
# rnn_enc = AWD_LSTM(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers,
# pad_token=pad_token, hidden_prob=hidden_prob, input_prob=input_prob,
# embed_prob=embed_prob, weight_prob=weight_prob)
# enc = rnn_enc.emb if tie_weights else None
# # the rnn_enc is the AWD LSTM
# # its output is passed to the top linear layer (with dropout)
# return SequentialRNN(rnn_enc,
# LinearDecoder(vocab_sz, emb_sz, output_prob,
# tie_encoder=enc, bias=bias))
Let's test this to check that everything works:
# Index of the padding token in the vocabulary (PAD is not defined in this
# notebook; presumably it comes from lgm).
tok_pad = vocab.index(PAD)
# Positional args, per the signature sketched above: vocab size,
# embedding size 300, hidden size 300, 2 layers, pad token index.
tst_model = get_language_model(len(vocab), 300, 300, 2, tok_pad)
# NOTE(review): .cuda() requires a CUDA device; this cell fails on a CPU-only machine.
tst_model = tst_model.cuda()
# Grab one training batch and run the inputs through the model on the GPU.
x,y = next(iter(data.train_dl))
z = tst_model(x.cuda())
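We return three things to help with regularization: the true output (one score per vocabulary word for each position — raw logits rather than probabilities, since the decoder is a plain linear layer), but also the activations of the encoder, with or without dropouts.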
# The model output is a 3-tuple; unpack it into its parts.
len(z)
decoded, raw_outputs, outputs = z
The decoded tensor is flattened to bs * seq_len by len(vocab):
# Shape of the decoder output for the batch above.
decoded.size()
raw_outputs and outputs each contain the results of the intermediary layers:
# One entry per LSTM layer in each list (the encoder sketch appends once per layer).
len(raw_outputs),len(outputs)
# Per-layer activation shapes, before and after hidden dropout.
[o.size() for o in raw_outputs], [o.size() for o in outputs]
We will clip the gradients by enforcing a maximum value for the norm of the gradients.
# class GradientClipping(Callback):
# """
# Checks after the backward pass if the norm (by default the L2 norm, i.e.
# the square root of the sum of squares) of the gradients is greater than
# the number clip; if it is, the gradients get divided (scaled down) so
# that their norm is at most clip.
# """
# def __init__(self, clip=None):
# self.clip = clip
# def after_backward(self):
# if self.clip:
# nn.utils.clip_grad_norm_(self.learn.model.parameters(), self.clip)
The RNNTrainer callback will do four things:
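- change the output so that it contains only the `decoded` tensor (for the loss function) and store the `raw_outputs` and `outputs`
- apply Activation Regularization (AR): add to the loss an L2 penalty on the last activations of the AWD-LSTM (with dropout applied)
- apply Temporal Activation Regularization (TAR): add to the loss an L2 penalty on the difference between two consecutive (in terms of words) raw outputs
- shuffle the texts at the beginning of each epoch

# class RNNTrainer(Callback):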
# """
# Adds two L2 penalties on activations (not weights).
# Activation Regularization (AR): ensures activations are not too high.
# Temporal Activation Regularization (TAR): ensures activations don't change
# radically from timestep to timestep.
# """
# def __init__(self, alpha, beta):
# # parameter for Activation Regularization (AR)
# self.alpha = alpha
# # parameter for Temporal Activation Regularization (TAR)
# self.beta = beta
# def after_pred(self):
# # Save the extra outputs for later and only returns the true output.
# self.raw_out = self.pred[1]
# self.out = self.pred[2]
# self.learn.pred = self.pred[0]
# def after_loss(self):
# # Activation Regularization (AR): we add to the loss an L2 penalty
# # on the last activations of the AWD LSTM (with dropout applied)
# if self.alpha != 0.:
# self.learn.loss += self.alpha * self.out[-1].float().pow(2).mean()
# # Temporal Activation Regularization (TAR): we add to the loss an L2
# # penalty on the difference between two consecutive (in terms of
# # words) raw outputs
# if self.beta != 0.:
# h = self.raw_out[-1]
# if len(h)>1:
# self.learn.loss += self.beta * (h[:,1:] - h[:,:-1]).float().pow(2).mean()
# def begin_epoch(self):
# # Shuffle the texts at the beginning of the epoch
# if hasattr(self.dl.dataset, "batchify"):
# self.dl.dataset.batchify()
# Callback factories handed to the Learner below: gradient clipping with
# clip=0.1, and RNNTrainer with alpha=2 (Activation Regularization weight)
# and beta=1 (Temporal Activation Regularization weight).
callback_funcs = [partial(GradientClipping, clip=0.1),
partial(RNNTrainer, alpha=2., beta=1.)]
We finally assemble the model and check that we're able to train it:
# Index of the padding token in the vocabulary.
tok_pad = vocab.index(PAD)
# Embedding size, hidden size and number of LSTM layers.
emb_sz, nh, nl = 300, 300, 2
# NOTE(review): the *_prob keyword names must match get_language_model's
# signature in lgm — confirm, since a mismatch raises a TypeError here.
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, input_prob=0.6,
output_prob=0.4, weight_prob=0.5, embed_prob=0.1, hidden_prob=0.2)
# Learner, cross_entropy_flat, adam_opt and accuracy_flat are not defined in
# this notebook; presumably they come from lgm.
learn = Learner(model, data, cross_entropy_flat, adam_opt(),
metrics=accuracy_flat, lr=5e-3, callback_funcs=callback_funcs)
# Train for a single epoch to check that everything runs end to end.
learn.fit(1)
We could save and load the state dict for the model (but there is no point since we want to pretrain on Wikipedia and only after that finetune on the IMDB reviews):
# torch.save(learn.model.state_dict(), path/'lm_state_dict')
# learn.model.load_state_dict(torch.load(path/'lm_state_dict'))