%load_ext autoreload
%autoreload 2
%matplotlib inline
from lgm import *
# Fetch (or locate) the IMDB dataset directory.
path = untar_data(URLs.IMDB)
# path
import pickle

# Load the pre-processed, labeled text list for language modelling.
# A context manager is used so the file handle is closed deterministically.
# NOTE: pickle is only safe on files you produced yourself.
with open(path/'labeled_list_lm.pkl', 'rb') as labeled_file:
    labeled_list = pickle.load(labeled_file)
batch_size = 64
bptt = 70  # presumably the back-propagation-through-time window -- confirm in lgm
data = lm_databunchify(labeled_list, batch_size, bptt)
# The vocab lives on the second x-processor of the training set.
vocab = labeled_list.train.processor_x[1].vocab
len(vocab)
print(vocab[:100])
Before tackling the classification task, we have to finetune our language model to the IMDB corpus.
We start with a model pretrained on Wikipedia (training done on google colab) and its associated Wikipedia-based vocabulary, both of which can be downloaded here:
# !wget https://abrsvn.github.io/files/pretrained.pth -P {path}
# !wget https://abrsvn.github.io/files/vocab.pkl -P {path}
# Five dropout probabilities, all scaled by 0.5; they are passed positionally
# (after the pad token) to get_language_model below.
dropout_probs = tensor([0.1, 0.15, 0.25, 0.02, 0.2]) * 0.5
# print(dropout_probs)
tok_pad = vocab.index(PAD)  # id of the padding token in the IMDB vocab
emb_dim = 300
hidden_dim = 300
n_layers = 2
model = get_language_model(len(vocab), emb_dim, hidden_dim, n_layers, tok_pad, *dropout_probs)
# State dict and vocab of the pretrained model (see the download commands in this file).
old_wgts = torch.load(path/'pretrained.pth')
# Close the pickle file explicitly instead of leaking the handle.
with open(path/'vocab.pkl', 'rb') as vocab_file:
    old_vocab = pickle.load(vocab_file)
It is very unlikely that the ids in the IMDB-based vocabulary correspond to the ids in the Wikipedia-based vocabulary we used to pretrain the model:
Let's look at the word 'house':
# Look up the id of the same token in both vocabularies to show that they differ.
# house_idx_imdb and house_idx_wikipedia are reused by later cells.
house_idx_imdb = vocab.index('house')
print("index of 'house' in the IMDB vocab:", house_idx_imdb)
house_idx_wikipedia = old_vocab.index('house')
print("index of 'house' in the Wikipedia vocab:", house_idx_wikipedia)
We need to match our pretrained model weights with the new vocab:
# Inspect the parameter names stored in the pretrained state dict.
old_wgts.keys()
# Keep the pretrained embedding row and decoder bias for 'house' (indexed with the
# Wikipedia id) so they can be compared with the remapped weights later on.
house_wgt = old_wgts['0.emb.weight'][house_idx_wikipedia]
house_bias = old_wgts['1.decoder.bias'][house_idx_wikipedia]
# def match_embeds(old_wgts, old_vocab, new_vocab):
# """
# Matches embeddings from an old_vocab to a new_vocab when transfer learning:
# -- old_vocab is the vocab associated with the pretrained model
# -- new_vocab is the vocab associated with the new corpus
# -- old_wgts are the weights from the old pretrained model (a state dict)
# We end up with embeddings for the new_vocab that are the same as the old
# ones whenever an item is both in the new_vocab and in the old_vocab. When an
# item in the new_vocab is missing from the old_vocab, it is assigned an
# average embedding.
# The old_wgts are updated with respect to the relevant layers. The parameters
# of the other layers are kept the same. The updated old_wgts are returned in
# full so that they can be loaded into the new model.
# """
# wgts = old_wgts['0.emb.weight']
# bias = old_wgts['1.decoder.bias']
# # compute mean weights; we'll assign them to new vocab items
# wgts_m, bias_m = wgts.mean(dim=0), bias.mean()
# # initialize new weights
# new_wgts = wgts.new_zeros(len(new_vocab), wgts.size(1))
# new_bias = bias.new_zeros(len(new_vocab))
# # reverse old vocab so that we can index into the old weights
# otoi = {v:k for k,v in enumerate(old_vocab)}
# # we check every item in the new vocab
# for i,w in enumerate(new_vocab):
# # if the item is in the old_vocab, we transfer the old weights
# if w in otoi:
# idx = otoi[w]
# new_wgts[i], new_bias[i] = wgts[idx], bias[idx]
# # if the item is not in the old_vocab, we give it average weights
# else: new_wgts[i], new_bias[i] = wgts_m, bias_m
# old_wgts['0.emb.weight'] = new_wgts
# old_wgts['0.emb_dropout.emb.weight'] = new_wgts
# old_wgts['1.decoder.weight'] = new_wgts
# old_wgts['1.decoder.bias'] = new_bias
# return old_wgts
# Remap the pretrained embedding/decoder parameters from the Wikipedia vocab to the
# IMDB vocab. Per the reference listing in this file, old_wgts itself is updated
# and returned -- confirm against the lgm implementation.
wgts = match_embeds(old_wgts, old_vocab, vocab)
Now let's check that the word "house" was properly converted.
# The remapped row/bias at the IMDB id must match the values saved from the
# pretrained weights at the Wikipedia id.
test_near(wgts['0.emb.weight'][house_idx_imdb], house_wgt)
test_near(wgts['1.decoder.bias'][house_idx_imdb], house_bias)
We can load the pretrained weights in our model before beginning training.
# Load the vocabulary-matched pretrained weights into the new language model.
model.load_state_dict(wgts)
If we want to apply discriminative learning rates, we need to split our model in different layer groups. Let's look at our model:
# Display the model structure (notebook cell output).
model
# def lm_splitter(model):
# """
# Splits the language model provided by the get_language_model into multiple
# param groups to do transfer learning (e.g., from Wikipedia to IMDB):
# -- we have one group for each rnn + corresponding dropout, for a
# total of 2 if we had n_layers = 2 in the get_language_model call;
# -- we have one final group that contains the embeddings/decoder.
# The final group needs to be trained the most (new embedding vectors).
# """
# groups = []
# for i in range(len(model[0].rnns)):
# groups.append(nn.Sequential(model[0].rnns[i], model[0].hidden_dropouts[i]))
# groups += [nn.Sequential(model[0].emb, model[0].emb_dropout, model[0].input_dropout, model[1])]
# return [list(group.parameters()) for group in groups]
First we train with the RNNs frozen:
# Freeze every RNN layer of the encoder so that only the remaining parameters
# (embeddings/decoder) receive gradient updates in this first stage.
for rnn in model[0].rnns:
    for param in rnn.parameters():
        param.requires_grad_(False)

# Callbacks are passed as factories (partials) to the Learner.
callback_funcs = [partial(GradientClipping, clip=0.1),
                  partial(RNNTrainer, alpha=2., beta=1.)]
learn = Learner(model, data, cross_entropy_flat, adam_opt(),
                metrics=accuracy_flat, callback_funcs=callback_funcs,
                splitter=lm_splitter)
# One epoch with the LR finder to choose a learning rate.
learn.fit(1, callbacks=LRFinder())
# Learning rate for the frozen-RNN stage.
lr = 2e-2
# One-cycle schedule built from a single learning rate.
# NOTE(review): presumably the single rate is broadcast to all parameter groups -- confirm in lgm.
callback_sched = sched_1cycle([lr], pct_start=0.5, mom_start=0.8, mom_mid=0.7, mom_end=0.8)
# The training run and checkpoint save are disabled; the next stage loads
# 'finetuned_top_layer.pth', so that file must already exist from an earlier run.
# learn.fit(1, callbacks=callback_sched)
# torch.save(learn.model.state_dict(), path/'finetuned_top_layer.pth')
We then train the whole model with discriminative learning rates:
# Resume from the checkpoint saved after the frozen-RNN stage.
learn.model.load_state_dict(torch.load(path/'finetuned_top_layer.pth'))
# Unfreeze the RNN layers so the whole model is trained.
for rnn in model[0].rnns:
    for param in rnn.parameters():
        param.requires_grad_(True)
learn.fit(1, callbacks=LRFinder())
lr = 5e-3
# note we have 3 learning rates because we have 3 groups: 2 RNN+dropout and the top embedding layer
callback_sched = sched_1cycle([lr/2., lr/2., lr], pct_start=0.5,
                              mom_start=0.8, mom_mid=0.7, mom_end=0.8)
# learn.fit(10, callbacks=callback_sched)
We only need to save the following for the classification task:
- the encoder (`learn.model[0]`), and
- the vocab.

We need to use the same vocab, and we don't need the top layer since that will be replaced by a layer that does binary sentiment classification.
# torch.save(learn.model[0].state_dict(), path/'finetuned_enc.pth')
# pickle.dump(vocab, open(path/'vocab_lm.pkl', 'wb'))
But we also save the full model just in case:
# torch.save(learn.model.state_dict(), path/'finetuned.pth')
We have to reprocess the data for classification because we have to use the same vocab as the one we had for the finetuned language model.
# vocab = pickle.load(open(path/'vocab_lm.pkl', 'rb'))
# proc_tok = TokenizeProcessor()
# proc_num = NumericalizeProcessor(vocab=vocab) # this is where we use the language-model vocab we saved
# proc_cat = CategoryProcessor()
# textlist = TextList.from_files(path, include=['train', 'test'])
# splitdata = SplitData.split_by_func(textlist, partial(grandparent_splitter, valid_name='test'))
# labeled_list = label_by_func(splitdata, parent_labeler, processor_x = [proc_tok, proc_num], processor_y=proc_cat)
# pickle.dump(labeled_list, open(path/'labeled_list_clas.pkl', 'wb'))
# Load the classification data and the language-model vocab saved earlier.
# Context managers close the pickle files deterministically.
with open(path/'labeled_list_clas.pkl', 'rb') as labeled_file:
    labeled_list = pickle.load(labeled_file)
with open(path/'vocab_lm.pkl', 'rb') as vocab_file:
    vocab = pickle.load(vocab_file)
batch_size = 64
bptt = 70  # not used by clas_databunchify here; passed to get_text_classifier later
data = clas_databunchify(labeled_list, batch_size)
Recall that for classification, we need to feed in batches of documents that are padded (at the end) so that the batch can be fit into a tensor.
Computing on the padding is just a waste. Worse, the information that is useful for classification (the actual movie review) recedes further and further into the past and gets weaker and weaker - even with LSTMs.
We use two pytorch utility functions to ignore the padding in the inputs.
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
Let's see how this works. We grab a batch from the training set. Actually, let's grab the second batch, since the first one has reviews of fairly wildly varying lengths (because it includes the longest review).
# Pull two batches from the training dataloader; only the second one is kept
# because each assignment rebinds x and y.
test_iter = iter(data.train_dl)
x, y = next(test_iter) # first batch
x, y = next(test_iter) # second batch
These are the reviews, i.e., the predictors:
# Shape of the batch of token ids (indexed elsewhere as [review, position]).
x.size()
Here are the first 200 words from the second review in the batch:
# Map the first 200 token ids of the second review back to tokens via the vocab.
' '.join(vocab[idx] for idx in x[1][:200])
These are the labels (positive / negative sentiment), i.e., the response:
# Shape and values of the label tensor for this batch.
y.size()
y
The utility functions need the lengths of the reviews to be passed in. They're applied after the embedding layer, where the padding token ids are no longer visible, so we compute the lengths from the token ids first:
# Unpadded length of each review: total positions minus the number of positions
# equal to 1. NOTE(review): 1 is assumed to be the padding id -- confirm it
# matches vocab.index(PAD).
lengths = x.size(1) - (x == 1).sum(1)
lengths[:5]
len(vocab)
# Throwaway, randomly initialised embedding layer (dimension 300) for the demo.
test_emb = nn.Embedding(len(vocab), 300)
test_emb(x).shape
We create a PackedSequence object that contains all of our unpadded sequences
# Pack the embedded batch using the per-review lengths.
# NOTE(review): with PyTorch's default enforce_sorted=True the batch must be sorted
# by decreasing length -- presumably guaranteed by the dataloader; confirm.
packed = pack_padded_sequence(test_emb(x), lengths, batch_first=True)
packed
packed.data.shape
len(packed.batch_sizes)
This object can be passed to any RNN directly while retaining the speed of CuDNN.
# Throwaway 2-layer LSTM (input 300, hidden 300) fed directly with the packed sequence.
test = nn.LSTM(300, 300, 2)
# NOTE: this rebinds y -- from here on y is the LSTM output, not the batch labels.
y, h = test(packed)
y
Then we can unpad it with the following function for other modules:
# Convert the packed LSTM output back into a padded tensor.
# pad_packed_sequence returns a pair: unpack[0] is the padded tensor and
# unpack[1] holds the sequence lengths.
unpack = pad_packed_sequence(y, batch_first=True)
unpack[0].shape
unpack[1]
We need to change our model a little bit to use this.
# class AWD_LSTM1(nn.Module):
# """
# AWD-LSTM inspired by https://arxiv.org/abs/1708.02182,
# updated to deal with pad_packed_sequence and pack_padded_sequence.
# """
# initrange=0.1
# def __init__(self, vocab_size, emb_size, n_hid, n_layers, pad_token,
# hidden_prob=0.2, input_prob=0.6, embed_prob=0.1, weight_prob=0.5):
# super().__init__()
# self.batch_size = 1
# self.emb_size = emb_size
# self.n_hid = n_hid
# self.n_layers = n_layers
# self.pad_token = pad_token
# self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=pad_token)
# self.emb_dropout = EmbeddingDropout(self.emb, embed_prob)
# # we create n_layers of LSTMs
# self.rnns = [nn.LSTM(emb_size if l == 0 else n_hid,
# (n_hid if l != n_layers - 1 else emb_size),
# 1, batch_first=True)
# for l in range(n_layers)]
# self.rnns = nn.ModuleList([WeightDropout(rnn, weight_prob)
# for rnn in self.rnns])
# self.emb.weight.data.uniform_(-self.initrange, self.initrange)
# self.input_dropout = RNNDropout(input_prob)
# self.hidden_dropouts = nn.ModuleList([RNNDropout(hidden_prob)
# for l in range(n_layers)])
# def forward(self, input):
# batch_size, seq_len = input.size()
# if batch_size != self.batch_size:
# self.batch_size = batch_size
# self.reset()
# mask = (input == self.pad_token)
# lengths = seq_len - mask.long().sum(1)
# n_empty = (lengths == 0).sum()
# if n_empty > 0:
# input = input[:-n_empty]
# lengths = lengths[:-n_empty]
# self.hidden = [(h[0][:, :input.size(0)], h[1][:, :input.size(0)])
# for h in self.hidden]
# raw_output = self.input_dropout(self.emb_dropout(input))
# new_hidden, raw_outputs, outputs = [], [], []
# for l, (rnn, hid_dropout) in enumerate(zip(self.rnns, self.hidden_dropouts)):
# # take data of different lengths and shape it to pass to RNN
# raw_output = pack_padded_sequence(raw_output, lengths, batch_first=True)
# raw_output, new_h = rnn(raw_output, self.hidden[l])
# # this is where the padding actually happens
# raw_output = pad_packed_sequence(raw_output, batch_first=True)[0]
# raw_outputs.append(raw_output)
# # we do hidden dropout for all layers but the last one
# if l != self.n_layers - 1: raw_output = hid_dropout(raw_output)
# outputs.append(raw_output)
# new_hidden.append(new_h)
# self.hidden = to_detach(new_hidden)
# return raw_outputs, outputs, mask
# def _one_hidden(self, l):
# "Return one hidden state."
# nh = self.n_hid if l != self.n_layers - 1 else self.emb_size
# return next(self.parameters()).new(1, self.batch_size, nh).zero_()
# def reset(self):
# "Reset the hidden states."
# self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]
We will use three things for the classification head of the model: the last hidden state, the average of all the hidden states and the maximum of all the hidden states. The trick is just to, once again, ignore the padding in the last element/average/maximum.
# class Pooling(nn.Module):
# """
# This is just a pedagogically useful model. The actual pooling classifier is
# provided in PoolingLinearClassifier below.
# The LSTMs create hidden states for bptt time steps. We decide what to pass
# to the classifier here. Following concat pooling from vision, we use three
# things for the classification head of the model. We concatenate:
# -- the last hidden state
# -- the average (mean) pool of all the bptt hidden states
# -- the max pool of all the bptt hidden states
# We pass the resulting concatenated tensor to the classifier.
# """
# def forward(self, input):
# raw_outputs, outputs, mask = input
# # last hidden state
# output = outputs[-1]
# # once again, we need to ignore the padding in the last hidden state,
# # as well as the average pool and max pool tensors
# lengths = output.size(1) - mask.long().sum(dim=1)
# # average pool
# avg_pool = output.masked_fill(mask[:, :, None], 0).sum(dim=1)
# avg_pool.div_(lengths.type(avg_pool.dtype)[:, None])
# # max pool
# max_pool = output.masked_fill(mask[:, :, None], -float('inf')).max(dim=1)[0]
# # Concat pooling
# x = torch.cat([output[torch.arange(0, output.size(0)), lengths-1],
# max_pool, avg_pool], 1)
# return output, x
Let's go through an example:
# Hyper-parameters for the encoder/pooling walkthrough.
emb_dim = 300
hidden_dim = 300
n_layers = 2
tok_pad = vocab.index(PAD)  # padding id in the (reloaded) vocab
Let's instantiate the encoder:
# Build the encoder and the demo pooling module.
enc = AWD_LSTM1(len(vocab), emb_dim, hidden_dim, n_layers, pad_token=tok_pad)
pool = Pooling()
# Set the batch size and (re)initialise the hidden states before the first forward pass.
enc.batch_size = batch_size
enc.reset()
Let's get a batch of data from the train dataloader and feed predictors (reviews) $x$ through the encoder:
test_iter = iter(data.train_dl)
x, y = next(test_iter) # keep the first batch because it's easiest to see padding
# output: the last layer's outputs; c: the concat-pooled features used below.
output, c = pool(enc(x))
We can check we have padding with 1s at the end of each text (except the first which is the longest).
# Display the raw token ids, the unpadded lengths and the second review.
x
# NOTE(review): 1 is assumed to be the padding id here -- confirm it equals tok_pad.
lengths = x.size(1) - (x == 1).sum(1)
lengths[:5]
x[1]
Pytorch puts $0$s everywhere we had padding in the output when unpacking:
# Compare the tail of the second review's token ids with the tail of its encoder
# output: positions holding padding ids should have all-zero output rows.
print(output[1])
print(output[1].shape)
print(x[1][-100:])
print(output[1][-100:])
print(output[1][-100:].shape)
We can actually test that padding in the input x got replaced with $0$s in the output for all samples in the batch:
# Padding mask of the input vs. positions whose output features sum to zero;
# the two must agree across the whole batch.
(x==tok_pad).float()
(output.sum(dim=2) == 0).float()
test_near((output.sum(dim=2) == 0).float(), (x==tok_pad).float())
So the last hidden state isn't the last element of output. We need to go back in the sequence to the last point before the padding.
Let's check we got everything right.
# Spot-check one review: c is laid out as [last state | max pool | mean pool],
# 300 features each (the slices below assume the encoder output width is 300).
i = 1
# NOTE(review): 1 is assumed to be the padding id -- confirm it equals tok_pad.
length = x.size(1) - (x[i]==1).long().sum()
print(length)
out_unpad = output[i, :length]
print(out_unpad[-1][:20])
print(c[i,:20])
print(out_unpad.max(0)[0][:20])
print(c[i, 300:320])
print(out_unpad.mean(0)[:20])
print(c[i,600:620])

# Check every review actually present in the batch (x.size(0) rather than the
# nominal batch_size, so a smaller batch cannot index out of range).
for i in range(x.size(0)):
    length = x.size(1) - (x[i]==1).long().sum()
    out_unpad = output[i, :length]
    test_near(out_unpad[-1], c[i, :300])
    test_near(out_unpad.max(0)[0], c[i, 300:600])
    test_near(out_unpad.mean(0), c[i, 600:])
Our pooling layer properly ignores the padding, so now let's add it to the classifier.
# class PoolingLinearClassifier(nn.Module):
# """
# Create a linear classifier with pooling:
# -- the concat pooling layer, followed by
# -- a list of batchnorm + dropout + linear + activation layers
# """
# def __init__(self, layers, dropout_probs):
# super().__init__()
# modified_layers = []
# activations = [nn.ReLU(inplace=True)] * (len(layers) - 2) + [None]
# # list of batchnorm + dropout + linear layers
# for n_in, n_out, dropout_prob, activation in zip(layers[:-1], layers[1:],
# dropout_probs, activations):
# modified_layers += batchnorm_dropout_linear(n_in, n_out,
# dropout_prob=dropout_prob,
# activation=activation)
# self.layers = nn.Sequential(*modified_layers)
# def forward(self, input):
# """
# The LSTMs create hidden states for bptt time steps. We decide what to pass
# to the classifier here. Following concat pooling from vision, we use three
# things for the classification head of the model. We concatenate:
# -- the last hidden state
# -- the average (mean) pool of all the bptt hidden states
# -- the max pool of all the bptt hidden states
# We pass the resulting concatenated tensor to the linear classifier.
# """
# raw_outputs, outputs, mask = input
# # last hidden state
# output = outputs[-1]
# # we need to ignore the padding in the last hidden state,
# # as well as the average pool and max pool tensors
# lengths = output.size(1) - mask.long().sum(dim=1)
# # average pool
# avg_pool = output.masked_fill(mask[:, :, None], 0).sum(dim=1)
# avg_pool.div_(lengths.type(avg_pool.dtype)[:, None])
# # max pool
# max_pool = output.masked_fill(mask[:, :, None], -float('inf')).max(dim=1)[0]
# # Concat pooling
# x = torch.cat([output[torch.arange(0, output.size(0)), lengths-1],
# max_pool, avg_pool], 1)
# # pass the concat-pooled tensor through the linear layers
# x = self.layers(x)
# return x
Then we just have to feed our texts through those two blocks.
But we can't give them all at once to the AWD_LSTM1 or we might get an out-of-memory error:
# def pad_tensor(t, batch_size, val=0.):
# if t.size(0) < batch_size:
# return torch.cat([t, val + t.new_zeros(batch_size-t.size(0), *t.shape[1:])])
# return t
# class SentenceEncoder(nn.Module):
# "The encoder is the AWD LSTM model that gets called on the input text."
# def __init__(self, encoder, bptt, pad_idx=1):
# super().__init__()
# self.bptt = bptt
# self.encoder = encoder
# self.pad_idx = pad_idx
# def concat(self, arrs, batch_size):
# return [torch.cat([pad_tensor(l[si],batch_size) for l in arrs], dim=1)
# for si in range(len(arrs[0]))]
# def forward(self, input):
# batch_size, seq_len = input.size()
# self.encoder.batch_size = batch_size
# self.encoder.reset()
# raw_outputs, outputs, masks = [], [], []
# # We go through the input one bptt at a time
# for i in range(0, seq_len, self.bptt):
# # we call the RNN model on it
# r, o, m = self.encoder(input[:,i: min(i+self.bptt, seq_len)])
# # we keep appending the results
# masks.append(pad_tensor(m, batch_size, 1))
# raw_outputs.append(r)
# outputs.append(o)
# return self.concat(raw_outputs, batch_size), self.concat(outputs, batch_size),torch.cat(masks,dim=1)
# def get_text_classifier(vocab_sz, emb_sz, n_hid, n_layers, n_out, pad_token,
# bptt, output_p=0.4, hidden_p=0.2, input_p=0.6,
# embed_p=0.1, weight_p=0.5, layers=None, drops=None):
# "To create a full AWD-LSTM"
# rnn_enc = AWD_LSTM1(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers,
# pad_token=pad_token, hidden_p=hidden_p, input_p=input_p,
# embed_p=embed_p, weight_p=weight_p)
# enc = SentenceEncoder(rnn_enc, bptt)
# if layers is None:
# layers = [50]
# if drops is None:
# drops = [0.1] * len(layers)
# layers = [3 * emb_sz] + layers + [n_out]
# drops = [output_p] + drops
# return SequentialRNN(enc, PoolingLinearClassifier(layers, drops))
emb_dim = 300
hidden_dim = 300
n_layers = 2
# Five dropout probabilities scaled by 0.25, unpacked positionally after bptt.
dropout_probs = tensor([0.4, 0.3, 0.4, 0.05, 0.5]) * 0.25
# Going by the reference signature listed in this file, the literal 2 is n_out
# and the literal 1 is pad_token -- confirm against the lgm implementation.
model = get_text_classifier(len(vocab), emb_dim, hidden_dim, n_layers, 2, 1, bptt, *dropout_probs)
We load our pretrained encoder and freeze it.
# Load the fine-tuned language-model encoder weights into the classifier's encoder.
model[0].encoder.load_state_dict(torch.load(path/'finetuned_enc.pth'))
Let's take a look at the model:
# Display the full model, then its two components (notebook cell outputs).
model
model[0]
model[1]
We freeze the encoder completely:
# Disable gradients for every parameter of the first component (the encoder).
for p in model[0].parameters():
    p.requires_grad_(False)
We split the model into groups for discriminative learning rates:
# def class_splitter(model):
# enc = model[0].encoder
# groups = [nn.Sequential(enc.emb, enc.emb_dropout, enc.input_dropout)]
# for i in range(len(enc.rnns)):
# groups.append(nn.Sequential(enc.rnns[i], enc.hidden_dropouts[i]))
# groups.append(model[1])
# return [list(group.parameters()) for group in groups]
# Split the model into parameter groups and show how many there are.
param_groups = class_splitter(model)
len(param_groups)
We are now ready to train the top layers (the decoder):
# Train only the classifier head (the encoder is frozen at this point).
callback_funcs = [partial(GradientClipping, clip=0.1)]
learn = Learner(model, data, F.cross_entropy, opt_func=adam_opt(),
                metrics=accuracy, callback_funcs=callback_funcs, splitter=class_splitter)
learn.fit(1, callbacks=LRFinder())
lr = 1e-2
# Pass the learning rate as a list, like every other sched_1cycle call in this file.
callback_sched = sched_1cycle([lr], mom_start=0.8, mom_mid=0.7, mom_end=0.8)
learn.fit(1, callbacks=callback_sched)
learn.plotter.plot_lr()
learn.plotter.plot_train_stats()
We progressively unfreeze the model to avoid catastrophic forgetting:
# Re-enable gradients for the last RNN layer of the encoder only.
for p in model[0].encoder.rnns[-1].parameters():
    p.requires_grad_(True)
And we train the top RNN together with the decoder:
# Re-run the LR finder now that more parameters are trainable.
learn.fit(1, callbacks=LRFinder())
lr = 5e-3
# we have 4 learning rates because we have 4 parameter groups
# the ones for the earlier groups (closer to the input) are smaller to avoid
# catastrophic forgetting; only the last group gets the full rate
callback_sched = sched_1cycle([lr/2., lr/2., lr/2., lr], mom_start=0.8, mom_mid=0.7, mom_end=0.8)
learn.fit(1, callbacks=callback_sched)
learn.plotter.plot_lr()
learn.plotter.plot_train_stats()
We now unfreeze the entire model and train it:
# Unfreeze the whole encoder for the final training stage.
for p in model[0].parameters():
    p.requires_grad_(True)
learn.fit(1, callbacks=LRFinder())
lr = 1e-3
# again, 4 learning rates; the rate doubles from one group to the next, so the
# earliest group gets lr/8 and the last group gets the full lr
callback_sched = sched_1cycle([lr/8., lr/4., lr/2., lr], mom_start=0.8, mom_mid=0.7, mom_end=0.8)
learn.fit(2, callbacks=callback_sched)
learn.plotter.plot_lr()
learn.plotter.plot_train_stats()
Predicting on the padded batch or on the individual unpadded samples gives the same results.
# Compare predictions on a padded validation batch with predictions obtained by
# feeding each review individually with its padding stripped.
x, y = next(iter(data.valid_dl))
# Switch to evaluation mode once and reuse the returned model.
eval_model = learn.model.eval()
# Inference only: skip building the autograd graph.
with torch.no_grad():
    pred_batch = eval_model(x.cuda())
pred_batch.size()
pred_batch
pred_logits = []
with torch.no_grad():
    for review in x:
        # NOTE(review): 1 is assumed to be the padding id -- confirm it equals tok_pad.
        length = x.size(1) - (review == 1).long().sum()
        review = review[:length]
        # review[None] adds a leading batch dimension of size 1.
        pred_logits.append(eval_model(review[None].cuda()))
pred_logits
assert near(pred_batch, torch.cat(pred_logits))
accuracy(pred_batch, y.cuda())
accuracy(torch.cat(pred_logits), y.cuda())