# Shakespeare Character Language Model

In [9]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import time

import shakespeare_data as sh

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

## Fixed length input

In [31]:
# Data - refer to shakespeare_data.py for details
corpus = sh.read_corpus()
print("First 203 characters...Last 50 characters")
print("{}...{}".format(corpus[:203], corpus[-50:]))
print("Total character count: {}".format(len(corpus)))
chars, charmap = sh.get_charmap(corpus)
charcount = len(chars)
print("Unique character count: {}\n".format(len(chars)))
shakespeare_array = sh.map_corpus(corpus, charmap)
print("shakespeare_array.shape: {}\n".format(shakespeare_array.shape))
small_example = shakespeare_array[:17]
print("First 17 characters as indices", small_example)
print("First 17 characters as characters:", [chars[c] for c in small_example])
print("First 17 character indices as text:\n", sh.to_text(small_example,chars))

First 203 characters...Last 50 characters
1609
 THE SONNETS
 by William Shakespeare
                      1
   From fairest creatures we desire increase,
   That thereby beauty's rose might never die,
   But as the riper should by time decease,
...,
   And new pervert a reconciled maid.'
 THE END

Total character count: 5551930
Unique character count: 84

shakespeare_array.shape: (5551930,)

First 17 characters as indices [12 17 11 20  0  1 45 33 30  1 44 40 39 39 30 45 44]
First 17 characters as characters: ['1', '6', '0', '9', '\n', ' ', 'T', 'H', 'E', ' ', 'S', 'O', 'N', 'N', 'E', 'T', 'S']
First 17 character indices as text:
 1609
 THE SONNETS


In [32]:
# Dataset class. Transform raw text into a set of sequences of fixed length, and extracts inputs and targets
class TextDataset(Dataset):
    
    def __init__(self,text, seq_len = 200):
        n_seq = len(text) // seq_len
        text = text[:n_seq * seq_len]
        self.data = torch.tensor(text).view(-1,seq_len)
    
    def __getitem__(self,i):
        txt = self.data[i]
        
        # labels are the input sequence shifted by 1
        return txt[:-1],txt[1:]
    
    def __len__(self):
        return self.data.size(0)

# Collate function. Transform a list of sequences into a batch. Passed as an argument to the DataLoader.
# Returns data of the format seq_len x batch_size
def collate(seq_list):
    inputs = torch.cat([s[0].unsqueeze(1) for s in seq_list],dim=1)
    targets = torch.cat([s[1].unsqueeze(1) for s in seq_list],dim=1)
    return inputs,targets


In [33]:
# Model
class CharLanguageModel(nn.Module):

    def __init__(self,vocab_size,embed_size,hidden_size, nlayers):
        super(CharLanguageModel,self).__init__()
        self.vocab_size=vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.nlayers=nlayers
        self.embedding = nn.Embedding(vocab_size,embed_size) # Embedding layer
        self.rnn = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers) # Recurrent network
        # You can also try GRUs instead of LSTMs.
        
        self.scoring = nn.Linear(hidden_size,vocab_size) # Projection layer
        
    def forward(self,seq_batch): #L x N
        # returns 3D logits
        batch_size = seq_batch.size(1)
        embed = self.embedding(seq_batch) #L x N x E
        hidden = None
        output_lstm,hidden = self.rnn(embed,hidden) #L x N x H
        output_lstm_flatten = output_lstm.view(-1,self.hidden_size) #(L*N) x H
        output_flatten = self.scoring(output_lstm_flatten) #(L*N) x V
        return output_flatten.view(-1,batch_size,self.vocab_size)
    
    def generate(self,seq, n_chars): # L x V
        # performs greedy search to extract and return words (one sequence).
        generated_chars = []
        embed = self.embedding(seq).unsqueeze(1) # L x 1 x E
        hidden = None
        output_lstm, hidden = self.rnn(embed,hidden) # L x 1 x H
        output = output_lstm[-1] # 1 x H
        scores = self.scoring(output) # 1 x V
        _,current_char = torch.max(scores,dim=1) # 1 x 1
        generated_chars.append(current_char)
        if n_chars > 1:
            for i in range(n_chars-1):
                embed = self.embedding(current_char).unsqueeze(0) # 1 x 1 x E
                output_lstm, hidden = self.rnn(embed,hidden) # 1 x 1 x H
                output = output_lstm[0] # 1 x H
                scores = self.scoring(output) # V
                _,current_char = torch.max(scores,dim=1) # 1
                generated_chars.append(current_char)
        return torch.cat(generated_chars,dim=0)
        
        

In [34]:
def train_epoch(model, optimizer, train_loader, val_loader):
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(DEVICE)
    before = time.time()
    print("training", len(train_loader), "number of batches")
    for batch_idx, (inputs,targets) in enumerate(train_loader):
        if batch_idx == 0:
            first_time = time.time()
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)
        outputs = model(inputs) # 3D
        loss = criterion(outputs.view(-1,outputs.size(2)),targets.view(-1)) # Loss of the flattened outputs
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch_idx == 0:
            print("Time elapsed", time.time() - first_time)
            
        if batch_idx % 100 == 0 and batch_idx != 0:
            after = time.time()
            print("Time: ", after - before)
            print("Loss per word: ", loss.item() / batch_idx)
            print("Perplexity: ", np.exp(loss.item() / batch_idx))
            after = before
    
    val_loss = 0
    batch_id=0
    for inputs,targets in val_loader:
        batch_id+=1
        inputs = inputs.to(DEVICE)
        targets = targets.to(DEVICE)
        outputs = model(inputs)
        loss = criterion(outputs.view(-1,outputs.size(2)),targets.view(-1))
        val_loss+=loss.item()
    val_lpw = val_loss / batch_id
    print("\nValidation loss per word:",val_lpw)
    print("Validation perplexity :",np.exp(val_lpw),"\n")
    return val_lpw
    

In [35]:
model = CharLanguageModel(charcount,256,256,3)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-6)
split = 5000000
train_dataset = TextDataset(shakespeare_array[:split])
val_dataset = TextDataset(shakespeare_array[split:])
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64, collate_fn = collate, drop_last=True)

In [57]:
for i in range(5):
    train_epoch(model, optimizer, train_loader, val_loader)

training 391 number of batches
Time elapsed 0.07767295837402344
Time:  5.596414089202881
Loss per word:  0.01185429334640503
Perplexity:  1.0119248339425135
Time:  11.145010232925415
Loss per word:  0.005843929648399353
Perplexity:  1.0058610387170948
Time:  16.780640840530396
Loss per word:  0.003932354052861532
Perplexity:  1.0039400959016305

Validation loss per word: 1.3321008599081705
Validation perplexity : 3.7889951799089627 

training 391 number of batches
Time elapsed 0.04892563819885254
Time:  5.784719467163086
Loss per word:  0.011950627565383912
Perplexity:  1.0120223216266813
Time:  11.487154722213745
Loss per word:  0.005958112478256225
Perplexity:  1.0059758973342545
Time:  17.114842653274536
Loss per word:  0.003922495444615682
Perplexity:  1.0039301984983102

Validation loss per word: 1.3247673289720403
Validation perplexity : 3.761310105292561 

training 391 number of batches
Time elapsed 0.052561044692993164
Time:  5.5854432582855225
Loss per word:  0.011668695211410

In [55]:
def generate(model, seed,nchars):
    seq = sh.map_corpus(seed, charmap)
    seq = torch.tensor(seq).to(DEVICE)
    out = model.generate(seq,nchars)
    return sh.to_text(out.cpu().detach().numpy(),chars)

In [53]:
print(generate(model, "To be, or not to be, that is the q",8))

uestion



In [56]:
print(generate(model, "Richard ", 1000))

and the King of the compolutes
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

## Packed sequences

In [19]:
stop_character = charmap['\n']
space_character = charmap[" "]
lines = np.split(shakespeare_array, np.where(shakespeare_array == stop_character)[0]+1) # split the data in lines
shakespeare_lines = []
for s in lines:
    s_trimmed = np.trim_zeros(s-space_character)+space_character # remove space-only lines
    if len(s_trimmed)>1:
        shakespeare_lines.append(s)
for i in range(10):
    print(sh.to_text(shakespeare_lines[i],chars))
print(len(shakespeare_lines))

1609

 THE SONNETS

 by William Shakespeare

                      1

   From fairest creatures we desire increase,

   That thereby beauty's rose might never die,

   But as the riper should by time decease,

   His tender heir might bear his memory:

   But thou contracted to thine own bright eyes,

   Feed'st thy light's flame with self-substantial fuel,

114638


In [20]:
class LinesDataset(Dataset):
    def __init__(self,lines):
        self.lines=[torch.tensor(l) for l in lines]
    def __getitem__(self,i):
        line = self.lines[i]
        return line[:-1].to(DEVICE),line[1:].to(DEVICE)
    def __len__(self):
        return len(self.lines)

# collate fn lets you control the return value of each batch
# for packed_seqs, you want to return your data sorted by length
def collate_lines(seq_list):
    inputs,targets = zip(*seq_list)
    lens = [len(seq) for seq in inputs]
    seq_order = sorted(range(len(lens)), key=lens.__getitem__, reverse=True)
    inputs = [inputs[i] for i in seq_order]
    targets = [targets[i] for i in seq_order]
    return inputs,targets

In [21]:
# Model that takes packed sequences in training
class PackedLanguageModel(nn.Module):
    
    def __init__(self,vocab_size,embed_size,hidden_size, nlayers, stop):
        super(PackedLanguageModel,self).__init__()
        self.vocab_size=vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.nlayers=nlayers
        self.embedding = nn.Embedding(vocab_size,embed_size)
        self.rnn = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers) # 1 layer, batch_size = False
        self.scoring = nn.Linear(hidden_size,vocab_size)
        self.stop = stop # stop line character (\n)
    
    def forward(self,seq_list): # list
        batch_size = len(seq_list)
        lens = [len(s) for s in seq_list] # lens of all lines (already sorted)
        bounds = [0]
        for l in lens:
            bounds.append(bounds[-1]+l) # bounds of all lines in the concatenated sequence. Indexing into the list to 
                                        # see where the sequence occurs. Need this at line marked **
        seq_concat = torch.cat(seq_list) # concatenated sequence
        embed_concat = self.embedding(seq_concat) # concatenated embeddings
        embed_list = [embed_concat[bounds[i]:bounds[i+1]] for i in range(batch_size)] # embeddings per line **
        packed_input = rnn.pack_sequence(embed_list) # packed version
        
        # alternatively, you could use rnn.pad_sequence, followed by rnn.pack_padded_sequence
        
        
        
        hidden = None
        output_packed,hidden = self.rnn(packed_input,hidden)
        output_padded, _ = rnn.pad_packed_sequence(output_packed) # unpacked output (padded). Also gives you the lengths
        output_flatten = torch.cat([output_padded[:lens[i],i] for i in range(batch_size)]) # concatenated output
        scores_flatten = self.scoring(output_flatten) # concatenated logits
        return scores_flatten # return concatenated logits
    
    def generate(self,seq, n_words): # L x V
        generated_words = []
        embed = self.embedding(seq).unsqueeze(1) # L x 1 x E
        hidden = None
        output_lstm, hidden = self.rnn(embed,hidden) # L x 1 x H
        output = output_lstm[-1] # 1 x H
        scores = self.scoring(output) # 1 x V
        _,current_word = torch.max(scores,dim=1) # 1 x 1
        generated_words.append(current_word)
        if n_words > 1:
            for i in range(n_words-1):
                embed = self.embedding(current_word).unsqueeze(0) # 1 x 1 x E
                output_lstm, hidden = self.rnn(embed,hidden) # 1 x 1 x H
                output = output_lstm[0] # 1 x H
                scores = self.scoring(output) # V
                _,current_word = torch.max(scores,dim=1) # 1
                generated_words.append(current_word)
                if current_word[0].item()==self.stop: # If end of line
                    break
        return torch.cat(generated_words,dim=0)

In [22]:
def train_epoch_packed(model, optimizer, train_loader, val_loader):
    criterion = nn.CrossEntropyLoss(reduction="sum") # sum instead of averaging, to take into account the different lengths
    criterion = criterion.to(DEVICE)
    batch_id=0
    before = time.time()
    print("Training", len(train_loader), "number of batches")
    for inputs,targets in train_loader: # lists, presorted, preloaded on GPU
        batch_id+=1
        outputs = model(inputs)
        loss = criterion(outputs,torch.cat(targets)) # criterion of the concatenated output
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch_id % 100 == 0:
            after = time.time()
            nwords = np.sum(np.array([len(l) for l in inputs]))
            lpw = loss.item() / nwords
            print("Time elapsed: ", after - before)
            print("At batch",batch_id)
            print("Training loss per word:",lpw)
            print("Training perplexity :",np.exp(lpw))
            before = after
    
    val_loss = 0
    batch_id=0
    nwords = 0
    for inputs,targets in val_loader:
        nwords += np.sum(np.array([len(l) for l in inputs]))
        batch_id+=1
        outputs = model(inputs)
        loss = criterion(outputs,torch.cat(targets))
        val_loss+=loss.item()
    val_lpw = val_loss / nwords
    print("\nValidation loss per word:",val_lpw)
    print("Validation perplexity :",np.exp(val_lpw),"\n")
    return val_lpw

In [23]:
model = PackedLanguageModel(charcount,256,256,3, stop=stop_character)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-6)
split = 100000
train_dataset = LinesDataset(shakespeare_lines[:split])
val_dataset = LinesDataset(shakespeare_lines[split:])
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, collate_fn = collate_lines)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64, collate_fn = collate_lines, drop_last=True)

In [24]:
for i in range(20):
    train_epoch_packed(model, optimizer, train_loader, val_loader)

Training 1563 number of batches
Time elapsed:  3.7703583240509033
At batch 100
Training loss per word: 2.7013041346193503
Training perplexity : 14.899149557123419
Time elapsed:  3.691027879714966
At batch 200
Training loss per word: 2.197311673753789
Training perplexity : 9.000783901895156
Time elapsed:  3.6885509490966797
At batch 300
Training loss per word: 1.9548370171777951
Training perplexity : 7.062767820058658
Time elapsed:  3.6700551509857178
At batch 400
Training loss per word: 1.7970468966562716
Training perplexity : 6.03180858325135
Time elapsed:  3.7132349014282227
At batch 500
Training loss per word: 1.8254557599342311
Training perplexity : 6.205622648866053
Time elapsed:  3.7091219425201416
At batch 600
Training loss per word: 1.8306482488458806
Training perplexity : 6.237929078461925
Time elapsed:  3.708371639251709
At batch 700
Training loss per word: 1.7147244232747705
Training perplexity : 5.555144432895182
Time elapsed:  3.713968515396118
At batch 800
Training loss p

In [25]:
torch.save(model, "trained_model.pt")

In [30]:
print(generate(model, "To be, or not to be, that is the q",20))

uarrel



In [27]:
print(generate(model, "Richard ", 1000))

Scotland



In [28]:
print(generate(model, "Hello", 1000))

wear



### Reminders

By default, for all rnn modules (rnn, GRU, LSTM) batch_first = False
To use packed sequences, your inputs first need to be sorted in descending order of length (longest to shortest)
Batches need to have inputs of the same length 