class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_seq_len=80):
        super().__init__()
        self.d_model = d_model

        # create constant 'pe' matrix with values dependent on pos and i
        pe = torch.zeros(max_seq_len, d_model)
        for pos in range(max_seq_len):
            for i in range(0, d_model, 2):
                pe[pos, i] = \
                    math.sin(pos / (10000 ** ((2 * i) / d_model)))
                pe[pos, i + 1] = \
                    math.cos(pos / (10000 ** ((2 * (i + 1)) / d_model)))
        pe = pe.unsqueeze(0)
        # registered as a buffer so it moves with the model (e.g. to the GPU)
        # but is never updated during training
        self.register_buffer('pe', pe)

    def forward(self, x):
        # make embeddings relatively larger so they are not drowned out
        # by the positional encoding
        x = x * math.sqrt(self.d_model)
        # add the constant positional encoding to the embedding
        seq_len = x.size(1)
        x = x + self.pe[:, :seq_len]
        return x
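Since the positional encoder only scales the embeddings and adds a constant matrix, a quick shape check confirms it leaves the tensor dimensions untouched (this snippet is illustrative and not part of the original article):

import torch

# illustrative sanity check: a batch of 2 sequences, 10 tokens each,
# already embedded into d_model = 512 dimensions
pe = PositionalEncoder(d_model=512)
dummy_embeddings = torch.rand(2, 10, 512)
print(pe(dummy_embeddings).shape)  # torch.Size([2, 10, 512]) - shape unchanged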
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()
        # we set d_ff to a default of 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x
One last variable: if you look at the diagram closely, you can see an "Nx" next to the encoder and the decoder. In reality, the encoder and decoder in the diagram each represent a single encoder layer and a single decoder layer; N is a variable for the number of layers. If N = 6, for example, the data passes through six encoder layers (each with the structure shown above), and these outputs are then passed to the decoder, which likewise consists of six stacked decoder layers.
We will now build EncoderLayer and DecoderLayer modules with the structure shown in the model above. Later, when we build the full encoder and decoder, we can decide how many of these layers to stack.
# build an encoder layer with one multi-head attention layer and one
# feed-forward layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x

# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)

    def forward(self, x, e_outputs, src_mask, trg_mask):
        # self-attention over the target sequence (masked so it cannot peek ahead)
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        # attention over the encoder outputs
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x = x + self.dropout_3(self.ff(x2))
        return x

# we can then build a convenient cloning function that can generate multiple layers:
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
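Both layer classes rely on the Norm and MultiHeadAttention modules built earlier in the article. As a reminder of the interface they expect, here is a minimal layer-normalisation sketch along the same lines (the alpha and bias parameter names are illustrative, not necessarily the article's exact code):

import torch
import torch.nn as nn

class Norm(nn.Module):
    def __init__(self, d_model, eps=1e-6):
        super().__init__()
        self.size = d_model
        # two learnable parameters to rescale and re-centre the normalised values
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps

    def forward(self, x):
        # normalise over the last (feature) dimension
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
               / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm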
We can now build the encoder and decoder:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)

    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)

    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)
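Both Encoder and Decoder also reuse the Embedder module defined earlier in the article, which is essentially a thin wrapper around nn.Embedding. For completeness, a minimal sketch consistent with how it is called here (not necessarily the article's exact code):

import torch.nn as nn

class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        # maps each vocabulary index to a learned d_model-dimensional vector
        self.embed = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embed(x)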
Now we can put the whole Transformer model together!
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)

    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output

# we don't perform softmax on the output as this will be handled
# automatically by our loss function
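That last comment deserves a short aside: F.cross_entropy applies log-softmax and the negative log-likelihood loss internally, so the model should return raw logits. A tiny illustrative check (the numbers are arbitrary):

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 0.5, -1.0]])   # raw scores for a 3-word vocabulary
target = torch.tensor([0])                   # index of the correct word

# cross_entropy == negative log-softmax probability of the target class
manual = -F.log_softmax(logits, dim=-1)[0, target[0]]
print(torch.isclose(F.cross_entropy(logits, target), manual))  # tensor(True)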
Training the model
With the Transformer built, all that remains is to train it on the EuroParl dataset. The coding part is quite painless, but be prepared to wait around two days for the model to start converging!
Let's first define some parameters:
d_model = 512
heads = 8
N = 6
src_vocab = len(EN_TEXT.vocab)
trg_vocab = len(FR_TEXT.vocab)

model = Transformer(src_vocab, trg_vocab, d_model, N, heads)

for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# this code is very important! It initialises the parameters with a
# range of values that stops the signal fading or getting too big.
# See this blog for a mathematical explanation.

optim = torch.optim.Adam(model.parameters(), lr=0.0001,
                         betas=(0.9, 0.98), eps=1e-9)
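The training loop below also calls the create_masks helper and uses the input_pad / target_pad padding indices built earlier in the article from EN_TEXT and FR_TEXT. As a reminder, a minimal reconstruction consistent with how it is called below could look like this (a sketch, not necessarily the article's exact code):

import numpy as np
import torch

def create_masks(src, trg_input):
    # source mask: hide padding tokens from the attention
    # (input_pad / target_pad come from the vocab setup earlier in the article)
    src_mask = (src != input_pad).unsqueeze(-2)

    # target mask: hide padding tokens AND future tokens ("no peek ahead")
    trg_mask = (trg_input != target_pad).unsqueeze(-2)
    size = trg_input.size(1)
    nopeak_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    nopeak_mask = (torch.from_numpy(nopeak_mask) == 0).to(trg_input.device)
    trg_mask = trg_mask & nopeak_mask

    return src_mask, trg_mask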
Now we can start training:
def train_model(epochs, print_every=100):
    model.train()

    start = time.time()
    temp = start
    total_loss = 0

    for epoch in range(epochs):
        for i, batch in enumerate(train_iter):
            src = batch.English.transpose(0, 1)
            trg = batch.French.transpose(0, 1)

            # the French sentence we input has all words except
            # the last, as it is using each word to predict the next
            trg_input = trg[:, :-1]

            # the words we are trying to predict
            targets = trg[:, 1:].contiguous().view(-1)

            # create the masks using the mask code above
            src_mask, trg_mask = create_masks(src, trg_input)

            preds = model(src, trg_input, src_mask, trg_mask)

            optim.zero_grad()
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)),
                                   targets, ignore_index=target_pad)
            loss.backward()
            optim.step()

            total_loss += loss.item()
            if (i + 1) % print_every == 0:
                loss_avg = total_loss / print_every
                print("time = %dm, epoch %d, iter = %d, loss = %.3f, %ds per %d iters"
                      % ((time.time() - start) // 60, epoch + 1, i + 1,
                         loss_avg, time.time() - temp, print_every))
                total_loss = 0
                temp = time.time()
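Training then starts with a single call; the values below are only an example, and as noted above, convergence on EuroParl takes on the order of days:

# illustrative invocation; the epoch count is arbitrary
train_model(epochs=10, print_every=100)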
Example training output: after a few days of training, the model's loss converged to roughly 1.3.
Testing the model
We can use the function below to translate sentences, either feeding it sentences from our dataset directly or passing in a custom string of our own.
The translator works by running a loop. We first encode the English sentence, then feed the decoder the <sos> token together with the encoder outputs. The decoder predicts the first word, which we append to the decoder input after the <sos> token. We then re-run the loop to get the next word prediction, appending it to the decoder input each time, until the decoder produces the <eos> token, which marks the end of the translation.
def translate(model, src, max_len=80, custom_string=False):
    model.eval()

    if custom_string:
        # tokenize the raw English string and convert it to a tensor of vocab indices
        src = tokenize_en(src)
        src = torch.LongTensor([[EN_TEXT.vocab.stoi[tok]
                                 for tok in src]]).cuda()

    src_mask = (src != input_pad).unsqueeze(-2)
    e_outputs = model.encoder(src, src_mask)

    outputs = torch.zeros(max_len).type_as(src.data)
    outputs[0] = torch.LongTensor([FR_TEXT.vocab.stoi['<sos>']])

    for i in range(1, max_len):
        # no-peek mask for the tokens generated so far
        trg_mask = np.triu(np.ones((1, i, i)), k=1).astype('uint8')
        trg_mask = (torch.from_numpy(trg_mask) == 0).cuda()

        out = model.out(model.decoder(outputs[:i].unsqueeze(0),
                                      e_outputs, src_mask, trg_mask))
        out = F.softmax(out, dim=-1)
        val, ix = out[:, -1].data.topk(1)

        outputs[i] = ix[0][0]
        if ix[0][0] == FR_TEXT.vocab.stoi['<eos>']:
            break

    return ' '.join([FR_TEXT.vocab.itos[ix] for ix in outputs[:i]])

And that is, roughly, the whole process of building a Transformer model.
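As a final check, translate can be called on a custom English string; the sentence below is only an illustrative example, not output from the article:

# hypothetical example sentence
print(translate(model, "The weather was nice today.", custom_string=True))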