Artificial Intelligence Group Assignment (1)

The earlier introductory material on ELMo is in 菜鸟学NLP(十).
This post mainly walks through the ELMo source code.
[Figure 1: the ELMo architecture]

The ELMo architecture is shown in the figure above.

The workflow AllenNLP gives for computing ELMo:

  1. Prepare input data and a vocabulary file.
  2. Train the biLM.
  3. Test (compute the perplexity of) the biLM on heldout data.
  4. Write out the weights from the trained biLM to an HDF5 file (checkpoint -> hdf5).

One of the core components inside is the BiLSTM.

So let's start the analysis from this BiLSTM.
First, strictly speaking the biLM used here is not a BiLSTM: the forward direction predicts the next word from the forward sequence, the backward direction predicts the previous word from the reversed sequence, and the loss is the sum of the two, whereas a BiLSTM concatenates the forward and backward passes. Second, it is not just an LSTM but a stacked LSTM with connections between layers. A minimal sketch of this objective follows.
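To make the two-direction objective concrete, here is a minimal sketch (not the actual AllenNLP/bilm-tf training code) of a biLM loss as the sum of a forward next-token cross-entropy and a backward previous-token cross-entropy; the tensor names are invented for illustration:

import torch
import torch.nn.functional as F

def bilm_loss(forward_logits: torch.Tensor,
              backward_logits: torch.Tensor,
              token_ids: torch.Tensor) -> torch.Tensor:
    # forward_logits / backward_logits: (batch, seq_len, vocab_size)
    # token_ids: (batch, seq_len) integer token indices
    vocab_size = forward_logits.size(-1)
    # forward LM: the state at position t predicts the token at position t + 1
    forward_loss = F.cross_entropy(forward_logits[:, :-1].reshape(-1, vocab_size),
                                   token_ids[:, 1:].reshape(-1))
    # backward LM: the state at position t predicts the token at position t - 1
    backward_loss = F.cross_entropy(backward_logits[:, 1:].reshape(-1, vocab_size),
                                    token_ids[:, :-1].reshape(-1))
    # in ELMo the token representation and softmax layers are shared between the two
    # directions while the LSTMs are separate; the objective is simply the sum
    return forward_loss + backward_loss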

class ElmoLstm(_EncoderBase):
    def __init__(self,
                 input_size: int,       # dimensionality of the inputs
                 hidden_size: int,      # dimensionality of the (projected) outputs
                 cell_size: int,        # dimensionality of the LSTM memory cell
                 num_layers: int,       # number of stacked bidirectional LSTM layers
                 requires_grad: bool = False,                  # whether gradients flow into these weights (fine-tuning)
                 recurrent_dropout_probability: float = 0.0,   # recurrent dropout rate
                 memory_cell_clip_value: Optional[float] = None,        # clip value for the memory cell
                 state_projection_clip_value: Optional[float] = None) -> None:  # clip value for the projected hidden state
super(ElmoLstm, self).__init__(stateful=True)

# Required to be wrapped with a :class:`PytorchSeq2SeqWrapper`.
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.cell_size = cell_size
self.requires_grad = requires_grad

        forward_layers = []   # layers running left-to-right
        backward_layers = []  # layers running right-to-left

        lstm_input_size = input_size
        go_forward = True
        for layer_index in range(num_layers):  # num_layers is the number of stacked BiLSTM layers
forward_layer = LstmCellWithProjection(lstm_input_size,
hidden_size,
cell_size,
go_forward,
recurrent_dropout_probability,
memory_cell_clip_value,
state_projection_clip_value)
            # LstmCellWithProjection is "an LSTM with recurrent dropout and a projected and clipped
            # hidden state and memory"; it is slower than PyTorch's native LSTM.
backward_layer = LstmCellWithProjection(lstm_input_size,
hidden_size,
cell_size,
not go_forward,
recurrent_dropout_probability,
memory_cell_clip_value,
state_projection_clip_value)

            lstm_input_size = hidden_size  # the next stacked layer takes this layer's projected output as input

            self.add_module('forward_layer_{}'.format(layer_index), forward_layer)    # register the sub-module
            self.add_module('backward_layer_{}'.format(layer_index), backward_layer)  # register the sub-module
forward_layers.append(forward_layer)
backward_layers.append(backward_layer)
self.forward_layers = forward_layers
self.backward_layers = backward_layers

    def forward(self,  # pylint: disable=arguments-differ
                inputs: torch.Tensor,       # shape (batch_size, sequence_length, hidden_size)
                mask: torch.LongTensor) -> torch.Tensor:
        # `mask` is a binary tensor of shape (batch_size, sequence_length)
        # marking which positions of each sequence are real tokens rather than padding.
        batch_size, total_sequence_length = mask.size()
        stacked_sequence_output, final_states, restoration_indices = \
            self.sort_and_run_forward(self._lstm_forward, inputs, mask)

        num_layers, num_valid, returned_timesteps, encoder_dim = stacked_sequence_output.size()  # (num_layers, num_valid, sequence_length, dim)

# Add back invalid rows which were removed in the call to sort_and_run_forward.
if num_valid < batch_size:
zeros = stacked_sequence_output.new_zeros(num_layers,
batch_size - num_valid,
returned_timesteps,
encoder_dim)

            stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 1)  # pad the batch dimension back up with zeros

# The states also need to have invalid rows added back.
            new_states = []  # pad each final state the same way
for state in final_states:
state_dim = state.size(-1)
zeros = state.new_zeros(num_layers, batch_size - num_valid, state_dim)
new_states.append(torch.cat([state, zeros], 1))
final_states = new_states

# It's possible to need to pass sequences which are padded to longer than the
# max length of the sequence to a Seq2StackEncoder. However, packing and unpacking
# the sequences mean that the returned tensor won't include these dimensions, because
# the RNN did not need to process them. We add them back on in the form of zeros here.
sequence_length_difference = total_sequence_length - returned_timesteps
if sequence_length_difference > 0:
zeros = stacked_sequence_output.new_zeros(num_layers,
batch_size,
sequence_length_difference,
stacked_sequence_output[0].size(-1))
            stacked_sequence_output = torch.cat([stacked_sequence_output, zeros], 2)  # concatenate the zeros along the time dimension
self._update_states(final_states, restoration_indices)

# Restore the original indices and return the sequence.
# Has shape (num_layers, batch_size, sequence_length, hidden_size)
return stacked_sequence_output.index_select(1, restoration_indices)
        # returns a tensor of shape (num_layers, batch_size, sequence_length, hidden_dim)

    def _lstm_forward(self,
                      inputs: PackedSequence,  # a batch-first PackedSequence
                      initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \
            Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        # `initial_state`, if given, is a (hidden, cell) pair covering every layer

        if initial_state is None:  # no initial state was provided
            hidden_states: List[Optional[Tuple[torch.Tensor,
                                                torch.Tensor]]] = [None] * len(self.forward_layers)
        elif initial_state[0].size()[0] != len(self.forward_layers):  # number of initial states does not match the number of layers
            raise ConfigurationError("Initial states were passed to forward() but the number of "
                                     "initial states does not match the number of layers.")
        else:
            hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0)))
            # split h and c along dimension 0 with chunk size 1, giving one (h, c) slice per layer
inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
forward_output_sequence = inputs
backward_output_sequence = inputs

final_states = []
sequence_outputs = []
        for layer_index, state in enumerate(hidden_states):  # run layer by layer; `state` is this layer's (h, c)
forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index))
backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index))

forward_cache = forward_output_sequence
backward_cache = backward_output_sequence

if state is not None:
                forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2)  # split h into forward / backward halves
                forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2)    # split c into forward / backward halves
forward_state = (forward_hidden_state, forward_memory_state)
backward_state = (backward_hidden_state, backward_memory_state)
else:
forward_state = None
backward_state = None

            # forward pass through this layer
            forward_output_sequence, forward_state = forward_layer(forward_output_sequence,
                                                                   batch_lengths,
                                                                   forward_state)
            # backward pass through this layer
            backward_output_sequence, backward_state = backward_layer(backward_output_sequence,
                                                                      batch_lengths,
                                                                      backward_state)
            if layer_index != 0:  # skip connections: add the layer's input to its output
forward_output_sequence += forward_cache
backward_output_sequence += backward_cache

            # concatenate this layer's forward and backward outputs
            sequence_outputs.append(torch.cat([forward_output_sequence,
                                               backward_output_sequence], -1))

            # keep every layer's final (h, c) in a list
            final_states.append((torch.cat([forward_state[0], backward_state[0]], -1),
                                 torch.cat([forward_state[1], backward_state[1]], -1)))

stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)

        final_hidden_states, final_memory_states = zip(*final_states)  # separate the h's and the c's
        final_state_tuple: Tuple[torch.FloatTensor,
                                 torch.FloatTensor] = (torch.cat(final_hidden_states, 0),
                                                       torch.cat(final_memory_states, 0))
        # the final state is an (h, c) tuple, where h concatenates every layer's h and c concatenates every layer's c
        return stacked_sequence_outputs, final_state_tuple
        # stacked_sequence_outputs has shape (num_layers, batch_size, sequence_length, 2 * hidden_size)

    def load_weights(self, weight_file: str) -> None:
        # load the pretrained biLM weights from the hdf5 weight file
        requires_grad = self.requires_grad  # decides whether the loaded weights will be fine-tuned

with h5py.File(cached_path(weight_file), 'r') as fin:
for i_layer, lstms in enumerate(
zip(self.forward_layers, self.backward_layers)
):
for j_direction, lstm in enumerate(lstms):
# lstm is an instance of LSTMCellWithProjection
cell_size = lstm.cell_size

dataset = fin['RNN_%s' % j_direction]['RNN']['MultiRNNCell']['Cell%s' % i_layer
]['LSTMCell']

# tensorflow packs together both W and U matrices into one matrix,
# but pytorch maintains individual matrices. In addition, tensorflow
# packs the gates as input, memory, forget, output but pytorch
# uses input, forget, memory, output. So we need to modify the weights.
tf_weights = numpy.transpose(dataset['W_0'][...])
torch_weights = tf_weights.copy()

# split the W from U matrices
input_size = lstm.input_size
input_weights = torch_weights[:, :input_size]
recurrent_weights = torch_weights[:, input_size:]
tf_input_weights = tf_weights[:, :input_size]
tf_recurrent_weights = tf_weights[:, input_size:]

# handle the different gate order convention
for torch_w, tf_w in [[input_weights, tf_input_weights],
[recurrent_weights, tf_recurrent_weights]]:
torch_w[(1 * cell_size):(2 * cell_size), :] = tf_w[(2 * cell_size):(3 * cell_size), :]
torch_w[(2 * cell_size):(3 * cell_size), :] = tf_w[(1 * cell_size):(2 * cell_size), :]

lstm.input_linearity.weight.data.copy_(torch.FloatTensor(input_weights))
lstm.state_linearity.weight.data.copy_(torch.FloatTensor(recurrent_weights))
lstm.input_linearity.weight.requires_grad = requires_grad
lstm.state_linearity.weight.requires_grad = requires_grad

# the bias weights
tf_bias = dataset['B'][...]
# tensorflow adds 1.0 to forget gate bias instead of modifying the
# parameters...
tf_bias[(2 * cell_size):(3 * cell_size)] += 1
torch_bias = tf_bias.copy()
torch_bias[(1 * cell_size):(2 * cell_size)
] = tf_bias[(2 * cell_size):(3 * cell_size)]
torch_bias[(2 * cell_size):(3 * cell_size)
] = tf_bias[(1 * cell_size):(2 * cell_size)]
lstm.state_linearity.bias.data.copy_(torch.FloatTensor(torch_bias))
lstm.state_linearity.bias.requires_grad = requires_grad

# the projection weights
proj_weights = numpy.transpose(dataset['W_P_0'][...])
lstm.state_projection.weight.data.copy_(torch.FloatTensor(proj_weights))
lstm.state_projection.weight.requires_grad = requires_grad

The BiLSTM above has a structure similar to a native BiLSTM; the differences are that it is built by stacking several bidirectional LSTM layers and that it adds skip connections between layers. A small usage sketch follows.
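To make the interface and shapes concrete, here is a minimal sketch of driving ElmoLstm directly on random data (it assumes the AllenNLP dependencies used above are importable; all sizes are invented for illustration):

import torch

lstm = ElmoLstm(input_size=32, hidden_size=32, cell_size=64, num_layers=2)

inputs = torch.randn(4, 10, 32)              # (batch_size, sequence_length, input_size)
mask = torch.ones(4, 10, dtype=torch.long)   # 1 for real tokens, 0 for padding
mask[3, 7:] = 0                              # e.g. the last sequence has only 7 real tokens

outputs = lstm(inputs, mask)
# expected shape: (num_layers, batch_size, sequence_length, 2 * hidden_size) = (2, 4, 10, 64)
print(outputs.shape)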

With ELMo's basic BiLSTM building block in hand, let's look at the complete ELMo model.

class Elmo(torch.nn.Module):

def __init__(self,
                 options_file: str,                # JSON options file
                 weight_file: str,                 # hdf5 weight file
                 num_output_representations: int,  # number of output representations (one ScalarMix each)
                 requires_grad: bool = False,
                 do_layer_norm: bool = False,      # whether ScalarMix applies layer normalization
                 dropout: float = 0.5,             # dropout rate
                 vocab_to_cache: List[str] = None,
                 keep_sentence_boundaries: bool = False,  # if True, the sentence boundary tokens are kept in the output
                 scalar_mix_parameters: List[float] = None,
module: torch.nn.Module = None) -> None:
super(Elmo, self).__init__()

        logger.info("Initializing ELMo")  # start initializing the ELMo modules (biLM and scalar mixes)
if module is not None:
if options_file is not None or weight_file is not None:
raise ConfigurationError(
"Don't provide options_file or weight_file with module")
self._elmo_lstm = module
else:
self._elmo_lstm = _ElmoBiLm(options_file,
weight_file,
requires_grad=requires_grad,
vocab_to_cache=vocab_to_cache)
self._has_cached_vocab = vocab_to_cache is not None
self._keep_sentence_boundaries = keep_sentence_boundaries
self._dropout = Dropout(p=dropout)
self._scalar_mixes: Any = []
for k in range(num_output_representations):
scalar_mix = ScalarMix(
self._elmo_lstm.num_layers,
do_layer_norm=do_layer_norm,
initial_scalar_parameters=scalar_mix_parameters,
trainable=scalar_mix_parameters is None)
self.add_module('scalar_mix_{}'.format(k), scalar_mix)
self._scalar_mixes.append(scalar_mix)

    def get_output_dim(self):  # dimensionality of the returned ELMo representations
return self._elmo_lstm.get_output_dim()

def forward(self, # pylint: disable=arguments-differ
inputs: torch.Tensor,
word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:
"""
Parameters
----------
inputs: ``torch.Tensor``, required.
Shape ``(batch_size, timesteps, 50)`` of character ids representing the current batch.
word_inputs : ``torch.Tensor``, required.
If you passed a cached vocab, you can in addition pass a tensor of shape
``(batch_size, timesteps)``, which represent word ids which have been pre-cached.
Returns
-------
Dict with keys:
``'elmo_representations'``: ``List[torch.Tensor]``
A ``num_output_representations`` list of ELMo representations for the input sequence.
Each representation is shape ``(batch_size, timesteps, embedding_dim)``
``'mask'``: ``torch.Tensor``
Shape ``(batch_size, timesteps)`` long tensor with sequence mask.
"""
        # reshape the input if needed
        # `inputs` holds character ids
        original_shape = inputs.size()  # the original input shape
        if len(original_shape) > 3:
            timesteps, num_characters = original_shape[-2:]  # the last two dimensions are sequence_length and num_characters
            reshaped_inputs = inputs.view(-1, timesteps, num_characters)  # collapse the leading dimensions so the tensor is 3-D
else:
reshaped_inputs = inputs
        # `word_inputs` holds pre-cached word ids
        if word_inputs is not None:  # word-level inputs were provided
            original_word_size = word_inputs.size()  # the original word_inputs shape
            if self._has_cached_vocab and len(original_word_size) > 2:
                reshaped_word_inputs = word_inputs.view(-1, original_word_size[-1])
            elif not self._has_cached_vocab:  # no cached vocabulary is available
                logger.warning("Word inputs were passed to ELMo but it does not have a cached vocab.")
                reshaped_word_inputs = None
            else:
                reshaped_word_inputs = word_inputs
        else:  # word_inputs is None
            reshaped_word_inputs = word_inputs

# run the biLM
bilm_output = self._elmo_lstm(reshaped_inputs, reshaped_word_inputs)
layer_activations = bilm_output['activations']
mask_with_bos_eos = bilm_output['mask']

# compute the elmo representations
representations = []
for i in range(len(self._scalar_mixes)):
scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
representation_with_bos_eos = scalar_mix(layer_activations, mask_with_bos_eos)
if self._keep_sentence_boundaries:
                processed_representation = representation_with_bos_eos  # keep the sentence boundary tokens
                processed_mask = mask_with_bos_eos
            else:
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)  # strip the <S> / </S> positions
                processed_representation = representation_without_bos_eos
                processed_mask = mask_without_bos_eos
            representations.append(self._dropout(processed_representation))  # apply dropout

# reshape if necessary
if word_inputs is not None and len(original_word_size) > 2:
mask = processed_mask.view(original_word_size)
elmo_representations = [representation.view(original_word_size + (-1, ))
for representation in representations]
elif len(original_shape) > 3:
mask = processed_mask.view(original_shape[:-1])
elmo_representations = [representation.view(original_shape[:-1] + (-1, ))
for representation in representations]
else:
mask = processed_mask
elmo_representations = representations

return {'elmo_representations': elmo_representations, 'mask': mask}

# The add_to_archive logic here requires a custom from_params.
@classmethod
def from_params(cls, params: Params) -> 'Elmo':
# Add files to archive
params.add_file_to_archive('options_file')
params.add_file_to_archive('weight_file')

options_file = params.pop('options_file')
weight_file = params.pop('weight_file')
requires_grad = params.pop('requires_grad', False)
num_output_representations = params.pop('num_output_representations')
do_layer_norm = params.pop_bool('do_layer_norm', False)
keep_sentence_boundaries = params.pop_bool('keep_sentence_boundaries', False)
dropout = params.pop_float('dropout', 0.5)
scalar_mix_parameters = params.pop('scalar_mix_parameters', None)
params.assert_empty(cls.__name__)

return cls(options_file=options_file,
weight_file=weight_file,
num_output_representations=num_output_representations,
requires_grad=requires_grad,
do_layer_norm=do_layer_norm,
keep_sentence_boundaries=keep_sentence_boundaries,
dropout=dropout,
scalar_mix_parameters=scalar_mix_parameters)


def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:  # converts tokenized sentences into a tensor of character ids
"""
Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
(len(batch), max sentence length, max word length).
Parameters
----------
batch : ``List[List[str]]``, required
A list of tokenized sentences.
Returns
-------
A tensor of padded character ids.
"""
instances = []
    indexer = ELMoTokenCharactersIndexer()  # maps every token to its character ids
for sentence in batch:
tokens = [Token(token) for token in sentence]
field = TextField(tokens,
{'character_ids': indexer})
instance = Instance({"elmo": field})
instances.append(instance)

dataset = Batch(instances)
vocab = Vocabulary()
dataset.index_instances(vocab)
return dataset.as_tensor_dict()['elmo']['character_ids']


class _ElmoCharacterEncoder(torch.nn.Module):  # computes context-insensitive token representations using the pretrained biLM's char CNN
"""
Compute context insensitive token representation using pretrained biLM.
This embedder has input character ids of size (batch_size, sequence_length, 50)
and returns (batch_size, sequence_length + 2, embedding_dim), where embedding_dim
is specified in the options file (typically 512).
We add special entries at the beginning and end of each sequence corresponding
to <S> and </S>, the beginning and end of sentence tokens.
Note: this is a lower level class useful for advanced usage. Most users should
use ``ElmoTokenEmbedder`` or ``allennlp.modules.Elmo`` instead.
Parameters
----------
options_file : ``str``
ELMo JSON options file
weight_file : ``str``
ELMo hdf5 weight file
requires_grad: ``bool``, optional, (default = False).
If True, compute gradient of ELMo parameters for fine tuning.
The relevant section of the options file is something like:
.. example-code::
.. code-block:: python
{'char_cnn': {
'activation': 'relu',
'embedding': {'dim': 4},
'filters': [[1, 4], [2, 8], [3, 16], [4, 32], [5, 64]],
'max_characters_per_token': 50,
'n_characters': 262,
'n_highway': 2
}
}
"""
def __init__(self,
                 options_file: str,  # ELMo JSON options file
                 weight_file: str,   # ELMo hdf5 weight file
requires_grad: bool = False) -> None:
super(_ElmoCharacterEncoder, self).__init__()

with open(cached_path(options_file), 'r') as fin:
self._options = json.load(fin)
self._weight_file = weight_file

self.output_dim = self._options['lstm']['projection_dim']
self.requires_grad = requires_grad

self._load_weights()

# Cache the arrays for use in forward -- +1 due to masking.
self._beginning_of_sentence_characters = torch.from_numpy(
numpy.array(ELMoCharacterMapper.beginning_of_sentence_characters) + 1
)
self._end_of_sentence_characters = torch.from_numpy(
numpy.array(ELMoCharacterMapper.end_of_sentence_characters) + 1
)

def get_output_dim(self):
return self.output_dim

@overrides
def forward(self, inputs: torch.Tensor) -> Dict[str, torch.Tensor]: # pylint: disable=arguments-differ
"""
Compute context insensitive token embeddings for ELMo representations.
Parameters
----------
inputs: ``torch.Tensor``
Shape ``(batch_size, sequence_length, 50)`` of character ids representing the
current batch.
Returns
-------
Dict with keys:
``'token_embedding'``: ``torch.Tensor``
Shape ``(batch_size, sequence_length + 2, embedding_dim)`` tensor with context
insensitive token representations.
``'mask'``: ``torch.Tensor``
Shape ``(batch_size, sequence_length + 2)`` long tensor with sequence mask.
"""
        # add the <S> and </S> (BOS / EOS) boundary tokens
mask = ((inputs > 0).long().sum(dim=-1) > 0).long()
character_ids_with_bos_eos, mask_with_bos_eos = add_sentence_boundary_token_ids(
inputs,
mask,
self._beginning_of_sentence_characters,
self._end_of_sentence_characters
)

# the character id embedding
max_chars_per_token = self._options['char_cnn']['max_characters_per_token']
# (batch_size * sequence_length, max_chars_per_token, embed_dim)
character_embedding = torch.nn.functional.embedding(
character_ids_with_bos_eos.view(-1, max_chars_per_token),
self._char_embedding_weights
        )  # look up the character embeddings

# run convolutions
cnn_options = self._options['char_cnn']
if cnn_options['activation'] == 'tanh':
activation = torch.tanh
elif cnn_options['activation'] == 'relu':
activation = torch.nn.functional.relu
else:
raise ConfigurationError("Unknown activation")

# (batch_size * sequence_length, embed_dim, max_chars_per_token)
character_embedding = torch.transpose(character_embedding, 1, 2)
convs = []
for i in range(len(self._convolutions)):
conv = getattr(self, 'char_conv_{}'.format(i))
convolved = conv(character_embedding)
# (batch_size * sequence_length, n_filters for this width)
convolved, _ = torch.max(convolved, dim=-1)
convolved = activation(convolved)
convs.append(convolved)

# (batch_size * sequence_length, n_filters)
token_embedding = torch.cat(convs, dim=-1)

# apply the highway layers (batch_size * sequence_length, n_filters)
token_embedding = self._highways(token_embedding)

# final projection (batch_size * sequence_length, embedding_dim)
token_embedding = self._projection(token_embedding)

# reshape to (batch_size, sequence_length, embedding_dim)
batch_size, sequence_length, _ = character_ids_with_bos_eos.size()

return {
'mask': mask_with_bos_eos,
'token_embedding': token_embedding.view(batch_size, sequence_length, -1)
}

def _load_weights(self):
self._load_char_embedding()
self._load_cnn_weights()
self._load_highway()
self._load_projection()

    def _load_char_embedding(self):  # load the character embedding table
with h5py.File(cached_path(self._weight_file), 'r') as fin:
char_embed_weights = fin['char_embed'][...]

weights = numpy.zeros(
(char_embed_weights.shape[0] + 1, char_embed_weights.shape[1]),
dtype='float32'
)
weights[1:, :] = char_embed_weights

self._char_embedding_weights = torch.nn.Parameter(
torch.FloatTensor(weights), requires_grad=self.requires_grad
)

    def _load_cnn_weights(self):  # load the character CNN weights
cnn_options = self._options['char_cnn']
filters = cnn_options['filters']
char_embed_dim = cnn_options['embedding']['dim']

convolutions = []
for i, (width, num) in enumerate(filters):
conv = torch.nn.Conv1d(
in_channels=char_embed_dim,
out_channels=num,
kernel_size=width,
bias=True
)
# load the weights
with h5py.File(cached_path(self._weight_file), 'r') as fin:
weight = fin['CNN']['W_cnn_{}'.format(i)][...]
bias = fin['CNN']['b_cnn_{}'.format(i)][...]

w_reshaped = numpy.transpose(weight.squeeze(axis=0), axes=(2, 1, 0))
if w_reshaped.shape != tuple(conv.weight.data.shape):
raise ValueError("Invalid weight file")
conv.weight.data.copy_(torch.FloatTensor(w_reshaped))
conv.bias.data.copy_(torch.FloatTensor(bias))

conv.weight.requires_grad = self.requires_grad
conv.bias.requires_grad = self.requires_grad

convolutions.append(conv)
self.add_module('char_conv_{}'.format(i), conv)

self._convolutions = convolutions

    def _load_highway(self):  # load the highway-layer weights
# pylint: disable=protected-access
# the highway layers have same dimensionality as the number of cnn filters
cnn_options = self._options['char_cnn']
filters = cnn_options['filters']
n_filters = sum(f[1] for f in filters)
n_highway = cnn_options['n_highway']

# create the layers, and load the weights
self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
for k in range(n_highway):
# The AllenNLP highway is one matrix multplication with concatenation of
# transform and carry weights.
with h5py.File(cached_path(self._weight_file), 'r') as fin:
# The weights are transposed due to multiplication order assumptions in tf
# vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
w_transform = numpy.transpose(fin['CNN_high_{}'.format(k)]['W_transform'][...])
# -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
w_carry = -1.0 * numpy.transpose(fin['CNN_high_{}'.format(k)]['W_carry'][...])
weight = numpy.concatenate([w_transform, w_carry], axis=0)
self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
self._highways._layers[k].weight.requires_grad = self.requires_grad

b_transform = fin['CNN_high_{}'.format(k)]['b_transform'][...]
b_carry = -1.0 * fin['CNN_high_{}'.format(k)]['b_carry'][...]
bias = numpy.concatenate([b_transform, b_carry], axis=0)
self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
self._highways._layers[k].bias.requires_grad = self.requires_grad

    def _load_projection(self):  # load the projection (a Linear layer of shape n_filters x output_dim)
cnn_options = self._options['char_cnn']
filters = cnn_options['filters']
n_filters = sum(f[1] for f in filters)

self._projection = torch.nn.Linear(n_filters, self.output_dim, bias=True)
with h5py.File(cached_path(self._weight_file), 'r') as fin:
weight = fin['CNN_proj']['W_proj'][...]
bias = fin['CNN_proj']['b_proj'][...]
self._projection.weight.data.copy_(torch.FloatTensor(numpy.transpose(weight)))
self._projection.bias.data.copy_(torch.FloatTensor(bias))

self._projection.weight.requires_grad = self.requires_grad
self._projection.bias.requires_grad = self.requires_grad


class _ElmoBiLm(torch.nn.Module):

def __init__(self,
                 options_file: str,   # options file
                 weight_file: str,    # weight file
                 requires_grad: bool = False,
                 vocab_to_cache: List[str] = None) -> None:  # words whose char-CNN embeddings should be pre-computed and cached
super(_ElmoBiLm, self).__init__()

self._token_embedder = _ElmoCharacterEncoder(options_file, weight_file, requires_grad=requires_grad)
        # ELMo's character encoder (char CNN + highway layers + projection)
self._requires_grad = requires_grad
if requires_grad and vocab_to_cache:
logging.warning("You are fine tuning ELMo and caching char CNN word vectors. "
"This behaviour is not guaranteed to be well defined, particularly. "
"if not all of your inputs will occur in the vocabulary cache.")
# This is an embedding, used to look up cached
# word vectors built from character level cnn embeddings.
self._word_embedding = None
self._bos_embedding: torch.Tensor = None
self._eos_embedding: torch.Tensor = None
if vocab_to_cache:
logging.info("Caching character cnn layers for words in vocabulary.")
# This sets 3 attributes, _word_embedding, _bos_embedding and _eos_embedding.
# They are set in the method so they can be accessed from outside the
# constructor.
self.create_cached_cnn_embeddings(vocab_to_cache)

with open(cached_path(options_file), 'r') as fin:
options = json.load(fin)
if not options['lstm'].get('use_skip_connections'):
raise ConfigurationError('We only support pretrained biLMs with residual connections')
self._elmo_lstm = ElmoLstm(input_size=options['lstm']['projection_dim'],
hidden_size=options['lstm']['projection_dim'],
cell_size=options['lstm']['dim'],
num_layers=options['lstm']['n_layers'],
memory_cell_clip_value=options['lstm']['cell_clip'],
state_projection_clip_value=options['lstm']['proj_clip'],
requires_grad=requires_grad)
self._elmo_lstm.load_weights(weight_file)
# Number of representation layers including context independent layer
self.num_layers = options['lstm']['n_layers'] + 1

def get_output_dim(self):
return 2 * self._token_embedder.get_output_dim()

def forward(self, # pylint: disable=arguments-differ
inputs: torch.Tensor,
word_inputs: torch.Tensor = None) -> Dict[str, Union[torch.Tensor, List[torch.Tensor]]]:

        # word ids may have been cached ahead of time, in which case the char CNN can be skipped

if self._word_embedding is not None and word_inputs is not None:
try:
mask_without_bos_eos = (word_inputs > 0).long()
# The character cnn part is cached - just look it up.
embedded_inputs = self._word_embedding(word_inputs) # type: ignore
# shape (batch_size, timesteps + 2, embedding_dim)
type_representation, mask = add_sentence_boundary_token_ids(
embedded_inputs,
mask_without_bos_eos,
self._bos_embedding,
self._eos_embedding
)
except RuntimeError:
# Back off to running the character convolutions,
# as we might not have the words in the cache.
token_embedding = self._token_embedder(inputs)
mask = token_embedding['mask']
type_representation = token_embedding['token_embedding']
else:
token_embedding = self._token_embedder(inputs)
mask = token_embedding['mask']
type_representation = token_embedding['token_embedding']
lstm_outputs = self._elmo_lstm(type_representation, mask)

# Prepare the output. The first layer is duplicated.
# Because of minor differences in how masking is applied depending
# on whether the char cnn layers are cached, we'll be defensive and
# multiply by the mask here. It's not strictly necessary, as the
# mask passed on is correct, but the values in the padded areas
# of the char cnn representations can change.
output_tensors = [
torch.cat([type_representation, type_representation], dim=-1) * mask.float().unsqueeze(-1)
]
for layer_activations in torch.chunk(lstm_outputs, lstm_outputs.size(0), dim=0):
output_tensors.append(layer_activations.squeeze(0))

return {
'activations': output_tensors,
'mask': mask,
}

def create_cached_cnn_embeddings(self, tokens: List[str]) -> None:


tokens = [ELMoCharacterMapper.bos_token, ELMoCharacterMapper.eos_token] + tokens
timesteps = 32
batch_size = 32
chunked_tokens = lazy_groups_of(iter(tokens), timesteps)

all_embeddings = []
device = get_device_of(next(self.parameters()))
for batch in lazy_groups_of(chunked_tokens, batch_size):
# Shape (batch_size, timesteps, 50)
batched_tensor = batch_to_ids(batch)
# NOTE: This device check is for when a user calls this method having
# already placed the model on a device. If this is called in the
# constructor, it will probably happen on the CPU. This isn't too bad,
# because it's only a few convolutions and will likely be very fast.
if device >= 0:
batched_tensor = batched_tensor.cuda(device)
output = self._token_embedder(batched_tensor)
token_embedding = output["token_embedding"]
mask = output["mask"]
            token_embedding, _ = remove_sentence_boundaries(token_embedding, mask)  # strip the sentence boundary tokens
all_embeddings.append(token_embedding.view(-1, token_embedding.size(-1)))
full_embedding = torch.cat(all_embeddings, 0)

# We might have some trailing embeddings from padding in the batch, so
# we clip the embedding and lookup to the right size.
full_embedding = full_embedding[:len(tokens), :]
embedding = full_embedding[2:len(tokens), :]
vocab_size, embedding_dim = list(embedding.size())

from allennlp.modules.token_embedders import Embedding # type: ignore
self._bos_embedding = full_embedding[0, :]
self._eos_embedding = full_embedding[1, :]
self._word_embedding = Embedding(vocab_size, # type: ignore
embedding_dim,
weight=embedding.data,
trainable=self._requires_grad,
padding_index=0)

While reading the code above, the ScalarMix part was not obvious, so let's look at that piece of code next.
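ScalarMix implements the layer-weighting step from the ELMo paper (Peters et al., 2018): the L + 1 biLM layers are collapsed into one task-specific vector per token,

    $\mathrm{ELMo}_k^{task} = \gamma^{task} \sum_{j=0}^{L} s_j^{task} \, h_{k,j}^{LM}, \qquad s^{task} = \operatorname{softmax}(w^{task})$

where $h_{k,j}^{LM}$ is layer $j$'s representation of token $k$, the $s_j$ are softmax-normalized scalar weights, and $\gamma$ is a single scalar that rescales the whole vector.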

class ScalarMix(torch.nn.Module):
    # Computes mixture = gamma * sum(s_k * tensor_k), where s = softmax(w); w and gamma are scalar parameters.
    # If "do_layer_norm=True", layer normalization is applied to each input tensor before the weighting.

def __init__(self,
mixture_size: int,
do_layer_norm: bool = False,
initial_scalar_parameters: List[float] = None,
trainable: bool = True) -> None:
super(ScalarMix, self).__init__()
self.mixture_size = mixture_size
self.do_layer_norm = do_layer_norm

if initial_scalar_parameters is None:
initial_scalar_parameters = [0.0] * mixture_size
elif len(initial_scalar_parameters) != mixture_size:
raise ConfigurationError("Length of initial_scalar_parameters {} differs "
"from mixture_size {}".format(
initial_scalar_parameters, mixture_size))

self.scalar_parameters = ParameterList(
[Parameter(torch.FloatTensor([initial_scalar_parameters[i]]),
requires_grad=trainable) for i
                 in range(mixture_size)])  # the scalar weights w
        self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=trainable)  # the global scale gamma

def forward(self, tensors: List[torch.Tensor], # pylint: disable=arguments-differ
                mask: torch.Tensor = None) -> torch.Tensor:
if len(tensors) != self.mixture_size:
raise ConfigurationError("{} tensors were passed, but the module was initialized to "
"mix {} tensors.".format(len(tensors), self.mixture_size))

def _do_layer_norm(tensor, broadcast_mask, num_elements_not_masked):
tensor_masked = tensor * broadcast_mask
mean = torch.sum(tensor_masked) / num_elements_not_masked
variance = torch.sum(((tensor_masked - mean) * broadcast_mask)**2) / num_elements_not_masked
return (tensor - mean) / torch.sqrt(variance + 1E-12)

        normed_weights = torch.nn.functional.softmax(torch.cat([parameter for parameter
                                                                 in self.scalar_parameters]), dim=0)  # s = softmax(w)
normed_weights = torch.split(normed_weights, split_size_or_sections=1)

        if not self.do_layer_norm:  # no layer normalization: just a weighted sum
pieces = []
for weight, tensor in zip(normed_weights, tensors):
pieces.append(weight * tensor)
return self.gamma * sum(pieces)

else:
mask_float = mask.float()
broadcast_mask = mask_float.unsqueeze(-1)
input_dim = tensors[0].size(-1)
num_elements_not_masked = torch.sum(mask_float) * input_dim

pieces = []
for weight, tensor in zip(normed_weights, tensors):
pieces.append(weight * _do_layer_norm(tensor,
broadcast_mask, num_elements_not_masked))
return self.gamma * sum(pieces)
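As a quick sanity check on the interface, here is a minimal sketch of running ScalarMix on its own; the three random tensors stand in for the activations of three biLM layers and all sizes are invented:

import torch

mix = ScalarMix(mixture_size=3)

# three "layers", each of shape (batch_size, timesteps, dim)
layers = [torch.randn(2, 5, 8) for _ in range(3)]

mixed = mix(layers)    # gamma * sum_k softmax(w)_k * layers[k]
print(mixed.shape)     # torch.Size([2, 5, 8])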


Applying ELMo to a downstream task means taking the output of ScalarMix and using it as the embedding for that task.
The whole embedding can of course also be fine-tuned for the downstream task, but in the vast majority of cases it is kept fixed. A minimal usage sketch follows.
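For illustration, this is the usual AllenNLP recipe for obtaining ELMo embeddings for a batch of tokenized sentences (the options/weights paths below are placeholders; point them at the pretrained files you downloaded):

from allennlp.modules.elmo import Elmo, batch_to_ids

# placeholder paths to the pretrained ELMo files
options_file = "elmo_options.json"
weight_file = "elmo_weights.hdf5"

# one ScalarMix per place the embeddings will be used in the downstream model
elmo = Elmo(options_file, weight_file, num_output_representations=2, dropout=0)

sentences = [['First', 'sentence', '.'], ['Another', 'one']]
character_ids = batch_to_ids(sentences)   # (batch_size, max_sentence_length, 50)

result = elmo(character_ids)
# result['elmo_representations']: a list of num_output_representations tensors, each of shape
# (batch_size, max_sentence_length, embedding_dim) -- 1024 for the default pretrained model
# result['mask']: (batch_size, max_sentence_length) marking the non-padded positions
print(result['elmo_representations'][0].shape)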

------------- End of post. Thanks for reading. -------------