Understanding Recurrent Neural Networks

1. A simple recurrent neural network

An RNN processes information incrementally while maintaining an internal model of what it has processed so far; this model is built from past information and is continually updated as new information comes in.
Concretely, an RNN processes a sequence by iterating over its elements while maintaining a state (State) that contains information about everything it has seen so far.

Pseudocode for the RNN:

state_t = 0
for input_t in input_sequence:
    output_t = f(input_t, state_t)
    state_t = output_t

We can make the function f concrete: the transformation from input and state to output is parameterized by two matrices (W and U) and a bias vector. It is analogous to the transformation performed by a densely connected layer in a feedforward network.

state_t = 0
for input_t in input_sequence:
    output_t = activation(dot(W, input_t) + dot(U, state_t) + b)
    state_t = output_t
# NumPy implementation of a simple RNN
import numpy as np

timesteps = 100  # number of timesteps in the input sequence
input_features = 32   # dimensionality of the input feature space
output_features = 64   # dimensionality of the output feature space

inputs = np.random.random((timesteps, input_features))  # input data: random noise, purely for the sake of the example

state_t = np.zeros((output_features,))   # initial state: an all-zeros vector

# create random weight matrices and a bias vector
W = np.random.random((output_features, input_features))
U = np.random.random((output_features, output_features))
b = np.random.random((output_features,))

successive_outputs = []
for input_t in inputs:    # input_t is a vector of shape (input_features,)
    output_t = np.tanh(np.dot(W, input_t) + np.dot(U, state_t) + b)   # combine the input with the current state to obtain the current output

    successive_outputs.append(output_t)  # store this output in a list

    state_t = output_t  # update the state of the network for the next timestep
    
# the final output is a 2D tensor of shape (timesteps, output_features)
final_output_sequence = np.stack(successive_outputs, axis=0)
final_output_sequence
array([[0.99999999, 0.99999995, 0.99999999, ..., 0.99999999, 0.9999997 ,
        0.99999992],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        1.        ]])
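Note that every entry of this output saturates at 1.0. That is an artifact of the unscaled random weights: each pre-activation dot(W, input_t) + dot(U, state_t) + b sums dozens of positive terms, which pushes tanh into its saturated regime. A minimal sketch of the same loop with small, zero-centered weights (the 0.01 scale factor is an arbitrary choice for illustration):

# Same recurrence, but with small zero-centered weights so tanh does not saturate
W = np.random.randn(output_features, input_features) * 0.01  # 0.01 is an arbitrary illustrative scale
U = np.random.randn(output_features, output_features) * 0.01
b = np.zeros((output_features,))

state_t = np.zeros((output_features,))
outputs = []
for input_t in inputs:
    output_t = np.tanh(np.dot(W, input_t) + np.dot(U, state_t) + b)
    outputs.append(output_t)
    state_t = output_t
print(np.abs(np.stack(outputs, axis=0)).max())  # now well below 1.0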

2. Recurrent layers in Keras

The simple NumPy implementation above corresponds to an actual Keras layer: the SimpleRNN layer.

from keras.layers import SimpleRNN

Like all recurrent layers in Keras, SimpleRNN processes batches of sequences rather than a single sequence as in the NumPy example above. It can be run in two different modes: one returns the full sequence of successive outputs for every timestep, a 3D tensor of shape (batch_size, timesteps, output_features); the other returns only the final output for each input sequence, a 2D tensor of shape (batch_size, output_features). These two modes are selected by the return_sequences constructor argument.

from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN

model = Sequential()
model.add(Embedding(10000, 32))
model.add(SimpleRNN(32))  # only return the output of the last timestep
model.summary()
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_2 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 32)                2080      
=================================================================
Total params: 322,080
Trainable params: 322,080
Non-trainable params: 0
_________________________________________________________________
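The parameter counts in this summary can be checked by hand: the Embedding layer learns one 32-dimensional vector for each of the 10,000 tokens, and SimpleRNN learns exactly the W, U and b of the NumPy example above (here the input and output feature dimensions are both 32):

# Verifying the parameter counts from the summary
embedding_params = 10000 * 32           # one 32-d vector per token -> 320,000
rnn_params = 32 * 32 + 32 * 32 + 32     # W (32x32) + U (32x32) + b (32,) -> 2,080
print(embedding_params + rnn_params)    # 322080, matching "Total params"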
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN

model = Sequential()
model.add(Embedding(10000, 32))
model.add(SimpleRNN(32, return_sequences=True))  # return the full sequence of successive outputs
model.summary()
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_3 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
simple_rnn_3 (SimpleRNN)     (None, None, 32)          2080      
=================================================================
Total params: 322,080
Trainable params: 322,080
Non-trainable params: 0
_________________________________________________________________
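With return_sequences=True the layer's output is the full 3D sequence of per-timestep outputs, which you can confirm programmatically as a quick sanity check:

print(model.output_shape)  # (None, None, 32): batch, timesteps, output features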

To increase the representational power of a network, it is sometimes useful to stack several recurrent layers one after the other. In such a setup, you have to get all intermediate layers to return their full sequence of outputs.

model = Sequential()
model.add(Embedding(10000, 32))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32, return_sequences=True))
model.add(SimpleRNN(32))  # the last layer only returns its final output
model.summary()
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_4 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
simple_rnn_4 (SimpleRNN)     (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_5 (SimpleRNN)     (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, None, 32)          2080      
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 32)                2080      
=================================================================
Total params: 328,320
Trainable params: 328,320
Non-trainable params: 0
_________________________________________________________________

3. Applying an RNN to the IMDB dataset

3.1 Preparing the data

from keras.datasets import imdb
from keras.preprocessing import sequence

max_features = 10000  # number of words to consider as features
maxlen = 500  # cut off each review after this many words
batch_size = 32

print('Loading data...')
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)
print(len(input_train), 'train sequences')
print(len(input_test), 'test sequences')

print('Pad sequences (samples x time)')
input_train = sequence.pad_sequences(input_train, maxlen=maxlen)
input_test = sequence.pad_sequences(input_test, maxlen=maxlen)
print('input_train shape: ', input_train.shape)
print('input_test shape:', input_test.shape)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
input_train shape:  (25000, 500)
input_test shape: (25000, 500)
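Each review is now a padded sequence of 500 word indices. As a sanity check (a sketch, not part of the original run), you can decode a review back to words with imdb.get_word_index(); indices 0-2 are reserved for padding, start-of-sequence and unknown, so real word indices are offset by 3:

# Decode the first (padded) review back to text as a sanity check
word_index = imdb.get_word_index()
reverse_word_index = {value: key for (key, value) in word_index.items()}
decoded_review = ' '.join(reverse_word_index.get(i - 3, '?') for i in input_train[0])
print(decoded_review[-300:])  # tail of the review; the head may be all '?' padding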
# train a model with an Embedding layer and a SimpleRNN layer
from keras.layers import Dense

model = Sequential()
model.add(Embedding(max_features, 32))
model.add(SimpleRNN(32))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
history = model.fit(input_train, y_train, epochs=10, batch_size=batch_size, validation_split=0.2)
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
20000/20000 [==============================] - 4783s 239ms/step - loss: 0.5510 - acc: 0.7012 - val_loss: 0.4217 - val_acc: 0.8162
Epoch 2/10
 1080/20000 [>.............................] - ETA: 1:14:34 - loss: 0.3898 - acc: 0.8250


---------------------------------------------------------------------------
KeyboardInterrupt: the run was stopped manually during epoch 2
import matplotlib.pyplot as plt

acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc)+1)

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()
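Once a run completes without interruption, the model can also be scored on the held-out test data. A minimal sketch, assuming the fit above ran to completion so that model holds the trained weights:

# Evaluate the fitted model on the test set
test_loss, test_acc = model.evaluate(input_test, y_test, batch_size=batch_size)
print('Test loss:', test_loss)
print('Test accuracy:', test_acc)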