http://zh.diveintodeeplearning.org/chapter_recurrent-neural-networks/deep-rnn.html
In the second paragraph, the sentence "the hidden state of the l-th hidden layer (l = 1, …, T) is …" contains an error: the "T" should be "L".
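For reference, the recurrence in that section, as I read it (with L hidden layers and T time steps), is

$$\boldsymbol{H}_t^{(1)} = \phi\big(\boldsymbol{X}_t \boldsymbol{W}_{xh}^{(1)} + \boldsymbol{H}_{t-1}^{(1)} \boldsymbol{W}_{hh}^{(1)} + \boldsymbol{b}_h^{(1)}\big)$$
$$\boldsymbol{H}_t^{(\ell)} = \phi\big(\boldsymbol{H}_t^{(\ell-1)} \boldsymbol{W}_{xh}^{(\ell)} + \boldsymbol{H}_{t-1}^{(\ell)} \boldsymbol{W}_{hh}^{(\ell)} + \boldsymbol{b}_h^{(\ell)}\big), \quad \ell = 2, \dots, L$$
$$\boldsymbol{O}_t = \boldsymbol{H}_t^{(L)} \boldsymbol{W}_{hq} + \boldsymbol{b}_q$$

so the layer index runs over 1, …, L, while T is the number of time steps.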
Deep RNNs seem like a trap; I tried them and the result came out worse.
It is a bit worse. At epoch 250 the perplexity became 1.38, versus about 1.17 for the original single-layer network.
Could anyone share their deep RNN code?
Not sure whether this is right, but it does run to the end:
def rnn(inputs, state, params):
    # inputs and outputs are both num_steps matrices of shape (batch_size, vocab_size)
    W_xh, W_hh, b_h, W_xh1, W_hh1, b_h1, W_hq, b_q = params
    H_01, = state
    H_02, = state
    outputs = []
    for X in inputs:
        H_01 = nd.tanh(nd.dot(X, W_xh) + nd.dot(H_01, W_hh) + b_h)
        H_02 = nd.tanh(nd.dot(H_01, W_xh1) + nd.dot(H_02, W_hh1) + b_h1)
        Y = nd.dot(H_02, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H_02,)
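Since the scratch section's get_params only creates five parameters, below is a minimal sketch of one that produces the eight parameters this rnn unpacks. It assumes vocab_size and ctx are already defined as in that section; num_inputs / num_hiddens / num_outputs and the hidden size 256 are names and values I chose myself:

from mxnet import nd

# assumed globals, in the spirit of the scratch section; adjust to your notebook
num_inputs, num_hiddens, num_outputs = vocab_size, 256, vocab_size

def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)

    # first hidden layer: reads the one-hot input
    W_xh = _one((num_inputs, num_hiddens))
    W_hh = _one((num_hiddens, num_hiddens))
    b_h = nd.zeros(num_hiddens, ctx=ctx)
    # second hidden layer: reads the first layer's hidden state
    W_xh1 = _one((num_hiddens, num_hiddens))
    W_hh1 = _one((num_hiddens, num_hiddens))
    b_h1 = nd.zeros(num_hiddens, ctx=ctx)
    # output layer
    W_hq = _one((num_hiddens, num_outputs))
    b_q = nd.zeros(num_outputs, ctx=ctx)

    params = [W_xh, W_hh, b_h, W_xh1, W_hh1, b_h1, W_hq, b_q]
    for param in params:
        param.attach_grad()
    return params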
I don't know why, but training an RNN with two hidden layers actually gave worse results. First, the perplexity drops more slowly; second, the predictions are quite unstable: the perplexity keeps falling during training, yet the predicted lyrics do not get steadily better and instead flip back and forth.
The training hyperparameters are the same as in the single-hidden-layer scratch version; I only increased num_epochs.
The parameter shapes, the state initialization (both hidden layers' states are initialized), and the internal computation of the two-hidden-layer RNN follow the formulas in the book.
Could someone point out whether my implementation has a problem?
The rnn function has a problem: the variable names H1_0 and H1 should be unified (and likewise for H2). Otherwise the hidden state cannot be passed from one time step to the next.
Here is my code; no attempt at encapsulation, just a straightforward implementation.
def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)

    w_xh1 = _one((num_input, num_hidden))
    w_hh1 = _one((num_hidden, num_hidden))
    b_h1 = nd.zeros(num_hidden, ctx=ctx)
    w_xh2 = _one((num_hidden, num_hidden))
    w_hh2 = _one((num_hidden, num_hidden))
    b_h2 = nd.zeros(num_hidden, ctx=ctx)
    w_hq = _one((num_hidden, num_output))
    b_q = nd.zeros(num_output, ctx=ctx)
    params = [
        w_xh1, w_hh1, b_h1,
        w_xh2, w_hh2, b_h2,
        w_hq, b_q,
    ]
    for p in params:
        p.attach_grad()
    return params
def init_rnn_state(_batch_size, _num_hidden, _ctx):
    # one zero state per hidden layer
    return (
        nd.zeros(shape=(_batch_size, _num_hidden), ctx=_ctx),
        nd.zeros(shape=(_batch_size, _num_hidden), ctx=_ctx),
    )
def deep_rnn(_input: nd.NDArray, state: tuple, params: list):
    w_xh1, w_hh1, b_h1, w_xh2, w_hh2, b_h2, w_hq, b_q = params
    hidden_layer_1, hidden_layer_2 = state
    output = []
    for x in _input:
        hidden_layer_1 = nd.tanh(nd.dot(x, w_xh1) + nd.dot(hidden_layer_1, w_hh1) + b_h1)
        hidden_layer_2 = nd.tanh(nd.dot(hidden_layer_1, w_xh2) + nd.dot(hidden_layer_2, w_hh2) + b_h2)
        y_hat = nd.dot(hidden_layer_2, w_hq) + b_q
        output.append(y_hat)
    return output, (hidden_layer_1, hidden_layer_2)
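To try the code above as posted, you only need the globals it relies on (num_input, num_hidden, num_output, ctx). The values below are placeholders I made up for a quick forward-pass shape check, not the book's actual settings:

import mxnet as mx
from mxnet import nd

# placeholder globals assumed by get_params / init_rnn_state above
ctx = mx.cpu()
num_input = num_output = 10      # stands in for vocab_size
num_hidden = 8

params = get_params()
state = init_rnn_state(2, num_hidden, ctx)       # batch_size = 2

X = nd.arange(6).reshape((2, 3))                 # batch_size = 2, num_steps = 3
inputs = [nd.one_hot(X[:, t], num_input) for t in range(3)]

outputs, (h1, h2) = deep_rnn(inputs, state, params)
print(len(outputs), outputs[0].shape, h1.shape, h2.shape)
# expect: 3 (2, 10) (2, 8) (2, 8)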
An N-layer RNN implementation; could someone check whether it is right?
def get_params():
    def _one(shape):
        return nd.random.normal(scale=0.01, shape=shape, ctx=ctx)

    # the depth of the deep RNN is set by num_layers
    w_xh, w_hh, b_h = {}, {}, {}
    params = []
    for layer in range(num_layers):
        if layer == 0:
            # first-layer parameters
            w_xh[layer] = _one((num_input, num_hidden))
            w_hh[layer] = _one((num_hidden, num_hidden))
            b_h[layer] = nd.zeros(num_hidden, ctx=ctx)
        else:
            # parameters of the other layers
            w_xh[layer] = _one((num_hidden, num_hidden))
            w_hh[layer] = _one((num_hidden, num_hidden))
            b_h[layer] = nd.zeros(num_hidden, ctx=ctx)
        params.append(w_xh[layer])
        params.append(w_hh[layer])
        params.append(b_h[layer])
    # output-layer parameters
    w_hq = _one((num_hidden, num_output))
    b_q = nd.zeros(num_output, ctx=ctx)
    params.append(w_hq)
    params.append(b_q)
    for p in params:
        p.attach_grad()
    return params
def init_rnn_state(_batch_size, _num_hidden, _ctx):
    # one zero state per layer
    return (nd.zeros(shape=(_batch_size, _num_hidden), ctx=_ctx),) * num_layers
def deep_rnn(_input: nd.NDArray, state: tuple, params: list):
    state = list(state)  # use a list so the per-layer states can be updated in place
    output = []
    for x in _input:
        for layer in range(num_layers):
            # params[layer*3], params[layer*3+1], params[layer*3+2] are this layer's w_xh, w_hh, b_h
            if layer == 0:
                # first layer: reads the one-hot input
                state[layer] = nd.tanh(nd.dot(x, params[layer * 3])
                                       + nd.dot(state[layer], params[layer * 3 + 1])
                                       + params[layer * 3 + 2])
            else:
                # from the second layer onwards: reads the hidden state of the layer below
                state[layer] = nd.tanh(nd.dot(state[layer - 1], params[layer * 3])
                                       + nd.dot(state[layer], params[layer * 3 + 1])
                                       + params[layer * 3 + 2])
        # layer now holds the index where the for loop stopped, i.e. the last layer L;
        # the last two entries of params are the output-layer parameters
        y_hat = nd.dot(state[layer], params[-2]) + params[-1]
        output.append(y_hat)
    return output, tuple(state)
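For what it's worth, here is a quick shape-only check of the N-layer version. The globals below, including num_layers, are placeholder values I picked; passing it only confirms the shapes line up, not that the recurrence matches the formula in the book:

import mxnet as mx
from mxnet import nd

# placeholder globals assumed by the N-layer get_params / init_rnn_state / deep_rnn
ctx = mx.cpu()
num_layers = 3
num_input = num_output = 10      # stands in for vocab_size
num_hidden = 8

params = get_params()
state = init_rnn_state(2, num_hidden, ctx)       # batch_size = 2

X = nd.arange(6).reshape((2, 3))                 # batch_size = 2, num_steps = 3
inputs = [nd.one_hot(X[:, t], num_input) for t in range(3)]

outputs, new_state = deep_rnn(inputs, state, params)
print(len(outputs), outputs[0].shape, len(new_state), new_state[0].shape)
# expect: 3 (2, 10) 3 (2, 8)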
This is clearly wrong; I suggest taking a careful look at the formulas.