How do I do gradient clipping in Gluon? And how can I inspect a parameter's gradients?

Following the lesson-14 tutorial, I rewrote the model in Gluon form, but gradient clipping is not working: as soon as the learning rate gets large, the loss returns nan. I set up gradient clipping through the compression_params option of class mxnet.gluon.Trainer, as described in the docs:
```python
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': learning_rate},  # 'wd': 2e-4
                        kvstore='local',
                        compression_params={'type': '2bit', 'threshold': .1})
```
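Note that, per the docs, compression_params enables 2-bit gradient compression for kvstore communication; it is not gradient clipping. For clipping proper, one built-in route is the optimizer-level clip_gradient option, which clips each gradient element to [-theta, theta]; a minimal sketch, with 5.0 as an assumed threshold:

```python
# Sketch (not from the original post): element-wise clipping via the
# optimizer's clip_gradient option; 5.0 is an assumed threshold.
trainer = gluon.Trainer(model.collect_params(), 'sgd',
                        {'learning_rate': learning_rate,
                         'clip_gradient': 5.0})
```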
The model was modified as follows:
```python
model = mx.gluon.nn.Sequential()
with model.name_scope():
    model.add(mx.gluon.rnn.RNN(256, 1))
    model.add(mx.gluon.nn.Dense(1465, flatten=False))
model.initialize(init.Xavier())


def rnn_forward(X, H):
    X, H = model[0](X, H)
    return model[1](X), H


def predict(prefix, num_chars):
    prefix = prefix.lower()
    model[0].begin_state()
    output = [char_to_idx[prefix[0]]]
    # H = [nd.zeros((1, 1, 512), ctx), nd.zeros((1, 1, 512), ctx)]
    H = nd.zeros((1, 1, 256), ctx)
    for i in range(num_chars + len(prefix)):
        X = nd.array([output[-1]], ctx=ctx)
        Y, H = rnn_forward(get_inputs(X)[0].reshape((1, 1, 1465)), H)
        if i < len(prefix) - 1:
            next_input = char_to_idx[prefix[i + 1]]
        else:
            next_input = int(Y[0].argmax(axis=1).asscalar())
        output.append(next_input)
    return ''.join([idx_to_char[i] for i in output])


def get_inputs(data):
    return [nd.one_hot(X, vocab_size).expand_dims(1) for X in data]
```

```python
def train_and_predict():
    trainer = gluon.Trainer(model.collect_params(), 'sgd',
                            {'learning_rate': learning_rate},  # 'wd': 2e-4
                            kvstore='local',
                            compression_params={'type': '2bit', 'threshold': .1})
    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
    for e in range(1, epochs + 1):
        train_loss, num_examples = 0, 0
        model[0].begin_state()
        # H = [nd.zeros((1, batch_size, 512), ctx), nd.zeros((1, batch_size, 512), ctx)]
        H = nd.zeros((1, batch_size, 256), ctx)
        for data, label in data_iter_consecutive(corpus_indices, batch_size, seq_len, ctx):
            # With random batch sampling, the hidden state would have to be
            # re-initialized before every mini-batch:
            # H = nd.zeros((1, batch_size, 256), ctx)
            with autograd.record():
                inputs = get_inputs(data)
                ipt = nd.concat(*inputs, dim=1)
                outputs, H1 = rnn_forward(ipt.as_in_context(ctx), H)
                loss = softmax_cross_entropy(outputs, label.T)
            loss.backward()
            H = H1
            trainer.step(batch_size)
            train_loss += nd.sum(loss).asscalar()
            num_examples += loss.size

        if e % 20 == 0:
            print("Epoch %d. Perplexity %f" % (e, exp(train_loss / num_examples)))
            print(' - ', predict(seq3, 300), '\n')
```

I did get gradient clipping working by overriding gluon.Trainer.step, but I still have not figured out why compression_params had no effect:

```python
class gtrainer(gluon.Trainer):

    def step_grad_clipping(self, batch_size, theta, ignore_stale_grad=False):
        if not self._kv_initialized:
            self._init_kvstore()
        self._optimizer.rescale_grad = self._scale / batch_size
        for i, param in enumerate(self._params):
            if param.grad_req == 'null':
                continue
            if not ignore_stale_grad:
                for data in param.list_data():
                    if not data._fresh_grad:
                        raise UserWarning(
                            "Gradient of Parameter `%s` on context %s has not been updated "
                            "by backward since last `step`. This could mean a bug in your "
                            "model that makes it only use a subset of the Parameters (Blocks) "
                            "for this iteration. If you are intentionally only using a subset, "
                            "call step with ignore_stale_grad=True to suppress this "
                            "warning and skip updating of Parameters with stale gradient"
                            % (param.name, str(data.context)))
            if self._kvstore:
                self._kvstore.push(i, param.list_grad(), priority=-i)
                if self._update_on_kvstore:
                    self._kvstore.pull(i, param.list_data(), priority=-i)
                    continue
                else:
                    self._kvstore.pull(i, param.list_grad(), priority=-i)

            for upd, arr, grad in zip(self._updaters, param.list_data(),
                                      param.list_grad()):
                if not ignore_stale_grad or arr._fresh_grad:
                    # Clip by norm: scale the gradient down when its L2 norm
                    # exceeds theta, otherwise leave it untouched.
                    norm = grad.norm().asscalar()
                    coef = theta / norm if norm > theta else 1.0
                    upd(i, grad * coef, arr)
                    arr._fresh_grad = False
```
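
For completeness, a minimal sketch of how this subclass could be wired into the training loop above. The threshold 5.0 is assumed; kvstore=None is passed so that the local-updater branch, where the norm scaling actually runs, is taken (with update-on-kvstore the method `continue`s before reaching it):

```python
# Assumed usage of the subclass above; 5.0 is a made-up threshold.
# kvstore=None forces updates through self._updaters, so the
# norm-scaling branch in step_grad_clipping is actually reached.
trainer = gtrainer(model.collect_params(), 'sgd',
                   {'learning_rate': learning_rate}, kvstore=None)

# ... inside the batch loop, in place of trainer.step(batch_size):
loss.backward()
trainer.step_grad_clipping(batch_size, theta=5.0)
```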

https://discuss.gluon.ai/t/topic/2111/2?u=mli

To inspect parameters, look at the implementation of the Parameter class: it provides the list_grad() interface, and if you want to look at the weights you can use list_data().
So the complete workflow is to first fetch the parameters you want to monitor via the network.collect_params() interface, and then print their gradients. Sample code:
```python
params = self.network.collect_params("param_name")
for name, param in params.items():
    print(name, param.list_grad())
```
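
As for the clipping half of the question: the stock Gluon pattern is to clip between backward() and step() rather than inside the Trainer. A minimal sketch using gluon.utils.clip_global_norm, with 5.0 as an assumed threshold and model/ctx taken from the question above:

```python
from mxnet import gluon

# Sketch: global-norm clipping between backward() and step();
# 5.0 is an assumed threshold, model/ctx come from the question.
loss.backward()
grads = [p.grad(ctx) for p in model.collect_params().values()
         if p.grad_req != 'null']
gluon.utils.clip_global_norm(grads, 5.0)  # rescales the grads in place
trainer.step(batch_size)
```

Since trainer.step(batch_size) still divides the gradients by batch_size during the update, the threshold here applies to the unscaled, batch-summed gradients.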