Networks with Parallel Concatenations (GoogLeNet) - Discussion

Updated text:
http://zh.diveintodeeplearning.org.s3-website-us-west-2.amazonaws.com/chapter_convolutional-neural-networks/googlenet.html

I noticed that if I keep training, the loss becomes nan after a certain epoch. What is the reason for this?
lr: 0.07, batch size: 128


epoch 40, loss 0.1504, train acc 0.943, test acc 0.911, time 51.2 sec
epoch 41, loss 0.1423, train acc 0.945, test acc 0.915, time 59.1 sec
epoch 42, loss 0.1287, train acc 0.950, test acc 0.913, time 51.0 sec
epoch 43, loss 0.1189, train acc 0.954, test acc 0.905, time 51.7 sec
epoch 44, loss 0.1066, train acc 0.959, test acc 0.913, time 51.4 sec
epoch 45, loss 0.1052, train acc 0.960, test acc 0.917, time 52.1 sec
epoch 46, loss nan, train acc 0.553, test acc 0.100, time 51.5 sec
epoch 47, loss nan, train acc 0.100, test acc 0.100, time 50.5 sec
epoch 48, loss nan, train acc 0.100, test acc 0.100, time 50.5 sec
epoch 49, loss nan, train acc 0.100, test acc 0.100, time 50.4 sec
epoch 50, loss nan, train acc 0.100, test acc 0.100, time 51.1 sec

3 Likes

My guess is that this is a numerical stability problem in the loss computation. For example, if the predicted output y_hat is already very small during training, then passing it through softmax and computing log(1/y_hat) can run into numerical stability issues. Try adjusting the learning rate.
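For reference, a minimal sketch of the kind of underflow described above, with made-up logits rather than values from the run; Gluon's SoftmaxCrossEntropyLoss computes log-softmax in one fused step, so in practice a nan loss usually means the logits themselves blew up first, which a smaller learning rate (or the optimizer's clip_gradient option) helps prevent:

from mxnet import nd

logits = nd.array([[100.0, 0.0, -100.0]])  # hypothetical, extreme logits
probs = nd.softmax(logits)
print(probs)           # the last probability underflows to 0 in float32
print(nd.log(probs))   # log(0) = -inf, which would poison the loss and gradients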

1 Like

Why do I get an error when I convert GoogLeNet to symbolic programming (hybridize)? I have checked it several times and still cannot fix it. Could someone take a look?
import mxnet, time

class GoogLeNet_Inception(mxnet.gluon.nn.HybridBlock):
    def __init__(self, c1, c2, c3, c4, **kwargs):
        super().__init__(**kwargs)
        self.p1 = mxnet.gluon.nn.Conv2D(channels=c1, kernel_size=[1,1], strides=[1,1], activation='relu')
        self.p2_1 = mxnet.gluon.nn.Conv2D(channels=c2[0], kernel_size=[1,1], strides=[1,1], activation='relu')
        self.p2_2 = mxnet.gluon.nn.Conv2D(channels=c2[1], kernel_size=[3,3], strides=[1,1], padding=[1,1])
        self.p3_1 = mxnet.gluon.nn.Conv2D(channels=c3[0], kernel_size=[1,1], strides=[1,1], activation='relu')
        self.p3_2 = mxnet.gluon.nn.Conv2D(channels=c3[1], kernel_size=[5,5], strides=[1,1], padding=[2,2])
        self.p4_1 = mxnet.gluon.nn.MaxPool2D(pool_size=[3,3], strides=[1,1], padding=[1,1])
        self.p4_2 = mxnet.gluon.nn.Conv2D(channels=c4, kernel_size=[1,1], strides=[1,1])

    def hybrid_forward(self, F, x):
        p1 = self.p1(x)
        p2 = F.relu(self.p2_2(self.p2_1(x)))
        p3 = F.relu(self.p3_2(self.p3_1(x)))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        print('F name:', F)
        output = F.concat(p1, p2, p3, p4, dim=1)
        return output

GoogLeNet_block1 = mxnet.gluon.nn.HybridSequential(prefix='GoogLeNet_block1_')
GoogLeNet_block1.add(mxnet.gluon.nn.Conv2D(channels=64, kernel_size=[7,7], strides=[2,2], padding=[3,3], activation='relu'),
                     mxnet.gluon.nn.MaxPool2D(strides=[2,2], pool_size=[3,3], padding=[1,1]))

GoogLeNet_block2 = mxnet.gluon.nn.HybridSequential(prefix='GoogLeNet_block2_')
GoogLeNet_block2.add(mxnet.gluon.nn.Conv2D(channels=64, kernel_size=[1,1]),
                     mxnet.gluon.nn.Conv2D(channels=192, kernel_size=[3,3], padding=[1,1]),
                     mxnet.gluon.nn.MaxPool2D(strides=[2,2], pool_size=[3,3], padding=[1,1]))

GoogLeNet_block3 = mxnet.gluon.nn.HybridSequential(prefix='GoogLeNet_block3_')
GoogLeNet_block3.add(GoogLeNet_Inception(64, [96,128], [16,32], 32),
                     GoogLeNet_Inception(128, [128,192], [32,96], 64),
                     mxnet.gluon.nn.MaxPool2D(strides=[2,2], pool_size=[3,3], padding=[1,1]))

GoogLeNet_block4 = mxnet.gluon.nn.HybridSequential(prefix='GoogLeNet_block4_')
GoogLeNet_block4.add(GoogLeNet_Inception(192, [96,208], [16,48], 64),
                     GoogLeNet_Inception(160, [112,224], [24,64], 64),
                     GoogLeNet_Inception(128, [128,256], [24,64], 64),
                     GoogLeNet_Inception(112, [144,288], [32,64], 64),
                     GoogLeNet_Inception(256, [160,320], [32,128], 128),
                     mxnet.gluon.nn.MaxPool2D(strides=[2,2], pool_size=[3,3], padding=[1,1]))

GoogLeNet_block5 = mxnet.gluon.nn.HybridSequential(prefix='GoogLeNet_block5_')
GoogLeNet_block5.add(GoogLeNet_Inception(256, [160,320], [32,128], 128),
                     GoogLeNet_Inception(384, [192,384], [48,128], 128),
                     mxnet.gluon.nn.GlobalAvgPool2D())

GoogLeNet = mxnet.gluon.nn.HybridSequential(prefix='GoogLeNet_')
GoogLeNet.add(GoogLeNet_block1,
              GoogLeNet_block2,
              GoogLeNet_block3,
              GoogLeNet_block4,
              GoogLeNet_block5,
              mxnet.gluon.nn.Dense(10))

GoogLeNet.initialize(mxnet.initializer.Xavier(), force_reinit=True)
GoogLeNet.hybridize()
print(GoogLeNet(mxnet.ndarray.uniform(shape=[6,1,96,96])))
lr = 0.5; eqochs_nums = 5; batch_szie = 6; resize = 96
mnis_train = mxnet.gluon.data.vision.FashionMNIST(train=True)
mnis_test = mxnet.gluon.data.vision.FashionMNIST(train=False)
tranfrom = list()
tranfrom = tranfrom + [mxnet.gluon.data.vision.transforms.Resize(size=(resize, resize))]
tranfrom = tranfrom + [mxnet.gluon.data.vision.transforms.ToTensor()]
tranfrom = mxnet.gluon.data.vision.transforms.Compose(tranfrom)
train_data = mxnet.gluon.data.DataLoader(mnis_train.transform_first(tranfrom), batch_size=batch_szie, shuffle=True)
test_data = mxnet.gluon.data.DataLoader(mnis_test.transform_first(tranfrom), batch_size=batch_szie, shuffle=True)
LOSS = mxnet.gluon.loss.SoftmaxCrossEntropyLoss()
train_step = mxnet.gluon.Trainer(GoogLeNet.collect_params(), 'sgd', {'learning_rate': lr})
for i in range(eqochs_nums):
    nums, loss_train, acc_train, start = 0, 0, 0, time.clock()
    for x, y in train_data:
        with mxnet.autograd.record():
            y_hat = GoogLeNet(x)
            loss = LOSS(y_hat, y)
        loss.backward()
        train_step.step(batch_szie)
        nums = nums + 1
        loss_train = loss_train + loss.sum().asscalar()
        acc_train = acc_train + (mxnet.ndarray.argmax(data=y_hat, axis=1) == y.astype('float32')).mean().asscalar()
    print('eqochs:', i + 1, 'loss:', loss_train / nums, 'train_acc:', acc_train / nums, 'test_acc:', accuracy(net, test_data), 'time:', time.clock() - start)

Error message:
MXNetError                                Traceback (most recent call last)
in ()
----> 1 print(GoogLeNet(mxnet.ndarray.uniform(shape = [6,1,96,96])))
      2 # Function(GoogLeNet,lr = 0.5,eqochs_nums = 5,batch_szie = 64,resize = 96)
      3 lr = 0.5;eqochs_nums = 5;batch_szie = 6;resize = 96;
      4 mnis_train = mxnet.gluon.data.vision.FashionMNIST(train = True)
      5 mnis_test = mxnet.gluon.data.vision.FashionMNIST(train = False)

e:\python\lib\site-packages\mxnet\gluon\block.py in __call__(self, *args)
    540             hook(self, args)
    541
--> 542         out = self.forward(*args)
    543
    544         for hook in self._forward_hooks.values():

e:\python\lib\site-packages\mxnet\gluon\block.py in forward(self, x, *args)
    907         with x.context as ctx:
    908             if self._active:
--> 909                 return self._call_cached_op(x, *args)
    910
    911             try:

e:\python\lib\site-packages\mxnet\gluon\block.py in _call_cached_op(self, *args)
    813                 i._finish_deferred_init()
    814             cargs.append(i.data())
--> 815         out = self._cached_op(*cargs)
    816         if isinstance(out, NDArray):
    817             out = [out]

e:\python\lib\site-packages\mxnet\_ctypes\ndarray.py in __call__(self, *args, **kwargs)
    148             ctypes.byref(num_output),
    149             ctypes.byref(output_vars),
--> 150             ctypes.byref(out_stypes)))
    151
    152         if original_output is not None:

e:\python\lib\site-packages\mxnet\base.py in check_call(ret)
    249     """
    250     if ret != 0:
--> 251         raise MXNetError(py_str(_LIB.MXGetLastError()))
    252
    253

MXNetError: Error in operator conv114_fwd: Shape inconsistent, Provided = [64,3,7,7], inferred shape=(64,1,7,7)

Restart & Run All fixes it. It is probably because the network's weight shapes are fixed after the first initialization.
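For reference, a minimal sketch (with made-up shapes) of what this means: Gluon defers inferring a Conv2D's input channels until the first forward pass, and the weight shape is fixed from then on; force_reinit only re-initializes the values, whereas rebuilding the block (which Restart & Run All effectively does) lets the shape be inferred again:

from mxnet import nd, gluon

conv = gluon.nn.Conv2D(channels=64, kernel_size=7, strides=2, padding=3)
conv.initialize()
conv(nd.zeros((1, 3, 96, 96)))      # weight shape inferred as (64, 3, 7, 7)
print(conv.weight.shape)

try:
    conv(nd.zeros((1, 1, 96, 96)))  # a 1-channel input no longer matches the weight
except Exception as err:
    print(type(err).__name__)       # a shape-inconsistency error, like the one above

conv = gluon.nn.Conv2D(channels=64, kernel_size=7, strides=2, padding=3)
conv.initialize()
conv(nd.zeros((1, 1, 96, 96)))      # after rebuilding the block, the shape is re-inferred
print(conv.weight.shape)            # (64, 1, 7, 7)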

A question: does the so-called network with parallel concatenations have anything to do with parallel training?

Specifically, in this example, can a forward that does return nd.concat(x[1],x[2],x[3]) be hybridized? In that case it raises an IndexError.

My machine cannot finish training it, but the initialization runs without errors; you can use this for reference:



Hi, in your code p1, p2, p3, p4 are still executed serially. Is there an effective way to make them run in parallel?

MXNet's backend computation graph automatically parallelizes the computation.
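Roughly speaking, the Python calls only enqueue operators into MXNet's backend dependency engine, and operators that do not depend on each other can be executed concurrently by the engine. A tiny sketch of that asynchrony (not a GoogLeNet benchmark):

from mxnet import nd

a = nd.random.uniform(shape=(2000, 2000))
# b and c depend only on a, not on each other, so the engine is free to run them
# in parallel; the two Python calls return immediately and only enqueue the work.
b = nd.dot(a, a)
c = nd.dot(a, a.T)
nd.waitall()   # block until the engine has finished everything that was queued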

(image)
When x2 carries an index like this, it apparently cannot be hybridized; if x2 were index-free like x1, it would work. Is there a good workaround? Thanks.

The section summary contains the sentence: "the channel allocation ratios of the Inception blocks were obtained through a large number of experiments on the ImageNet dataset." How was this actually done?

How do you build a parallel network with multiple inputs, for example two AlexNet branches on two different input images whose extracted features are then fused with a fully connected layer?

Why does training stop making progress after two epochs? Under what circumstances does this happen? Could someone help analyze it?
epochs,batch_size,lr,ctx = 5,64,0.01,mx.gpu()

class Inception(nn.Block):
    def __init__(self,n1_1,n2_1,n2_3,n3_1,n3_5,n4_1,**kwargs):
        super(Inception,self).__init__(**kwargs)
        with self.name_scope():
            self.p1_conv1 = nn.Conv2D(n1_1,kernel_size=1,activation='relu')
            self.p2_conv1 = nn.Conv2D(n2_1,kernel_size=1,activation='relu')
            self.p2_conv3 = nn.Conv2D(n2_3,kernel_size=3,padding=1,activation='relu')
            self.p3_conv1 = nn.Conv2D(n3_1,kernel_size=1,activation='relu')
            self.p3_conv5 = nn.Conv2D(n3_5,kernel_size=5,padding=2,activation='relu')
            self.p4_pool3 = nn.MaxPool2D(pool_size=3,padding=1,strides=1) # strides must be 1 here; the default would downsample
            self.p4_conv1 = nn.Conv2D(n4_1,kernel_size=1,activation='relu')
    def forward(self,x):
        p1 = self.p1_conv1(x)        
        p2 = self.p2_conv3(self.p2_conv1(x))
        p3 = self.p3_conv5(self.p3_conv1(x))
        p4 = self.p4_conv1(self.p4_pool3(x))
        # print(p1.shape,p2.shape,p3.shape,p4.shape) # the channel counts differ across branches; batch_size, height and width all match
        return nd.concat(p1,p2,p3,p4,dim=1) # concatenate along dim=1, i.e. the output channel dimension

incp = Inception(64,96,128,16,32,32)
incp.initialize()
x = nd.random.uniform(shape=(32,3,64,64))
incp(x).shape
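For what it's worth, with these branch widths every branch keeps the 64x64 spatial size (1x1 convolutions, 3x3 with padding 1, 5x5 with padding 2, and a 3x3 max pool with padding 1 and stride 1), so only the channel dimension grows: 64 + 128 + 32 + 32 = 256 channels, and the shape printed here should be (32, 256, 64, 64).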

class GoogleNet(nn.Block):
    def __init__(self,num_classes,verbose=False,**kwargs):
        super(GoogleNet,self).__init__(**kwargs)
        self.verbose = verbose
        with self.name_scope():
            b1 = nn.Sequential()
            b1.add(
                nn.Conv2D(64,kernel_size=7,strides=2,padding=3,activation='relu'),
                nn.MaxPool2D(pool_size=3,strides=2)
            )
            b2 = nn.Sequential()
            b2.add(
                nn.Conv2D(64,kernel_size=1),
                nn.Conv2D(192,kernel_size=3,padding=1),
                nn.MaxPool2D(pool_size=3,strides=2)
            )
            b3 = nn.Sequential()
            b3.add(
                Inception(64,96,128,16,32,32),
                Inception(128,128,192,32,96,64),
                nn.MaxPool2D(pool_size=3,strides=2)
            )
            b4 = nn.Sequential()
            b4.add(
                Inception(192,96,208,16,48,64),
                Inception(160,112,224,24,64,64),
                Inception(128,128,256,24,64,64),
                Inception(112,144,288,32,64,64),
                Inception(256,160,320,32,128,128),
                nn.MaxPool2D(pool_size=3,strides=2)
            )
            b5 = nn.Sequential()
            b5.add(
                
                Inception(256,160,320,32,128,128),
                Inception(384,192,384,48,128,128),
                nn.AvgPool2D(pool_size=2)
            )
            b6 = nn.Sequential()
            b6.add(
                nn.Flatten(),
                nn.Dense(num_classes)
            )
            self.net = nn.Sequential()
            self.net.add(b1,b2,b3,b4,b5,b6)
    def forward(self,x):
        out = x
        for i,b in enumerate(self.net):
            out = b(out)
            if self.verbose:
                print('block %s output : %s'%(i+1,out.shape))
        return out

ctx = mx.gpu()
net = GoogleNet(10,verbose=True)
net.initialize(ctx=ctx)
x = nd.random.uniform(shape=(4,3,96,96),ctx=ctx)
y = net(x)
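If I am reading the default pooling conventions correctly, the verbose output for this 96x96 input should show roughly: block 1 (4, 64, 23, 23), block 2 (4, 192, 11, 11), block 3 (4, 480, 5, 5), block 4 (4, 832, 2, 2), block 5 (4, 1024, 1, 1), block 6 (4, 10).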

def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join(
        '~', '.mxnet', 'datasets', 'fashion-mnist')):
    """Download the fashion mnist dataset and then load into memory."""
    root = os.path.expanduser(root)
    # https://mxnet.apache.org/api/python/docs/api/gluon/data/vision/transforms/index.html#module-mxnet.gluon.data.vision.transforms
    transformer = []
    if resize:
        transformer += [gdata.vision.transforms.Resize(resize)]
    transformer += [gdata.vision.transforms.ToTensor()]
    # Compose takes a list of transforms, so transformer starts out as a Python list;
    # each new transform is appended to the list, and at the end they are composed into a single transform
    transformer = gdata.vision.transforms.Compose(transformer)
    mnist_train = gdata.vision.FashionMNIST(root=root, train=True)
    mnist_test = gdata.vision.FashionMNIST(root=root, train=False)
    # multiprocessing setting for data loading
    num_workers = 0 if sys.platform.startswith('win32') else 4
    # https://mxnet.apache.org/api/python/docs/api/gluon/data/index.html#mxnet.gluon.data.Dataset.transform_first
    # transform_first is used so that only the features are transformed and the labels stay unchanged; see the source code
    train_iter = gdata.DataLoader(mnist_train.transform_first(transformer),
                                  batch_size, shuffle=True,
                                  num_workers=num_workers)
    test_iter = gdata.DataLoader(mnist_test.transform_first(transformer),
                                 batch_size, shuffle=False,
                                 num_workers=num_workers)
    return train_iter, test_iter

epochs,batch_size,lr,ctx = 5,64,0.01,mx.gpu()
net = GoogleNet(10)
net.initialize(ctx=ctx,force_reinit=True,init=init.Xavier())
train_iter,test_iter = load_data_fashion_mnist(batch_size,resize=96)
trainer = mx.gluon.Trainer(net.collect_params(),'sgd',{'learning_rate':lr})

def evaluate_accuracy(data_iter, net, ctx=[mx.cpu()]):
    """Evaluate accuracy of a model on the given data set."""
    if isinstance(ctx, mx.Context):
        ctx = [ctx]
    acc_sum, n = nd.array([0]), 0
    for batch in data_iter:
        features, labels, _ = _get_batch(batch, ctx)
        for X, y in zip(features, labels):
            y = y.astype('float32')
            acc_sum += (net(X).argmax(axis=1) == y).sum().copyto(mx.cpu())
            n += y.size
        acc_sum.wait_to_read()
    return acc_sum.asscalar() / n

def _get_batch(batch, ctx):
    """Return features and labels on ctx."""
    features, labels = batch
    if labels.dtype != features.dtype:
        labels = labels.astype(features.dtype)
    return (gutils.split_and_load(features, ctx),
            gutils.split_and_load(labels, ctx), features.shape[0])


def train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx,
              num_epochs):
    """Train and evaluate a model with CPU or GPU."""
    print('training on', ctx)
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X, y = X.as_in_context(ctx), y.as_in_context(ctx)
            with ag.record():
                y_hat = net(X)
                l = loss(y_hat, y).sum()
            l.backward()
            trainer.step(batch_size)
            y = y.astype('float32')
            train_l_sum += l.asscalar()
            train_acc_sum += (y_hat.argmax(axis=1) == y).sum().asscalar()
            n += y.size
        test_acc = evaluate_accuracy(test_iter, net, ctx)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, '
              'time %.1f sec'
              % (epoch + 1, train_l_sum / n, train_acc_sum / n, test_acc,
                 time.time() - start))

train_ch5(net, train_iter, test_iter, batch_size, trainer, ctx, epochs) 

It stops making progress after two epochs...
Training results:
training on gpu(0)
epoch 1, loss 1.9426, train acc 0.317, test acc 0.722, time 52.4 sec
epoch 2, loss 0.6962, train acc 0.735, test acc 0.801, time 49.6 sec

Do these two parallel branches of the network share weight parameters?

p1, p2, p3, p4 would otherwise just be connected in series, right? If you want them in parallel, you have to concat them, but first you need to make sure the shapes of p1, p2, p3, p4 agree in height and width, and then concatenate along dim=1. The "parallel concatenation" that GoogLeNet refers to here is the Inception block, which already implements this; when you later assemble the larger network, you just repeat this Inception block, and everything else is much the same as before.
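A tiny illustration of the shape constraint mentioned above, with made-up shapes: concatenating along dim=1 only requires the non-channel dimensions to match, and the channel counts simply add up.

from mxnet import nd

p1 = nd.zeros((4, 64, 24, 24))
p2 = nd.zeros((4, 128, 24, 24))
# same batch size, height and width, so only the channel dimension grows
print(nd.concat(p1, p2, dim=1).shape)   # (4, 192, 24, 24)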

p1, p2, p3, p4 are four branches that are parallel in structure, but when the forward pass is written in Python, it computes them serially (p1, p2, p3, p4 in order). This is actually rather time-consuming.

Oh, sorry, I misread. That is indeed the Inception module, which is parallel in structure, but the forward pass he wrote does not run in parallel. I am not too familiar with the hybrid_forward he wrote, or with the claim that MXNet parallelizes these computations automatically. The code below is how it was written in the video; you can use it for reference.

class Inception(nn.Block):
    def __init__(self,n1_1,n2_1,n2_3,n3_1,n3_5,n4_1,**kwargs):
        super(Inception,self).__init__(**kwargs)
        with self.name_scope():
            self.p1_conv1 = nn.Conv2D(n1_1,kernel_size=1,activation='relu')
            self.p2_conv1 = nn.Conv2D(n2_1,kernel_size=1,activation='relu')
            self.p2_conv3 = nn.Conv2D(n2_3,kernel_size=3,padding=1,activation='relu')
            self.p3_conv1 = nn.Conv2D(n3_1,kernel_size=1,activation='relu')
            self.p3_conv5 = nn.Conv2D(n3_5,kernel_size=5,padding=2,activation='relu')
            self.p4_pool3 = nn.MaxPool2D(pool_size=3,padding=1,strides=1) # strides must be 1 here; the default would downsample
            self.p4_conv1 = nn.Conv2D(n4_1,kernel_size=1,activation='relu')
    def forward(self,x):
        p1 = self.p1_conv1(x)        
        p2 = self.p2_conv3(self.p2_conv1(x))
        p3 = self.p3_conv5(self.p3_conv1(x))
        p4 = self.p4_conv1(self.p4_pool3(x))
        # print(p1.shape,p2.shape,p3.shape,p4.shape) # the channel counts differ across branches; batch_size, height and width all match
        return nd.concat(p1,p2,p3,p4,dim=1) # concatenate along dim=1, i.e. the output channel dimension
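For comparison, here is a sketch of the same block written as a HybridBlock (same branch widths assumed; the name InceptionHybrid is mine); with hybrid_forward and F.concat the block can be hybridized into a symbolic graph, which is the variant the earlier replies were discussing:

from mxnet.gluon import nn

class InceptionHybrid(nn.HybridBlock):
    def __init__(self, n1_1, n2_1, n2_3, n3_1, n3_5, n4_1, **kwargs):
        super(InceptionHybrid, self).__init__(**kwargs)
        with self.name_scope():
            self.p1_conv1 = nn.Conv2D(n1_1, kernel_size=1, activation='relu')
            self.p2_conv1 = nn.Conv2D(n2_1, kernel_size=1, activation='relu')
            self.p2_conv3 = nn.Conv2D(n2_3, kernel_size=3, padding=1, activation='relu')
            self.p3_conv1 = nn.Conv2D(n3_1, kernel_size=1, activation='relu')
            self.p3_conv5 = nn.Conv2D(n3_5, kernel_size=5, padding=2, activation='relu')
            self.p4_pool3 = nn.MaxPool2D(pool_size=3, padding=1, strides=1)
            self.p4_conv1 = nn.Conv2D(n4_1, kernel_size=1, activation='relu')

    def hybrid_forward(self, F, x):
        # F is mxnet.ndarray in imperative mode and mxnet.symbol after hybridize()
        p1 = self.p1_conv1(x)
        p2 = self.p2_conv3(self.p2_conv1(x))
        p3 = self.p3_conv5(self.p3_conv1(x))
        p4 = self.p4_conv1(self.p4_pool3(x))
        return F.concat(p1, p2, p3, p4, dim=1)

Usage is the same as the Block version, plus a call to hybridize(), e.g. incp = InceptionHybrid(64, 96, 128, 16, 32, 32); incp.initialize(); incp.hybridize(); incp(x).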

Is there an API to free GPU memory? After every training run I have to close the Jupyter notebook and reopen it, otherwise the GPU memory (GT 940M) runs out.

training on gpu(0)
epoch 1, loss 2.2714, train acc 0.153, test acc 0.354, time 67.4 sec
epoch 2, loss 1.1149, train acc 0.547, test acc 0.733, time 42.3 sec
epoch 3, loss 0.6119, train acc 0.766, test acc 0.818, time 42.1 sec
epoch 4, loss 0.4543, train acc 0.829, test acc 0.853, time 42.2 sec
epoch 5, loss 0.3838, train acc 0.854, test acc 0.866, time 42.1 sec
epoch 6, loss 0.3464, train acc 0.869, test acc 0.850, time 42.5 sec
epoch 7, loss 0.3247, train acc 0.877, test acc 0.873, time 42.5 sec
epoch 8, loss 0.2946, train acc 0.888, test acc 0.892, time 42.2 sec
epoch 9, loss 0.2782, train acc 0.894, test acc 0.888, time 42.3 sec
epoch 10, loss 0.2611, train acc 0.899, test acc 0.888, time 42.3 sec
epoch 11, loss 0.2497, train acc 0.905, test acc 0.898, time 42.2 sec
epoch 12, loss 0.2393, train acc 0.909, test acc 0.900, time 42.8 sec
epoch 13, loss 0.2247, train acc 0.916, test acc 0.910, time 42.2 sec
epoch 14, loss 0.2112, train acc 0.920, test acc 0.902, time 42.4 sec
epoch 15, loss 0.2017, train acc 0.924, test acc 0.914, time 42.3 sec
epoch 16, loss 0.1942, train acc 0.925, test acc 0.910, time 42.3 sec
epoch 17, loss 0.3769, train acc 0.863, test acc 0.820, time 42.4 sec
epoch 18, loss 0.2344, train acc 0.911, test acc 0.910, time 42.3 sec
epoch 19, loss 0.1878, train acc 0.930, test acc 0.908, time 42.2 sec
epoch 20, loss 0.1700, train acc 0.936, test acc 0.913, time 42.1 sec
epoch 21, loss 0.1597, train acc 0.939, test acc 0.908, time 42.2 sec
epoch 22, loss 0.1501, train acc 0.943, test acc 0.916, time 42.0 sec
epoch 23, loss 0.1431, train acc 0.945, test acc 0.913, time 42.4 sec
epoch 24, loss 0.1318, train acc 0.949, test acc 0.918, time 42.7 sec
epoch 25, loss 0.1242, train acc 0.952, test acc 0.917, time 42.3 sec
epoch 26, loss 0.1177, train acc 0.955, test acc 0.914, time 42.3 sec
epoch 27, loss 0.2081, train acc 0.927, test acc 0.894, time 42.0 sec
epoch 28, loss 0.1165, train acc 0.955, test acc 0.916, time 42.1 sec
epoch 29, loss 0.1074, train acc 0.959, test acc 0.914, time 42.9 sec
epoch 30, loss 0.0970, train acc 0.963, test acc 0.919, time 42.2 sec
epoch 31, loss 0.1289, train acc 0.954, test acc 0.921, time 42.7 sec
epoch 32, loss 0.0920, train acc 0.965, test acc 0.916, time 42.4 sec
epoch 33, loss 0.0806, train acc 0.969, test acc 0.919, time 42.1 sec
epoch 34, loss nan, train acc 0.519, test acc 0.100, time 42.5 sec
epoch 35, loss nan, train acc 0.100, test acc 0.100, time 42.4 sec
epoch 36, loss nan, train acc 0.100, test acc 0.100, time 42.8 sec
epoch 37, loss nan, train acc 0.100, test acc 0.100, time 45.7 sec
epoch 38, loss nan, train acc 0.100, test acc 0.100, time 48.5 sec

After setting the number of epochs to 50 and running, I found that the situation above appears at epoch 34. What is going on here?

1 Like