之前介绍了如何将YOLO11中Backbone网络替换为ShuffleNet,而今天尝试将其替换为MobileNet V3。
首先是代码的实现:
import torch
from torch import nn
class h_sigmoid(nn.Module):
    """Hard-sigmoid activation, computed as ``ReLU6(x + 3) / 6``.

    Args:
        inplace: Forwarded to the internal ``nn.ReLU6``. The ReLU6 is applied
            to the temporary ``x + 3``, so the caller's tensor is not
            overwritten.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.relu = nn.ReLU6(inplace=inplace)

    def forward(self, x):
        """Return the element-wise hard sigmoid of ``x``."""
        shifted = x + 3
        clipped = self.relu(shifted)
        return clipped / 6
class h_swish(nn.Module):
    """Hard-swish activation: the input multiplied by its hard sigmoid.

    Args:
        inplace: Forwarded to the internal ``h_sigmoid`` gate.
    """

    def __init__(self, inplace=True):
        super().__init__()
        self.sigmoid = h_sigmoid(inplace=inplace)

    def forward(self, x):
        """Return ``x * h_sigmoid(x)``."""
        gate = self.sigmoid(x)
        return x * gate
class SELayer(nn.Module):
    """Squeeze-and-Excitation channel gating with a hard-sigmoid gate.

    Args:
        channel: Number of channels of the input feature map.
        reduction: Divisor applied to ``channel`` to size the bottleneck of
            the two-layer fully connected excitation network.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        squeezed = channel // reduction
        # Squeeze: global average pooling down to one value per channel.
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # Excitation: FC -> ReLU -> FC -> hard sigmoid.
        self.fc = nn.Sequential(
            nn.Linear(channel, squeezed),
            nn.ReLU(inplace=True),
            nn.Linear(squeezed, channel),
            h_sigmoid(),
        )

    def forward(self, x):
        """Scale every channel of the 4-D input ``x`` by its learned weight."""
        batch, channels, _, _ = x.size()
        pooled = self.avg_pool(x).view(batch, channels)
        # One learned weight per channel, reshaped so it broadcasts over H and W.
        weights = self.fc(pooled).view(batch, channels, 1, 1)
        return x * weights
class conv_bn_hswish(nn.Module):
    """3x3 convolution (padding 1, no bias) -> BatchNorm2d -> h_swish.

    Module form of the functional helper::

        def conv_3x3_bn(inp, oup, stride):
            return nn.Sequential(
                nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
                nn.BatchNorm2d(oup),
                h_swish()
            )

    Args:
        c1: Number of input channels.
        c2: Number of output channels.
        stride: Stride of the convolution.
    """

    def __init__(self, c1, c2, stride):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, 3, stride, 1, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = h_swish()

    def forward(self, x):
        """Apply convolution, batch normalisation and activation in turn."""
        features = self.conv(x)
        normalised = self.bn(features)
        return self.act(normalised)

    def fuseforward(self, x):
        """Apply convolution and activation only, skipping ``self.bn``.

        NOTE(review): presumably meant for use once the BatchNorm has been
        folded into the convolution weights -- confirm against the caller.
        """
        features = self.conv(x)
        return self.act(features)
class MobileNetV3(nn.Module):
    """MobileNetV3 bottleneck (bneck) block.

    Layer order inside ``self.conv``:

    * ``inp == hidden_dim`` (no expansion):
      depthwise conv -> BN -> activation -> SE -> 1x1 linear conv -> BN
    * otherwise (expansion first):
      1x1 conv -> BN -> activation -> depthwise conv -> BN -> SE ->
      activation -> 1x1 linear conv -> BN

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        hidden_dim: Number of channels used by the depthwise convolution.
        kernel_size: Kernel size of the depthwise convolution.
        stride: Stride of the depthwise convolution; must be 1 or 2.
        use_se: Insert an ``SELayer`` when true, an empty ``nn.Sequential``
            placeholder otherwise.
        use_hs: Use ``h_swish`` when true, ``nn.ReLU`` otherwise.
    """

    def __init__(self, inp, oup, hidden_dim, kernel_size, stride, use_se, use_hs):
        super().__init__()
        assert stride in [1, 2]

        # The residual shortcut needs matching spatial size and channel count.
        self.identity = stride == 1 and inp == oup

        def make_act():
            return h_swish() if use_hs else nn.ReLU(inplace=True)

        def make_se():
            return SELayer(hidden_dim) if use_se else nn.Sequential()

        needs_expansion = inp != hidden_dim
        same_padding = (kernel_size - 1) // 2

        # Layers are created strictly in their final order so that the
        # positions inside the nn.Sequential stay the same as before.
        stages = []
        if needs_expansion:
            # pw: expand to hidden_dim channels
            stages += [
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                make_act(),
            ]
        # dw
        stages += [
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, same_padding,
                      groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
        ]
        if needs_expansion:
            # With expansion, Squeeze-and-Excite comes before the activation.
            stages += [make_se(), make_act()]
        else:
            # Without expansion, the activation comes before Squeeze-and-Excite.
            stages += [make_act(), make_se()]
        # pw-linear
        stages += [
            nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
            nn.BatchNorm2d(oup),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run the block, adding the input back when the shortcut is enabled."""
        out = self.conv(x)
        return x + out if self.identity else out
关于该网络的介绍这里不赘述,可以参考文章最后的链接。这里主要关注网络的结构图:

可以看到,网络输入先经过一个3x3卷积,之后才进入bneck模块。该卷积层的输出通道为16,不使用SE模块,激活函数NL为h-swish,stride为2。
根据上述网络结构,可以定义出YOLO的yaml中的内容如下,这里新建1个配置文件yolo11n-mobilenetV3-small.yaml:
# YOLO11n with a MobileNetV3-small backbone.
# MobileNetV3 args: [out_channels, hidden_dim (exp size), kernel_size, stride, use_se, use_hs].
# NOTE(review): the input channel count is presumably prepended by parse_model -- confirm.
# Feature-map sizes in the comments assume a 640*640 input.
backbone:
  # [from, repeats, module, args]
  - [-1, 1, conv_bn_hswish, [16, 2]] # 0-P1/2 320*320
  - [-1, 1, MobileNetV3, [16, 16, 3, 2, True, False]] # 1-P2/4 160*160
  - [-1, 1, MobileNetV3, [24, 72, 3, 2, False, False]] # 2-P3/8 80*80
  - [-1, 1, MobileNetV3, [24, 88, 3, 1, False, False]] # 3-P3/8 80*80
  - [-1, 1, MobileNetV3, [40, 96, 5, 2, True, True]] # 4-P4/16 40*40
  - [-1, 1, MobileNetV3, [40, 240, 5, 1, True, True]] # 5-P4/16 40*40
  - [-1, 1, MobileNetV3, [40, 240, 5, 1, True, True]] # 6-P4/16 40*40
  - [-1, 1, MobileNetV3, [48, 120, 5, 1, True, True]] # 7-P4/16 40*40
  - [-1, 1, MobileNetV3, [48, 144, 5, 1, True, True]] # 8-P4/16 40*40
  - [-1, 1, MobileNetV3, [96, 288, 5, 2, True, True]] # 9-P5/32 20*20
  - [-1, 1, MobileNetV3, [96, 576, 5, 1, True, True]] # 10-P5/32 20*20
  - [-1, 1, MobileNetV3, [96, 576, 5, 1, True, True]] # 11-P5/32 20*20

# YOLO11n head
# The backbone above has 12 layers (0-11), so head layers are numbered from 12.
head:
  - [-1, 1, nn.Upsample, [None, 2, "nearest"]] # 12 40*40
  # NOTE(review): layer 8 is the deepest 40*40 backbone layer; 6 is kept as-is -- confirm intent.
  - [[-1, 6], 1, Concat, [1]] # 13 cat backbone P4
  - [-1, 2, C3k2, [512, False]] # 14

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]] # 15 80*80
  - [[-1, 3], 1, Concat, [1]] # 16 cat backbone P3
  - [-1, 2, C3k2, [256, False]] # 17 (P3/8-small)

  - [-1, 1, Conv, [256, 3, 2]] # 18
  - [[-1, 14], 1, Concat, [1]] # 19 cat head P4 (layer 14, not the Concat at 13)
  - [-1, 2, C3k2, [512, False]] # 20 (P4/16-medium)

  - [-1, 1, Conv, [512, 3, 2]] # 21
  - [[-1, 11], 1, Concat, [1]] # 22 cat head P5 (layer 11 is the last backbone layer)
  - [-1, 2, C3k2, [1024, True]] # 23 (P5/32-large)

  - [[17, 20, 23], 1, Detect, [nc]] # 24 Detect(P3, P4, P5)
将其与之前的网络结构图对照:MobileNetV3的args依次为[输出通道数, hidden_dim, 卷积核大小, stride, 是否使用SE, 是否使用h-swish],其中结构图中的exp size对应MobileNetV3类中的hidden_dim。
最后就是将上述模块的代码在tasks.py中导入,并在parse_model的base_modules中注册,这里就不再详细赘述其过程了:
from ultralytics.nn.modules import (
AIFI,
C1,
...
conv_bn_hswish,
MobileNetV3
)
def parse_model(d, ch, verbose=True):
...
base_modules = frozenset(
{
Classify,
Conv,
...
conv_bn_hswish,
MobileNetV3
}
)
最终其计算量为4.0 GFLOPs,而原来YOLO11n的计算量为6.6 GFLOPs,计算量减少了约39%。
参考文章:
https://zhuanlan.zhihu.com/p/365119654 https://www.cnblogs.com/ZOMI/articles/18561132 https://www.bilibili.com/video/BV17QhAzPE9V
如果喜欢这篇文章或对您有帮助,可以:[☕] 请我喝杯咖啡 | [💓] 小额赞助

