
Custom Operators: Reproducing 2D Convolution on the CPU

Implementing PyTorch's Convolution (CPU)

Reference: Zhihu

Project Setup

Project repository: GitHub

Windows Environment

Install beforehand: cl.exe (see: configuring cl)

Building

Run python setup.py develop to build and install in one step. If the build finishes without compiler errors, the custom convolution is ready to use.

Contents of setup.py (a minimal sketch follows the list below):

  • cpp_extension.CppExtension: the build-related settings
    • source file paths
    • header-file search paths
  • name: the package name
  • cmdclass: specifies that the C++ extension module is built with the cpp_extension.BuildExtension class
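A minimal sketch of such a setup.py; the source and include paths are read off the directory tree below and may differ slightly from the actual repository:

from setuptools import setup
from torch.utils import cpp_extension

setup(
    name='panoflow',  # package name, assumed from the egg-info directory below
    ext_modules=[
        cpp_extension.CppExtension(
            name='my_ops',                             # import name of the built extension
            sources=['panoflow/core/op/my_conv.cpp'],  # C++ source files
            include_dirs=['panoflow/core/op'],         # header-file search paths
        )
    ],
    # build the C++ extension module with BuildExtension
    cmdclass={'build_ext': cpp_extension.BuildExtension},
)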

The directory tree after building:

E:.
│  my_ops.cp311-win_amd64.pyd
│  setup.py
│  test.py
│
├─.vs
│  │  slnx.sqlite
│  │  VSWorkspaceState.json
│  │
│  └─panoflow
│     ├─FileContentIndex
│     │      4ea7c304-f0a5-4921-b532-67a89eef2f1e.vsidx
│     │      85545400-021b-45eb-a926-17f428dfafc7.vsidx
│     │      c1cd6aa0-353b-4942-b0b3-62d2e7989a6a.vsidx
│     │      cffc09b9-7625-4705-972e-360fc2b94695.vsidx
│     │      d975a3f7-8477-4464-9cbb-bafa4d00c5b8.vsidx
│     │
│     └─v17
├─.vscode
│      settings.json
│
├─build
│  ├─lib.win-amd64-cpython-311
│  │      my_ops.cp311-win_amd64.pyd
│  │
│  └─temp.win-amd64-cpython-311
│     └─Release
│        └─panoflow
│           └─core
│              └─op
│                     my_conv.obj
│                     my_ops.cp311-win_amd64.exp
│                     my_ops.cp311-win_amd64.lib
│
├─panoflow
│  └─core
│     └─op
│        │  common_cuda_helper.hpp
│        │  my_conv.cpp
│        │  my_conv.py
│        │  pytorch_cpp_helper.hpp
│        │  pytorch_cuda_helper.hpp
│        │
│        └─__pycache__
│               my_conv.cpython-311.pyc
│
└─panoflow.egg-info
       dependency_links.txt
       PKG-INFO
       SOURCES.txt
       top_level.txt

Testing

Run python test.py

The test functions test_one and test_two check the single-channel and multi-channel cases respectively.

If the script produces no output (i.e. no error messages), the convolution implementation works. Below, the snippet is completed with the imports and module-level settings it relies on; the import path and the values of device_name, open_bias, inc, outc and img_shaspe are assumptions.

import torch
import torch.nn as nn

# Import path assumed from the directory tree above:
from panoflow.core.op.my_conv import MyConv2d

# Module-level test settings (assumed values):
device_name = 'cpu'
open_bias = True        # give the reference nn.Conv2d a bias
inc, outc = 3, 8        # channel counts for the multi-channel test
img_shaspe = (16, 16)   # spatial size of the random test input (name as in the source)


def test_one():
    ts = torch.ones([1, 1, 3, 3]).to(device_name)
    layer = nn.Conv2d(1, 1, 3, 1, 1, bias=open_bias).to(device_name)
    gt = layer(ts)
    my_layer = MyConv2d(1, 1, 3, 1, 1).to(device_name)
    my_layer.load_state_dict(layer.state_dict(), strict=False)
    res = my_layer(ts)
    res = res.to('cpu')
    gt = gt.to('cpu')
    assert torch.allclose(res, gt, 1e-3, 1e-5)


def test_two():
    ts = torch.rand([1, inc, *img_shaspe]).to(device_name)
    layer = nn.Conv2d(inc, outc, 3, 1, 1, bias=open_bias).to(device_name)
    gt = layer(ts)
    my_layer = MyConv2d(inc, outc, 3, 1, 1).to(device_name)
    my_layer.load_state_dict(layer.state_dict(), strict=False)
    res = my_layer(ts)
    res = res.to('cpu')
    gt = gt.to('cpu')
    assert torch.allclose(res, gt, 1e-3, 1e-5)


if __name__ == '__main__':
    test_one()
    test_two()

The CPU Implementation

  • Header files: pytorch_cpp_helper.hpp, pytorch_cuda_helper.hpp, common_cuda_helper.hpp

C++ Implementation

  • A C++ addition operator first, as a warm-up

The operator's implementation function and its binding to the C++ interface:

#include <torch/torch.h>

torch::Tensor my_add(torch::Tensor t1, torch::Tensor t2)
{
    return t1 + t2;
}

TORCH_LIBRARY(my_ops, m)
{
    m.def("my_add", my_add);
}
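Once the extension is imported, an operator registered with TORCH_LIBRARY is reachable through the torch.ops namespace. A quick check, assuming the module built above imports as my_ops:

import torch
import my_ops  # importing the extension registers the my_ops library

a = torch.ones(2, 2)
b = torch.full((2, 2), 2.0)
print(torch.ops.my_ops.my_add(a, b))  # a 2x2 tensor of 3s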
  • The convolution implementation, my_conv.cpp

    • my_conv_forward is the convolution's main entry point:
    void my_conv_forward(Tensor input, Tensor weight, Tensor bias,
                         Tensor output, Tensor columns, int kW,
                         int kH, int dW, int dH, int padW, int padH,
                         int dilationW, int dilationH, int group,
                         int im2col_step)
    • It first performs im2col, then a matrix multiplication (a Python sketch of this structure follows the loop below)

      • Loop over the batch in chunks of im2col_step, applying im2col to each chunk with the im2col function matching the device.
      • Reshape the resulting column matrix for the subsequent matrix multiplication.
      • Update the output buffer with an in-place matrix multiplication (addmm_).

    for (int elt = 0; elt < batchSize / im2col_step; elt++)
    {
        if (isCuda)
        {
            my_conv_im2col_cuda(input[elt], nInputPlane, inputHeight,
                                inputWidth, kH, kW, padH, padW, dH, dW,
                                dilationH, dilationW, im2col_step, columns);
        }
        else
        {
            my_conv_im2col_cpu(input[elt], nInputPlane, inputHeight,
                               inputWidth, kH, kW, padH, padW, dH, dW,
                               dilationH, dilationW, im2col_step, columns);
        }

        columns = columns.view({group, columns.size(0) / group, columns.size(1)});
        weight = weight.view({group, weight.size(0) / group, weight.size(1),
                              weight.size(2), weight.size(3)});

        for (int g = 0; g < group; g++)
        {
            output_buffer[elt][g] = output_buffer[elt][g]
                                        .flatten(1)
                                        .addmm_(weight[g].flatten(1), columns[g])
                                        .view_as(output_buffer[elt][g]);
        }
        columns =
            columns.view({columns.size(0) * columns.size(1), columns.size(2)});
        weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
                              weight.size(3), weight.size(4)});
    }
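    The same im2col-plus-matmul structure can be sketched in Python with torch.nn.functional.unfold, which is PyTorch's built-in im2col. This is an illustration of the algorithm (shapes chosen arbitrarily), not the extension's actual code:

    import torch
    import torch.nn.functional as F

    x = torch.rand(1, 3, 8, 8)   # N, C, H, W
    w = torch.rand(4, 3, 3, 3)   # outC, inC, kH, kW

    cols = F.unfold(x, kernel_size=3, padding=1)  # im2col: (N, C*kH*kW, H_out*W_out)
    out = w.flatten(1) @ cols                     # matmul: (outC, C*kH*kW) x columns
    out = out.view(1, 4, 8, 8)                    # back to (N, outC, H_out, W_out)

    # Matches the built-in convolution (no bias):
    assert torch.allclose(out, F.conv2d(x, w, padding=1), atol=1e-5)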
    • The CPU im2col, my_conv_im2col_cpu:
    void my_conv_im2col_cpu(Tensor data_im,
                            const int channels, const int height,
                            const int width, const int ksize_h,
                            const int ksize_w, const int pad_h, const int pad_w,
                            const int stride_h, const int stride_w,
                            const int dilation_h, const int dilation_w,
                            const int parallel_imgs, Tensor data_col)
    {
        // Compute the output height and width of im2col
        int height_col =
            (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
        int width_col =
            (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;

        // Total number of kernel elements to process
        int num_kernels = channels * height_col * width_col * parallel_imgs;

        // PyTorch's AT_DISPATCH_FLOATING_TYPES_AND_HALF macro dispatches over
        // the floating-point types plus half precision
        AT_DISPATCH_FLOATING_TYPES_AND_HALF(
            data_im.scalar_type(), "", [&]
            {
                // Call the concrete im2col kernel, instantiated for the
                // dispatched scalar type
                my_conv_im2col_cpu_kernel<scalar_t>(
                    num_kernels, data_im.data_ptr<scalar_t>(),
                    height, width, ksize_h, ksize_w,
                    pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
                    parallel_imgs, channels,
                    height_col, width_col, data_col.data_ptr<scalar_t>());
            });
    }
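    The height_col / width_col expressions are the standard convolution output-size formula. A quick numeric check with assumed values:

    height, pad_h, ksize_h, dilation_h, stride_h = 5, 1, 3, 1, 1
    height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) // stride_h + 1
    print(height_col)  # 5: kernel 3 with padding 1 and stride 1 preserves the size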

The Python Wrapper

  • Calling it:
import my_ops
my_ops.my_conv_forward(...)
  • Wrapping the C++ function with Pybind11:
PYBIND11_MODULE(my_ops, m)
{
    m.def("my_conv_forward", my_conv_forward, "my_conv_forward",
          py::arg("input"), py::arg("weight"), py::arg("bias"),
          py::arg("output"), py::arg("columns"), py::arg("kW"),
          py::arg("kH"), py::arg("dW"), py::arg("dH"), py::arg("padW"),
          py::arg("padH"), py::arg("dilationW"), py::arg("dilationH"),
          py::arg("group"), py::arg("im2col_step"));
}

Through Pybind11, the function my_conv_forward can be called directly from Python; the arguments are forwarded to the underlying C++ implementation, which performs the convolution.
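Note that my_conv_forward writes into the preallocated output and columns tensors instead of returning a new tensor, so a direct call has to supply those buffers. A minimal sketch with assumed shapes (the MyConvF wrapper below does this bookkeeping automatically):

import torch
import my_ops

x = torch.randn(1, 1, 5, 5)
w = torch.randn(1, 1, 3, 3)
b = torch.randn(1)
out = x.new_empty((1, 1, 5, 5))  # 5x5 stays 5x5 with kernel 3, padding 1, stride 1
cols = x.new_empty(0)            # column buffer, resized inside the op

my_ops.my_conv_forward(x, w, b, out, cols,
                       kW=3, kH=3, dW=1, dH=1, padW=1, padH=1,
                       dilationW=1, dilationH=1, group=1, im2col_step=1)
print(out.shape)  # torch.Size([1, 1, 5, 5])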

  • The MyConvF and MyConv2d classes
import torch
from torch.autograd import Function
import torch.nn as nn
from torch import Tensor
from torch.nn.modules.utils import _pair
from torch.nn.parameter import Parameter

import my_ops


class MyConvF(Function):

    @staticmethod
    def forward(ctx,
                input: torch.Tensor,
                weight,
                bias,
                stride=1,
                padding=0,
                dilation=1,
                groups=1,
                im2col_step=32):
        if input is not None and input.dim() != 4:
            raise ValueError(
                f'Expected 4D tensor as input, got {input.dim()}D tensor '
                'instead.')
        ctx.stride = _pair(stride)
        ctx.padding = _pair(padding)
        ctx.dilation = _pair(dilation)
        ctx.groups = groups
        ctx.im2col_step = im2col_step

        weight = weight.type_as(input)
        ctx.save_for_backward(input, weight)

        output = input.new_empty(MyConvF._output_size(ctx, input, weight))

        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones

        cur_im2col_step = min(ctx.im2col_step, input.size(0))
        assert (input.size(0) % cur_im2col_step
                ) == 0, 'batch size must be divisible by im2col_step'

        my_ops.my_conv_forward(
            input,
            weight,
            bias,
            output,
            ctx.bufs_[0],
            kW=weight.size(3),
            kH=weight.size(2),
            dW=ctx.stride[1],
            dH=ctx.stride[0],
            padW=ctx.padding[1],
            padH=ctx.padding[0],
            dilationW=ctx.dilation[1],
            dilationH=ctx.dilation[0],
            group=ctx.groups,
            im2col_step=cur_im2col_step)
        return output

    @staticmethod
    def _output_size(ctx, input, weight):
        channels = weight.size(0)
        output_size = (input.size(0), channels)
        for d in range(input.dim() - 2):
            in_size = input.size(d + 2)
            pad = ctx.padding[d]
            kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1
            stride_ = ctx.stride[d]
            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
        if not all(map(lambda s: s > 0, output_size)):
            raise ValueError(
                'convolution input is too small (output would be ' +
                'x'.join(map(str, output_size)) + ')')
        return output_size


my_conv = MyConvF.apply


class MyConv2d(nn.Module):

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups: int = 1,
                 bias: bool = True):
        super().__init__()
        kernel_size_ = _pair(kernel_size)
        stride_ = _pair(stride)
        padding_ = _pair(padding)
        dilation_ = _pair(dilation)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size_
        self.stride = stride_
        self.padding = padding_
        self.dilation = dilation_
        self.groups = groups
        # Parameters are left uninitialized; the tests copy values in via
        # load_state_dict from a reference nn.Conv2d
        self.weight = Parameter(
            torch.Tensor(out_channels, in_channels // groups, *kernel_size_))
        self.bias = Parameter(torch.Tensor(out_channels))

        # Useless attributes
        self.transposed = None
        self.output_padding = None
        self.padding_mode = None

    def forward(self, input: Tensor) -> Tensor:
        return my_conv(input, self.weight, self.bias, self.stride,
                       self.padding, self.dilation, self.groups)
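The split of responsibilities mirrors PyTorch's own convolution: MyConvF does the low-level bookkeeping (normalizing the hyperparameters with _pair, sizing the output via _output_size, allocating the column buffer, and calling into the extension), while MyConv2d wraps it as an ordinary nn.Module, which is why .to(device_name) and load_state_dict work in the tests above exactly as they do for nn.Conv2d.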