算法
https://github.com/zhenqifu/PairLIE
在RTX3090上用Python推理一次约5ms,但算上把数据移到GPU、运算完成后再移回CPU,一共需要650.68ms,主要原因是4K图像数据量太大。如果采用多进程,或者直接在GPU上显示渲染画面,可以达到很高的帧率。
import torch, time
# Benchmark host<->device copy time for one 4K float32 frame.
# Do one untimed transfer first: PyTorch initialises CUDA lazily, so without
# this the first measured iteration would also pay for that initialisation.
torch.zeros(1).to("cuda")
torch.cuda.synchronize()
for _ in range(10):
    # Generate a random tensor (its creation is excluded from the timing).
    tensor = torch.rand(1, 3, 2160, 3840)
    s_time = time.time()
    x = tensor.to("cuda")
    # Make sure the copy has really finished before reading the clock.
    torch.cuda.synchronize()
    print("Time cpu->cuda: ", (time.time() - s_time))
    s_time = time.time()
    y = x.to("cpu")
    torch.cuda.synchronize()
    print("Time: cuda->cpu", (time.time() - s_time))
1
2
3
4
5
6
7
8
9
10
11
12
13
2
3
4
5
6
7
8
9
10
11
12
13
使用上面的代码测试,cpu->cuda平均时间8.5ms,cuda->cpu平均时间45.1ms,共计53.6ms。从这个结果看,如果要求4K实时性,我这台电脑上的Python已经无法胜任。
我的配置HP-Z4-G5工作站
CPU: Intel(R) Xeon(R) w5-2455X
GPU: NVIDIA GeForce RTX 3090 / NVIDIA-SMI 535.146.02
OS: Ubuntu20.04
MEM: 128GB
1
2
3
4
2
3
4
导出onnx
# export.py
import torch.onnx
from net.net import net
# Export the PairLIE network to ONNX with dynamic batch/height/width.
dummy_input = torch.randn(1, 3, 640, 640)
model = net()
# Load the pretrained weights onto the CPU regardless of where they were saved.
model.load_state_dict(torch.load("./weights/PairLIE.pth", map_location="cpu"))
model.eval()
# Sanity-check the forward pass; no_grad avoids building an autograd graph
# for a result that is thrown away.
with torch.no_grad():
    model(dummy_input)

# Example input used for tracing (already on the CPU).
im = torch.zeros(1, 3, 1333, 800)
# Mark batch, height and width as dynamic on the input and on every output,
# so the exported graph carries named dynamic dimensions on both sides.
dynamic_dims = {0: 'batch', 2: 'height', 3: 'width'}
torch.onnx.export(
    model,
    im,
    "PairLIE.onnx",
    verbose=False,
    opset_version=11,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['L', 'R', 'X'],
    dynamic_axes={name: dict(dynamic_dims) for name in ('input', 'L', 'R', 'X')},
)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
onnxsim
优化onnx
验证onnx是否正确
可以加个客观评价指标
# test_onnx.py
import onnxruntime as ort
import onnxruntime
import cv2
import numpy as np
import time
print(onnxruntime.get_device(), onnxruntime.get_available_providers())


def _to_bgr_uint8(img):
    """Convert a float HWC image scaled to [0, 255] into a uint8 array for cv2.imwrite.

    Values are rounded and clipped. Three-channel images are treated as RGB
    and swapped to OpenCV's BGR order; any other channel count is returned
    without a colour conversion, since the RGB<->BGR conversion is not
    applicable to a single-channel map.
    """
    img = np.clip(np.rint(img), 0, 255).astype(np.uint8)
    if img.ndim == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    return img


# Load the model.
# Providers reported on the GPU machine:
#   ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']
# Seconds per run measured with the first three providers: 0.483, 0.528, 1.036
model = ort.InferenceSession("models/PairLIE-dynamic.onnx", providers=["CPUExecutionProvider"])

# Open the image; cv2.imread returns None instead of raising when it fails.
image_path = 'img/00000025_1026_4k.png'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError(image_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Preprocess: HWC uint8 -> NCHW float32 scaled to [0, 1].
input_tensor = np.expand_dims(img.transpose(2, 0, 1), axis=0).astype(np.float32) / 255.0
print(input_tensor.shape)

# Inference: one untimed warm-up run, then the average over n_runs runs.
L, R, X = model.run(["L", "R", "X"], {"input": input_tensor})
n_runs = 10
s_time = time.time()
for _ in range(n_runs):
    L, R, X = model.run(["L", "R", "X"], {"input": input_tensor})
print("Time: ", (time.time() - s_time) / n_runs)

# Post-process: D is the input minus X, I is the enhanced image L^0.2 * R.
D = input_tensor - X
I = np.power(L, 0.2) * R

# Save each map as NCHW -> HWC, scaled back to [0, 255].
name = ["test.jpg"]
for prefix, array in (("L_", L), ("R_", R), ("I_", I), ("D_", D)):
    hwc = array.squeeze(0).transpose(1, 2, 0) * 255
    cv2.imwrite("NPE/" + prefix + name[0], _to_bgr_uint8(hwc))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
找了一张4K图像测试推理,测试结果如下:
provider | TensorrtExecutionProvider | CUDAExecutionProvider | AzureExecutionProvider | CPUExecutionProvider |
---|---|---|---|---|
分辨率 | 4k | 4k | 4k | 4k |
时间ms | 482.95 | 528.12 | 1036.34 | 11038.60 |
onnx转tensorrt
静态转换(静态shape)
# Build an FP16 engine from the static-shape 2160x3840 ONNX model and save it.
trtexec --onnx=models/PairLIE-static-2160x3840.onnx \
--fp16 \
--saveEngine=models/PairLIE-static-2160x3840.engine
1
2
3
2
3
动态转换(动态shape)
# Build an FP16 engine with a dynamic-shape range for the tensor "input":
# min 1x3x224x224, opt 1x3x640x640, max 1x3x2160x3840.
trtexec --onnx=PairLIE.onnx \
--minShapes=input:1x3x224x224 \
--optShapes=input:1x3x640x640 \
--maxShapes=input:1x3x2160x3840 \
--fp16 \
--saveEngine=PairLIE-dynamic.engine
1
2
3
4
5
6
7
2
3
4
5
6
7
TensorRT Python验证
2160x3840
357.45ms
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
import time
# Load a TensorRT engine
def load_engine(engine_path):
    """Read the serialized engine file at ``engine_path`` and deserialize it.

    Returns whatever ``trt.Runtime.deserialize_cuda_engine`` returns for the
    file's bytes. The runtime logs at WARNING level.
    """
    with open(engine_path, "rb") as engine_file:
        logger = trt.Logger(trt.Logger.WARNING)
        runtime = trt.Runtime(logger)
        serialized = engine_file.read()
        engine = runtime.deserialize_cuda_engine(serialized)
    return engine
# Create a TensorRT execution context
def create_context(engine):
    """Return a new execution context created from ``engine``."""
    return engine.create_execution_context()
# Allocate GPU memory for the inputs and copy the input data into it
def allocate_and_copy(inputs, context):
    """Upload every array in ``inputs`` to the GPU.

    Each input is converted to a flat float32 array, a device buffer of the
    same byte size is allocated, and the data is copied host -> device.

    Args:
        inputs: iterable of array-likes to upload.
        context: accepted for signature compatibility; not used here.

    Returns:
        ``(inputs_host, inputs_cuda)``: the flattened host arrays and the
        matching device allocations, in input order.
    """
    host_buffers = []
    device_buffers = []
    for item in inputs:
        flat = np.array(item, dtype=np.float32).reshape(-1)
        device_mem = cuda.mem_alloc(flat.nbytes)
        host_buffers.append(flat)
        device_buffers.append(device_mem)
        cuda.memcpy_htod(device_mem, flat)
    return host_buffers, device_buffers
# Allocate host and device buffers for the output data
def allocate_outputs(context):
    """Allocate one host array and one device buffer per output binding.

    Args:
        context: TensorRT execution context; its ``engine`` is inspected for
            the binding count, direction and dtype.

    Returns:
        ``(outputs_host, outputs_cuda)``: flat host arrays and the matching
        device allocations, in binding-index order.

    Raises:
        ValueError: if an output shape still contains a negative (unresolved)
            dimension, e.g. a dynamic-shape engine whose input shape has not
            been set on the context yet.
    """
    engine = context.engine
    outputs_host = []
    outputs_cuda = []
    for binding in range(engine.num_bindings):
        if engine.binding_is_input(binding):
            continue
        # Ask the context, not the engine: the engine reports -1 for dynamic
        # dimensions, while the context reports the shape resolved for the
        # currently set input shapes.
        shape = tuple(context.get_binding_shape(binding))
        if any(dim < 0 for dim in shape):
            raise ValueError(
                f"output binding {binding} has unresolved shape {shape}; "
                "set the input binding shape on the context first"
            )
        size = trt.volume(shape) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        output_host = np.empty(size, dtype=dtype)
        output_cuda = cuda.mem_alloc(output_host.nbytes)
        outputs_host.append(output_host)
        outputs_cuda.append(output_cuda)
    return outputs_host, outputs_cuda
# Run TensorRT inference
def run_inference(context, inputs_cuda, outputs_cuda):
    """Run one inference and block until it has finished.

    The binding list is built as all input device pointers followed by all
    output device pointers.
    NOTE(review): this assumes the engine's input bindings come before its
    output bindings in binding-index order; confirm for other engines.

    Raises:
        RuntimeError: if TensorRT reports that the enqueue failed.
    """
    bindings = [int(buf) for buf in inputs_cuda]
    bindings.extend(int(buf) for buf in outputs_cuda)
    stream = cuda.Stream()
    enqueued = context.execute_async_v2(bindings, stream_handle=stream.handle)
    if not enqueued:
        raise RuntimeError("TensorRT execute_async_v2 failed to enqueue inference")
    # The stream is local to this call, so wait here; otherwise callers would
    # read the output buffers while the work may still be in flight.
    stream.synchronize()
# Copy the output data from GPU memory back to CPU memory
def copy_outputs(outputs_cuda, outputs_host):
    """Copy each device buffer into its paired host array, in place.

    Buffers are paired positionally. Returns ``outputs_host`` itself.
    """
    for device_buf, host_buf in zip(outputs_cuda, outputs_host):
        cuda.memcpy_dtoh(host_buf, device_buf)
    return outputs_host
# Preprocess the input data
def preprocess_input(image):
    """Turn a BGR image into a 1xCxHxW float32 array scaled by 1/255.

    No resizing or cropping is done: the image is assumed to already have the
    size the model expects.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    chw = rgb.transpose(2, 0, 1)
    batched = np.expand_dims(chw, axis=0)
    return batched.astype(np.float32) / 255.0
# Number of timed inference + copy-back iterations in main().
_TIMED_RUNS = 100


def _write_rgb_image(path, img):
    """Save a float HWC image scaled to [0, 255] with cv2.imwrite.

    Values are rounded and clipped to uint8. Three-channel images are treated
    as RGB and swapped to OpenCV's BGR order; any other channel count is
    written without a colour conversion, since the RGB<->BGR conversion is
    not applicable to a single-channel map.
    """
    img = np.clip(np.rint(img), 0, 255).astype(np.uint8)
    if img.ndim == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    cv2.imwrite(path, img)


# Main entry point
def main():
    """Run the TensorRT engine on one image, time it, and save the results."""
    # Load the TensorRT engine
    engine = load_engine("models/PairLIE-static-2160x3840-fp32.engine")
    # Create the TensorRT execution context
    context = create_context(engine)
    # Load the input image; cv2.imread returns None instead of raising.
    image_path = "./img/00000025_1026_4k.png"
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(image_path)
    print("image:", image.shape)  # (height, width, channels)
    height, width = image.shape[:2]
    # Preprocess the input data
    preprocessed_image = preprocess_input(image)
    # Allocate host/device memory for the outputs
    outputs_host, outputs_cuda = allocate_outputs(context)
    # Allocate GPU memory for the input and copy it over
    inputs_host, inputs_cuda = allocate_and_copy([preprocessed_image], context)
    # Warm-up run, excluded from the timing
    run_inference(context, inputs_cuda, outputs_cuda)
    # NOTE(review): assumes the engine's output bindings are ordered X, L, R;
    # confirm against the engine.
    X, L, R = copy_outputs(outputs_cuda, outputs_host)
    s_time = time.time()
    for _ in range(_TIMED_RUNS):
        # Run TensorRT inference
        run_inference(context, inputs_cuda, outputs_cuda)
        # Copy the outputs from GPU memory to CPU memory
        X, L, R = copy_outputs(outputs_cuda, outputs_host)
    # Average over the number of iterations actually executed.
    print("Time: ", (time.time() - s_time) / _TIMED_RUNS)
    # Flat buffers -> CHW -> HWC; R has 3 channels, L has 1.
    R = R.reshape(3, height, width).transpose(1, 2, 0)
    L = L.reshape(1, height, width).transpose(1, 2, 0)
    I = np.power(L, 0.2) * R
    # Save
    name = ["test.jpg"]
    _write_rgb_image('NPE/L-TRT_' + name[0], L * 255)
    _write_rgb_image("NPE/R-TRT_" + name[0], R * 255)
    _write_rgb_image("NPE/I-TRT_" + name[0], I * 255)


if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
TensorRT C++验证
编写TensorRT C++代码
1
编写CMakeLists.txt
cmake_minimum_required(VERSION 3.10)
project(TensorRT-Demo)

# C++ standard
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# TensorRT location. The default is the author's install path; override it
# with -DTENSORRT_DIR=/path/to/TensorRT on other machines.
set(TENSORRT_DIR /home/dengyibin/work/TensorRT-8.5.1.7 CACHE PATH "TensorRT installation root")
set(TENSORRT_INCLUDE_DIRS ${TENSORRT_DIR}/include)
set(TENSORRT_LIBRARY_DIRS ${TENSORRT_DIR}/lib)
message(STATUS "TENSORRT Libs: \n${TENSORRT_LIBRARY_DIRS}\n")
message(STATUS "TENSORRT Headers: \n${TENSORRT_INCLUDE_DIRS}\n")

find_package(Threads REQUIRED)

# CUDA
find_package(CUDA REQUIRED)
message(STATUS "CUDA Libs: \n${CUDA_LIBRARIES}\n")
message(STATUS "CUDA Headers: \n${CUDA_INCLUDE_DIRS}\n")

# OpenCV
set(OpenCV_DIR /software/opencv/lib/cmake/opencv4)
find_package(OpenCV REQUIRED)
message(STATUS "OpenCV Libs: \n${OpenCV_LIBS}\n")
message(STATUS "OpenCV Libraries: \n${OpenCV_LIBRARIES}\n")
message(STATUS "OpenCV Headers: \n${OpenCV_INCLUDE_DIRS}\n")

# Eigen
find_package(Eigen3 REQUIRED)
message(STATUS "Eigen3 Headers: \n${EIGEN3_INCLUDE_DIR}\n")
message(STATUS "Eigen3 Libraries: \n${EIGEN3_LIBRARIES}\n")

# Include directories
list(APPEND INCLUDE_DIRS
    ${CUDA_INCLUDE_DIRS}
    ${OpenCV_INCLUDE_DIRS}
    ${TENSORRT_INCLUDE_DIRS}
    ${EIGEN3_INCLUDE_DIR}
    include
)

# Library search directories. Only directories belong here; library files
# such as ${CUDA_LIBRARIES} and ${OpenCV_LIBRARIES} go in ALL_LIBS below.
list(APPEND ALL_LIB_DIRS
    ${CUDA_LIB_DIR}
    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
    ${TENSORRT_LIBRARY_DIRS}
    ${TENSORRT_LIBRARY_DIRS}/stubs
)

# Libraries to link. ${CUDA_LIBRARIES} already provides the CUDA runtime and
# ${OpenCV_LIBRARIES} already lists the OpenCV modules, so they are not
# repeated by bare name.
list(APPEND ALL_LIBS
    ${CUDA_LIBRARIES}
    ${OpenCV_LIBRARIES}
    nvinfer
    nvinfer_plugin
)

include_directories(${INCLUDE_DIRS})
link_directories(${ALL_LIB_DIRS})

# Executables
add_executable(${PROJECT_NAME} main.cpp)
add_executable(demo demo.cpp)
target_link_libraries(${PROJECT_NAME} PRIVATE Threads::Threads ${ALL_LIBS})
target_link_libraries(demo PRIVATE Threads::Threads ${ALL_LIBS})
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
编译
# Configure and build out-of-tree in ./build.
# -p keeps mkdir from failing when the directory already exists (rebuilds).
mkdir -p build
cd build
cmake ..
make -j 4
1
2
3
4
2
3
4
运行
./TensorRT-Demo
1
部署到Jetson NX 16G
测评结果
部署到Jetson AGX 64G
测评结果
时间包含读取图像、CPU-->GPU、GPU-->CPU的耗时;TensorRT-C++的数据预处理太慢。
平台 | 硬件 | 精度 | 引擎 | 尺寸 | FPS |
---|---|---|---|---|---|
Ubuntu20.04 | RTX3090 | FP32 | Pytorch | 4K | 1.328 |
Ubuntu20.04 | RTX3090 | FP32 | ONNX | 4K | 1.690 |
Ubuntu20.04 | RTX3090 | FP32 | TRT-Python | 4K | 1.763 |
Ubuntu20.04 | RTX3090 | FP32 | TRT-C++ | 4K | 1.254 |
后续
- 部署到RK3588
- 部署到海思
- 部署到百度
- …