算法优化部署 TensorRT

TensorRT部署低照度算法-Jetson边缘端移植

那棵树看起来生气了

2024-02-02 21:53:58

0 点赞

68 阅读

2024-02-02

算法
导出onnx
onnxsim
验证onnx是否正确
onnx转tensorrt
TensorRT Python验证
TensorRT C++验证
部署到Jetson NX 16G
- 测评结果
部署到Jetson AGX 64G
- 测评结果
后续
参考

算法

https://github.com/zhenqifu/PairLIE

用 Python 在 RTX3090 上推理一次只需 5ms，但算上把数据移到 GPU、运算完后再移回 CPU，一共需要 650.68ms，主要原因是 4K 图像的数据量太大。如果考虑多进程，或者直接在 GPU 上渲染显示画面，可以达到很高的帧率。

import torch, time


# Number of timed host<->device round trips.
num_runs = 10

# Warm up once so CUDA context creation is not billed to the first timed copy.
torch.zeros(1).to("cuda")
torch.cuda.synchronize()

h2d_total = 0.0
d2h_total = 0.0
for _ in range(num_runs):
    # Random host tensor with the shape of one 4K RGB frame (1 x 3 x 2160 x 3840).
    tensor = torch.rand(1, 3, 2160, 3840)

    s_time = time.perf_counter()
    x = tensor.to("cuda")
    # Make sure the copy has really finished before the clock is read.
    torch.cuda.synchronize()
    h2d = time.perf_counter() - s_time
    h2d_total += h2d
    print("Time cpu->cuda: ", h2d)

    s_time = time.perf_counter()
    y = x.to("cpu")
    torch.cuda.synchronize()
    d2h = time.perf_counter() - s_time
    d2h_total += d2h
    print("Time: cuda->cpu", d2h)

# Averages over all timed runs, in seconds.
print("Mean cpu->cuda: ", h2d_total / num_runs)
print("Mean cuda->cpu: ", d2h_total / num_runs)

1
2
3
4
5
6
7
8
9
10
11
12
13

使用上面的代码测试，cpu->cuda 平均耗时 8.5ms，cuda->cpu 平均耗时 45.1ms，共计 53.6ms。从这个结果看，如果要求 4K 实时性，我这台电脑上的 Python 方案已经无法胜任。

我的配置HP-Z4-G5工作站

CPU: Intel(R) Xeon(R) w5-2455X
GPU: NVIDIA GeForce RTX 3090 / NVIDIA-SMI 535.146.02
OS: Ubuntu20.04
MEM: 128GB

1
2
3
4

导出onnx

# export.py
import torch.onnx
from net.net import net

# Build the network and load the trained weights onto the CPU.
model = net()
model.load_state_dict(torch.load("./weights/PairLIE.pth", map_location="cpu"))
model.eval()

# Sanity-check forward pass at a small resolution; gradients are not needed.
dummy_input = torch.randn(1, 3, 640, 640)
with torch.no_grad():
    model(dummy_input)

# Example input used to trace the graph during export.
im = torch.zeros(1, 3, 1333, 800).to("cpu")

# Batch, height and width are dynamic. The outputs are marked dynamic as well,
# otherwise they would be declared with the fixed trace-time shape.
# NOTE(review): assumes L, R and X keep the input's spatial size — confirm.
dynamic_dims = {0: 'batch', 2: 'height', 3: 'width'}
torch.onnx.export(
    model,
    im,
    "PairLIE.onnx",
    verbose=False,
    opset_version=11,
    do_constant_folding=True,
    input_names=['input'],
    output_names=['L', 'R', 'X'],
    dynamic_axes={
        tensor_name: dict(dynamic_dims)
        for tensor_name in ('input', 'L', 'R', 'X')
    },
)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18

onnxsim

优化onnx

验证onnx是否正确

可以加个客观评价指标

# test_onnx.py
import onnxruntime as ort
import onnxruntime
import cv2
import numpy as np
import time

def _to_bgr_uint8(nchw):
    """Convert a (1, C, H, W) float array in [0, 1] to an HWC uint8 image for cv2.imwrite.

    Three-channel data is treated as RGB and swapped to BGR (OpenCV's channel
    order); single-channel data is returned as-is, because cv2.cvtColor with a
    BGR/RGB code rejects one-channel input.
    """
    hwc = nchw.squeeze(0).transpose(1, 2, 0) * 255
    hwc = np.ascontiguousarray(np.clip(np.rint(hwc), 0, 255).astype(np.uint8))
    if hwc.shape[2] == 3:
        hwc = cv2.cvtColor(hwc, cv2.COLOR_RGB2BGR)
    return hwc


print(onnxruntime.get_device(), onnxruntime.get_available_providers())
# Load the model.
# Providers reported on the author's GPU machine:
#   ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'AzureExecutionProvider', 'CPUExecutionProvider']
# NOTE(review): the values below look like the measured seconds per 4K run for
# the first three providers, in the same order — confirm:
#   0.4829598665237427, 0.5281200170516968, 1.036345481872559
model = ort.InferenceSession("models/PairLIE-dynamic.onnx", providers=["CPUExecutionProvider"])

# Read the test image; cv2.imread returns None instead of raising on failure.
image_path = 'img/00000025_1026_4k.png'
img = cv2.imread(image_path)
if img is None:
    raise FileNotFoundError("Cannot read image: " + image_path)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Pre-process: HWC uint8 -> 1xCxHxW float32 scaled to [0, 1].
input_tensor = np.expand_dims(img.transpose(2, 0, 1), axis=0).astype(np.float32) / 255.0
print(input_tensor.shape)

# Warm-up inference (not timed).
L, R, X = model.run(["L", "R", "X"], {"input": input_tensor})

# Timed inference: mean seconds per run.
num_runs = 10
s_time = time.perf_counter()
for _ in range(num_runs):
    L, R, X = model.run(["L", "R", "X"], {"input": input_tensor})

print("Time: ", (time.perf_counter() - s_time) / num_runs)

# Post-process: D is the difference between the input and X, I is L**0.2 * R.
D = input_tensor - X
I = np.power(L, 0.2) * R

# Save every result under NPE/ with a per-tensor prefix.
name = ["test.jpg"]
for prefix, tensor in (("L_", L), ("R_", R), ("I_", I), ("D_", D)):
    cv2.imwrite("NPE/" + prefix + name[0], _to_bgr_uint8(tensor))

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

找了一张4K图像测试推理，测试结果如下：

provider	TensorrtExecutionProvider	CUDAExecutionProvider	AzureExecutionProvider	CPUExecutionProvider
分辨率	4k	4k	4k	4k
时间ms	482.95	528.12	1036.34	11038.60

onnx转tensorrt

静态转换（静态shape）

# Build a fixed-shape TensorRT engine from the 2160x3840 ONNX export.
# --fp16 enables FP16 precision; --saveEngine writes the serialized engine to disk.
# NOTE(review): the Python script later in this post loads
# PairLIE-static-2160x3840-fp32.engine, not the file saved here — confirm which
# engine was actually benchmarked.
trtexec --onnx=models/PairLIE-static-2160x3840.onnx \
--fp16 \
--saveEngine=models/PairLIE-static-2160x3840.engine

1
2
3

动态转换（动态shape）

# Build a dynamic-shape TensorRT engine from PairLIE.onnx.
# min/opt/max shapes give the allowed range for the tensor named "input":
# from 1x3x224x224 up to 1x3x2160x3840, tuned for 1x3x640x640.
trtexec --onnx=PairLIE.onnx \
--minShapes=input:1x3x224x224 \
--optShapes=input:1x3x640x640 \
--maxShapes=input:1x3x2160x3840 \
--fp16 \
--saveEngine=PairLIE-dynamic.engine

1
2
3
4
5
6
7

TensorRT Python验证

2160x3840
357.45ms

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
import time

def load_engine(engine_path):
    """Read a serialized TensorRT engine file and deserialize it.

    Args:
        engine_path: path of the serialized engine file.

    Returns:
        The result of ``Runtime.deserialize_cuda_engine`` on the file's bytes.
    """
    with open(engine_path, "rb") as engine_file:
        serialized_engine = engine_file.read()

    # Only warnings and more severe messages are logged.
    trt_logger = trt.Logger(trt.Logger.WARNING)
    trt_runtime = trt.Runtime(trt_logger)
    return trt_runtime.deserialize_cuda_engine(serialized_engine)

def create_context(engine):
    """Return a new execution context created from ``engine``."""
    return engine.create_execution_context()

def allocate_and_copy(inputs, context):
    """Allocate one device buffer per input and upload the data to it.

    Args:
        inputs: iterable of array-likes; each is converted to a flat float32 array.
        context: accepted for signature symmetry; not used by this function.

    Returns:
        (inputs_host, inputs_cuda): the flattened host arrays and the matching
        device allocations, in input order.
    """
    host_arrays = []
    device_buffers = []
    for array in inputs:
        # Flatten to 1-D float32 so nbytes is the exact size to allocate and copy.
        flat = np.array(array, dtype=np.float32).reshape(-1)
        device_mem = cuda.mem_alloc(flat.nbytes)
        host_arrays.append(flat)
        device_buffers.append(device_mem)
        cuda.memcpy_htod(device_mem, flat)
    return host_arrays, device_buffers

def allocate_outputs(context):
    """Allocate a host array and a device buffer for every output binding.

    Args:
        context: TensorRT execution context; its engine supplies the binding
            count, direction and dtype, and the context supplies the shape.

    Returns:
        (outputs_host, outputs_cuda): flat host arrays and the matching device
        allocations, in binding order.

    Raises:
        ValueError: if an output shape still has an unresolved (negative)
            dimension, e.g. a dynamic-shape engine whose input shape was not set.
    """
    engine = context.engine
    outputs_host = []
    outputs_cuda = []
    for binding in range(engine.num_bindings):
        # Input bindings are handled by allocate_and_copy.
        if engine.binding_is_input(binding):
            continue

        # Query the context so that dynamic dimensions resolved on it are used.
        shape = context.get_binding_shape(binding)
        # A product over -1 entries can come out positive (1x3x-1x-1 -> 3) and
        # silently under-allocate, so unresolved dimensions are rejected.
        if any(dim < 0 for dim in shape):
            raise ValueError(
                "Output binding %d has unresolved shape %s; set the input "
                "shape on the context first" % (binding, tuple(shape))
            )

        size = trt.volume(shape) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        output_host = np.empty(size, dtype=dtype)
        output_cuda = cuda.mem_alloc(output_host.nbytes)
        outputs_host.append(output_host)
        outputs_cuda.append(output_cuda)
    return outputs_host, outputs_cuda

def run_inference(context, inputs_cuda, outputs_cuda):
    """Run one inference and wait until it has finished.

    Args:
        context: TensorRT execution context.
        inputs_cuda: device buffers for the inputs.
        outputs_cuda: device buffers for the outputs.

    Raises:
        RuntimeError: if TensorRT reports that the work could not be enqueued.
    """
    # Binding list is inputs followed by outputs.
    # NOTE(review): assumes the engine's binding indices follow that order — confirm.
    bindings = [int(buffer) for buffer in inputs_cuda]
    bindings.extend(int(buffer) for buffer in outputs_cuda)

    stream = cuda.Stream()
    if not context.execute_async_v2(bindings, stream_handle=stream.handle):
        raise RuntimeError("TensorRT failed to enqueue inference")
    # The call above only enqueues work; block until it completes so callers
    # read finished results and time the real execution.
    stream.synchronize()

def copy_outputs(outputs_cuda, outputs_host):
    """Download each device output buffer into its paired host array.

    Buffers are paired positionally; ``outputs_host`` is filled in place and
    the same object is returned.
    """
    for device_buffer, host_array in zip(outputs_cuda, outputs_host):
        cuda.memcpy_dtoh(host_array, device_buffer)
    return outputs_host

def preprocess_input(image):
    """Turn a BGR image into a 1xCxHxW float32 array scaled by 1/255.

    No resizing or cropping is done; the image is assumed to already have the
    size the model expects.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # HWC -> CHW, then add a leading batch axis.
    chw = rgb.transpose(2, 0, 1)
    batched = chw[np.newaxis, ...]
    return batched.astype(np.float32) / 255.0


def _to_uint8_image(array):
    """Scale a float image in [0, 1] to a contiguous uint8 image in [0, 255]."""
    scaled = np.clip(np.rint(array * 255.0), 0, 255)
    return np.ascontiguousarray(scaled.astype(np.uint8))


def main():
    """Benchmark the TensorRT engine on one image and save the results.

    Loads the engine, runs one warm-up inference, times ``num_runs`` further
    inferences (execution plus device-to-host copy) and writes the L, R and
    I = L**0.2 * R images under ``NPE/``.

    Raises:
        FileNotFoundError: if the test image cannot be read.
    """
    # Load the engine and create an execution context.
    engine = load_engine("models/PairLIE-static-2160x3840-fp32.engine")
    context = create_context(engine)

    # Load the input image; cv2.imread returns None instead of raising.
    image_path = "./img/00000025_1026_4k.png"
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError("Cannot read image: " + image_path)
    print("image:", image.shape)  # (height, width, channels)
    height, width = image.shape[:2]

    preprocessed_image = preprocess_input(image)

    # Allocate output buffers, then allocate and upload the input.
    outputs_host, outputs_cuda = allocate_outputs(context)
    inputs_host, inputs_cuda = allocate_and_copy([preprocessed_image], context)

    # Warm-up run (not timed).
    # NOTE(review): the X, L, R unpacking order must match the engine's output
    # binding order — confirm against the engine.
    run_inference(context, inputs_cuda, outputs_cuda)
    X, L, R = copy_outputs(outputs_cuda, outputs_host)

    # Timed runs. The mean divides by the real iteration count; the previous
    # hard-coded divisor of 10 over 100 iterations over-reported by 10x.
    num_runs = 100
    s_time = time.time()
    for _ in range(num_runs):
        run_inference(context, inputs_cuda, outputs_cuda)
        X, L, R = copy_outputs(outputs_cuda, outputs_host)

    print("Time: ", (time.time() - s_time) / num_runs)

    # Flat buffers -> HWC images, sized from the input image instead of a
    # hard-coded 2160x3840.
    R = R.reshape(3, height, width).transpose(1, 2, 0)
    L = L.reshape(1, height, width).transpose(1, 2, 0)
    X = X.reshape(3, height, width).transpose(1, 2, 0)  # not saved; size check only
    I = np.power(L, 0.2) * R

    # Save. L is single-channel, so it is written as grayscale: cv2.cvtColor
    # with a BGR/RGB code rejects one-channel input. R and I are RGB and are
    # swapped to OpenCV's BGR order.
    name = ["test.jpg"]
    cv2.imwrite('NPE/L-TRT_' + name[0], _to_uint8_image(L))
    cv2.imwrite("NPE/R-TRT_" + name[0], cv2.cvtColor(_to_uint8_image(R), cv2.COLOR_RGB2BGR))
    cv2.imwrite("NPE/I-TRT_" + name[0], cv2.cvtColor(_to_uint8_image(I), cv2.COLOR_RGB2BGR))

# Run the benchmark only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131

TensorRT C++验证

编写TensorRT C++代码

编写CMakeLists.txt

cmake_minimum_required(VERSION 3.10)
project(TensorRT-Demo)

# C++ standard
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# TensorRT install root. The default is the author's path; override it with
#   cmake -DTENSORRT_DIR=/path/to/TensorRT ..
set(TENSORRT_DIR /home/dengyibin/work/TensorRT-8.5.1.7 CACHE PATH "TensorRT installation root")
set(TENSORRT_INCLUDE_DIRS ${TENSORRT_DIR}/include)
set(TENSORRT_LIBRARY_DIRS ${TENSORRT_DIR}/lib)

message(STATUS "TENSORRT Libs: \n${TENSORRT_LIBRARY_DIRS}\n")
message(STATUS "TENSORRT Headers: \n${TENSORRT_INCLUDE_DIRS}\n")

find_package(Threads REQUIRED)

# CUDA
find_package(CUDA REQUIRED)
message(STATUS "CUDA Libs: \n${CUDA_LIBRARIES}\n")
message(STATUS "CUDA Headers: \n${CUDA_INCLUDE_DIRS}\n")

# OpenCV. The default is the author's path; override with -DOpenCV_DIR=...
set(OpenCV_DIR /software/opencv/lib/cmake/opencv4 CACHE PATH "Directory containing OpenCVConfig.cmake")
find_package(OpenCV REQUIRED)
message(STATUS "OpenCV Libs: \n${OpenCV_LIBS}\n")
message(STATUS "OpenCV Libraries: \n${OpenCV_LIBRARIES}\n")
message(STATUS "OpenCV Headers: \n${OpenCV_INCLUDE_DIRS}\n")

# Eigen
find_package(Eigen3 REQUIRED)
message(STATUS "Eigen3 Headers: \n${EIGEN3_INCLUDE_DIR}\n")
message(STATUS "Eigen3 Libraries: \n${EIGEN3_LIBRARIES}\n")

# Include directories
list(APPEND INCLUDE_DIRS
    ${CUDA_INCLUDE_DIRS}
    ${OpenCV_INCLUDE_DIRS}
    ${TENSORRT_INCLUDE_DIRS}
    ${EIGEN3_INCLUDE_DIR}
    include
)

# Library search directories. Only directories belong here: the previous list
# also passed library files (CUDA_LIBRARIES, OpenCV_LIBRARIES) and the
# undefined CUDA_LIB_DIR to link_directories().
list(APPEND ALL_LIB_DIRS
    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
    ${TENSORRT_LIBRARY_DIRS}
    ${TENSORRT_LIBRARY_DIRS}/stubs
)

# Libraries to link
list(APPEND ALL_LIBS
    ${CUDA_LIBRARIES}
    ${OpenCV_LIBRARIES}
    nvinfer
    nvinfer_plugin
    cudart
    opencv_core
    opencv_imgcodecs
    opencv_imgproc
)

include_directories(${INCLUDE_DIRS})
# Must come before add_executable() to affect the targets below.
link_directories(${ALL_LIB_DIRS})


# Executables
add_executable(${PROJECT_NAME} main.cpp)
add_executable(demo demo.cpp)

target_link_libraries(${PROJECT_NAME} PRIVATE Threads::Threads ${ALL_LIBS})
target_link_libraries(demo PRIVATE Threads::Threads ${ALL_LIBS})

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76

编译

# Configure and build out-of-tree in ./build.
# -p keeps this step re-runnable when the directory already exists.
mkdir -p build
cd build
cmake ..
make -j 4

1
2
3
4

运行

# Run the executable built from main.cpp (from inside the build directory).
./TensorRT-Demo

部署到Jetson NX 16G

测评结果

部署到Jetson AGX 64G

测评结果

时间包含读取图像以及 CPU-->GPU、GPU-->CPU 的数据传输；TensorRT-C++ 的数据预处理太慢。

平台	硬件	精度	引擎	尺寸	FPS
Ubuntu20.04	RTX3090	FP32	Pytorch	4K	1.328
Ubuntu20.04	RTX3090	FP32	ONNX	4K	1.690
Ubuntu20.04	RTX3090	FP32	TRT-Python	4K	1.763
Ubuntu20.04	RTX3090	FP32	TRT-C++	4K	1.254

后续

部署到RK3588
部署到海思
部署到百度
…

参考

版权属于：

那棵树看起来生气了

本文链接：

https://dengyb.com/archives/44.html（转载时请注明本文出处及文章链接）

作品采用：

《署名-非商业性使用-相同方式共享 4.0 国际 (CC BY-NC-SA 4.0)》许可协议授权

那棵树看起来生气了

参差荇菜，左右流之。窈窕淑女，寤寐求之。

151 文章数

20082 浏览量