Hello,
I’m trying to implement a custom object detection but in the style of the image_viewer sample with the 3D bounding box detection.
As my setup I’m using:
Jetson agx
ZED SDK 4.0.6
and ZED 2 camera
I came up with the following code, which however is not working as in the sample you provided. I tried to follow this guide but I don’t understand on how to ingest the weights of my yolo model into the CUSTOM_BOX_DETECTION.
This is the Code I came up with (a combination of the image viewer sample and the custom object detection example):
import sys
import ogl_viewer.viewer as gl
import pyzed.sl as sl
import argparse
import numpy as np
import torch
import cv2
import torch.backends.cudnn as cudnn
sys.path.insert(0, './yolov5')
from models.experimental import attempt_load
from utils.general import check_img_size, non_max_suppression, scale_boxes, xyxy2xywh
from utils.torch_utils import select_device
from utils.augmentations import letterbox
from threading import Lock, Thread
from time import sleep
import ogl_viewer.viewer as gl
lock = Lock()
run_signal = False
exit_signal = False
def xywh2abcd(xywh, im_shape):
output = np.zeros((4, 2))
# Center / Width / Height -> BBox corners coordinates
x_min = (xywh[0] - 0.5*xywh[2]) * im_shape[1]
x_max = (xywh[0] + 0.5*xywh[2]) * im_shape[1]
y_min = (xywh[1] - 0.5*xywh[3]) * im_shape[0]
y_max = (xywh[1] + 0.5*xywh[3]) * im_shape[0]
# A ------ B
# | Object |
# D ------ C
output[0][0] = x_min
output[0][1] = y_min
output[1][0] = x_max
output[1][1] = y_min
output[2][0] = x_min
output[2][1] = y_max
output[3][0] = x_max
output[3][1] = y_max
return output
def img_preprocess(img, device, half, net_size):
net_image, ratio, pad = letterbox(img[:, :, :3], net_size, auto=False)
net_image = net_image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
net_image = np.ascontiguousarray(net_image)
img = torch.from_numpy(net_image).to(device)
img = img.half() if half else img.float() # uint8 to fp16/32
img /= 255.0 # 0 - 255 to 0.0 - 1.0
if img.ndimension() == 3:
img = img.unsqueeze(0)
return img, ratio, pad
def detections_to_custom_box(detections, im, im0):
output = []
for i, det in enumerate(detections):
if len(det):
det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0.shape).round()
gn = torch.tensor(im0.shape)[[1, 0, 1, 0]] # normalization gain whwh
for *xyxy, conf, cls in reversed(det):
xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh
# Creating ingestable objects for the ZED SDK
obj = sl.CustomBoxObjectData()
obj.bounding_box_2d = xywh2abcd(xywh, im0.shape)
obj.label = cls
obj.probability = conf
obj.is_grounded = False
output.append(obj)
return output
def torch_thread(weights, img_size, conf_thres=0.2, iou_thres=0.45):
global image_net, exit_signal, run_signal, detections
print("Intializing Network...")
device = select_device()
half = device.type != 'cpu' # half precision only supported on CUDA
imgsz = img_size
# Load model
model = attempt_load(weights, device=device) # load FP32
stride = int(model.stride.max()) # model stride
imgsz = check_img_size(imgsz, s=stride) # check img_size
if half:
model.half() # to FP16
cudnn.benchmark = True
# Run inference
if device.type != 'cpu':
model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters()))) # run once
while not exit_signal:
if run_signal:
lock.acquire()
img, ratio, pad = img_preprocess(image_net, device, half, imgsz)
pred = model(img)[0]
det = non_max_suppression(pred, conf_thres, iou_thres)
# ZED CustomBox format (with inverse letterboxing tf applied)
detections = detections_to_custom_box(det, img, image_net)
lock.release()
run_signal = False
sleep(0.01)
def parse_args(init):
if len(opt.input_svo_file)>0 and opt.input_svo_file.endswith(".svo"):
init.set_from_svo_file(opt.input_svo_file)
print("[Sample] Using SVO File input: {0}".format(opt.input_svo_file))
elif len(opt.ip_address)>0 :
ip_str = opt.ip_address
if ip_str.replace(':','').replace('.','').isdigit() and len(ip_str.split('.'))==4 and len(ip_str.split(':'))==2:
init.set_from_stream(ip_str.split(':')[0],int(ip_str.split(':')[1]))
print("[Sample] Using Stream input, IP : ",ip_str)
elif ip_str.replace(':','').replace('.','').isdigit() and len(ip_str.split('.'))==4:
init.set_from_stream(ip_str)
print("[Sample] Using Stream input, IP : ",ip_str)
else :
print("Unvalid IP format. Using live stream")
if ("HD2K" in opt.resolution):
init.camera_resolution = sl.RESOLUTION.HD2K
print("[Sample] Using Camera in resolution HD2K")
elif ("HD1200" in opt.resolution):
init.camera_resolution = sl.RESOLUTION.HD1200
print("[Sample] Using Camera in resolution HD1200")
elif ("HD1080" in opt.resolution):
init.camera_resolution = sl.RESOLUTION.HD1080
print("[Sample] Using Camera in resolution HD1080")
elif ("HD720" in opt.resolution):
init.camera_resolution = sl.RESOLUTION.HD720
print("[Sample] Using Camera in resolution HD720")
elif ("SVGA" in opt.resolution):
init.camera_resolution = sl.RESOLUTION.SVGA
print("[Sample] Using Camera in resolution SVGA")
elif ("VGA" in opt.resolution):
init.camera_resolution = sl.RESOLUTION.VGA
print("[Sample] Using Camera in resolution VGA")
elif len(opt.resolution)>0:
print("[Sample] No valid resolution entered. Using default")
else :
print("[Sample] Using default resolution")
def main():
global image_net, exit_signal, run_signal, detections
capture_thread = Thread(target=torch_thread, kwargs={'weights': opt.weights, 'img_size': opt.img_size, "conf_thres": opt.conf_thres})
capture_thread.start()
print("Initializing Camera...")
# Create a Camera object
zed = sl.Camera()
input_type = sl.InputType()
if opt.svo is not None:
input_type.set_from_svo_file(opt.svo)
# Create a InitParameters object and set configuration parameters
init_params = sl.InitParameters(input_t=input_type, svo_real_time_mode=True)
init_params.coordinate_units = sl.UNIT.METER
init_params.depth_mode = sl.DEPTH_MODE.ULTRA # QUALITY
init_params.coordinate_system = sl.COORDINATE_SYSTEM.RIGHT_HANDED_Y_UP
init_params.depth_maximum_distance = 50
#runtime_params = sl.RuntimeParameters()
status = zed.open(init_params)
if status != sl.ERROR_CODE.SUCCESS:
print(repr(status))
exit()
parse_args(init_params)
# Open the camera
err = zed.open(init_params)
if err != sl.ERROR_CODE.SUCCESS:
exit(1)
print("Initialized Camera")
# Enable object detection module
obj_param = sl.ObjectDetectionParameters()
# Defines if the object detection will track objects across images flow.
obj_param.enable_tracking = False # if True, enable positional tracking
obj_param.detection_model = sl.OBJECT_DETECTION_MODEL.CUSTOM_BOX_OBJECTS
if obj_param.enable_tracking:
zed.enable_positional_tracking()
zed.enable_object_detection(obj_param)
objects = sl.Objects()
obj_runtime_param = sl.ObjectDetectionRuntimeParameters()
# Display
camera_info = zed.get_camera_information()
# Create OpenGL viewer
viewer = gl.GLViewer()
viewer.init(camera_info.camera_configuration.calibration_parameters.left_cam, obj_param.enable_tracking)
# Configure object detection runtime parameters
obj_runtime_param = sl.ObjectDetectionRuntimeParameters()
obj_runtime_param.detection_confidence_threshold = 60
obj_runtime_param.object_class_filter = [sl.OBJECT_CLASS.PERSON] # Only detect Persons
# Create ZED objects filled in the main loop
objects = sl.Objects()
image = sl.Mat()
# Set runtime parameters
runtime_parameters = sl.RuntimeParameters()
while viewer.is_available():
# Grab an image, a RuntimeParameters object must be given to grab()
if zed.grab(runtime_parameters) == sl.ERROR_CODE.SUCCESS:
# Retrieve left image
zed.retrieve_image(image, sl.VIEW.LEFT)
# Retrieve objects
zed.retrieve_objects(objects, obj_runtime_param)
# Update GL view
viewer.update_view(image, objects)
viewer.exit()
image.free(memory_type=sl.MEM.CPU)
# Disable modules and close camera
zed.disable_object_detection()
zed.disable_positional_tracking()
zed.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--weights', nargs='+', type=str, default='../weights/best.pt', help='model.pt path(s)')
parser.add_argument('--svo', type=str, default=None, help='optional svo file')
parser.add_argument('--img_size', type=int, default=416, help='inference size (pixels)')
parser.add_argument('--conf_thres', type=float, default=0.4, help='object confidence threshold')
parser.add_argument('--input_svo_file', type=str, help='Path to an .svo file, if you want to replay it',default = '')
parser.add_argument('--ip_address', type=str, help='IP Adress, in format a.b.c.d:port or a.b.c.d, if you have a streaming setup', default = '')
parser.add_argument('--resolution', type=str, help='Resolution, can be either HD2K, HD1200, HD1080, HD720, SVGA or VGA', default = '')
opt = parser.parse_args()
if len(opt.input_svo_file)>0 and len(opt.ip_address)>0:
print("Specify only input_svo_file or ip_address, or none to use wired camera, not both. Exit program")
exit()
with torch.no_grad():
main()
It’s probably my lack of knowledge with the ZED SDK I’m struggeling with.
I’m happy about any info you can provide or where I can learn more on how to implement it properly.
Thanks a lot in advance!