Realtime object position estimation using yolo v7 and ZED 2?

Hi everyone, I wanted to do a realtime object position estimation (3D world coordinates) using ZED 2 Cam. I have custom trained YOLOV7 model using github repository : GitHub - pHidayatullah/yolov7: Implementation of paper - YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors which detects traffic cones. How do I load Yolov7 weights i.e in pt format and cfg file in .yaml format to my code.

Thanks in advance!!

Hello and welcome to our forum,

To ingest external object detections into our SDK, we provide Custom Object Detection : Using the Object Detection API with a Custom Detector | Stereolabs

You can check out our examples from here : zed-examples/object detection/custom detector at master · stereolabs/zed-examples · GitHub
You can probably reproduce what is done in the yolov5 example.


Hi team… I did custom object detection in yolov7 pyTorch mode. Here is the code :

#!/usr/bin/env python3

import sys
import numpy as np
from OpenGL.GLUT import *
import argparse
import torch
import cv2
import as sl
import torch.backends.cudnn as cudnn

sys.path.insert(0, ‘./yolov7-gpu’)
from models.experimental import attempt_load
from utils.general import check_img_size, non_max_suppression, scale_coords, xyxy2xywh
from utils.torch_utils import select_device
from utils.datasets import letterbox

from threading import Lock, Thread
from time import sleep

import ogl_viewer.viewer as gl
import cv_viewer.tracking_viewer as cv_viewer

lock = Lock()
run_signal = False
exit_signal = False

def img_preprocess(img, device, half, net_size):
net_image, ratio, pad = letterbox(img[:, :, :3], net_size, auto=False)
net_image = net_image.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
net_image = np.ascontiguousarray(net_image)

img = torch.from_numpy(net_image).to(device)
img = img.half() if half else img.float()  # uint8 to fp16/32
img /= 255.0  # 0 - 255 to 0.0 - 1.0

if img.ndimension() == 3:
    img = img.unsqueeze(0)
return img, ratio, pad

def xywh2abcd(xywh, im_shape):
output = np.zeros((4, 2))

# Center / Width / Height -> BBox corners coordinates
x_min = (xywh[0] - 0.5*xywh[2]) * im_shape[1]
x_max = (xywh[0] + 0.5*xywh[2]) * im_shape[1]
y_min = (xywh[1] - 0.5*xywh[3]) * im_shape[0]
y_max = (xywh[1] + 0.5*xywh[3]) * im_shape[0]

# A ------ B
# | Object |
# D ------ C

output[0][0] = x_min
output[0][1] = y_min

output[1][0] = x_max
output[1][1] = y_min

output[2][0] = x_min
output[2][1] = y_max

output[3][0] = x_max
output[3][1] = y_max
return output

def detections_to_custom_box(detections, im, im0):
output = []
for i, det in enumerate(detections):
if len(det):
det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()
gn = torch.tensor(im0.shape)[[1, 0, 1, 0]] # normalization gain whwh

        for *xyxy, conf, cls in reversed(det):
            xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh

            # Creating ingestable objects for the ZED SDK
            obj = sl.CustomBoxObjectData()
            obj.bounding_box_2d = xywh2abcd(xywh, im0.shape)
            obj.label = cls
            obj.probability = conf
            obj.is_grounded = False
return output

def torch_thread(weights, img_size, conf_thres=0.2, iou_thres=0.45):
global image_net, exit_signal, run_signal, detections

print("Intializing Network...")

device = select_device()
half = device.type != 'cpu'  # half precision only supported on CUDA
imgsz = img_size

# Load model
model = attempt_load(weights, map_location=device)  # load FP32
stride = int(model.stride.max())  # model stride
imgsz = check_img_size(imgsz, s=stride)  # check img_size
if half:
    model.half()  # to FP16
cudnn.benchmark = True

# Run inference
if device.type != 'cpu':
    model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once

while not exit_signal:
    if run_signal:
        img, ratio, pad = img_preprocess(image_net, device, half, imgsz)

        pred = model(img)[0]
        det = non_max_suppression(pred, conf_thres, iou_thres)

        # ZED CustomBox format (with inverse letterboxing tf applied)
        detections = detections_to_custom_box(det, img, image_net)
        run_signal = False

def main():
global image_net, exit_signal, run_signal, detections

capture_thread = Thread(target=torch_thread,
                        kwargs={'weights': opt.weights, 'img_size': opt.img_size, "conf_thres": opt.conf_thres})

print("Initializing Camera...")

zed = sl.Camera()

input_type = sl.InputType()
if opt.svo is not None:

# Create a InitParameters object and set configuration parameters
init_params = sl.InitParameters(input_t=input_type, svo_real_time_mode=True)
init_params.camera_resolution = sl.RESOLUTION.HD720
init_params.coordinate_units = sl.UNIT.METER
init_params.depth_mode = sl.DEPTH_MODE.ULTRA  # QUALITY
init_params.coordinate_system = sl.COORDINATE_SYSTEM.RIGHT_HANDED_Y_UP
init_params.depth_maximum_distance = 50

runtime_params = sl.RuntimeParameters()
status =

if status != sl.ERROR_CODE.SUCCESS:

image_left_tmp = sl.Mat()

print("Initialized Camera")

positional_tracking_parameters = sl.PositionalTrackingParameters()
# If the camera is static, uncomment the following line to have better performances and boxes sticked to the ground.
# positional_tracking_parameters.set_as_static = True

obj_param = sl.ObjectDetectionParameters()
obj_param.detection_model = sl.DETECTION_MODEL.CUSTOM_BOX_OBJECTS
obj_param.enable_tracking = True

objects = sl.Objects()
obj_runtime_param = sl.ObjectDetectionRuntimeParameters()

# Display
camera_infos = zed.get_camera_information()
# Create OpenGL viewer
viewer = gl.GLViewer()
point_cloud_res = sl.Resolution(min(camera_infos.camera_resolution.width, 720),
                                min(camera_infos.camera_resolution.height, 404))
point_cloud_render = sl.Mat()
viewer.init(camera_infos.camera_model, point_cloud_res, obj_param.enable_tracking)
point_cloud = sl.Mat(point_cloud_res.width, point_cloud_res.height, sl.MAT_TYPE.F32_C4, sl.MEM.CPU)
image_left = sl.Mat()
# Utilities for 2D display
display_resolution = sl.Resolution(min(camera_infos.camera_resolution.width, 1280),
                                   min(camera_infos.camera_resolution.height, 720))
image_scale = [display_resolution.width / camera_infos.camera_resolution.width, display_resolution.height / camera_infos.camera_resolution.height]
image_left_ocv = np.full((display_resolution.height, display_resolution.width, 4), [245, 239, 239, 255], np.uint8)

# Utilities for tracks view
camera_config = zed.get_camera_information().camera_configuration
tracks_resolution = sl.Resolution(400, display_resolution.height)
track_view_generator = cv_viewer.TrackingViewer(tracks_resolution, camera_config.camera_fps,
image_track_ocv = np.zeros((tracks_resolution.height, tracks_resolution.width, 4), np.uint8)
# Camera pose
cam_w_pose = sl.Pose()

while viewer.is_available() and not exit_signal:
    if zed.grab(runtime_params) == sl.ERROR_CODE.SUCCESS:
        # -- Get the image
        zed.retrieve_image(image_left_tmp, sl.VIEW.LEFT)
        image_net = image_left_tmp.get_data()
        run_signal = True

        # -- Detection running on the other thread
        while run_signal:

        # Wait for detections
        # -- Ingest detections
        zed.retrieve_objects(objects, obj_runtime_param)

        # -- Display
        # Retrieve display data
        zed.retrieve_measure(point_cloud, sl.MEASURE.XYZRGBA, sl.MEM.CPU, point_cloud_res)
        zed.retrieve_image(image_left, sl.VIEW.LEFT, sl.MEM.CPU, display_resolution)
        zed.get_position(cam_w_pose, sl.REFERENCE_FRAME.WORLD)

        # 3D rendering
        viewer.updateData(point_cloud_render, objects)
        # 2D rendering
        np.copyto(image_left_ocv, image_left.get_data())
        cv_viewer.render_2D(image_left_ocv, image_scale, objects, obj_param.enable_tracking)
        global_image = cv2.hconcat([image_left_ocv, image_track_ocv])
        # Tracking view
        track_view_generator.generate_view(objects, cam_w_pose, image_track_ocv, objects.is_tracked)

        cv2.imshow("ZED | 2D View and Birds View", global_image)
        key = cv2.waitKey(10)
        if key == 27:
            exit_signal = True
        exit_signal = True

exit_signal = True

if name == ‘main’:
parser = argparse.ArgumentParser()
parser.add_argument(‘–weights’, nargs=‘+’, type=str, default=‘…/weights/’, help=‘ path(s)’)
parser.add_argument(‘–svo’, type=str, default=None, help=‘optional svo file’)
parser.add_argument(‘–img_size’, type=int, default=416, help=‘inference size (pixels)’)
parser.add_argument(‘–conf_thres’, type=float, default=0.4, help=‘object confidence threshold’)
opt = parser.parse_args()

with torch.no_grad():

But I am getting invalid svo error and list index out of range.

Intializing Network…
Initializing Camera…
Fusing layers…
Exception in thread Thread-1 (torch_thread):
Traceback (most recent call last):
File “C:\Users\shirs\Documents\Pythonobjdetection\yolov7-gpu\utils\”, line 26, in attempt_download
assets = [x[‘name’] for x in response[‘assets’]] # release assets
KeyError: ‘assets’

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File “C:\Users\shirs\anaconda3\envs\yolov7-gpu-env\lib\”, line 1016, in _bootstrap_inner
File “C:\Users\shirs\anaconda3\envs\yolov7-gpu-env\lib\”, line 953, in run
self._target(*self._args, **self._kwargs)
File “C:\Users\shirs\Documents\Pythonobjdetection\yolov7-gpu\”, line 100, in torch_thread
model = attempt_load(weights, map_location=device) # load FP32
File “C:\Users\shirs\Documents\Pythonobjdetection\yolov7-gpu\models\”, line 251, in attempt_load
File “C:\Users\shirs\Documents\Pythonobjdetection\yolov7-gpu\utils\”, line 31, in attempt_download
tag = subprocess.check_output(‘git tag’, shell=True).decode().split()[-1]
IndexError: list index out of range

Pls can you give solution?