I have been attempting to implement Object Detection for the Zed1 camera using YOLOv5, OpenCV, and the Zed API in Python. The end goal is to use this to feed coordinates to the motion planner of a robotic arm. However, the depth values vary wildly when I feed it the center of the bounding box that YOLO generates, the object is ~560 mm away but i get values in the thousands of mm. Running depth at the center of the camera gets me an accurate depth reading but it is not so accurate for other points in the depth map. What can I do to better this performance?
Here is my code:
import cv2
import yolov5
import numpy as np
import time
import threading
import queue
import cv2, moveit_commander,faulthandler,geometry_msgs
import shape_msgs.msg
from moveit_msgs.msg import Constraints, PositionConstraint, BoundingVolume
from tf.transformations import quaternion_from_euler
import numpy as np
import datetime
import pyzed.sl as sl
import math
######################################################################################################################################################
# 1. DEFINING FUNCTIONS
######################################################################################################################################################
# coordwriter -
def coordwriter():
if bottle_event.is_set():
try:
label=label_queue.get(timeout=1)
c_x=cx_queue.get(timeout=1)
c_y=cy_queue.get(timeout=1)
depth_value = depth_queue.get(timeout=1)
#Getting Bottle coordinates and transforming them
bot_x=round(((((c_x-left_cx)*depth_value)*0.001)/left_fx),4)
bot_z=round(((((c_y-left_cy)*depth_value)*0.001)/left_fy),4)
bot_y=round((depth_value*0.001),4)
print("Bottle detected at:()",bot_x,",",bot_y,",",bot_z,")")
quarternion = quaternion_from_euler(roll,pitch,yaw)
print("Assigning Pose goal...")
pose_goal.orientation.x = quarternion[0]
pose_goal.orientation.y = quarternion[1]
pose_goal.orientation.z = quarternion[2]
pose_goal.orientation.w = quarternion[3]
pose_goal.position.x = bot_x
pose_goal.position.y = -bot_y - 0.25
pose_goal.position.z = bot_z
move_group.set_pose_target(pose_goal)
print("Pose Goal set.")
print(pose_goal)
print("Moving to goal position...")
plan = move_group.go(wait=True)
print("Achieved goal position.")
# Calling `stop()` ensures that there is no residual movement
move_group.stop()
# It is always good to clear your targets after planning with poses.
move_group.clear_pose_targets()
bottle_event.clear()
except queue.Empty:
pass
def coordwriter_manager():
global bottle_detected
while True:
if bottle_detected:
with coordwriter_lock:
if bottle_detected:
print("Bottle detected and circle drawn. Triggering coordwriter thread.")
coord_writer_thread = threading.Thread(target=coordwriter)
coord_writer_thread.start()
bottle_detected = False
time.sleep(30) # Adjust the sleep time to control the rate
######################################################################################################################################################
# 1. DEFINING VARIABLES AND OBJECTS
######################################################################################################################################################
# Time stamp for video title
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
objdetect = "Videos/objdetect"+timestamp+".avi"
depthcolor = "Videos/depthcolor"+timestamp+".avi"
# STEREO CAMERA REQUIRED FOR THIS SCRIPT - I used ZED Stereo Camera
# Time stamp for video title
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
objdetect = "Videos/objdetect"+timestamp+".avi"
depthcolor = "Videos/depthcolor"+timestamp+".avi"
# STEREO CAMERA REQUIRED FOR THIS SCRIPT - I used ZED Stereo Camera
# CAMERA PARAMETERS - Convert disparity to depth using calibration parameters
baseline = 120 # Replace with your camera's baseline (in mm)
focal_length = 889 # Replace with your camera's focal length (in pixels)
camera_offset = [-0.04, -0.068, -0.05] #Camera location in reference to Jaco Arm Origin
#Translational matrix
stereo_tx=120.0010
stereo_ty=-0.00497935
stereo_tz=0.00734397
#Rotational Matrix
stereo_rx=-0.0129092
stereo_ry=0
Stereo_rz=-0.000354373
#CAMERA MATRICES - For pixel-coordinate transformations - At VGA resolution -~640x480
#FOCAL LENGTHS
left_fx= 700.5150
left_fy= 700.5151
right_fx=699.6100
right_fy=699.6100
#PRINCIPAL POINTS
left_cx=613.285
left_cy=337.6010
right_cx=612.4750
right_cy=366.4365
#RADIAL DISTORTION COEFFICIENTS
left_k1=-0.1721
left_k2=0.0267
left_k3=0
right_k1=-0.1726
right_k2= 0.0269
right_k3=0
#TANGENTIAL DISTORTION COEFFICIENTS
left_p1=0.0021
left_p2=-0.0006
right_p1=0.0019
right_p2=-0.0006
# LOAD PRETRAINED YOLOV5 MODEL
model = yolov5.load('yolov5s.pt')
# set model parameters
model.conf = 0.50 # NMS confidence threshold
model.iou = 0.45 # NMS IoU threshold
model.agnostic = False # NMS class-agnostic
model.multi_label = False # NMS multiple labels per box
model.max_det = 1000 # maximum number of detections per image
# Load class names
class_names = model.names
# Define color mapping based on YOLOv5's official color palette
color_palette = [
(255, 0, 0), # Red
(0, 255, 0), # Green
(0, 0, 255), # Blue
(255, 255, 0), # Yellow
(0, 255, 255), # Cyan
(255, 0, 255) # Magenta
]
class_colors = {i: color_palette[i % len(color_palette)] for i in range(len(class_names))}
#CREATE OPENCV OBJECT FOR CAMERA FEED
# Create video capture feed object
#cam_feed = cv2.VideoCapture(0)
# Fetch camera frame dimensions - needed for measurements
#frame_width = int(cam_feed.get(3))
#frame_height = int(cam_feed.get(4))
#size = (frame_width, frame_height)
#Optional Recording Objects - If you wish to record your output. Can be commented out if not necessary
#recording = cv2.VideoWriter(objdetect, cv2.VideoWriter_fourcc(*'MJPG'),10, size)
#recording2 = cv2.VideoWriter(depthcolor, cv2.VideoWriter_fourcc(*'MJPG'),10, size)
#Jaco Arm MoveIt-Commander Objects--------------------------------------------------------------------------------------------------------------------
#commander for entire robot
robot = moveit_commander.RobotCommander()
#print current state of robot for debugging
print(robot.get_current_state())
# Create a PlanningSceneInterface instance
scene = moveit_commander.PlanningSceneInterface()
#Robotic Arm Movement Group - Moves every joint besides the fingers on Jaco
group_name = 'arm'
move_group = moveit_commander.MoveGroupCommander(group_name)
move_group.set_max_velocity_scaling_factor(1.0)
# gripper move_group2 - For control of the gripper
group2_name = 'gripper'
move_group2 = moveit_commander.MoveGroupCommander(group2_name)
move_group2.set_max_velocity_scaling_factor(1.0)
move_group.set_planning_time(10)
#Coordinates - Cartesian Home Position (m) and Euler Angle Orientation (rad) for arm
home_x = 0.03
home_y = -0.17
home_z = 0.3
roll = 2.1472
pitch = 0.0766
yaw = -0.17594
pose_goal = geometry_msgs.msg.Pose()
# Setting Constraints -------------------------------------------------------------------------------------------------------------------------------
#Threads, events, queues, flags-----------------------------------------------------------------------------------------------------------------------
coord_writer_thread = threading.Thread(target=coordwriter)
bottle_event=threading.Event()
label_queue=queue.Queue()
cx_queue=queue.Queue()
cy_queue=queue.Queue()
depth_queue=queue.Queue()
# Creating ZED Camera Object----------------------------------------------------------------------------------------------------------------------
zed = sl.Camera()
# Create a InitParameters object and set configuration parameters
init_params = sl.InitParameters()
init_params.depth_mode = sl.DEPTH_MODE.ULTRA # Use ULTRA depth mode
init_params.coordinate_units = sl.UNIT.MILLIMETER # Use meter units (for depth measurements)
# Create an RGBA sl.Mat object
# Open the camera
status = zed.open(init_params)
if status != sl.ERROR_CODE.SUCCESS: #Ensure the camera has opened succesfully
print("Camera Open : "+repr(status)+". Exit program.")
exit()
# Create and set RuntimeParameters after opening the camera
runtime_parameters = sl.RuntimeParameters()
runtime_parameters.enable_fill_mode = False
runtime_parameters.confidence_threshold = 30
image = sl.Mat()
depth = sl.Mat()
depth_view = sl.Mat()
point_cloud = sl.Mat()
mirror_ref = sl.Transform()
mirror_ref.set_translation(sl.Translation(2.75,4.0,0))
# Flag to indicate if a bottle has been detected
bottle_detected = False
# Lock to prevent multiple threads from starting the coordwriter thread simultaneously
coordwriter_lock = threading.Lock()
######################################################################################################################################################
# 2. MAIN LOOP FOR FUNCTION
######################################################################################################################################################
print("Starting bottle detector thread.")
coord_writer_thread.start()
coordwriter_manager_thread = threading.Thread(target=coordwriter_manager)
coordwriter_manager_thread.start()
# While loop for live camera feed - disparity map, object detection, camera output, and recording will take place here frame by frame
while True:
if zed.grab(runtime_parameters) == sl.ERROR_CODE.SUCCESS:
# Retrieve left image
zed.retrieve_image(image, sl.VIEW.LEFT)
image_ocv = image.get_data()
# Retrieve depth map. Depth is aligned on the left image
zed.retrieve_measure(depth, sl.MEASURE.DEPTH)
zed.retrieve_image(depth_view, sl.VIEW.DEPTH)
# Retrieve colored point cloud. Point cloud is aligned on the left image.
zed.retrieve_measure(point_cloud, sl.MEASURE.XYZRGBA)
depth_ocv = depth_view.get_data()
pointcloud_ocv = point_cloud.get_data()
image_ocv = image.get_data()
# Get and print distance value in mm at the center of the image
# We measure the distance camera - object using Euclidean distance
#x = round(image.get_width() / 2)
#y = round(image.get_height() / 2)
#CREATING DISPARITY MAP---------------------------------------------------------------------------------------------------------------------------
#CLEANING DISPARITY MAP OUTPUT-------------------------------------------------------------------------------------------------------------------
#OBJECT DETECTION IMPLEMENTATION------------------------------------------------------------------------------------------------------------------
# Perform object detection
results = model(image_ocv)
# Parse results
predictions = results.pred[0]
boxes = predictions[:, :4] # x1, y1, x2, y2
scores = predictions[:, 4]
categories = predictions[:, 5]
# Draw bounding boxes and labels on the frame
for box, score, category in zip(boxes, scores, categories):
x1, y1, x2, y2 = map(int, box)
c_x = int((x1 + x2) / 2)
c_y = int((y1 + y2) / 2)
label = class_names[int(category)]
color = class_colors[int(category)]
cv2.circle(image_ocv, (c_x, c_y), 1, color, 2)
cv2.rectangle(image_ocv, (x1, y1), (x2, y2), color, 2)
#err, point_cloud_value = point_cloud.get_value(c_x, c_y)
#err, depthval = depth.get_value(c_x,c_y)
distance=None
nearest_depth = None
err, point_cloud_value = point_cloud.get_value(c_x, c_y)
depth_value = depth.get_value(c_x,c_y)
if math.isfinite(point_cloud_value[2]):
distance = math.sqrt(point_cloud_value[0] * point_cloud_value[0] +
point_cloud_value[1] * point_cloud_value[1] +
point_cloud_value[2] * point_cloud_value[2])
cv2.putText(image_ocv, f'{label} {score:.6f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
cv2.putText(image_ocv, f'{distance:.6f} mm', (x1, y1 - 30), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
if distance > 900:
print(f"Distance to Camera from {label} centered at: {{{c_x};{c_y}}}: {distance}")
print(point_cloud_value[2])
print(f"Depth at: {{{c_x};{c_y}}}: {depth_value}")
print(f"{label} is too far!!")
if distance < 900:
if label== 'bottle':
print(f"Distance to Camera from {label} centered at: {{{c_x};{c_y}}}: {distance}")
print(f"Depth at: {{{c_x};{c_y}}}: {depth_value}")
bottle_detected=True
label_queue.put(label)
cx_queue.put(c_x)
cy_queue.put(c_y)
depth_queue.put(distance)
bottle_event.set()
coordwriter()
pass
# OTHER NEEDED FUNCTIONS: RECORDING TO .AVI FILES, DISPLAY FEED TO USER, KEYBOARD INTERRUPT-------------------------------------------------------
# Record the frame
#recording.write(image)
# Display the disparity colormap, depth information, and input frame
cv2.imshow('Zed Camera Feed', image_ocv)
cv2.imshow('Zed Depth Map',depth_ocv)
#cv2.imshow('Zed Point Cloud',pointcloud_ocv)
c = cv2.waitKey(1)
if c == 27:
break
#When done----------------------------------------------------------------------------------------------------------------------------------------
# Release resources
#cam_feed.release()
#recording.release()
#recording2.release()
coord_writer_thread.join()
cv2.destroyAllWindows()