Custom object detection with my own inference

Hello everyone.
I’m trying to set my own model in custom detector api (python). I’ve already trained my net and it works well with images as inputs, however I’d like to use Zed2 camera to detect my custom objects.
I tried to manage with tutorial from stereolabs and also git samples, however finally I got stuck and can’t find solution, why api doesn’t work properly.
Detector worked well with original model and detected human, bag, etc. Now it doesn’t detect anything.

Can anyone help me what I made wrong and how can I properly define my own models for detection?

Here is my code:

# Import packages
from collections import namedtuple
import os
import cv2
import numpy as np
import tensorflow.compat.v1 as tf
import sys
import pyzed.sl as sl
import xml.etree.ElementTree as ET

def main(): 
   # This is needed since the notebook is stored in the object_detection folder.
   sys.path.append("..")

   # Import utilites
   from object_detection.utils import label_map_util
   from object_detection.utils import visualization_utils as vis_util

   # Name of the directory containing the object detection module we're using
   MODEL_NAME = 'inference_graph_3'
   IMAGE_NAME = 'PC046572.JPG'

   # Grab path to current working directory
   CWD_PATH = os.getcwd()

   # Path to frozen detection graph .pb file, which contains the model that is used
   # for object detection.
   PATH_TO_CKPT = os.path.join(CWD_PATH,MODEL_NAME,'frozen_inference_graph.pb')

   # Path to label map file
   PATH_TO_LABELS = os.path.join(CWD_PATH,'data','labelmap.pbtxt')

   Detection = namedtuple("Detection", ["image_path", "gt", "pred"])

   # Number of classes the object detector can identify
   NUM_CLASSES = 1

   # Load the label map.
   # Label maps map indices to category names, so that when our convolution
   # network predicts `5`, we know that this corresponds to `king`.
   # Here we use internal utility functions, but anything that returns a
   # dictionary mapping integers to appropriate string labels would be fine
   label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
   categories = label_map_util.convert_label_map_to_categories(label_map, max_num_classes=NUM_CLASSES, use_display_name=True)
   category_index = label_map_util.create_category_index(categories)

   # Load the Tensorflow model into memory.
   detection_graph = tf.Graph()
   with detection_graph.as_default():
       od_graph_def = tf.GraphDef()
       with tf.io.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
           serialized_graph = fid.read()
           od_graph_def.ParseFromString(serialized_graph)
           tf.import_graph_def(od_graph_def, name='')

       sess = tf.Session(graph=detection_graph)

   # Define input and output tensors (i.e. data) for the object detection classifier

   # Input tensor is the image
   image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')

   # Create a Camera object
   zed = sl.Camera()

   # Create a InitParameters object and set configuration parameters
   init_params = sl.InitParameters()
   init_params.camera_resolution = sl.RESOLUTION.HD720  # Use HD720 video mode
   init_params.depth_mode = sl.DEPTH_MODE.PERFORMANCE
   init_params.coordinate_units = sl.UNIT.METER
   init_params.sdk_verbose = True

    # Open the camera
   err = zed.open(init_params)
   if err != sl.ERROR_CODE.SUCCESS:
        exit(1)

   obj_param = sl.ObjectDetectionParameters()
   obj_param.detection_model = sl.DETECTION_MODEL.CUSTOM_BOX_OBJECTS
   obj_param.enable_tracking=True
   obj_param.enable_mask_output=True

   camera_infos = zed.get_camera_information()
   if obj_param.enable_tracking :
       positional_tracking_param = sl.PositionalTrackingParameters()
       #positional_tracking_param.set_as_static = True
       positional_tracking_param.set_floor_as_origin = True
       zed.enable_positional_tracking(positional_tracking_param)

   print("Object Detection: Loading Module...")

   err = zed.enable_object_detection(obj_param)
   if err != sl.ERROR_CODE.SUCCESS :
       print (repr(err))
       zed.close()
       exit(1)
 
   # Output tensors are the detection boxes, scores, and classes
   # Each box represents a part of the image where a particular object was detected
   detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')

   # Each score represents level of confidence for each of the objects.
   # The score is shown on the result image, together with the class label.
   detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
   detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')

   # Number of objects detected
   num_detections = detection_graph.get_tensor_by_name('num_detections:0')


   objects = sl.Objects()
   obj_runtime_param = sl.ObjectDetectionRuntimeParameters()
   obj_runtime_param.detection_confidence_threshold = 40

   
   while zed.grab() == sl.ERROR_CODE.SUCCESS:
       objects_in = []
       # The "detections" variable contains your custom 2D detections
       for it in num_detections:
           tmp = sl.CustomBoxObjectData()
           # Fill the detections into the correct SDK format
           tmp.unique_object_id = num_detections
           tmp.probability = detection_scores
           tmp.label = detection_classes
           tmp.bounding_box_2d = detection_boxes
           tmp.is_grounded = False
           objects_in.append(tmp)
       zed.ingest_custom_box_objects(objects_in)

       for neighbor in root.iter('bndbox'):
                    xmin = int(neighbor.find('xmin').text)
                    ymin = int(neighbor.find('ymin').text)
                    xmax = int(neighbor.find('xmax').text)
                    ymax = int(neighbor.find('ymax').text)
       annotations.append([xmin, ymin, xmax, ymax])


       err = zed.retrieve_objects(objects, obj_runtime_param)
       if objects.is_new :
           obj_array = objects.object_list
           print(str(len(obj_array))+" Object(s) detected\n")
           if len(obj_array) > 0 :
               first_object = obj_array[0]
               print("First object attributes:")
               print(" Label '"+repr(first_object.label)+"' (conf. "+str(int(first_object.confidence))+"/100)")
               if obj_param.enable_tracking :
                   print(" Tracking ID: "+str(int(first_object.id))+" tracking state: "+repr(first_object.tracking_state)+" / "+repr(first_object.action_state))
               position = first_object.position
               velocity = first_object.velocity
               dimensions = first_object.dimensions
               print(" 3D position: [{0},{1},{2}]\n Velocity: [{3},{4},{5}]\n 3D dimentions: [{6},{7},{8}]".format(position[0],position[1],position[2],velocity[0],velocity[1],velocity[2],dimensions[0],dimensions[1],dimensions[2]))
               if first_object.mask.is_init():
                   print(" 2D mask available")

               print(" Bounding Box 2D ")
               bounding_box_2d = first_object.bounding_box_2d
               for it in bounding_box_2d :
                   print("    "+str(it),end='')
               print("\n Bounding Box 3D ")
               bounding_box = first_object.bounding_box
               for it in bounding_box :
                   print("    "+str(it),end='')

               input('\nPress enter to continue: ')


   # Close the camera
   zed.close()

Hello and thank you for reaching out,

We provide samples and tutorials about how to do that. Please refer to zed-examples/object detection/custom detector at master · stereolabs/zed-examples · GitHub.

Antoine