Hello!
Today I have a lot of data to offer regarding this issue. Three ZEDs crashed this weekend on our test machines, all of them with similar symptoms. We added a heartbeat signal inside our loop function; when the thread becomes blocked, it triggers a separate function that checks the atomic bools we use as checkpoints to trace down the function in which our threads block.
Turns out it is grabFrame() as per our logs:
| INFO | [ZEDCamera << PLUGIN >> | 31300192] :: C1: T [Before Grab Frame Call]
| INFO | [ZEDCamera << PLUGIN >> | 31300192] :: C2: F [After Grab Frame Call]
| INFO | [ZEDCamera << PLUGIN >> | 31300192] :: C3: F [After error checking]
| INFO | [ZEDCamera << PLUGIN >> | 31300192] :: C4: F [Before Gen Box Tracking]
| INFO | [ZEDCamera << PLUGIN >> | 31300192] :: C5: F [After Gen Box Tracking]
| INFO | [ZEDCamera << PLUGIN >> | 31300192] :: C6: F [After error checking]
Our C++ code for the loop() function that the thread runs is:
void ZEDCamera::loop()
{
ZED_TRACE("Start Camera Loop Thread");
setStatus(OpeningCamera);
QElapsedTimer et;
#ifdef ZED_NOISY_DEBUG
ZED_TRACE("ZED_NOISY_DEBUG: Thread started successfully. Entering Phase 1 (GFX Sync).");
int gfxWaitWarnCount = 0;
#endif
// --- Phase 1: Graphics Initialization Sync ---
while (!m_gfxReady && m_isAlive) {
if (m_instances == 0 && m_trackings == 0) {
m_thread->msleep(100);
continue;
}
#ifdef ZED_NOISY_DEBUG
// Print roughly every 1 second (15 * 64ms)
if (++gfxWaitWarnCount % 15 == 0) {
ZED_TRACE(QString("ZED_NOISY_DEBUG: Still waiting for GFX context to bind... m_gfxReady is false. Wait count: %1").arg(gfxWaitWarnCount));
}
#endif
emit gfxInitializationRequested();
m_thread->msleep(64); // Wait for GL context to bind
}
#ifdef ZED_NOISY_DEBUG
if (m_isAlive) {
ZED_TRACE("ZED_NOISY_DEBUG: Phase 1 Complete. GFX context bound successfully. Proceeding to main loop.");
}
#endif
// Loop variables
sl::ERROR_CODE err = sl::ERROR_CODE::SUCCESS;
int frameWarningCount = 0;
int frameErrorCount = 0;
const int warningTries = 10000;
const int errorTries = 10;
int openRetryCount = 0;
int retryOnSecs = 30;
bool isFirstOpen = true;
QTime retryTime = QTime::currentTime();
#ifdef ZED_NOISY_DEBUG
uint64_t debugLoopCounter = 0;
QElapsedTimer heartbeatTimer;
heartbeatTimer.start();
#endif
auto triggerRestart = [&](int seconds) {
ZED_TRACE(QString("Restarting Camera (Delay: %1s)...").arg(seconds));
closeCamera(); // 1. MUST BE FIRST: Close the software handle cleanly, freeing GPU/AI memory
sl::Camera::reboot(m_serial.toInt(), true); // 2. Hard reset the hardware
m_restartCamera = false;
retryTime = QTime::currentTime();
retryOnSecs = seconds; // Give Windows and the USB Controller time to re-enumerate the device
setCameraInfo("🟡 Rebooting Camera. wait a few seconds...");
};
// --- Phase 2: Main Operational Loop ---
while (m_isAlive.load()) {
c1 = false;
c2 = false;
c3 = false;
c4 = false;
c5 = false;
c6 = false;
#ifdef ZED_NOISY_DEBUG
debugLoopCounter++;
if (heartbeatTimer.hasExpired(10000)) { // <--- Safe check for 10000ms
ZED_TRACE(QString("[THREAD HEARTBEAT] Plugin & Thread Alive | Iterations: %1 | GFX: %2 | Cam: %3 | Track: %4 | Status: %5 | Instances: %6")
.arg(debugLoopCounter)
.arg(m_gfxReady ? "OK" : "WAIT")
.arg(m_cameraReady ? "OK" : "WAIT")
.arg(m_trackingReady ? "OK" : "WAIT")
.arg(statusStr())
.arg(m_instances.load()));
heartbeatTimer.restart();
}
#endif
// A) CONNECTION MANAGEMENT
if (!m_cameraReady) {
if (!isFirstOpen && retryTime.secsTo(QTime::currentTime()) < retryOnSecs) {
m_thread->msleep(200);
continue;
}
setCameraInfo("🟡 Initializing camera...");
if (!isFirstOpen) {
ZED_TRACE(QString("Retry Attempt %1. Cooldown: %2s").arg(openRetryCount).arg(retryOnSecs));
}
err = openCamera();
ZED_TRACE_ERR("openCamera", err);
if (err > sl::ERROR_CODE::SUCCESS) {
isFirstOpen = false;
openRetryCount++;
retryTime = QTime::currentTime();
setCameraError(err, "Open Camera", QString(" Connection retry on %1 secs").arg(retryOnSecs));
if (openRetryCount > 1000) {
setCameraInfo("🔴 Camera Not Found");
break; // Hard exit loop
}
continue;
}
// Success: Reset state
isFirstOpen = false;
openRetryCount = 0;
frameWarningCount = 0;
frameErrorCount = 0;
setStatus(CameraOk);
setCameraInfo("🟢 Ready");
}
// B) TRACKING & AI SYNC
if (m_cameraReady) {
if (m_trackings > 0 && !m_trackingReady && m_status != ErrorTracking) {
setStatus(InitializingTracking);
setCameraInfo("🟡 Initializing Camera Tracking...");
err = initializeTracking();
ZED_TRACE_ERR("initializeTracking", err);
if (err != sl::ERROR_CODE::SUCCESS) {
setStatus(ErrorTracking);
setCameraError(err, "AI Optimization");
} else {
setStatus(CameraOk);
setCameraInfo("🟢 Ready");
}
}
else if (m_trackings == 0 && m_trackingReady) {
finalizeTracking();
}
}
// C) MANUAL RESTART CHECK (TestRequested or KeepAlive Timeout)
if (m_restartCamera) {
triggerRestart(15);
continue;
}
c1 = true;
// D.1) GET FRAME DATE (needed for views and tracking): this function LOCKS till there is a new frame
auto grabErr = grabFrame();
ZED_TRACE_ERR("grabFrame", grabErr);
// D.2) GET FRAME VIEWS
auto viewErr = (grabErr == sl::ERROR_CODE::SUCCESS) ? generateFrameViews() : grabErr;
ZED_TRACE_ERR("generateFrameViews", viewErr);
c2 = true;
if (grabErr == sl::ERROR_CODE::SUCCESS && viewErr == sl::ERROR_CODE::SUCCESS) {
frameWarningCount = 0;
frameErrorCount = 0;
c3 = true;
// D.3) GET TRACKING DATA
if (!checkTracking() || generateBoxTracking()) {
if (++m_frameGoodCounter % 600 == 0) {
emit keepAliveConfirmed();
m_frameGoodCounter = 0;
}
}
c4 = true;
m_thread->msleep(2); // Breath room for the GPU context
continue; //D -> SUCCESS, go for next frame
}
// E) ERROR HANDLING
c5 = true;
err = (grabErr != sl::ERROR_CODE::SUCCESS) ? grabErr : viewErr;
if (err < sl::ERROR_CODE::SUCCESS) { // Periodic error logging for "Soft" errors (like darkness)
if (++frameWarningCount % 1000 == 0) {
ZED_TRACE(QString(">>> Grab Warning: %1. Counter: %2/%3")
.arg(skCameraError(err)).arg(frameWarningCount).arg(warningTries));
setCameraError(err, "Grab Frame");
}
if (frameWarningCount >= warningTries && err != sl::ERROR_CODE::CAMERA_REBOOTING) {
ZED_TRACE(">>> Warning threshold reached. Restarting...");
setStatus(ErrorTracking);
triggerRestart(30);
}
#ifdef ZED_NOISY_DEBUG
ZED_TRACE(QString(">>> Grab Warning: %1. Counter: %2/%3")
.arg(skCameraError(err)).arg(frameWarningCount).arg(warningTries));
#endif
}
else { // Bad errors
if (++frameErrorCount >= errorTries) {
ZED_TRACE(">>> Error threshold reached. Restarting...");
setStatus(ErrorTracking);
triggerRestart(30);
} else {
// Wait a bit
m_thread->msleep(500);
}
#ifdef ZED_NOISY_DEBUG
ZED_TRACE(QString(">>> Grab Error: %1. Counter: %2/%3")
.arg(skCameraError(err)).arg(frameErrorCount).arg(errorTries));
#endif
}
c6 = true;
} // while (m_isAlive.load())
// --- Phase 3: Exit Logic ---
#ifdef ZED_NOISY_DEBUG
ZED_TRACE("ZED_NOISY_DEBUG: m_isAlive became false. Thread breaking out of Phase 2 Loop. Initiating Shutdown.");
#endif
closeCamera();
// Preserve Error statuses for the Frontend to see why we stopped
if (m_status != ErrorOpen && m_status != ErrorTracking) {
setStatus(CameraClosed);
}
emit gfxFinalizationRequested();
ZED_TRACE("End Loop Thread");
setCameraInfo("🔴 Camera Closed");
}
We have SDK logs too. The message the SDK emits when we lose a camera is:
11-04-2026 05:37:10 6-04-11 05:37:10 UTC][ZED][INFO] Detected Connection Failure. Trying to recover the camera with sn32773034...
And for another camera:
11-04-2026 02:51:33 6-04-11 02:51:33 UTC][ZED][INFO] Detected Connection Failure. Trying to recover the camera with sn31300192...
So now we can confirm that, in our setup, our thread is being blocked by the ZED SDK, with which we share an OpenGL context. As a result, we lose tracking info until the application is restarted, with no ability to recover from this issue, since the blocked thread keeps the ZED busy.
We kindly await your response, we are in great need of your help,
Tomas