Monitoring service: error handling improvements

This commit is contained in:
ViacheslavKlimov 2023-03-09 14:11:37 +02:00
parent 869136da89
commit 43aacd0a30
3 changed files with 28 additions and 18 deletions

View File

@ -16,15 +16,16 @@
package org.thingsboard.monitoring.data.notification;
import lombok.Getter;
import org.apache.commons.lang3.exception.ExceptionUtils;
@Getter
public class ServiceFailureNotification implements Notification {
private final Object serviceKey;
private final Exception error;
private final Throwable error;
private final int failuresCount;
public ServiceFailureNotification(Object serviceKey, Exception error, int failuresCount) {
public ServiceFailureNotification(Object serviceKey, Throwable error, int failuresCount) {
this.serviceKey = serviceKey;
this.error = error;
this.failuresCount = failuresCount;
@ -32,7 +33,17 @@ public class ServiceFailureNotification implements Notification {
/**
 * Builds the one-line failure text for this notification from {@code serviceKey},
 * a description of {@code error}, and {@code failuresCount}.
 *
 * NOTE(review): this span appears to come from a diff view, so it shows both a
 * single-line return that uses {@code error.getMessage()} directly and the longer
 * fallback logic that seems to replace it — confirm against the committed file
 * which of the two is live.
 */
@Override
public String getText() {
// NOTE(review): presumably the pre-change form of the return statement; it formats error.getMessage() as-is.
return String.format("[%s] Failure: %s (number of subsequent failures: %s)", serviceKey, error.getMessage(), failuresCount);
// Start from the error's own message.
String errorMsg = error.getMessage();
// A missing message, or one that is literally the text "null", is replaced by the
// root cause's message when ExceptionUtils reports a root cause.
if (errorMsg == null || errorMsg.equals("null")) {
Throwable cause = ExceptionUtils.getRootCause(error);
if (cause != null) {
errorMsg = cause.getMessage();
}
}
// Last resort when no message was obtained above: use the error's simple class name.
// NOTE(review): only a null reference is checked here, so a message equal to the text
// "null" that was not replaced above is kept as-is.
if (errorMsg == null) {
errorMsg = error.getClass().getSimpleName();
}
return String.format("[%s] Failure: %s (number of subsequent failures: %s)", serviceKey, errorMsg, failuresCount);
}
}

View File

@ -21,8 +21,6 @@ import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.context.event.ApplicationReadyEvent;
import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Component;
import org.thingsboard.common.util.JacksonUtil;
import org.thingsboard.monitoring.client.TbClient;
@ -39,8 +37,6 @@ import org.thingsboard.server.common.data.id.EntityIdFactory;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;
@ -110,7 +106,10 @@ public class MonitoringReporter {
latencies.computeIfAbsent(latencyKey, k -> new Latency(latencyKey)).report(latencyInMs);
}
public void serviceFailure(Object serviceKey, Exception error) {
public void serviceFailure(Object serviceKey, Throwable error) {
if (log.isDebugEnabled()) {
log.error("Error occurred", error);
}
int failuresCount = failuresCounters.computeIfAbsent(serviceKey, k -> new AtomicInteger()).incrementAndGet();
ServiceFailureNotification notification = new ServiceFailureNotification(serviceKey, error, failuresCount);
log.error(notification.getText());

View File

@ -84,13 +84,9 @@ public final class TransportMonitoringService {
@EventListener(ApplicationReadyEvent.class)
public void startMonitoring() {
scheduleCheck(0);
}
private void scheduleCheck(int delay) {
log.debug("Scheduling next check for {} ms", delay);
scheduler.schedule(() -> {
scheduler.scheduleWithFixedDelay(() -> {
try {
log.debug("Starting transports check");
stopWatch.start();
String accessToken = tbClient.logIn();
reporter.reportLatency(Latencies.LOG_IN, stopWatch.getTime());
@ -103,11 +99,15 @@ public final class TransportMonitoringService {
}
}
reporter.reportLatencies(tbClient);
} catch (Exception e) {
reporter.serviceFailure(MonitoredServiceKey.GENERAL, e);
log.debug("Finished transports check");
} catch (Throwable error) {
try {
reporter.serviceFailure(MonitoredServiceKey.GENERAL, error);
} catch (Throwable reportError) {
log.error("Error occurred during service failure reporting", reportError);
}
}
scheduleCheck(monitoringRateMs);
}, delay, TimeUnit.MILLISECONDS);
}, 0, monitoringRateMs, TimeUnit.MILLISECONDS);
}
private void checkMonitoringTarget(TransportMonitoringConfig config, MonitoringTargetConfig target, TbClient tbClient) {