EDQS repartitioning improvements

This commit is contained in:
ViacheslavKlimov 2025-02-19 15:54:18 +02:00
parent ded6daf2b3
commit 6bf56c8dc2
4 changed files with 94 additions and 112 deletions

View File

@ -43,14 +43,13 @@ import org.thingsboard.server.common.data.edqs.query.QueryResult;
import org.thingsboard.server.common.data.id.CustomerId; import org.thingsboard.server.common.data.id.CustomerId;
import org.thingsboard.server.common.data.id.TenantId; import org.thingsboard.server.common.data.id.TenantId;
import org.thingsboard.server.common.data.page.PageData; import org.thingsboard.server.common.data.page.PageData;
import org.thingsboard.server.common.data.queue.QueueConfig;
import org.thingsboard.server.common.data.util.CollectionsUtil; import org.thingsboard.server.common.data.util.CollectionsUtil;
import org.thingsboard.server.common.msg.queue.ServiceType; import org.thingsboard.server.common.msg.queue.ServiceType;
import org.thingsboard.server.common.msg.queue.TopicPartitionInfo; import org.thingsboard.server.common.msg.queue.TopicPartitionInfo;
import org.thingsboard.server.edqs.repo.EdqRepository; import org.thingsboard.server.edqs.repo.EdqRepository;
import org.thingsboard.server.edqs.state.EdqsPartitionService;
import org.thingsboard.server.edqs.state.EdqsStateService; import org.thingsboard.server.edqs.state.EdqsStateService;
import org.thingsboard.server.edqs.util.EdqsConverter; import org.thingsboard.server.edqs.util.EdqsConverter;
import org.thingsboard.server.edqs.state.EdqsPartitionService;
import org.thingsboard.server.edqs.util.VersionsStore; import org.thingsboard.server.edqs.util.VersionsStore;
import org.thingsboard.server.gen.transport.TransportProtos; import org.thingsboard.server.gen.transport.TransportProtos;
import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg; import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg;
@ -59,7 +58,7 @@ import org.thingsboard.server.gen.transport.TransportProtos.ToEdqsMsg;
import org.thingsboard.server.queue.TbQueueHandler; import org.thingsboard.server.queue.TbQueueHandler;
import org.thingsboard.server.queue.TbQueueResponseTemplate; import org.thingsboard.server.queue.TbQueueResponseTemplate;
import org.thingsboard.server.queue.common.TbProtoQueueMsg; import org.thingsboard.server.queue.common.TbProtoQueueMsg;
import org.thingsboard.server.queue.common.consumer.MainQueueConsumerManager; import org.thingsboard.server.queue.common.consumer.PartitionedQueueConsumerManager;
import org.thingsboard.server.queue.discovery.QueueKey; import org.thingsboard.server.queue.discovery.QueueKey;
import org.thingsboard.server.queue.discovery.event.PartitionChangeEvent; import org.thingsboard.server.queue.discovery.event.PartitionChangeEvent;
import org.thingsboard.server.queue.edqs.EdqsComponent; import org.thingsboard.server.queue.edqs.EdqsComponent;
@ -93,7 +92,8 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
@Autowired @Lazy @Autowired @Lazy
private EdqsStateService stateService; private EdqsStateService stateService;
private MainQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig> eventsConsumer; @Getter
private PartitionedQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsConsumer;
private TbQueueResponseTemplate<TbProtoQueueMsg<ToEdqsMsg>, TbProtoQueueMsg<FromEdqsMsg>> responseTemplate; private TbQueueResponseTemplate<TbProtoQueueMsg<ToEdqsMsg>, TbProtoQueueMsg<FromEdqsMsg>> responseTemplate;
private ExecutorService consumersExecutor; private ExecutorService consumersExecutor;
@ -125,11 +125,15 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
} }
}; };
eventsConsumer = MainQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig>builder() eventsConsumer = PartitionedQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>create()
.queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.EVENTS.getTopic())) .queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.EVENTS.getTopic()))
.config(QueueConfig.of(true, config.getPollInterval())) .topic(EdqsQueue.EVENTS.getTopic())
.pollInterval(config.getPollInterval())
.msgPackProcessor((msgs, consumer, config) -> { .msgPackProcessor((msgs, consumer, config) -> {
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) { for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
if (consumer.isStopped()) {
return;
}
try { try {
ToEdqsMsg msg = queueMsg.getValue(); ToEdqsMsg msg = queueMsg.getValue();
log.trace("Processing message: {}", msg); log.trace("Processing message: {}", msg);
@ -159,19 +163,14 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
if (event.getServiceType() != ServiceType.EDQS) { if (event.getServiceType() != ServiceType.EDQS) {
return; return;
} }
repartitionExecutor.submit(() -> { // todo: maybe cancel the task if new event comes
try { try {
Set<TopicPartitionInfo> newPartitions = event.getNewPartitions().get(new QueueKey(ServiceType.EDQS)); Set<TopicPartitionInfo> newPartitions = event.getNewPartitions().get(new QueueKey(ServiceType.EDQS));
Set<TopicPartitionInfo> partitions = newPartitions.stream() Set<TopicPartitionInfo> partitions = newPartitions.stream()
.map(tpi -> tpi.withUseInternalPartition(true)) .map(tpi -> tpi.withUseInternalPartition(true))
.collect(Collectors.toSet()); .collect(Collectors.toSet());
try { stateService.process(withTopic(partitions, EdqsQueue.STATE.getTopic()));
stateService.restore(withTopic(partitions, EdqsQueue.STATE.getTopic())); // blocks until restored // eventsConsumer's partitions are updated by stateService
} catch (Exception e) {
log.error("Failed to process restore for partitions {}", partitions, e);
}
eventsConsumer.update(withTopic(partitions, EdqsQueue.EVENTS.getTopic()));
responseTemplate.subscribe(withTopic(partitions, config.getRequestsTopic())); responseTemplate.subscribe(withTopic(partitions, config.getRequestsTopic()));
Set<TopicPartitionInfo> oldPartitions = event.getOldPartitions().get(new QueueKey(ServiceType.EDQS)); Set<TopicPartitionInfo> oldPartitions = event.getOldPartitions().get(new QueueKey(ServiceType.EDQS));
@ -189,7 +188,6 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
} catch (Throwable t) { } catch (Throwable t) {
log.error("Failed to handle partition change event {}", event, t); log.error("Failed to handle partition change event {}", event, t);
} }
});
} }
@Override @Override

View File

@ -25,7 +25,7 @@ import java.util.Set;
public interface EdqsStateService { public interface EdqsStateService {
void restore(Set<TopicPartitionInfo> partitions); void process(Set<TopicPartitionInfo> partitions);
void save(TenantId tenantId, ObjectType type, String key, EdqsEventType eventType, ToEdqsMsg msg); void save(TenantId tenantId, ObjectType type, String key, EdqsEventType eventType, ToEdqsMsg msg);

View File

@ -25,7 +25,6 @@ import org.thingsboard.common.util.ThingsBoardThreadFactory;
import org.thingsboard.server.common.data.ObjectType; import org.thingsboard.server.common.data.ObjectType;
import org.thingsboard.server.common.data.edqs.EdqsEventType; import org.thingsboard.server.common.data.edqs.EdqsEventType;
import org.thingsboard.server.common.data.id.TenantId; import org.thingsboard.server.common.data.id.TenantId;
import org.thingsboard.server.common.data.queue.QueueConfig;
import org.thingsboard.server.common.msg.queue.ServiceType; import org.thingsboard.server.common.msg.queue.ServiceType;
import org.thingsboard.server.common.msg.queue.TopicPartitionInfo; import org.thingsboard.server.common.msg.queue.TopicPartitionInfo;
import org.thingsboard.server.edqs.processor.EdqsProcessor; import org.thingsboard.server.edqs.processor.EdqsProcessor;
@ -34,8 +33,9 @@ import org.thingsboard.server.edqs.util.VersionsStore;
import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg; import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg;
import org.thingsboard.server.gen.transport.TransportProtos.ToEdqsMsg; import org.thingsboard.server.gen.transport.TransportProtos.ToEdqsMsg;
import org.thingsboard.server.queue.common.TbProtoQueueMsg; import org.thingsboard.server.queue.common.TbProtoQueueMsg;
import org.thingsboard.server.queue.common.consumer.MainQueueConsumerManager; import org.thingsboard.server.queue.common.consumer.PartitionedQueueConsumerManager;
import org.thingsboard.server.queue.common.consumer.QueueConsumerManager; import org.thingsboard.server.queue.common.consumer.QueueConsumerManager;
import org.thingsboard.server.queue.common.consumer.QueueStateService;
import org.thingsboard.server.queue.discovery.QueueKey; import org.thingsboard.server.queue.discovery.QueueKey;
import org.thingsboard.server.queue.edqs.EdqsConfig; import org.thingsboard.server.queue.edqs.EdqsConfig;
import org.thingsboard.server.queue.edqs.EdqsQueue; import org.thingsboard.server.queue.edqs.EdqsQueue;
@ -44,7 +44,6 @@ import org.thingsboard.server.queue.edqs.KafkaEdqsComponent;
import java.util.Set; import java.util.Set;
import java.util.UUID; import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService; import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors; import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledExecutorService;
@ -54,19 +53,17 @@ import java.util.concurrent.atomic.AtomicInteger;
@RequiredArgsConstructor @RequiredArgsConstructor
@KafkaEdqsComponent @KafkaEdqsComponent
@Slf4j @Slf4j
public class KafkaEdqsStateService implements EdqsStateService { public class KafkaEdqsStateService extends QueueStateService<TbProtoQueueMsg<ToEdqsMsg>, TbProtoQueueMsg<ToEdqsMsg>> implements EdqsStateService {
private final EdqsConfig config; private final EdqsConfig config;
private final EdqsPartitionService partitionService; private final EdqsPartitionService partitionService;
private final EdqsQueueFactory queueFactory; private final EdqsQueueFactory queueFactory;
private final EdqsProcessor edqsProcessor; private final EdqsProcessor edqsProcessor;
private MainQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig> stateConsumer; private PartitionedQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> stateConsumer;
private QueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsConsumer; private QueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsToBackupConsumer;
private EdqsProducer stateProducer; private EdqsProducer stateProducer;
private boolean initialRestoreDone;
private ExecutorService consumersExecutor; private ExecutorService consumersExecutor;
private ExecutorService mgmtExecutor; private ExecutorService mgmtExecutor;
private ScheduledExecutorService scheduler; private ScheduledExecutorService scheduler;
@ -81,11 +78,14 @@ public class KafkaEdqsStateService implements EdqsStateService {
mgmtExecutor = ThingsBoardExecutors.newWorkStealingPool(4, "edqs-backup-consumer-mgmt"); mgmtExecutor = ThingsBoardExecutors.newWorkStealingPool(4, "edqs-backup-consumer-mgmt");
scheduler = ThingsBoardExecutors.newSingleThreadScheduledExecutor("edqs-backup-scheduler"); scheduler = ThingsBoardExecutors.newSingleThreadScheduledExecutor("edqs-backup-scheduler");
stateConsumer = MainQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig>builder() // FIXME Slavik: if topic is empty stateConsumer = PartitionedQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>create() // FIXME Slavik: if topic is empty
.queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.STATE.getTopic())) .queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.STATE.getTopic()))
.config(QueueConfig.of(true, config.getPollInterval())) .pollInterval(config.getPollInterval())
.msgPackProcessor((msgs, consumer, config) -> { .msgPackProcessor((msgs, consumer, config) -> {
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) { for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
if (consumer.isStopped()) {
return;
}
try { try {
ToEdqsMsg msg = queueMsg.getValue(); ToEdqsMsg msg = queueMsg.getValue();
log.trace("Processing message: {}", msg); log.trace("Processing message: {}", msg);
@ -94,7 +94,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
log.info("[state] Processed {} msgs", stateReadCount.get()); log.info("[state] Processed {} msgs", stateReadCount.get());
} }
} catch (Exception e) { } catch (Exception e) {
log.error("Failed to process message: {}", queueMsg, e); // TODO: do something about the error - e.g. reprocess log.error("Failed to process message: {}", queueMsg, e);
} }
} }
consumer.commit(); consumer.commit();
@ -105,15 +105,16 @@ public class KafkaEdqsStateService implements EdqsStateService {
.scheduler(scheduler) .scheduler(scheduler)
.uncaughtErrorHandler(edqsProcessor.getErrorHandler()) .uncaughtErrorHandler(edqsProcessor.getErrorHandler())
.build(); .build();
super.init(stateConsumer, edqsProcessor.getEventsConsumer());
ExecutorService backupExecutor = ThingsBoardExecutors.newLimitedTasksExecutor(12, 1000, "events-to-backup-executor"); eventsToBackupConsumer = QueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>builder()
eventsConsumer = QueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>builder() // FIXME Slavik writes to the state while we read it, slows down the start. maybe start backup consumer after restore is finished
.name("edqs-events-to-backup-consumer") .name("edqs-events-to-backup-consumer")
.pollInterval(config.getPollInterval()) .pollInterval(config.getPollInterval())
.msgPackProcessor((msgs, consumer) -> { .msgPackProcessor((msgs, consumer) -> {
CountDownLatch resultLatch = new CountDownLatch(msgs.size());
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) { for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
backupExecutor.submit(() -> { if (consumer.isStopped()) {
return;
}
try { try {
ToEdqsMsg msg = queueMsg.getValue(); ToEdqsMsg msg = queueMsg.getValue();
log.trace("Processing message: {}", msg); log.trace("Processing message: {}", msg);
@ -127,7 +128,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
} }
if (eventMsg.hasVersion()) { if (eventMsg.hasVersion()) {
if (!versionsStore.isNew(key, eventMsg.getVersion())) { if (!versionsStore.isNew(key, eventMsg.getVersion())) {
return; continue;
} }
} }
@ -139,12 +140,8 @@ public class KafkaEdqsStateService implements EdqsStateService {
} }
} catch (Throwable t) { } catch (Throwable t) {
log.error("Failed to process message: {}", queueMsg, t); log.error("Failed to process message: {}", queueMsg, t);
} finally {
resultLatch.countDown();
} }
});
} }
resultLatch.await();
consumer.commit(); consumer.commit();
}) })
.consumerCreator(() -> queueFactory.createEdqsMsgConsumer(EdqsQueue.EVENTS, "events-to-backup-consumer-group")) // shared by all instances consumer group .consumerCreator(() -> queueFactory.createEdqsMsgConsumer(EdqsQueue.EVENTS, "events-to-backup-consumer-group")) // shared by all instances consumer group
@ -160,20 +157,12 @@ public class KafkaEdqsStateService implements EdqsStateService {
} }
@Override @Override
public void restore(Set<TopicPartitionInfo> partitions) { public void process(Set<TopicPartitionInfo> partitions) {
stateReadCount.set(0); //TODO Slavik: do not support remote mode in monolith setup if (getPartitions() == null) {
long startTs = System.currentTimeMillis(); eventsToBackupConsumer.subscribe();
log.info("Restore started for partitions {}", partitions.stream().map(tpi -> tpi.getPartition().orElse(-1)).sorted().toList()); eventsToBackupConsumer.launch();
stateConsumer.doUpdate(partitions); // calling blocking doUpdate instead of update
stateConsumer.awaitStop(0); // consumers should stop on their own because EdqsQueue.STATE.stopWhenRead is true, we just need to wait
log.info("Restore finished in {} ms. Processed {} msgs", (System.currentTimeMillis() - startTs), stateReadCount.get());
if (!initialRestoreDone) {
initialRestoreDone = true;
eventsConsumer.subscribe();
eventsConsumer.launch();
} }
super.update(partitions);
} }
@Override @Override
@ -194,7 +183,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
private void preDestroy() { private void preDestroy() {
stateConsumer.stop(); stateConsumer.stop();
stateConsumer.awaitStop(); stateConsumer.awaitStop();
eventsConsumer.stop(); eventsToBackupConsumer.stop();
stateProducer.stop(); stateProducer.stop();
consumersExecutor.shutdownNow(); consumersExecutor.shutdownNow();

View File

@ -17,8 +17,6 @@ package org.thingsboard.server.edqs.state;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Lazy;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.thingsboard.server.common.data.ObjectType; import org.thingsboard.server.common.data.ObjectType;
import org.thingsboard.server.common.data.edqs.EdqsEventType; import org.thingsboard.server.common.data.edqs.EdqsEventType;
@ -38,19 +36,14 @@ import java.util.Set;
@Slf4j @Slf4j
public class LocalEdqsStateService implements EdqsStateService { public class LocalEdqsStateService implements EdqsStateService {
@Autowired @Lazy private final EdqsProcessor processor;
private EdqsProcessor processor; private final EdqsRocksDb db;
@Autowired
private EdqsRocksDb db;
private boolean restoreDone; private Set<TopicPartitionInfo> partitions;
@Override @Override
public void restore(Set<TopicPartitionInfo> partitions) { public void process(Set<TopicPartitionInfo> partitions) {
// NOTE(review): in the new (right-hand) column the db.forEach restore is guarded by
// `this.partitions != null`. In the visible code the `partitions` field is declared without an
// initializer and is assigned only at the end of this method, so as shown the restore would be
// skipped on the first call and executed on every later call - the opposite of the removed
// `restoreDone` early return. Possibly `this.partitions == null` was intended; confirm against
// the full file (part of this method body is elided by the diff hunk below).
if (restoreDone) { if (this.partitions != null) {
return;
}
db.forEach((key, value) -> { db.forEach((key, value) -> {
try { try {
ToEdqsMsg edqsMsg = ToEdqsMsg.parseFrom(value); ToEdqsMsg edqsMsg = ToEdqsMsg.parseFrom(value);
@ -60,7 +53,9 @@ public class LocalEdqsStateService implements EdqsStateService {
log.error("[{}] Failed to restore value", key, e); log.error("[{}] Failed to restore value", key, e);
} }
}); });
restoreDone = true; }
// New column only: after the (conditional) restore, hand the partitions to the processor's
// events consumer and remember them; isReady() in this class reports `partitions != null`.
processor.getEventsConsumer().update(partitions);
this.partitions = partitions;
} }
@Override @Override
@ -79,7 +74,7 @@ public class LocalEdqsStateService implements EdqsStateService {
@Override @Override
public boolean isReady() { public boolean isReady() {
return restoreDone; return partitions != null;
} }
} }