EDQS repartitioning improvements

ViacheslavKlimov 2025-02-19 15:54:18 +02:00
parent ded6daf2b3
commit 6bf56c8dc2
4 changed files with 94 additions and 112 deletions

EdqsProcessor.java

@@ -43,14 +43,13 @@ import org.thingsboard.server.common.data.edqs.query.QueryResult;
import org.thingsboard.server.common.data.id.CustomerId;
import org.thingsboard.server.common.data.id.TenantId;
import org.thingsboard.server.common.data.page.PageData;
import org.thingsboard.server.common.data.queue.QueueConfig;
import org.thingsboard.server.common.data.util.CollectionsUtil;
import org.thingsboard.server.common.msg.queue.ServiceType;
import org.thingsboard.server.common.msg.queue.TopicPartitionInfo;
import org.thingsboard.server.edqs.repo.EdqRepository;
import org.thingsboard.server.edqs.state.EdqsPartitionService;
import org.thingsboard.server.edqs.state.EdqsStateService;
import org.thingsboard.server.edqs.util.EdqsConverter;
import org.thingsboard.server.edqs.state.EdqsPartitionService;
import org.thingsboard.server.edqs.util.VersionsStore;
import org.thingsboard.server.gen.transport.TransportProtos;
import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg;
@@ -59,7 +58,7 @@ import org.thingsboard.server.gen.transport.TransportProtos.ToEdqsMsg;
import org.thingsboard.server.queue.TbQueueHandler;
import org.thingsboard.server.queue.TbQueueResponseTemplate;
import org.thingsboard.server.queue.common.TbProtoQueueMsg;
import org.thingsboard.server.queue.common.consumer.MainQueueConsumerManager;
import org.thingsboard.server.queue.common.consumer.PartitionedQueueConsumerManager;
import org.thingsboard.server.queue.discovery.QueueKey;
import org.thingsboard.server.queue.discovery.event.PartitionChangeEvent;
import org.thingsboard.server.queue.edqs.EdqsComponent;
@@ -93,7 +92,8 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
@Autowired @Lazy
private EdqsStateService stateService;
private MainQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig> eventsConsumer;
@Getter
private PartitionedQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsConsumer;
private TbQueueResponseTemplate<TbProtoQueueMsg<ToEdqsMsg>, TbProtoQueueMsg<FromEdqsMsg>> responseTemplate;
private ExecutorService consumersExecutor;
@@ -125,11 +125,15 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
}
};
eventsConsumer = MainQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig>builder()
eventsConsumer = PartitionedQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>create()
.queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.EVENTS.getTopic()))
.config(QueueConfig.of(true, config.getPollInterval()))
.topic(EdqsQueue.EVENTS.getTopic())
.pollInterval(config.getPollInterval())
.msgPackProcessor((msgs, consumer, config) -> {
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
if (consumer.isStopped()) {
return;
}
try {
ToEdqsMsg msg = queueMsg.getValue();
log.trace("Processing message: {}", msg);
@@ -159,37 +163,31 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
if (event.getServiceType() != ServiceType.EDQS) {
return;
}
repartitionExecutor.submit(() -> { // todo: maybe cancel the task if new event comes
try {
Set<TopicPartitionInfo> newPartitions = event.getNewPartitions().get(new QueueKey(ServiceType.EDQS));
Set<TopicPartitionInfo> partitions = newPartitions.stream()
.map(tpi -> tpi.withUseInternalPartition(true))
.collect(Collectors.toSet());
try {
Set<TopicPartitionInfo> newPartitions = event.getNewPartitions().get(new QueueKey(ServiceType.EDQS));
Set<TopicPartitionInfo> partitions = newPartitions.stream()
.map(tpi -> tpi.withUseInternalPartition(true))
.collect(Collectors.toSet());
try {
stateService.restore(withTopic(partitions, EdqsQueue.STATE.getTopic())); // blocks until restored
} catch (Exception e) {
log.error("Failed to process restore for partitions {}", partitions, e);
}
eventsConsumer.update(withTopic(partitions, EdqsQueue.EVENTS.getTopic()));
responseTemplate.subscribe(withTopic(partitions, config.getRequestsTopic()));
stateService.process(withTopic(partitions, EdqsQueue.STATE.getTopic()));
// eventsConsumer's partitions are updated by stateService
responseTemplate.subscribe(withTopic(partitions, config.getRequestsTopic()));
Set<TopicPartitionInfo> oldPartitions = event.getOldPartitions().get(new QueueKey(ServiceType.EDQS));
if (CollectionsUtil.isNotEmpty(oldPartitions)) {
Set<Integer> removedPartitions = Sets.difference(oldPartitions, newPartitions).stream()
.map(tpi -> tpi.getPartition().orElse(-1)).collect(Collectors.toSet());
if (config.getPartitioningStrategy() != EdqsPartitioningStrategy.TENANT && !removedPartitions.isEmpty()) {
log.warn("Partitions {} were removed but shouldn't be (due to NONE partitioning strategy)", removedPartitions);
}
repository.clearIf(tenantId -> {
Integer partition = partitionService.resolvePartition(tenantId);
return partition != null && removedPartitions.contains(partition);
});
Set<TopicPartitionInfo> oldPartitions = event.getOldPartitions().get(new QueueKey(ServiceType.EDQS));
if (CollectionsUtil.isNotEmpty(oldPartitions)) {
Set<Integer> removedPartitions = Sets.difference(oldPartitions, newPartitions).stream()
.map(tpi -> tpi.getPartition().orElse(-1)).collect(Collectors.toSet());
if (config.getPartitioningStrategy() != EdqsPartitioningStrategy.TENANT && !removedPartitions.isEmpty()) {
log.warn("Partitions {} were removed but shouldn't be (due to NONE partitioning strategy)", removedPartitions);
}
} catch (Throwable t) {
log.error("Failed to handle partition change event {}", event, t);
repository.clearIf(tenantId -> {
Integer partition = partitionService.resolvePartition(tenantId);
return partition != null && removedPartitions.contains(partition);
});
}
});
} catch (Throwable t) {
log.error("Failed to handle partition change event {}", event, t);
}
}
@Override
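The reworked handler above resolves the new partition set, hands the STATE partitions to stateService.process() (which, per the comment, also updates the events consumer's partitions), subscribes the response template, and finally clears the data of tenants whose partitions were taken away. That last cleanup step is the easiest part to get wrong, so here is a minimal, self-contained sketch of it; it uses plain java.util sets instead of Guava's Sets.difference, and a hypothetical Map-based resolver in place of EdqsPartitionService.resolvePartition.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.function.Predicate;

// Sketch only: shows how removed partitions translate into per-tenant cleanup.
// The Map-based resolver and the predicate-driven "repository" are hypothetical stand-ins.
public class RepartitionCleanupSketch {

    public static void main(String[] args) {
        Set<Integer> oldPartitions = Set.of(0, 1, 2, 3);
        Set<Integer> newPartitions = Set.of(0, 1);

        // Equivalent of Sets.difference(oldPartitions, newPartitions) in the handler above
        Set<Integer> removedPartitions = new HashSet<>(oldPartitions);
        removedPartitions.removeAll(newPartitions);

        // Hypothetical tenant -> partition mapping (EdqsPartitionService.resolvePartition in the real code)
        Map<UUID, Integer> tenantToPartition = new HashMap<>();
        UUID tenantA = UUID.randomUUID();
        UUID tenantB = UUID.randomUUID();
        tenantToPartition.put(tenantA, 1); // still owned by this node
        tenantToPartition.put(tenantB, 3); // partition 3 moved away

        // Equivalent of repository.clearIf(tenantId -> ...): evict tenants that now belong elsewhere
        Predicate<UUID> shouldClear = tenantId -> {
            Integer partition = tenantToPartition.get(tenantId);
            return partition != null && removedPartitions.contains(partition);
        };

        System.out.println("clear tenantA: " + shouldClear.test(tenantA)); // false
        System.out.println("clear tenantB: " + shouldClear.test(tenantB)); // true
    }
}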

EdqsStateService.java

@@ -25,7 +25,7 @@ import java.util.Set;
public interface EdqsStateService {
void restore(Set<TopicPartitionInfo> partitions);
void process(Set<TopicPartitionInfo> partitions);
void save(TenantId tenantId, ObjectType type, String key, EdqsEventType eventType, ToEdqsMsg msg);

KafkaEdqsStateService.java

@@ -25,7 +25,6 @@ import org.thingsboard.common.util.ThingsBoardThreadFactory;
import org.thingsboard.server.common.data.ObjectType;
import org.thingsboard.server.common.data.edqs.EdqsEventType;
import org.thingsboard.server.common.data.id.TenantId;
import org.thingsboard.server.common.data.queue.QueueConfig;
import org.thingsboard.server.common.msg.queue.ServiceType;
import org.thingsboard.server.common.msg.queue.TopicPartitionInfo;
import org.thingsboard.server.edqs.processor.EdqsProcessor;
@@ -34,8 +33,9 @@ import org.thingsboard.server.edqs.util.VersionsStore;
import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg;
import org.thingsboard.server.gen.transport.TransportProtos.ToEdqsMsg;
import org.thingsboard.server.queue.common.TbProtoQueueMsg;
import org.thingsboard.server.queue.common.consumer.MainQueueConsumerManager;
import org.thingsboard.server.queue.common.consumer.PartitionedQueueConsumerManager;
import org.thingsboard.server.queue.common.consumer.QueueConsumerManager;
import org.thingsboard.server.queue.common.consumer.QueueStateService;
import org.thingsboard.server.queue.discovery.QueueKey;
import org.thingsboard.server.queue.edqs.EdqsConfig;
import org.thingsboard.server.queue.edqs.EdqsQueue;
@@ -44,7 +44,6 @@ import org.thingsboard.server.queue.edqs.KafkaEdqsComponent;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
@@ -54,19 +53,17 @@ import java.util.concurrent.atomic.AtomicInteger;
@RequiredArgsConstructor
@KafkaEdqsComponent
@Slf4j
public class KafkaEdqsStateService implements EdqsStateService {
public class KafkaEdqsStateService extends QueueStateService<TbProtoQueueMsg<ToEdqsMsg>, TbProtoQueueMsg<ToEdqsMsg>> implements EdqsStateService {
private final EdqsConfig config;
private final EdqsPartitionService partitionService;
private final EdqsQueueFactory queueFactory;
private final EdqsProcessor edqsProcessor;
private MainQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig> stateConsumer;
private QueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsConsumer;
private PartitionedQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> stateConsumer;
private QueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsToBackupConsumer;
private EdqsProducer stateProducer;
private boolean initialRestoreDone;
private ExecutorService consumersExecutor;
private ExecutorService mgmtExecutor;
private ScheduledExecutorService scheduler;
@@ -81,11 +78,14 @@ public class KafkaEdqsStateService implements EdqsStateService {
mgmtExecutor = ThingsBoardExecutors.newWorkStealingPool(4, "edqs-backup-consumer-mgmt");
scheduler = ThingsBoardExecutors.newSingleThreadScheduledExecutor("edqs-backup-scheduler");
stateConsumer = MainQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig>builder() // FIXME Slavik: if topic is empty
stateConsumer = PartitionedQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>create() // FIXME Slavik: if topic is empty
.queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.STATE.getTopic()))
.config(QueueConfig.of(true, config.getPollInterval()))
.pollInterval(config.getPollInterval())
.msgPackProcessor((msgs, consumer, config) -> {
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
if (consumer.isStopped()) {
return;
}
try {
ToEdqsMsg msg = queueMsg.getValue();
log.trace("Processing message: {}", msg);
@@ -94,7 +94,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
log.info("[state] Processed {} msgs", stateReadCount.get());
}
} catch (Exception e) {
log.error("Failed to process message: {}", queueMsg, e); // TODO: do something about the error - e.g. reprocess
log.error("Failed to process message: {}", queueMsg, e);
}
}
consumer.commit();
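Both rewritten msgPackProcessor lambdas (the events consumer in EdqsProcessor and the state consumer here) now follow the same shape: bail out early if the consumer has been stopped mid-pack, log and skip individual failures, and commit only after the whole pack has been walked. A small self-contained sketch of that pattern; the BatchConsumer interface is a hypothetical stand-in for the real queue consumer handle.

import java.util.List;
import java.util.concurrent.atomic.AtomicBoolean;

// Sketch of the stop-aware pack processing used above: return without committing if stopped,
// tolerate per-message failures, commit once after the whole pack was attempted.
public class StopAwarePackProcessorSketch {

    // Hypothetical stand-in for the real consumer handle passed into msgPackProcessor
    interface BatchConsumer {
        boolean isStopped();
        void commit();
    }

    static void processPack(List<String> msgs, BatchConsumer consumer) {
        for (String msg : msgs) {
            if (consumer.isStopped()) {
                return; // repartitioning or shutdown in progress: leave the pack uncommitted
            }
            try {
                System.out.println("Processing message: " + msg);
            } catch (Exception e) {
                System.err.println("Failed to process message " + msg + ": " + e);
            }
        }
        consumer.commit(); // acknowledge the pack only after every message was attempted
    }

    public static void main(String[] args) {
        AtomicBoolean stopped = new AtomicBoolean(false);
        processPack(List.of("a", "b", "c"), new BatchConsumer() {
            @Override public boolean isStopped() { return stopped.get(); }
            @Override public void commit() { System.out.println("commit"); }
        });
    }
}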
@@ -105,46 +105,43 @@ public class KafkaEdqsStateService implements EdqsStateService {
.scheduler(scheduler)
.uncaughtErrorHandler(edqsProcessor.getErrorHandler())
.build();
super.init(stateConsumer, edqsProcessor.getEventsConsumer());
ExecutorService backupExecutor = ThingsBoardExecutors.newLimitedTasksExecutor(12, 1000, "events-to-backup-executor");
eventsConsumer = QueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>builder() // FIXME Slavik writes to the state while we read it, slows down the start. maybe start backup consumer after restore is finished
eventsToBackupConsumer = QueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>builder()
.name("edqs-events-to-backup-consumer")
.pollInterval(config.getPollInterval())
.msgPackProcessor((msgs, consumer) -> {
CountDownLatch resultLatch = new CountDownLatch(msgs.size());
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
backupExecutor.submit(() -> {
try {
ToEdqsMsg msg = queueMsg.getValue();
log.trace("Processing message: {}", msg);
if (consumer.isStopped()) {
return;
}
try {
ToEdqsMsg msg = queueMsg.getValue();
log.trace("Processing message: {}", msg);
if (msg.hasEventMsg()) {
EdqsEventMsg eventMsg = msg.getEventMsg();
String key = eventMsg.getKey();
int count = eventsReadCount.incrementAndGet();
if (count % 100000 == 0) {
log.info("[events-to-backup] Processed {} msgs", count);
}
if (eventMsg.hasVersion()) {
if (!versionsStore.isNew(key, eventMsg.getVersion())) {
return;
}
}
TenantId tenantId = getTenantId(msg);
ObjectType objectType = ObjectType.valueOf(eventMsg.getObjectType());
EdqsEventType eventType = EdqsEventType.valueOf(eventMsg.getEventType());
log.debug("[{}] Saving to backup [{}] [{}] [{}]", tenantId, objectType, eventType, key);
stateProducer.send(tenantId, objectType, key, msg);
if (msg.hasEventMsg()) {
EdqsEventMsg eventMsg = msg.getEventMsg();
String key = eventMsg.getKey();
int count = eventsReadCount.incrementAndGet();
if (count % 100000 == 0) {
log.info("[events-to-backup] Processed {} msgs", count);
}
} catch (Throwable t) {
log.error("Failed to process message: {}", queueMsg, t);
} finally {
resultLatch.countDown();
if (eventMsg.hasVersion()) {
if (!versionsStore.isNew(key, eventMsg.getVersion())) {
continue;
}
}
TenantId tenantId = getTenantId(msg);
ObjectType objectType = ObjectType.valueOf(eventMsg.getObjectType());
EdqsEventType eventType = EdqsEventType.valueOf(eventMsg.getEventType());
log.debug("[{}] Saving to backup [{}] [{}] [{}]", tenantId, objectType, eventType, key);
stateProducer.send(tenantId, objectType, key, msg);
}
});
} catch (Throwable t) {
log.error("Failed to process message: {}", queueMsg, t);
}
}
resultLatch.await();
consumer.commit();
})
.consumerCreator(() -> queueFactory.createEdqsMsgConsumer(EdqsQueue.EVENTS, "events-to-backup-consumer-group")) // shared by all instances consumer group
@@ -160,20 +157,12 @@ public class KafkaEdqsStateService implements EdqsStateService {
}
@Override
public void restore(Set<TopicPartitionInfo> partitions) {
stateReadCount.set(0); //TODO Slavik: do not support remote mode in monolith setup
long startTs = System.currentTimeMillis();
log.info("Restore started for partitions {}", partitions.stream().map(tpi -> tpi.getPartition().orElse(-1)).sorted().toList());
stateConsumer.doUpdate(partitions); // calling blocking doUpdate instead of update
stateConsumer.awaitStop(0); // consumers should stop on their own because EdqsQueue.STATE.stopWhenRead is true, we just need to wait
log.info("Restore finished in {} ms. Processed {} msgs", (System.currentTimeMillis() - startTs), stateReadCount.get());
if (!initialRestoreDone) {
initialRestoreDone = true;
eventsConsumer.subscribe();
eventsConsumer.launch();
public void process(Set<TopicPartitionInfo> partitions) {
if (getPartitions() == null) {
eventsToBackupConsumer.subscribe();
eventsToBackupConsumer.launch();
}
super.update(partitions);
}
@Override
@@ -194,7 +183,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
private void preDestroy() {
stateConsumer.stop();
stateConsumer.awaitStop();
eventsConsumer.stop();
eventsToBackupConsumer.stop();
stateProducer.stop();
consumersExecutor.shutdownNow();
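The events-to-backup loop above now skips a message whenever versionsStore.isNew(key, version) reports that a newer version of the same entity has already been backed up, rather than fanning each message out to a backup executor and waiting on a latch. A self-contained sketch of that per-key version gate follows; the real VersionsStore presumably also expires old entries, which is omitted here.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Sketch of the per-key version gate used before writing to the backup (state) topic:
// a message is forwarded only if its version is newer than the latest one seen for its key.
public class VersionGateSketch {

    private final ConcurrentMap<String, Long> latestVersions = new ConcurrentHashMap<>();

    // returns true if (key, version) has not been superseded yet
    public boolean isNew(String key, long version) {
        boolean[] accepted = {false};
        latestVersions.compute(key, (k, latest) -> {
            if (latest == null || version > latest) {
                accepted[0] = true;
                return version;
            }
            return latest; // stale update: keep the newer version we already saw
        });
        return accepted[0];
    }

    public static void main(String[] args) {
        VersionGateSketch gate = new VersionGateSketch();
        System.out.println(gate.isNew("device-1", 5)); // true, first time
        System.out.println(gate.isNew("device-1", 4)); // false, older than 5
        System.out.println(gate.isNew("device-1", 6)); // true, newer
    }
}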

LocalEdqsStateService.java

@@ -17,8 +17,6 @@ package org.thingsboard.server.edqs.state;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Lazy;
import org.springframework.stereotype.Service;
import org.thingsboard.server.common.data.ObjectType;
import org.thingsboard.server.common.data.edqs.EdqsEventType;
@@ -38,29 +36,26 @@ import java.util.Set;
@Slf4j
public class LocalEdqsStateService implements EdqsStateService {
@Autowired @Lazy
private EdqsProcessor processor;
@Autowired
private EdqsRocksDb db;
private final EdqsProcessor processor;
private final EdqsRocksDb db;
private boolean restoreDone;
private Set<TopicPartitionInfo> partitions;
@Override
public void restore(Set<TopicPartitionInfo> partitions) {
if (restoreDone) {
return;
public void process(Set<TopicPartitionInfo> partitions) {
if (this.partitions == null) {
db.forEach((key, value) -> {
try {
ToEdqsMsg edqsMsg = ToEdqsMsg.parseFrom(value);
log.trace("[{}] Restored msg from RocksDB: {}", key, edqsMsg);
processor.process(edqsMsg, EdqsQueue.STATE);
} catch (Exception e) {
log.error("[{}] Failed to restore value", key, e);
}
});
}
db.forEach((key, value) -> {
try {
ToEdqsMsg edqsMsg = ToEdqsMsg.parseFrom(value);
log.trace("[{}] Restored msg from RocksDB: {}", key, edqsMsg);
processor.process(edqsMsg, EdqsQueue.STATE);
} catch (Exception e) {
log.error("[{}] Failed to restore value", key, e);
}
});
restoreDone = true;
processor.getEventsConsumer().update(partitions);
this.partitions = partitions;
}
@Override
@@ -79,7 +74,7 @@ public class LocalEdqsStateService implements EdqsStateService {
@Override
public boolean isReady() {
return restoreDone;
return partitions != null;
}
}
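With the restoreDone flag gone, LocalEdqsStateService.process() replays the RocksDB contents only on the first partition assignment (while this.partitions is still null) and then always repoints the processor's events consumer. A minimal sketch of that shape, with an in-memory map as a stand-in for EdqsRocksDb and a println in place of eventsConsumer.update():

import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;

// Sketch of the "restore once, then follow partition changes" shape of the new process():
// the in-memory map is a hypothetical stand-in for EdqsRocksDb, the println for the consumer update.
public class RestoreOnFirstAssignmentSketch {

    private final Map<String, byte[]> db = new LinkedHashMap<>();
    private Set<Integer> partitions; // null until the first partition change event

    public void process(Set<Integer> newPartitions) {
        if (partitions == null) {
            // first assignment: replay everything persisted locally before serving queries
            db.forEach((key, value) ->
                    System.out.println("restored " + key + " (" + value.length + " bytes)"));
        }
        System.out.println("events consumer now follows partitions " + newPartitions);
        partitions = newPartitions;
    }

    public boolean isReady() {
        return partitions != null; // ready once the first assignment has been processed
    }

    public static void main(String[] args) {
        RestoreOnFirstAssignmentSketch service = new RestoreOnFirstAssignmentSketch();
        service.db.put("device-1", new byte[]{1, 2, 3});
        service.process(Set.of(0, 1)); // restores, then updates the consumer
        service.process(Set.of(0));    // only updates the consumer
        System.out.println("ready: " + service.isReady());
    }
}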