EDQS repartitioning improvements
This commit is contained in:
parent
ded6daf2b3
commit
6bf56c8dc2
@ -43,14 +43,13 @@ import org.thingsboard.server.common.data.edqs.query.QueryResult;
|
||||
import org.thingsboard.server.common.data.id.CustomerId;
|
||||
import org.thingsboard.server.common.data.id.TenantId;
|
||||
import org.thingsboard.server.common.data.page.PageData;
|
||||
import org.thingsboard.server.common.data.queue.QueueConfig;
|
||||
import org.thingsboard.server.common.data.util.CollectionsUtil;
|
||||
import org.thingsboard.server.common.msg.queue.ServiceType;
|
||||
import org.thingsboard.server.common.msg.queue.TopicPartitionInfo;
|
||||
import org.thingsboard.server.edqs.repo.EdqRepository;
|
||||
import org.thingsboard.server.edqs.state.EdqsPartitionService;
|
||||
import org.thingsboard.server.edqs.state.EdqsStateService;
|
||||
import org.thingsboard.server.edqs.util.EdqsConverter;
|
||||
import org.thingsboard.server.edqs.state.EdqsPartitionService;
|
||||
import org.thingsboard.server.edqs.util.VersionsStore;
|
||||
import org.thingsboard.server.gen.transport.TransportProtos;
|
||||
import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg;
|
||||
@ -59,7 +58,7 @@ import org.thingsboard.server.gen.transport.TransportProtos.ToEdqsMsg;
|
||||
import org.thingsboard.server.queue.TbQueueHandler;
|
||||
import org.thingsboard.server.queue.TbQueueResponseTemplate;
|
||||
import org.thingsboard.server.queue.common.TbProtoQueueMsg;
|
||||
import org.thingsboard.server.queue.common.consumer.MainQueueConsumerManager;
|
||||
import org.thingsboard.server.queue.common.consumer.PartitionedQueueConsumerManager;
|
||||
import org.thingsboard.server.queue.discovery.QueueKey;
|
||||
import org.thingsboard.server.queue.discovery.event.PartitionChangeEvent;
|
||||
import org.thingsboard.server.queue.edqs.EdqsComponent;
|
||||
@ -93,7 +92,8 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
|
||||
@Autowired @Lazy
|
||||
private EdqsStateService stateService;
|
||||
|
||||
private MainQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig> eventsConsumer;
|
||||
@Getter
|
||||
private PartitionedQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsConsumer;
|
||||
private TbQueueResponseTemplate<TbProtoQueueMsg<ToEdqsMsg>, TbProtoQueueMsg<FromEdqsMsg>> responseTemplate;
|
||||
|
||||
private ExecutorService consumersExecutor;
|
||||
@ -125,11 +125,15 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
|
||||
}
|
||||
};
|
||||
|
||||
eventsConsumer = MainQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig>builder()
|
||||
eventsConsumer = PartitionedQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>create()
|
||||
.queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.EVENTS.getTopic()))
|
||||
.config(QueueConfig.of(true, config.getPollInterval()))
|
||||
.topic(EdqsQueue.EVENTS.getTopic())
|
||||
.pollInterval(config.getPollInterval())
|
||||
.msgPackProcessor((msgs, consumer, config) -> {
|
||||
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
|
||||
if (consumer.isStopped()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
ToEdqsMsg msg = queueMsg.getValue();
|
||||
log.trace("Processing message: {}", msg);
|
||||
@ -159,19 +163,14 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
|
||||
if (event.getServiceType() != ServiceType.EDQS) {
|
||||
return;
|
||||
}
|
||||
repartitionExecutor.submit(() -> { // TODO: consider cancelling the pending task if a new event arrives
|
||||
try {
|
||||
Set<TopicPartitionInfo> newPartitions = event.getNewPartitions().get(new QueueKey(ServiceType.EDQS));
|
||||
Set<TopicPartitionInfo> partitions = newPartitions.stream()
|
||||
.map(tpi -> tpi.withUseInternalPartition(true))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
try {
|
||||
stateService.restore(withTopic(partitions, EdqsQueue.STATE.getTopic())); // blocks until restored
|
||||
} catch (Exception e) {
|
||||
log.error("Failed to process restore for partitions {}", partitions, e);
|
||||
}
|
||||
eventsConsumer.update(withTopic(partitions, EdqsQueue.EVENTS.getTopic()));
|
||||
stateService.process(withTopic(partitions, EdqsQueue.STATE.getTopic()));
|
||||
// eventsConsumer's partitions are updated by stateService
|
||||
responseTemplate.subscribe(withTopic(partitions, config.getRequestsTopic()));
|
||||
|
||||
Set<TopicPartitionInfo> oldPartitions = event.getOldPartitions().get(new QueueKey(ServiceType.EDQS));
|
||||
@ -189,7 +188,6 @@ public class EdqsProcessor implements TbQueueHandler<TbProtoQueueMsg<ToEdqsMsg>,
|
||||
} catch (Throwable t) {
|
||||
log.error("Failed to handle partition change event {}", event, t);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@ -25,7 +25,7 @@ import java.util.Set;
|
||||
|
||||
public interface EdqsStateService {
|
||||
|
||||
void restore(Set<TopicPartitionInfo> partitions);
|
||||
void process(Set<TopicPartitionInfo> partitions);
|
||||
|
||||
void save(TenantId tenantId, ObjectType type, String key, EdqsEventType eventType, ToEdqsMsg msg);
|
||||
|
||||
|
||||
@ -25,7 +25,6 @@ import org.thingsboard.common.util.ThingsBoardThreadFactory;
|
||||
import org.thingsboard.server.common.data.ObjectType;
|
||||
import org.thingsboard.server.common.data.edqs.EdqsEventType;
|
||||
import org.thingsboard.server.common.data.id.TenantId;
|
||||
import org.thingsboard.server.common.data.queue.QueueConfig;
|
||||
import org.thingsboard.server.common.msg.queue.ServiceType;
|
||||
import org.thingsboard.server.common.msg.queue.TopicPartitionInfo;
|
||||
import org.thingsboard.server.edqs.processor.EdqsProcessor;
|
||||
@ -34,8 +33,9 @@ import org.thingsboard.server.edqs.util.VersionsStore;
|
||||
import org.thingsboard.server.gen.transport.TransportProtos.EdqsEventMsg;
|
||||
import org.thingsboard.server.gen.transport.TransportProtos.ToEdqsMsg;
|
||||
import org.thingsboard.server.queue.common.TbProtoQueueMsg;
|
||||
import org.thingsboard.server.queue.common.consumer.MainQueueConsumerManager;
|
||||
import org.thingsboard.server.queue.common.consumer.PartitionedQueueConsumerManager;
|
||||
import org.thingsboard.server.queue.common.consumer.QueueConsumerManager;
|
||||
import org.thingsboard.server.queue.common.consumer.QueueStateService;
|
||||
import org.thingsboard.server.queue.discovery.QueueKey;
|
||||
import org.thingsboard.server.queue.edqs.EdqsConfig;
|
||||
import org.thingsboard.server.queue.edqs.EdqsQueue;
|
||||
@ -44,7 +44,6 @@ import org.thingsboard.server.queue.edqs.KafkaEdqsComponent;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.CountDownLatch;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
@ -54,19 +53,17 @@ import java.util.concurrent.atomic.AtomicInteger;
|
||||
@RequiredArgsConstructor
|
||||
@KafkaEdqsComponent
|
||||
@Slf4j
|
||||
public class KafkaEdqsStateService implements EdqsStateService {
|
||||
public class KafkaEdqsStateService extends QueueStateService<TbProtoQueueMsg<ToEdqsMsg>, TbProtoQueueMsg<ToEdqsMsg>> implements EdqsStateService {
|
||||
|
||||
private final EdqsConfig config;
|
||||
private final EdqsPartitionService partitionService;
|
||||
private final EdqsQueueFactory queueFactory;
|
||||
private final EdqsProcessor edqsProcessor;
|
||||
|
||||
private MainQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig> stateConsumer;
|
||||
private QueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsConsumer;
|
||||
private PartitionedQueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> stateConsumer;
|
||||
private QueueConsumerManager<TbProtoQueueMsg<ToEdqsMsg>> eventsToBackupConsumer;
|
||||
private EdqsProducer stateProducer;
|
||||
|
||||
private boolean initialRestoreDone;
|
||||
|
||||
private ExecutorService consumersExecutor;
|
||||
private ExecutorService mgmtExecutor;
|
||||
private ScheduledExecutorService scheduler;
|
||||
@ -81,11 +78,14 @@ public class KafkaEdqsStateService implements EdqsStateService {
|
||||
mgmtExecutor = ThingsBoardExecutors.newWorkStealingPool(4, "edqs-backup-consumer-mgmt");
|
||||
scheduler = ThingsBoardExecutors.newSingleThreadScheduledExecutor("edqs-backup-scheduler");
|
||||
|
||||
stateConsumer = MainQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>, QueueConfig>builder() // FIXME Slavik: if topic is empty
|
||||
stateConsumer = PartitionedQueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>create() // FIXME Slavik: if topic is empty
|
||||
.queueKey(new QueueKey(ServiceType.EDQS, EdqsQueue.STATE.getTopic()))
|
||||
.config(QueueConfig.of(true, config.getPollInterval()))
|
||||
.pollInterval(config.getPollInterval())
|
||||
.msgPackProcessor((msgs, consumer, config) -> {
|
||||
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
|
||||
if (consumer.isStopped()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
ToEdqsMsg msg = queueMsg.getValue();
|
||||
log.trace("Processing message: {}", msg);
|
||||
@ -94,7 +94,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
|
||||
log.info("[state] Processed {} msgs", stateReadCount.get());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("Failed to process message: {}", queueMsg, e); // TODO: do something about the error - e.g. reprocess
|
||||
log.error("Failed to process message: {}", queueMsg, e);
|
||||
}
|
||||
}
|
||||
consumer.commit();
|
||||
@ -105,15 +105,16 @@ public class KafkaEdqsStateService implements EdqsStateService {
|
||||
.scheduler(scheduler)
|
||||
.uncaughtErrorHandler(edqsProcessor.getErrorHandler())
|
||||
.build();
|
||||
super.init(stateConsumer, edqsProcessor.getEventsConsumer());
|
||||
|
||||
ExecutorService backupExecutor = ThingsBoardExecutors.newLimitedTasksExecutor(12, 1000, "events-to-backup-executor");
|
||||
eventsConsumer = QueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>builder() // FIXME Slavik writes to the state while we read it, slows down the start. maybe start backup consumer after restore is finished
|
||||
eventsToBackupConsumer = QueueConsumerManager.<TbProtoQueueMsg<ToEdqsMsg>>builder()
|
||||
.name("edqs-events-to-backup-consumer")
|
||||
.pollInterval(config.getPollInterval())
|
||||
.msgPackProcessor((msgs, consumer) -> {
|
||||
CountDownLatch resultLatch = new CountDownLatch(msgs.size());
|
||||
for (TbProtoQueueMsg<ToEdqsMsg> queueMsg : msgs) {
|
||||
backupExecutor.submit(() -> {
|
||||
if (consumer.isStopped()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
ToEdqsMsg msg = queueMsg.getValue();
|
||||
log.trace("Processing message: {}", msg);
|
||||
@ -127,7 +128,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
|
||||
}
|
||||
if (eventMsg.hasVersion()) {
|
||||
if (!versionsStore.isNew(key, eventMsg.getVersion())) {
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
@ -139,12 +140,8 @@ public class KafkaEdqsStateService implements EdqsStateService {
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
log.error("Failed to process message: {}", queueMsg, t);
|
||||
} finally {
|
||||
resultLatch.countDown();
|
||||
}
|
||||
});
|
||||
}
|
||||
resultLatch.await();
|
||||
consumer.commit();
|
||||
})
|
||||
.consumerCreator(() -> queueFactory.createEdqsMsgConsumer(EdqsQueue.EVENTS, "events-to-backup-consumer-group")) // consumer group shared by all instances
|
||||
@ -160,20 +157,12 @@ public class KafkaEdqsStateService implements EdqsStateService {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void restore(Set<TopicPartitionInfo> partitions) {
|
||||
stateReadCount.set(0); //TODO Slavik: do not support remote mode in monolith setup
|
||||
long startTs = System.currentTimeMillis();
|
||||
log.info("Restore started for partitions {}", partitions.stream().map(tpi -> tpi.getPartition().orElse(-1)).sorted().toList());
|
||||
stateConsumer.doUpdate(partitions); // calling blocking doUpdate instead of update
|
||||
stateConsumer.awaitStop(0); // consumers should stop on their own because EdqsQueue.STATE.stopWhenRead is true, we just need to wait
|
||||
log.info("Restore finished in {} ms. Processed {} msgs", (System.currentTimeMillis() - startTs), stateReadCount.get());
|
||||
|
||||
if (!initialRestoreDone) {
|
||||
initialRestoreDone = true;
|
||||
|
||||
eventsConsumer.subscribe();
|
||||
eventsConsumer.launch();
|
||||
public void process(Set<TopicPartitionInfo> partitions) {
|
||||
if (getPartitions() == null) {
|
||||
eventsToBackupConsumer.subscribe();
|
||||
eventsToBackupConsumer.launch();
|
||||
}
|
||||
super.update(partitions);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -194,7 +183,7 @@ public class KafkaEdqsStateService implements EdqsStateService {
|
||||
private void preDestroy() {
|
||||
stateConsumer.stop();
|
||||
stateConsumer.awaitStop();
|
||||
eventsConsumer.stop();
|
||||
eventsToBackupConsumer.stop();
|
||||
stateProducer.stop();
|
||||
|
||||
consumersExecutor.shutdownNow();
|
||||
|
||||
@ -17,8 +17,6 @@ package org.thingsboard.server.edqs.state;
|
||||
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.context.annotation.Lazy;
|
||||
import org.springframework.stereotype.Service;
|
||||
import org.thingsboard.server.common.data.ObjectType;
|
||||
import org.thingsboard.server.common.data.edqs.EdqsEventType;
|
||||
@ -38,19 +36,14 @@ import java.util.Set;
|
||||
@Slf4j
|
||||
public class LocalEdqsStateService implements EdqsStateService {
|
||||
|
||||
@Autowired @Lazy
|
||||
private EdqsProcessor processor;
|
||||
@Autowired
|
||||
private EdqsRocksDb db;
|
||||
private final EdqsProcessor processor;
|
||||
private final EdqsRocksDb db;
|
||||
|
||||
private boolean restoreDone;
|
||||
private Set<TopicPartitionInfo> partitions;
|
||||
|
||||
@Override
|
||||
public void restore(Set<TopicPartitionInfo> partitions) {
|
||||
if (restoreDone) {
|
||||
return;
|
||||
}
|
||||
|
||||
public void process(Set<TopicPartitionInfo> partitions) {
|
||||
if (this.partitions != null) {
|
||||
db.forEach((key, value) -> {
|
||||
try {
|
||||
ToEdqsMsg edqsMsg = ToEdqsMsg.parseFrom(value);
|
||||
@ -60,7 +53,9 @@ public class LocalEdqsStateService implements EdqsStateService {
|
||||
log.error("[{}] Failed to restore value", key, e);
|
||||
}
|
||||
});
|
||||
restoreDone = true;
|
||||
}
|
||||
processor.getEventsConsumer().update(partitions);
|
||||
this.partitions = partitions;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -79,7 +74,7 @@ public class LocalEdqsStateService implements EdqsStateService {
|
||||
|
||||
@Override
|
||||
public boolean isReady() {
|
||||
return restoreDone;
|
||||
return partitions != null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user