l sync-containers3: don't kill container on lock loss

This commit is contained in:
lassulus 2022-11-17 11:58:59 +01:00
parent 3736bbf091
commit 17fc135929

View file

@ -99,7 +99,7 @@ in {
set -efux set -efux
consul lock sync_${ctr.name} ${pkgs.writers.writeDash "${ctr.name}-sync" '' consul lock sync_${ctr.name} ${pkgs.writers.writeDash "${ctr.name}-sync" ''
set -efux set -efux
if ping -c 1 ${ctr.name}.r; then if /run/wrappers/bin/ping -c 1 ${ctr.name}.r; then
touch "$HOME"/incomplete touch "$HOME"/incomplete
rsync -a -e "ssh -i $CREDENTIALS_DIRECTORY/ssh_key" --inplace container_sync@${ctr.name}.r:disk "$HOME"/disk rsync -a -e "ssh -i $CREDENTIALS_DIRECTORY/ssh_key" --inplace container_sync@${ctr.name}.r:disk "$HOME"/disk
rm "$HOME"/incomplete rm "$HOME"/incomplete
@ -108,6 +108,54 @@ in {
''; '';
}; };
}; } }; }
{ "${ctr.name}_watcher" = {
path = with pkgs; [
coreutils
consul
cryptsetup
curl
mount
util-linux
jq
];
serviceConfig = {
ExecStart = pkgs.writers.writeDash "${ctr.name}_watcher" ''
set -efux
while sleep 5; do
# get the payload
# check if the host reacted recently
case $(curl -s -o /dev/null --retry 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in
404)
echo 'got 404 from kv, should kill the container'
break
;;
500)
echo 'got 500 from kv, will kill container'
break
;;
200)
# echo 'got 200 from kv, will check payload'
export payload=$(consul kv get containers/${ctr.name})
if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then
# echo 'we are the host, continuing'
continue
else
echo 'we are not host, killing container'
break
fi
;;
*)
echo 'unknown state, continuing'
continue
;;
esac
done
/run/current-system/sw/bin/nixos-container stop ${ctr.name} || :
umount /var/lib/sync-containers3/${ctr.name}/state || :
cryptsetup luksClose ${ctr.name} || :
'';
};
}; }
{ "${ctr.name}_scheduler" = { { "${ctr.name}_scheduler" = {
wantedBy = [ "multi-user.target" ]; wantedBy = [ "multi-user.target" ];
path = with pkgs; [ path = with pkgs; [
@ -116,36 +164,68 @@ in {
cryptsetup cryptsetup
mount mount
util-linux util-linux
curl
systemd systemd
jq
retry retry
bc
]; ];
serviceConfig = let serviceConfig = {
containerDirectory = lib.removeSuffix "/%i" config.systemd.services."container@${ctr.name}".environment.root;
in {
Restart = "always"; Restart = "always";
RestartSec = "5s"; RestartSec = "30s";
ExecStart = "${pkgs.consul}/bin/consul lock -verbose -monitor-retry 100 container_${ctr.name} ${pkgs.writers.writeBash "${ctr.name}-start" '' ExecStart = pkgs.writers.writeDash "${ctr.name}_scheduler" ''
set -efux set -efux
# get the payload
# check if the host reacted recently
case $(curl -s -o /dev/null --retry 10 -w '%{http_code}' http://127.0.0.1:8500/v1/kv/containers/${ctr.name}) in
404)
# echo 'got 404 from kv, will create container'
;;
500)
# echo 'got 500 from kv, retrying again'
exit 0
;;
200)
# echo 'got 200 from kv, will check payload'
export payload=$(consul kv get containers/${ctr.name})
if [ "$(jq -rn 'env.payload | fromjson.host')" = '${config.networking.hostName}' ]; then
echo 'we are the host, starting container'
else
# echo 'we are not host, checking timestamp'
# if [ $(echo "$(date +%s) - $(jq -rn 'env.payload | fromjson.time') > 100" | bc) -eq 1 ]; then
if [ "$(jq -rn 'env.payload | fromjson.time | now - tonumber > 100')" = 'true' ]; then
echo 'last beacon is more than 100s ago, taking over'
else
# echo 'last beacon was recent. trying again'
exit 0
fi
fi
;;
*)
echo 'unknown state, bailing out'
exit 0
;;
esac
if test -e /var/lib/sync-containers3/${ctr.name}/incomplete; then if test -e /var/lib/sync-containers3/${ctr.name}/incomplete; then
echo 'data is inconsistent, start aborted' echo 'data is inconsistent, start aborted'
exit 1 exit 1
fi fi
trap ${pkgs.writers.writeDash "stop-${ctr.name}" '' consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null
set -efux consul lock -verbose -monitor-retry=100 -timeout 30s -name container_${ctr.name} container_${ctr.name} ${pkgs.writers.writeBash "${ctr.name}-start" ''
/run/current-system/sw/bin/nixos-container stop ${ctr.name} || : set -efu
umount /var/lib/sync-containers3/${ctr.name}/state || : cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name} || :
cryptsetup luksClose ${ctr.name} || : mkdir -p /var/lib/sync-containers3/${ctr.name}/state
''} INT TERM EXIT mountpoint /var/lib/sync-containers3/${ctr.name}/state || mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state
consul kv put containers/${ctr.name}/host ${config.networking.hostName} /run/current-system/sw/bin/nixos-container start ${ctr.name}
cryptsetup luksOpen --key-file ${ctr.luksKey} /var/lib/sync-containers3/${ctr.name}/disk ${ctr.name} # wait for system to become reachable for the first time
mkdir -p /var/lib/sync-containers3/${ctr.name}/state retry -t 10 -d 10 -- /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null
mount /dev/mapper/${ctr.name} /var/lib/sync-containers3/${ctr.name}/state systemctl start ${ctr.name}_watcher.service
/run/current-system/sw/bin/nixos-container start ${ctr.name} while systemctl is-active container@${ctr.name}.service >/dev/null && /run/wrappers/bin/ping -q -c 3 ${ctr.name}.r >/dev/null; do
set +x consul kv put containers/${ctr.name} "$(jq -cn '{host: "${config.networking.hostName}", time: now}')" >/dev/null
until /run/wrappers/bin/ping -q -c 1 ${ctr.name}.r > /dev/null; do sleep 5; done sleep 10
while retry -t 5 -d 60 -- /run/wrappers/bin/ping -q -c 3 ${ctr.name}.r > /dev/null; do sleep 5; done done
echo "lost tinc connection to container, shutting down" ''}
''}"; '';
}; };
}; } }; }
]) (lib.attrValues cfg.containers))); ]) (lib.attrValues cfg.containers)));