Fall back to nsc ssh for macOS SSH bootstrap
This commit is contained in:
parent
a4cabf9fb7
commit
b0e4351a9d
3 changed files with 145 additions and 6 deletions
|
|
@ -45,9 +45,10 @@ profile. The important knobs are:
|
||||||
- `namespace.machine_type` / `namespace.duration` – shape + TTL for the ephemeral
|
- `namespace.machine_type` / `namespace.duration` – shape + TTL for the ephemeral
|
||||||
Namespace environment. The dispatcher destroys the instance after a job so the
|
Namespace environment. The dispatcher destroys the instance after a job so the
|
||||||
TTL acts as a hard cap, not an idle timeout.
|
TTL acts as a hard cap, not an idle timeout.
|
||||||
- macOS fallback launches still use `nsc create`, but bootstrap runs over the
|
- macOS fallback launches still use `nsc create`. Bootstrap prefers the
|
||||||
Compute SSH config endpoint instead of `nsc ssh` so the dispatcher can always
|
Compute SSH config endpoint, and falls back to keychain-backed `nsc ssh`
|
||||||
destroy the instance itself instead of relying on a websocket SSH proxy handoff.
|
only when the Compute bearer is rejected. That keeps the fast path on direct
|
||||||
|
TCP while preserving a working fallback when tenant auth drifts.
|
||||||
- `namespace.linux_cache_*` / `namespace.macos_cache_*` – persistent cache
|
- `namespace.linux_cache_*` / `namespace.macos_cache_*` – persistent cache
|
||||||
volumes mounted into runners so Linux can keep `/nix` plus shared build
|
volumes mounted into runners so Linux can keep `/nix` plus shared build
|
||||||
caches warm and macOS can reuse Rust toolchains, Xcode package caches, and
|
caches warm and macOS can reuse Rust toolchains, Xcode package caches, and
|
||||||
|
|
|
||||||
|
|
@ -12,6 +12,8 @@ import (
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"connectrpc.com/connect"
|
||||||
)
|
)
|
||||||
|
|
||||||
func nscCLIEnv() []string {
|
func nscCLIEnv() []string {
|
||||||
|
|
@ -64,6 +66,13 @@ func normalizeMacOSNSCMachineType(machineType string) (normalized string, change
|
||||||
return normalized, changed, nil
|
return normalized, changed, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// macosNSCSSHOutcome describes how an `nsc ssh` bootstrap session ended, so the
// caller can decide whether it is still responsible for destroying the
// instance.
type macosNSCSSHOutcome int

const (
	// macosNSCSSHCompleted is the zero value. runMacOSNSCSSHScript returns it
	// when the `nsc ssh` command exited cleanly, and also alongside any
	// non-nil error.
	macosNSCSSHCompleted macosNSCSSHOutcome = iota

	// macosNSCSSHHandoff means the `nsc ssh` command returned an error, but
	// the captured output matched the "runner already picked up a job"
	// heuristic. launchMacOSRunnerViaNSC then skips the deferred destroy and
	// leaves the instance running until its TTL.
	macosNSCSSHHandoff
)
|
||||||
|
|
||||||
func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName string, req LaunchRequest, ttl time.Duration, machineType string) error {
|
func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName string, req LaunchRequest, ttl time.Duration, machineType string) error {
|
||||||
if machineType == "" {
|
if machineType == "" {
|
||||||
return errors.New("machine_type is required for macos runners")
|
return errors.New("machine_type is required for macos runners")
|
||||||
|
|
@ -216,14 +225,38 @@ func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName str
|
||||||
return fmt.Errorf("nsc create failed without producing an instance id\n%s", lastOut)
|
return fmt.Errorf("nsc create failed without producing an instance id\n%s", lastOut)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Always attempt cleanup even if the runner fails.
|
destroyOnReturn := true
|
||||||
defer d.destroyNSCInstance(context.Background(), runnerName, instanceID)
|
defer func() {
|
||||||
|
if destroyOnReturn {
|
||||||
|
d.destroyNSCInstance(context.Background(), runnerName, instanceID)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
script := macosBootstrapWrapperScript(runnerName, req, d.opts.Executor, d.opts.WorkDir)
|
script := macosBootstrapWrapperScript(runnerName, req, d.opts.Executor, d.opts.WorkDir)
|
||||||
// Use the Compute SSH config endpoint (direct TCP) instead of `nsc ssh`, which
|
// Use the Compute SSH config endpoint (direct TCP) instead of `nsc ssh`, which
|
||||||
// relies on a websocket-based SSH proxy that is less reliable under the
|
// relies on a websocket-based SSH proxy that is less reliable under the
|
||||||
// revokable tenant token flow used by the dispatcher.
|
// revokable tenant token flow used by the dispatcher.
|
||||||
if err := d.runMacOSComputeSSHScript(ctx, runnerName, instanceID, script); err != nil {
|
if err := d.runMacOSComputeSSHScript(ctx, runnerName, instanceID, script); err != nil {
|
||||||
|
if shouldFallbackToNSCSSH(err) {
|
||||||
|
d.log.Warn("compute ssh bootstrap failed; falling back to nsc ssh",
|
||||||
|
"runner", runnerName,
|
||||||
|
"instance", instanceID,
|
||||||
|
"err", err,
|
||||||
|
)
|
||||||
|
outcome, sshErr := d.runMacOSNSCSSHScript(ctx, runnerName, instanceID, script)
|
||||||
|
if sshErr != nil {
|
||||||
|
return sshErr
|
||||||
|
}
|
||||||
|
if outcome == macosNSCSSHHandoff {
|
||||||
|
destroyOnReturn = false
|
||||||
|
d.log.Info("leaving macos nsc instance running until TTL after runner handoff",
|
||||||
|
"runner", runnerName,
|
||||||
|
"instance", instanceID,
|
||||||
|
"ttl", ttl.String(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|
@ -345,6 +378,75 @@ func shellSingleQuote(value string) string {
|
||||||
return "'" + strings.ReplaceAll(value, "'", `'\"'\"'`) + "'"
|
return "'" + strings.ReplaceAll(value, "'", `'\"'\"'`) + "'"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func shouldFallbackToNSCSSH(err error) bool {
|
||||||
|
if err == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
switch connect.CodeOf(err) {
|
||||||
|
case connect.CodeUnauthenticated, connect.CodePermissionDenied, connect.CodeUnimplemented:
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
errText := strings.ToLower(err.Error())
|
||||||
|
return strings.Contains(errText, "compute get ssh config failed") &&
|
||||||
|
(strings.Contains(errText, "unauthenticated") ||
|
||||||
|
strings.Contains(errText, "permission_denied") ||
|
||||||
|
strings.Contains(errText, "permission denied") ||
|
||||||
|
strings.Contains(errText, "unimplemented"))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Dispatcher) runMacOSNSCSSHScript(ctx context.Context, runnerName, instanceID, script string) (macosNSCSSHOutcome, error) {
|
||||||
|
sshCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
args := []string{"ssh", "--disable-pty", instanceID, "/bin/bash"}
|
||||||
|
args = prependNSCRegionArgs(args, d.opts.ComputeBaseURL)
|
||||||
|
|
||||||
|
cmd := exec.CommandContext(sshCtx, d.opts.BinaryPath, args...)
|
||||||
|
cmd.Env = nscCLIEnv()
|
||||||
|
cmd.Stdin = strings.NewReader(script)
|
||||||
|
|
||||||
|
var buf bytes.Buffer
|
||||||
|
cmd.Stdout = &buf
|
||||||
|
cmd.Stderr = &buf
|
||||||
|
|
||||||
|
if err := cmd.Run(); err != nil {
|
||||||
|
if errors.Is(sshCtx.Err(), context.DeadlineExceeded) {
|
||||||
|
return macosNSCSSHCompleted, fmt.Errorf("nsc ssh timed out after %s\n%s", 5*time.Minute, strings.TrimSpace(buf.String()))
|
||||||
|
}
|
||||||
|
if nscSSHBootstrapLikelySucceeded(err, buf.String()) {
|
||||||
|
d.log.Warn("nsc ssh exited after runner handoff; treating bootstrap as successful",
|
||||||
|
"runner", runnerName,
|
||||||
|
"instance", instanceID,
|
||||||
|
"err", err,
|
||||||
|
)
|
||||||
|
d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID)
|
||||||
|
return macosNSCSSHHandoff, nil
|
||||||
|
}
|
||||||
|
return macosNSCSSHCompleted, fmt.Errorf("nsc ssh runner bootstrap failed: %w\n%s", err, strings.TrimSpace(buf.String()))
|
||||||
|
}
|
||||||
|
|
||||||
|
d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID)
|
||||||
|
return macosNSCSSHCompleted, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// nscSSHBootstrapLikelySucceeded is a heuristic that decides whether a failed
// `nsc ssh` invocation actually bootstrapped the runner before the session
// dropped.
//
// It returns true only when both of the following hold (all comparisons are
// case-insensitive):
//   - err's message contains "remote command exited without exit status or
//     exit signal";
//   - output contains all three of "runner registered successfully",
//     "starting job" and "task ".
//
// A nil err always yields false: there is no failure to reinterpret.
func nscSSHBootstrapLikelySucceeded(err error, output string) bool {
	if err == nil {
		return false
	}

	errText := strings.ToLower(err.Error())
	if !strings.Contains(errText, "remote command exited without exit status or exit signal") {
		return false
	}

	// Require every marker; any single one could appear in a failed run.
	output = strings.ToLower(output)
	for _, marker := range []string{
		"runner registered successfully",
		"starting job",
		"task ",
	} {
		if !strings.Contains(output, marker) {
			return false
		}
	}
	return true
}
|
||||||
|
|
||||||
func prependNSCRegionArgs(args []string, computeBaseURL string) []string {
|
func prependNSCRegionArgs(args []string, computeBaseURL string) []string {
|
||||||
region := strings.TrimSpace(os.Getenv("NSC_REGION"))
|
region := strings.TrimSpace(os.Getenv("NSC_REGION"))
|
||||||
if region == "" {
|
if region == "" {
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,9 @@
|
||||||
package nsc
|
package nsc
|
||||||
|
|
||||||
import "testing"
|
import (
|
||||||
|
"errors"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
func TestNormalizeMacOSNSCMachineTypeRoundsUp(t *testing.T) {
|
func TestNormalizeMacOSNSCMachineTypeRoundsUp(t *testing.T) {
|
||||||
t.Parallel()
|
t.Parallel()
|
||||||
|
|
@ -31,3 +34,36 @@ func TestNormalizeMacOSNSCMachineTypeKeepsAllowedShape(t *testing.T) {
|
||||||
t.Fatalf("expected 6x14, got %q", got)
|
t.Fatalf("expected 6x14, got %q", got)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestShouldFallbackToNSCSSHFallbackForComputeAuthErrors(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
err := errors.New("compute get ssh config failed: unauthenticated: invalid tenant credentials")
|
||||||
|
if !shouldFallbackToNSCSSH(err) {
|
||||||
|
t.Fatal("expected compute auth error to fall back to nsc ssh")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestShouldFallbackToNSCSSHRejectsOtherErrors(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
err := errors.New("compute ssh runner bootstrap failed: exit status 1")
|
||||||
|
if shouldFallbackToNSCSSH(err) {
|
||||||
|
t.Fatal("expected unrelated bootstrap errors to remain fatal")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNSCSSHBootstrapLikelySucceeded(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
err := errors.New("wait: remote command exited without exit status or exit signal")
|
||||||
|
output := `
|
||||||
|
level=info msg="Runner registered successfully."
|
||||||
|
time="2026-03-19T11:29:49Z" level=info msg="Starting job"
|
||||||
|
time="2026-03-19T11:29:50Z" level=info msg="task 124 repo is hackclub/burrow"
|
||||||
|
`
|
||||||
|
|
||||||
|
if !nscSSHBootstrapLikelySucceeded(err, output) {
|
||||||
|
t.Fatal("expected handoff success heuristic to match")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue