From b0e4351a9d4d6e4f3d2bb6ae44d012cfc26f094d Mon Sep 17 00:00:00 2001 From: Conrad Kramer Date: Thu, 19 Mar 2026 14:11:40 -0700 Subject: [PATCH] Fallback macos ssh bootstrap to nsc --- services/forgejo-nsc/README.md | 7 +- .../forgejo-nsc/internal/nsc/macos_nsc.go | 106 +++++++++++++++++- .../internal/nsc/macos_nsc_test.go | 38 ++++++- 3 files changed, 145 insertions(+), 6 deletions(-) diff --git a/services/forgejo-nsc/README.md b/services/forgejo-nsc/README.md index 4cee5da..95167c1 100644 --- a/services/forgejo-nsc/README.md +++ b/services/forgejo-nsc/README.md @@ -45,9 +45,10 @@ profile. The important knobs are: - `namespace.machine_type` / `namespace.duration` – shape + TTL for the ephemeral Namespace environment. The dispatcher destroys the instance after a job so the TTL acts as a hard cap, not an idle timeout. -- macOS fallback launches still use `nsc create`, but bootstrap runs over the - Compute SSH config endpoint instead of `nsc ssh` so the dispatcher can always - destroy the instance itself instead of relying on a websocket SSH proxy handoff. +- macOS fallback launches still use `nsc create`. Bootstrap prefers the + Compute SSH config endpoint, and falls back to keychain-backed `nsc ssh` + only when that endpoint rejects the request as unauthenticated, permission-denied, or unimplemented. That keeps the fast path on direct + TCP while preserving a working fallback when tenant auth drifts. If `nsc ssh` exits after handing off to the runner, the instance is left running until its TTL expires. 
- `namespace.linux_cache_*` / `namespace.macos_cache_*` – persistent cache volumes mounted into runners so Linux can keep `/nix` plus shared build caches warm and macOS can reuse Rust toolchains, Xcode package caches, and diff --git a/services/forgejo-nsc/internal/nsc/macos_nsc.go b/services/forgejo-nsc/internal/nsc/macos_nsc.go index 6c66f34..159634a 100644 --- a/services/forgejo-nsc/internal/nsc/macos_nsc.go +++ b/services/forgejo-nsc/internal/nsc/macos_nsc.go @@ -12,6 +12,8 @@ import ( "path/filepath" "strings" "time" + + "connectrpc.com/connect" ) func nscCLIEnv() []string { @@ -64,6 +66,13 @@ func normalizeMacOSNSCMachineType(machineType string) (normalized string, change return normalized, changed, nil } +type macosNSCSSHOutcome int + +const ( + macosNSCSSHCompleted macosNSCSSHOutcome = iota + macosNSCSSHHandoff +) + func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName string, req LaunchRequest, ttl time.Duration, machineType string) error { if machineType == "" { return errors.New("machine_type is required for macos runners") @@ -216,14 +225,38 @@ func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName str return fmt.Errorf("nsc create failed without producing an instance id\n%s", lastOut) } - // Always attempt cleanup even if the runner fails. - defer d.destroyNSCInstance(context.Background(), runnerName, instanceID) + destroyOnReturn := true + defer func() { + if destroyOnReturn { + d.destroyNSCInstance(context.Background(), runnerName, instanceID) + } + }() script := macosBootstrapWrapperScript(runnerName, req, d.opts.Executor, d.opts.WorkDir) // Use the Compute SSH config endpoint (direct TCP) instead of `nsc ssh`, which // relies on a websocket-based SSH proxy that is less reliable under the // revokable tenant token flow used by the dispatcher. 
if err := d.runMacOSComputeSSHScript(ctx, runnerName, instanceID, script); err != nil { + if shouldFallbackToNSCSSH(err) { + d.log.Warn("compute ssh bootstrap failed; falling back to nsc ssh", + "runner", runnerName, + "instance", instanceID, + "err", err, + ) + outcome, sshErr := d.runMacOSNSCSSHScript(ctx, runnerName, instanceID, script) + if sshErr != nil { + return sshErr + } + if outcome == macosNSCSSHHandoff { + destroyOnReturn = false + d.log.Info("leaving macos nsc instance running until TTL after runner handoff", + "runner", runnerName, + "instance", instanceID, + "ttl", ttl.String(), + ) + } + return nil + } return err } return nil @@ -345,6 +378,75 @@ func shellSingleQuote(value string) string { return "'" + strings.ReplaceAll(value, "'", `'\"'\"'`) + "'" } +func shouldFallbackToNSCSSH(err error) bool { + if err == nil { + return false + } + + switch connect.CodeOf(err) { + case connect.CodeUnauthenticated, connect.CodePermissionDenied, connect.CodeUnimplemented: + return true + } + + errText := strings.ToLower(err.Error()) + return strings.Contains(errText, "compute get ssh config failed") && + (strings.Contains(errText, "unauthenticated") || + strings.Contains(errText, "permission_denied") || + strings.Contains(errText, "permission denied") || + strings.Contains(errText, "unimplemented")) +} + +func (d *Dispatcher) runMacOSNSCSSHScript(ctx context.Context, runnerName, instanceID, script string) (macosNSCSSHOutcome, error) { + sshCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) + defer cancel() + + args := []string{"ssh", "--disable-pty", instanceID, "/bin/bash"} + args = prependNSCRegionArgs(args, d.opts.ComputeBaseURL) + + cmd := exec.CommandContext(sshCtx, d.opts.BinaryPath, args...) 
+ cmd.Env = nscCLIEnv() + cmd.Stdin = strings.NewReader(script) + + var buf bytes.Buffer + cmd.Stdout = &buf + cmd.Stderr = &buf + + if err := cmd.Run(); err != nil { + if errors.Is(sshCtx.Err(), context.DeadlineExceeded) { + return macosNSCSSHCompleted, fmt.Errorf("nsc ssh timed out after %s\n%s", 5*time.Minute, strings.TrimSpace(buf.String())) + } + if nscSSHBootstrapLikelySucceeded(err, buf.String()) { + d.log.Warn("nsc ssh exited after runner handoff; treating bootstrap as successful", + "runner", runnerName, + "instance", instanceID, + "err", err, + ) + d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID) + return macosNSCSSHHandoff, nil + } + return macosNSCSSHCompleted, fmt.Errorf("nsc ssh runner bootstrap failed: %w\n%s", err, strings.TrimSpace(buf.String())) + } + + d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID) + return macosNSCSSHCompleted, nil +} + +func nscSSHBootstrapLikelySucceeded(err error, output string) bool { + if err == nil { + return false + } + + errText := strings.ToLower(err.Error()) + if !strings.Contains(errText, "remote command exited without exit status or exit signal") { + return false + } + + output = strings.ToLower(output) + return strings.Contains(output, "runner registered successfully") && + strings.Contains(output, "starting job") && + strings.Contains(output, "task ") +} + func prependNSCRegionArgs(args []string, computeBaseURL string) []string { region := strings.TrimSpace(os.Getenv("NSC_REGION")) if region == "" { diff --git a/services/forgejo-nsc/internal/nsc/macos_nsc_test.go b/services/forgejo-nsc/internal/nsc/macos_nsc_test.go index 682f441..d2aabc6 100644 --- a/services/forgejo-nsc/internal/nsc/macos_nsc_test.go +++ b/services/forgejo-nsc/internal/nsc/macos_nsc_test.go @@ -1,6 +1,9 @@ package nsc -import "testing" +import ( + "errors" + "testing" +) func TestNormalizeMacOSNSCMachineTypeRoundsUp(t *testing.T) { 
t.Parallel() @@ -31,3 +34,36 @@ func TestNormalizeMacOSNSCMachineTypeKeepsAllowedShape(t *testing.T) { t.Fatalf("expected 6x14, got %q", got) } } + +func TestShouldFallbackToNSCSSHFallbackForComputeAuthErrors(t *testing.T) { + t.Parallel() + + err := errors.New("compute get ssh config failed: unauthenticated: invalid tenant credentials") + if !shouldFallbackToNSCSSH(err) { + t.Fatal("expected compute auth error to fall back to nsc ssh") + } +} + +func TestShouldFallbackToNSCSSHRejectsOtherErrors(t *testing.T) { + t.Parallel() + + err := errors.New("compute ssh runner bootstrap failed: exit status 1") + if shouldFallbackToNSCSSH(err) { + t.Fatal("expected unrelated bootstrap errors to remain fatal") + } +} + +func TestNSCSSHBootstrapLikelySucceeded(t *testing.T) { + t.Parallel() + + err := errors.New("wait: remote command exited without exit status or exit signal") + output := ` +level=info msg="Runner registered successfully." +time="2026-03-19T11:29:49Z" level=info msg="Starting job" +time="2026-03-19T11:29:50Z" level=info msg="task 124 repo is hackclub/burrow" +` + + if !nscSSHBootstrapLikelySucceeded(err, output) { + t.Fatal("expected handoff success heuristic to match") + } +}