Harden macos runner cleanup
Some checks failed
Build Apple / Build App (macOS) (push) Waiting to run
Build Apple / Build App (iOS Simulator) (push) Has started running
Build Site / Next.js Build (push) Successful in 2m1s
Build Rust / Cargo Test (push) Has been cancelled

This commit is contained in:
Conrad Kramer 2026-03-19 14:01:37 -07:00
parent fc79766a31
commit 283209d364
5 changed files with 239 additions and 113 deletions

View file

@ -64,13 +64,6 @@ func normalizeMacOSNSCMachineType(machineType string) (normalized string, change
return normalized, changed, nil
}
type macosNSCSSHOutcome int
const (
macosNSCSSHCompleted macosNSCSSHOutcome = iota
macosNSCSSHHandoff
)
func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName string, req LaunchRequest, ttl time.Duration, machineType string) error {
if machineType == "" {
return errors.New("machine_type is required for macos runners")
@ -223,30 +216,16 @@ func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName str
return fmt.Errorf("nsc create failed without producing an instance id\n%s", lastOut)
}
destroyOnReturn := true
// Always attempt cleanup on failure; successful handoff is allowed to run out
// to its NSC TTL because `nsc ssh` may detach before the Forgejo job exits.
defer func() {
if destroyOnReturn {
d.destroyNSCInstance(context.Background(), runnerName, instanceID)
}
}()
// Always attempt cleanup even if the runner fails.
defer d.destroyNSCInstance(context.Background(), runnerName, instanceID)
script := macosBootstrapWrapperScript(runnerName, req, d.opts.Executor, d.opts.WorkDir)
// The CLI fallback is explicitly keychain-backed and does not rely on the
// service bearer token, so use `nsc ssh` end-to-end here.
outcome, err := d.runMacOSNSCSSHScript(ctx, runnerName, instanceID, script)
if err != nil {
// Use the Compute SSH config endpoint (direct TCP) instead of `nsc ssh`, which
// relies on a websocket-based SSH proxy that is less reliable under the
// revokable tenant token flow used by the dispatcher.
if err := d.runMacOSComputeSSHScript(ctx, runnerName, instanceID, script); err != nil {
return err
}
if outcome == macosNSCSSHHandoff {
destroyOnReturn = false
d.log.Info("leaving macos nsc instance running until TTL after runner handoff",
"runner", runnerName,
"instance", instanceID,
"ttl", ttl.String(),
)
}
return nil
}
@ -366,57 +345,6 @@ func shellSingleQuote(value string) string {
return "'" + strings.ReplaceAll(value, "'", `'\"'\"'`) + "'"
}
func (d *Dispatcher) runMacOSNSCSSHScript(ctx context.Context, runnerName, instanceID, script string) (macosNSCSSHOutcome, error) {
sshCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()
args := []string{"ssh", "--disable-pty", instanceID, "/bin/bash"}
args = prependNSCRegionArgs(args, d.opts.ComputeBaseURL)
cmd := exec.CommandContext(sshCtx, d.opts.BinaryPath, args...)
cmd.Env = nscCLIEnv()
cmd.Stdin = strings.NewReader(script)
var buf bytes.Buffer
cmd.Stdout = &buf
cmd.Stderr = &buf
if err := cmd.Run(); err != nil {
if errors.Is(sshCtx.Err(), context.DeadlineExceeded) {
return macosNSCSSHCompleted, fmt.Errorf("nsc ssh timed out after %s\n%s", 5*time.Minute, strings.TrimSpace(buf.String()))
}
if nscSSHBootstrapLikelySucceeded(err, buf.String()) {
d.log.Warn("nsc ssh exited after runner handoff; treating bootstrap as successful",
"runner", runnerName,
"instance", instanceID,
"err", err,
)
d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID)
return macosNSCSSHHandoff, nil
}
return macosNSCSSHCompleted, fmt.Errorf("nsc ssh runner bootstrap failed: %w\n%s", err, strings.TrimSpace(buf.String()))
}
d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID)
return macosNSCSSHCompleted, nil
}
func nscSSHBootstrapLikelySucceeded(err error, output string) bool {
if err == nil {
return false
}
errText := strings.ToLower(err.Error())
if !strings.Contains(errText, "remote command exited without exit status or exit signal") {
return false
}
output = strings.ToLower(output)
return strings.Contains(output, "runner registered successfully") &&
strings.Contains(output, "starting job") &&
strings.Contains(output, "task ")
}
func prependNSCRegionArgs(args []string, computeBaseURL string) []string {
region := strings.TrimSpace(os.Getenv("NSC_REGION"))
if region == "" {