Harden macos runner cleanup
This commit is contained in:
parent
fc79766a31
commit
283209d364
5 changed files with 239 additions and 113 deletions
|
|
@ -64,13 +64,6 @@ func normalizeMacOSNSCMachineType(machineType string) (normalized string, change
|
|||
return normalized, changed, nil
|
||||
}
|
||||
|
||||
type macosNSCSSHOutcome int
|
||||
|
||||
const (
|
||||
macosNSCSSHCompleted macosNSCSSHOutcome = iota
|
||||
macosNSCSSHHandoff
|
||||
)
|
||||
|
||||
func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName string, req LaunchRequest, ttl time.Duration, machineType string) error {
|
||||
if machineType == "" {
|
||||
return errors.New("machine_type is required for macos runners")
|
||||
|
|
@ -223,30 +216,16 @@ func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName str
|
|||
return fmt.Errorf("nsc create failed without producing an instance id\n%s", lastOut)
|
||||
}
|
||||
|
||||
destroyOnReturn := true
|
||||
// Always attempt cleanup on failure; successful handoff is allowed to run out
|
||||
// to its NSC TTL because `nsc ssh` may detach before the Forgejo job exits.
|
||||
defer func() {
|
||||
if destroyOnReturn {
|
||||
d.destroyNSCInstance(context.Background(), runnerName, instanceID)
|
||||
}
|
||||
}()
|
||||
// Always attempt cleanup even if the runner fails.
|
||||
defer d.destroyNSCInstance(context.Background(), runnerName, instanceID)
|
||||
|
||||
script := macosBootstrapWrapperScript(runnerName, req, d.opts.Executor, d.opts.WorkDir)
|
||||
// The CLI fallback is explicitly keychain-backed and does not rely on the
|
||||
// service bearer token, so use `nsc ssh` end-to-end here.
|
||||
outcome, err := d.runMacOSNSCSSHScript(ctx, runnerName, instanceID, script)
|
||||
if err != nil {
|
||||
// Use the Compute SSH config endpoint (direct TCP) instead of `nsc ssh`, which
|
||||
// relies on a websocket-based SSH proxy that is less reliable under the
|
||||
// revokable tenant token flow used by the dispatcher.
|
||||
if err := d.runMacOSComputeSSHScript(ctx, runnerName, instanceID, script); err != nil {
|
||||
return err
|
||||
}
|
||||
if outcome == macosNSCSSHHandoff {
|
||||
destroyOnReturn = false
|
||||
d.log.Info("leaving macos nsc instance running until TTL after runner handoff",
|
||||
"runner", runnerName,
|
||||
"instance", instanceID,
|
||||
"ttl", ttl.String(),
|
||||
)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
@ -366,57 +345,6 @@ func shellSingleQuote(value string) string {
|
|||
return "'" + strings.ReplaceAll(value, "'", `'\"'\"'`) + "'"
|
||||
}
|
||||
|
||||
func (d *Dispatcher) runMacOSNSCSSHScript(ctx context.Context, runnerName, instanceID, script string) (macosNSCSSHOutcome, error) {
|
||||
sshCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
args := []string{"ssh", "--disable-pty", instanceID, "/bin/bash"}
|
||||
args = prependNSCRegionArgs(args, d.opts.ComputeBaseURL)
|
||||
|
||||
cmd := exec.CommandContext(sshCtx, d.opts.BinaryPath, args...)
|
||||
cmd.Env = nscCLIEnv()
|
||||
cmd.Stdin = strings.NewReader(script)
|
||||
|
||||
var buf bytes.Buffer
|
||||
cmd.Stdout = &buf
|
||||
cmd.Stderr = &buf
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
if errors.Is(sshCtx.Err(), context.DeadlineExceeded) {
|
||||
return macosNSCSSHCompleted, fmt.Errorf("nsc ssh timed out after %s\n%s", 5*time.Minute, strings.TrimSpace(buf.String()))
|
||||
}
|
||||
if nscSSHBootstrapLikelySucceeded(err, buf.String()) {
|
||||
d.log.Warn("nsc ssh exited after runner handoff; treating bootstrap as successful",
|
||||
"runner", runnerName,
|
||||
"instance", instanceID,
|
||||
"err", err,
|
||||
)
|
||||
d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID)
|
||||
return macosNSCSSHHandoff, nil
|
||||
}
|
||||
return macosNSCSSHCompleted, fmt.Errorf("nsc ssh runner bootstrap failed: %w\n%s", err, strings.TrimSpace(buf.String()))
|
||||
}
|
||||
|
||||
d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID)
|
||||
return macosNSCSSHCompleted, nil
|
||||
}
|
||||
|
||||
func nscSSHBootstrapLikelySucceeded(err error, output string) bool {
|
||||
if err == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
errText := strings.ToLower(err.Error())
|
||||
if !strings.Contains(errText, "remote command exited without exit status or exit signal") {
|
||||
return false
|
||||
}
|
||||
|
||||
output = strings.ToLower(output)
|
||||
return strings.Contains(output, "runner registered successfully") &&
|
||||
strings.Contains(output, "starting job") &&
|
||||
strings.Contains(output, "task ")
|
||||
}
|
||||
|
||||
func prependNSCRegionArgs(args []string, computeBaseURL string) []string {
|
||||
region := strings.TrimSpace(os.Getenv("NSC_REGION"))
|
||||
if region == "" {
|
||||
|
|
|
|||
|
|
@ -1,47 +1,33 @@
|
|||
package nsc
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"testing"
|
||||
)
|
||||
import "testing"
|
||||
|
||||
func TestNSCSSHBootstrapLikelySucceeded(t *testing.T) {
|
||||
func TestNormalizeMacOSNSCMachineTypeRoundsUp(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
err := errors.New("wait: remote command exited without exit status or exit signal")
|
||||
output := `
|
||||
level=info msg="Runner registered successfully."
|
||||
time="2026-03-19T11:29:49Z" level=info msg="Starting job"
|
||||
time="2026-03-19T11:29:50Z" level=info msg="task 124 repo is hackclub/burrow"
|
||||
`
|
||||
|
||||
if !nscSSHBootstrapLikelySucceeded(err, output) {
|
||||
t.Fatal("expected handoff success heuristic to match")
|
||||
got, changed, err := normalizeMacOSNSCMachineType("5x10")
|
||||
if err != nil {
|
||||
t.Fatalf("normalizeMacOSNSCMachineType: %v", err)
|
||||
}
|
||||
if !changed {
|
||||
t.Fatal("expected machine type to be normalized")
|
||||
}
|
||||
if got != "6x14" {
|
||||
t.Fatalf("expected 6x14, got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestNSCSSHBootstrapLikelySucceededRejectsIncompleteOutput(t *testing.T) {
|
||||
func TestNormalizeMacOSNSCMachineTypeKeepsAllowedShape(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
err := errors.New("wait: remote command exited without exit status or exit signal")
|
||||
output := `level=info msg="Runner registered successfully."`
|
||||
|
||||
if nscSSHBootstrapLikelySucceeded(err, output) {
|
||||
t.Fatal("expected incomplete runner output to remain a failure")
|
||||
}
|
||||
}
|
||||
|
||||
func TestNSCSSHBootstrapLikelySucceededRejectsDifferentErrors(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
err := errors.New("exit status 1")
|
||||
output := `
|
||||
level=info msg="Runner registered successfully."
|
||||
time="2026-03-19T11:29:49Z" level=info msg="Starting job"
|
||||
time="2026-03-19T11:29:50Z" level=info msg="task 124 repo is hackclub/burrow"
|
||||
`
|
||||
|
||||
if nscSSHBootstrapLikelySucceeded(err, output) {
|
||||
t.Fatal("expected unrelated nsc ssh errors to remain failures")
|
||||
got, changed, err := normalizeMacOSNSCMachineType("6x14")
|
||||
if err != nil {
|
||||
t.Fatalf("normalizeMacOSNSCMachineType: %v", err)
|
||||
}
|
||||
if changed {
|
||||
t.Fatal("expected allowed machine type to remain unchanged")
|
||||
}
|
||||
if got != "6x14" {
|
||||
t.Fatalf("expected 6x14, got %q", got)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue