burrow/services/forgejo-nsc/internal/nsc/macos_nsc.go
Conrad Kramer dd369bd0f8
Some checks failed
Build Apple / Build App (iOS Simulator) (push) Has started running
Build Site / Next.js Build (push) Successful in 1m47s
Build Rust / Cargo Test (push) Failing after 3m43s
Build Apple / Build App (macOS) (push) Successful in 2m12s
Tolerate macos nsc ssh handoff exit
2026-03-19 04:37:01 -07:00

432 lines
12 KiB
Go

package nsc
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"time"
)
// nscCLIEnv returns the current process environment with every
// NSC_TOKEN_FILE entry removed. The result is assigned to exec.Cmd.Env for
// each nsc invocation in this file, so child nsc processes never see that
// variable.
func nscCLIEnv() []string {
	const tokenFilePrefix = "NSC_TOKEN_FILE="

	inherited := os.Environ()
	filtered := make([]string, 0, len(inherited))
	for _, kv := range inherited {
		if !strings.HasPrefix(kv, tokenFilePrefix) {
			filtered = append(filtered, kv)
		}
	}
	return filtered
}
// normalizeMacOSNSCMachineType maps a requested "<cpu>x<memGB>" machine type
// onto the discrete cpu/memory values accepted for macOS instances.
//
// Each dimension is rounded up to the nearest allowed value, or clamped to the
// largest allowed value when the request exceeds it. The returned string is
// always in "<cpu>x<memGB>" form; changed reports whether either dimension was
// adjusted. An error is returned when the input cannot be parsed or parses to
// a non-positive cpu or memory size.
func normalizeMacOSNSCMachineType(machineType string) (normalized string, changed bool, err error) {
	cpus, memMB, parseErr := parseMachineTypeCPUxMemGB(machineType)
	if parseErr != nil {
		return "", false, parseErr
	}

	// The parser reports memory in MB; the allowed set below is in whole GB.
	gb := memMB / 1024
	if gb <= 0 || cpus <= 0 {
		return "", false, fmt.Errorf("invalid machine_type %q after parse: vcpu=%d memGB=%d", machineType, cpus, gb)
	}

	// NSC CLI (and the underlying InstanceService) enforce discrete cpu/mem
	// sets for macOS. Snapping requests onto those sets keeps provisioning
	// stable even when configs drift.
	//
	// Observed allowed sets from Namespace API error output for macos/arm64:
	//   cpu: [4 6 8 12]
	//   mem: [7 14 28 56] (GB)
	snap := func(want int32, tiers []int32) int32 {
		for _, tier := range tiers {
			if tier >= want {
				return tier
			}
		}
		// Above every tier: clamp to the largest one.
		return tiers[len(tiers)-1]
	}

	snappedCPU := snap(cpus, []int32{4, 6, 8, 12})
	snappedGB := snap(gb, []int32{7, 14, 28, 56})

	return fmt.Sprintf("%dx%d", snappedCPU, snappedGB), snappedCPU != cpus || snappedGB != gb, nil
}
// launchMacOSRunnerViaNSC creates a macOS instance with the nsc CLI and then
// runs the runner bootstrap script on it through `nsc ssh`.
//
// machineType is the requested "<cpu>x<memGB>" shape; it is normalized onto the
// allowed macOS shapes and, when creation times out or the CLI output mentions
// capacity, a fixed list of fallback shapes is tried in order. ttl is passed to
// `nsc create --duration`. Once an instance has been created it is always
// destroyed when this function returns, whether or not the bootstrap succeeded.
//
// An error is returned when machineType is empty or invalid, when selectors
// resolve empty, when no candidate shape could be created, or when the
// bootstrap over ssh fails.
func (d *Dispatcher) launchMacOSRunnerViaNSC(ctx context.Context, runnerName string, req LaunchRequest, ttl time.Duration, machineType string) error {
	if machineType == "" {
		return errors.New("machine_type is required for macos runners")
	}
	selectors := macosSelectorsArg(d.opts.MacosBaseImageID)
	if selectors == "" {
		return errors.New("macos selectors resolved empty")
	}

	normalizedMachineType := machineType
	n, changed, err := normalizeMacOSNSCMachineType(machineType)
	if err != nil {
		return err
	}
	if changed {
		normalizedMachineType = n
	}

	// If capacity is constrained for the requested (large) shape, try a small
	// set of progressively smaller shapes before failing the dispatch request.
	// This keeps macOS builds flowing even when large runners are scarce.
	// NOTE(review): the fallback list is fixed, so a small request such as
	// "4x7" falls back to larger shapes — confirm that is intended.
	candidates := []string{normalizedMachineType, "8x28", "6x14", "4x7"}
	seen := make(map[string]struct{}, len(candidates))
	uniq := make([]string, 0, len(candidates))
	for _, c := range candidates {
		c = strings.TrimSpace(c)
		if c == "" {
			continue
		}
		if _, ok := seen[c]; ok {
			continue
		}
		seen[c] = struct{}{}
		uniq = append(uniq, c)
	}
	candidates = uniq

	type attemptCfg struct {
		waitTimeout   time.Duration
		createTimeout time.Duration
	}
	// Timeouts shrink with each attempt; attempts beyond the end of this list
	// reuse the last entry.
	attempts := []attemptCfg{
		{waitTimeout: 6 * time.Minute, createTimeout: 8 * time.Minute},
		{waitTimeout: 4 * time.Minute, createTimeout: 6 * time.Minute},
		{waitTimeout: 3 * time.Minute, createTimeout: 5 * time.Minute},
	}

	// createInstance runs a single `nsc create` for shape mt and returns the new
	// instance ID together with the combined CLI output.
	createInstance := func(mt string, a attemptCfg) (instanceID string, out string, err error) {
		tmpDir, err := os.MkdirTemp("", "forgejo-nsc-macos-*")
		if err != nil {
			return "", "", fmt.Errorf("mktemp: %w", err)
		}
		defer os.RemoveAll(tmpDir)

		metaPath := filepath.Join(tmpDir, "create.json")
		cidPath := filepath.Join(tmpDir, "create.cid")

		// Best-effort cleanup: if an instance ID made it into the cidfile but
		// this attempt is about to be reported as failed, destroy the instance
		// so it does not idle until its duration expires. A background context
		// is used so cleanup still runs when ctx is already cancelled.
		destroyFromCIDFile := func() {
			if id := strings.TrimSpace(mustReadFile(cidPath)); id != "" {
				d.destroyNSCInstance(context.Background(), runnerName, id)
			}
		}

		arch := strings.TrimSpace(d.opts.MacosMachineArch)
		if arch == "" {
			arch = "arm64"
		}
		// Namespace CLI requires the "os/arch:" prefix to create a macOS instance.
		// Without it, `nsc create` defaults to Linux even if selectors include macos.*.
		qualifiedType := fmt.Sprintf("macos/%s:%s", arch, mt)

		args := []string{
			"create",
			"--duration", ttl.String(),
			"--machine_type", qualifiedType,
			"--selectors", selectors,
			"--bare",
			"--cidfile", cidPath,
			"--log_actions",
			"--purpose", fmt.Sprintf("burrow forgejo runner %s", runnerName),
			// Prefer plain output for debuggability (progress, capacity errors, etc).
			"--output", "plain",
			"--output_json_to", metaPath,
			// macOS instances can take a while to become ready.
			"--wait_timeout", a.waitTimeout.String(),
		}
		args = prependNSCRegionArgs(args, d.opts.ComputeBaseURL)
		args = appendVolumeArgs(args, d.opts.MacosCacheVolumes)

		createCtx, cancel := context.WithTimeout(ctx, a.createTimeout)
		defer cancel()

		cmd := exec.CommandContext(createCtx, d.opts.BinaryPath, args...)
		cmd.Env = nscCLIEnv()
		var buf bytes.Buffer
		cmd.Stdout = &buf
		cmd.Stderr = &buf

		if err := cmd.Run(); err != nil {
			destroyFromCIDFile()
			if errors.Is(createCtx.Err(), context.DeadlineExceeded) {
				return "", buf.String(), fmt.Errorf("nsc create timed out after %s", a.createTimeout)
			}
			return "", buf.String(), fmt.Errorf("nsc create failed: %w", err)
		}

		instanceID, err = readNSCCreateInstanceID(metaPath)
		if err != nil {
			// The CLI succeeded, so an instance may exist even though its
			// metadata is unreadable; do not leak it.
			destroyFromCIDFile()
			return "", buf.String(), fmt.Errorf("nsc create output parse failed: %w", err)
		}
		if instanceID == "" {
			destroyFromCIDFile()
			return "", buf.String(), fmt.Errorf("nsc create returned empty instance id")
		}
		return instanceID, buf.String(), nil
	}

	var (
		instanceID string
		lastOut    string
		lastErr    error
	)
	for i, mt := range candidates {
		// There can be more candidate shapes than attempt configs, so clamp the
		// index before using it; indexing first would panic on the extra
		// candidates.
		cfgIdx := i
		if cfgIdx >= len(attempts) {
			cfgIdx = len(attempts) - 1
		}
		a := attempts[cfgIdx]

		d.log.Info("launching Namespace macos runner via nsc",
			"runner", runnerName,
			"attempt", i+1,
			"machine_type", mt,
			"requested_machine_type", machineType,
			"selectors", selectors,
		)

		id, out, err := createInstance(mt, a)
		lastOut = out
		lastErr = err
		if err != nil {
			// Once the caller's context is done every further attempt would
			// fail immediately, so stop instead of walking the remaining shapes.
			if ctx.Err() != nil {
				return fmt.Errorf("%w\n%s", err, out)
			}
			// Timeouts are treated as retryable (capacity constrained).
			if strings.Contains(err.Error(), "timed out") || strings.Contains(strings.ToLower(out), "capacity") {
				continue
			}
			return fmt.Errorf("%w\n%s", err, out)
		}
		instanceID = id
		break
	}
	if instanceID == "" {
		if lastErr != nil {
			return fmt.Errorf("%w\n%s", lastErr, lastOut)
		}
		return fmt.Errorf("nsc create failed without producing an instance id\n%s", lastOut)
	}

	// Always attempt cleanup even if the runner fails.
	defer d.destroyNSCInstance(context.Background(), runnerName, instanceID)

	script := macosBootstrapWrapperScript(runnerName, req, d.opts.Executor, d.opts.WorkDir)

	// The CLI fallback is explicitly keychain-backed and does not rely on the
	// service bearer token, so use `nsc ssh` end-to-end here.
	return d.runMacOSNSCSSHScript(ctx, runnerName, instanceID, script)
}
// mustReadFile returns the contents of the file at path as a string. Despite
// the name it never panics: any read error yields the empty string.
func mustReadFile(path string) string {
	if data, err := os.ReadFile(path); err == nil {
		return string(data)
	}
	return ""
}
// macosSelectorsArg converts a configured base image identifier into the value
// passed to `nsc create --selectors`.
//
// An empty identifier is treated as "tahoe". An identifier containing "=" is
// taken to already be a selector expression and is returned (trimmed) as-is.
// Otherwise known release aliases are matched case-insensitively; anything
// unrecognized falls back to "macos.version=26.x".
func macosSelectorsArg(baseImageID string) string {
	const (
		sonomaSelectors  = "macos.version=14.x"
		sequoiaSelectors = "macos.version=15.x"
		tahoeSelectors   = "macos.version=26.x,image.with=xcode-26"
		fallback         = "macos.version=26.x"
	)

	image := strings.TrimSpace(baseImageID)
	if image == "" {
		image = "tahoe"
	}

	// Allow passing selectors directly via config, e.g. "macos.version=26.x,image.with=xcode-26".
	if strings.Contains(image, "=") {
		return image
	}

	aliases := map[string]string{
		"sonoma": sonomaSelectors, "macos-14": sonomaSelectors, "macos14": sonomaSelectors, "14": sonomaSelectors,
		"sequoia": sequoiaSelectors, "macos-15": sequoiaSelectors, "macos15": sequoiaSelectors, "15": sequoiaSelectors,
		"tahoe": tahoeSelectors, "macos-26": tahoeSelectors, "macos26": tahoeSelectors, "26": tahoeSelectors,
	}
	if sel, ok := aliases[strings.ToLower(image)]; ok {
		return sel
	}
	return fallback
}
// nscCreateMetadata holds the identifier fields read from the JSON file that
// `nsc create --output_json_to` writes. Any of the three may carry the
// instance identifier.
type nscCreateMetadata struct {
	InstanceID string `json:"instance_id"`
	ClusterID  string `json:"cluster_id"`
	ID         string `json:"id"`
}

// readNSCCreateInstanceID parses the JSON metadata file at path and returns
// the first non-empty identifier, preferring instance_id, then cluster_id,
// then id. It returns "" with a nil error when the file parses but none of
// those fields is set, and an error when the file cannot be read or is not
// valid JSON.
func readNSCCreateInstanceID(path string) (string, error) {
	data, err := os.ReadFile(path)
	if err != nil {
		return "", fmt.Errorf("read %s: %w", path, err)
	}

	var parsed nscCreateMetadata
	if err := json.Unmarshal(data, &parsed); err != nil {
		return "", err
	}

	for _, id := range []string{parsed.InstanceID, parsed.ClusterID, parsed.ID} {
		if id != "" {
			return id, nil
		}
	}
	return "", nil
}
// destroyNSCInstance runs `nsc destroy --force` for instanceID, bounded by a
// two-minute timeout derived from ctx (a nil ctx is replaced with
// context.Background()). It is best-effort: a failure is logged as a warning
// together with the CLI output and is not returned to the caller.
func (d *Dispatcher) destroyNSCInstance(ctx context.Context, runnerName, instanceID string) {
	parent := ctx
	if parent == nil {
		parent = context.Background()
	}
	destroyCtx, cancel := context.WithTimeout(parent, 2*time.Minute)
	defer cancel()

	argv := prependNSCRegionArgs([]string{"destroy", "--force", instanceID}, d.opts.ComputeBaseURL)
	cmd := exec.CommandContext(destroyCtx, d.opts.BinaryPath, argv...)
	cmd.Env = nscCLIEnv()

	// Stdout and stderr are captured together so the warning carries
	// everything the CLI printed.
	combined, err := cmd.CombinedOutput()
	if err != nil {
		d.log.Warn("nsc destroy failed", "runner", runnerName, "instance", instanceID, "err", err, "output", strings.TrimSpace(string(combined)))
		return
	}
	d.log.Info("nsc instance destroyed", "runner", runnerName, "instance", instanceID)
}
// macosBootstrapWrapperScript builds the bash script that is fed to the remote
// shell on stdin: a `set -euo pipefail` line, one `export NAME=value` line per
// environment variable, a blank line, and then the output of
// macosBootstrapScript().
//
// The FORGEJO_* variables are derived from runnerName, req, executor and the
// workdir as resolved by macosWorkDir. Entries in req.ExtraEnv are merged last
// and therefore override a FORGEJO_* variable of the same name. Values are
// passed through shellSingleQuote; names are written verbatim. The export
// lines follow Go map iteration order, so their order is not stable.
func macosBootstrapWrapperScript(runnerName string, req LaunchRequest, executor, workdir string) string {
	// Pass all values via stdin script so secrets do not appear in the nsc ssh argv.
	exports := map[string]string{
		"FORGEJO_INSTANCE_URL":   req.InstanceURL,
		"FORGEJO_RUNNER_TOKEN":   req.Token,
		"FORGEJO_RUNNER_NAME":    runnerName,
		"FORGEJO_RUNNER_LABELS":  strings.Join(req.Labels, ","),
		"FORGEJO_RUNNER_EXEC":    executor,
		"FORGEJO_RUNNER_WORKDIR": macosWorkDir(workdir),
	}
	for name, value := range req.ExtraEnv {
		exports[name] = value
	}

	var script strings.Builder
	script.WriteString("set -euo pipefail\n")
	for name, value := range exports {
		// Skip blank names; they cannot form a valid export line.
		if strings.TrimSpace(name) == "" {
			continue
		}
		fmt.Fprintf(&script, "export %s=%s\n", name, shellSingleQuote(value))
	}
	script.WriteString("\n")
	script.WriteString(macosBootstrapScript())
	return script.String()
}
// shellSingleQuote wraps value in single quotes so that a POSIX shell reads it
// back as exactly one literal word.
//
// Nothing is special inside single quotes except the single quote itself, so
// each embedded ' is rewritten as '"'"': close the quoted run, emit a
// double-quoted single quote, and reopen the quoted run. The replacement must
// not contain backslashes: in a Go raw string they would be emitted literally
// and leave the shell with an unterminated quote.
func shellSingleQuote(value string) string {
	return "'" + strings.ReplaceAll(value, "'", `'"'"'`) + "'"
}
// runMacOSNSCSSHScript pipes script into `/bin/bash` on the instance through
// `nsc ssh --disable-pty`, bounded by a five-minute timeout derived from ctx.
//
// It returns nil when the command exits cleanly, and also when it fails in the
// way nscSSHBootstrapLikelySucceeded recognizes as an ssh exit after the
// runner was already handed off. A timeout or any other failure is returned as
// an error that includes the captured stdout/stderr.
func (d *Dispatcher) runMacOSNSCSSHScript(ctx context.Context, runnerName, instanceID, script string) error {
	const bootstrapTimeout = 5 * time.Minute

	sshCtx, cancel := context.WithTimeout(ctx, bootstrapTimeout)
	defer cancel()

	argv := prependNSCRegionArgs([]string{"ssh", "--disable-pty", instanceID, "/bin/bash"}, d.opts.ComputeBaseURL)
	cmd := exec.CommandContext(sshCtx, d.opts.BinaryPath, argv...)
	cmd.Env = nscCLIEnv()
	cmd.Stdin = strings.NewReader(script)

	var captured bytes.Buffer
	cmd.Stdout = &captured
	cmd.Stderr = &captured

	runErr := cmd.Run()
	output := captured.String()

	switch {
	case runErr == nil:
		// Clean exit; fall through to the success log below.
	case errors.Is(sshCtx.Err(), context.DeadlineExceeded):
		return fmt.Errorf("nsc ssh timed out after %s\n%s", bootstrapTimeout, strings.TrimSpace(output))
	case nscSSHBootstrapLikelySucceeded(runErr, output):
		// The ssh session ended without an exit status, but the output shows
		// the runner already registered and picked up work.
		d.log.Warn("nsc ssh exited after runner handoff; treating bootstrap as successful",
			"runner", runnerName,
			"instance", instanceID,
			"err", runErr,
		)
	default:
		return fmt.Errorf("nsc ssh runner bootstrap failed: %w\n%s", runErr, strings.TrimSpace(output))
	}

	d.log.Info("macos runner bootstrap completed via nsc ssh", "runner", runnerName, "instance", instanceID)
	return nil
}
// nscSSHBootstrapLikelySucceeded reports whether a failed ssh command should
// still be counted as a successful bootstrap.
//
// It returns true only when err's message contains the "remote command exited
// without exit status or exit signal" text and output contains all of
// "runner registered successfully", "starting job" and "task ". Every
// comparison is case-insensitive. A nil err always yields false.
func nscSSHBootstrapLikelySucceeded(err error, output string) bool {
	const handoffExit = "remote command exited without exit status or exit signal"

	if err == nil || !strings.Contains(strings.ToLower(err.Error()), handoffExit) {
		return false
	}

	lowered := strings.ToLower(output)
	for _, marker := range []string{"runner registered successfully", "starting job", "task "} {
		if !strings.Contains(lowered, marker) {
			return false
		}
	}
	return true
}
// prependNSCRegionArgs returns a new argument list consisting of
// "--region <region>" followed by args.
//
// The region is taken from the NSC_REGION environment variable when set;
// otherwise it is derived from computeBaseURL, and if that yields nothing it
// defaults to "ord4".
func prependNSCRegionArgs(args []string, computeBaseURL string) []string {
	region := strings.TrimSpace(os.Getenv("NSC_REGION"))
	if region == "" {
		if region = regionFromComputeBaseURL(computeBaseURL); region == "" {
			// Default to the burrow region used for other Namespace integrations.
			region = "ord4"
		}
	}

	prefixed := make([]string, 0, len(args)+2)
	prefixed = append(prefixed, "--region", region)
	return append(prefixed, args...)
}
// regionFromComputeBaseURL extracts the region from a compute endpoint URL by
// taking the first DNS label of its host, e.g.
// "https://ord4.compute.namespaceapis.com" -> "ord4".
//
// It returns "" when raw is blank, cannot be parsed, has no host (for example
// because the scheme is missing), or when the host does not contain a
// ".compute." segment.
func regionFromComputeBaseURL(raw string) string {
	trimmed := strings.TrimSpace(raw)
	if trimmed == "" {
		return ""
	}

	parsed, err := url.Parse(trimmed)
	if err != nil {
		return ""
	}

	host := parsed.Hostname()
	regional := strings.HasSuffix(host, ".compute.namespaceapis.com") || strings.Contains(host, ".compute.")
	if !regional {
		return ""
	}

	// ord4.compute.namespaceapis.com -> ord4
	// A regional host always contains a '.', so the index is never negative.
	return host[:strings.IndexByte(host, '.')]
}