Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions controllers/actions.github.com/ephemeralrunner_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,19 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
log.Info("Failed to update ephemeral runner status. Requeue to not miss this event")
return ctrl.Result{}, err
}

// Check if runner's GitHub registration is still valid
healthy, err := r.checkRunnerRegistration(ctx, ephemeralRunner, log)
if err != nil {
return ctrl.Result{}, err
}
Comment on lines +360 to +364
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

checkRunnerRegistration is invoked whenever cs.State.Terminated == nil, but that condition also covers Waiting container states (image pull, crashloop backoff, etc.) and doesn’t guarantee the runner container is actually running. This can trigger GitHub registration checks before the runner process is up and may incorrectly mark the EphemeralRunner failed on a 404. Consider gating the health check on cs.State.Running != nil (and/or pod.Status.Phase == PodRunning) before calling it.

Copilot uses AI. Check for mistakes.
Comment on lines +360 to +364
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This adds an external GetRunner API call on the hot path for running pods and will execute on every reconcile triggered by pod updates. Since pod status updates can be frequent, this may significantly increase GitHub API usage and risk rate limiting at scale. Consider throttling (e.g., record last check time in status/annotations and only check every N minutes) and/or explicitly RequeueAfter a fixed interval to make the check periodic rather than event-frequency dependent.

Copilot uses AI. Check for mistakes.
if !healthy {
if err := r.markAsFailed(ctx, ephemeralRunner, "Runner registration no longer exists on GitHub", "RegistrationInvalidated", log); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
Comment on lines +366 to +369
Copy link

Copilot AI Mar 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The failure reason is introduced as a string literal ("RegistrationInvalidated"), while other failure reasons in this package are constants (e.g., ReasonInvalidPodFailure). To keep reasons consistent and avoid typos/drift, consider defining a ReasonRegistrationInvalidated constant (likely in constants.go) and using it here.

Copilot uses AI. Check for mistakes.
}

return ctrl.Result{}, nil

case cs.State.Terminated.ExitCode != 0: // failed
Expand Down Expand Up @@ -804,6 +817,43 @@ func (r *EphemeralRunnerReconciler) updateRunStatusFromPod(ctx context.Context,
return nil
}

// checkRunnerRegistration verifies the runner's GitHub-side registration is still valid.
// Returns true if the runner is healthy or we can't determine status (API errors).
// Returns false if the registration is confirmed gone (404).
func (r *EphemeralRunnerReconciler) checkRunnerRegistration(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (bool, error) {
// Only check runners that have been registered (have a RunnerId)
if ephemeralRunner.Status.RunnerId == 0 {
return true, nil
}

actionsClient, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
log.Error(err, "Failed to get actions client for health check, skipping")
return true, nil
}

_, err = actionsClient.GetRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
if err == nil {
return true, nil
}

var actionsErr *actions.ActionsError
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
log.Info("Runner registration not found on GitHub, marking as failed",
"runnerId", ephemeralRunner.Status.RunnerId,
"runnerName", ephemeralRunner.Status.RunnerName,
)
return false, nil
}

// For non-404 errors (transient API issues), don't take action
log.Info("Health check API call failed, will retry on next reconciliation",
"runnerId", ephemeralRunner.Status.RunnerId,
"error", err.Error(),
)
return true, nil
}

func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) error {
client, err := r.GetActionsService(ctx, ephemeralRunner)
if err != nil {
Expand All @@ -813,6 +863,11 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context,
log.Info("Removing runner from the service", "runnerId", ephemeralRunner.Status.RunnerId)
err = client.RemoveRunner(ctx, int64(ephemeralRunner.Status.RunnerId))
if err != nil {
var actionsErr *actions.ActionsError
if errors.As(err, &actionsErr) && actionsErr.StatusCode == 404 {
log.Info("Runner already removed from service", "runnerId", ephemeralRunner.Status.RunnerId)
return nil
}
return fmt.Errorf("failed to remove runner from the service: %w", err)
}

Expand Down
212 changes: 212 additions & 0 deletions controllers/actions.github.com/ephemeralrunner_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1422,4 +1422,216 @@ var _ = Describe("EphemeralRunner", func() {
).Should(BeTrue(), "failed to contact server")
})
})

Describe("Stale runner health check", func() {
var ctx context.Context
var mgr ctrl.Manager
var autoscalingNS *corev1.Namespace
var configSecret *corev1.Secret
var controller *EphemeralRunnerReconciler

Context("when GitHub registration is invalidated (404)", func() {
var ephemeralRunner *v1alpha1.EphemeralRunner

BeforeEach(func() {
ctx = context.Background()
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)

// GetRunner returns 404 — simulates GitHub forgetting the registration
// RemoveRunner also returns 404 — the runner is already gone
fakeClient := fake.NewFakeClient(
fake.WithGetRunner(nil, &actions.ActionsError{
StatusCode: 404,
Err: &actions.ActionsExceptionError{
ExceptionName: "AgentNotFoundException",
Message: "runner not found",
},
}),
fake.WithRemoveRunnerError(&actions.ActionsError{
StatusCode: 404,
Err: &actions.ActionsExceptionError{
ExceptionName: "AgentNotFoundException",
Message: "runner not found",
},
}),
)

controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
},
},
}

err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")

ephemeralRunner = newExampleRunner("stale-runner", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")

startManagers(GinkgoT(), mgr)
})

It("should mark the runner as failed when GetRunner returns 404", func() {
// Wait for the controller to set up the runner (finalizers, secret, pod, status)
Eventually(
func() (int, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return updated.Status.RunnerId, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).ShouldNot(Equal(0), "runner should have a RunnerId set")

// Wait for the pod to exist, then simulate a running container
pod := new(corev1.Pod)
Eventually(
func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(true), "pod should exist")

// Set pod to running with a running container status
pod.Status.Phase = corev1.PodRunning
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Running: &corev1.ContainerStateRunning{
StartedAt: metav1.Now(),
},
},
},
}
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status to running")

// Now the health check should detect the stale registration and mark it failed
Eventually(
func() (corev1.PodPhase, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", err
}
return updated.Status.Phase, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(Equal(corev1.PodFailed), "runner with invalidated registration should be marked failed")
})
})

Context("when GetRunner returns a transient error (500)", func() {
var ephemeralRunner *v1alpha1.EphemeralRunner

BeforeEach(func() {
ctx = context.Background()
autoscalingNS, mgr = createNamespace(GinkgoT(), k8sClient)
configSecret = createDefaultSecret(GinkgoT(), k8sClient, autoscalingNS.Name)

// GetRunner returns 500 — transient error, should NOT kill the runner
fakeClient := fake.NewFakeClient(
fake.WithGetRunner(nil, &actions.ActionsError{
StatusCode: 500,
Err: fmt.Errorf("internal server error"),
}),
)

controller = &EphemeralRunnerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: logf.Log,
ResourceBuilder: ResourceBuilder{
SecretResolver: &SecretResolver{
k8sClient: mgr.GetClient(),
multiClient: fake.NewMultiClient(fake.WithDefaultClient(fakeClient, nil)),
},
},
}

err := controller.SetupWithManager(mgr)
Expect(err).To(BeNil(), "failed to setup controller")

ephemeralRunner = newExampleRunner("transient-error-runner", autoscalingNS.Name, configSecret.Name)
err = k8sClient.Create(ctx, ephemeralRunner)
Expect(err).To(BeNil(), "failed to create ephemeral runner")

startManagers(GinkgoT(), mgr)
})

It("should NOT mark the runner as failed on transient errors", func() {
// Wait for runner to get a RunnerId
Eventually(
func() (int, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return 0, err
}
return updated.Status.RunnerId, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).ShouldNot(Equal(0), "runner should have a RunnerId set")

// Wait for pod and set it to running (same as the 404 test)
pod := new(corev1.Pod)
Eventually(
func() (bool, error) {
if err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, pod); err != nil {
return false, err
}
return true, nil
},
ephemeralRunnerTimeout,
ephemeralRunnerInterval,
).Should(BeEquivalentTo(true), "pod should exist")

pod.Status.Phase = corev1.PodRunning
pod.Status.ContainerStatuses = []corev1.ContainerStatus{
{
Name: v1alpha1.EphemeralRunnerContainerName,
State: corev1.ContainerState{
Running: &corev1.ContainerStateRunning{
StartedAt: metav1.Now(),
},
},
},
}
err := k8sClient.Status().Update(ctx, pod)
Expect(err).To(BeNil(), "failed to update pod status to running")

// Give the controller time to process — it should NOT mark as failed
Consistently(
func() (corev1.PodPhase, error) {
updated := new(v1alpha1.EphemeralRunner)
err := k8sClient.Get(ctx, client.ObjectKey{Name: ephemeralRunner.Name, Namespace: ephemeralRunner.Namespace}, updated)
if err != nil {
return "", err
}
return updated.Status.Phase, nil
},
time.Second*3,
ephemeralRunnerInterval,
).ShouldNot(Equal(corev1.PodFailed), "runner should NOT be marked failed on transient errors")
})
})
})
})
6 changes: 6 additions & 0 deletions github/actions/fake/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ func WithUpdateRunnerScaleSet(scaleSet *actions.RunnerScaleSet, err error) Optio
}
}

func WithRemoveRunnerError(err error) Option {
return func(f *FakeClient) {
f.removeRunnerResult.err = err
}
}

var defaultRunnerScaleSet = &actions.RunnerScaleSet{
Id: 1,
Name: "testset",
Expand Down
Loading