@@ -588,6 +588,19 @@ func getSubmitterContainer(rayJobInstance *rayv1.RayJob, rayClusterInstance *ray
588588 return corev1.Container {}, err
589589 }
590590
591+ // When the SidecarSubmitterRestart feature gate is enabled, configure per-container restart rules.
592+ // This requires Kubernetes 1.35+ with the ContainerRestartRules feature gate enabled.
593+ if features .Enabled (features .SidecarSubmitterRestart ) {
594+ // OnFailure restarts only the submitter container (not all containers in the pod) on non-zero exit.
595+ // The non-zero exit can come from `ray job submit --no-wait` or `ray job logs --follow`.
596+ // The key case is `ray job logs --follow` exiting non-zero on a transient
597+ // WebSocket closure even when the Ray job is still running.
598+ // On restart, the submitter checks ray job status first.
599+ // Since the job is still running, the submitter simply reattaches to the log stream.
600+ // See BuildJobSubmitCommand in ray-operator/controllers/ray/common/job.go for more details.
601+ submitterContainer .RestartPolicy = ptr .To (corev1 .ContainerRestartPolicyOnFailure )
602+ }
603+
591604 return submitterContainer , nil
592605}
593606
@@ -992,8 +1005,8 @@ func (r *RayJobReconciler) constructRayClusterForRayJob(rayJobInstance *rayv1.Ra
9921005 rayCluster .Spec .HeadGroupSpec .Template .Spec .Containers , sidecar )
9931006 // In K8sJobMode, the submitter Job relies on the K8s Job backoffLimit API to restart if it fails.
9941007 // This mainly handles WebSocket connection failures caused by transient network issues.
995- // In SidecarMode, however, the submitter container shares the same network namespace as the Ray dashboard,
996- // so restarts are no longer needed .
1008+ // In SidecarMode, the pod-level RestartPolicy is set to Never.
1009+ // The submitter container may override this with per-container restart rules when the SidecarSubmitterRestart feature gate is enabled.
9971010 rayCluster .Spec .HeadGroupSpec .Template .Spec .RestartPolicy = corev1 .RestartPolicyNever
9981011 }
9991012
@@ -1052,20 +1065,44 @@ func (r *RayJobReconciler) checkSubmitterAndUpdateStatusIfNeeded(ctx context.Con
10521065 return
10531066 }
10541067
1055- shouldUpdate , submitterContainerStatus = checkSidecarContainerStatus (headPod )
1056- if shouldUpdate {
1057- logger .Info ("The submitter sidecar container has failed. Attempting to transition the status to `Failed`." ,
1058- "Submitter sidecar container" , submitterContainerStatus .Name , "Reason" , submitterContainerStatus .State .Terminated .Reason , "Message" , submitterContainerStatus .State .Terminated .Message )
1059- rayJob .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusFailed
1060- // The submitter sidecar container needs to wait for the user code to finish and retrieve its logs.
1061- // Therefore, a failed Submitter sidecar container indicates that the submission itself has failed or the user code has thrown an error.
1062- // If the failure is due to user code, the JobStatus and Job message will be updated accordingly from the previous reconciliation.
1063- if rayJob .Status .JobStatus == rayv1 .JobStatusFailed {
1064- rayJob .Status .Reason = rayv1 .AppFailed
1065- } else {
1066- rayJob .Status .Reason = rayv1 .SubmissionFailed
1067- rayJob .Status .Message = fmt .Sprintf ("Ray head pod container %s terminated with exit code %d: %s" ,
1068- submitterContainerStatus .Name , submitterContainerStatus .State .Terminated .ExitCode , submitterContainerStatus .State .Terminated .Reason )
1068+ // Only check exit code when the feature gate is disabled.
1069+ // When SidecarSubmitterRestart is enabled, the container restarts on non-zero exit,
1070+ // so a terminated container is transient — not a permanent failure.
1071+ if ! features .Enabled (features .SidecarSubmitterRestart ) {
1072+ shouldUpdate , submitterContainerStatus = checkSidecarContainerStatus (headPod )
1073+ if shouldUpdate {
1074+ logger .Info ("The submitter sidecar container has failed. Attempting to transition the status to `Failed`." ,
1075+ "Submitter sidecar container" , submitterContainerStatus .Name , "Reason" , submitterContainerStatus .State .Terminated .Reason , "Message" , submitterContainerStatus .State .Terminated .Message )
1076+ rayJob .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusFailed
1077+ // The submitter sidecar container needs to wait for the user code to finish and retrieve its logs.
1078+ // Therefore, a failed Submitter sidecar container indicates that the submission itself has failed or the user code has thrown an error.
1079+ // If the failure is due to user code, the JobStatus and Job message will be updated accordingly from the previous reconciliation.
1080+ if rayJob .Status .JobStatus == rayv1 .JobStatusFailed {
1081+ rayJob .Status .Reason = rayv1 .AppFailed
1082+ } else {
1083+ rayJob .Status .Reason = rayv1 .SubmissionFailed
1084+ rayJob .Status .Message = fmt .Sprintf ("Ray head pod container %s terminated with exit code %d: %s" ,
1085+ submitterContainerStatus .Name , submitterContainerStatus .State .Terminated .ExitCode , submitterContainerStatus .State .Terminated .Reason )
1086+ }
1087+ }
1088+ } else {
1089+ submitterBackoffLimit := int32 (2 )
1090+ if rayJob .Spec .SubmitterConfig != nil && rayJob .Spec .SubmitterConfig .BackoffLimit != nil {
1091+ submitterBackoffLimit = * rayJob .Spec .SubmitterConfig .BackoffLimit
1092+ }
1093+ shouldUpdate , submitterContainerStatus = checkIsRestartCountExceeded (headPod , submitterBackoffLimit )
1094+ if shouldUpdate {
1095+ logger .Info ("The submitter sidecar container has exceeded the max restart count. Attempting to transition the status to `Failed`." ,
1096+ "Submitter sidecar container" , submitterContainerStatus .Name ,
1097+ "RestartCount" , submitterContainerStatus .RestartCount )
1098+ rayJob .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusFailed
1099+ if rayJob .Status .JobStatus == rayv1 .JobStatusFailed {
1100+ rayJob .Status .Reason = rayv1 .AppFailed
1101+ } else {
1102+ rayJob .Status .Reason = rayv1 .SubmissionFailed
1103+ rayJob .Status .Message = fmt .Sprintf ("Ray head pod submitter container %s terminated after exceeding the maximum restart count" ,
1104+ submitterContainerStatus .Name )
1105+ }
10691106 }
10701107 }
10711108
@@ -1149,6 +1186,24 @@ func checkSidecarContainerStatus(headPod *corev1.Pod) (bool, *corev1.ContainerSt
11491186 return false , nil
11501187}
11511188
1189+ func checkIsRestartCountExceeded (headPod * corev1.Pod , backoffLimit int32 ) (bool , * corev1.ContainerStatus ) {
1190+ for _ , containerStatus := range headPod .Status .ContainerStatuses {
1191+ if containerStatus .Name == utils .SubmitterContainerName {
1192+ // Only check when the container has been terminated at least once.
1193+ // When the submitter container fails in a CrashLoopBackOff fashion, LastTerminationState.Terminated is populated
1194+ if containerStatus .LastTerminationState .Terminated != nil {
1195+ // If the container exited successfully, we do not fail the job.
1196+ if containerStatus .State .Terminated != nil && containerStatus .State .Terminated .ExitCode == 0 {
1197+ break
1198+ }
1199+ return containerStatus .RestartCount >= backoffLimit , & containerStatus
1200+ }
1201+ break
1202+ }
1203+ }
1204+ return false , nil
1205+ }
1206+
11521207func checkActiveDeadlineAndUpdateStatusIfNeeded (ctx context.Context , rayJob * rayv1.RayJob ) bool {
11531208 logger := ctrl .LoggerFrom (ctx )
11541209 if rayJob .Spec .ActiveDeadlineSeconds == nil || time .Now ().Before (rayJob .Status .StartTime .Add (time .Duration (* rayJob .Spec .ActiveDeadlineSeconds )* time .Second )) {
0 commit comments