@@ -33,17 +33,15 @@ def enable_ray_v2_backend():
3333 "VLLM_USE_RAY_V2_EXECUTOR_BACKEND" : os .environ .get (
3434 "VLLM_USE_RAY_V2_EXECUTOR_BACKEND"
3535 ),
36- "RAY_RUNTIME_ENV_HOOK" : os .environ .get ("RAY_RUNTIME_ENV_HOOK" ),
3736 "VLLM_ENABLE_V1_MULTIPROCESSING" : os .environ .get (
3837 "VLLM_ENABLE_V1_MULTIPROCESSING"
3938 ),
4039 }
4140 os .environ ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND" ] = "1"
42- # TODO (jeffreywang): Is this necessary?
41+ # The multiprocess engine forks a subprocess that inherits the Ray
42+ # driver connection, causing hangs. RayExecutorV2 already distributes
43+ # work via Ray actors, so the EngineCore can run safely in-process.
4344 os .environ ["VLLM_ENABLE_V1_MULTIPROCESSING" ] = "0"
44- # TODO (jeffreywang): Figure out vLLM CI
45- # This is only necessary for Anyscale ray cluster.
46- os .environ .pop ("RAY_RUNTIME_ENV_HOOK" , None )
4745 try :
4846 yield
4947 finally :
@@ -57,17 +55,21 @@ def _cleanup_ray_resources():
5755 if not ray .is_initialized ():
5856 return
5957
60- # Ray actor shutdown is async -- wait until all actors are dead
58+ # Ray actor shutdown is async -- wait until all actors are dead.
6159 dangling_actors = []
62- for _ in range (10 ):
63- dangling_actors = [
64- actor
65- for actor in list_actors (filters = [("state" , "=" , "ALIVE" )])
66- if actor .class_name == "RayWorkerProc"
67- ]
68- if not dangling_actors :
69- break
70- time .sleep (1 )
60+ try :
61+ for _ in range (10 ):
62+ dangling_actors = [
63+ actor
64+ for actor in list_actors (filters = [("state" , "=" , "ALIVE" )])
65+ if actor .class_name == "RayWorkerProc"
66+ ]
67+ if not dangling_actors :
68+ break
69+ time .sleep (1 )
70+ except Exception :
71+ # Tolerate connection errors to the Ray dashboard
72+ pass
7173
7274 # Always clean up PGs and shut down Ray, even if actors are dangling,
7375 # to avoid leaking GPU resources and blocking subsequent tests.
@@ -76,6 +78,8 @@ def _cleanup_ray_resources():
7678 if pg_info ["state" ] == "CREATED" :
7779 pg = PlacementGroup (ray .PlacementGroupID (bytes .fromhex (pg_id )))
7880 ray .util .remove_placement_group (pg )
81+ except Exception :
82+ pass
7983 finally :
8084 ray .shutdown ()
8185
0 commit comments