Skip to content

Commit c3ad8e5

Browse files
Ubuntu authored and jeffreywang88 committed
Remove unnecessary changes
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
1 parent 2541f2d commit c3ad8e5

4 files changed

Lines changed: 20 additions & 282 deletions

File tree

‎.buildkite/test_areas/distributed.yaml‎

Lines changed: 0 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -271,18 +271,3 @@ steps:
271271
- pytest -v -s distributed/test_ray_v2_executor.py
272272
- pytest -v -s distributed/test_pipeline_parallel.py -k "ray"
273273
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -k "ray"
274-
275-
- label: RayExecutorV2 Multi-Node (8 GPUs)
276-
timeout_in_minutes: 30
277-
working_dir: "/vllm-workspace/tests"
278-
num_devices: 4
279-
num_nodes: 2
280-
no_plugin: true
281-
optional: true
282-
source_file_dependencies:
283-
- vllm/v1/executor/ray_executor_v2.py
284-
- vllm/v1/executor/abstract.py
285-
- vllm/v1/executor/multiproc_executor.py
286-
- tests/distributed/test_ray_v2_executor_multinode.py
287-
commands:
288-
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 4 $IMAGE_TAG "VLLM_MULTI_NODE=1 VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 NCCL_CUMEM_HOST_ENABLE=0 pytest -v -s distributed/test_ray_v2_executor_multinode.py" "echo 'Worker node ready'"

‎tests/distributed/test_ray_v2_executor.py‎

Lines changed: 19 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -33,17 +33,15 @@ def enable_ray_v2_backend():
3333
"VLLM_USE_RAY_V2_EXECUTOR_BACKEND": os.environ.get(
3434
"VLLM_USE_RAY_V2_EXECUTOR_BACKEND"
3535
),
36-
"RAY_RUNTIME_ENV_HOOK": os.environ.get("RAY_RUNTIME_ENV_HOOK"),
3736
"VLLM_ENABLE_V1_MULTIPROCESSING": os.environ.get(
3837
"VLLM_ENABLE_V1_MULTIPROCESSING"
3938
),
4039
}
4140
os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
42-
# TODO (jeffreywang): Is this necessary?
41+
# The multiprocess engine forks a subprocess that inherits the Ray
42+
# driver connection, causing hangs. RayExecutorV2 already distributes
43+
# work via Ray actors, so the EngineCore can run safely in-process.
4344
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
44-
# TODO (jeffreywang): Figure out vLLM CI
45-
# This is only necessary for Anyscale ray cluster.
46-
os.environ.pop("RAY_RUNTIME_ENV_HOOK", None)
4745
try:
4846
yield
4947
finally:
@@ -57,17 +55,21 @@ def _cleanup_ray_resources():
5755
if not ray.is_initialized():
5856
return
5957

60-
# Ray actor shutdown is async -- wait until all actors are dead
58+
# Ray actor shutdown is async -- wait until all actors are dead.
6159
dangling_actors = []
62-
for _ in range(10):
63-
dangling_actors = [
64-
actor
65-
for actor in list_actors(filters=[("state", "=", "ALIVE")])
66-
if actor.class_name == "RayWorkerProc"
67-
]
68-
if not dangling_actors:
69-
break
70-
time.sleep(1)
60+
try:
61+
for _ in range(10):
62+
dangling_actors = [
63+
actor
64+
for actor in list_actors(filters=[("state", "=", "ALIVE")])
65+
if actor.class_name == "RayWorkerProc"
66+
]
67+
if not dangling_actors:
68+
break
69+
time.sleep(1)
70+
except Exception:
71+
# Tolerate connection errors to the Ray dashboard
72+
pass
7173

7274
# Always clean up PGs and shut down Ray, even if actors are dangling,
7375
# to avoid leaking GPU resources and blocking subsequent tests.
@@ -76,6 +78,8 @@ def _cleanup_ray_resources():
7678
if pg_info["state"] == "CREATED":
7779
pg = PlacementGroup(ray.PlacementGroupID(bytes.fromhex(pg_id)))
7880
ray.util.remove_placement_group(pg)
81+
except Exception:
82+
pass
7983
finally:
8084
ray.shutdown()
8185

‎tests/distributed/test_ray_v2_executor_multinode.py‎

Lines changed: 0 additions & 252 deletions
This file was deleted.

‎vllm/v1/executor/ray_executor_v2.py‎

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,7 @@ def _init_executor(self) -> None:
175175

176176
# Prefer driver node; group by node for TP locality
177177
bundle_to_node_id = []
178+
assert placement_group is not None
178179
bundle_specs = placement_group.bundle_specs
179180
assert bundle_specs is not None
180181
for i, bundle in enumerate(bundle_specs):

0 commit comments

Comments
 (0)