Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
3a3a250
Implement RayExecutorV2 & tested on a single-node
jeffreywang88 Mar 10, 2026
df75664
Enable multinode
jeffreywang88 Mar 12, 2026
bbaa21b
Fix pre-commit
jeffreywang88 Mar 16, 2026
2541f2d
Fix RayExecutorV2 monitor thread self-join
jeffreywang88 Mar 16, 2026
c3ad8e5
Remove unnecessary changes
Ubuntu (committed by jeffreywang88) Mar 17, 2026
300d0ae
Extract bundle sorting to a utility
jeffreywang88 Mar 17, 2026
11d32eb
Fix linter
jeffreywang88 Mar 17, 2026
5795f1d
Enable async scheduling
jeffreywang88 Mar 18, 2026
7128074
Address CR feedback
jeffreywang88 Mar 19, 2026
e7a3c1f
Address test feedback
jeffreywang88 Mar 19, 2026
5b4119a
Merge branch 'main' into ray
jeffreywang88 Mar 19, 2026
ec2730d
Iterate over world_size
jeffreywang88 Mar 19, 2026
ca95900
Fix tests and linters
jeffreywang88 Mar 19, 2026
139c02a
Respect VLLM_RAY_BUNDLE_INDICES
jeffreywang88 Mar 22, 2026
7657031
Adjust DP rank for ray executor backend
jeffreywang88 Mar 23, 2026
6c1ea7e
Apply DP local-rank device offset for RayExecutorV2 workers
jeffreywang88 Mar 23, 2026
d040317
Support DP
jeffreywang88 Mar 24, 2026
a76acc9
Fix linter
jeffreywang88 Mar 24, 2026
c9f0a39
Lazily initialize RayWorkerProc
jeffreywang88 Mar 25, 2026
29c7426
Propagate env var; add tests
jeffreywang88 Mar 25, 2026
aae5938
Add nsight profiling and non-GPU device support to RayExecutorV2
jeffreywang88 Mar 25, 2026
25eaf8e
Fix AsyncLLMActor async detection in e2e tests
jeffreywang88 Mar 26, 2026
6717ca2
Fix AsyncLLMActor async detection in e2e tests
jeffreywang88 Mar 26, 2026
cfba15e
Fix test
jeffreywang88 Mar 26, 2026
476501b
Fix test
jeffreywang88 Mar 26, 2026
c7aa661
Fix wrong PYTHONPATH in Ray workers
jeffreywang88 Mar 26, 2026
e0fd321
CR feedback round 1
jeffreywang88 Mar 30, 2026
af21cdd
CR feedback round 2
jeffreywang88 Mar 30, 2026
ad8f6d0
Only apply blacklist & propagate env with setdefault
jeffreywang88 Mar 30, 2026
605a347
Merge branch 'main' into ray
jeffreywang88 Mar 31, 2026
7586204
CR feedback round 3
jeffreywang88 Mar 31, 2026
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
Remove unnecessary changes
Signed-off-by: Jeffrey Wang <jeffreywang@anyscale.com>
  • Loading branch information
Ubuntu authored and jeffreywang88 committed Mar 17, 2026
commit c3ad8e55ee20353f5bbcf0e963769b2eb6c65f88
15 changes: 0 additions & 15 deletions .buildkite/test_areas/distributed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -271,18 +271,3 @@ steps:
- pytest -v -s distributed/test_ray_v2_executor.py
- pytest -v -s distributed/test_pipeline_parallel.py -k "ray"
- TARGET_TEST_SUITE=L4 pytest -v -s basic_correctness/test_basic_correctness.py -k "ray"

- label: RayExecutorV2 Multi-Node (8 GPUs)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/tests"
num_devices: 4
num_nodes: 2
no_plugin: true
optional: true
source_file_dependencies:
- vllm/v1/executor/ray_executor_v2.py
- vllm/v1/executor/abstract.py
- vllm/v1/executor/multiproc_executor.py
- tests/distributed/test_ray_v2_executor_multinode.py
commands:
- ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 4 $IMAGE_TAG "VLLM_MULTI_NODE=1 VLLM_USE_RAY_V2_EXECUTOR_BACKEND=1 NCCL_CUMEM_HOST_ENABLE=0 pytest -v -s distributed/test_ray_v2_executor_multinode.py" "echo 'Worker node ready'"
34 changes: 19 additions & 15 deletions tests/distributed/test_ray_v2_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,15 @@ def enable_ray_v2_backend():
"VLLM_USE_RAY_V2_EXECUTOR_BACKEND": os.environ.get(
"VLLM_USE_RAY_V2_EXECUTOR_BACKEND"
),
"RAY_RUNTIME_ENV_HOOK": os.environ.get("RAY_RUNTIME_ENV_HOOK"),
"VLLM_ENABLE_V1_MULTIPROCESSING": os.environ.get(
"VLLM_ENABLE_V1_MULTIPROCESSING"
),
}
os.environ["VLLM_USE_RAY_V2_EXECUTOR_BACKEND"] = "1"
# TODO (jeffreywang): Is this necessary?
# The multiprocess engine forks a subprocess that inherits the Ray
# driver connection, causing hangs. RayExecutorV2 already distributes
# work via Ray actors, so the EngineCore can run safely in-process.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
# TODO (jeffreywang): Figure out vLLM CI
# This is only necessary for Anyscale ray cluster.
os.environ.pop("RAY_RUNTIME_ENV_HOOK", None)
try:
yield
finally:
Expand All @@ -57,17 +55,21 @@ def _cleanup_ray_resources():
if not ray.is_initialized():
return

# Ray actor shutdown is async -- wait until all actors are dead
# Ray actor shutdown is async -- wait until all actors are dead.
dangling_actors = []
for _ in range(10):
dangling_actors = [
actor
for actor in list_actors(filters=[("state", "=", "ALIVE")])
if actor.class_name == "RayWorkerProc"
]
if not dangling_actors:
break
time.sleep(1)
try:
for _ in range(10):
dangling_actors = [
actor
for actor in list_actors(filters=[("state", "=", "ALIVE")])
if actor.class_name == "RayWorkerProc"
]
if not dangling_actors:
break
time.sleep(1)
except Exception:
# Tolerate connection errors to the Ray dashboard
pass

# Always clean up PGs and shut down Ray, even if actors are dangling,
# to avoid leaking GPU resources and blocking subsequent tests.
Expand All @@ -76,6 +78,8 @@ def _cleanup_ray_resources():
if pg_info["state"] == "CREATED":
pg = PlacementGroup(ray.PlacementGroupID(bytes.fromhex(pg_id)))
ray.util.remove_placement_group(pg)
except Exception:
pass
finally:
ray.shutdown()
Comment thread
jeffreywang88 marked this conversation as resolved.
Outdated

Expand Down
252 changes: 0 additions & 252 deletions tests/distributed/test_ray_v2_executor_multinode.py

This file was deleted.

1 change: 1 addition & 0 deletions vllm/v1/executor/ray_executor_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ def _init_executor(self) -> None:

# Prefer driver node; group by node for TP locality
bundle_to_node_id = []
assert placement_group is not None
bundle_specs = placement_group.bundle_specs
assert bundle_specs is not None
for i, bundle in enumerate(bundle_specs):
Expand Down
Loading