Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions tests/multimodal/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -549,3 +549,42 @@ def test_processor_cache_shared_across_loras():

receiver_cache.get_and_update_features([feature_lora_b])
assert feature_lora_b.data == item_data


_SLEEP_VISION_PROMPT = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
"\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
"What is in the image?<|im_end|>\n"
"<|im_start|>assistant\n"
)


@pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="sleep mode regression requires a CUDA GPU",
)
def test_sleep_wake_preserves_mm_cache_consistency():
    """Regression test for vllm-project/vllm#42995.

    Sends the same image prompt before and after a level-1 sleep/wake cycle
    and checks that the post-wake request still yields non-empty text.
    """
    # Imported lazily so that collecting this module does not pull in the
    # engine when the test is skipped.
    from vllm import LLM, SamplingParams
    from vllm.assets.image import ImageAsset

    stop_sign_image = ImageAsset("stop_sign").pil_image
    request = {
        "prompt": _SLEEP_VISION_PROMPT,
        "multi_modal_data": {"image": stop_sign_image},
    }
    greedy_params = SamplingParams(temperature=0, max_tokens=8)

    engine_kwargs = dict(
        model="Qwen/Qwen2-VL-2B-Instruct",
        enable_sleep_mode=True,
        enforce_eager=True,
        gpu_memory_utilization=0.5,
        max_model_len=2048,
    )
    llm = LLM(**engine_kwargs)

    # First pass: submit the image request once before sleeping. The result
    # itself is not inspected.
    llm.generate([request], greedy_params)

    # Sleep/wake cycle between the two identical requests.
    llm.sleep(level=1)
    llm.wake_up()

    # Second pass: the identical request must still produce some text.
    outputs_after_wake = llm.generate([request], greedy_params)
    first_completion = outputs_after_wake[0].outputs[0]
    assert first_completion.text
4 changes: 4 additions & 0 deletions vllm/v1/engine/async_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,8 @@ async def pause_generation(
stacklevel=2,
)
mode = "wait"
if clear_cache:
await self.renderer.clear_mm_cache_async()
Comment thread
wasnertobias marked this conversation as resolved.
await self.engine_core.pause_scheduler_async(mode=mode, clear_cache=clear_cache)
# Small sleep to help ensure that final outputs from any in-flight requests are
# returned prior to this method returning. These outputs come out of the engine
Expand Down Expand Up @@ -927,6 +929,8 @@ async def reset_encoder_cache(self) -> None:
await self.engine_core.reset_encoder_cache_async()

async def sleep(self, level: int = 1, mode: PauseMode = "abort") -> None:
if level >= 1:
await self.renderer.clear_mm_cache_async()
Comment thread
wasnertobias marked this conversation as resolved.
await self.engine_core.sleep_async(level, mode)

if self.logger_manager is not None:
Expand Down
2 changes: 2 additions & 0 deletions vllm/v1/engine/llm_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,8 @@ def reset_encoder_cache(self) -> None:
self.engine_core.reset_encoder_cache()

def sleep(self, level: int = 1, mode: PauseMode = "abort"):
if level >= 1:
self.renderer.clear_mm_cache()
Comment thread
wasnertobias marked this conversation as resolved.
self.engine_core.sleep(level, mode)

if self.logger_manager is not None:
Expand Down
Loading