[10c/n] Migrate MoE kernels to torch stable ABI #44565
Conversation
|
|
||
| #include <cuda_bf16.h> | ||
| #include <cuda_runtime.h> | ||
|
|
||
| #include "core/registration.h" | ||
| #include "dsv3_router_gemm_utils.h" |
There was a problem hiding this comment.
delete dsv3_router_gemm_utils.h as it now appears unused by anyone in the repo.
If I am wrong about that and it's used more widely, then it'd be better to move it to stable and make the getSMVersion change in that file.
| } // namespace vllm | ||
|
|
||
| std::tuple<torch::Tensor, torch::Tensor> grouped_topk( | ||
| torch::Tensor const& scores, int64_t n_group, int64_t topk_group, |
There was a problem hiding this comment.
deleting const seems unintentional here?
There was a problem hiding this comment.
The migration tool uses the convention const <type>& instead of <type> const& for any rewrites like this. I checked some other files in vLLM, and there doesn't seem to be a standard convention in this code base for which way to write it (both appear often).
I can change it back (and in other places) if you think that helps ease the migration though.
| auto topk_values = torch::stable::new_empty( | ||
| scores, {num_tokens, topk}, torch::headeronly::ScalarType::Float); | ||
| auto topk_indices = torch::stable::new_empty( | ||
| scores, {num_tokens, topk}, torch::headeronly::ScalarType::Int); |
There was a problem hiding this comment.
these tensors should be on cuda, no?
There was a problem hiding this comment.
torch::stable::new_empty uses the device of scores which should be on cuda. From vllm/vllm/_custom_ops.py,
if not current_platform.is_cuda():
raise NotImplementedError(
"The fused grouped_topk kernel is only available on CUDA platforms"
)
There was a problem hiding this comment.
Should I update this to be more explicit?
| torch::stable::Tensor sorted_token_ids, torch::stable::Tensor experts_ids, | ||
| torch::stable::Tensor num_tokens_post_pad, | ||
| std::optional<torch::stable::Tensor> maybe_expert_map) { | ||
| const torch::stable::accelerator::DeviceGuard device_guard( |
| expert_map.data_ptr<int32_t>(), num_experts, block_size, | ||
| topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1), | ||
| has_expert_map); | ||
| reinterpret_cast<const scalar_t*>(topk_ids.data_ptr()), |
| #include "moeTopKFuncs.cuh" | ||
| #include <c10/cuda/CUDAStream.h> | ||
| #include <torch/all.h> | ||
| #include "moe/moeTopKFuncs.cuh" |
There was a problem hiding this comment.
this file is all stable too, right?
There was a problem hiding this comment.
Yes, moved it to the libtorch_stable directory.
There was a problem hiding this comment.
this file should have changes to become stable
| experts_per_warp, block_size, topk_ids.numel(), | ||
| cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0), | ||
| topk_ids.size(1), has_expert_map); | ||
| reinterpret_cast<const scalar_t*>(topk_ids.data_ptr()), |
There was a problem hiding this comment.
same here, const or mutable
| namespace { | ||
|
|
||
| inline int getSMVersion() { | ||
| auto* props = get_device_prop(); |
There was a problem hiding this comment.
This and only this used to be defined in "dsv3_router_gemm_utils.h"
Signed-off-by: Chris Leonard <chleonar@redhat.com>
|
@Harry-Chen, all of moe has been moved over to the stable ABI. I renamed the library to _moe_c_stable_libtorch.so to emphasize that it is stable, but I can change the name back if I need to. |
|
@Harry-Chen and @janeyx99, to look at the diff for the files that GitHub has marked as deleted/created when really they are just moved, check out the commit a9a466d#diff-ae0d2f513cdf90dbeae0b924311373baacb889d29a201155b02b51b6d023ee51 (ignore |
janeyx99
left a comment
There was a problem hiding this comment.
LGTM, but please check my comments about the headers.
There was a problem hiding this comment.
these are the headers in this now stable file:
#include "quantization/marlin/marlin.cuh"
#include "quantization/marlin/marlin_dtypes.cuh"
#include "core/scalar_type.hpp"
I think the latter two can be moved at least, and the first should be as well?
There was a problem hiding this comment.
These are still used by the kernel in csrc/quantization/marlin. That kernel should be moved in the next PR, so the headers should be moved over then.
There was a problem hiding this comment.
Same question about migrating these:
#include "quantization/marlin/marlin.cuh"
#include "quantization/marlin/marlin_dtypes.cuh"
#include "quantization/marlin/dequant.h"
#include "quantization/marlin/marlin_mma.h"
#include "core/scalar_type.hpp"
| } | ||
| STABLE_TORCH_LIBRARY_IMPL(_moe_C, CUDA, m) { | ||
| m.impl("moe_wna16_marlin_gemm", TORCH_BOX(&moe_wna16_marlin_gemm)); | ||
| } No newline at end of file |
|
|
||
| #include <torch/csrc/stable/tensor.h> | ||
|
|
||
| #include "core/scalar_type.hpp" |
…aderonly::ScalarType::Float8_e8m0fnu Signed-off-by: Chris Leonard <chleonar@redhat.com>
|
@Harry-Chen, the failures were because |
It appears ROCm doesn't support 2.11 in CI yet, which may affect whether we need to handle this migration differently. @Harry-Chen Do you know if there's an ongoing effort to migrate ROCm CI and the timeline for it? |
I think the errors were caused by in |
I do not have information on this either. CC @AndreasKaratzas @tjtanaa who may know something on this |
Hi @Harry-Chen, we are currently giving 2.11 another round of testing after getting some fixes in. We'll bump as soon as we can, hopefully this week, as long as things check out. Here's our latest torch 2.11 build in amd-ci: https://buildkite.com/vllm/amd-ci/builds/9408 |
|
@Harry-Chen, none of these failures seems to be coming from this PR. |
Signed-off-by: Chris Leonard <chleonar@redhat.com> Co-authored-by: Shengqi Chen <harry-chen@outlook.com>
Signed-off-by: Chris Leonard <chleonar@redhat.com> Co-authored-by: Shengqi Chen <harry-chen@outlook.com>
Signed-off-by: Chris Leonard <chleonar@redhat.com> Co-authored-by: Shengqi Chen <harry-chen@outlook.com> Signed-off-by: divineearthly <divineearthly@gmail.com>
Signed-off-by: Chris Leonard <chleonar@redhat.com> Co-authored-by: Shengqi Chen <harry-chen@outlook.com>
Signed-off-by: Chris Leonard <chleonar@redhat.com> Co-authored-by: Shengqi Chen <harry-chen@outlook.com>
Purpose
This PR continues the libtorch stable ABI migration (see #26946) for vLLM MoE CUDA kernels by introducing _moe_C_stable_libtorch and moving all of the MoE ops (topk, align, permute/unpermute, grouped topk, and related headers) into csrc/libtorch_stable/moe/.
Note: I started using the [10x/n] label to indicate that these PRs could be merged in any order (theoretically, there could still be merge conflicts because of the CMakeLists.txt, ops.h, and/or torch_binding.cpp files).
cc @janeyx99 @Harry-Chen
Test Plan
Test Result
Essential Elements of an Effective PR Description Checklist
supported_models.md and examples for a new model. Migration progress using the Audit Python extension torch-abi-audit:
main branch
This branch
moved all of _moe_C to STABLE ABI