diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx index 4cac74a25..2387c7ef5 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cpython.mem cimport PyMem_Malloc, PyMem_Free -from libc.stdint cimport (intptr_t, +from libc.stdint cimport (intptr_t, uintptr_t, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t,) from libcpp cimport bool as cpp_bool @@ -11,13 +11,15 @@ from libcpp.complex cimport complex as cpp_complex from libcpp cimport nullptr from libcpp cimport vector +from cuda.bindings cimport cydriver +from cuda.core.experimental._memoryview cimport _MDSPAN + import ctypes import numpy from cuda.core.experimental._memory import Buffer from cuda.core.experimental._utils.cuda_utils import driver -from cuda.bindings cimport cydriver ctypedef cpp_complex.complex[float] cpp_single_complex @@ -296,6 +298,12 @@ cdef class ParamHolder: elif arg_type is complex: prepare_arg[cpp_double_complex](self.data, self.data_addresses, arg, i) continue + elif arg_type is _MDSPAN: + # The mdspan struct is allocated on the host and owned by the CuPy mdspan object. + # We pass a pointer to the struct so the driver can copy it by value to the kernel. + # Access _ptr at C level to avoid creating a temporary Python object. + self.data_addresses[i] = ((<_MDSPAN>arg)._ptr) + continue not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i) if not_prepared: diff --git a/cuda_core/cuda/core/experimental/_memoryview.pxd b/cuda_core/cuda/core/experimental/_memoryview.pxd new file mode 100644 index 000000000..746f4c8db --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memoryview.pxd @@ -0,0 +1,10 @@ +from libc.stdint cimport uintptr_t + + +cdef class _MDSPAN: + cdef: + # this must be a pointer to a host mdspan object + readonly uintptr_t _ptr + # if the host mdspan is exported from any Python object, + # we need to keep a reference to that object alive + readonly object _exporting_obj diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 5b7066114..e8a24e539 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -2,7 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport uintptr_t + from ._dlpack cimport * +from cuda.core.experimental._utils cimport cuda_utils import functools import warnings @@ -11,12 +14,26 @@ from typing import Optional import numpy from cuda.core.experimental._utils.cuda_utils import handle_return, driver -from cuda.core.experimental._utils cimport cuda_utils # TODO(leofang): support NumPy structured dtypes +cdef class _MDSPAN: + + def __cinit__(self): + self._ptr = 0 + + def __init__(self, uintptr_t ptr, object obj=None): + self._ptr = ptr + self._exporting_obj = obj + + def __dealloc__(self): + self._ptr = 0 + self._exporting_obj = None + + + cdef class StridedMemoryView: """A dataclass holding metadata of a strided dense array/tensor. @@ -98,6 +115,7 @@ cdef class StridedMemoryView: # this flag helps prevent unnecessary recompuation of _strides bint _strides_init object _dtype + _MDSPAN _mdspan def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None: cdef str clsname = self.__class__.__name__ @@ -224,6 +242,27 @@ cdef class StridedMemoryView: self._dtype = numpy.dtype(self.metadata["typestr"]) return self._dtype + @property + def as_mdspan(self) -> _MDSPAN: + """A C++ mdspan view of the tensor. + + Returns + ------- + mdspan : _MDSPAN + """ + if self._mdspan is None: + arr = self.exporting_obj + module = self.exporting_obj.__class__.__module__.split(".")[0] + if module == "cupy": + mdspan = arr.mdspan + #mdspan = arr.cstruct + self._mdspan = _MDSPAN((mdspan.ptr), mdspan) + else: + raise NotImplementedError( + f"as_mdspan is not implemented for objects from module '{module}'" + ) + return self._mdspan + def __repr__(self): return (f"StridedMemoryView(ptr={self.ptr},\n" + f" shape={self.shape},\n" diff --git a/cuda_core/examples/mdspan_example.py b/cuda_core/examples/mdspan_example.py new file mode 100644 index 000000000..1a146e303 --- /dev/null +++ b/cuda_core/examples/mdspan_example.py @@ -0,0 +1,458 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This demo illustrates how to write a C++ kernel that takes cuda::std::mdspan +# as kernel arguments, JIT-compile it using cuda.core.experimental.Program, +# and prepare input/output CuPy arrays to launch this kernel and verify the result. +# +# NOTE: This is a skeleton/demonstration code that is not yet runnable. +# It is designed to guide the cuda.core design by exploring how mdspan layout +# information should be handled on both the host and device sides. +# +# The example covers three scenarios: +# 1. 2D input/output arrays in C-order (row-major) +# 2. 2D input/output arrays in F-order (column-major) +# 3. 2D input/output arrays with strided access (second axis skipped by 1 step) +# +# ################################################################################ + +import os, sys +import cupy as cp +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch + + +# prepare include +cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) +if cuda_path is None: + print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr) + sys.exit(0) +cuda_include = os.path.join(cuda_path, "include") +assert os.path.isdir(cuda_include) +include_path = [cuda_include] +cccl_include = os.path.join(cuda_include, "cccl") +if os.path.isdir(cccl_include): + include_path.insert(0, cccl_include) + + +# ################################################################################ +# C++ Kernel Code with cuda::std::mdspan +# ################################################################################ + +# This kernel performs element-wise addition on 2D arrays using mdspan. +# mdspan provides a multi-dimensional view over contiguous or strided memory. +code = """ +#include + +// Kernel for element-wise addition of 2D arrays +// Template parameters: +// T: element type (e.g., float, double) +// LayoutPolicy: layout policy (e.g., layout_right for C-order, layout_left for F-order) +template +__global__ void mdspan_add_2d( + cuda::std::mdspan, LayoutPolicy> input1, + cuda::std::mdspan, LayoutPolicy> input2, + cuda::std::mdspan, LayoutPolicy> output +) { + // Calculate global thread indices + size_t row = blockIdx.y * blockDim.y + threadIdx.y; + size_t col = blockIdx.x * blockDim.x + threadIdx.x; + + // Ensure we're within bounds + if (row < input1.extent(0) && col < input1.extent(1)) { + // Perform element-wise addition + // mdspan handles the layout internally + output(row, col) = input1(row, col) + input2(row, col); + } +} + +// Kernel variant for strided mdspan with custom layout +template +__global__ void mdspan_add_2d_strided( + cuda::std::mdspan, cuda::std::layout_stride> input1, + cuda::std::mdspan, cuda::std::layout_stride> input2, + cuda::std::mdspan, cuda::std::layout_stride> output +) { + // Calculate global thread indices + size_t row = blockIdx.y * blockDim.y + threadIdx.y; + size_t col = blockIdx.x * blockDim.x + threadIdx.x; + + // Ensure we're within bounds + if (row < input1.extent(0) && col < input1.extent(1)) { + // Perform element-wise addition + output(row, col) = input1(row, col) + input2(row, col); + } +} +""" + + +# ################################################################################ +# Helper Functions (To Be Implemented) +# ################################################################################ + +def prepare_mdspan_args_c_order(arr, dtype, shape): + """ + Prepare mdspan arguments for C-order (row-major) 2D array. + + TODO: Determine the exact structure of mdspan on device side: + - What information needs to be passed? (data pointer, extents, strides?) + - How should the layout be represented? + - What is the correct argument passing convention? + + Parameters + ---------- + arr : cupy.ndarray + Input CuPy array + dtype : numpy.dtype + Data type of the array + shape : tuple + Shape of the array (rows, cols) + + Returns + ------- + tuple + Arguments to pass to the kernel (needs investigation) + """ + # Placeholder: This needs to be determined based on mdspan layout + # For C-order (row-major): layout_right in mdspan + # Possible arguments: pointer, extent0, extent1, stride0, stride1? + data_ptr = arr.data.ptr + rows, cols = shape + # TODO: Determine if we need to pass strides explicitly + # For C-order: stride0 = cols, stride1 = 1 + return (data_ptr, rows, cols) # Placeholder return + + +def prepare_mdspan_args_f_order(arr, dtype, shape): + """ + Prepare mdspan arguments for F-order (column-major) 2D array. + + TODO: Determine the exact structure of mdspan on device side: + - What information needs to be passed for F-order layout? + - How does layout_left differ from layout_right in argument passing? + + Parameters + ---------- + arr : cupy.ndarray + Input CuPy array + dtype : numpy.dtype + Data type of the array + shape : tuple + Shape of the array (rows, cols) + + Returns + ------- + tuple + Arguments to pass to the kernel (needs investigation) + """ + # Placeholder: This needs to be determined based on mdspan layout + # For F-order (column-major): layout_left in mdspan + # Possible arguments: pointer, extent0, extent1, stride0, stride1? + data_ptr = arr.data.ptr + rows, cols = shape + # TODO: Determine if we need to pass strides explicitly + # For F-order: stride0 = 1, stride1 = rows + return (data_ptr, rows, cols) # Placeholder return + + +def prepare_mdspan_args_strided(arr, dtype, shape, strides): + """ + Prepare mdspan arguments for strided 2D array with custom layout. + + TODO: Determine the exact structure of mdspan with layout_stride: + - How to pass stride information to the kernel? + - What is the argument structure for layout_stride mdspan? + + Parameters + ---------- + arr : cupy.ndarray + Input CuPy array + dtype : numpy.dtype + Data type of the array + shape : tuple + Shape of the array (rows, cols) + strides : tuple + Strides in bytes for each dimension + + Returns + ------- + tuple + Arguments to pass to the kernel (needs investigation) + """ + # Placeholder: This needs to be determined based on mdspan layout + # For custom strides: layout_stride in mdspan + data_ptr = arr.data.ptr + rows, cols = shape + # Convert byte strides to element strides + stride0 = strides[0] // arr.itemsize + stride1 = strides[1] // arr.itemsize + # TODO: Determine the correct argument structure for layout_stride + return (data_ptr, rows, cols, stride0, stride1) # Placeholder return + + +# ################################################################################ +# Example 1: C-order (row-major) 2D arrays +# ################################################################################ + +def example_c_order(): + """Demonstrate mdspan with C-order (row-major) arrays.""" + print("=" * 70) + print("Example 1: C-order (row-major) 2D arrays") + print("=" * 70) + + # Setup device and stream + dev = Device() + dev.set_current() + s = dev.create_stream() + + # Prepare program with C++17 or later for mdspan support + program_options = ProgramOptions( + std="c++17", # mdspan requires C++17 or later + arch=f"sm_{dev.arch}", + include_path=include_path, + ) + prog = Program(code, code_type="c++", options=program_options) + + # Compile the kernel for float type with layout_right (C-order) + # TODO: Determine the correct template instantiation syntax + kernel_name = "mdspan_add_2d" + mod = prog.compile("cubin", name_expressions=(kernel_name,)) + ker = mod.get_kernel(kernel_name) + + # Prepare input/output arrays in C-order + dtype = cp.float32 + shape = (128, 256) # rows x cols + rng = cp.random.default_rng() + + # Create C-order arrays explicitly + input1 = rng.random(shape, dtype=dtype) + input2 = rng.random(shape, dtype=dtype) + output = cp.empty(shape, dtype=dtype, order='C') + + # Verify arrays are in C-order + assert input1.flags['C_CONTIGUOUS'] + assert input2.flags['C_CONTIGUOUS'] + assert output.flags['C_CONTIGUOUS'] + + dev.sync() # Sync CuPy stream + + # TODO: Prepare mdspan kernel arguments + # This is the main unknown: how to pass mdspan from host to device + # Possible approaches: + # 1. Pass pointer + extents + strides separately + # 2. Pass a structure that matches mdspan layout + # 3. Use a helper wrapper that constructs mdspan on device + + # Placeholder argument preparation + args_input1 = prepare_mdspan_args_c_order(input1, dtype, shape) + args_input2 = prepare_mdspan_args_c_order(input2, dtype, shape) + args_output = prepare_mdspan_args_c_order(output, dtype, shape) + + # Prepare launch configuration + block = (16, 16) # 2D thread block + grid = ((shape[1] + block[0] - 1) // block[0], # cols + (shape[0] + block[1] - 1) // block[1]) # rows + config = LaunchConfig(grid=grid, block=block) + + # TODO: Launch kernel with proper mdspan arguments + # launch(s, config, ker, *args_input1, *args_input2, *args_output) + # s.sync() + + # Verify result + # expected = input1 + input2 + # assert cp.allclose(output, expected) + + print("C-order example prepared (not executed)") + print(f" Input1 shape: {input1.shape}, strides: {input1.strides}, order: C") + print(f" Input2 shape: {input2.shape}, strides: {input2.strides}, order: C") + print(f" Output shape: {output.shape}, strides: {output.strides}, order: C") + print(f" Launch grid: {grid}, block: {block}") + print() + + +# ################################################################################ +# Example 2: F-order (column-major) 2D arrays +# ################################################################################ + +def example_f_order(): + """Demonstrate mdspan with F-order (column-major) arrays.""" + print("=" * 70) + print("Example 2: F-order (column-major) 2D arrays") + print("=" * 70) + + # Setup device and stream + dev = Device() + dev.set_current() + s = dev.create_stream() + + # Prepare program + program_options = ProgramOptions( + std="c++17", + arch=f"sm_{dev.arch}", + include_path=include_path, + ) + prog = Program(code, code_type="c++", options=program_options) + + # Compile the kernel for float type with layout_left (F-order) + kernel_name = "mdspan_add_2d" + mod = prog.compile("cubin", name_expressions=(kernel_name,)) + ker = mod.get_kernel(kernel_name) + + # Prepare input/output arrays in F-order + dtype = cp.float32 + shape = (128, 256) # rows x cols + rng = cp.random.default_rng() + + # Create F-order arrays explicitly + input1 = cp.asfortranarray(rng.random(shape, dtype=dtype)) + input2 = cp.asfortranarray(rng.random(shape, dtype=dtype)) + output = cp.empty(shape, dtype=dtype, order='F') + + # Verify arrays are in F-order + assert input1.flags['F_CONTIGUOUS'] + assert input2.flags['F_CONTIGUOUS'] + assert output.flags['F_CONTIGUOUS'] + + dev.sync() # Sync CuPy stream + + # TODO: Prepare mdspan kernel arguments for F-order + args_input1 = prepare_mdspan_args_f_order(input1, dtype, shape) + args_input2 = prepare_mdspan_args_f_order(input2, dtype, shape) + args_output = prepare_mdspan_args_f_order(output, dtype, shape) + + # Prepare launch configuration + block = (16, 16) + grid = ((shape[1] + block[0] - 1) // block[0], + (shape[0] + block[1] - 1) // block[1]) + config = LaunchConfig(grid=grid, block=block) + + # TODO: Launch kernel with proper mdspan arguments + # launch(s, config, ker, *args_input1, *args_input2, *args_output) + # s.sync() + + # Verify result + # expected = input1 + input2 + # assert cp.allclose(output, expected) + + print("F-order example prepared (not executed)") + print(f" Input1 shape: {input1.shape}, strides: {input1.strides}, order: F") + print(f" Input2 shape: {input2.shape}, strides: {input2.strides}, order: F") + print(f" Output shape: {output.shape}, strides: {output.strides}, order: F") + print(f" Launch grid: {grid}, block: {block}") + print() + + +# ################################################################################ +# Example 3: Strided arrays (second axis with step 2, i.e., arr[:, ::2]) +# ################################################################################ + +def example_strided(): + """Demonstrate mdspan with strided arrays.""" + print("=" * 70) + print("Example 3: Strided arrays (second axis skipped by 1 step)") + print("=" * 70) + + # Setup device and stream + dev = Device() + dev.set_current() + s = dev.create_stream() + + # Prepare program + program_options = ProgramOptions( + std="c++17", + arch=f"sm_{dev.arch}", + include_path=include_path, + ) + prog = Program(code, code_type="c++", options=program_options) + + # Compile the kernel for float type with layout_stride + kernel_name = "mdspan_add_2d_strided" + mod = prog.compile("cubin", name_expressions=(kernel_name,)) + ker = mod.get_kernel(kernel_name) + + # Prepare input/output arrays with strided views + dtype = cp.float32 + base_shape = (128, 512) # Base array shape + rng = cp.random.default_rng() + + # Create base arrays in C-order + base_input1 = rng.random(base_shape, dtype=dtype) + base_input2 = rng.random(base_shape, dtype=dtype) + base_output = cp.empty(base_shape, dtype=dtype, order='C') + + # Create strided views: skip every other element in second axis + # arr[:, ::2] means: take all rows, every 2nd column + input1 = base_input1[:, ::2] + input2 = base_input2[:, ::2] + output = base_output[:, ::2] + + # Check the resulting shapes and strides + # Shape should be (128, 256) - half the columns + # Strides will be different from contiguous arrays + assert input1.shape == (128, 256) + assert input2.shape == (128, 256) + assert output.shape == (128, 256) + + dev.sync() # Sync CuPy stream + + print(f" Strided view shape: {input1.shape}") + print(f" Strided view strides (bytes): {input1.strides}") + print(f" Base array strides (bytes): {base_input1.strides}") + print(f" Stride ratio: {input1.strides[1] / dtype().itemsize} elements") + + # TODO: Prepare mdspan kernel arguments for strided layout + args_input1 = prepare_mdspan_args_strided(input1, dtype, input1.shape, input1.strides) + args_input2 = prepare_mdspan_args_strided(input2, dtype, input2.shape, input2.strides) + args_output = prepare_mdspan_args_strided(output, dtype, output.shape, output.strides) + + # Prepare launch configuration + block = (16, 16) + grid = ((input1.shape[1] + block[0] - 1) // block[0], + (input1.shape[0] + block[1] - 1) // block[1]) + config = LaunchConfig(grid=grid, block=block) + + # TODO: Launch kernel with proper mdspan arguments + # launch(s, config, ker, *args_input1, *args_input2, *args_output) + # s.sync() + + # Verify result + # expected = input1 + input2 + # assert cp.allclose(output, expected) + + print("Strided example prepared (not executed)") + print(f" Input1 shape: {input1.shape}, strides: {input1.strides}") + print(f" Input2 shape: {input2.shape}, strides: {input2.strides}") + print(f" Output shape: {output.shape}, strides: {output.strides}") + print(f" Launch grid: {grid}, block: {block}") + print() + + +# ################################################################################ +# Main execution +# ################################################################################ + +if __name__ == "__main__": + print("\n" + "=" * 70) + print("CUDA mdspan Example Skeleton") + print("=" * 70) + print() + print("This is a skeleton/demonstration code to guide cuda.core design.") + print("The main question to answer:") + print(" How does mdspan layout look on the device side?") + print() + print("Key unknowns:") + print(" 1. What arguments to pass to kernels with mdspan parameters?") + print(" 2. How to represent different layouts (C-order, F-order, strided)?") + print(" 3. What is the ABI/calling convention for mdspan arguments?") + print() + + # Run the three examples + example_c_order() + example_f_order() + example_strided() + + print("=" * 70) + print("All examples prepared successfully!") + print("=" * 70) diff --git a/cuda_core/examples/mdspan_verify_args.py b/cuda_core/examples/mdspan_verify_args.py new file mode 100644 index 000000000..bec378e66 --- /dev/null +++ b/cuda_core/examples/mdspan_verify_args.py @@ -0,0 +1,492 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +# ################################################################################ +# +# This demo illustrates how to verify mdspan kernel arguments passed from the host +# using mdspan accessors and printf. This helps understand what data is actually +# being passed to the kernel for different layout types. +# +# NOTE: This is a skeleton/demonstration code that is not yet runnable. +# It is designed to help investigate the mdspan device-side representation. +# +# The example covers three scenarios: +# 1. C-order (layout_right) - prints pointer, extents +# 2. F-order (layout_left) - prints pointer, extents +# 3. Strided (layout_stride) - prints pointer, extents, and explicit strides +# +# ################################################################################ + +import os, sys +import cupy as cp +from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core.experimental.utils import StridedMemoryView + +# prepare include +cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME")) +if cuda_path is None: + print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr) + sys.exit(0) +cuda_include = os.path.join(cuda_path, "include") +assert os.path.isdir(cuda_include) +include_path = [cuda_include] +cccl_include = os.path.join(cuda_include, "cccl") +if os.path.isdir(cccl_include): + include_path.insert(0, cccl_include) + +# ################################################################################ +# C++ Kernel Code for Verifying mdspan Arguments +# ################################################################################ + +# Verification kernels that print mdspan properties using printf +code_verify = """ +#include + +// typedef struct { +// void* ptr; +// size_t ext1; +// size_t ext2; +// } mdspan_view_t; +// +// +// // Kernel to verify layout_right (C-order) mdspan arguments +// template +// __global__ void verify_mdspan_layout_right( +// mdspan_view_t arr +// ) { +// // Only thread 0 prints to avoid cluttered output +// if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) { +// printf("=== layout_right (C-order) mdspan ===\\n"); +// printf("sizeof(mdspan_view_t): %llu\\n", sizeof(arr)); +// printf("view - ptr: %p\\n", reinterpret_cast(&arr)->ptr); +// printf("view2 : %p\\n", *(void**)((char*)(&arr) + 0)); +// printf("view - ext1: %p\\n", reinterpret_cast(&arr)->ext1); +// printf("view - ext2: %p\\n", reinterpret_cast(&arr)->ext2); +// } +// } + +// Kernel to verify layout_right (C-order) mdspan arguments + +typedef struct { + void* ptr; + void* ext1; + void* ext2; +} mdspan_view_t; + +template +__global__ void verify_mdspan_layout_right( + cuda::std::mdspan, cuda::std::layout_right> arr +) { + // Only thread 0 prints to avoid cluttered output + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) { + printf("=== layout_right (C-order) mdspan ===\\n"); + printf("sizeof(mdspan): %llu\\n", sizeof(arr)); + printf("view - ptr: %p\\n", reinterpret_cast(&arr)->ptr); + printf("view2 : %p\\n", (void**)((char*)(&arr) + 0)); + //printf("view - ext1: %llu\\n", *((size_t*)(reinterpret_cast(&arr)->ext1))); + //printf("view - ext2: %llu\\n", *((size_t*)(reinterpret_cast(&arr)->ext2))); + printf("view - ext1: %p\\n", reinterpret_cast(&arr)->ext1); + printf("view - ext2: %p\\n", reinterpret_cast(&arr)->ext2); + + printf("Data pointer: %p\\n", arr.data_handle()); + printf("Data pointer (actual): %p\\n", (void*)((char*)(&arr) + 0)); + printf("Data pointer (actual): %p\\n", addressof(arr)); + printf("Extent 0 (rows): %llu\\n", arr.extent(0)); + printf("Extent 1 (cols): %llu\\n", arr.extent(1)); + printf("Extent 0 (rows) (actual): %llu\\n", (size_t)(*((char*)(&arr) + 8))); + printf("Extent 1 (cols) (actual): %llu\\n", (size_t)(*((char*)(&arr) + 16))); + printf("Size: %zu\\n", arr.size()); + + // For layout_right, strides are implicit but we can query them + printf("Stride 0: %llu\\n", arr.stride(0)); + printf("Stride 1: %llu\\n", arr.stride(1)); + printf("Stride 0 (actual): %llu\\n", (size_t)((char*)(&arr) + 24)); + printf("Stride 1 (actual): %llu\\n", (size_t)((char*)(&arr) + 32)); + + // Verify memory layout: for layout_right (C-order) + // stride(0) should equal extent(1), stride(1) should be 1 + printf("Expected stride(0) = extent(1): %s\\n", + (arr.stride(0) == arr.extent(1)) ? "PASS" : "FAIL"); + printf("Expected stride(1) = 1: %s\\n", + (arr.stride(1) == 1) ? "PASS" : "FAIL"); + + // Test element access + if (arr.extent(0) > 0 && arr.extent(1) > 0) { + printf("First element arr(0,0): %f\\n", static_cast(arr(0, 0))); + } + } +} + +// Kernel to verify layout_left (F-order) mdspan arguments +template +__global__ void verify_mdspan_layout_left( + cuda::std::mdspan, cuda::std::layout_left> arr +) { + // Only thread 0 prints to avoid cluttered output + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) { + printf("=== layout_left (F-order) mdspan ===\\n"); + printf("Data pointer: %p\\n", arr.data_handle()); + printf("Extent 0 (rows): %zu\\n", arr.extent(0)); + printf("Extent 1 (cols): %zu\\n", arr.extent(1)); + printf("Size: %zu\\n", arr.size()); + + // For layout_left, strides are implicit but we can query them + printf("Stride 0: %zu\\n", arr.stride(0)); + printf("Stride 1: %zu\\n", arr.stride(1)); + + // Verify memory layout: for layout_left (F-order) + // stride(0) should be 1, stride(1) should equal extent(0) + printf("Expected stride(0) = 1: %s\\n", + (arr.stride(0) == 1) ? "PASS" : "FAIL"); + printf("Expected stride(1) = extent(0): %s\\n", + (arr.stride(1) == arr.extent(0)) ? "PASS" : "FAIL"); + + // Test element access + if (arr.extent(0) > 0 && arr.extent(1) > 0) { + printf("First element arr(0,0): %f\\n", static_cast(arr(0, 0))); + } + } +} + +// Kernel to verify layout_stride mdspan arguments +template +__global__ void verify_mdspan_layout_stride( + cuda::std::mdspan, cuda::std::layout_stride> arr +) { + // Only thread 0 prints to avoid cluttered output + if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) { + printf("=== layout_stride mdspan ===\\n"); + printf("Data pointer: %p\\n", arr.data_handle()); + printf("Extent 0 (rows): %zu\\n", arr.extent(0)); + printf("Extent 1 (cols): %zu\\n", arr.extent(1)); + printf("Size: %zu\\n", arr.size()); + + // For layout_stride, strides are stored explicitly + printf("Stride 0 (explicit): %zu\\n", arr.stride(0)); + printf("Stride 1 (explicit): %zu\\n", arr.stride(1)); + + // The mapping can be queried + printf("Required span size: %zu\\n", arr.mapping().required_span_size()); + + // Test element access + if (arr.extent(0) > 0 && arr.extent(1) > 0) { + printf("First element arr(0,0): %f\\n", static_cast(arr(0, 0))); + if (arr.extent(1) > 1) { + printf("Second element arr(0,1): %f\\n", static_cast(arr(0, 1))); + } + } + } +} +""" + + +# ################################################################################ +# Helper Functions (To Be Implemented) +# ################################################################################ + +def prepare_mdspan_args_layout_right(arr, dtype, shape): + """ + Prepare mdspan arguments for layout_right (C-order) 2D array. + + TODO: Determine the exact structure needed for kernel launch. + Based on the source code, layout_right::mapping stores {extents} and + mdspan stores {data_handle, mapping, accessor}. + + Parameters + ---------- + arr : cupy.ndarray + Input CuPy array + dtype : numpy.dtype + Data type of the array + shape : tuple + Shape of the array (rows, cols) + + Returns + ------- + tuple + Arguments to pass to the kernel (needs investigation) + """ + #obj = arr.mdspan + #print(f"{hex(obj.ptr)=}, {obj.ptr=}") + #return (obj.ptr,) + + obj = StridedMemoryView(arr, stream_ptr=-1).as_mdspan + print(f"{hex(obj._ptr)=}, {obj._ptr=}, type={type(obj)}") + return (obj,) + + +def prepare_mdspan_args_layout_left(arr, dtype, shape): + """ + Prepare mdspan arguments for layout_left (F-order) 2D array. + + TODO: Determine the exact structure needed for kernel launch. + Based on the source code, layout_left::mapping stores {extents} and + mdspan stores {data_handle, mapping, accessor}. + + Parameters + ---------- + arr : cupy.ndarray + Input CuPy array + dtype : numpy.dtype + Data type of the array + shape : tuple + Shape of the array (rows, cols) + + Returns + ------- + tuple + Arguments to pass to the kernel (needs investigation) + """ + data_ptr = arr.data.ptr + rows, cols = shape + # TODO: Determine exact argument structure + return (data_ptr, rows, cols) + + +def prepare_mdspan_args_layout_stride(arr, dtype, shape, strides): + """ + Prepare mdspan arguments for layout_stride 2D array. + + TODO: Determine the exact structure needed for kernel launch. + Based on the source code, layout_stride::mapping stores {extents, stride_array} + and mdspan stores {data_handle, mapping, accessor}. + + Parameters + ---------- + arr : cupy.ndarray + Input CuPy array + dtype : numpy.dtype + Data type of the array + shape : tuple + Shape of the array (rows, cols) + strides : tuple + Strides in bytes for each dimension + + Returns + ------- + tuple + Arguments to pass to the kernel (needs investigation) + """ + data_ptr = arr.data.ptr + rows, cols = shape + # Convert byte strides to element strides + stride0 = strides[0] // arr.itemsize + stride1 = strides[1] // arr.itemsize + # TODO: Determine exact argument structure + return (data_ptr, rows, cols, stride0, stride1) + + +# ################################################################################ +# Example 1: Verify layout_right (C-order) mdspan +# ################################################################################ + +def verify_layout_right(): + """Verify layout_right (C-order) mdspan arguments.""" + print("=" * 70) + print("Verifying layout_right (C-order) mdspan") + print("=" * 70) + + # Setup device and stream + dev = Device() + dev.set_current() + s = dev.create_stream() + + # Prepare program with C++17 or later for mdspan support + program_options = ProgramOptions( + std="c++17", + arch=f"sm_{dev.arch}", + include_path=include_path, + ) + prog = Program(code_verify, code_type="c++", options=program_options) + + # Compile the verification kernel for float type with layout_right + kernel_name = "verify_mdspan_layout_right" + mod = prog.compile("cubin", name_expressions=(kernel_name,)) + ker = mod.get_kernel(kernel_name) + + # Prepare test array in C-order + dtype = cp.float32 + shape = (4, 8) # Small array for testing + + # Create C-order array with known values + arr = cp.arange(shape[0] * shape[1], dtype=dtype).reshape(shape, order='C') + + # Verify array is in C-order + assert arr.flags['C_CONTIGUOUS'] + print(f"Array pointer: {hex(arr.data.ptr)}") + print(f"Array shape: {arr.shape}") + print(f"Array strides (bytes): {arr.strides}") + print(f"Array strides (elements): ({arr.strides[0]//arr.itemsize}, {arr.strides[1]//arr.itemsize})") + print(f"First element: {arr[0, 0]}") + print() + + dev.sync() # Sync CuPy stream + + # TODO: Prepare mdspan kernel arguments + args = prepare_mdspan_args_layout_right(arr, dtype, shape) + + # Launch kernel (single thread is enough for verification) + config = LaunchConfig(grid=1, block=1) + + # TODO: Launch kernel with proper mdspan arguments + launch(s, config, ker, *args) + s.sync() + + print("Verification kernel prepared (not executed)") + print() + + +# ################################################################################ +# Example 2: Verify layout_left (F-order) mdspan +# ################################################################################ + +def verify_layout_left(): + """Verify layout_left (F-order) mdspan arguments.""" + print("=" * 70) + print("Verifying layout_left (F-order) mdspan") + print("=" * 70) + + # Setup device and stream + dev = Device() + dev.set_current() + s = dev.create_stream() + + # Prepare program + program_options = ProgramOptions( + std="c++17", + arch=f"sm_{dev.arch}", + include_path=include_path, + ) + prog = Program(code_verify, code_type="c++", options=program_options) + + # Compile the verification kernel for float type with layout_left + kernel_name = "verify_mdspan_layout_left" + mod = prog.compile("cubin", name_expressions=(kernel_name,)) + ker = mod.get_kernel(kernel_name) + + # Prepare test array in F-order + dtype = cp.float32 + shape = (4, 8) # Small array for testing + + # Create F-order array with known values + arr = cp.arange(shape[0] * shape[1], dtype=dtype).reshape(shape, order='F') + + # Verify array is in F-order + assert arr.flags['F_CONTIGUOUS'] + + print(f"Array shape: {arr.shape}") + print(f"Array strides (bytes): {arr.strides}") + print(f"Array strides (elements): ({arr.strides[0]//arr.itemsize}, {arr.strides[1]//arr.itemsize})") + print(f"First element: {arr[0, 0]}") + print() + + dev.sync() # Sync CuPy stream + + # TODO: Prepare mdspan kernel arguments + args = prepare_mdspan_args_layout_left(arr, dtype, shape) + + # Launch kernel (single thread is enough for verification) + config = LaunchConfig(grid=1, block=1) + + # TODO: Launch kernel with proper mdspan arguments + # launch(s, config, ker, *args) + # s.sync() + + print("Verification kernel prepared (not executed)") + print() + + +# ################################################################################ +# Example 3: Verify layout_stride mdspan +# ################################################################################ + +def verify_layout_stride(): + """Verify layout_stride mdspan arguments.""" + print("=" * 70) + print("Verifying layout_stride mdspan") + print("=" * 70) + + # Setup device and stream + dev = Device() + dev.set_current() + s = dev.create_stream() + + # Prepare program + program_options = ProgramOptions( + std="c++17", + arch=f"sm_{dev.arch}", + include_path=include_path, + ) + prog = Program(code_verify, code_type="c++", options=program_options) + + # Compile the verification kernel for float type with layout_stride + kernel_name = "verify_mdspan_layout_stride" + mod = prog.compile("cubin", name_expressions=(kernel_name,)) + ker = mod.get_kernel(kernel_name) + + # Prepare test array with strided view + dtype = cp.float32 + base_shape = (4, 16) # Base array shape + + # Create base array in C-order with known values + base_arr = cp.arange(base_shape[0] * base_shape[1], dtype=dtype).reshape(base_shape, order='C') + + # Create strided view: skip every other element in second axis + # arr[:, ::2] means: take all rows, every 2nd column + arr = base_arr[:, ::2] + + print(f"Base array shape: {base_arr.shape}") + print(f"Base array strides (bytes): {base_arr.strides}") + print(f"Strided view shape: {arr.shape}") + print(f"Strided view strides (bytes): {arr.strides}") + print(f"Strided view strides (elements): ({arr.strides[0]//arr.itemsize}, {arr.strides[1]//arr.itemsize})") + print(f"First element: {arr[0, 0]}") + print(f"Second element: {arr[0, 1]}") + print() + + dev.sync() # Sync CuPy stream + + # TODO: Prepare mdspan kernel arguments + args = prepare_mdspan_args_layout_stride(arr, dtype, arr.shape, arr.strides) + + # Launch kernel (single thread is enough for verification) + config = LaunchConfig(grid=1, block=1) + + # TODO: Launch kernel with proper mdspan arguments + # launch(s, config, ker, *args) + # s.sync() + + print("Verification kernel prepared (not executed)") + print() + + +# ################################################################################ +# Main execution +# ################################################################################ + +if __name__ == "__main__": + print("\n" + "=" * 70) + print("CUDA mdspan Argument Verification Example") + print("=" * 70) + print() + print("This example demonstrates how to verify mdspan kernel arguments") + print("using printf to inspect:") + print(" - Data pointer address") + print(" - Extents (dimensions)") + print(" - Strides (for layout_stride)") + print() + print("Key investigation points:") + print(" 1. What is the actual parameter passing mechanism?") + print(" 2. How are extents encoded in the kernel arguments?") + print(" 3. How are strides encoded for layout_stride?") + print() + + # Run the three verification examples + verify_layout_right() + verify_layout_left() + verify_layout_stride() + + print("=" * 70) + print("All verification examples prepared successfully!") + print("=" * 70)