diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
index 4cac74a25..2387c7ef5 100644
--- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
@@ -3,7 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from cpython.mem cimport PyMem_Malloc, PyMem_Free
-from libc.stdint cimport (intptr_t,
+from libc.stdint cimport (intptr_t, uintptr_t,
                           int8_t, int16_t, int32_t, int64_t,
                           uint8_t, uint16_t, uint32_t, uint64_t,)
 from libcpp cimport bool as cpp_bool
@@ -11,13 +11,15 @@ from libcpp.complex cimport complex as cpp_complex
 from libcpp cimport nullptr
 from libcpp cimport vector
 
+from cuda.bindings cimport cydriver
+from cuda.core.experimental._memoryview cimport _MDSPAN
+
 import ctypes
 
 import numpy
 
 from cuda.core.experimental._memory import Buffer
 from cuda.core.experimental._utils.cuda_utils import driver
-from cuda.bindings cimport cydriver
 
 
 ctypedef cpp_complex.complex[float] cpp_single_complex
@@ -296,6 +298,12 @@ cdef class ParamHolder:
             elif arg_type is complex:
                 prepare_arg[cpp_double_complex](self.data, self.data_addresses, arg, i)
                 continue
+            elif arg_type is _MDSPAN:
+                # The mdspan struct is allocated on the host and owned by the CuPy mdspan object.
+                # We pass a pointer to the struct so the driver can copy it by value to the kernel.
+                # Access _ptr at C level to avoid creating a temporary Python object.
+                self.data_addresses[i] = <void*>((<_MDSPAN>arg)._ptr)
+                continue
 
             not_prepared = prepare_numpy_arg(self.data, self.data_addresses, arg, i)
             if not_prepared:
diff --git a/cuda_core/cuda/core/experimental/_memoryview.pxd b/cuda_core/cuda/core/experimental/_memoryview.pxd
new file mode 100644
index 000000000..746f4c8db
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memoryview.pxd
@@ -0,0 +1,10 @@
+from libc.stdint cimport uintptr_t
+
+
+cdef class _MDSPAN:
+    cdef:
+        # this must be a pointer to a host mdspan object
+        readonly uintptr_t _ptr
+        # if the host mdspan is exported from any Python object,
+        # we need to keep a reference to that object alive
+        readonly object _exporting_obj
diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index 5b7066114..e8a24e539 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -2,7 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+from libc.stdint cimport uintptr_t
+
 from ._dlpack cimport *
+from cuda.core.experimental._utils cimport cuda_utils
 
 import functools
 import warnings
@@ -11,12 +14,26 @@ from typing import Optional
 import numpy
 
 from cuda.core.experimental._utils.cuda_utils import handle_return, driver
-from cuda.core.experimental._utils cimport cuda_utils
 
 
 # TODO(leofang): support NumPy structured dtypes
 
 
+cdef class _MDSPAN:
+
+    def __cinit__(self):
+        self._ptr = 0
+
+    def __init__(self, uintptr_t ptr, object obj=None):
+        self._ptr = ptr
+        self._exporting_obj = obj
+
+    def __dealloc__(self):
+        self._ptr = 0
+        self._exporting_obj = None
+
+
+
 cdef class StridedMemoryView:
     """A dataclass holding metadata of a strided dense array/tensor.
 
@@ -98,6 +115,7 @@ cdef class StridedMemoryView:
         # this flag helps prevent unnecessary recompuation of _strides
         bint _strides_init
         object _dtype
+        _MDSPAN _mdspan
 
     def __init__(self, obj: object = None, stream_ptr: int | None = None) -> None:
         cdef str clsname = self.__class__.__name__
@@ -224,6 +242,27 @@ cdef class StridedMemoryView:
                     self._dtype = numpy.dtype(self.metadata["typestr"])
         return self._dtype
 
+    @property
+    def as_mdspan(self) -> _MDSPAN:
+        """A C++ mdspan view of the tensor.
+
+        Returns
+        -------
+        mdspan : _MDSPAN
+        """
+        if self._mdspan is None:
+            arr = self.exporting_obj
+            module = self.exporting_obj.__class__.__module__.split(".")[0]
+            if module == "cupy":
+                mdspan = arr.mdspan
+                #mdspan = arr.cstruct
+                self._mdspan = _MDSPAN(<uintptr_t>(mdspan.ptr), mdspan)
+            else:
+                raise NotImplementedError(
+                    f"as_mdspan is not implemented for objects from module '{module}'"
+                )
+        return self._mdspan
+
     def __repr__(self):
         return (f"StridedMemoryView(ptr={self.ptr},\n"
               + f"                  shape={self.shape},\n"
diff --git a/cuda_core/examples/mdspan_example.py b/cuda_core/examples/mdspan_example.py
new file mode 100644
index 000000000..1a146e303
--- /dev/null
+++ b/cuda_core/examples/mdspan_example.py
@@ -0,0 +1,458 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo illustrates how to write a C++ kernel that takes cuda::std::mdspan
+# as kernel arguments, JIT-compile it using cuda.core.experimental.Program,
+# and prepare input/output CuPy arrays to launch this kernel and verify the result.
+#
+# NOTE: This is a skeleton/demonstration code that is not yet runnable.
+# It is designed to guide the cuda.core design by exploring how mdspan layout
+# information should be handled on both the host and device sides.
+#
+# The example covers three scenarios:
+# 1. 2D input/output arrays in C-order (row-major)
+# 2. 2D input/output arrays in F-order (column-major)
+# 3. 2D input/output arrays with strided access (second axis skipped by 1 step)
+#
+# ################################################################################
+
+import os, sys
+import cupy as cp
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+
+
+# prepare include
+cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
+if cuda_path is None:
+    print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr)
+    sys.exit(0)
+cuda_include = os.path.join(cuda_path, "include")
+assert os.path.isdir(cuda_include)
+include_path = [cuda_include]
+cccl_include = os.path.join(cuda_include, "cccl")
+if os.path.isdir(cccl_include):
+    include_path.insert(0, cccl_include)
+
+
+# ################################################################################
+# C++ Kernel Code with cuda::std::mdspan
+# ################################################################################
+
+# This kernel performs element-wise addition on 2D arrays using mdspan.
+# mdspan provides a multi-dimensional view over contiguous or strided memory.
+code = """
+#include <cuda/std/mdspan>
+
+// Kernel for element-wise addition of 2D arrays
+// Template parameters:
+//   T: element type (e.g., float, double)
+//   LayoutPolicy: layout policy (e.g., layout_right for C-order, layout_left for F-order)
+template<typename T, typename LayoutPolicy>
+__global__ void mdspan_add_2d(
+    cuda::std::mdspan<const T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, LayoutPolicy> input1,
+    cuda::std::mdspan<const T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, LayoutPolicy> input2,
+    cuda::std::mdspan<T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, LayoutPolicy> output
+) {
+    // Calculate global thread indices
+    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+    size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    // Ensure we're within bounds
+    if (row < input1.extent(0) && col < input1.extent(1)) {
+        // Perform element-wise addition
+        // mdspan handles the layout internally
+        output(row, col) = input1(row, col) + input2(row, col);
+    }
+}
+
+// Kernel variant for strided mdspan with custom layout
+template<typename T>
+__global__ void mdspan_add_2d_strided(
+    cuda::std::mdspan<const T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, cuda::std::layout_stride> input1,
+    cuda::std::mdspan<const T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, cuda::std::layout_stride> input2,
+    cuda::std::mdspan<T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, cuda::std::layout_stride> output
+) {
+    // Calculate global thread indices
+    size_t row = blockIdx.y * blockDim.y + threadIdx.y;
+    size_t col = blockIdx.x * blockDim.x + threadIdx.x;
+    
+    // Ensure we're within bounds
+    if (row < input1.extent(0) && col < input1.extent(1)) {
+        // Perform element-wise addition
+        output(row, col) = input1(row, col) + input2(row, col);
+    }
+}
+"""
+
+
+# ################################################################################
+# Helper Functions (To Be Implemented)
+# ################################################################################
+
+def prepare_mdspan_args_c_order(arr, dtype, shape):
+    """
+    Prepare mdspan arguments for C-order (row-major) 2D array.
+    
+    TODO: Determine the exact structure of mdspan on device side:
+    - What information needs to be passed? (data pointer, extents, strides?)
+    - How should the layout be represented?
+    - What is the correct argument passing convention?
+    
+    Parameters
+    ----------
+    arr : cupy.ndarray
+        Input CuPy array
+    dtype : numpy.dtype
+        Data type of the array
+    shape : tuple
+        Shape of the array (rows, cols)
+    
+    Returns
+    -------
+    tuple
+        Arguments to pass to the kernel (needs investigation)
+    """
+    # Placeholder: This needs to be determined based on mdspan layout
+    # For C-order (row-major): layout_right in mdspan
+    # Possible arguments: pointer, extent0, extent1, stride0, stride1?
+    data_ptr = arr.data.ptr
+    rows, cols = shape
+    # TODO: Determine if we need to pass strides explicitly
+    # For C-order: stride0 = cols, stride1 = 1
+    return (data_ptr, rows, cols)  # Placeholder return
+
+
+def prepare_mdspan_args_f_order(arr, dtype, shape):
+    """
+    Prepare mdspan arguments for F-order (column-major) 2D array.
+    
+    TODO: Determine the exact structure of mdspan on device side:
+    - What information needs to be passed for F-order layout?
+    - How does layout_left differ from layout_right in argument passing?
+    
+    Parameters
+    ----------
+    arr : cupy.ndarray
+        Input CuPy array
+    dtype : numpy.dtype
+        Data type of the array
+    shape : tuple
+        Shape of the array (rows, cols)
+    
+    Returns
+    -------
+    tuple
+        Arguments to pass to the kernel (needs investigation)
+    """
+    # Placeholder: This needs to be determined based on mdspan layout
+    # For F-order (column-major): layout_left in mdspan
+    # Possible arguments: pointer, extent0, extent1, stride0, stride1?
+    data_ptr = arr.data.ptr
+    rows, cols = shape
+    # TODO: Determine if we need to pass strides explicitly
+    # For F-order: stride0 = 1, stride1 = rows
+    return (data_ptr, rows, cols)  # Placeholder return
+
+
+def prepare_mdspan_args_strided(arr, dtype, shape, strides):
+    """
+    Prepare mdspan arguments for strided 2D array with custom layout.
+    
+    TODO: Determine the exact structure of mdspan with layout_stride:
+    - How to pass stride information to the kernel?
+    - What is the argument structure for layout_stride mdspan?
+    
+    Parameters
+    ----------
+    arr : cupy.ndarray
+        Input CuPy array
+    dtype : numpy.dtype
+        Data type of the array
+    shape : tuple
+        Shape of the array (rows, cols)
+    strides : tuple
+        Strides in bytes for each dimension
+    
+    Returns
+    -------
+    tuple
+        Arguments to pass to the kernel (needs investigation)
+    """
+    # Placeholder: This needs to be determined based on mdspan layout
+    # For custom strides: layout_stride in mdspan
+    data_ptr = arr.data.ptr
+    rows, cols = shape
+    # Convert byte strides to element strides
+    stride0 = strides[0] // arr.itemsize
+    stride1 = strides[1] // arr.itemsize
+    # TODO: Determine the correct argument structure for layout_stride
+    return (data_ptr, rows, cols, stride0, stride1)  # Placeholder return
+
+
+# ################################################################################
+# Example 1: C-order (row-major) 2D arrays
+# ################################################################################
+
+def example_c_order():
+    """Demonstrate mdspan with C-order (row-major) arrays."""
+    print("=" * 70)
+    print("Example 1: C-order (row-major) 2D arrays")
+    print("=" * 70)
+    
+    # Setup device and stream
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
+    
+    # Prepare program with C++17 or later for mdspan support
+    program_options = ProgramOptions(
+        std="c++17",  # mdspan requires C++17 or later
+        arch=f"sm_{dev.arch}",
+        include_path=include_path,
+    )
+    prog = Program(code, code_type="c++", options=program_options)
+    
+    # Compile the kernel for float type with layout_right (C-order)
+    # TODO: Determine the correct template instantiation syntax
+    kernel_name = "mdspan_add_2d<float, cuda::std::layout_right>"
+    mod = prog.compile("cubin", name_expressions=(kernel_name,))
+    ker = mod.get_kernel(kernel_name)
+    
+    # Prepare input/output arrays in C-order
+    dtype = cp.float32
+    shape = (128, 256)  # rows x cols
+    rng = cp.random.default_rng()
+    
+    # Create C-order arrays explicitly
+    input1 = rng.random(shape, dtype=dtype)
+    input2 = rng.random(shape, dtype=dtype)
+    output = cp.empty(shape, dtype=dtype, order='C')
+    
+    # Verify arrays are in C-order
+    assert input1.flags['C_CONTIGUOUS']
+    assert input2.flags['C_CONTIGUOUS']
+    assert output.flags['C_CONTIGUOUS']
+    
+    dev.sync()  # Sync CuPy stream
+    
+    # TODO: Prepare mdspan kernel arguments
+    # This is the main unknown: how to pass mdspan from host to device
+    # Possible approaches:
+    # 1. Pass pointer + extents + strides separately
+    # 2. Pass a structure that matches mdspan layout
+    # 3. Use a helper wrapper that constructs mdspan on device
+    
+    # Placeholder argument preparation
+    args_input1 = prepare_mdspan_args_c_order(input1, dtype, shape)
+    args_input2 = prepare_mdspan_args_c_order(input2, dtype, shape)
+    args_output = prepare_mdspan_args_c_order(output, dtype, shape)
+    
+    # Prepare launch configuration
+    block = (16, 16)  # 2D thread block
+    grid = ((shape[1] + block[0] - 1) // block[0],  # cols
+            (shape[0] + block[1] - 1) // block[1])  # rows
+    config = LaunchConfig(grid=grid, block=block)
+    
+    # TODO: Launch kernel with proper mdspan arguments
+    # launch(s, config, ker, *args_input1, *args_input2, *args_output)
+    # s.sync()
+    
+    # Verify result
+    # expected = input1 + input2
+    # assert cp.allclose(output, expected)
+    
+    print("C-order example prepared (not executed)")
+    print(f"  Input1 shape: {input1.shape}, strides: {input1.strides}, order: C")
+    print(f"  Input2 shape: {input2.shape}, strides: {input2.strides}, order: C")
+    print(f"  Output shape: {output.shape}, strides: {output.strides}, order: C")
+    print(f"  Launch grid: {grid}, block: {block}")
+    print()
+
+
+# ################################################################################
+# Example 2: F-order (column-major) 2D arrays
+# ################################################################################
+
+def example_f_order():
+    """Demonstrate mdspan with F-order (column-major) arrays."""
+    print("=" * 70)
+    print("Example 2: F-order (column-major) 2D arrays")
+    print("=" * 70)
+    
+    # Setup device and stream
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
+    
+    # Prepare program
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{dev.arch}",
+        include_path=include_path,
+    )
+    prog = Program(code, code_type="c++", options=program_options)
+    
+    # Compile the kernel for float type with layout_left (F-order)
+    kernel_name = "mdspan_add_2d<float, cuda::std::layout_left>"
+    mod = prog.compile("cubin", name_expressions=(kernel_name,))
+    ker = mod.get_kernel(kernel_name)
+    
+    # Prepare input/output arrays in F-order
+    dtype = cp.float32
+    shape = (128, 256)  # rows x cols
+    rng = cp.random.default_rng()
+    
+    # Create F-order arrays explicitly
+    input1 = cp.asfortranarray(rng.random(shape, dtype=dtype))
+    input2 = cp.asfortranarray(rng.random(shape, dtype=dtype))
+    output = cp.empty(shape, dtype=dtype, order='F')
+    
+    # Verify arrays are in F-order
+    assert input1.flags['F_CONTIGUOUS']
+    assert input2.flags['F_CONTIGUOUS']
+    assert output.flags['F_CONTIGUOUS']
+    
+    dev.sync()  # Sync CuPy stream
+    
+    # TODO: Prepare mdspan kernel arguments for F-order
+    args_input1 = prepare_mdspan_args_f_order(input1, dtype, shape)
+    args_input2 = prepare_mdspan_args_f_order(input2, dtype, shape)
+    args_output = prepare_mdspan_args_f_order(output, dtype, shape)
+    
+    # Prepare launch configuration
+    block = (16, 16)
+    grid = ((shape[1] + block[0] - 1) // block[0],
+            (shape[0] + block[1] - 1) // block[1])
+    config = LaunchConfig(grid=grid, block=block)
+    
+    # TODO: Launch kernel with proper mdspan arguments
+    # launch(s, config, ker, *args_input1, *args_input2, *args_output)
+    # s.sync()
+    
+    # Verify result
+    # expected = input1 + input2
+    # assert cp.allclose(output, expected)
+    
+    print("F-order example prepared (not executed)")
+    print(f"  Input1 shape: {input1.shape}, strides: {input1.strides}, order: F")
+    print(f"  Input2 shape: {input2.shape}, strides: {input2.strides}, order: F")
+    print(f"  Output shape: {output.shape}, strides: {output.strides}, order: F")
+    print(f"  Launch grid: {grid}, block: {block}")
+    print()
+
+
+# ################################################################################
+# Example 3: Strided arrays (second axis with step 2, i.e., arr[:, ::2])
+# ################################################################################
+
+def example_strided():
+    """Demonstrate mdspan with strided arrays."""
+    print("=" * 70)
+    print("Example 3: Strided arrays (second axis skipped by 1 step)")
+    print("=" * 70)
+    
+    # Setup device and stream
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
+    
+    # Prepare program
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{dev.arch}",
+        include_path=include_path,
+    )
+    prog = Program(code, code_type="c++", options=program_options)
+    
+    # Compile the kernel for float type with layout_stride
+    kernel_name = "mdspan_add_2d_strided<float>"
+    mod = prog.compile("cubin", name_expressions=(kernel_name,))
+    ker = mod.get_kernel(kernel_name)
+    
+    # Prepare input/output arrays with strided views
+    dtype = cp.float32
+    base_shape = (128, 512)  # Base array shape
+    rng = cp.random.default_rng()
+    
+    # Create base arrays in C-order
+    base_input1 = rng.random(base_shape, dtype=dtype)
+    base_input2 = rng.random(base_shape, dtype=dtype)
+    base_output = cp.empty(base_shape, dtype=dtype, order='C')
+    
+    # Create strided views: skip every other element in second axis
+    # arr[:, ::2] means: take all rows, every 2nd column
+    input1 = base_input1[:, ::2]
+    input2 = base_input2[:, ::2]
+    output = base_output[:, ::2]
+    
+    # Check the resulting shapes and strides
+    # Shape should be (128, 256) - half the columns
+    # Strides will be different from contiguous arrays
+    assert input1.shape == (128, 256)
+    assert input2.shape == (128, 256)
+    assert output.shape == (128, 256)
+    
+    dev.sync()  # Sync CuPy stream
+    
+    print(f"  Strided view shape: {input1.shape}")
+    print(f"  Strided view strides (bytes): {input1.strides}")
+    print(f"  Base array strides (bytes): {base_input1.strides}")
+    print(f"  Stride ratio: {input1.strides[1] / dtype().itemsize} elements")
+    
+    # TODO: Prepare mdspan kernel arguments for strided layout
+    args_input1 = prepare_mdspan_args_strided(input1, dtype, input1.shape, input1.strides)
+    args_input2 = prepare_mdspan_args_strided(input2, dtype, input2.shape, input2.strides)
+    args_output = prepare_mdspan_args_strided(output, dtype, output.shape, output.strides)
+    
+    # Prepare launch configuration
+    block = (16, 16)
+    grid = ((input1.shape[1] + block[0] - 1) // block[0],
+            (input1.shape[0] + block[1] - 1) // block[1])
+    config = LaunchConfig(grid=grid, block=block)
+    
+    # TODO: Launch kernel with proper mdspan arguments
+    # launch(s, config, ker, *args_input1, *args_input2, *args_output)
+    # s.sync()
+    
+    # Verify result
+    # expected = input1 + input2
+    # assert cp.allclose(output, expected)
+    
+    print("Strided example prepared (not executed)")
+    print(f"  Input1 shape: {input1.shape}, strides: {input1.strides}")
+    print(f"  Input2 shape: {input2.shape}, strides: {input2.strides}")
+    print(f"  Output shape: {output.shape}, strides: {output.strides}")
+    print(f"  Launch grid: {grid}, block: {block}")
+    print()
+
+
+# ################################################################################
+# Main execution
+# ################################################################################
+
+if __name__ == "__main__":
+    print("\n" + "=" * 70)
+    print("CUDA mdspan Example Skeleton")
+    print("=" * 70)
+    print()
+    print("This is a skeleton/demonstration code to guide cuda.core design.")
+    print("The main question to answer:")
+    print("  How does mdspan layout look on the device side?")
+    print()
+    print("Key unknowns:")
+    print("  1. What arguments to pass to kernels with mdspan parameters?")
+    print("  2. How to represent different layouts (C-order, F-order, strided)?")
+    print("  3. What is the ABI/calling convention for mdspan arguments?")
+    print()
+    
+    # Run the three examples
+    example_c_order()
+    example_f_order()
+    example_strided()
+    
+    print("=" * 70)
+    print("All examples prepared successfully!")
+    print("=" * 70)
diff --git a/cuda_core/examples/mdspan_verify_args.py b/cuda_core/examples/mdspan_verify_args.py
new file mode 100644
index 000000000..bec378e66
--- /dev/null
+++ b/cuda_core/examples/mdspan_verify_args.py
@@ -0,0 +1,492 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo illustrates how to verify mdspan kernel arguments passed from the host
+# using mdspan accessors and printf. This helps understand what data is actually
+# being passed to the kernel for different layout types.
+#
+# NOTE: This is a skeleton/demonstration code that is not yet runnable.
+# It is designed to help investigate the mdspan device-side representation.
+#
+# The example covers three scenarios:
+# 1. C-order (layout_right) - prints pointer, extents
+# 2. F-order (layout_left) - prints pointer, extents
+# 3. Strided (layout_stride) - prints pointer, extents, and explicit strides
+#
+# ################################################################################
+
+import os, sys
+import cupy as cp
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental.utils import StridedMemoryView
+
+# prepare include
+cuda_path = os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME"))
+if cuda_path is None:
+    print("this demo requires a valid CUDA_PATH environment variable set", file=sys.stderr)
+    sys.exit(0)
+cuda_include = os.path.join(cuda_path, "include")
+assert os.path.isdir(cuda_include)
+include_path = [cuda_include]
+cccl_include = os.path.join(cuda_include, "cccl")
+if os.path.isdir(cccl_include):
+    include_path.insert(0, cccl_include)
+
+# ################################################################################
+# C++ Kernel Code for Verifying mdspan Arguments
+# ################################################################################
+
+# Verification kernels that print mdspan properties using printf
+code_verify = """
+#include <cuda/std/mdspan>
+
+// typedef struct {
+//     void* ptr;
+//     size_t ext1;
+//     size_t ext2;
+// } mdspan_view_t;
+// 
+// 
+// // Kernel to verify layout_right (C-order) mdspan arguments
+// template<typename T>
+// __global__ void verify_mdspan_layout_right(
+//     mdspan_view_t arr
+// ) {
+//     // Only thread 0 prints to avoid cluttered output
+//     if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+//         printf("=== layout_right (C-order) mdspan ===\\n");
+//         printf("sizeof(mdspan_view_t): %llu\\n", sizeof(arr));
+//         printf("view - ptr: %p\\n", reinterpret_cast<mdspan_view_t*>(&arr)->ptr);
+//         printf("view2 : %p\\n", *(void**)((char*)(&arr) + 0));
+//         printf("view - ext1: %p\\n", reinterpret_cast<mdspan_view_t*>(&arr)->ext1);
+//         printf("view - ext2: %p\\n", reinterpret_cast<mdspan_view_t*>(&arr)->ext2);
+//     }
+// }
+
+// Kernel to verify layout_right (C-order) mdspan arguments
+
+typedef struct {
+    void* ptr;
+    void* ext1;
+    void* ext2;
+} mdspan_view_t;
+
+template<typename T>
+__global__ void verify_mdspan_layout_right(
+    cuda::std::mdspan<T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, cuda::std::layout_right> arr
+) {
+    // Only thread 0 prints to avoid cluttered output
+    if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        printf("=== layout_right (C-order) mdspan ===\\n");
+        printf("sizeof(mdspan): %llu\\n", sizeof(arr));
+        printf("view - ptr: %p\\n", reinterpret_cast<mdspan_view_t*>(&arr)->ptr);
+        printf("view2 : %p\\n", (void**)((char*)(&arr) + 0));
+        //printf("view - ext1: %llu\\n", *((size_t*)(reinterpret_cast<mdspan_view_t*>(&arr)->ext1)));
+        //printf("view - ext2: %llu\\n", *((size_t*)(reinterpret_cast<mdspan_view_t*>(&arr)->ext2)));
+        printf("view - ext1: %p\\n", reinterpret_cast<mdspan_view_t*>(&arr)->ext1);
+        printf("view - ext2: %p\\n", reinterpret_cast<mdspan_view_t*>(&arr)->ext2);
+
+        printf("Data pointer: %p\\n", arr.data_handle());
+        printf("Data pointer (actual): %p\\n", (void*)((char*)(&arr) + 0));
+        printf("Data pointer (actual): %p\\n", addressof(arr));
+        printf("Extent 0 (rows): %llu\\n", arr.extent(0));
+        printf("Extent 1 (cols): %llu\\n", arr.extent(1));
+        printf("Extent 0 (rows) (actual): %llu\\n", (size_t)(*((char*)(&arr) + 8)));
+        printf("Extent 1 (cols) (actual): %llu\\n", (size_t)(*((char*)(&arr) + 16)));
+        printf("Size: %zu\\n", arr.size());
+        
+        // For layout_right, strides are implicit but we can query them
+        printf("Stride 0: %llu\\n", arr.stride(0));
+        printf("Stride 1: %llu\\n", arr.stride(1));
+        printf("Stride 0 (actual): %llu\\n", (size_t)((char*)(&arr) + 24));
+        printf("Stride 1 (actual): %llu\\n", (size_t)((char*)(&arr) + 32));
+        
+        // Verify memory layout: for layout_right (C-order)
+        // stride(0) should equal extent(1), stride(1) should be 1
+        printf("Expected stride(0) = extent(1): %s\\n", 
+               (arr.stride(0) == arr.extent(1)) ? "PASS" : "FAIL");
+        printf("Expected stride(1) = 1: %s\\n", 
+               (arr.stride(1) == 1) ? "PASS" : "FAIL");
+        
+        // Test element access
+        if (arr.extent(0) > 0 && arr.extent(1) > 0) {
+            printf("First element arr(0,0): %f\\n", static_cast<float>(arr(0, 0)));
+        }
+    }
+}
+
+// Kernel to verify layout_left (F-order) mdspan arguments
+template<typename T>
+__global__ void verify_mdspan_layout_left(
+    cuda::std::mdspan<T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, cuda::std::layout_left> arr
+) {
+    // Only thread 0 prints to avoid cluttered output
+    if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        printf("=== layout_left (F-order) mdspan ===\\n");
+        printf("Data pointer: %p\\n", arr.data_handle());
+        printf("Extent 0 (rows): %zu\\n", arr.extent(0));
+        printf("Extent 1 (cols): %zu\\n", arr.extent(1));
+        printf("Size: %zu\\n", arr.size());
+        
+        // For layout_left, strides are implicit but we can query them
+        printf("Stride 0: %zu\\n", arr.stride(0));
+        printf("Stride 1: %zu\\n", arr.stride(1));
+        
+        // Verify memory layout: for layout_left (F-order)
+        // stride(0) should be 1, stride(1) should equal extent(0)
+        printf("Expected stride(0) = 1: %s\\n", 
+               (arr.stride(0) == 1) ? "PASS" : "FAIL");
+        printf("Expected stride(1) = extent(0): %s\\n", 
+               (arr.stride(1) == arr.extent(0)) ? "PASS" : "FAIL");
+        
+        // Test element access
+        if (arr.extent(0) > 0 && arr.extent(1) > 0) {
+            printf("First element arr(0,0): %f\\n", static_cast<float>(arr(0, 0)));
+        }
+    }
+}
+
+// Kernel to verify layout_stride mdspan arguments
+template<typename T>
+__global__ void verify_mdspan_layout_stride(
+    cuda::std::mdspan<T, cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>, cuda::std::layout_stride> arr
+) {
+    // Only thread 0 prints to avoid cluttered output
+    if (threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0) {
+        printf("=== layout_stride mdspan ===\\n");
+        printf("Data pointer: %p\\n", arr.data_handle());
+        printf("Extent 0 (rows): %zu\\n", arr.extent(0));
+        printf("Extent 1 (cols): %zu\\n", arr.extent(1));
+        printf("Size: %zu\\n", arr.size());
+        
+        // For layout_stride, strides are stored explicitly
+        printf("Stride 0 (explicit): %zu\\n", arr.stride(0));
+        printf("Stride 1 (explicit): %zu\\n", arr.stride(1));
+        
+        // The mapping can be queried
+        printf("Required span size: %zu\\n", arr.mapping().required_span_size());
+        
+        // Test element access
+        if (arr.extent(0) > 0 && arr.extent(1) > 0) {
+            printf("First element arr(0,0): %f\\n", static_cast<float>(arr(0, 0)));
+            if (arr.extent(1) > 1) {
+                printf("Second element arr(0,1): %f\\n", static_cast<float>(arr(0, 1)));
+            }
+        }
+    }
+}
+"""
+
+
+# ################################################################################
+# Helper Functions (To Be Implemented)
+# ################################################################################
+
+def prepare_mdspan_args_layout_right(arr, dtype, shape):
+    """
+    Prepare mdspan arguments for layout_right (C-order) 2D array.
+    
+    TODO: Determine the exact structure needed for kernel launch.
+    Based on the source code, layout_right::mapping stores {extents} and
+    mdspan stores {data_handle, mapping, accessor}.
+    
+    Parameters
+    ----------
+    arr : cupy.ndarray
+        Input CuPy array
+    dtype : numpy.dtype
+        Data type of the array
+    shape : tuple
+        Shape of the array (rows, cols)
+    
+    Returns
+    -------
+    tuple
+        Arguments to pass to the kernel (needs investigation)
+    """
+    #obj = arr.mdspan
+    #print(f"{hex(obj.ptr)=}, {obj.ptr=}")
+    #return (obj.ptr,)
+
+    obj = StridedMemoryView(arr, stream_ptr=-1).as_mdspan
+    print(f"{hex(obj._ptr)=}, {obj._ptr=}, type={type(obj)}")
+    return (obj,)
+
+
+def prepare_mdspan_args_layout_left(arr, dtype, shape):
+    """
+    Prepare mdspan arguments for layout_left (F-order) 2D array.
+    
+    TODO: Determine the exact structure needed for kernel launch.
+    Based on the source code, layout_left::mapping stores {extents} and
+    mdspan stores {data_handle, mapping, accessor}.
+    
+    Parameters
+    ----------
+    arr : cupy.ndarray
+        Input CuPy array
+    dtype : numpy.dtype
+        Data type of the array
+    shape : tuple
+        Shape of the array (rows, cols)
+    
+    Returns
+    -------
+    tuple
+        Arguments to pass to the kernel (needs investigation)
+    """
+    data_ptr = arr.data.ptr
+    rows, cols = shape
+    # TODO: Determine exact argument structure
+    return (data_ptr, rows, cols)
+
+
+def prepare_mdspan_args_layout_stride(arr, dtype, shape, strides):
+    """
+    Prepare mdspan arguments for layout_stride 2D array.
+    
+    TODO: Determine the exact structure needed for kernel launch.
+    Based on the source code, layout_stride::mapping stores {extents, stride_array}
+    and mdspan stores {data_handle, mapping, accessor}.
+    
+    Parameters
+    ----------
+    arr : cupy.ndarray
+        Input CuPy array
+    dtype : numpy.dtype
+        Data type of the array
+    shape : tuple
+        Shape of the array (rows, cols)
+    strides : tuple
+        Strides in bytes for each dimension
+    
+    Returns
+    -------
+    tuple
+        Arguments to pass to the kernel (needs investigation)
+    """
+    data_ptr = arr.data.ptr
+    rows, cols = shape
+    # Convert byte strides to element strides
+    stride0 = strides[0] // arr.itemsize
+    stride1 = strides[1] // arr.itemsize
+    # TODO: Determine exact argument structure
+    return (data_ptr, rows, cols, stride0, stride1)
+
+
+# ################################################################################
+# Example 1: Verify layout_right (C-order) mdspan
+# ################################################################################
+
+def verify_layout_right():
+    """Verify layout_right (C-order) mdspan arguments."""
+    print("=" * 70)
+    print("Verifying layout_right (C-order) mdspan")
+    print("=" * 70)
+    
+    # Setup device and stream
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
+    
+    # Prepare program with C++17 or later for mdspan support
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{dev.arch}",
+        include_path=include_path,
+    )
+    prog = Program(code_verify, code_type="c++", options=program_options)
+    
+    # Compile the verification kernel for float type with layout_right
+    kernel_name = "verify_mdspan_layout_right<float>"
+    mod = prog.compile("cubin", name_expressions=(kernel_name,))
+    ker = mod.get_kernel(kernel_name)
+    
+    # Prepare test array in C-order
+    dtype = cp.float32
+    shape = (4, 8)  # Small array for testing
+    
+    # Create C-order array with known values
+    arr = cp.arange(shape[0] * shape[1], dtype=dtype).reshape(shape, order='C')
+    
+    # Verify array is in C-order
+    assert arr.flags['C_CONTIGUOUS']
+    print(f"Array pointer: {hex(arr.data.ptr)}")
+    print(f"Array shape: {arr.shape}")
+    print(f"Array strides (bytes): {arr.strides}")
+    print(f"Array strides (elements): ({arr.strides[0]//arr.itemsize}, {arr.strides[1]//arr.itemsize})")
+    print(f"First element: {arr[0, 0]}")
+    print()
+    
+    dev.sync()  # Sync CuPy stream
+    
+    # TODO: Prepare mdspan kernel arguments
+    args = prepare_mdspan_args_layout_right(arr, dtype, shape)
+    
+    # Launch kernel (single thread is enough for verification)
+    config = LaunchConfig(grid=1, block=1)
+    
+    # TODO: Launch kernel with proper mdspan arguments
+    launch(s, config, ker, *args)
+    s.sync()
+    
+    print("Verification kernel prepared (not executed)")
+    print()
+
+
+# ################################################################################
+# Example 2: Verify layout_left (F-order) mdspan
+# ################################################################################
+
+def verify_layout_left():
+    """Verify layout_left (F-order) mdspan arguments."""
+    print("=" * 70)
+    print("Verifying layout_left (F-order) mdspan")
+    print("=" * 70)
+    
+    # Setup device and stream
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
+    
+    # Prepare program
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{dev.arch}",
+        include_path=include_path,
+    )
+    prog = Program(code_verify, code_type="c++", options=program_options)
+    
+    # Compile the verification kernel for float type with layout_left
+    kernel_name = "verify_mdspan_layout_left<float>"
+    mod = prog.compile("cubin", name_expressions=(kernel_name,))
+    ker = mod.get_kernel(kernel_name)
+    
+    # Prepare test array in F-order
+    dtype = cp.float32
+    shape = (4, 8)  # Small array for testing
+    
+    # Create F-order array with known values
+    arr = cp.arange(shape[0] * shape[1], dtype=dtype).reshape(shape, order='F')
+    
+    # Verify array is in F-order
+    assert arr.flags['F_CONTIGUOUS']
+    
+    print(f"Array shape: {arr.shape}")
+    print(f"Array strides (bytes): {arr.strides}")
+    print(f"Array strides (elements): ({arr.strides[0]//arr.itemsize}, {arr.strides[1]//arr.itemsize})")
+    print(f"First element: {arr[0, 0]}")
+    print()
+    
+    dev.sync()  # Sync CuPy stream
+    
+    # TODO: Prepare mdspan kernel arguments
+    args = prepare_mdspan_args_layout_left(arr, dtype, shape)
+    
+    # Launch kernel (single thread is enough for verification)
+    config = LaunchConfig(grid=1, block=1)
+    
+    # TODO: Launch kernel with proper mdspan arguments
+    # launch(s, config, ker, *args)
+    # s.sync()
+    
+    print("Verification kernel prepared (not executed)")
+    print()
+
+
+# ################################################################################
+# Example 3: Verify layout_stride mdspan
+# ################################################################################
+
+def verify_layout_stride():
+    """Verify layout_stride mdspan arguments."""
+    print("=" * 70)
+    print("Verifying layout_stride mdspan")
+    print("=" * 70)
+    
+    # Setup device and stream
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream()
+    
+    # Prepare program
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{dev.arch}",
+        include_path=include_path,
+    )
+    prog = Program(code_verify, code_type="c++", options=program_options)
+    
+    # Compile the verification kernel for float type with layout_stride
+    kernel_name = "verify_mdspan_layout_stride<float>"
+    mod = prog.compile("cubin", name_expressions=(kernel_name,))
+    ker = mod.get_kernel(kernel_name)
+    
+    # Prepare test array with strided view
+    dtype = cp.float32
+    base_shape = (4, 16)  # Base array shape
+    
+    # Create base array in C-order with known values
+    base_arr = cp.arange(base_shape[0] * base_shape[1], dtype=dtype).reshape(base_shape, order='C')
+    
+    # Create strided view: skip every other element in second axis
+    # arr[:, ::2] means: take all rows, every 2nd column
+    arr = base_arr[:, ::2]
+    
+    print(f"Base array shape: {base_arr.shape}")
+    print(f"Base array strides (bytes): {base_arr.strides}")
+    print(f"Strided view shape: {arr.shape}")
+    print(f"Strided view strides (bytes): {arr.strides}")
+    print(f"Strided view strides (elements): ({arr.strides[0]//arr.itemsize}, {arr.strides[1]//arr.itemsize})")
+    print(f"First element: {arr[0, 0]}")
+    print(f"Second element: {arr[0, 1]}")
+    print()
+    
+    dev.sync()  # Sync CuPy stream
+    
+    # TODO: Prepare mdspan kernel arguments
+    args = prepare_mdspan_args_layout_stride(arr, dtype, arr.shape, arr.strides)
+    
+    # Launch kernel (single thread is enough for verification)
+    config = LaunchConfig(grid=1, block=1)
+    
+    # TODO: Launch kernel with proper mdspan arguments
+    # launch(s, config, ker, *args)
+    # s.sync()
+    
+    print("Verification kernel prepared (not executed)")
+    print()
+
+
+# ################################################################################
+# Main execution
+# ################################################################################
+
+if __name__ == "__main__":
+    print("\n" + "=" * 70)
+    print("CUDA mdspan Argument Verification Example")
+    print("=" * 70)
+    print()
+    print("This example demonstrates how to verify mdspan kernel arguments")
+    print("using printf to inspect:")
+    print("  - Data pointer address")
+    print("  - Extents (dimensions)")
+    print("  - Strides (for layout_stride)")
+    print()
+    print("Key investigation points:")
+    print("  1. What is the actual parameter passing mechanism?")
+    print("  2. How are extents encoded in the kernel arguments?")
+    print("  3. How are strides encoded for layout_stride?")
+    print()
+    
+    # Run the three verification examples
+    verify_layout_right()
+    verify_layout_left()
+    verify_layout_stride()
+    
+    print("=" * 70)
+    print("All verification examples prepared successfully!")
+    print("=" * 70)