# ClimaCartesianIndices.jl

This package implements a type called `FastCartesianIndices`, which is a drop-in replacement for `CartesianIndices`. `FastCartesianIndices` uses Julia Base's `SignedMultiplicativeInverse` to avoid integer division when converting linear indices to Cartesian indices. This is especially useful on the GPU, where integer division can have a nearly 2x performance impact.
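
As a rough sketch of the mechanism (this uses Julia Base's `Base.MultiplicativeInverses.SignedMultiplicativeInverse` directly and is not the package's internal code), the divisor is converted once into a multiplicative inverse, after which `div` is computed with multiplications and shifts rather than a hardware integer divide:

```julia
using Base.MultiplicativeInverses: SignedMultiplicativeInverse

# Precompute the inverse of a divisor (e.g. a dimension size) once ...
inv50 = SignedMultiplicativeInverse{Int32}(Int32(50))

# ... then `div` against the wrapped divisor avoids the hardware divide.
@assert div(Int32(1234), inv50) == div(Int32(1234), Int32(50))
```
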
`FastCartesianIndices` internally uses `Int32` and is therefore only valid when the product of the input dimension sizes is less than or equal to `typemax(Int32)` (2147483647).
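
A minimal CPU-side sketch of the drop-in behavior, assuming the size-tuple constructor used in the example below (`FastCartesianIndices(map(Int32, size(x)))`) and that linear indexing returns the same `CartesianIndex` values as Base:

```julia
using ClimaCartesianIndices: FastCartesianIndices

dims = (50, 5, 5, 6, 5400)
@assert prod(Int64.(dims)) ≤ typemax(Int32)  # stay within the Int32 limit

ci  = CartesianIndices(dims)
fci = FastCartesianIndices(map(Int32, dims))

# Converting a linear index to a Cartesian index agrees with Base.
@assert fci[123_456] == ci[123_456]
```
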
## Example

```julia
using ClimaCartesianIndices: FastCartesianIndices
using CUDA;
function perf_linear_index!(X, Y)
    x1 = X.x1;
    nitems = length(parent(x1));
    max_threads = 256; # can be higher if conditions permit
    nthreads = min(max_threads, nitems);
    nblocks = cld(nitems, nthreads);
    CUDA.@cuda threads=nthreads blocks=nblocks name="linear" perf_linear_index_kernel!(
        X,
        Y,
        Val(nitems),
    )
end;
function perf_linear_index_kernel!(X, Y, ::Val{nitems}) where {nitems}
    (; x1, x2, x3, x4) = X
    (; y1) = Y
    @inbounds begin
        i = CUDA.threadIdx().x +
            (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x
        if i ≤ nitems
            # with more flops per element, linear and Cartesian indexing get much closer:
            # y1[i] = x1[i] + x2[i] + x3[i] + x4[i]
            y1[i] = x1[i]
        end
    end
    return nothing
end;
function perf_cart_index!(X, Y, cart_inds)
    x1 = X.x1;
    nitems = length(parent(x1));
    max_threads = 256; # can be higher if conditions permit
    nthreads = min(max_threads, nitems);
    nblocks = cld(nitems, nthreads);
    CUDA.@cuda threads=nthreads blocks=nblocks name="cartesian" perf_cart_index_kernel!(
        X,
        Y,
        Val(nitems),
        cart_inds,
    )
end;
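
# `unval` lets the kernel accept `cart_inds` either as a plain index object or
# wrapped in `Val`; in the `Val` case the index object is a type parameter, so
# it is a compile-time constant inside the kernel (the `Val(...)` benchmark
# variants below exercise this).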
unval(::Val{CI}) where {CI} = CI
unval(CI) = CI
function perf_cart_index_kernel!(X, Y, ::Val{nitems}, cart_inds) where {nitems}
    (; x1, x2, x3, x4) = X
    (; y1) = Y
    @inbounds begin
        _i = CUDA.threadIdx().x +
             (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x
        if _i ≤ nitems && _i ≤ length(unval(cart_inds))
            CI = unval(cart_inds)
            i = CI[_i]
            # with more flops per element, linear and Cartesian indexing get much closer:
            # y1[i] = x1[i] + x2[i] + x3[i] + x4[i]
            y1[i] = x1[i]
        end
    end
    return nothing
end;
function get_arrays(sym, AType, FT, s, n = 4)
    println("array_type = $AType")
    fn = ntuple(i -> Symbol(sym, i), n)
    return (; zip(fn, ntuple(_ -> AType(zeros(FT, s...)), n))...)
end;
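
# Benchmark driver: allocate input and output arrays on the GPU, then profile
# four launches each of the linear-index kernel and of the Cartesian-index
# kernel with `CartesianIndices`, `FastCartesianIndices`, and their
# `Val`-wrapped (compile-time constant) variants.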
using CUDA;
array_size = (50, 5, 5, 6, 5400); # array dimensions
X = get_arrays(:x, CUDA.CuArray, Float64, array_size);
Y = get_arrays(:y, CUDA.CuArray, Float64, array_size);
fast_ci(x) = FastCartesianIndices(map(Int32, size(x)))
CUDA.@profile begin
    perf_linear_index!(X, Y)
    perf_linear_index!(X, Y)
    perf_linear_index!(X, Y)
    perf_linear_index!(X, Y)
end
CUDA.@profile begin
    perf_cart_index!(X, Y, CartesianIndices(X.x1))
    perf_cart_index!(X, Y, CartesianIndices(X.x1))
    perf_cart_index!(X, Y, CartesianIndices(X.x1))
    perf_cart_index!(X, Y, CartesianIndices(X.x1))
end
CUDA.@profile begin
    perf_cart_index!(X, Y, Val(CartesianIndices(X.x1)))
    perf_cart_index!(X, Y, Val(CartesianIndices(X.x1)))
    perf_cart_index!(X, Y, Val(CartesianIndices(X.x1)))
    perf_cart_index!(X, Y, Val(CartesianIndices(X.x1)))
end
CUDA.@profile begin
    perf_cart_index!(X, Y, fast_ci(X.x1))
    perf_cart_index!(X, Y, fast_ci(X.x1))
    perf_cart_index!(X, Y, fast_ci(X.x1))
    perf_cart_index!(X, Y, fast_ci(X.x1))
end
CUDA.@profile begin
    perf_cart_index!(X, Y, Val(fast_ci(X.x1)))
    perf_cart_index!(X, Y, Val(fast_ci(X.x1)))
    perf_cart_index!(X, Y, Val(fast_ci(X.x1)))
    perf_cart_index!(X, Y, Val(fast_ci(X.x1)))
end
```
## Results (NVIDIA A100)

For `perf_linear_index!`:

Device-side activity: GPU was busy for 1.51 ms (94.36% of the trace)

| Time (%) | Total time | Calls | Time distribution                 | Name   |
|----------|------------|-------|-----------------------------------|--------|
| 94.36%   | 1.51 ms    | 4     | 377.24 µs ± 1.5 (375.75 ‥ 379.32) | linear |

For `perf_cart_index!(X, Y, CartesianIndices(...))`:

Device-side activity: GPU was busy for 3.56 ms (97.21% of the trace)

| Time (%) | Total time | Calls | Time distribution                 | Name      |
|----------|------------|-------|-----------------------------------|-----------|
| 97.21%   | 3.56 ms    | 4     | 889.06 µs ± 0.19 (888.82 ‥ 889.3) | cartesian |

For `perf_cart_index!(X, Y, Val(CartesianIndices(...)))`:

Device-side activity: GPU was busy for 2.61 ms (95.98% of the trace)

| Time (%) | Total time | Calls | Time distribution                  | Name      |
|----------|------------|-------|------------------------------------|-----------|
| 95.98%   | 2.61 ms    | 4     | 652.13 µs ± 0.23 (651.84 ‥ 652.31) | cartesian |

For `perf_cart_index!(X, Y, fast_ci(...))`:

Device-side activity: GPU was busy for 2.43 ms (95.79% of the trace)

| Time (%) | Total time | Calls | Time distribution                  | Name      |
|----------|------------|-------|------------------------------------|-----------|
| 95.79%   | 2.43 ms    | 4     | 607.31 µs ± 0.41 (606.78 ‥ 607.73) | cartesian |

For `perf_cart_index!(X, Y, Val(fast_ci(...)))`:

Device-side activity: GPU was busy for 1.64 ms (91.04% of the trace)

| Time (%) | Total time | Calls | Time distribution                  | Name      |
|----------|------------|-------|------------------------------------|-----------|
| 91.04%   | 1.64 ms    | 4     | 408.89 µs ± 9.92 (397.21 ‥ 420.09) | cartesian |
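
Relative to the linear-index kernel (~377 µs per call), plain `CartesianIndices` is about 2.4x slower (~889 µs), while `Val(fast_ci(...))` brings the Cartesian-index kernel to within roughly 8% (~409 µs).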