Tag Archives: MGPU

naiveScan

Segmented Warp Scan with Packed Head Flags

Here is a simple test implementation of a segmented scan in warp with packed head flag. The idea is from the segmented warp scan function.

Here is the implementation in F#.

[sourcecode language=”fsharp”]
open NUnit.Framework
open Alea.Interop.LLVM
open Alea.CUDA
open Alea.CUDA.Extension

// define an alias
module IRB = Alea.CUDA.IRBuilderUtil
module DF = Alea.CUDA.DeviceFunction

// get the worker
let worker = getDefaultWorker()

[<IRB.LLVMFunctionBuilder>]
let bfi(x:int, y:int, bit:int, numBits:int):int = failwith “Device Only!”
let bfi [BUILDER](ctx:IRB.LLVMFunctionBuilderContext) =
let args = ctx.LLVMValueArgs // arguments LLVM values
let i32t = ctx.LLVMHelper.i32_t // int LLVM type
let rett = i32t // return type
let argst = [| i32t; i32t; i32t; i32t |] // argument type list
let funct = LLVMFunctionTypeEx(rett, argst, 0)
let funcp = LLVMConstInlineAsm(funct, “bfi.b32 \t$0, $2, $1, $3, $4;”, “=r,r,r,r,r”, 0, 0)
IRB.Value(LLVMBuildCallEx(ctx.Builder, funcp, args, “”))

[<ReflectedDefinition>]
let segscanWarp (inputs:DevicePtr<int>) (outputs:DevicePtr<int>) (distances:DevicePtr<int>) =
let tid = threadIdx.x
let packed = inputs.[tid]

// the start flag is in the high bit
let flag = 0x80000000 &&& packed

// get the start flags for each thread in the warp
let flags = __ballot(flag)

// mask out the bits above the current thread
let flags = flags &&& bfi(0, 0xffffffff, 0, tid + 1)

// find the distance from the current thread to the thread at the start of
// the segment
let distance = DF.__clz(flags) + tid – 31

let shared = shared<int>(Util.WARP_SIZE).Ptr(0).Volatile()

let x0 = 0x7fffffff &&& packed
let mutable x = x0
shared.[tid] <- x

// perform the parallel scan. Note the conditional if(offset < distance)
// replaces the ordinary scan conditional if(offset <= tid)
for i = 0 to Util.LOG_WARP_SIZE – 1 do
let offset = 1 <<< i
if offset <= distance then x <- x + shared.[tid – offset]
shared.[tid] <- x

// turn inclusive scan into exclusive scan
x <- x – x0

outputs.[tid] <- x
distances.[tid] <- distance

[<Test>]
let test() =
let blockSize = 256
let numWarps = blockSize / Util.WARP_SIZE
let s x = x ||| (1 <<< 31)
let hInputs =
[|
s 3; 0; 3; 3; 0; s 1; 2; 0; 3; 3; 3; 2; 3; 0; 3; 1;
0; 0; 2; 3; 2; s 3; 1; 0; 2; 1; 2; 1; 1; 0; 1; s 3;
|]

let pfunct = cuda {
let! segscanWarp = <@ segscanWarp @> |> defineKernelFunc

return PFunc(fun (m:Module) ->
let worker = m.Worker
let segscanWarp = segscanWarp.Apply m
pcalc {
let! dInputs = DArray.scatterInBlob worker hInputs
let! dOutputs = DArray.createInBlob worker hInputs.Length
let! dDistances = DArray.createInBlob worker hInputs.Length

do! PCalc.action (fun hint ->
let lp = LaunchParam(1, Util.WARP_SIZE) |> hint.ModifyLaunchParam
segscanWarp.Launch lp dInputs.Ptr dOutputs.Ptr dDistances.Ptr)

let! hOutputs = dOutputs.Gather()
let! hDistances = dDistances.Gather()

printfn “outputs:”
for i = 0 to 15 do printf “%2d; ” hOutputs.[i]
printfn “”
for i = 16 to 31 do printf “%2d; ” hOutputs.[i]
printfn “”

printfn “distances:”
for i = 0 to 15 do printf “%2d; ” hDistances.[i]
printfn “”
for i = 16 to 31 do printf “%2d; ” hDistances.[i]
printfn “” } ) }

let calc = worker.LoadPModule(pfunct).Invoke
calc |> PCalc.run
[/sourcecode]