using BenchmarkTools versioninfo(verbose=true) # ### HWloc using Pkg Pkg.add("Hwloc") using Hwloc Hwloc.num_physical_cores() # ###Starting Threads using Base.Threads nthreads() threadid() a=zeros(4) for i in 1:nthreads() a[threadid()] = threadid() end a # ### The @Threads macro @threads for i in 1:nthreads() a[threadid()] = threadid() end # #### Prefix sum function prefix_threads!(y) l=length(y) k=ceil(Int, log2(l)) for j=1:k @threads for i=2^j:2^j:min(l, 2^k) @inbounds y[i] = y[i-2^(j-1)] + y[i] end end for j=(k-1):-1:1 @threads for i=3*2^(j-1):2^j:min(l, 2^k) @inbounds y[i] = y[i-2^(j-1)] + y[i] end end y end # ## Thread Safety # ### Monte Carlo using Random function darts_in_circle(n, rng=Random.GLOBAL_RNG) inside = 0 for i in 1:n if rand(rng)^2 + rand(rng)^2 < 1 inside += 1 end end return inside end function pi_serial(n) return 4 * darts_in_circle(n) / n end using Base.Threads const rnglist = [MersenneTwister() for i in 1:nthreads()]; function pi_threads(n, loops) inside = zeros(Int, loops) @threads for i in 1:loops rng = rnglist[threadid()] inside[threadid()] = darts_in_circle(n, rng) end return 4 * sum(inside) / (n*loops) end pi_threads(2_500_000, 4) @btime pi_serial(10_000_000) @btime pi_threads(2_500_000, 4) # ### Atomics function sum_thread_base(x) r = zero(eltype(x)) @threads for i in eachindex(x) @inbounds r += x[i] end return r end a=rand(10_000_000); @btime sum($a) @btime sum_thread_base($a) function sum_thread_atomic(x) r = Atomic{eltype(x)}(zero(eltype(x))) @threads for i in eachindex(x) @inbounds atomic_add!(r, x[i]) end return r[] end @btime sum_thread_atomic($a) function sum_thread_split(A) r = Atomic{eltype(A)}(zero(eltype(A))) len, rem = divrem(length(A), nthreads()) #Split the array equally among the threads @threads for t in 1:nthreads() r[] = zero(eltype(A)) @simd for i in (1:len) .+ (t-1)*len @inbounds r[] += A[i] end atomic_add!(r, r[]) end result = r[] #process up the remaining data @simd for i in length(A)-rem+1:length(A) @inbounds result += A[i] end return result end @btime sum_thread_split($a) @btime sum_thread_split($a) # ### Synchronisation primitives const f = open(tempname(), "a+") const mt = Base.Threads.Mutex(); @threads for i in 1:50 r = pi_serial(10_000_000) lock(mt) write(f, "From $(threadid()), pi = $r\n") unlock(mt) end close(f) const s = Base.Semaphore(2); @threads for i in 1:nthreads() Base.acquire(s) r = pi_serial(10_000_000) Core.println("Calculated pi = $r in Thread $(threadid())") Base.release(s) end # ## Threaded libraries a = rand(1000, 1000); b = rand(1000, 1000); @btime $a*$b; # ### Oversubscriptions function matmul_serial(x) first_num = zeros(length(x)) for i in eachindex(x) @inbounds first_num[i] = (x[i]'*x[i])[1] end return first_num end function matmul_thread(x) first_num = zeros(length(x)) @threads for i in eachindex(x) @inbounds first_num[i] = (x[i]'*x[i])[1] end return first_num end m = [rand(1000, 1000) for _ in 1:10]; @btime matmul_serial(m); @btime matmul_thread(m); using LinearAlgebra BLAS.set_num_threads(1) @btime matmul_thread(m);