| | |
| | |
| | |
| |
|
| | package main |
| |
|
| | import ( |
| | "fmt" |
| | "os" |
| | "simd/archsimd" |
| | "unsafe" |
| | ) |
| |
|
| | func load(s []float64) archsimd.Float64x4 { |
| | return archsimd.LoadFloat64x4((*[4]float64)(s[:4])) |
| | } |
| |
|
| | type S1 = archsimd.Float64x4 |
| |
|
| | type S2 archsimd.Float64x4 |
| |
|
| | func (s S2) Len() int { |
| | return archsimd.Float64x4(s).Len() |
| | } |
| |
|
| | func (s S2) Load(a []float64) S2 { |
| | return S2(load(a)) |
| | } |
| |
|
| | func (s S2) Store(a *[4]float64) { |
| | archsimd.Float64x4(s).Store(a) |
| | } |
| |
|
| | func (s S2) Add(a S2) S2 { |
| | return S2(archsimd.Float64x4(s).Add(archsimd.Float64x4(a))) |
| | } |
| |
|
| | func (s S2) Mul(a S2) S2 { |
| | return S2(archsimd.Float64x4(s).Mul(archsimd.Float64x4(a))) |
| | } |
| |
|
| | type S3 struct { |
| | archsimd.Float64x4 |
| | } |
| |
|
// ip64_0 is the scalar reference implementation: it returns the sum of
// a[i]*b[i] over every index of a, accumulated in index order.
//
// b is indexed with a's indices, so it must be at least as long as a.
func ip64_0(a, b []float64) float64 {
	var total float64
	for i, x := range a {
		total += x * b[i]
	}
	return total
}
| |
|
| | func ip64_1(a, b []float64) float64 { |
| | var z S1 |
| | sum := z |
| | var i int |
| | stride := z.Len() |
| | for ; i <= len(a)-stride; i += stride { |
| | va := load(a[i:]) |
| | vb := load(b[i:]) |
| | sum = sum.Add(va.Mul(vb)) |
| | } |
| | var tmp [4]float64 |
| | sum.Store(&tmp) |
| | return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| | } |
| |
|
| | func ip64_1a(a, b []float64) float64 { |
| | var z S1 |
| | sum := z |
| | var i int |
| | stride := z.Len() |
| | for ; i <= len(a)-stride; i += stride { |
| | va := load(a[i:]) |
| | vb := load(b[i:]) |
| | sum = FMA(sum, va, vb) |
| | } |
| | var tmp [4]float64 |
| | sum.Store(&tmp) |
| | return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| | } |
| |
|
| | |
| | func FMA(a, b, c archsimd.Float64x4) archsimd.Float64x4 { |
| | return a.Add(b.Mul(c)) |
| | } |
| |
|
| | func ip64_2(a, b []float64) float64 { |
| | var z S2 |
| | sum := z |
| | var i int |
| | stride := z.Len() |
| | for ; i <= len(a)-stride; i += stride { |
| | va := z.Load(a[i:]) |
| | vb := z.Load(b[i:]) |
| | sum = sum.Add(va.Mul(vb)) |
| | } |
| | var tmp [4]float64 |
| | sum.Store(&tmp) |
| | return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| | } |
| |
|
| | func ip64_3(a, b []float64) float64 { |
| | var z S3 |
| | sum := z |
| | var i int |
| | stride := z.Len() |
| | for ; i <= len(a)-stride; i += stride { |
| | va := load(a[i:]) |
| | vb := load(b[i:]) |
| | sum = S3{sum.Add(va.Mul(vb))} |
| | } |
| | var tmp [4]float64 |
| | sum.Store(&tmp) |
| | return tmp[0] + tmp[1] + tmp[2] + tmp[3] |
| | } |
| |
|
| | func main() { |
| | a := []float64{1, 2, 3, 4, 5, 6, 7, 8} |
| | ip0 := ip64_0(a, a) |
| | ip1 := ip64_1(a, a) |
| | ip1a := ip64_1a(a, a) |
| | ip2 := ip64_2(a, a) |
| | ip3 := ip64_3(a, a) |
| | fmt.Printf("Test IP = %f\n", ip0) |
| | fmt.Printf("SIMD IP 1 = %f\n", ip1) |
| | fmt.Printf("SIMD IP 1a = %f\n", ip1a) |
| | fmt.Printf("SIMD IP 2 = %f\n", ip2) |
| | fmt.Printf("SIMD IP 3 = %f\n", ip3) |
| | var z1 S1 |
| | var z2 S2 |
| | var z3 S2 |
| |
|
| | s1, s2, s3 := unsafe.Sizeof(z1), unsafe.Sizeof(z2), unsafe.Sizeof(z3) |
| |
|
| | fmt.Printf("unsafe.Sizeof(z1, z2, z3)=%d, %d, %d\n", s1, s2, s3) |
| |
|
| | fail := false |
| |
|
| | if s1 != 32 || s2 != 32 || s3 != 32 { |
| | fmt.Println("Failed a sizeof check, should all be 32") |
| | fail = true |
| | } |
| |
|
| | if ip1 != ip0 || ip1a != ip0 || ip2 != ip0 || ip3 != ip0 { |
| | fmt.Println("Failed an inner product check, should all be", ip0) |
| | fail = true |
| | } |
| |
|
| | if fail { |
| | os.Exit(1) |
| | } |
| | } |
| |
|