| // Copyright 2025 The Go Authors. All rights reserved. | |
| // Use of this source code is governed by a BSD-style | |
| // license that can be found in the LICENSE file. | |
| package asmgen | |
| import ( | |
| "fmt" | |
| "math/bits" | |
| "slices" | |
| ) | |
| // Note: Exported fields and methods are expected to be used | |
| // by function generators (like the ones in add.go and so on). | |
| // Unexported fields and methods should not be. | |
| // A Pipe manages the input and output data pipelines for a function's | |
| // memory operations. | |
| // | |
| // The input is one or more equal-length slices of words, so collectively | |
| // it can be viewed as a matrix, in which each slice is a row and each column | |
| // is a set of corresponding words from the different slices. | |
| // The output can be viewed the same way, although it is often just one row. | |
| type Pipe struct { | |
| f *Func // function being generated | |
| label string // prefix for loop labels (default "loop") | |
| backward bool // processing columns in reverse | |
| started bool // Start has been called | |
| loaded bool // LoadPtrs has been called | |
| inPtr []RegPtr // input slice pointers | |
| hints []Hint // for each inPtr, a register hint to use for its data | |
| outPtr []RegPtr // output slice pointers | |
| index Reg // index register, if in use | |
| useIndexCounter bool // index counter requested | |
| indexCounter int // index is also counter (386); 0 no, -1 negative counter, +1 positive counter | |
| readOff int // read offset not yet added to index | |
| writeOff int // write offset not yet added to index | |
| factors []int // unrolling factors | |
| counts []Reg // iterations for each factor | |
| needWrite bool // need a write call during Loop1/LoopN | |
| maxColumns int // maximum columns during unrolled loop | |
| unrollStart func() // emit code at start of unrolled body | |
| unrollEnd func() // emit code end of unrolled body | |
| } | |
| // Pipe creates and returns a new pipe for use in the function f. | |
| func (f *Func) Pipe() *Pipe { | |
| a := f.Asm | |
| p := &Pipe{ | |
| f: f, | |
| label: "loop", | |
| maxColumns: 10000000, | |
| } | |
| if m := a.Arch.maxColumns; m != 0 { | |
| p.maxColumns = m | |
| } | |
| return p | |
| } | |
| // SetBackward sets the pipe to process the input and output columns in reverse order. | |
| // This is needed for left shifts, which might otherwise overwrite data they will read later. | |
| func (p *Pipe) SetBackward() { | |
| if p.loaded { | |
| p.f.Asm.Fatalf("SetBackward after Start/LoadPtrs") | |
| } | |
| p.backward = true | |
| } | |
| // SetUseIndexCounter sets the pipe to use an index counter if possible, | |
| // meaning the loop counter is also used as an index for accessing the slice data. | |
| // This clever trick is slower on modern processors, but it is still necessary on 386. | |
| // On non-386 systems, SetUseIndexCounter is a no-op. | |
| func (p *Pipe) SetUseIndexCounter() { | |
| if p.f.Asm.Arch.memIndex == nil { // need memIndex (only 386 provides it) | |
| return | |
| } | |
| p.useIndexCounter = true | |
| } | |
| // SetLabel sets the label prefix for the loops emitted by the pipe. | |
| // The default prefix is "loop". | |
| func (p *Pipe) SetLabel(label string) { | |
| p.label = label | |
| } | |
| // SetMaxColumns sets the maximum number of | |
| // columns processed in a single loop body call. | |
| func (p *Pipe) SetMaxColumns(m int) { | |
| p.maxColumns = m | |
| } | |
| // SetHint records that the inputs from the named vector | |
| // should be allocated with the given register hint. | |
| // | |
| // If the hint indicates a single register on the target architecture, | |
| // then SetHint calls SetMaxColumns(1), since the hinted register | |
| // can only be used for one value at a time. | |
| func (p *Pipe) SetHint(name string, hint Hint) { | |
| if hint == HintMemOK && !p.f.Asm.Arch.memOK { | |
| return | |
| } | |
| i := slices.Index(p.f.inputs, name) | |
| if i < 0 { | |
| p.f.Asm.Fatalf("unknown input name %s", name) | |
| } | |
| if p.f.Asm.hint(hint) != "" { | |
| p.SetMaxColumns(1) | |
| } | |
| for len(p.hints) <= i { | |
| p.hints = append(p.hints, HintNone) | |
| } | |
| p.hints[i] = hint | |
| } | |
| // LoadPtrs loads the slice pointer arguments into registers, | |
| // assuming that the slice length n has already been loaded | |
| // into the register n. | |
| // | |
| // Start will call LoadPtrs if it has not been called already. | |
| // LoadPtrs only needs to be called explicitly when code needs | |
| // to use LoadN before Start, like when the shift.go generators | |
| // read an initial word before the loop. | |
| func (p *Pipe) LoadPtrs(n Reg) { | |
| a := p.f.Asm | |
| if p.loaded { | |
| a.Fatalf("pointers already loaded") | |
| } | |
| // Load the actual pointers. | |
| p.loaded = true | |
| for _, name := range p.f.inputs { | |
| p.inPtr = append(p.inPtr, RegPtr(p.f.Arg(name+"_base"))) | |
| } | |
| for _, name := range p.f.outputs { | |
| p.outPtr = append(p.outPtr, RegPtr(p.f.Arg(name+"_base"))) | |
| } | |
| // Decide the memory access strategy for LoadN and StoreN. | |
| switch { | |
| case p.backward && p.useIndexCounter: | |
| // Generator wants an index counter, meaning when the iteration counter | |
| // is AX, we will access the slice with pointer BX using (BX)(AX*WordBytes). | |
| // The loop is moving backward through the slice, but the counter | |
| // is also moving backward, so not much to do. | |
| a.Comment("run loop backward, using counter as positive index") | |
| p.indexCounter = +1 | |
| p.index = n | |
| case !p.backward && p.useIndexCounter: | |
| // Generator wants an index counter, but the loop is moving forward. | |
| // To make the counter move in the direction of data access, | |
| // we negate the counter, counting up from -len(z) to -1. | |
| // To make the index access the right words, we add len(z)*WordBytes | |
| // to each of the pointers. | |
| // See comment below about the garbage collector (non-)implications | |
| // of pointing beyond the slice bounds. | |
| a.Comment("use counter as negative index") | |
| p.indexCounter = -1 | |
| p.index = n | |
| for _, ptr := range p.inPtr { | |
| a.AddWords(n, ptr, ptr) | |
| } | |
| for _, ptr := range p.outPtr { | |
| a.AddWords(n, ptr, ptr) | |
| } | |
| a.Neg(n, n) | |
| case p.backward: | |
| // Generator wants to run the loop backward. | |
| // We'll decrement the pointers before using them, | |
| // so position them at the very end of the slices. | |
| // If we had precise pointer information for assembly, | |
| // these pointers would cause problems with the garbage collector, | |
| // since they no longer point into the allocated slice, | |
| // but the garbage collector ignores unexpected values in assembly stacks, | |
| // and the actual slice pointers are still in the argument stack slots, | |
| // so the slices won't be collected early. | |
| // If we switched to the register ABI, we might have to rethink this. | |
| // (The same thing happens by the end of forward loops, | |
| // but it's less important since once the pointers go off the slice | |
| // in a forward loop, the loop is over and the slice won't be accessed anymore.) | |
| a.Comment("run loop backward") | |
| for _, ptr := range p.inPtr { | |
| a.AddWords(n, ptr, ptr) | |
| } | |
| for _, ptr := range p.outPtr { | |
| a.AddWords(n, ptr, ptr) | |
| } | |
| case !p.backward: | |
| // Nothing to do! | |
| } | |
| } | |
| // LoadN returns the next n columns of input words as a slice of rows. | |
| // Regs for inputs that have been marked using p.SetMemOK will be direct memory references. | |
| // Regs for other inputs will be newly allocated registers and must be freed. | |
| func (p *Pipe) LoadN(n int) [][]Reg { | |
| a := p.f.Asm | |
| regs := make([][]Reg, len(p.inPtr)) | |
| for i, ptr := range p.inPtr { | |
| regs[i] = make([]Reg, n) | |
| switch { | |
| case a.Arch.loadIncN != nil: | |
| // Load from memory and advance pointers at the same time. | |
| for j := range regs[i] { | |
| regs[i][j] = p.f.Asm.Reg() | |
| } | |
| if p.backward { | |
| a.Arch.loadDecN(a, ptr, regs[i]) | |
| } else { | |
| a.Arch.loadIncN(a, ptr, regs[i]) | |
| } | |
| default: | |
| // Load from memory using offsets. | |
| // We'll advance the pointers or the index counter later. | |
| for j := range n { | |
| off := p.readOff + j | |
| if p.backward { | |
| off = -(off + 1) | |
| } | |
| var mem Reg | |
| if p.indexCounter != 0 { | |
| mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr) | |
| } else { | |
| mem = ptr.mem(off * a.Arch.WordBytes) | |
| } | |
| h := HintNone | |
| if i < len(p.hints) { | |
| h = p.hints[i] | |
| } | |
| if h == HintMemOK { | |
| regs[i][j] = mem | |
| } else { | |
| r := p.f.Asm.RegHint(h) | |
| a.Mov(mem, r) | |
| regs[i][j] = r | |
| } | |
| } | |
| } | |
| } | |
| p.readOff += n | |
| return regs | |
| } | |
| // StoreN writes regs (a slice of rows) to the next n columns of output, where n = len(regs[0]). | |
| func (p *Pipe) StoreN(regs [][]Reg) { | |
| p.needWrite = false | |
| a := p.f.Asm | |
| if len(regs) != len(p.outPtr) { | |
| p.f.Asm.Fatalf("wrong number of output rows") | |
| } | |
| n := len(regs[0]) | |
| for i, ptr := range p.outPtr { | |
| switch { | |
| case a.Arch.storeIncN != nil: | |
| // Store to memory and advance pointers at the same time. | |
| if p.backward { | |
| a.Arch.storeDecN(a, ptr, regs[i]) | |
| } else { | |
| a.Arch.storeIncN(a, ptr, regs[i]) | |
| } | |
| default: | |
| // Store to memory using offsets. | |
| // We'll advance the pointers or the index counter later. | |
| for j, r := range regs[i] { | |
| off := p.writeOff + j | |
| if p.backward { | |
| off = -(off + 1) | |
| } | |
| var mem Reg | |
| if p.indexCounter != 0 { | |
| mem = a.Arch.memIndex(a, off*a.Arch.WordBytes, p.index, ptr) | |
| } else { | |
| mem = ptr.mem(off * a.Arch.WordBytes) | |
| } | |
| a.Mov(r, mem) | |
| } | |
| } | |
| } | |
| p.writeOff += n | |
| } | |
| // advancePtrs advances the pointers by step | |
| // or handles bookkeeping for an imminent index advance by step | |
| // that the caller will do. | |
| func (p *Pipe) advancePtrs(step int) { | |
| a := p.f.Asm | |
| switch { | |
| case a.Arch.loadIncN != nil: | |
| // nothing to do | |
| default: | |
| // Adjust read/write offsets for pointer advance (or imminent index advance). | |
| p.readOff -= step | |
| p.writeOff -= step | |
| if p.indexCounter == 0 { | |
| // Advance pointers. | |
| if p.backward { | |
| step = -step | |
| } | |
| for _, ptr := range p.inPtr { | |
| a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry) | |
| } | |
| for _, ptr := range p.outPtr { | |
| a.Add(a.Imm(step*a.Arch.WordBytes), Reg(ptr), Reg(ptr), KeepCarry) | |
| } | |
| } | |
| } | |
| } | |
| // DropInput deletes the named input from the pipe, | |
| // usually because it has been exhausted. | |
| // (This is not used yet but will be used in a future generator.) | |
| func (p *Pipe) DropInput(name string) { | |
| i := slices.Index(p.f.inputs, name) | |
| if i < 0 { | |
| p.f.Asm.Fatalf("unknown input %s", name) | |
| } | |
| ptr := p.inPtr[i] | |
| p.f.Asm.Free(Reg(ptr)) | |
| p.inPtr = slices.Delete(p.inPtr, i, i+1) | |
| p.f.inputs = slices.Delete(p.f.inputs, i, i+1) | |
| if len(p.hints) > i { | |
| p.hints = slices.Delete(p.hints, i, i+1) | |
| } | |
| } | |
| // Start prepares to loop over n columns. | |
| // The factors give a sequence of unrolling factors to use, | |
| // which must be either strictly increasing or strictly decreasing | |
| // and must include 1. | |
| // For example, 4, 1 means to process 4 elements at a time | |
| // and then 1 at a time for the final 0-3; specifying 1,4 instead | |
| // handles 0-3 elements first and then 4 at a time. | |
| // Similarly, 32, 4, 1 means to process 32 at a time, | |
| // then 4 at a time, then 1 at a time. | |
| // | |
| // One benefit of using 1, 4 instead of 4, 1 is that the body | |
| // processing 4 at a time needs more registers, and if it is | |
| // the final body, the register holding the fragment count (0-3) | |
| // has been freed and is available for use. | |
| // | |
| // Start may modify the carry flag. | |
| // | |
| // Start must be followed by a call to Loop1 or LoopN, | |
| // but it is permitted to emit other instructions first, | |
| // for example to set an initial carry flag. | |
| func (p *Pipe) Start(n Reg, factors ...int) { | |
| a := p.f.Asm | |
| if p.started { | |
| a.Fatalf("loop already started") | |
| } | |
| if p.useIndexCounter && len(factors) > 1 { | |
| a.Fatalf("cannot call SetUseIndexCounter and then use Start with factors != [1]; have factors = %v", factors) | |
| } | |
| p.started = true | |
| if !p.loaded { | |
| if len(factors) == 1 { | |
| p.SetUseIndexCounter() | |
| } | |
| p.LoadPtrs(n) | |
| } | |
| // If there were calls to LoadN between LoadPtrs and Start, | |
| // adjust the loop not to scan those columns, assuming that | |
| // either the code already called an equivalent StoreN or else | |
| // that it will do so after the loop. | |
| if off := p.readOff; off != 0 { | |
| if p.indexCounter < 0 { | |
| // Index is negated, so add off instead of subtracting. | |
| a.Add(a.Imm(off), n, n, SmashCarry) | |
| } else { | |
| a.Sub(a.Imm(off), n, n, SmashCarry) | |
| } | |
| if p.indexCounter != 0 { | |
| // n is also the index we are using, so adjust readOff and writeOff | |
| // to continue to point at the same positions as before we changed n. | |
| p.readOff -= off | |
| p.writeOff -= off | |
| } | |
| } | |
| p.Restart(n, factors...) | |
| } | |
| // Restart prepares to loop over an additional n columns, | |
| // beyond a previous loop run by p.Start/p.Loop. | |
| func (p *Pipe) Restart(n Reg, factors ...int) { | |
| a := p.f.Asm | |
| if !p.started { | |
| a.Fatalf("pipe not started") | |
| } | |
| p.factors = factors | |
| p.counts = make([]Reg, len(factors)) | |
| if len(factors) == 0 { | |
| factors = []int{1} | |
| } | |
| // Compute the loop lengths for each unrolled section into separate registers. | |
| // We compute them all ahead of time in case the computation would smash | |
| // a carry flag that the loop bodies need preserved. | |
| if len(factors) > 1 { | |
| a.Comment("compute unrolled loop lengths") | |
| } | |
| switch { | |
| default: | |
| a.Fatalf("invalid factors %v", factors) | |
| case factors[0] == 1: | |
| // increasing loop factors | |
| div := 1 | |
| for i, f := range factors[1:] { | |
| if f <= factors[i] { | |
| a.Fatalf("non-increasing factors %v", factors) | |
| } | |
| if f&(f-1) != 0 { | |
| a.Fatalf("non-power-of-two factors %v", factors) | |
| } | |
| t := p.f.Asm.Reg() | |
| f /= div | |
| a.And(a.Imm(f-1), n, t) | |
| a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, n) | |
| div *= f | |
| p.counts[i] = t | |
| } | |
| p.counts[len(p.counts)-1] = n | |
| case factors[len(factors)-1] == 1: | |
| // decreasing loop factors | |
| for i, f := range factors[:len(factors)-1] { | |
| if f <= factors[i+1] { | |
| a.Fatalf("non-decreasing factors %v", factors) | |
| } | |
| if f&(f-1) != 0 { | |
| a.Fatalf("non-power-of-two factors %v", factors) | |
| } | |
| t := p.f.Asm.Reg() | |
| a.Rsh(a.Imm(bits.TrailingZeros(uint(f))), n, t) | |
| a.And(a.Imm(f-1), n, n) | |
| p.counts[i] = t | |
| } | |
| p.counts[len(p.counts)-1] = n | |
| } | |
| } | |
| // Done frees all the registers allocated by the pipe. | |
| func (p *Pipe) Done() { | |
| for _, ptr := range p.inPtr { | |
| p.f.Asm.Free(Reg(ptr)) | |
| } | |
| p.inPtr = nil | |
| for _, ptr := range p.outPtr { | |
| p.f.Asm.Free(Reg(ptr)) | |
| } | |
| p.outPtr = nil | |
| p.index = Reg{} | |
| } | |
| // Loop emits code for the loop, calling block repeatedly to emit code that | |
| // handles a block of N input columns (for arbitrary N = len(in[0]) chosen by p). | |
| // block must call p.StoreN(out) to write N output columns. | |
| // The out slice is a pre-allocated matrix of uninitialized Reg values. | |
| // block is expected to set each entry to the Reg that should be written | |
| // before calling p.StoreN(out). | |
| // | |
| // For example, if the loop is to be unrolled 4x in blocks of 2 columns each, | |
| // the sequence of calls to emit the unrolled loop body is: | |
| // | |
| // start() // set by pAtUnrollStart | |
| // ... reads for 2 columns ... | |
| // block() | |
| // ... writes for 2 columns ... | |
| // ... reads for 2 columns ... | |
| // block() | |
| // ... writes for 2 columns ... | |
| // end() // set by p.AtUnrollEnd | |
| // | |
| // Any registers allocated during block are freed automatically when block returns. | |
| func (p *Pipe) Loop(block func(in, out [][]Reg)) { | |
| if p.factors == nil { | |
| p.f.Asm.Fatalf("Pipe.Start not called") | |
| } | |
| for i, factor := range p.factors { | |
| n := p.counts[i] | |
| p.unroll(n, factor, block) | |
| if i < len(p.factors)-1 { | |
| p.f.Asm.Free(n) | |
| } | |
| } | |
| p.factors = nil | |
| } | |
| // AtUnrollStart sets a function to call at the start of an unrolled sequence. | |
| // See [Pipe.Loop] for details. | |
| func (p *Pipe) AtUnrollStart(start func()) { | |
| p.unrollStart = start | |
| } | |
| // AtUnrollEnd sets a function to call at the end of an unrolled sequence. | |
| // See [Pipe.Loop] for details. | |
| func (p *Pipe) AtUnrollEnd(end func()) { | |
| p.unrollEnd = end | |
| } | |
| // unroll emits a single unrolled loop for the given factor, iterating n times. | |
| func (p *Pipe) unroll(n Reg, factor int, block func(in, out [][]Reg)) { | |
| a := p.f.Asm | |
| label := fmt.Sprintf("%s%d", p.label, factor) | |
| // Top of loop control flow. | |
| a.Label(label) | |
| if a.Arch.loopTop != "" { | |
| a.Printf("\t"+a.Arch.loopTop+"\n", n, label+"done") | |
| } else { | |
| a.JmpZero(n, label+"done") | |
| } | |
| a.Label(label + "cont") | |
| // Unrolled loop body. | |
| if factor < p.maxColumns { | |
| a.Comment("unroll %dX", factor) | |
| } else { | |
| a.Comment("unroll %dX in batches of %d", factor, p.maxColumns) | |
| } | |
| if p.unrollStart != nil { | |
| p.unrollStart() | |
| } | |
| for done := 0; done < factor; { | |
| batch := min(factor-done, p.maxColumns) | |
| regs := a.RegsUsed() | |
| out := make([][]Reg, len(p.outPtr)) | |
| for i := range out { | |
| out[i] = make([]Reg, batch) | |
| } | |
| in := p.LoadN(batch) | |
| p.needWrite = true | |
| block(in, out) | |
| if p.needWrite && len(p.outPtr) > 0 { | |
| a.Fatalf("missing p.Write1 or p.StoreN") | |
| } | |
| a.SetRegsUsed(regs) // free anything block allocated | |
| done += batch | |
| } | |
| if p.unrollEnd != nil { | |
| p.unrollEnd() | |
| } | |
| p.advancePtrs(factor) | |
| // Bottom of loop control flow. | |
| switch { | |
| case p.indexCounter >= 0 && a.Arch.loopBottom != "": | |
| a.Printf("\t"+a.Arch.loopBottom+"\n", n, label+"cont") | |
| case p.indexCounter >= 0: | |
| a.Sub(a.Imm(1), n, n, KeepCarry) | |
| a.JmpNonZero(n, label+"cont") | |
| case p.indexCounter < 0 && a.Arch.loopBottomNeg != "": | |
| a.Printf("\t"+a.Arch.loopBottomNeg+"\n", n, label+"cont") | |
| case p.indexCounter < 0: | |
| a.Add(a.Imm(1), n, n, KeepCarry) | |
| } | |
| a.Label(label + "done") | |
| } | |