|
|
package backend |
|
|
|
|
|
import ( |
|
|
"math/rand" |
|
|
"os" |
|
|
"path/filepath" |
|
|
|
|
|
"github.com/mudler/LocalAI/core/config" |
|
|
pb "github.com/mudler/LocalAI/pkg/grpc/proto" |
|
|
"github.com/mudler/LocalAI/pkg/model" |
|
|
"github.com/mudler/xlog" |
|
|
) |
|
|
|
|
|
func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option { |
|
|
name := c.Name |
|
|
if name == "" { |
|
|
name = c.Model |
|
|
} |
|
|
|
|
|
defOpts := []model.Option{ |
|
|
model.WithBackendString(c.Backend), |
|
|
model.WithModel(c.Model), |
|
|
model.WithContext(so.Context), |
|
|
model.WithModelID(name), |
|
|
} |
|
|
|
|
|
threads := 1 |
|
|
|
|
|
if c.Threads != nil { |
|
|
threads = *c.Threads |
|
|
} |
|
|
|
|
|
if so.Threads != 0 { |
|
|
threads = so.Threads |
|
|
} |
|
|
|
|
|
c.Threads = &threads |
|
|
|
|
|
grpcOpts := grpcModelOpts(c, so.SystemState.Model.ModelsPath) |
|
|
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts)) |
|
|
|
|
|
if so.ParallelBackendRequests { |
|
|
defOpts = append(defOpts, model.EnableParallelRequests) |
|
|
} |
|
|
|
|
|
if c.GRPC.Attempts != 0 { |
|
|
defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts)) |
|
|
} |
|
|
|
|
|
if c.GRPC.AttemptsSleepTime != 0 { |
|
|
defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime)) |
|
|
} |
|
|
|
|
|
for k, v := range so.ExternalGRPCBackends { |
|
|
defOpts = append(defOpts, model.WithExternalBackend(k, v)) |
|
|
} |
|
|
|
|
|
return append(defOpts, opts...) |
|
|
} |
|
|
|
|
|
func getSeed(c config.ModelConfig) int32 { |
|
|
var seed int32 = config.RAND_SEED |
|
|
|
|
|
if c.Seed != nil { |
|
|
seed = int32(*c.Seed) |
|
|
} |
|
|
|
|
|
if seed == config.RAND_SEED { |
|
|
seed = rand.Int31() |
|
|
} |
|
|
|
|
|
return seed |
|
|
} |
|
|
|
|
|
func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { |
|
|
b := 512 |
|
|
if c.Batch != 0 { |
|
|
b = c.Batch |
|
|
} |
|
|
|
|
|
flashAttention := "auto" |
|
|
|
|
|
if c.FlashAttention != nil { |
|
|
flashAttention = *c.FlashAttention |
|
|
} |
|
|
|
|
|
f16 := false |
|
|
if c.F16 != nil { |
|
|
f16 = *c.F16 |
|
|
} |
|
|
|
|
|
embeddings := false |
|
|
if c.Embeddings != nil { |
|
|
embeddings = *c.Embeddings |
|
|
} |
|
|
|
|
|
lowVRAM := false |
|
|
if c.LowVRAM != nil { |
|
|
lowVRAM = *c.LowVRAM |
|
|
} |
|
|
|
|
|
reranking := false |
|
|
if c.Reranking != nil { |
|
|
reranking = *c.Reranking |
|
|
} |
|
|
|
|
|
mmap := false |
|
|
if c.MMap != nil { |
|
|
mmap = *c.MMap |
|
|
} |
|
|
|
|
|
ctxSize := 4096 |
|
|
if c.ContextSize != nil { |
|
|
ctxSize = *c.ContextSize |
|
|
} |
|
|
|
|
|
mmlock := false |
|
|
if c.MMlock != nil { |
|
|
mmlock = *c.MMlock |
|
|
} |
|
|
|
|
|
nGPULayers := 9999999 |
|
|
if c.NGPULayers != nil { |
|
|
nGPULayers = *c.NGPULayers |
|
|
} |
|
|
|
|
|
triggers := make([]*pb.GrammarTrigger, 0) |
|
|
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers { |
|
|
triggers = append(triggers, &pb.GrammarTrigger{ |
|
|
Word: t.Word, |
|
|
}) |
|
|
} |
|
|
|
|
|
opts := &pb.ModelOptions{ |
|
|
CUDA: c.CUDA || c.Diffusers.CUDA, |
|
|
SchedulerType: c.Diffusers.SchedulerType, |
|
|
GrammarTriggers: triggers, |
|
|
PipelineType: c.Diffusers.PipelineType, |
|
|
CFGScale: c.CFGScale, |
|
|
LoraAdapter: c.LoraAdapter, |
|
|
LoraScale: c.LoraScale, |
|
|
LoraAdapters: c.LoraAdapters, |
|
|
LoraScales: c.LoraScales, |
|
|
F16Memory: f16, |
|
|
LoraBase: c.LoraBase, |
|
|
IMG2IMG: c.Diffusers.IMG2IMG, |
|
|
CLIPModel: c.Diffusers.ClipModel, |
|
|
CLIPSubfolder: c.Diffusers.ClipSubFolder, |
|
|
Options: c.Options, |
|
|
Overrides: c.Overrides, |
|
|
CLIPSkip: int32(c.Diffusers.ClipSkip), |
|
|
ControlNet: c.Diffusers.ControlNet, |
|
|
ContextSize: int32(ctxSize), |
|
|
Seed: getSeed(c), |
|
|
NBatch: int32(b), |
|
|
NoMulMatQ: c.NoMulMatQ, |
|
|
DraftModel: c.DraftModel, |
|
|
AudioPath: c.AudioPath, |
|
|
Quantization: c.Quantization, |
|
|
LoadFormat: c.LoadFormat, |
|
|
GPUMemoryUtilization: c.GPUMemoryUtilization, |
|
|
TrustRemoteCode: c.TrustRemoteCode, |
|
|
EnforceEager: c.EnforceEager, |
|
|
SwapSpace: int32(c.SwapSpace), |
|
|
MaxModelLen: int32(c.MaxModelLen), |
|
|
TensorParallelSize: int32(c.TensorParallelSize), |
|
|
DisableLogStatus: c.DisableLogStatus, |
|
|
DType: c.DType, |
|
|
|
|
|
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt), |
|
|
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt), |
|
|
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt), |
|
|
FlashAttention: flashAttention, |
|
|
CacheTypeKey: c.CacheTypeK, |
|
|
CacheTypeValue: c.CacheTypeV, |
|
|
NoKVOffload: c.NoKVOffloading, |
|
|
YarnExtFactor: c.YarnExtFactor, |
|
|
YarnAttnFactor: c.YarnAttnFactor, |
|
|
YarnBetaFast: c.YarnBetaFast, |
|
|
YarnBetaSlow: c.YarnBetaSlow, |
|
|
NGQA: c.NGQA, |
|
|
RMSNormEps: c.RMSNormEps, |
|
|
MLock: mmlock, |
|
|
RopeFreqBase: c.RopeFreqBase, |
|
|
RopeScaling: c.RopeScaling, |
|
|
Type: c.ModelType, |
|
|
RopeFreqScale: c.RopeFreqScale, |
|
|
NUMA: c.NUMA, |
|
|
Embeddings: embeddings, |
|
|
Reranking: reranking, |
|
|
LowVRAM: lowVRAM, |
|
|
NGPULayers: int32(nGPULayers), |
|
|
MMap: mmap, |
|
|
MainGPU: c.MainGPU, |
|
|
Threads: int32(*c.Threads), |
|
|
TensorSplit: c.TensorSplit, |
|
|
|
|
|
Tokenizer: c.Tokenizer, |
|
|
} |
|
|
|
|
|
if c.MMProj != "" { |
|
|
opts.MMProj = filepath.Join(modelPath, c.MMProj) |
|
|
} |
|
|
|
|
|
return opts |
|
|
} |
|
|
|
|
|
func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions { |
|
|
promptCachePath := "" |
|
|
if c.PromptCachePath != "" { |
|
|
p := filepath.Join(modelPath, c.PromptCachePath) |
|
|
err := os.MkdirAll(filepath.Dir(p), 0750) |
|
|
if err == nil { |
|
|
promptCachePath = p |
|
|
} else { |
|
|
xlog.Error("error creating prompt cache folder", "error", err, "promptCachePath", promptCachePath) |
|
|
} |
|
|
} |
|
|
|
|
|
pbOpts := &pb.PredictOptions{ |
|
|
Temperature: float32(*c.Temperature), |
|
|
TopP: float32(*c.TopP), |
|
|
NDraft: c.NDraft, |
|
|
TopK: int32(*c.TopK), |
|
|
Tokens: int32(*c.Maxtokens), |
|
|
Threads: int32(*c.Threads), |
|
|
PromptCacheAll: c.PromptCacheAll, |
|
|
PromptCacheRO: c.PromptCacheRO, |
|
|
PromptCachePath: promptCachePath, |
|
|
F16KV: *c.F16, |
|
|
DebugMode: *c.Debug, |
|
|
Grammar: c.Grammar, |
|
|
NegativePromptScale: c.NegativePromptScale, |
|
|
RopeFreqBase: c.RopeFreqBase, |
|
|
RopeFreqScale: c.RopeFreqScale, |
|
|
NegativePrompt: c.NegativePrompt, |
|
|
Mirostat: int32(*c.LLMConfig.Mirostat), |
|
|
MirostatETA: float32(*c.LLMConfig.MirostatETA), |
|
|
MirostatTAU: float32(*c.LLMConfig.MirostatTAU), |
|
|
Debug: *c.Debug, |
|
|
StopPrompts: c.StopWords, |
|
|
Repeat: int32(c.RepeatLastN), |
|
|
FrequencyPenalty: float32(c.FrequencyPenalty), |
|
|
PresencePenalty: float32(c.PresencePenalty), |
|
|
Penalty: float32(c.RepeatPenalty), |
|
|
NKeep: int32(c.Keep), |
|
|
Batch: int32(c.Batch), |
|
|
IgnoreEOS: c.IgnoreEOS, |
|
|
Seed: getSeed(c), |
|
|
MLock: *c.MMlock, |
|
|
MMap: *c.MMap, |
|
|
MainGPU: c.MainGPU, |
|
|
TensorSplit: c.TensorSplit, |
|
|
TailFreeSamplingZ: float32(*c.TFZ), |
|
|
TypicalP: float32(*c.TypicalP), |
|
|
} |
|
|
|
|
|
return pbOpts |
|
|
} |
|
|
|