Upload folder using huggingface_hub

0f07ba7 verified 24 days ago

7.07 kB

	package backend

	import (
	"math/rand"
	"os"
	"path/filepath"

	"github.com/mudler/LocalAI/core/config"
	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	"github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/xlog"
	)

	// ModelOptions builds the list of model loader options for the model
	// described by c, using the application-wide settings in so.
	//
	// The model ID is c.Name, falling back to c.Model when the name is empty.
	// The thread count starts at 1, is replaced by the model's own setting when
	// present, and is finally replaced by the application-level setting when that
	// is non-zero. The resolved value is stored in the local copy of c so that
	// the gRPC load options are built with it.
	//
	// Caller-supplied opts are appended last.
	func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
		modelID := c.Name
		if modelID == "" {
			modelID = c.Model
		}

		// The application-level value wins over the per-model one; 1 is the
		// fallback when neither is set.
		nThreads := 1
		switch {
		case so.Threads != 0:
			nThreads = so.Threads
		case c.Threads != nil:
			nThreads = *c.Threads
		}
		c.Threads = &nThreads

		// Unconditional options, in the order they must be handed to the loader.
		result := []model.Option{
			model.WithBackendString(c.Backend),
			model.WithModel(c.Model),
			model.WithContext(so.Context),
			model.WithModelID(modelID),
			model.WithLoadGRPCLoadModelOpts(grpcModelOpts(c, so.SystemState.Model.ModelsPath)),
		}

		if so.ParallelBackendRequests {
			result = append(result, model.EnableParallelRequests)
		}

		// Retry settings are only forwarded when explicitly configured.
		if attempts := c.GRPC.Attempts; attempts != 0 {
			result = append(result, model.WithGRPCAttempts(attempts))
		}
		if delay := c.GRPC.AttemptsSleepTime; delay != 0 {
			result = append(result, model.WithGRPCAttemptsDelay(delay))
		}

		for backendName, backendURI := range so.ExternalGRPCBackends {
			result = append(result, model.WithExternalBackend(backendName, backendURI))
		}

		return append(result, opts...)
	}

	// getSeed returns the seed to use for the model described by c.
	//
	// When c.Seed is set to anything other than config.RAND_SEED, that value is
	// returned (converted to int32). When c.Seed is nil or equals
	// config.RAND_SEED, a fresh value from rand.Int31 is returned instead.
	func getSeed(c config.ModelConfig) int32 {
		if c.Seed != nil {
			if configured := int32(*c.Seed); configured != config.RAND_SEED {
				return configured
			}
		}

		return rand.Int31()
	}

	func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
	b := 512
	if c.Batch != 0 {
	b = c.Batch
	}

	flashAttention := "auto"

	if c.FlashAttention != nil {
	flashAttention = *c.FlashAttention
	}

	f16 := false
	if c.F16 != nil {
	f16 = *c.F16
	}

	embeddings := false
	if c.Embeddings != nil {
	embeddings = *c.Embeddings
	}

	lowVRAM := false
	if c.LowVRAM != nil {
	lowVRAM = *c.LowVRAM
	}

	reranking := false
	if c.Reranking != nil {
	reranking = *c.Reranking
	}

	mmap := false
	if c.MMap != nil {
	mmap = *c.MMap
	}

	ctxSize := 4096
	if c.ContextSize != nil {
	ctxSize = *c.ContextSize
	}

	mmlock := false
	if c.MMlock != nil {
	mmlock = *c.MMlock
	}

	nGPULayers := 9999999
	if c.NGPULayers != nil {
	nGPULayers = *c.NGPULayers
	}

	triggers := make([]*pb.GrammarTrigger, 0)
	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
	triggers = append(triggers, &pb.GrammarTrigger{
	Word: t.Word,
	})
	}

	opts := &pb.ModelOptions{
	CUDA: c.CUDA \|\| c.Diffusers.CUDA,
	SchedulerType: c.Diffusers.SchedulerType,
	GrammarTriggers: triggers,
	PipelineType: c.Diffusers.PipelineType,
	CFGScale: c.CFGScale,
	LoraAdapter: c.LoraAdapter,
	LoraScale: c.LoraScale,
	LoraAdapters: c.LoraAdapters,
	LoraScales: c.LoraScales,
	F16Memory: f16,
	LoraBase: c.LoraBase,
	IMG2IMG: c.Diffusers.IMG2IMG,
	CLIPModel: c.Diffusers.ClipModel,
	CLIPSubfolder: c.Diffusers.ClipSubFolder,
	Options: c.Options,
	Overrides: c.Overrides,
	CLIPSkip: int32(c.Diffusers.ClipSkip),
	ControlNet: c.Diffusers.ControlNet,
	ContextSize: int32(ctxSize),
	Seed: getSeed(c),
	NBatch: int32(b),
	NoMulMatQ: c.NoMulMatQ,
	DraftModel: c.DraftModel,
	AudioPath: c.AudioPath,
	Quantization: c.Quantization,
	LoadFormat: c.LoadFormat,
	GPUMemoryUtilization: c.GPUMemoryUtilization,
	TrustRemoteCode: c.TrustRemoteCode,
	EnforceEager: c.EnforceEager,
	SwapSpace: int32(c.SwapSpace),
	MaxModelLen: int32(c.MaxModelLen),
	TensorParallelSize: int32(c.TensorParallelSize),
	DisableLogStatus: c.DisableLogStatus,
	DType: c.DType,
	// LimitMMPerPrompt vLLM
	LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
	LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
	LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
	FlashAttention: flashAttention,
	CacheTypeKey: c.CacheTypeK,
	CacheTypeValue: c.CacheTypeV,
	NoKVOffload: c.NoKVOffloading,
	YarnExtFactor: c.YarnExtFactor,
	YarnAttnFactor: c.YarnAttnFactor,
	YarnBetaFast: c.YarnBetaFast,
	YarnBetaSlow: c.YarnBetaSlow,
	NGQA: c.NGQA,
	RMSNormEps: c.RMSNormEps,
	MLock: mmlock,
	RopeFreqBase: c.RopeFreqBase,
	RopeScaling: c.RopeScaling,
	Type: c.ModelType,
	RopeFreqScale: c.RopeFreqScale,
	NUMA: c.NUMA,
	Embeddings: embeddings,
	Reranking: reranking,
	LowVRAM: lowVRAM,
	NGPULayers: int32(nGPULayers),
	MMap: mmap,
	MainGPU: c.MainGPU,
	Threads: int32(*c.Threads),
	TensorSplit: c.TensorSplit,
	// RWKV
	Tokenizer: c.Tokenizer,
	}

	if c.MMProj != "" {
	opts.MMProj = filepath.Join(modelPath, c.MMProj)
	}

	return opts
	}

	// gRPCPredictOpts translates the model configuration c into the protobuf
	// PredictOptions message sent to a backend for an inference request.
	//
	// When c.PromptCachePath is set it is resolved against modelPath and its
	// parent directory is created (mode 0750). If the directory cannot be
	// created the error is logged and the prompt cache path is left empty, so
	// the request proceeds without a prompt cache.
	//
	// NOTE(review): Temperature, TopP, TopK, Maxtokens, Threads, F16, Debug,
	// Mirostat, MirostatETA, MirostatTAU, MMlock, MMap, TFZ and TypicalP are
	// dereferenced without a nil check, so this function panics if any of them
	// is nil. Presumably the caller applies configuration defaults first —
	// confirm against the call sites.
	func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions {
		promptCachePath := ""
		if c.PromptCachePath != "" {
			p := filepath.Join(modelPath, c.PromptCachePath)
			if err := os.MkdirAll(filepath.Dir(p), 0750); err != nil {
				// Log the path that was attempted; promptCachePath is still
				// empty here and would hide which folder failed.
				xlog.Error("error creating prompt cache folder", "error", err, "promptCachePath", p)
			} else {
				promptCachePath = p
			}
		}

		pbOpts := &pb.PredictOptions{
			Temperature: float32(*c.Temperature),
			TopP: float32(*c.TopP),
			NDraft: c.NDraft,
			TopK: int32(*c.TopK),
			Tokens: int32(*c.Maxtokens),
			Threads: int32(*c.Threads),
			PromptCacheAll: c.PromptCacheAll,
			PromptCacheRO: c.PromptCacheRO,
			PromptCachePath: promptCachePath,
			F16KV: *c.F16,
			DebugMode: *c.Debug,
			Grammar: c.Grammar,
			NegativePromptScale: c.NegativePromptScale,
			RopeFreqBase: c.RopeFreqBase,
			RopeFreqScale: c.RopeFreqScale,
			NegativePrompt: c.NegativePrompt,
			Mirostat: int32(*c.LLMConfig.Mirostat),
			MirostatETA: float32(*c.LLMConfig.MirostatETA),
			MirostatTAU: float32(*c.LLMConfig.MirostatTAU),
			Debug: *c.Debug,
			StopPrompts: c.StopWords,
			Repeat: int32(c.RepeatLastN),
			FrequencyPenalty: float32(c.FrequencyPenalty),
			PresencePenalty: float32(c.PresencePenalty),
			Penalty: float32(c.RepeatPenalty),
			NKeep: int32(c.Keep),
			Batch: int32(c.Batch),
			IgnoreEOS: c.IgnoreEOS,
			Seed: getSeed(c),
			MLock: *c.MMlock,
			MMap: *c.MMap,
			MainGPU: c.MainGPU,
			TensorSplit: c.TensorSplit,
			TailFreeSamplingZ: float32(*c.TFZ),
			TypicalP: float32(*c.TypicalP),
		}
		// Logprobs and TopLogprobs are set by the caller if provided
		return pbOpts
	}