File size: 4,117 Bytes
750bbe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
package cli

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"strings"

	"github.com/mudler/LocalAI/core/backend"
	cliContext "github.com/mudler/LocalAI/core/cli/context"
	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/gallery"
	"github.com/mudler/LocalAI/core/schema"
	"github.com/mudler/LocalAI/pkg/format"
	"github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/LocalAI/pkg/system"
	"github.com/mudler/xlog"
)

// TranscriptCMD holds the positional argument and flags of the transcript
// command, which transcribes an audio file with a given model and prints the
// result. The struct tags on each field declare the flag's short name,
// default value, environment variables and help text.
//
// NOTE(review): Backend and BackendGalleries are not read by Run in this
// file — confirm whether they are consumed elsewhere or can be wired in.
type TranscriptCMD struct {
	Filename string `arg:"" name:"file" help:"Audio file to transcribe" type:"path"`

	Backend          string                                 `short:"b" default:"whisper" help:"Backend to run the transcription model"`
	Model            string                                 `short:"m" required:"" help:"Model name to run the transcription"`
	Language         string                                 `short:"l" help:"Language of the audio file"`
	Translate        bool                                   `short:"c" help:"Translate the transcription to English"`
	Diarize          bool                                   `short:"d" help:"Mark speaker turns"`
	Threads          int                                    `short:"t" default:"1" help:"Number of threads used for parallel computation"`
	BackendsPath     string                                 `env:"LOCALAI_BACKENDS_PATH,BACKENDS_PATH" type:"path" default:"${basepath}/backends" help:"Path containing backends used for inferencing" group:"storage"`
	ModelsPath       string                                 `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
	BackendGalleries string                                 `env:"LOCALAI_BACKEND_GALLERIES,BACKEND_GALLERIES" help:"JSON list of backend galleries" group:"backends" default:"${backends}"`
	Prompt           string                                 `short:"p" help:"Previous transcribed text or words that hint at what the model should expect"`
	ResponseFormat   schema.TranscriptionResponseFormatType `short:"f" default:"" help:"Response format for Whisper models, can be one of (txt, lrc, srt, vtt, json, json_verbose)"`
	PrettyPrint      bool                                   `help:"Used with response_format json or json_verbose for pretty printing"`
}

// Run executes the transcript command. It builds the system state from
// BackendsPath and ModelsPath, loads the model configs from ModelsPath, looks
// up the config for Model, runs the transcription of Filename and prints the
// result to standard output according to ResponseFormat:
//
//   - lrc, srt, vtt and text formats are rendered by format.TranscriptionResponse;
//   - json prints the result as JSON with the segments dropped;
//   - json_verbose prints the full result as JSON;
//   - any other value (including the empty default) prints one line per
//     segment as "<start> - <text>".
//
// PrettyPrint indents the JSON output. An error is returned when the system
// state cannot be built, the model configs cannot be loaded, the model has no
// config, the transcription fails, or the JSON encoding fails. A failure to
// register external backends is only logged.
func (t *TranscriptCMD) Run(ctx *cliContext.Context) error {
	systemState, err := system.GetSystemState(
		system.WithBackendPath(t.BackendsPath),
		system.WithModelPath(t.ModelsPath),
	)
	if err != nil {
		return fmt.Errorf("initializing system state: %w", err)
	}
	opts := &config.ApplicationConfig{
		SystemState: systemState,
		Context:     context.Background(),
	}

	cl := config.NewModelConfigLoader(t.ModelsPath)
	ml := model.NewModelLoader(systemState)

	// Deliberately best effort: a registration failure is logged and the
	// command carries on.
	if err := gallery.RegisterBackends(systemState, ml); err != nil {
		xlog.Error("error registering external backends", "error", err)
	}

	if err := cl.LoadModelConfigsFromPath(t.ModelsPath); err != nil {
		return fmt.Errorf("loading model configs from %q: %w", t.ModelsPath, err)
	}

	c, exists := cl.GetModelConfig(t.Model)
	if !exists {
		// Name the model so the user can tell which lookup failed.
		return errors.New("model not found: " + t.Model)
	}

	// The command-line thread count overrides whatever the model config had.
	c.Threads = &t.Threads

	// Deferred so StopAllGRPC runs on every return path from here on.
	defer func() {
		if err := ml.StopAllGRPC(); err != nil {
			xlog.Error("unable to stop all grpc processes", "error", err)
		}
	}()

	tr, err := backend.ModelTranscription(t.Filename, t.Language, t.Translate, t.Diarize, t.Prompt, ml, c, opts)
	if err != nil {
		return fmt.Errorf("transcribing %q: %w", t.Filename, err)
	}

	switch t.ResponseFormat {
	case schema.TranscriptionResponseFormatLrc, schema.TranscriptionResponseFormatSrt, schema.TranscriptionResponseFormatVtt, schema.TranscriptionResponseFormatText:
		fmt.Println(format.TranscriptionResponse(tr, t.ResponseFormat))
	case schema.TranscriptionResponseFormatJson:
		// Plain json omits the segments; the encoding itself is shared with
		// json_verbose below.
		tr.Segments = nil
		fallthrough
	case schema.TranscriptionResponseFormatJsonVerbose:
		var out []byte
		if t.PrettyPrint {
			out, err = json.MarshalIndent(tr, "", "    ")
		} else {
			out, err = json.Marshal(tr)
		}
		if err != nil {
			return fmt.Errorf("encoding transcription as JSON: %w", err)
		}
		fmt.Println(string(out))
	default:
		for _, segment := range tr.Segments {
			fmt.Println(segment.Start.String(), "-", strings.TrimSpace(segment.Text))
		}
	}
	return nil
}