| | #include "gowhisper.h" |
| | #include "ggml-backend.h" |
| | #include "whisper.h" |
| | #include <vector> |
| |
|
// Process-wide handles used by the exported functions in this file.
// Both start out null; load_model() assigns ctx and load_model_vad()
// assigns vctx.
static struct whisper_context *ctx = nullptr;
static struct whisper_vad_context *vctx = nullptr;

// Backing storage for the values vad() hands back to its caller: one
// (t0, t1) pair of floats per detected segment, stored consecutively.
static std::vector<float> flat_segs;
| |
|
| | static void ggml_log_cb(enum ggml_log_level level, const char *log, |
| | void *data) { |
| | const char *level_str; |
| |
|
| | if (!log) { |
| | return; |
| | } |
| |
|
| | switch (level) { |
| | case GGML_LOG_LEVEL_DEBUG: |
| | level_str = "DEBUG"; |
| | break; |
| | case GGML_LOG_LEVEL_INFO: |
| | level_str = "INFO"; |
| | break; |
| | case GGML_LOG_LEVEL_WARN: |
| | level_str = "WARN"; |
| | break; |
| | case GGML_LOG_LEVEL_ERROR: |
| | level_str = "ERROR"; |
| | break; |
| | default: |
| | level_str = "?????"; |
| | break; |
| | } |
| |
|
| | fprintf(stderr, "[%-5s] ", level_str); |
| | fputs(log, stderr); |
| | fflush(stderr); |
| | } |
| |
|
| | int load_model(const char *const model_path) { |
| | whisper_log_set(ggml_log_cb, nullptr); |
| | ggml_backend_load_all(); |
| |
|
| | struct whisper_context_params cparams = whisper_context_default_params(); |
| |
|
| | ctx = whisper_init_from_file_with_params(model_path, cparams); |
| | if (ctx == nullptr) { |
| | fprintf(stderr, "error: Also failed to init model as transcriber\n"); |
| | return 1; |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
| | int load_model_vad(const char *const model_path) { |
| | whisper_log_set(ggml_log_cb, nullptr); |
| | ggml_backend_load_all(); |
| |
|
| | struct whisper_vad_context_params vcparams = |
| | whisper_vad_default_context_params(); |
| |
|
| | |
| | |
| |
|
| | vctx = whisper_vad_init_from_file_with_params(model_path, vcparams); |
| | if (vctx == nullptr) { |
| | fprintf(stderr, "error: Failed to init model as VAD\n"); |
| | return 1; |
| | } |
| |
|
| | return 0; |
| | } |
| |
|
| | int vad(float pcmf32[], size_t pcmf32_len, float **segs_out, |
| | size_t *segs_out_len) { |
| | if (!whisper_vad_detect_speech(vctx, pcmf32, pcmf32_len)) { |
| | fprintf(stderr, "error: failed to detect speech\n"); |
| | return 1; |
| | } |
| |
|
| | struct whisper_vad_params params = whisper_vad_default_params(); |
| | struct whisper_vad_segments *segs = |
| | whisper_vad_segments_from_probs(vctx, params); |
| | size_t segn = whisper_vad_segments_n_segments(segs); |
| |
|
| | |
| |
|
| | flat_segs.clear(); |
| |
|
| | for (int i = 0; i < segn; i++) { |
| | flat_segs.push_back(whisper_vad_segments_get_segment_t0(segs, i)); |
| | flat_segs.push_back(whisper_vad_segments_get_segment_t1(segs, i)); |
| | } |
| |
|
| | |
| | |
| | |
| | *segs_out = flat_segs.data(); |
| | *segs_out_len = flat_segs.size(); |
| |
|
| | |
| | whisper_vad_free_segments(segs); |
| |
|
| | |
| | return 0; |
| | } |
| |
|
| | int transcribe(uint32_t threads, char *lang, bool translate, bool tdrz, |
| | float pcmf32[], size_t pcmf32_len, size_t *segs_out_len, char *prompt) { |
| | whisper_full_params wparams = |
| | whisper_full_default_params(WHISPER_SAMPLING_GREEDY); |
| |
|
| | wparams.n_threads = threads; |
| | if (*lang != '\0') |
| | wparams.language = lang; |
| | else { |
| | wparams.language = nullptr; |
| | } |
| |
|
| | wparams.translate = translate; |
| | wparams.debug_mode = true; |
| | wparams.print_progress = true; |
| | wparams.tdrz_enable = tdrz; |
| | wparams.initial_prompt = prompt; |
| |
|
| | fprintf(stderr, "info: Enable tdrz: %d\n", tdrz); |
| | fprintf(stderr, "info: Initial prompt: \"%s\"\n", prompt); |
| |
|
| | if (whisper_full(ctx, wparams, pcmf32, pcmf32_len)) { |
| | fprintf(stderr, "error: transcription failed\n"); |
| | return 1; |
| | } |
| |
|
| | *segs_out_len = whisper_full_n_segments(ctx); |
| |
|
| | return 0; |
| | } |
| |
|
| | const char *get_segment_text(int i) { |
| | return whisper_full_get_segment_text(ctx, i); |
| | } |
| |
|
| | int64_t get_segment_t0(int i) { return whisper_full_get_segment_t0(ctx, i); } |
| |
|
| | int64_t get_segment_t1(int i) { return whisper_full_get_segment_t1(ctx, i); } |
| |
|
| | int n_tokens(int i) { return whisper_full_n_tokens(ctx, i); } |
| |
|
| | int32_t get_token_id(int i, int j) { |
| | return whisper_full_get_token_id(ctx, i, j); |
| | } |
| |
|
| | bool get_segment_speaker_turn_next(int i) { |
| | return whisper_full_get_segment_speaker_turn_next(ctx, i); |
| | } |
| |
|