// Copyright 2025 The ODML Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_MINIAUDIO_H_ #define THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_MINIAUDIO_H_ #include #include #include #include "absl/log/absl_check.h" // from @com_google_absl #include "absl/status/status.h" // from @com_google_absl #include "absl/status/statusor.h" // from @com_google_absl #include "absl/strings/string_view.h" // from @com_google_absl #include "absl/types/span.h" // from @com_google_absl #include "runtime/components/preprocessor/audio_preprocessor.h" #include "runtime/components/preprocessor/mel_filterbank.h" #include "runtime/engine/io_types.h" namespace litert::lm { // Audio preprocessor implementation using MiniAudio library and kissfft // library. class AudioPreprocessorMiniAudio : public AudioPreprocessor { public: // Creates an AudioPreprocessorMiniAudio instance. // Args: // - config: The configuration of the audio preprocessor. // Returns: // A unique pointer to the AudioPreprocessorMiniAudio instance. static absl::StatusOr> Create( const AudioPreprocessorConfig& config); // Decodes the raw audio bytes to PCM frames using MiniAudio library. // Args: // - audio_bytes: The raw audio bytes read from the audio file to decode. // The supported audio format from MiniAudio library is WAV, FLAC and MP3. // - num_channels: The number of channels in the audio to be decoded. Only // mono audio (num_channels = 1) is supported for now. // - sample_rate_hz: The sample rate of the audio to be resampled to. // - pcm_frames: The decoded PCM frames. static absl::Status DecodeAudio(absl::string_view audio_bytes, int num_channels, int sample_rate_hz, std::vector& pcm_frames); // Preprocesses the undecoded raw audio bytes and returns the preprocessed // audio mel spectrograms. // Args: // - input_audio: The input audio to preprocess. The supported audio // format from MiniAudio library is WAV, FLAC and MP3. // If the input audio is already preprocessed, it will be returned // directly. // Returns: // An InputAudio object containing the preprocessed audio mel spectrograms // with shape (1, num_frames, num_mel_bins). absl::StatusOr Preprocess(const InputAudio& input_audio) override; // Resets the preprocessor to its initial state. void Reset() override { input_queue_.clear(); if (config_.GetSemicausalPadding()) { samples_to_next_step_ = config_.GetFrameLength() - config_.GetHopLength(); input_queue_.resize(config_.GetHopLength(), 0.0f); } else { samples_to_next_step_ = config_.GetFrameLength(); } } // Copy constructor for cloning the audio preprocessor. AudioPreprocessorMiniAudio(const AudioPreprocessorMiniAudio& other) : config_(other.config_), mel_filterbank_(nullptr), input_queue_(other.input_queue_), samples_to_next_step_(other.samples_to_next_step_) { mel_filterbank_ = std::make_unique(); ABSL_CHECK_OK(mel_filterbank_->Initialize( other.config_.GetFftBins(), other.config_.GetSampleRateHz(), other.config_.GetNumMelBins(), other.config_.GetMelLowHz(), other.config_.GetMelHighHz())); } // Copy assignment operator for cloning the audio preprocessor. AudioPreprocessorMiniAudio& operator=( const AudioPreprocessorMiniAudio& other) { config_ = other.config_; mel_filterbank_ = std::make_unique(); ABSL_CHECK_OK(mel_filterbank_->Initialize( other.config_.GetFftBins(), other.config_.GetSampleRateHz(), other.config_.GetNumMelBins(), other.config_.GetMelLowHz(), other.config_.GetMelHighHz())); input_queue_ = other.input_queue_; samples_to_next_step_ = other.samples_to_next_step_; return *this; } private: explicit AudioPreprocessorMiniAudio( const AudioPreprocessorConfig& config, std::unique_ptr mel_filterbank) : config_(config), mel_filterbank_(std::move(mel_filterbank)), input_queue_(std::vector()) { if (config.GetSemicausalPadding()) { samples_to_next_step_ = config.GetFrameLength() - config.GetHopLength(); input_queue_.resize(config.GetHopLength(), 0.0f); } else { samples_to_next_step_ = config.GetFrameLength(); } } absl::Status PcmFramesToSpectrogram(absl::Span pcm_frames, std::vector& spectrograms); absl::Status ToLogMelSpectrogram(const std::vector& spectrograms, std::vector& log_mel_spectrograms); bool GetNextWindowOfSamples(const std::vector& pcm_frames, int& input_start); AudioPreprocessorConfig config_; std::unique_ptr mel_filterbank_; std::vector input_queue_; int samples_to_next_step_; }; } // namespace litert::lm #endif // THIRD_PARTY_ODML_LITERT_LM_RUNTIME_COMPONENTS_PREPROCESSOR_AUDIO_PREPROCESSOR_MINIAUDIO_H_