File size: 2,675 Bytes
a57f260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
//
// SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
// SPDX-License-Identifier: Apache-2.0
//

#ifndef POCKET_TTS_AUDIO_PROCESSOR_HPP
#define POCKET_TTS_AUDIO_PROCESSOR_HPP

#include "memory_pool.hpp"
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

namespace pocket_tts_accelerator {

// In-memory mirror of a canonical 44-byte RIFF/WAVE header: a RIFF chunk
// descriptor, a format sub-chunk and the start of the data sub-chunk, in
// declaration order. Field names follow the RIFF/WAVE layout; which values are
// accepted is decided by AudioProcessor::validate_wav_header().
//
// NOTE(review): multi-byte fields are little-endian in the file format; this
// struct performs no byte-order conversion itself -- confirm how the
// implementation fills it on big-endian targets.
struct WavFileHeader {
    char riff_marker[4];               // RIFF chunk identifier (4 bytes, not NUL-terminated)
    std::uint32_t file_size;           // RIFF chunk size field
    char wave_marker[4];               // RIFF form type identifier
    char format_marker[4];             // format sub-chunk identifier
    std::uint32_t format_chunk_size;   // size of the format sub-chunk payload
    std::uint16_t audio_format;        // sample encoding tag
    std::uint16_t number_of_channels;  // channel count
    std::uint32_t sample_rate;         // sampling frequency field
    std::uint32_t byte_rate;           // byte-rate field
    std::uint16_t block_align;         // block-alignment field
    std::uint16_t bits_per_sample;     // bit depth of one sample
    char data_marker[4];               // data sub-chunk identifier
    std::uint32_t data_size;           // size of the data sub-chunk payload
};

// Every member already sits on its natural alignment, so the compiler is not
// expected to insert padding. Pin that down so a layout change (reordered or
// added members, unusual packing flags) fails the build instead of silently
// breaking any code that treats this struct as the raw on-disk header.
static_assert(sizeof(WavFileHeader) == 44,
              "WavFileHeader must match the 44-byte canonical WAV header with no padding");

// Decoded audio held as 16-bit signed samples together with its format
// description and a validity flag.
//
// All scalar members carry default initializers so that a default-constructed
// AudioData is in a well-defined "empty and invalid" state rather than holding
// indeterminate values. The struct remains an aggregate (C++14 and later), so
// brace-initialization by callers keeps working.
struct AudioData {
    // Sample storage. NOTE(review): presumably interleaved by channel when
    // number_of_channels > 1 -- confirm against the implementation.
    std::vector<std::int16_t> samples;
    std::uint32_t sample_rate = 0;          // sampling frequency of `samples`
    std::uint16_t number_of_channels = 0;   // channel count of `samples`
    std::uint16_t bits_per_sample = 0;      // bit depth associated with the audio
    bool is_valid = false;                  // false until a producer marks the data usable
    std::string error_message;              // human-readable failure text; empty by default
};

// Outcome of a single AudioProcessor operation: the produced samples, their
// sample rate, and a success flag with an accompanying message.
//
// Scalar members carry default initializers so that a default-constructed
// result reads as "failed, no output" instead of containing indeterminate
// values. The struct remains an aggregate (C++14 and later).
struct AudioProcessingResult {
    std::vector<std::int16_t> processed_samples;  // output samples; empty by default
    std::uint32_t output_sample_rate = 0;         // sampling frequency of `processed_samples`
    bool success = false;                         // false until the operation reports success
    std::string error_message;                    // human-readable failure text; empty by default
};

// Declares WAV file I/O and sample-processing operations that consume
// AudioData and report their outcome as AudioData / AudioProcessingResult.
// Only the interface is declared here; no member function is defined in this
// header.
class AudioProcessor {
public:
    // Constructs a processor from a caller-provided MemoryPool. The class holds
    // a MemoryPool reference member (memory_pool).
    // NOTE(review): presumably that member is bound to shared_memory_pool, in
    // which case the pool must outlive this object -- confirm in the
    // constructor definition.
    explicit AudioProcessor(MemoryPool& shared_memory_pool);
    ~AudioProcessor();

    // Copying is disabled. Because the copy operations are user-declared, no
    // implicit move operations are generated either, so instances can be
    // neither copied nor moved.
    AudioProcessor(const AudioProcessor&) = delete;
    AudioProcessor& operator=(const AudioProcessor&) = delete;

    // Reads the WAV file at file_path and returns it as AudioData.
    // NOTE(review): failures are presumably reported through
    // AudioData::is_valid / error_message rather than exceptions -- confirm.
    AudioData read_wav_file(const std::string& file_path);

    // Writes audio_data to file_path as a WAV file; the bool return reports
    // the outcome (presumably true on success -- confirm).
    bool write_wav_file(const std::string& file_path, const AudioData& audio_data);

    // Individual processing steps. Each takes the input by const reference and
    // returns a new AudioProcessingResult, so the input object is not modified.
    // The exact algorithms (mixing, resampling method, peak scaling) are
    // defined in the implementation, not in this header.
    AudioProcessingResult convert_to_mono(const AudioData& input_audio);
    AudioProcessingResult convert_to_pcm_int16(const AudioData& input_audio);
    AudioProcessingResult resample_audio(const AudioData& input_audio, std::uint32_t target_sample_rate);
    // NOTE(review): the unit/range of target_peak_level (e.g. 0.0-1.0 of full
    // scale) is not visible here -- confirm before relying on it.
    AudioProcessingResult normalize_audio(const AudioData& input_audio, float target_peak_level);

    // File-to-file entry point for preparing audio for voice cloning: takes an
    // input path and an output path and returns an AudioProcessingResult.
    // NOTE(review): which of the steps above it applies, and in what order, is
    // defined in the implementation -- confirm there.
    AudioProcessingResult process_audio_for_voice_cloning(
        const std::string& input_file_path,
        const std::string& output_file_path
    );

    // Static helpers; callable without an AudioProcessor instance.
    // validate_wav_header: checks a parsed header and returns a bool verdict
    // (acceptance criteria are defined in the implementation).
    static bool validate_wav_header(const WavFileHeader& header);
    // calculate_audio_duration_milliseconds: returns a duration for audio_data
    // as std::size_t; per its name the unit is milliseconds.
    static std::size_t calculate_audio_duration_milliseconds(const AudioData& audio_data);

private:
    // Sample-format converters. Each takes a read-only input buffer, a
    // writable int16 output buffer and a sample count.
    // NOTE(review): presumably both buffers must hold at least sample_count
    // elements; scaling and clipping rules live in the implementation.
    void convert_float32_to_int16(const float* input, std::int16_t* output, std::size_t sample_count);
    void convert_int32_to_int16(const std::int32_t* input, std::int16_t* output, std::size_t sample_count);
    void convert_uint8_to_int16(const std::uint8_t* input, std::int16_t* output, std::size_t sample_count);
    // Channel down-mix helper operating on raw int16 buffers.
    // NOTE(review): presumably input holds frame_count * channel_count
    // interleaved samples and output holds frame_count samples -- confirm.
    void mix_channels_to_mono(const std::int16_t* input, std::int16_t* output, std::size_t frame_count, std::uint16_t channel_count);

    // Non-owning reference to the pool supplied by the caller. Being a
    // reference member, it cannot be reseated after construction.
    MemoryPool& memory_pool;
};

}

#endif