hadadrjt committed on
Commit
1a63d8d
·
1 Parent(s): d011224

Pocket TTS: Switch to simple demo.

Browse files
.dockerignore DELETED
@@ -1,3 +0,0 @@
1
- Dockerfile
2
- LICENSE
3
- README.md
 
 
 
 
Dockerfile CHANGED
@@ -3,19 +3,4 @@
3
  # SPDX-License-Identifier: Apache-2.0
4
  #
5
 
6
- FROM hadadrjt/pocket-tts:hf-20260121
7
-
8
- WORKDIR /app
9
-
10
- COPY . .
11
-
12
- RUN mkdir build \
13
- && cd build \
14
- && cmake \
15
- -DCMAKE_BUILD_TYPE=Release \
16
- -DCMAKE_INSTALL_PREFIX=/app \
17
- ../accelerator \
18
- && make -j$(nproc) \
19
- && make install \
20
- && cd .. \
21
- && rm -rf accelerator build
 
3
  # SPDX-License-Identifier: Apache-2.0
4
  #
5
 
6
+ FROM hadadrjt/pocket-tts:hf-simple-demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -8,4 +8,7 @@ colorTo: yellow
8
  sdk: docker
9
  app_port: 7860
10
  pinned: false
 
 
 
11
  ---
 
8
  sdk: docker
9
  app_port: 7860
10
  pinned: false
11
+ models:
12
+ - kyutai/pocket-tts
13
+ - kyutai/tts-voices
14
  ---
accelerator/CMakeLists.txt DELETED
@@ -1,36 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- cmake_minimum_required(VERSION 3.31.6)
7
-
8
- project(pocket_tts_accelerator VERSION 0.0.0 LANGUAGES CXX)
9
-
10
- set(CMAKE_CXX_STANDARD 17)
11
- set(CMAKE_CXX_STANDARD_REQUIRED ON)
12
- set(CMAKE_CXX_EXTENSIONS OFF)
13
-
14
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -ffast-math -funroll-loops")
15
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wpedantic")
16
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
17
-
18
- find_package(Threads REQUIRED)
19
-
20
- set(ACCELERATOR_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/include)
21
-
22
- set(ACCELERATOR_SOURCES
23
- src/main.cpp
24
- src/accelerator_core.cpp
25
- src/audio_processor.cpp
26
- src/ipc_handler.cpp
27
- src/memory_pool.cpp
28
- src/thread_pool.cpp
29
- )
30
-
31
- add_executable(pocket_tts_accelerator ${ACCELERATOR_SOURCES})
32
-
33
- target_include_directories(pocket_tts_accelerator PRIVATE ${ACCELERATOR_INCLUDE_DIR})
34
- target_link_libraries(pocket_tts_accelerator PRIVATE Threads::Threads)
35
-
36
- install(TARGETS pocket_tts_accelerator DESTINATION bin)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/include/accelerator_core.hpp DELETED
@@ -1,69 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #ifndef POCKET_TTS_ACCELERATOR_CORE_HPP
7
- #define POCKET_TTS_ACCELERATOR_CORE_HPP
8
-
9
- #include "audio_processor.hpp"
10
- #include "ipc_handler.hpp"
11
- #include "memory_pool.hpp"
12
- #include "thread_pool.hpp"
13
- #include <atomic>
14
- #include <memory>
15
- #include <string>
16
-
17
- namespace pocket_tts_accelerator {
18
-
19
- struct AcceleratorConfiguration {
20
- std::size_t number_of_worker_threads;
21
- std::size_t memory_pool_size_bytes;
22
- std::string ipc_socket_path;
23
- bool enable_verbose_logging;
24
- };
25
-
26
- class AcceleratorCore {
27
- public:
28
- explicit AcceleratorCore(const AcceleratorConfiguration& configuration);
29
- ~AcceleratorCore();
30
-
31
- AcceleratorCore(const AcceleratorCore&) = delete;
32
- AcceleratorCore& operator=(const AcceleratorCore&) = delete;
33
-
34
- bool initialize();
35
- void run();
36
- void shutdown();
37
-
38
- bool is_running() const;
39
- std::string get_status_string() const;
40
-
41
- static AcceleratorConfiguration get_default_configuration();
42
-
43
- private:
44
- void register_all_command_handlers();
45
- void setup_signal_handlers();
46
-
47
- std::vector<std::uint8_t> handle_ping_command(const std::vector<std::uint8_t>& payload);
48
- std::vector<std::uint8_t> handle_process_audio_command(const std::vector<std::uint8_t>& payload);
49
- std::vector<std::uint8_t> handle_convert_to_mono_command(const std::vector<std::uint8_t>& payload);
50
- std::vector<std::uint8_t> handle_convert_to_pcm_command(const std::vector<std::uint8_t>& payload);
51
- std::vector<std::uint8_t> handle_resample_audio_command(const std::vector<std::uint8_t>& payload);
52
- std::vector<std::uint8_t> handle_get_memory_stats_command(const std::vector<std::uint8_t>& payload);
53
- std::vector<std::uint8_t> handle_clear_memory_pool_command(const std::vector<std::uint8_t>& payload);
54
- std::vector<std::uint8_t> handle_shutdown_command(const std::vector<std::uint8_t>& payload);
55
-
56
- void log_message(const std::string& message) const;
57
-
58
- AcceleratorConfiguration config;
59
- std::unique_ptr<MemoryPool> memory_pool;
60
- std::unique_ptr<ThreadPool> thread_pool;
61
- std::unique_ptr<AudioProcessor> audio_processor;
62
- std::unique_ptr<IpcHandler> ipc_handler;
63
- std::atomic<bool> is_initialized;
64
- std::atomic<bool> should_shutdown;
65
- };
66
-
67
- }
68
-
69
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/include/audio_processor.hpp DELETED
@@ -1,84 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #ifndef POCKET_TTS_AUDIO_PROCESSOR_HPP
7
- #define POCKET_TTS_AUDIO_PROCESSOR_HPP
8
-
9
- #include "memory_pool.hpp"
10
- #include <cstddef>
11
- #include <cstdint>
12
- #include <string>
13
- #include <vector>
14
-
15
- namespace pocket_tts_accelerator {
16
-
17
- struct WavFileHeader {
18
- char riff_marker[4];
19
- std::uint32_t file_size;
20
- char wave_marker[4];
21
- char format_marker[4];
22
- std::uint32_t format_chunk_size;
23
- std::uint16_t audio_format;
24
- std::uint16_t number_of_channels;
25
- std::uint32_t sample_rate;
26
- std::uint32_t byte_rate;
27
- std::uint16_t block_align;
28
- std::uint16_t bits_per_sample;
29
- char data_marker[4];
30
- std::uint32_t data_size;
31
- };
32
-
33
- struct AudioData {
34
- std::vector<std::int16_t> samples;
35
- std::uint32_t sample_rate;
36
- std::uint16_t number_of_channels;
37
- std::uint16_t bits_per_sample;
38
- bool is_valid;
39
- std::string error_message;
40
- };
41
-
42
- struct AudioProcessingResult {
43
- std::vector<std::int16_t> processed_samples;
44
- std::uint32_t output_sample_rate;
45
- bool success;
46
- std::string error_message;
47
- };
48
-
49
- class AudioProcessor {
50
- public:
51
- explicit AudioProcessor(MemoryPool& shared_memory_pool);
52
- ~AudioProcessor();
53
-
54
- AudioProcessor(const AudioProcessor&) = delete;
55
- AudioProcessor& operator=(const AudioProcessor&) = delete;
56
-
57
- AudioData read_wav_file(const std::string& file_path);
58
- bool write_wav_file(const std::string& file_path, const AudioData& audio_data);
59
-
60
- AudioProcessingResult convert_to_mono(const AudioData& input_audio);
61
- AudioProcessingResult convert_to_pcm_int16(const AudioData& input_audio);
62
- AudioProcessingResult resample_audio(const AudioData& input_audio, std::uint32_t target_sample_rate);
63
- AudioProcessingResult normalize_audio(const AudioData& input_audio, float target_peak_level);
64
-
65
- AudioProcessingResult process_audio_for_voice_cloning(
66
- const std::string& input_file_path,
67
- const std::string& output_file_path
68
- );
69
-
70
- static bool validate_wav_header(const WavFileHeader& header);
71
- static std::size_t calculate_audio_duration_milliseconds(const AudioData& audio_data);
72
-
73
- private:
74
- void convert_float32_to_int16(const float* input, std::int16_t* output, std::size_t sample_count);
75
- void convert_int32_to_int16(const std::int32_t* input, std::int16_t* output, std::size_t sample_count);
76
- void convert_uint8_to_int16(const std::uint8_t* input, std::int16_t* output, std::size_t sample_count);
77
- void mix_channels_to_mono(const std::int16_t* input, std::int16_t* output, std::size_t frame_count, std::uint16_t channel_count);
78
-
79
- MemoryPool& memory_pool;
80
- };
81
-
82
- }
83
-
84
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/include/ipc_handler.hpp DELETED
@@ -1,107 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #ifndef POCKET_TTS_IPC_HANDLER_HPP
7
- #define POCKET_TTS_IPC_HANDLER_HPP
8
-
9
- #include <atomic>
10
- #include <cstddef>
11
- #include <cstdint>
12
- #include <functional>
13
- #include <memory>
14
- #include <mutex>
15
- #include <string>
16
- #include <thread>
17
- #include <vector>
18
-
19
- namespace pocket_tts_accelerator {
20
-
21
- enum class CommandType : std::uint32_t {
22
- PING = 0,
23
- PROCESS_AUDIO = 1,
24
- CONVERT_TO_MONO = 2,
25
- CONVERT_TO_PCM = 3,
26
- RESAMPLE_AUDIO = 4,
27
- GET_MEMORY_STATS = 5,
28
- CLEAR_MEMORY_POOL = 6,
29
- SHUTDOWN = 7,
30
- UNKNOWN = 255
31
- };
32
-
33
- enum class ResponseStatus : std::uint32_t {
34
- SUCCESS = 0,
35
- ERROR_INVALID_COMMAND = 1,
36
- ERROR_FILE_NOT_FOUND = 2,
37
- ERROR_PROCESSING_FAILED = 3,
38
- ERROR_MEMORY_ALLOCATION = 4,
39
- ERROR_INTERNAL = 5
40
- };
41
-
42
- struct RequestHeader {
43
- std::uint32_t magic_number;
44
- std::uint32_t command_type;
45
- std::uint32_t payload_size;
46
- std::uint32_t request_id;
47
- };
48
-
49
- struct ResponseHeader {
50
- std::uint32_t magic_number;
51
- std::uint32_t status_code;
52
- std::uint32_t payload_size;
53
- std::uint32_t request_id;
54
- };
55
-
56
- struct ProcessAudioRequest {
57
- char input_file_path[512];
58
- char output_file_path[512];
59
- std::uint32_t target_sample_rate;
60
- std::uint32_t options_flags;
61
- };
62
-
63
- struct MemoryStatsResponse {
64
- std::uint64_t total_allocated_bytes;
65
- std::uint64_t total_used_bytes;
66
- std::uint64_t block_count;
67
- };
68
-
69
- class IpcHandler {
70
- public:
71
- using CommandHandlerFunction = std::function<std::vector<std::uint8_t>(const std::vector<std::uint8_t>&)>;
72
-
73
- explicit IpcHandler(const std::string& socket_path);
74
- ~IpcHandler();
75
-
76
- IpcHandler(const IpcHandler&) = delete;
77
- IpcHandler& operator=(const IpcHandler&) = delete;
78
-
79
- bool start_server();
80
- void stop_server();
81
- bool is_running() const;
82
-
83
- void register_command_handler(CommandType command_type, CommandHandlerFunction handler);
84
- void set_shutdown_callback(std::function<void()> callback);
85
-
86
- static constexpr std::uint32_t PROTOCOL_MAGIC_NUMBER = 0x50545453;
87
- static constexpr std::size_t MAXIMUM_PAYLOAD_SIZE = 16 * 1024 * 1024;
88
- static constexpr int CONNECTION_BACKLOG = 5;
89
-
90
- private:
91
- void accept_connections_loop();
92
- void handle_client_connection(int client_socket_fd);
93
- bool send_response(int socket_fd, const ResponseHeader& header, const std::vector<std::uint8_t>& payload);
94
- bool receive_request(int socket_fd, RequestHeader& header, std::vector<std::uint8_t>& payload);
95
-
96
- std::string socket_file_path;
97
- int server_socket_fd;
98
- std::atomic<bool> is_server_running;
99
- std::thread accept_thread;
100
- std::mutex handlers_mutex;
101
- std::unordered_map<CommandType, CommandHandlerFunction> command_handlers;
102
- std::function<void()> shutdown_callback;
103
- };
104
-
105
- }
106
-
107
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/include/memory_pool.hpp DELETED
@@ -1,79 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #ifndef POCKET_TTS_MEMORY_POOL_HPP
7
- #define POCKET_TTS_MEMORY_POOL_HPP
8
-
9
- #include <atomic>
10
- #include <cstddef>
11
- #include <cstdint>
12
- #include <memory>
13
- #include <mutex>
14
- #include <unordered_map>
15
- #include <vector>
16
-
17
- namespace pocket_tts_accelerator {
18
-
19
- struct MemoryBlock {
20
- std::unique_ptr<std::uint8_t[]> data;
21
- std::size_t block_size;
22
- bool is_in_use;
23
- std::uint64_t last_access_timestamp;
24
- };
25
-
26
- class MemoryPool {
27
- public:
28
- explicit MemoryPool(std::size_t initial_pool_size_bytes = 64 * 1024 * 1024);
29
- ~MemoryPool();
30
-
31
- MemoryPool(const MemoryPool&) = delete;
32
- MemoryPool& operator=(const MemoryPool&) = delete;
33
- MemoryPool(MemoryPool&&) = delete;
34
- MemoryPool& operator=(MemoryPool&&) = delete;
35
-
36
- std::uint8_t* allocate(std::size_t requested_size_bytes);
37
- void deallocate(std::uint8_t* pointer);
38
- void clear_unused_blocks();
39
- void reset_pool();
40
-
41
- std::size_t get_total_allocated_bytes() const;
42
- std::size_t get_total_used_bytes() const;
43
- std::size_t get_block_count() const;
44
-
45
- private:
46
- std::size_t find_suitable_block_index(std::size_t requested_size) const;
47
- void create_new_block(std::size_t block_size);
48
- std::uint64_t get_current_timestamp() const;
49
-
50
- std::vector<MemoryBlock> memory_blocks;
51
- std::unordered_map<std::uint8_t*, std::size_t> pointer_to_block_index;
52
- mutable std::mutex pool_mutex;
53
- std::size_t total_allocated_bytes;
54
- std::size_t total_used_bytes;
55
- std::size_t maximum_pool_size_bytes;
56
- };
57
-
58
- class ScopedMemoryAllocation {
59
- public:
60
- ScopedMemoryAllocation(MemoryPool& pool, std::size_t size);
61
- ~ScopedMemoryAllocation();
62
-
63
- ScopedMemoryAllocation(const ScopedMemoryAllocation&) = delete;
64
- ScopedMemoryAllocation& operator=(const ScopedMemoryAllocation&) = delete;
65
- ScopedMemoryAllocation(ScopedMemoryAllocation&& other) noexcept;
66
- ScopedMemoryAllocation& operator=(ScopedMemoryAllocation&& other) noexcept;
67
-
68
- std::uint8_t* get() const;
69
- std::size_t size() const;
70
-
71
- private:
72
- MemoryPool* memory_pool_pointer;
73
- std::uint8_t* allocated_pointer;
74
- std::size_t allocation_size;
75
- };
76
-
77
- }
78
-
79
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/include/thread_pool.hpp DELETED
@@ -1,83 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #ifndef POCKET_TTS_THREAD_POOL_HPP
7
- #define POCKET_TTS_THREAD_POOL_HPP
8
-
9
- #include <atomic>
10
- #include <condition_variable>
11
- #include <functional>
12
- #include <future>
13
- #include <memory>
14
- #include <mutex>
15
- #include <queue>
16
- #include <thread>
17
- #include <vector>
18
-
19
- namespace pocket_tts_accelerator {
20
-
21
- class ThreadPool {
22
- public:
23
- explicit ThreadPool(std::size_t number_of_threads);
24
- ~ThreadPool();
25
-
26
- ThreadPool(const ThreadPool&) = delete;
27
- ThreadPool& operator=(const ThreadPool&) = delete;
28
- ThreadPool(ThreadPool&&) = delete;
29
- ThreadPool& operator=(ThreadPool&&) = delete;
30
-
31
- template<typename FunctionType, typename... ArgumentTypes>
32
- auto submit_task(FunctionType&& function, ArgumentTypes&&... arguments)
33
- -> std::future<typename std::invoke_result<FunctionType, ArgumentTypes...>::type>;
34
-
35
- void shutdown();
36
- bool is_running() const;
37
- std::size_t get_pending_task_count() const;
38
- std::size_t get_thread_count() const;
39
-
40
- private:
41
- void worker_thread_function();
42
-
43
- std::vector<std::thread> worker_threads;
44
- std::queue<std::function<void()>> task_queue;
45
- mutable std::mutex queue_mutex;
46
- std::condition_variable task_available_condition;
47
- std::atomic<bool> should_stop;
48
- std::atomic<bool> is_stopped;
49
- std::size_t thread_count;
50
- };
51
-
52
- template<typename FunctionType, typename... ArgumentTypes>
53
- auto ThreadPool::submit_task(FunctionType&& function, ArgumentTypes&&... arguments)
54
- -> std::future<typename std::invoke_result<FunctionType, ArgumentTypes...>::type> {
55
-
56
- using ReturnType = typename std::invoke_result<FunctionType, ArgumentTypes...>::type;
57
-
58
- auto packaged_task = std::make_shared<std::packaged_task<ReturnType()>>(
59
- std::bind(std::forward<FunctionType>(function), std::forward<ArgumentTypes>(arguments)...)
60
- );
61
-
62
- std::future<ReturnType> result_future = packaged_task->get_future();
63
-
64
- {
65
- std::unique_lock<std::mutex> lock(queue_mutex);
66
-
67
- if (should_stop.load()) {
68
- throw std::runtime_error("Cannot submit task to stopped thread pool");
69
- }
70
-
71
- task_queue.emplace([packaged_task]() {
72
- (*packaged_task)();
73
- });
74
- }
75
-
76
- task_available_condition.notify_one();
77
-
78
- return result_future;
79
- }
80
-
81
- }
82
-
83
- #endif
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/src/accelerator_core.cpp DELETED
@@ -1,558 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #include "accelerator_core.hpp"
7
- #include <chrono>
8
- #include <cstring>
9
- #include <ctime>
10
- #include <iomanip>
11
- #include <iostream>
12
- #include <sstream>
13
- #include <signal.h>
14
-
15
- namespace pocket_tts_accelerator {
16
-
17
- static AcceleratorCore* global_accelerator_instance = nullptr;
18
- static volatile sig_atomic_t last_received_signal = 0;
19
-
20
- static void signal_handler_function(int signal_number) {
21
- last_received_signal = signal_number;
22
- if (global_accelerator_instance != nullptr) {
23
- global_accelerator_instance->shutdown();
24
- }
25
- }
26
-
27
- AcceleratorCore::AcceleratorCore(const AcceleratorConfiguration& configuration)
28
- : config(configuration)
29
- , is_initialized(false)
30
- , should_shutdown(false) {
31
- }
32
-
33
- AcceleratorCore::~AcceleratorCore() {
34
- shutdown();
35
- }
36
-
37
- bool AcceleratorCore::initialize() {
38
- if (is_initialized.load()) {
39
- return true;
40
- }
41
-
42
- log_message("Initializing Pocket TTS Accelerator...");
43
-
44
- memory_pool = std::make_unique<MemoryPool>(config.memory_pool_size_bytes);
45
- log_message("Memory pool initialized with " + std::to_string(config.memory_pool_size_bytes / (1024 * 1024)) + " MB");
46
-
47
- thread_pool = std::make_unique<ThreadPool>(config.number_of_worker_threads);
48
- log_message("Thread pool initialized with " + std::to_string(config.number_of_worker_threads) + " worker threads");
49
-
50
- audio_processor = std::make_unique<AudioProcessor>(*memory_pool);
51
- log_message("Audio processor initialized");
52
-
53
- ipc_handler = std::make_unique<IpcHandler>(config.ipc_socket_path);
54
- log_message("IPC handler created for socket: " + config.ipc_socket_path);
55
-
56
- register_all_command_handlers();
57
-
58
- ipc_handler->set_shutdown_callback([this]() {
59
- this->shutdown();
60
- });
61
-
62
- if (!ipc_handler->start_server()) {
63
- log_message("ERROR: Failed to start IPC server");
64
- return false;
65
- }
66
-
67
- log_message("IPC server started successfully");
68
-
69
- global_accelerator_instance = this;
70
- setup_signal_handlers();
71
-
72
- is_initialized.store(true);
73
- log_message("Pocket TTS Accelerator initialized successfully");
74
-
75
- return true;
76
- }
77
-
78
- void AcceleratorCore::run() {
79
- if (!is_initialized.load()) {
80
- log_message("ERROR: Accelerator not initialized");
81
- return;
82
- }
83
-
84
- log_message("Accelerator running and waiting for commands...");
85
-
86
- while (!should_shutdown.load()) {
87
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
88
-
89
- if (last_received_signal != 0) {
90
- log_message("Received signal: " + std::to_string(last_received_signal));
91
- last_received_signal = 0;
92
- }
93
- }
94
-
95
- log_message("Accelerator main loop exited");
96
- }
97
-
98
- void AcceleratorCore::shutdown() {
99
- if (should_shutdown.exchange(true)) {
100
- return;
101
- }
102
-
103
- log_message("Shutting down Pocket TTS Accelerator...");
104
-
105
- if (ipc_handler) {
106
- ipc_handler->stop_server();
107
- log_message("IPC server stopped");
108
- }
109
-
110
- if (thread_pool) {
111
- thread_pool->shutdown();
112
- log_message("Thread pool shut down");
113
- }
114
-
115
- if (memory_pool) {
116
- memory_pool->reset_pool();
117
- log_message("Memory pool reset");
118
- }
119
-
120
- is_initialized.store(false);
121
- log_message("Pocket TTS Accelerator shut down complete");
122
- }
123
-
124
- bool AcceleratorCore::is_running() const {
125
- return is_initialized.load() && !should_shutdown.load();
126
- }
127
-
128
- std::string AcceleratorCore::get_status_string() const {
129
- if (!is_initialized.load()) {
130
- return "Not initialized";
131
- }
132
-
133
- if (should_shutdown.load()) {
134
- return "Shutting down";
135
- }
136
-
137
- return "Running";
138
- }
139
-
140
- AcceleratorConfiguration AcceleratorCore::get_default_configuration() {
141
- AcceleratorConfiguration default_config;
142
- default_config.number_of_worker_threads = 2;
143
- default_config.memory_pool_size_bytes = 64 * 1024 * 1024;
144
- default_config.ipc_socket_path = "/tmp/pocket_tts_accelerator.sock";
145
- default_config.enable_verbose_logging = true;
146
- return default_config;
147
- }
148
-
149
- void AcceleratorCore::register_all_command_handlers() {
150
- ipc_handler->register_command_handler(
151
- CommandType::PING,
152
- [this](const std::vector<std::uint8_t>& payload) {
153
- return this->handle_ping_command(payload);
154
- }
155
- );
156
-
157
- ipc_handler->register_command_handler(
158
- CommandType::PROCESS_AUDIO,
159
- [this](const std::vector<std::uint8_t>& payload) {
160
- return this->handle_process_audio_command(payload);
161
- }
162
- );
163
-
164
- ipc_handler->register_command_handler(
165
- CommandType::CONVERT_TO_MONO,
166
- [this](const std::vector<std::uint8_t>& payload) {
167
- return this->handle_convert_to_mono_command(payload);
168
- }
169
- );
170
-
171
- ipc_handler->register_command_handler(
172
- CommandType::CONVERT_TO_PCM,
173
- [this](const std::vector<std::uint8_t>& payload) {
174
- return this->handle_convert_to_pcm_command(payload);
175
- }
176
- );
177
-
178
- ipc_handler->register_command_handler(
179
- CommandType::RESAMPLE_AUDIO,
180
- [this](const std::vector<std::uint8_t>& payload) {
181
- return this->handle_resample_audio_command(payload);
182
- }
183
- );
184
-
185
- ipc_handler->register_command_handler(
186
- CommandType::GET_MEMORY_STATS,
187
- [this](const std::vector<std::uint8_t>& payload) {
188
- return this->handle_get_memory_stats_command(payload);
189
- }
190
- );
191
-
192
- ipc_handler->register_command_handler(
193
- CommandType::CLEAR_MEMORY_POOL,
194
- [this](const std::vector<std::uint8_t>& payload) {
195
- return this->handle_clear_memory_pool_command(payload);
196
- }
197
- );
198
-
199
- ipc_handler->register_command_handler(
200
- CommandType::SHUTDOWN,
201
- [this](const std::vector<std::uint8_t>& payload) {
202
- return this->handle_shutdown_command(payload);
203
- }
204
- );
205
-
206
- log_message("All command handlers registered");
207
- }
208
-
209
- void AcceleratorCore::setup_signal_handlers() {
210
- signal(SIGINT, signal_handler_function);
211
- signal(SIGTERM, signal_handler_function);
212
- }
213
-
214
- std::vector<std::uint8_t> AcceleratorCore::handle_ping_command(const std::vector<std::uint8_t>& payload) {
215
- std::string payload_content;
216
- if (!payload.empty()) {
217
- payload_content = std::string(payload.begin(), payload.end());
218
- log_message("Received PING command with payload: " + payload_content);
219
- } else {
220
- log_message("Received PING command");
221
- }
222
-
223
- std::string response_message = "PONG";
224
- if (!payload_content.empty()) {
225
- response_message += ":" + payload_content;
226
- }
227
-
228
- return std::vector<std::uint8_t>(response_message.begin(), response_message.end());
229
- }
230
-
231
- std::vector<std::uint8_t> AcceleratorCore::handle_process_audio_command(const std::vector<std::uint8_t>& payload) {
232
- log_message("Received PROCESS_AUDIO command with payload size: " + std::to_string(payload.size()) + " bytes");
233
-
234
- if (payload.size() < sizeof(ProcessAudioRequest)) {
235
- std::string error_message = "ERROR:Invalid payload size, expected " + std::to_string(sizeof(ProcessAudioRequest)) + " bytes";
236
- log_message(error_message);
237
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
238
- }
239
-
240
- ProcessAudioRequest request;
241
- std::memcpy(&request, payload.data(), sizeof(ProcessAudioRequest));
242
-
243
- std::string input_path(request.input_file_path);
244
- std::string output_path(request.output_file_path);
245
-
246
- log_message("Processing audio from: " + input_path + " to: " + output_path);
247
-
248
- auto future_result = thread_pool->submit_task([this, input_path, output_path]() {
249
- return this->audio_processor->process_audio_for_voice_cloning(input_path, output_path);
250
- });
251
-
252
- AudioProcessingResult result = future_result.get();
253
-
254
- if (result.success) {
255
- log_message("Audio processing completed successfully");
256
- std::string success_message = "SUCCESS:" + output_path;
257
- return std::vector<std::uint8_t>(success_message.begin(), success_message.end());
258
- } else {
259
- log_message("Audio processing failed: " + result.error_message);
260
- std::string error_message = "ERROR:" + result.error_message;
261
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
262
- }
263
- }
264
-
265
- std::vector<std::uint8_t> AcceleratorCore::handle_convert_to_mono_command(const std::vector<std::uint8_t>& payload) {
266
- log_message("Received CONVERT_TO_MONO command with payload size: " + std::to_string(payload.size()) + " bytes");
267
-
268
- if (payload.size() < sizeof(ProcessAudioRequest)) {
269
- std::string error_message = "ERROR:Invalid payload size, expected " + std::to_string(sizeof(ProcessAudioRequest)) + " bytes";
270
- log_message(error_message);
271
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
272
- }
273
-
274
- ProcessAudioRequest request;
275
- std::memcpy(&request, payload.data(), sizeof(ProcessAudioRequest));
276
-
277
- std::string input_path(request.input_file_path);
278
- std::string output_path(request.output_file_path);
279
-
280
- log_message("Converting to mono from: " + input_path + " to: " + output_path);
281
-
282
- AudioData audio_data = audio_processor->read_wav_file(input_path);
283
-
284
- if (!audio_data.is_valid) {
285
- log_message("Failed to read input file: " + audio_data.error_message);
286
- std::string error_message = "ERROR:" + audio_data.error_message;
287
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
288
- }
289
-
290
- AudioProcessingResult result = audio_processor->convert_to_mono(audio_data);
291
-
292
- if (!result.success) {
293
- log_message("Mono conversion failed: " + result.error_message);
294
- std::string error_message = "ERROR:" + result.error_message;
295
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
296
- }
297
-
298
- AudioData output_audio;
299
- output_audio.samples = std::move(result.processed_samples);
300
- output_audio.sample_rate = result.output_sample_rate;
301
- output_audio.number_of_channels = 1;
302
- output_audio.bits_per_sample = 16;
303
- output_audio.is_valid = true;
304
-
305
- if (!audio_processor->write_wav_file(output_path, output_audio)) {
306
- log_message("Failed to write output file: " + output_path);
307
- std::string error_message = "ERROR:Failed to write output file";
308
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
309
- }
310
-
311
- log_message("Mono conversion completed successfully: " + output_path);
312
- std::string success_message = "SUCCESS:" + output_path;
313
- return std::vector<std::uint8_t>(success_message.begin(), success_message.end());
314
- }
315
-
316
- std::vector<std::uint8_t> AcceleratorCore::handle_convert_to_pcm_command(const std::vector<std::uint8_t>& payload) {
317
- log_message("Received CONVERT_TO_PCM command with payload size: " + std::to_string(payload.size()) + " bytes");
318
-
319
- if (payload.size() < sizeof(ProcessAudioRequest)) {
320
- std::string error_message = "ERROR:Invalid payload size, expected " + std::to_string(sizeof(ProcessAudioRequest)) + " bytes";
321
- log_message(error_message);
322
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
323
- }
324
-
325
- ProcessAudioRequest request;
326
- std::memcpy(&request, payload.data(), sizeof(ProcessAudioRequest));
327
-
328
- std::string input_path(request.input_file_path);
329
- std::string output_path(request.output_file_path);
330
-
331
- log_message("Converting to PCM from: " + input_path + " to: " + output_path);
332
-
333
- AudioData audio_data = audio_processor->read_wav_file(input_path);
334
-
335
- if (!audio_data.is_valid) {
336
- log_message("Failed to read input file: " + audio_data.error_message);
337
- std::string error_message = "ERROR:" + audio_data.error_message;
338
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
339
- }
340
-
341
- AudioData mono_audio;
342
-
343
- if (audio_data.number_of_channels > 1) {
344
- log_message("Input has " + std::to_string(audio_data.number_of_channels) + " channels, converting to mono");
345
- AudioProcessingResult mono_result = audio_processor->convert_to_mono(audio_data);
346
-
347
- if (!mono_result.success) {
348
- log_message("Mono conversion failed: " + mono_result.error_message);
349
- std::string error_message = "ERROR:" + mono_result.error_message;
350
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
351
- }
352
-
353
- mono_audio.samples = std::move(mono_result.processed_samples);
354
- mono_audio.sample_rate = mono_result.output_sample_rate;
355
- } else {
356
- mono_audio.samples = std::move(audio_data.samples);
357
- mono_audio.sample_rate = audio_data.sample_rate;
358
- }
359
-
360
- mono_audio.number_of_channels = 1;
361
- mono_audio.bits_per_sample = 16;
362
- mono_audio.is_valid = true;
363
-
364
- if (!audio_processor->write_wav_file(output_path, mono_audio)) {
365
- log_message("Failed to write output file: " + output_path);
366
- std::string error_message = "ERROR:Failed to write output file";
367
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
368
- }
369
-
370
- log_message("PCM conversion completed successfully: " + output_path);
371
- std::string success_message = "SUCCESS:" + output_path;
372
- return std::vector<std::uint8_t>(success_message.begin(), success_message.end());
373
- }
374
-
375
- std::vector<std::uint8_t> AcceleratorCore::handle_resample_audio_command(const std::vector<std::uint8_t>& payload) {
376
- log_message("Received RESAMPLE_AUDIO command with payload size: " + std::to_string(payload.size()) + " bytes");
377
-
378
- if (payload.size() < sizeof(ProcessAudioRequest)) {
379
- std::string error_message = "ERROR:Invalid payload size, expected " + std::to_string(sizeof(ProcessAudioRequest)) + " bytes";
380
- log_message(error_message);
381
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
382
- }
383
-
384
- ProcessAudioRequest request;
385
- std::memcpy(&request, payload.data(), sizeof(ProcessAudioRequest));
386
-
387
- std::string input_path(request.input_file_path);
388
- std::string output_path(request.output_file_path);
389
- std::uint32_t target_sample_rate = request.target_sample_rate;
390
-
391
- log_message("Resampling audio from: " + input_path + " to: " + output_path + " at " + std::to_string(target_sample_rate) + " Hz");
392
-
393
- AudioData audio_data = audio_processor->read_wav_file(input_path);
394
-
395
- if (!audio_data.is_valid) {
396
- log_message("Failed to read input file: " + audio_data.error_message);
397
- std::string error_message = "ERROR:" + audio_data.error_message;
398
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
399
- }
400
-
401
- AudioProcessingResult result = audio_processor->resample_audio(audio_data, target_sample_rate);
402
-
403
- if (!result.success) {
404
- log_message("Resampling failed: " + result.error_message);
405
- std::string error_message = "ERROR:" + result.error_message;
406
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
407
- }
408
-
409
- AudioData output_audio;
410
- output_audio.samples = std::move(result.processed_samples);
411
- output_audio.sample_rate = result.output_sample_rate;
412
- output_audio.number_of_channels = audio_data.number_of_channels;
413
- output_audio.bits_per_sample = 16;
414
- output_audio.is_valid = true;
415
-
416
- if (!audio_processor->write_wav_file(output_path, output_audio)) {
417
- log_message("Failed to write output file: " + output_path);
418
- std::string error_message = "ERROR:Failed to write output file";
419
- return std::vector<std::uint8_t>(error_message.begin(), error_message.end());
420
- }
421
-
422
- log_message("Resampling completed successfully: " + output_path);
423
- std::string success_message = "SUCCESS:" + output_path;
424
- return std::vector<std::uint8_t>(success_message.begin(), success_message.end());
425
- }
426
-
427
- std::vector<std::uint8_t> AcceleratorCore::handle_get_memory_stats_command(const std::vector<std::uint8_t>& payload) {
428
- std::uint32_t request_flags = 0;
429
-
430
- if (payload.size() >= sizeof(std::uint32_t)) {
431
- std::memcpy(&request_flags, payload.data(), sizeof(std::uint32_t));
432
- log_message("Received GET_MEMORY_STATS command with flags: " + std::to_string(request_flags));
433
- } else {
434
- log_message("Received GET_MEMORY_STATS command with payload size: " + std::to_string(payload.size()) + " bytes");
435
- }
436
-
437
- MemoryStatsResponse stats;
438
- stats.total_allocated_bytes = memory_pool->get_total_allocated_bytes();
439
- stats.total_used_bytes = memory_pool->get_total_used_bytes();
440
- stats.block_count = memory_pool->get_block_count();
441
-
442
- bool include_detailed_log = (request_flags & 0x01) != 0;
443
-
444
- if (include_detailed_log) {
445
- log_message("Memory stats (detailed) - Allocated: " + std::to_string(stats.total_allocated_bytes) +
446
- " bytes (" + std::to_string(stats.total_allocated_bytes / (1024 * 1024)) + " MB)" +
447
- ", Used: " + std::to_string(stats.total_used_bytes) +
448
- " bytes (" + std::to_string(stats.total_used_bytes / (1024 * 1024)) + " MB)" +
449
- ", Blocks: " + std::to_string(stats.block_count) +
450
- ", Utilization: " + std::to_string(stats.total_allocated_bytes > 0 ?
451
- (stats.total_used_bytes * 100 / stats.total_allocated_bytes) : 0) + "%");
452
- } else {
453
- log_message("Memory stats - Allocated: " + std::to_string(stats.total_allocated_bytes) +
454
- " bytes, Used: " + std::to_string(stats.total_used_bytes) +
455
- " bytes, Blocks: " + std::to_string(stats.block_count));
456
- }
457
-
458
- std::vector<std::uint8_t> response(sizeof(MemoryStatsResponse));
459
- std::memcpy(response.data(), &stats, sizeof(MemoryStatsResponse));
460
-
461
- return response;
462
- }
463
-
464
- std::vector<std::uint8_t> AcceleratorCore::handle_clear_memory_pool_command(const std::vector<std::uint8_t>& payload) {
465
- std::uint32_t clear_flags = 0;
466
-
467
- if (payload.size() >= sizeof(std::uint32_t)) {
468
- std::memcpy(&clear_flags, payload.data(), sizeof(std::uint32_t));
469
- log_message("Received CLEAR_MEMORY_POOL command with flags: " + std::to_string(clear_flags));
470
- } else {
471
- log_message("Received CLEAR_MEMORY_POOL command with payload size: " + std::to_string(payload.size()) + " bytes");
472
- }
473
-
474
- std::size_t blocks_before = memory_pool->get_block_count();
475
- std::size_t allocated_before = memory_pool->get_total_allocated_bytes();
476
- std::size_t used_before = memory_pool->get_total_used_bytes();
477
-
478
- bool force_full_reset = (clear_flags & 0x01) != 0;
479
-
480
- if (force_full_reset) {
481
- log_message("Performing full memory pool reset (force flag set)");
482
- memory_pool->reset_pool();
483
- } else {
484
- log_message("Clearing unused memory blocks");
485
- memory_pool->clear_unused_blocks();
486
- }
487
-
488
- std::size_t blocks_after = memory_pool->get_block_count();
489
- std::size_t allocated_after = memory_pool->get_total_allocated_bytes();
490
- std::size_t used_after = memory_pool->get_total_used_bytes();
491
-
492
- std::size_t blocks_freed = blocks_before - blocks_after;
493
- std::size_t bytes_freed = allocated_before - allocated_after;
494
-
495
- log_message("Memory pool cleared - Before: " + std::to_string(blocks_before) + " blocks (" +
496
- std::to_string(allocated_before) + " bytes allocated, " +
497
- std::to_string(used_before) + " bytes used) -> After: " +
498
- std::to_string(blocks_after) + " blocks (" +
499
- std::to_string(allocated_after) + " bytes allocated, " +
500
- std::to_string(used_after) + " bytes used) -> Freed: " +
501
- std::to_string(blocks_freed) + " blocks (" +
502
- std::to_string(bytes_freed) + " bytes)");
503
-
504
- std::string success_message = "SUCCESS:Freed " + std::to_string(blocks_freed) +
505
- " blocks (" + std::to_string(bytes_freed) + " bytes)";
506
-
507
- if (force_full_reset) {
508
- success_message += " [full reset]";
509
- }
510
-
511
- return std::vector<std::uint8_t>(success_message.begin(), success_message.end());
512
- }
513
-
514
- std::vector<std::uint8_t> AcceleratorCore::handle_shutdown_command(const std::vector<std::uint8_t>& payload) {
515
- std::string shutdown_reason;
516
- if (!payload.empty()) {
517
- shutdown_reason = std::string(payload.begin(), payload.end());
518
- log_message("Received SHUTDOWN command with reason: " + shutdown_reason);
519
- } else {
520
- log_message("Received SHUTDOWN command");
521
- }
522
-
523
- std::string success_message = "SUCCESS:Shutting down";
524
- if (!shutdown_reason.empty()) {
525
- success_message += " (reason: " + shutdown_reason + ")";
526
- }
527
-
528
- std::thread shutdown_thread([this]() {
529
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
530
- this->shutdown();
531
- });
532
- shutdown_thread.detach();
533
-
534
- return std::vector<std::uint8_t>(success_message.begin(), success_message.end());
535
- }
536
-
537
- void AcceleratorCore::log_message(const std::string& message) const {
538
- if (config.enable_verbose_logging) {
539
- auto now = std::chrono::system_clock::now();
540
- std::time_t time_t_now = std::chrono::system_clock::to_time_t(now);
541
-
542
- auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(
543
- now.time_since_epoch()
544
- ) % 1000;
545
-
546
- std::tm time_info;
547
- localtime_r(&time_t_now, &time_info);
548
-
549
- std::ostringstream timestamp_stream;
550
- timestamp_stream << std::put_time(&time_info, "%Y-%m-%d %H:%M:%S");
551
- timestamp_stream << '.' << std::setfill('0') << std::setw(3) << milliseconds.count();
552
-
553
- std::cout << "[" << timestamp_stream.str() << "] " << message << std::endl;
554
- std::cout.flush();
555
- }
556
- }
557
-
558
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/src/audio_processor.cpp DELETED
@@ -1,352 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #include "audio_processor.hpp"
7
- #include <algorithm>
8
- #include <cmath>
9
- #include <cstring>
10
- #include <fstream>
11
-
12
- namespace pocket_tts_accelerator {
13
-
14
- AudioProcessor::AudioProcessor(MemoryPool& shared_memory_pool)
15
- : memory_pool(shared_memory_pool) {
16
- }
17
-
18
- AudioProcessor::~AudioProcessor() {
19
- }
20
-
21
- AudioData AudioProcessor::read_wav_file(const std::string& file_path) {
22
- AudioData result;
23
- result.is_valid = false;
24
-
25
- std::ifstream file_stream(file_path, std::ios::binary);
26
-
27
- if (!file_stream.is_open()) {
28
- result.error_message = "Failed to open file: " + file_path;
29
- return result;
30
- }
31
-
32
- WavFileHeader header;
33
- file_stream.read(reinterpret_cast<char*>(&header), sizeof(WavFileHeader));
34
-
35
- if (file_stream.gcount() < static_cast<std::streamsize>(sizeof(WavFileHeader))) {
36
- result.error_message = "File is too small to be a valid WAV file";
37
- return result;
38
- }
39
-
40
- if (!validate_wav_header(header)) {
41
- result.error_message = "Invalid WAV file header";
42
- return result;
43
- }
44
-
45
- result.sample_rate = header.sample_rate;
46
- result.number_of_channels = header.number_of_channels;
47
- result.bits_per_sample = header.bits_per_sample;
48
-
49
- std::size_t sample_count = header.data_size / (header.bits_per_sample / 8);
50
- result.samples.resize(sample_count);
51
-
52
- if (header.bits_per_sample == 16) {
53
- file_stream.read(reinterpret_cast<char*>(result.samples.data()), header.data_size);
54
- } else if (header.bits_per_sample == 8) {
55
- std::vector<std::uint8_t> raw_data(sample_count);
56
- file_stream.read(reinterpret_cast<char*>(raw_data.data()), header.data_size);
57
- convert_uint8_to_int16(raw_data.data(), result.samples.data(), sample_count);
58
- } else if (header.bits_per_sample == 32) {
59
- if (header.audio_format == 3) {
60
- std::vector<float> raw_data(sample_count);
61
- file_stream.read(reinterpret_cast<char*>(raw_data.data()), header.data_size);
62
- convert_float32_to_int16(raw_data.data(), result.samples.data(), sample_count);
63
- } else {
64
- std::vector<std::int32_t> raw_data(sample_count);
65
- file_stream.read(reinterpret_cast<char*>(raw_data.data()), header.data_size);
66
- convert_int32_to_int16(raw_data.data(), result.samples.data(), sample_count);
67
- }
68
- }
69
-
70
- result.is_valid = true;
71
- return result;
72
- }
73
-
74
- bool AudioProcessor::write_wav_file(const std::string& file_path, const AudioData& audio_data) {
75
- std::ofstream file_stream(file_path, std::ios::binary);
76
-
77
- if (!file_stream.is_open()) {
78
- return false;
79
- }
80
-
81
- std::uint32_t data_size = static_cast<std::uint32_t>(audio_data.samples.size() * sizeof(std::int16_t));
82
- std::uint32_t file_size = data_size + 36;
83
-
84
- WavFileHeader header;
85
- std::memcpy(header.riff_marker, "RIFF", 4);
86
- header.file_size = file_size;
87
- std::memcpy(header.wave_marker, "WAVE", 4);
88
- std::memcpy(header.format_marker, "fmt ", 4);
89
- header.format_chunk_size = 16;
90
- header.audio_format = 1;
91
- header.number_of_channels = audio_data.number_of_channels;
92
- header.sample_rate = audio_data.sample_rate;
93
- header.bits_per_sample = 16;
94
- header.byte_rate = audio_data.sample_rate * audio_data.number_of_channels * 2;
95
- header.block_align = audio_data.number_of_channels * 2;
96
- std::memcpy(header.data_marker, "data", 4);
97
- header.data_size = data_size;
98
-
99
- file_stream.write(reinterpret_cast<const char*>(&header), sizeof(WavFileHeader));
100
- file_stream.write(reinterpret_cast<const char*>(audio_data.samples.data()), data_size);
101
-
102
- return file_stream.good();
103
- }
104
-
105
- AudioProcessingResult AudioProcessor::convert_to_mono(const AudioData& input_audio) {
106
- AudioProcessingResult result;
107
- result.success = false;
108
-
109
- if (!input_audio.is_valid) {
110
- result.error_message = "Invalid input audio";
111
- return result;
112
- }
113
-
114
- if (input_audio.number_of_channels == 1) {
115
- result.processed_samples = input_audio.samples;
116
- result.output_sample_rate = input_audio.sample_rate;
117
- result.success = true;
118
- return result;
119
- }
120
-
121
- std::size_t frame_count = input_audio.samples.size() / input_audio.number_of_channels;
122
- result.processed_samples.resize(frame_count);
123
-
124
- mix_channels_to_mono(
125
- input_audio.samples.data(),
126
- result.processed_samples.data(),
127
- frame_count,
128
- input_audio.number_of_channels
129
- );
130
-
131
- result.output_sample_rate = input_audio.sample_rate;
132
- result.success = true;
133
- return result;
134
- }
135
-
136
- AudioProcessingResult AudioProcessor::convert_to_pcm_int16(const AudioData& input_audio) {
137
- AudioProcessingResult result;
138
- result.success = false;
139
-
140
- if (!input_audio.is_valid) {
141
- result.error_message = "Invalid input audio";
142
- return result;
143
- }
144
-
145
- result.processed_samples = input_audio.samples;
146
- result.output_sample_rate = input_audio.sample_rate;
147
- result.success = true;
148
- return result;
149
- }
150
-
151
- AudioProcessingResult AudioProcessor::resample_audio(const AudioData& input_audio, std::uint32_t target_sample_rate) {
152
- AudioProcessingResult result;
153
- result.success = false;
154
-
155
- if (!input_audio.is_valid) {
156
- result.error_message = "Invalid input audio";
157
- return result;
158
- }
159
-
160
- if (input_audio.sample_rate == target_sample_rate) {
161
- result.processed_samples = input_audio.samples;
162
- result.output_sample_rate = target_sample_rate;
163
- result.success = true;
164
- return result;
165
- }
166
-
167
- double ratio = static_cast<double>(target_sample_rate) / static_cast<double>(input_audio.sample_rate);
168
- std::size_t output_sample_count = static_cast<std::size_t>(input_audio.samples.size() * ratio);
169
-
170
- result.processed_samples.resize(output_sample_count);
171
-
172
- for (std::size_t output_index = 0; output_index < output_sample_count; ++output_index) {
173
- double source_position = output_index / ratio;
174
- std::size_t source_index_floor = static_cast<std::size_t>(source_position);
175
- std::size_t source_index_ceil = source_index_floor + 1;
176
- double fractional_part = source_position - source_index_floor;
177
-
178
- if (source_index_ceil >= input_audio.samples.size()) {
179
- source_index_ceil = input_audio.samples.size() - 1;
180
- }
181
-
182
- double interpolated_value =
183
- input_audio.samples[source_index_floor] * (1.0 - fractional_part) +
184
- input_audio.samples[source_index_ceil] * fractional_part;
185
-
186
- result.processed_samples[output_index] = static_cast<std::int16_t>(
187
- std::clamp(interpolated_value, -32768.0, 32767.0)
188
- );
189
- }
190
-
191
- result.output_sample_rate = target_sample_rate;
192
- result.success = true;
193
- return result;
194
- }
195
-
196
- AudioProcessingResult AudioProcessor::normalize_audio(const AudioData& input_audio, float target_peak_level) {
197
- AudioProcessingResult result;
198
- result.success = false;
199
-
200
- if (!input_audio.is_valid) {
201
- result.error_message = "Invalid input audio";
202
- return result;
203
- }
204
-
205
- std::int16_t max_absolute_value = 0;
206
- for (const std::int16_t sample : input_audio.samples) {
207
- std::int16_t absolute_value = static_cast<std::int16_t>(std::abs(sample));
208
- if (absolute_value > max_absolute_value) {
209
- max_absolute_value = absolute_value;
210
- }
211
- }
212
-
213
- if (max_absolute_value == 0) {
214
- result.processed_samples = input_audio.samples;
215
- result.output_sample_rate = input_audio.sample_rate;
216
- result.success = true;
217
- return result;
218
- }
219
-
220
- float normalization_factor = (target_peak_level * 32767.0f) / static_cast<float>(max_absolute_value);
221
-
222
- result.processed_samples.resize(input_audio.samples.size());
223
-
224
- for (std::size_t index = 0; index < input_audio.samples.size(); ++index) {
225
- float normalized_sample = static_cast<float>(input_audio.samples[index]) * normalization_factor;
226
- result.processed_samples[index] = static_cast<std::int16_t>(
227
- std::clamp(normalized_sample, -32768.0f, 32767.0f)
228
- );
229
- }
230
-
231
- result.output_sample_rate = input_audio.sample_rate;
232
- result.success = true;
233
- return result;
234
- }
235
-
236
- AudioProcessingResult AudioProcessor::process_audio_for_voice_cloning(
237
- const std::string& input_file_path,
238
- const std::string& output_file_path
239
- ) {
240
- AudioProcessingResult result;
241
- result.success = false;
242
-
243
- AudioData input_audio = read_wav_file(input_file_path);
244
-
245
- if (!input_audio.is_valid) {
246
- result.error_message = "Failed to read input file: " + input_audio.error_message;
247
- return result;
248
- }
249
-
250
- AudioProcessingResult mono_result = convert_to_mono(input_audio);
251
-
252
- if (!mono_result.success) {
253
- result.error_message = "Failed to convert to mono: " + mono_result.error_message;
254
- return result;
255
- }
256
-
257
- AudioData mono_audio;
258
- mono_audio.samples = std::move(mono_result.processed_samples);
259
- mono_audio.sample_rate = mono_result.output_sample_rate;
260
- mono_audio.number_of_channels = 1;
261
- mono_audio.bits_per_sample = 16;
262
- mono_audio.is_valid = true;
263
-
264
- if (!write_wav_file(output_file_path, mono_audio)) {
265
- result.error_message = "Failed to write output file";
266
- return result;
267
- }
268
-
269
- result.processed_samples = std::move(mono_audio.samples);
270
- result.output_sample_rate = mono_audio.sample_rate;
271
- result.success = true;
272
- return result;
273
- }
274
-
275
- bool AudioProcessor::validate_wav_header(const WavFileHeader& header) {
276
- if (std::memcmp(header.riff_marker, "RIFF", 4) != 0) {
277
- return false;
278
- }
279
-
280
- if (std::memcmp(header.wave_marker, "WAVE", 4) != 0) {
281
- return false;
282
- }
283
-
284
- if (std::memcmp(header.format_marker, "fmt ", 4) != 0) {
285
- return false;
286
- }
287
-
288
- if (header.audio_format != 1 && header.audio_format != 3) {
289
- return false;
290
- }
291
-
292
- if (header.number_of_channels < 1 || header.number_of_channels > 16) {
293
- return false;
294
- }
295
-
296
- if (header.sample_rate < 100 || header.sample_rate > 384000) {
297
- return false;
298
- }
299
-
300
- if (header.bits_per_sample != 8 && header.bits_per_sample != 16 && header.bits_per_sample != 32) {
301
- return false;
302
- }
303
-
304
- return true;
305
- }
306
-
307
- std::size_t AudioProcessor::calculate_audio_duration_milliseconds(const AudioData& audio_data) {
308
- if (!audio_data.is_valid || audio_data.sample_rate == 0) {
309
- return 0;
310
- }
311
-
312
- std::size_t frame_count = audio_data.samples.size() / audio_data.number_of_channels;
313
- return (frame_count * 1000) / audio_data.sample_rate;
314
- }
315
-
316
- void AudioProcessor::convert_float32_to_int16(const float* input, std::int16_t* output, std::size_t sample_count) {
317
- for (std::size_t index = 0; index < sample_count; ++index) {
318
- float clamped_value = std::clamp(input[index], -1.0f, 1.0f);
319
- output[index] = static_cast<std::int16_t>(clamped_value * 32767.0f);
320
- }
321
- }
322
-
323
- void AudioProcessor::convert_int32_to_int16(const std::int32_t* input, std::int16_t* output, std::size_t sample_count) {
324
- for (std::size_t index = 0; index < sample_count; ++index) {
325
- output[index] = static_cast<std::int16_t>(input[index] >> 16);
326
- }
327
- }
328
-
329
- void AudioProcessor::convert_uint8_to_int16(const std::uint8_t* input, std::int16_t* output, std::size_t sample_count) {
330
- for (std::size_t index = 0; index < sample_count; ++index) {
331
- output[index] = static_cast<std::int16_t>((static_cast<std::int16_t>(input[index]) - 128) * 256);
332
- }
333
- }
334
-
335
- void AudioProcessor::mix_channels_to_mono(
336
- const std::int16_t* input,
337
- std::int16_t* output,
338
- std::size_t frame_count,
339
- std::uint16_t channel_count
340
- ) {
341
- for (std::size_t frame_index = 0; frame_index < frame_count; ++frame_index) {
342
- std::int32_t sum = 0;
343
-
344
- for (std::uint16_t channel_index = 0; channel_index < channel_count; ++channel_index) {
345
- sum += input[frame_index * channel_count + channel_index];
346
- }
347
-
348
- output[frame_index] = static_cast<std::int16_t>(sum / channel_count);
349
- }
350
- }
351
-
352
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/src/ipc_handler.cpp DELETED
@@ -1,226 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #include "ipc_handler.hpp"
7
- #include <cstring>
8
- #include <iostream>
9
- #include <sys/socket.h>
10
- #include <sys/un.h>
11
- #include <unistd.h>
12
-
13
- namespace pocket_tts_accelerator {
14
-
15
- IpcHandler::IpcHandler(const std::string& socket_path)
16
- : socket_file_path(socket_path)
17
- , server_socket_fd(-1)
18
- , is_server_running(false) {
19
- }
20
-
21
- IpcHandler::~IpcHandler() {
22
- stop_server();
23
- }
24
-
25
- bool IpcHandler::start_server() {
26
- if (is_server_running.load()) {
27
- return true;
28
- }
29
-
30
- unlink(socket_file_path.c_str());
31
-
32
- server_socket_fd = socket(AF_UNIX, SOCK_STREAM, 0);
33
-
34
- if (server_socket_fd < 0) {
35
- std::cerr << "Failed to create socket: " << strerror(errno) << std::endl;
36
- return false;
37
- }
38
-
39
- struct sockaddr_un server_address;
40
- std::memset(&server_address, 0, sizeof(server_address));
41
- server_address.sun_family = AF_UNIX;
42
- std::strncpy(server_address.sun_path, socket_file_path.c_str(), sizeof(server_address.sun_path) - 1);
43
-
44
- if (bind(server_socket_fd, reinterpret_cast<struct sockaddr*>(&server_address), sizeof(server_address)) < 0) {
45
- std::cerr << "Failed to bind socket: " << strerror(errno) << std::endl;
46
- close(server_socket_fd);
47
- server_socket_fd = -1;
48
- return false;
49
- }
50
-
51
- if (listen(server_socket_fd, CONNECTION_BACKLOG) < 0) {
52
- std::cerr << "Failed to listen on socket: " << strerror(errno) << std::endl;
53
- close(server_socket_fd);
54
- server_socket_fd = -1;
55
- return false;
56
- }
57
-
58
- is_server_running.store(true);
59
- accept_thread = std::thread(&IpcHandler::accept_connections_loop, this);
60
-
61
- return true;
62
- }
63
-
64
- void IpcHandler::stop_server() {
65
- if (!is_server_running.load()) {
66
- return;
67
- }
68
-
69
- is_server_running.store(false);
70
-
71
- if (server_socket_fd >= 0) {
72
- shutdown(server_socket_fd, SHUT_RDWR);
73
- close(server_socket_fd);
74
- server_socket_fd = -1;
75
- }
76
-
77
- if (accept_thread.joinable()) {
78
- accept_thread.join();
79
- }
80
-
81
- unlink(socket_file_path.c_str());
82
- }
83
-
84
- bool IpcHandler::is_running() const {
85
- return is_server_running.load();
86
- }
87
-
88
- void IpcHandler::register_command_handler(CommandType command_type, CommandHandlerFunction handler) {
89
- std::unique_lock<std::mutex> lock(handlers_mutex);
90
- command_handlers[command_type] = std::move(handler);
91
- }
92
-
93
- void IpcHandler::set_shutdown_callback(std::function<void()> callback) {
94
- shutdown_callback = std::move(callback);
95
- }
96
-
97
- void IpcHandler::accept_connections_loop() {
98
- while (is_server_running.load()) {
99
- struct sockaddr_un client_address;
100
- socklen_t client_address_length = sizeof(client_address);
101
-
102
- int client_socket_fd = accept(
103
- server_socket_fd,
104
- reinterpret_cast<struct sockaddr*>(&client_address),
105
- &client_address_length
106
- );
107
-
108
- if (client_socket_fd < 0) {
109
- if (!is_server_running.load()) {
110
- break;
111
- }
112
- continue;
113
- }
114
-
115
- handle_client_connection(client_socket_fd);
116
- close(client_socket_fd);
117
- }
118
- }
119
-
120
- void IpcHandler::handle_client_connection(int client_socket_fd) {
121
- RequestHeader request_header;
122
- std::vector<std::uint8_t> request_payload;
123
-
124
- if (!receive_request(client_socket_fd, request_header, request_payload)) {
125
- return;
126
- }
127
-
128
- if (request_header.magic_number != PROTOCOL_MAGIC_NUMBER) {
129
- ResponseHeader error_response;
130
- error_response.magic_number = PROTOCOL_MAGIC_NUMBER;
131
- error_response.status_code = static_cast<std::uint32_t>(ResponseStatus::ERROR_INVALID_COMMAND);
132
- error_response.payload_size = 0;
133
- error_response.request_id = request_header.request_id;
134
- send_response(client_socket_fd, error_response, {});
135
- return;
136
- }
137
-
138
- CommandType command_type = static_cast<CommandType>(request_header.command_type);
139
-
140
- std::vector<std::uint8_t> response_payload;
141
- ResponseStatus status = ResponseStatus::SUCCESS;
142
-
143
- {
144
- std::unique_lock<std::mutex> lock(handlers_mutex);
145
- auto handler_iterator = command_handlers.find(command_type);
146
-
147
- if (handler_iterator != command_handlers.end()) {
148
- try {
149
- response_payload = handler_iterator->second(request_payload);
150
- } catch (const std::exception& exception) {
151
- std::cerr << "Handler exception: " << exception.what() << std::endl;
152
- status = ResponseStatus::ERROR_INTERNAL;
153
- } catch (...) {
154
- std::cerr << "Handler unknown exception" << std::endl;
155
- status = ResponseStatus::ERROR_INTERNAL;
156
- }
157
- } else {
158
- status = ResponseStatus::ERROR_INVALID_COMMAND;
159
- }
160
- }
161
-
162
- ResponseHeader response_header;
163
- response_header.magic_number = PROTOCOL_MAGIC_NUMBER;
164
- response_header.status_code = static_cast<std::uint32_t>(status);
165
- response_header.payload_size = static_cast<std::uint32_t>(response_payload.size());
166
- response_header.request_id = request_header.request_id;
167
-
168
- send_response(client_socket_fd, response_header, response_payload);
169
- }
170
-
171
- bool IpcHandler::send_response(
172
- int socket_fd,
173
- const ResponseHeader& header,
174
- const std::vector<std::uint8_t>& payload
175
- ) {
176
- ssize_t bytes_written = write(socket_fd, &header, sizeof(ResponseHeader));
177
-
178
- if (bytes_written != sizeof(ResponseHeader)) {
179
- return false;
180
- }
181
-
182
- if (!payload.empty()) {
183
- bytes_written = write(socket_fd, payload.data(), payload.size());
184
-
185
- if (bytes_written != static_cast<ssize_t>(payload.size())) {
186
- return false;
187
- }
188
- }
189
-
190
- return true;
191
- }
192
-
193
- bool IpcHandler::receive_request(
194
- int socket_fd,
195
- RequestHeader& header,
196
- std::vector<std::uint8_t>& payload
197
- ) {
198
- ssize_t bytes_read = read(socket_fd, &header, sizeof(RequestHeader));
199
-
200
- if (bytes_read != sizeof(RequestHeader)) {
201
- return false;
202
- }
203
-
204
- if (header.payload_size > MAXIMUM_PAYLOAD_SIZE) {
205
- return false;
206
- }
207
-
208
- if (header.payload_size > 0) {
209
- payload.resize(header.payload_size);
210
- std::size_t total_read = 0;
211
-
212
- while (total_read < header.payload_size) {
213
- bytes_read = read(socket_fd, payload.data() + total_read, header.payload_size - total_read);
214
-
215
- if (bytes_read <= 0) {
216
- return false;
217
- }
218
-
219
- total_read += static_cast<std::size_t>(bytes_read);
220
- }
221
- }
222
-
223
- return true;
224
- }
225
-
226
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/src/main.cpp DELETED
@@ -1,83 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #include "accelerator_core.hpp"
7
- #include <cstdlib>
8
- #include <cstring>
9
- #include <iostream>
10
- #include <string>
11
-
12
// Prints the command-line help text to standard output.
//
// program_name: name shown in the "Usage:" line (typically argv[0]).
void print_usage(const char* program_name) {
    // One entry per supported option, printed in this order.
    static const char* const option_lines[] = {
        " --socket PATH IPC socket path (default: /tmp/pocket_tts_accelerator.sock)",
        " --threads N Number of worker threads (default: 2)",
        " --memory MB Memory pool size in megabytes (default: 64)",
        " --quiet Disable verbose logging",
        " --help Show this help message",
    };

    std::cout << "Usage: " << program_name << " [options]" << std::endl;
    std::cout << std::endl;
    std::cout << "Options:" << std::endl;

    for (const char* line : option_lines) {
        std::cout << line << std::endl;
    }

    std::cout.flush();
}
23
-
24
- int main(int argc, char* argv[]) {
25
- std::cout.setf(std::ios::unitbuf);
26
- std::cerr.setf(std::ios::unitbuf);
27
-
28
- pocket_tts_accelerator::AcceleratorConfiguration configuration =
29
- pocket_tts_accelerator::AcceleratorCore::get_default_configuration();
30
-
31
- for (int argument_index = 1; argument_index < argc; ++argument_index) {
32
- std::string argument(argv[argument_index]);
33
-
34
- if (argument == "--help" || argument == "-h") {
35
- print_usage(argv[0]);
36
- return 0;
37
- }
38
-
39
- if (argument == "--socket" && argument_index + 1 < argc) {
40
- configuration.ipc_socket_path = argv[++argument_index];
41
- continue;
42
- }
43
-
44
- if (argument == "--threads" && argument_index + 1 < argc) {
45
- configuration.number_of_worker_threads = std::stoul(argv[++argument_index]);
46
- continue;
47
- }
48
-
49
- if (argument == "--memory" && argument_index + 1 < argc) {
50
- std::size_t memory_mb = std::stoul(argv[++argument_index]);
51
- configuration.memory_pool_size_bytes = memory_mb * 1024 * 1024;
52
- continue;
53
- }
54
-
55
- if (argument == "--quiet" || argument == "-q") {
56
- configuration.enable_verbose_logging = false;
57
- continue;
58
- }
59
-
60
- std::cerr << "Unknown argument: " << argument << std::endl;
61
- print_usage(argv[0]);
62
- return 1;
63
- }
64
-
65
- if (configuration.number_of_worker_threads < 1) {
66
- configuration.number_of_worker_threads = 1;
67
- }
68
-
69
- if (configuration.number_of_worker_threads > 2) {
70
- configuration.number_of_worker_threads = 2;
71
- }
72
-
73
- pocket_tts_accelerator::AcceleratorCore accelerator(configuration);
74
-
75
- if (!accelerator.initialize()) {
76
- std::cerr << "Failed to initialize accelerator" << std::endl;
77
- return 1;
78
- }
79
-
80
- accelerator.run();
81
-
82
- return 0;
83
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/src/memory_pool.cpp DELETED
@@ -1,216 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #include "memory_pool.hpp"
7
- #include <algorithm>
8
- #include <chrono>
9
- #include <cstring>
10
-
11
- namespace pocket_tts_accelerator {
12
-
13
- MemoryPool::MemoryPool(std::size_t initial_pool_size_bytes)
14
- : total_allocated_bytes(0)
15
- , total_used_bytes(0)
16
- , maximum_pool_size_bytes(initial_pool_size_bytes) {
17
- }
18
-
19
- MemoryPool::~MemoryPool() {
20
- reset_pool();
21
- }
22
-
23
- std::uint8_t* MemoryPool::allocate(std::size_t requested_size_bytes) {
24
- std::unique_lock<std::mutex> lock(pool_mutex);
25
-
26
- std::size_t block_index = find_suitable_block_index(requested_size_bytes);
27
-
28
- if (block_index != static_cast<std::size_t>(-1)) {
29
- MemoryBlock& existing_block = memory_blocks[block_index];
30
- existing_block.is_in_use = true;
31
- existing_block.last_access_timestamp = get_current_timestamp();
32
- total_used_bytes += existing_block.block_size;
33
- return existing_block.data.get();
34
- }
35
-
36
- if (total_allocated_bytes + requested_size_bytes > maximum_pool_size_bytes) {
37
- clear_unused_blocks();
38
- }
39
-
40
- std::size_t aligned_size = ((requested_size_bytes + 63) / 64) * 64;
41
-
42
- memory_blocks.push_back(MemoryBlock{
43
- std::make_unique<std::uint8_t[]>(aligned_size),
44
- aligned_size,
45
- true,
46
- get_current_timestamp()
47
- });
48
-
49
- std::uint8_t* allocated_pointer = memory_blocks.back().data.get();
50
- pointer_to_block_index[allocated_pointer] = memory_blocks.size() - 1;
51
-
52
- total_allocated_bytes += aligned_size;
53
- total_used_bytes += aligned_size;
54
-
55
- return allocated_pointer;
56
- }
57
-
58
- void MemoryPool::deallocate(std::uint8_t* pointer) {
59
- if (pointer == nullptr) {
60
- return;
61
- }
62
-
63
- std::unique_lock<std::mutex> lock(pool_mutex);
64
-
65
- auto iterator = pointer_to_block_index.find(pointer);
66
-
67
- if (iterator != pointer_to_block_index.end()) {
68
- std::size_t block_index = iterator->second;
69
-
70
- if (block_index < memory_blocks.size()) {
71
- MemoryBlock& block = memory_blocks[block_index];
72
-
73
- if (block.is_in_use) {
74
- block.is_in_use = false;
75
- block.last_access_timestamp = get_current_timestamp();
76
- total_used_bytes -= block.block_size;
77
- }
78
- }
79
- }
80
- }
81
-
82
- void MemoryPool::clear_unused_blocks() {
83
- std::vector<std::size_t> indices_to_remove;
84
-
85
- for (std::size_t index = 0; index < memory_blocks.size(); ++index) {
86
- if (!memory_blocks[index].is_in_use) {
87
- indices_to_remove.push_back(index);
88
- }
89
- }
90
-
91
- std::sort(indices_to_remove.rbegin(), indices_to_remove.rend());
92
-
93
- for (std::size_t index : indices_to_remove) {
94
- std::uint8_t* pointer = memory_blocks[index].data.get();
95
- total_allocated_bytes -= memory_blocks[index].block_size;
96
-
97
- pointer_to_block_index.erase(pointer);
98
- memory_blocks.erase(memory_blocks.begin() + static_cast<std::ptrdiff_t>(index));
99
- }
100
-
101
- for (std::size_t index = 0; index < memory_blocks.size(); ++index) {
102
- pointer_to_block_index[memory_blocks[index].data.get()] = index;
103
- }
104
- }
105
-
106
- void MemoryPool::reset_pool() {
107
- std::unique_lock<std::mutex> lock(pool_mutex);
108
-
109
- memory_blocks.clear();
110
- pointer_to_block_index.clear();
111
- total_allocated_bytes = 0;
112
- total_used_bytes = 0;
113
- }
114
-
115
- std::size_t MemoryPool::get_total_allocated_bytes() const {
116
- std::unique_lock<std::mutex> lock(pool_mutex);
117
- return total_allocated_bytes;
118
- }
119
-
120
- std::size_t MemoryPool::get_total_used_bytes() const {
121
- std::unique_lock<std::mutex> lock(pool_mutex);
122
- return total_used_bytes;
123
- }
124
-
125
- std::size_t MemoryPool::get_block_count() const {
126
- std::unique_lock<std::mutex> lock(pool_mutex);
127
- return memory_blocks.size();
128
- }
129
-
130
- std::size_t MemoryPool::find_suitable_block_index(std::size_t requested_size) const {
131
- std::size_t best_fit_index = static_cast<std::size_t>(-1);
132
- std::size_t best_fit_size = static_cast<std::size_t>(-1);
133
-
134
- for (std::size_t index = 0; index < memory_blocks.size(); ++index) {
135
- const MemoryBlock& block = memory_blocks[index];
136
-
137
- if (!block.is_in_use && block.block_size >= requested_size) {
138
- if (block.block_size < best_fit_size) {
139
- best_fit_size = block.block_size;
140
- best_fit_index = index;
141
- }
142
- }
143
- }
144
-
145
- return best_fit_index;
146
- }
147
-
148
- void MemoryPool::create_new_block(std::size_t block_size) {
149
- std::size_t aligned_size = ((block_size + 63) / 64) * 64;
150
-
151
- memory_blocks.push_back(MemoryBlock{
152
- std::make_unique<std::uint8_t[]>(aligned_size),
153
- aligned_size,
154
- false,
155
- get_current_timestamp()
156
- });
157
-
158
- pointer_to_block_index[memory_blocks.back().data.get()] = memory_blocks.size() - 1;
159
- total_allocated_bytes += aligned_size;
160
- }
161
-
162
- std::uint64_t MemoryPool::get_current_timestamp() const {
163
- auto current_time = std::chrono::steady_clock::now();
164
- auto duration = current_time.time_since_epoch();
165
- return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count();
166
- }
167
-
168
- ScopedMemoryAllocation::ScopedMemoryAllocation(MemoryPool& pool, std::size_t size)
169
- : memory_pool_pointer(&pool)
170
- , allocated_pointer(pool.allocate(size))
171
- , allocation_size(size) {
172
- }
173
-
174
- ScopedMemoryAllocation::~ScopedMemoryAllocation() {
175
- if (memory_pool_pointer != nullptr && allocated_pointer != nullptr) {
176
- memory_pool_pointer->deallocate(allocated_pointer);
177
- }
178
- }
179
-
180
- ScopedMemoryAllocation::ScopedMemoryAllocation(ScopedMemoryAllocation&& other) noexcept
181
- : memory_pool_pointer(other.memory_pool_pointer)
182
- , allocated_pointer(other.allocated_pointer)
183
- , allocation_size(other.allocation_size) {
184
-
185
- other.memory_pool_pointer = nullptr;
186
- other.allocated_pointer = nullptr;
187
- other.allocation_size = 0;
188
- }
189
-
190
- ScopedMemoryAllocation& ScopedMemoryAllocation::operator=(ScopedMemoryAllocation&& other) noexcept {
191
- if (this != &other) {
192
- if (memory_pool_pointer != nullptr && allocated_pointer != nullptr) {
193
- memory_pool_pointer->deallocate(allocated_pointer);
194
- }
195
-
196
- memory_pool_pointer = other.memory_pool_pointer;
197
- allocated_pointer = other.allocated_pointer;
198
- allocation_size = other.allocation_size;
199
-
200
- other.memory_pool_pointer = nullptr;
201
- other.allocated_pointer = nullptr;
202
- other.allocation_size = 0;
203
- }
204
-
205
- return *this;
206
- }
207
-
208
- std::uint8_t* ScopedMemoryAllocation::get() const {
209
- return allocated_pointer;
210
- }
211
-
212
- std::size_t ScopedMemoryAllocation::size() const {
213
- return allocation_size;
214
- }
215
-
216
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
accelerator/src/thread_pool.cpp DELETED
@@ -1,84 +0,0 @@
1
- //
2
- // SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- // SPDX-License-Identifier: Apache-2.0
4
- //
5
-
6
- #include "thread_pool.hpp"
7
-
8
- namespace pocket_tts_accelerator {
9
-
10
- ThreadPool::ThreadPool(std::size_t number_of_threads)
11
- : should_stop(false)
12
- , is_stopped(false)
13
- , thread_count(number_of_threads) {
14
-
15
- worker_threads.reserve(number_of_threads);
16
-
17
- for (std::size_t thread_index = 0; thread_index < number_of_threads; ++thread_index) {
18
- worker_threads.emplace_back(&ThreadPool::worker_thread_function, this);
19
- }
20
- }
21
-
22
- ThreadPool::~ThreadPool() {
23
- shutdown();
24
- }
25
-
26
- void ThreadPool::shutdown() {
27
- {
28
- std::unique_lock<std::mutex> lock(queue_mutex);
29
-
30
- if (is_stopped.load()) {
31
- return;
32
- }
33
-
34
- should_stop.store(true);
35
- }
36
-
37
- task_available_condition.notify_all();
38
-
39
- for (std::thread& worker_thread : worker_threads) {
40
- if (worker_thread.joinable()) {
41
- worker_thread.join();
42
- }
43
- }
44
-
45
- is_stopped.store(true);
46
- }
47
-
48
- bool ThreadPool::is_running() const {
49
- return !should_stop.load() && !is_stopped.load();
50
- }
51
-
52
- std::size_t ThreadPool::get_pending_task_count() const {
53
- std::unique_lock<std::mutex> lock(queue_mutex);
54
- return task_queue.size();
55
- }
56
-
57
- std::size_t ThreadPool::get_thread_count() const {
58
- return thread_count;
59
- }
60
-
61
- void ThreadPool::worker_thread_function() {
62
- while (true) {
63
- std::function<void()> task_to_execute;
64
-
65
- {
66
- std::unique_lock<std::mutex> lock(queue_mutex);
67
-
68
- task_available_condition.wait(lock, [this] {
69
- return should_stop.load() || !task_queue.empty();
70
- });
71
-
72
- if (should_stop.load() && task_queue.empty()) {
73
- return;
74
- }
75
-
76
- task_to_execute = std::move(task_queue.front());
77
- task_queue.pop();
78
- }
79
-
80
- task_to_execute();
81
- }
82
- }
83
-
84
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py DELETED
@@ -1,372 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import atexit
7
- import math
8
- import torch
9
- import gradio as gr
10
- from config import (
11
- AVAILABLE_VOICES,
12
- DEFAULT_VOICE,
13
- DEFAULT_MODEL_VARIANT,
14
- DEFAULT_TEMPERATURE,
15
- DEFAULT_LSD_DECODE_STEPS,
16
- DEFAULT_EOS_THRESHOLD,
17
- DEFAULT_NOISE_CLAMP,
18
- DEFAULT_FRAMES_AFTER_EOS,
19
- MAXIMUM_INPUT_LENGTH,
20
- VOICE_MODE_PRESET,
21
- VOICE_MODE_CLONE,
22
- EXAMPLE_PROMPTS,
23
- ACCELERATOR_ENABLED,
24
- ACCELERATOR_WORKER_THREADS
25
- )
26
- from src.core.authentication import authenticate_huggingface
27
- authenticate_huggingface()
28
- if ACCELERATOR_ENABLED:
29
- from src.accelerator.client import start_accelerator_daemon, stop_accelerator_daemon
30
- accelerator_started = start_accelerator_daemon()
31
- if accelerator_started:
32
- print("Accelerator daemon started successfully", flush=True)
33
- else:
34
- print("Accelerator daemon not available, using Python fallback", flush=True)
35
- atexit.register(stop_accelerator_daemon)
36
- from src.core.memory import start_background_cleanup_thread
37
- start_background_cleanup_thread()
38
- from src.generation.handler import (
39
- perform_speech_generation,
40
- request_generation_stop
41
- )
42
- from src.ui.state import (
43
- check_generate_button_state,
44
- calculate_character_count_display,
45
- determine_clear_button_visibility,
46
- update_voice_mode_visibility
47
- )
48
- from src.ui.handlers import (
49
- switch_to_generating_state,
50
- switch_to_idle_state,
51
- perform_clear_action,
52
- create_example_handler,
53
- format_example_button_label
54
- )
55
- from assets.css.styles import CSS
56
- from assets.static.title import TITLE
57
- from assets.static.header import HEADER
58
- from assets.static.footer import FOOTER
59
- from assets.static.sidebar import SIDEBAR
60
-
61
- with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app:
62
- torch.set_num_threads(ACCELERATOR_WORKER_THREADS)
63
- torch.set_num_interop_threads(ACCELERATOR_WORKER_THREADS)
64
-
65
- ui_state = gr.State({"generating": False})
66
-
67
- with gr.Sidebar():
68
- gr.HTML(SIDEBAR())
69
-
70
- with gr.Column(elem_classes="header-section"):
71
- gr.HTML(TITLE())
72
- gr.HTML(HEADER())
73
-
74
- with gr.Row():
75
- with gr.Column():
76
- audio_output_component = gr.Audio(
77
- label="Generated Speech Output",
78
- type="filepath",
79
- interactive=False
80
- )
81
-
82
- with gr.Accordion("Voice Selection", open=True):
83
- voice_mode_radio = gr.Radio(
84
- label="Voice Mode",
85
- choices=[
86
- VOICE_MODE_PRESET,
87
- VOICE_MODE_CLONE
88
- ],
89
- value=VOICE_MODE_PRESET,
90
- info="Choose between preset voices or clone a voice from uploaded audio",
91
- elem_id="voice-mode"
92
- )
93
-
94
- with gr.Column(visible=True) as preset_voice_container:
95
- voice_preset_dropdown = gr.Dropdown(
96
- label="Select Preset Voice",
97
- choices=AVAILABLE_VOICES,
98
- value=DEFAULT_VOICE
99
- )
100
-
101
- with gr.Column(visible=False) as clone_voice_container:
102
- voice_clone_audio_input = gr.Audio(
103
- label="Upload Audio for Voice Cloning",
104
- type="filepath"
105
- )
106
-
107
- with gr.Accordion("Model Parameters", open=False):
108
- with gr.Row():
109
- temperature_slider = gr.Slider(
110
- label="Temperature",
111
- minimum=0.1,
112
- maximum=2.0,
113
- step=0.05,
114
- value=DEFAULT_TEMPERATURE,
115
- info="Higher values produce more expressive speech"
116
- )
117
-
118
- lsd_decode_steps_slider = gr.Slider(
119
- label="LSD Decode Steps",
120
- minimum=1,
121
- maximum=20,
122
- step=1,
123
- value=DEFAULT_LSD_DECODE_STEPS,
124
- info="More steps may improve quality but slower"
125
- )
126
-
127
- with gr.Row():
128
- noise_clamp_slider = gr.Slider(
129
- label="Noise Clamp",
130
- minimum=0.0,
131
- maximum=2.0,
132
- step=0.05,
133
- value=DEFAULT_NOISE_CLAMP,
134
- info="Maximum noise sampling value (0 = disabled)"
135
- )
136
-
137
- eos_threshold_slider = gr.Slider(
138
- label="End of Sequence Threshold",
139
- minimum=-10.0,
140
- maximum=0.0,
141
- step=0.25,
142
- value=DEFAULT_EOS_THRESHOLD,
143
- info="Smaller values cause earlier completion"
144
- )
145
-
146
- with gr.Accordion("Advanced Settings", open=False):
147
- model_variant_textbox = gr.Textbox(
148
- label="Model Variant Identifier",
149
- value=DEFAULT_MODEL_VARIANT,
150
- info="Model signature for generation"
151
- )
152
-
153
- with gr.Row():
154
- enable_custom_frames_checkbox = gr.Checkbox(
155
- label="Enable Custom Frames After EOS",
156
- value=False,
157
- info="Manually control post-EOS frame generation"
158
- )
159
-
160
- frames_after_eos_slider = gr.Slider(
161
- label="Frames After EOS",
162
- minimum=0,
163
- maximum=100,
164
- step=1,
165
- value=DEFAULT_FRAMES_AFTER_EOS,
166
- info="Additional frames after end-of-sequence (80ms per frame)"
167
- )
168
-
169
- with gr.Column(scale=1):
170
- text_input_component = gr.Textbox(
171
- label="Prompt",
172
- placeholder="Enter the text you want to convert to speech...",
173
- lines=2,
174
- max_lines=20,
175
- max_length=MAXIMUM_INPUT_LENGTH,
176
- autoscroll=True
177
- )
178
-
179
- character_count_display = gr.HTML(
180
- f"""
181
- <div class="character-count">
182
- <span>0 / {MAXIMUM_INPUT_LENGTH}</span>
183
- </div>
184
- """,
185
- visible=False
186
- )
187
-
188
- generate_button = gr.Button(
189
- "Generate",
190
- variant="primary",
191
- size="lg",
192
- interactive=False
193
- )
194
-
195
- stop_button = gr.Button(
196
- "Stop",
197
- variant="stop",
198
- size="lg",
199
- visible=False
200
- )
201
-
202
- clear_button = gr.Button(
203
- "Clear",
204
- variant="secondary",
205
- size="lg",
206
- visible=False
207
- )
208
-
209
- gr.HTML(
210
- """
211
- <div class="example-prompts">
212
- <h3>Example Prompts</h3>
213
- <p>Click any example to generate speech with its assigned voice</p>
214
- </div>
215
- """
216
- )
217
-
218
- example_buttons_list = []
219
- num_examples = len(EXAMPLE_PROMPTS)
220
- examples_per_row = 2
221
- num_rows = math.ceil(num_examples / examples_per_row)
222
-
223
- for row_idx in range(num_rows):
224
- with gr.Row():
225
- start_idx = row_idx * examples_per_row
226
- end_idx = min(start_idx + examples_per_row, num_examples)
227
- for i in range(start_idx, end_idx):
228
- btn = gr.Button(
229
- format_example_button_label(
230
- EXAMPLE_PROMPTS[i]["text"],
231
- EXAMPLE_PROMPTS[i]["voice"]
232
- ),
233
- size="sm",
234
- variant="secondary"
235
- )
236
- example_buttons_list.append(btn)
237
-
238
- gr.HTML(FOOTER())
239
-
240
- generation_inputs = [
241
- text_input_component,
242
- voice_mode_radio,
243
- voice_preset_dropdown,
244
- voice_clone_audio_input,
245
- model_variant_textbox,
246
- lsd_decode_steps_slider,
247
- temperature_slider,
248
- noise_clamp_slider,
249
- eos_threshold_slider,
250
- frames_after_eos_slider,
251
- enable_custom_frames_checkbox
252
- ]
253
-
254
- voice_mode_radio.change(
255
- fn=update_voice_mode_visibility,
256
- inputs=[voice_mode_radio],
257
- outputs=[
258
- preset_voice_container,
259
- clone_voice_container
260
- ]
261
- )
262
-
263
- text_input_component.change(
264
- fn=calculate_character_count_display,
265
- inputs=[text_input_component],
266
- outputs=[character_count_display]
267
- )
268
-
269
- text_input_component.change(
270
- fn=check_generate_button_state,
271
- inputs=[
272
- text_input_component,
273
- ui_state
274
- ],
275
- outputs=[generate_button]
276
- )
277
-
278
- text_input_component.change(
279
- fn=determine_clear_button_visibility,
280
- inputs=[
281
- text_input_component,
282
- ui_state
283
- ],
284
- outputs=[clear_button]
285
- )
286
-
287
- generate_button.click(
288
- fn=switch_to_generating_state,
289
- inputs=[ui_state],
290
- outputs=[
291
- generate_button,
292
- stop_button,
293
- clear_button,
294
- ui_state
295
- ]
296
- ).then(
297
- fn=perform_speech_generation,
298
- inputs=generation_inputs,
299
- outputs=[audio_output_component]
300
- ).then(
301
- fn=switch_to_idle_state,
302
- inputs=[
303
- text_input_component,
304
- ui_state
305
- ],
306
- outputs=[
307
- generate_button,
308
- stop_button,
309
- clear_button,
310
- ui_state
311
- ]
312
- )
313
-
314
- stop_button.click(
315
- fn=request_generation_stop,
316
- outputs=[stop_button]
317
- )
318
-
319
- clear_button.click(
320
- fn=perform_clear_action,
321
- outputs=[
322
- text_input_component,
323
- audio_output_component,
324
- clear_button,
325
- voice_mode_radio,
326
- voice_preset_dropdown,
327
- voice_clone_audio_input
328
- ]
329
- )
330
-
331
- for button_index, example_button in enumerate(example_buttons_list):
332
- example_text = EXAMPLE_PROMPTS[button_index]["text"]
333
- example_voice = EXAMPLE_PROMPTS[button_index]["voice"]
334
-
335
- example_button.click(
336
- fn=switch_to_generating_state,
337
- inputs=[ui_state],
338
- outputs=[
339
- generate_button,
340
- stop_button,
341
- clear_button,
342
- ui_state
343
- ]
344
- ).then(
345
- fn=create_example_handler(example_text, example_voice),
346
- outputs=[
347
- text_input_component,
348
- voice_mode_radio,
349
- voice_preset_dropdown
350
- ]
351
- ).then(
352
- fn=perform_speech_generation,
353
- inputs=generation_inputs,
354
- outputs=[audio_output_component]
355
- ).then(
356
- fn=switch_to_idle_state,
357
- inputs=[
358
- text_input_component,
359
- ui_state
360
- ],
361
- outputs=[
362
- generate_button,
363
- stop_button,
364
- clear_button,
365
- ui_state
366
- ]
367
- )
368
-
369
- app.launch(
370
- server_name="0.0.0.0",
371
- max_file_size="1mb"
372
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/css/styles.py DELETED
@@ -1,161 +0,0 @@
1
- #
2
- # https://huggingface.co/spaces/D3vShoaib/pocket-tts
3
- #
4
-
5
- CSS = """
6
- footer {
7
- visibility: hidden;
8
- }
9
-
10
- .gradio-container {
11
- max-width: 100% !important;
12
- padding: 0 !important;
13
- }
14
-
15
- @media (min-width: 768px) {
16
- .gradio-container {
17
- padding-left: 2% !important;
18
- padding-right: 2% !important;
19
- }
20
- }
21
-
22
- .header-section {
23
- text-align: left;
24
- margin-bottom: 1.5rem;
25
- }
26
-
27
- .main-title {
28
- color: #10b981;
29
- font-weight: 800;
30
- font-size: 1.8rem;
31
- margin: 5px 0;
32
- }
33
-
34
- @media (min-width: 768px) {
35
- .main-title {
36
- font-size: 2.2rem;
37
- }
38
- }
39
-
40
- .logo-container {
41
- display: flex;
42
- justify-content: flex-start;
43
- align-items: center;
44
- gap: 10px;
45
- margin-bottom: 0;
46
- }
47
-
48
- .logo-img {
49
- height: 40px;
50
- border-radius: 8px;
51
- }
52
-
53
- @media (min-width: 768px) {
54
- .logo-img {
55
- height: 50px;
56
- }
57
-
58
- .logo-container {
59
- gap: 15px;
60
- }
61
- }
62
-
63
- .links-row {
64
- display: flex;
65
- flex-wrap: wrap;
66
- justify-content: flex-start;
67
- gap: 8px;
68
- margin: 5px 0 10px 0;
69
- font-size: 0.85rem;
70
- }
71
-
72
- @media (min-width: 768px) {
73
- .links-row {
74
- gap: 10px;
75
- font-size: 0.9rem;
76
- }
77
- }
78
-
79
- .links-row a {
80
- color: #10b981;
81
- text-decoration: none;
82
- padding: 3px 10px;
83
- border: 1px solid #10b981;
84
- border-radius: 15px;
85
- transition: all 0.2s;
86
- white-space: nowrap;
87
- }
88
-
89
- .links-row a:hover {
90
- background-color: #10b981;
91
- color: white;
92
- }
93
-
94
- .disclaimer {
95
- text-align: center;
96
- font-size: 10px;
97
- line-height: 1.4;
98
- color: #9ca3af;
99
- margin-top: 30px;
100
- padding: 15px;
101
- border-top: 1px solid currentColor;
102
- }
103
-
104
- @media (min-width: 768px) {
105
- .disclaimer {
106
- margin-top: 40px;
107
- padding: 20px;
108
- }
109
- }
110
-
111
- .disclaimer-copyright {
112
- opacity: 0.8;
113
- }
114
-
115
- .disclaimer-warning {
116
- font-size: 8px;
117
- opacity: 0.7;
118
- }
119
-
120
- .accent-link {
121
- color: #10b981;
122
- text-decoration: none;
123
- }
124
-
125
- #voice-mode .wrap {
126
- display: flex !important;
127
- flex-direction: row !important;
128
- width: 100% !important;
129
- }
130
-
131
- #voice-mode .wrap label {
132
- flex: 1 !important;
133
- justify-content: center !important;
134
- text-align: center !important;
135
- }
136
-
137
- .example-prompts {
138
- padding: 16px 0 8px 0;
139
- }
140
-
141
- .example-prompts h3 {
142
- margin: 0 0 8px 0;
143
- font-size: 1.1em;
144
- }
145
-
146
- .example-prompts p {
147
- margin: 0;
148
- opacity: 0.7;
149
- font-size: 0.9em;
150
- }
151
-
152
- .character-count {
153
- text-align: right;
154
- padding: 4px 0;
155
- }
156
-
157
- .character-count span {
158
- color: var(--body-text-color-subdued);
159
- font-size: 0.85em;
160
- }
161
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/static/footer.py DELETED
@@ -1,32 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- from config import (
7
- COPYRIGHT_NAME,
8
- COPYRIGHT_URL,
9
- DESIGN_BY_NAME,
10
- DESIGN_BY_URL
11
- )
12
-
13
- def FOOTER():
14
- return f"""
15
- <div class="disclaimer">
16
- <br>
17
- <p class="disclaimer-copyright">
18
- Copyright © 2026
19
- <a href="{COPYRIGHT_URL}" target="_blank" class="accent-link">
20
- {COPYRIGHT_NAME}
21
- </a>,
22
- design inspired by
23
- <a href="{DESIGN_BY_URL}" target="_blank" class="accent-link">
24
- {DESIGN_BY_NAME}
25
- </a>.
26
- </p>
27
-
28
- <p class="disclaimer-warning">
29
- ⚠️ This Space is not affiliated with Kyutai TTS and is provided for demonstration purposes only.
30
- </p>
31
- </div>
32
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/static/header.py DELETED
@@ -1,18 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- from config import HEADER_LINKS
7
-
8
- def HEADER():
9
- data = ""
10
-
11
- for link in HEADER_LINKS:
12
- data += f'<a href="{link["url"]}" target="_blank">{link["icon"]} {link["text"]}</a>\n'
13
-
14
- return f"""
15
- <div class="links-row">
16
- {data}
17
- </div>
18
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/static/sidebar.py DELETED
@@ -1,44 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- def SIDEBAR():
7
- return f"""
8
- <h1>
9
- Audio Generation Playground part of the
10
- <a href="https://huggingface.co/spaces/hadadxyz/ai" target="_blank" class="accent-link">
11
- Demo Playground
12
- </a>,
13
- and the
14
- <a href="https://huggingface.co/umint" target="_blank" class="accent-link">
15
- UltimaX Intelligence
16
- </a>
17
- project.
18
- </h1><br />
19
-
20
- <p>
21
- This Space runs the
22
- <b>
23
- <a href="https://huggingface.co/kyutai/pocket-tts" target="_blank" class="accent-link">
24
- Pocket TTS
25
- </a>
26
- </b>
27
- model from <b>Kyutai</b>.<br /><br />
28
-
29
- A lightweight text-to-speech (TTS) application designed to run
30
- efficiently on CPUs. Forget about the hassle of using GPUs and
31
- web APIs serving TTS models.<br /><br />
32
-
33
- Additionally, this Space uses a custom Docker image to
34
- maximize model performance and is optimized for the
35
- constraints of Hugging Face Spaces.
36
- </p><br />
37
-
38
- <p>
39
- <b>Like this project?</b> You can support me by buying a
40
- <a href="https://ko-fi.com/hadad" target="_blank" class="accent-link">
41
- coffee
42
- </a>.
43
- </p>
44
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
assets/static/title.py DELETED
@@ -1,15 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- from config import KYUTAI_LOGO_URL, POCKET_TTS_LOGO_URL, SPACE_INFO
7
-
8
- def TITLE():
9
- return f"""
10
- <div class="logo-container">
11
- <img src="{KYUTAI_LOGO_URL}" class="logo-img" alt="Kyutai Logo">
12
- <img src="{POCKET_TTS_LOGO_URL}" class="logo-img" alt="PocketTTS Logo">
13
- <h1 class='main-title'>{SPACE_INFO}</h1>
14
- </div>
15
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
config.py DELETED
@@ -1,126 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import os
7
-
8
- HF_TOKEN = os.getenv("HF_TOKEN", None)
9
-
10
- AVAILABLE_VOICES = [
11
- "alba",
12
- "marius",
13
- "javert",
14
- "jean",
15
- "fantine",
16
- "cosette",
17
- "eponine",
18
- "azelma"
19
- ]
20
-
21
- DEFAULT_VOICE = "alba"
22
- DEFAULT_MODEL_VARIANT = "b6369a24"
23
- DEFAULT_TEMPERATURE = 0.7
24
- DEFAULT_LSD_DECODE_STEPS = 1
25
- DEFAULT_EOS_THRESHOLD = -4.0
26
- DEFAULT_NOISE_CLAMP = 0.0
27
- DEFAULT_FRAMES_AFTER_EOS = 10
28
-
29
- VOICE_MODE_PRESET = "Preset Voices"
30
- VOICE_MODE_CLONE = "Voice Cloning"
31
-
32
- VOICE_STATE_CACHE_MAXIMUM_SIZE = 8
33
- VOICE_STATE_CACHE_CLEANUP_THRESHOLD = 4
34
-
35
- BACKGROUND_CLEANUP_INTERVAL = 300
36
-
37
- MAXIMUM_INPUT_LENGTH = 1000
38
-
39
- TEMPORARY_FILE_LIFETIME_SECONDS = 7200
40
-
41
- MAXIMUM_MEMORY_USAGE = 1 * 1024 * 1024 * 1024
42
- MEMORY_WARNING_THRESHOLD = int(0.7 * MAXIMUM_MEMORY_USAGE)
43
- MEMORY_CRITICAL_THRESHOLD = int(0.85 * MAXIMUM_MEMORY_USAGE)
44
- MEMORY_CHECK_INTERVAL = 30
45
- MEMORY_IDLE_TARGET = int(0.5 * MAXIMUM_MEMORY_USAGE)
46
-
47
- MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES = 1 * 1024 * 1024
48
-
49
- AUDIO_CONVERSION_QUEUE_TIMEOUT_SECONDS = 60
50
-
51
- MODEL_LOAD_RETRY_ATTEMPTS = 3
52
-
53
- SUPPORTED_AUDIO_EXTENSIONS = [
54
- ".wav",
55
- ".mp3",
56
- ".flac",
57
- ".ogg",
58
- ".m4a",
59
- ".aac",
60
- ".wma",
61
- ".aiff",
62
- ".aif",
63
- ".opus",
64
- ".webm",
65
- ".mp4",
66
- ".mkv",
67
- ".avi",
68
- ".mov",
69
- ".3gp"
70
- ]
71
-
72
- AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES = {
73
- "m4a": "M4A/AAC",
74
- "aif": "AIFF",
75
- "3gp": "3GP"
76
- }
77
-
78
- EXAMPLE_PROMPTS = [
79
- {
80
- "text": "The quick brown fox jumps over the lazy dog near the riverbank.",
81
- "voice": "alba"
82
- },
83
- {
84
- "text": "Welcome to the future of text to speech technology powered by artificial intelligence.",
85
- "voice": "marius"
86
- },
87
- {
88
- "text": "Technology continues to push the boundaries of what we thought was possible.",
89
- "voice": "javert"
90
- },
91
- {
92
- "text": "The weather today is absolutely beautiful and perfect for a relaxing walk outside.",
93
- "voice": "fantine"
94
- },
95
- {
96
- "text": "Science and innovation are transforming how we interact with the world around us.",
97
- "voice": "jean"
98
- }
99
- ]
100
-
101
- KYUTAI_LOGO_URL = "https://cdn-avatars.huggingface.co/v1/production/uploads/6355a3c1805be5a8f30fea49/8xGdIOlfkopZfhbMitw_k.jpeg"
102
- POCKET_TTS_LOGO_URL = "https://raw.githubusercontent.com/kyutai-labs/pocket-tts/refs/heads/main/docs/logo.png"
103
-
104
- SPACE_INFO = "Pocket TTS"
105
-
106
- HEADER_LINKS = [
107
- {"icon": "🔊", "text": "Demo", "url": "https://kyutai.org/tts"},
108
- {"icon": "🐱‍💻", "text": "GitHub", "url": "https://github.com/kyutai-labs/pocket-tts"},
109
- {"icon": "🤗", "text": "Model Card", "url": "https://huggingface.co/kyutai/pocket-tts"},
110
- {"icon": "🤗", "text": "Space", "url": "https://huggingface.co/spaces/hadadxyz/pocket-tts-hf-cpu-optimized"},
111
- {"icon": "📄", "text": "Paper", "url": "https://arxiv.org/abs/2509.06926"},
112
- {"icon": "📚", "text": "Docs", "url": "https://github.com/kyutai-labs/pocket-tts/tree/main/docs"},
113
- ]
114
-
115
- COPYRIGHT_NAME = "Hadad Darajat"
116
- COPYRIGHT_URL = "https://www.linkedin.com/in/hadadrjt"
117
-
118
- DESIGN_BY_NAME = "D3vShoaib/pocket-tts"
119
- DESIGN_BY_URL = f"https://huggingface.co/spaces/{DESIGN_BY_NAME}"
120
-
121
- ACCELERATOR_SOCKET_PATH = "/app/pocket_tts_accelerator.sock"
122
- ACCELERATOR_BINARY_PATH = "/app/bin/pocket_tts_accelerator"
123
- ACCELERATOR_WORKER_THREADS = 1
124
- ACCELERATOR_MEMORY_POOL_MB = 64
125
- ACCELERATOR_LOG_PREFIX = "[ACCELERATOR]"
126
- ACCELERATOR_ENABLED = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/accelerator/client.py DELETED
@@ -1,583 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import os
7
- import socket
8
- import struct
9
- import subprocess
10
- import tempfile
11
- import threading
12
- import sys
13
- from typing import Optional, Tuple, Dict, Any
14
- from config import (
15
- ACCELERATOR_SOCKET_PATH,
16
- ACCELERATOR_BINARY_PATH,
17
- ACCELERATOR_WORKER_THREADS,
18
- ACCELERATOR_MEMORY_POOL_MB,
19
- ACCELERATOR_LOG_PREFIX
20
- )
21
- from ..core.state import (
22
- accelerator_log_lock,
23
- accelerator_log_thread,
24
- accelerator_log_stop_event
25
- )
26
-
27
- PROTOCOL_MAGIC_NUMBER = 0x50545453
28
-
29
- COMMAND_PING = 0
30
- COMMAND_PROCESS_AUDIO = 1
31
- COMMAND_CONVERT_TO_MONO = 2
32
- COMMAND_CONVERT_TO_PCM = 3
33
- COMMAND_RESAMPLE_AUDIO = 4
34
- COMMAND_GET_MEMORY_STATS = 5
35
- COMMAND_CLEAR_MEMORY_POOL = 6
36
- COMMAND_SHUTDOWN = 7
37
-
38
- RESPONSE_SUCCESS = 0
39
- RESPONSE_ERROR_INVALID_COMMAND = 1
40
- RESPONSE_ERROR_FILE_NOT_FOUND = 2
41
- RESPONSE_ERROR_PROCESSING_FAILED = 3
42
- RESPONSE_ERROR_MEMORY_ALLOCATION = 4
43
- RESPONSE_ERROR_INTERNAL = 5
44
-
45
- REQUEST_HEADER_FORMAT = "=IIII"
46
- RESPONSE_HEADER_FORMAT = "=IIII"
47
- REQUEST_HEADER_SIZE = struct.calcsize(REQUEST_HEADER_FORMAT)
48
- RESPONSE_HEADER_SIZE = struct.calcsize(RESPONSE_HEADER_FORMAT)
49
-
50
- PROCESS_AUDIO_REQUEST_FORMAT = "=512s512sII"
51
- PROCESS_AUDIO_REQUEST_SIZE = struct.calcsize(PROCESS_AUDIO_REQUEST_FORMAT)
52
-
53
- MEMORY_STATS_RESPONSE_FORMAT = "=QQQ"
54
- MEMORY_STATS_RESPONSE_SIZE = struct.calcsize(MEMORY_STATS_RESPONSE_FORMAT)
55
-
56
- accelerator_process_handle = None
57
- accelerator_process_lock = threading.Lock()
58
- request_id_counter = 0
59
- request_id_lock = threading.Lock()
60
-
61
- def log_accelerator_message(message: str):
62
- with accelerator_log_lock:
63
- print(f"{ACCELERATOR_LOG_PREFIX} {message}", flush=True)
64
-
65
- def stream_accelerator_output(process_handle: subprocess.Popen):
66
- try:
67
- while not accelerator_log_stop_event.is_set():
68
- if process_handle.poll() is not None:
69
- break
70
-
71
- if process_handle.stdout:
72
- line = process_handle.stdout.readline()
73
-
74
- if line:
75
- decoded_line = line.decode("utf-8", errors="replace").rstrip()
76
-
77
- if decoded_line:
78
- log_accelerator_message(decoded_line)
79
-
80
- except Exception as stream_error:
81
- log_accelerator_message(f"Log stream error: {str(stream_error)}")
82
-
83
- def stream_accelerator_stderr(process_handle: subprocess.Popen):
84
- try:
85
- while not accelerator_log_stop_event.is_set():
86
- if process_handle.poll() is not None:
87
- break
88
-
89
- if process_handle.stderr:
90
- line = process_handle.stderr.readline()
91
-
92
- if line:
93
- decoded_line = line.decode("utf-8", errors="replace").rstrip()
94
-
95
- if decoded_line:
96
- log_accelerator_message(f"[STDERR] {decoded_line}")
97
-
98
- except Exception as stream_error:
99
- log_accelerator_message(f"Stderr stream error: {str(stream_error)}")
100
-
101
- class AcceleratorClient:
102
- def __init__(self, socket_path: str = ACCELERATOR_SOCKET_PATH):
103
- self.socket_path = socket_path
104
- self.connection_timeout = 5.0
105
- self.read_timeout = 30.0
106
-
107
- def is_connected(self) -> bool:
108
- try:
109
- response = self.send_ping()
110
- return response is not None and response.startswith(b"PONG")
111
-
112
- except Exception:
113
- return False
114
-
115
- def send_ping(self) -> Optional[bytes]:
116
- return self._send_command(COMMAND_PING, b"")
117
-
118
- def process_audio(
119
- self,
120
- input_file_path: str,
121
- output_file_path: str,
122
- target_sample_rate: int = 0,
123
- options_flags: int = 0
124
- ) -> Tuple[bool, str]:
125
- payload = self._pack_process_audio_request(
126
- input_file_path,
127
- output_file_path,
128
- target_sample_rate,
129
- options_flags
130
- )
131
-
132
- log_accelerator_message(f"Processing audio: {input_file_path} -> {output_file_path}")
133
-
134
- response = self._send_command(COMMAND_PROCESS_AUDIO, payload)
135
-
136
- if response is None:
137
- log_accelerator_message("Failed to communicate with accelerator for process_audio")
138
- return False, "Failed to communicate with accelerator"
139
-
140
- response_string = response.decode("utf-8", errors="ignore")
141
-
142
- if response_string.startswith("SUCCESS:"):
143
- log_accelerator_message(f"Audio processing succeeded: {response_string[8:]}")
144
- return True, response_string[8:]
145
-
146
- elif response_string.startswith("ERROR:"):
147
- log_accelerator_message(f"Audio processing failed: {response_string[6:]}")
148
- return False, response_string[6:]
149
-
150
- else:
151
- log_accelerator_message(f"Audio processing unknown response: {response_string}")
152
- return False, response_string
153
-
154
- def convert_to_mono(
155
- self,
156
- input_file_path: str,
157
- output_file_path: str
158
- ) -> Tuple[bool, str]:
159
- payload = self._pack_process_audio_request(
160
- input_file_path,
161
- output_file_path,
162
- 0,
163
- 0
164
- )
165
-
166
- log_accelerator_message(f"Converting to mono: {input_file_path} -> {output_file_path}")
167
-
168
- response = self._send_command(COMMAND_CONVERT_TO_MONO, payload)
169
-
170
- if response is None:
171
- log_accelerator_message("Failed to communicate with accelerator for convert_to_mono")
172
- return False, "Failed to communicate with accelerator"
173
-
174
- response_string = response.decode("utf-8", errors="ignore")
175
-
176
- if response_string.startswith("SUCCESS:"):
177
- log_accelerator_message(f"Mono conversion succeeded: {response_string[8:]}")
178
- return True, response_string[8:]
179
-
180
- elif response_string.startswith("ERROR:"):
181
- log_accelerator_message(f"Mono conversion failed: {response_string[6:]}")
182
- return False, response_string[6:]
183
-
184
- else:
185
- log_accelerator_message(f"Mono conversion unknown response: {response_string}")
186
- return False, response_string
187
-
188
- def convert_to_pcm(
189
- self,
190
- input_file_path: str,
191
- output_file_path: str
192
- ) -> Tuple[bool, str]:
193
- payload = self._pack_process_audio_request(
194
- input_file_path,
195
- output_file_path,
196
- 0,
197
- 0
198
- )
199
-
200
- log_accelerator_message(f"Converting to PCM: {input_file_path} -> {output_file_path}")
201
-
202
- response = self._send_command(COMMAND_CONVERT_TO_PCM, payload)
203
-
204
- if response is None:
205
- log_accelerator_message("Failed to communicate with accelerator for convert_to_pcm")
206
- return False, "Failed to communicate with accelerator"
207
-
208
- response_string = response.decode("utf-8", errors="ignore")
209
-
210
- if response_string.startswith("SUCCESS:"):
211
- log_accelerator_message(f"PCM conversion succeeded: {response_string[8:]}")
212
- return True, response_string[8:]
213
-
214
- elif response_string.startswith("ERROR:"):
215
- log_accelerator_message(f"PCM conversion failed: {response_string[6:]}")
216
- return False, response_string[6:]
217
-
218
- else:
219
- log_accelerator_message(f"PCM conversion unknown response: {response_string}")
220
- return False, response_string
221
-
222
- def resample_audio(
223
- self,
224
- input_file_path: str,
225
- output_file_path: str,
226
- target_sample_rate: int
227
- ) -> Tuple[bool, str]:
228
- payload = self._pack_process_audio_request(
229
- input_file_path,
230
- output_file_path,
231
- target_sample_rate,
232
- 0
233
- )
234
-
235
- log_accelerator_message(f"Resampling audio to {target_sample_rate}Hz: {input_file_path} -> {output_file_path}")
236
-
237
- response = self._send_command(COMMAND_RESAMPLE_AUDIO, payload)
238
-
239
- if response is None:
240
- log_accelerator_message("Failed to communicate with accelerator for resample_audio")
241
- return False, "Failed to communicate with accelerator"
242
-
243
- response_string = response.decode("utf-8", errors="ignore")
244
-
245
- if response_string.startswith("SUCCESS:"):
246
- log_accelerator_message(f"Resampling succeeded: {response_string[8:]}")
247
- return True, response_string[8:]
248
-
249
- elif response_string.startswith("ERROR:"):
250
- log_accelerator_message(f"Resampling failed: {response_string[6:]}")
251
- return False, response_string[6:]
252
-
253
- else:
254
- log_accelerator_message(f"Resampling unknown response: {response_string}")
255
- return False, response_string
256
-
257
- def get_memory_stats(self) -> Optional[Dict[str, int]]:
258
- response = self._send_command(COMMAND_GET_MEMORY_STATS, b"")
259
-
260
- if response is None or len(response) < MEMORY_STATS_RESPONSE_SIZE:
261
- log_accelerator_message("Failed to get memory stats from accelerator")
262
- return None
263
-
264
- total_allocated, total_used, block_count = struct.unpack(
265
- MEMORY_STATS_RESPONSE_FORMAT,
266
- response[:MEMORY_STATS_RESPONSE_SIZE]
267
- )
268
-
269
- stats = {
270
- "total_allocated_bytes": total_allocated,
271
- "total_used_bytes": total_used,
272
- "block_count": block_count
273
- }
274
-
275
- log_accelerator_message(f"Memory stats: allocated={total_allocated}, used={total_used}, blocks={block_count}")
276
-
277
- return stats
278
-
279
- def clear_memory_pool(self) -> bool:
280
- log_accelerator_message("Clearing accelerator memory pool")
281
- response = self._send_command(COMMAND_CLEAR_MEMORY_POOL, b"")
282
- success = response is not None
283
- if success:
284
- log_accelerator_message("Memory pool cleared successfully")
285
- else:
286
- log_accelerator_message("Failed to clear memory pool")
287
- return success
288
-
289
- def shutdown_accelerator(self) -> bool:
290
- log_accelerator_message("Sending shutdown command to accelerator")
291
- response = self._send_command(COMMAND_SHUTDOWN, b"")
292
- return response is not None
293
-
294
- def _get_next_request_id(self) -> int:
295
- global request_id_counter
296
-
297
- with request_id_lock:
298
- request_id_counter += 1
299
- return request_id_counter
300
-
301
- def _pack_process_audio_request(
302
- self,
303
- input_path: str,
304
- output_path: str,
305
- target_sample_rate: int,
306
- options_flags: int
307
- ) -> bytes:
308
- input_path_bytes = input_path.encode("utf-8")[:511] + b"\x00"
309
- output_path_bytes = output_path.encode("utf-8")[:511] + b"\x00"
310
-
311
- input_path_padded = input_path_bytes.ljust(512, b"\x00")
312
- output_path_padded = output_path_bytes.ljust(512, b"\x00")
313
-
314
- return struct.pack(
315
- PROCESS_AUDIO_REQUEST_FORMAT,
316
- input_path_padded,
317
- output_path_padded,
318
- target_sample_rate,
319
- options_flags
320
- )
321
-
322
- def _send_command(
323
- self,
324
- command_type: int,
325
- payload: bytes
326
- ) -> Optional[bytes]:
327
- try:
328
- client_socket = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
329
- client_socket.settimeout(self.connection_timeout)
330
- client_socket.connect(self.socket_path)
331
-
332
- request_id = self._get_next_request_id()
333
-
334
- request_header = struct.pack(
335
- REQUEST_HEADER_FORMAT,
336
- PROTOCOL_MAGIC_NUMBER,
337
- command_type,
338
- len(payload),
339
- request_id
340
- )
341
-
342
- client_socket.sendall(request_header)
343
-
344
- if payload:
345
- client_socket.sendall(payload)
346
-
347
- client_socket.settimeout(self.read_timeout)
348
-
349
- response_header_data = self._receive_exactly(client_socket, RESPONSE_HEADER_SIZE)
350
-
351
- if response_header_data is None:
352
- client_socket.close()
353
- return None
354
-
355
- magic_number, status_code, payload_size, response_request_id = struct.unpack(
356
- RESPONSE_HEADER_FORMAT,
357
- response_header_data
358
- )
359
-
360
- if magic_number != PROTOCOL_MAGIC_NUMBER:
361
- log_accelerator_message(f"Invalid magic number in response: {magic_number}")
362
- client_socket.close()
363
- return None
364
-
365
- if response_request_id != request_id:
366
- log_accelerator_message(f"Request ID mismatch: expected {request_id}, got {response_request_id}")
367
- client_socket.close()
368
- return None
369
-
370
- response_payload = b""
371
-
372
- if payload_size > 0:
373
- response_payload = self._receive_exactly(client_socket, payload_size)
374
-
375
- if response_payload is None:
376
- client_socket.close()
377
- return None
378
-
379
- client_socket.close()
380
-
381
- if status_code != RESPONSE_SUCCESS:
382
- return response_payload if response_payload else None
383
-
384
- return response_payload
385
-
386
- except socket.timeout:
387
- log_accelerator_message("Socket timeout while communicating with accelerator")
388
- return None
389
-
390
- except socket.error as socket_err:
391
- log_accelerator_message(f"Socket error: {str(socket_err)}")
392
- return None
393
-
394
- except Exception as general_error:
395
- log_accelerator_message(f"Unexpected error: {str(general_error)}")
396
- return None
397
-
398
- def _receive_exactly(
399
- self,
400
- client_socket: socket.socket,
401
- num_bytes: int
402
- ) -> Optional[bytes]:
403
- received_data = b""
404
- remaining_bytes = num_bytes
405
-
406
- while remaining_bytes > 0:
407
- try:
408
- chunk = client_socket.recv(remaining_bytes)
409
-
410
- if not chunk:
411
- return None
412
-
413
- received_data += chunk
414
- remaining_bytes -= len(chunk)
415
-
416
- except socket.timeout:
417
- return None
418
-
419
- except socket.error:
420
- return None
421
-
422
- return received_data
423
-
424
- def is_accelerator_available() -> bool:
425
- if not os.path.exists(ACCELERATOR_SOCKET_PATH):
426
- return False
427
-
428
- client = AcceleratorClient()
429
- return client.is_connected()
430
-
431
- def start_accelerator_daemon() -> bool:
432
- global accelerator_process_handle
433
-
434
- from ..core import state as global_state
435
-
436
- with accelerator_process_lock:
437
- if accelerator_process_handle is not None:
438
- if accelerator_process_handle.poll() is None:
439
- return True
440
-
441
- if not os.path.exists(ACCELERATOR_BINARY_PATH):
442
- log_accelerator_message(f"Accelerator binary not found: {ACCELERATOR_BINARY_PATH}")
443
- return False
444
-
445
- try:
446
- log_accelerator_message("Starting accelerator daemon...")
447
-
448
- global_state.accelerator_log_stop_event.clear()
449
-
450
- accelerator_process_handle = subprocess.Popen(
451
- [
452
- ACCELERATOR_BINARY_PATH,
453
- "--socket", ACCELERATOR_SOCKET_PATH,
454
- "--threads", str(ACCELERATOR_WORKER_THREADS),
455
- "--memory", str(ACCELERATOR_MEMORY_POOL_MB)
456
- ],
457
- stdout=subprocess.PIPE,
458
- stderr=subprocess.PIPE,
459
- start_new_session=True
460
- )
461
-
462
- stdout_thread = threading.Thread(
463
- target=stream_accelerator_output,
464
- args=(accelerator_process_handle,),
465
- daemon=True,
466
- name="AcceleratorStdoutThread"
467
- )
468
- stdout_thread.start()
469
-
470
- stderr_thread = threading.Thread(
471
- target=stream_accelerator_stderr,
472
- args=(accelerator_process_handle,),
473
- daemon=True,
474
- name="AcceleratorStderrThread"
475
- )
476
- stderr_thread.start()
477
-
478
- for attempt_index in range(50):
479
- if is_accelerator_available():
480
- log_accelerator_message("Accelerator daemon started and responding")
481
- return True
482
-
483
- available = is_accelerator_available()
484
- if available:
485
- log_accelerator_message("Accelerator daemon started successfully")
486
- else:
487
- log_accelerator_message("Accelerator daemon started but not responding")
488
-
489
- return available
490
-
491
- except Exception as start_error:
492
- log_accelerator_message(f"Failed to start accelerator daemon: {str(start_error)}")
493
- return False
494
-
495
- def stop_accelerator_daemon() -> bool:
496
- global accelerator_process_handle
497
-
498
- from ..core import state as global_state
499
-
500
- with accelerator_process_lock:
501
- global_state.accelerator_log_stop_event.set()
502
-
503
- if is_accelerator_available():
504
- try:
505
- log_accelerator_message("Sending shutdown command to accelerator...")
506
- client = AcceleratorClient()
507
- client.shutdown_accelerator()
508
-
509
- except Exception as shutdown_error:
510
- log_accelerator_message(f"Error during shutdown command: {str(shutdown_error)}")
511
-
512
- if accelerator_process_handle is not None:
513
- if accelerator_process_handle.poll() is None:
514
- try:
515
- log_accelerator_message("Terminating accelerator process...")
516
- accelerator_process_handle.terminate()
517
- accelerator_process_handle.wait(timeout=5)
518
- log_accelerator_message("Accelerator process terminated")
519
-
520
- except subprocess.TimeoutExpired:
521
- log_accelerator_message("Accelerator process did not terminate, killing...")
522
- accelerator_process_handle.kill()
523
- accelerator_process_handle.wait()
524
- log_accelerator_message("Accelerator process killed")
525
-
526
- accelerator_process_handle = None
527
-
528
- return True
529
-
530
- def process_audio_with_accelerator(
531
- input_file_path: str,
532
- output_file_path: str
533
- ) -> Tuple[bool, str]:
534
- if not is_accelerator_available():
535
- return False, "Accelerator not available"
536
-
537
- client = AcceleratorClient()
538
- return client.process_audio(input_file_path, output_file_path)
539
-
540
- def convert_to_mono_with_accelerator(
541
- input_file_path: str,
542
- output_file_path: str
543
- ) -> Tuple[bool, str]:
544
- if not is_accelerator_available():
545
- return False, "Accelerator not available"
546
-
547
- client = AcceleratorClient()
548
- return client.convert_to_mono(input_file_path, output_file_path)
549
-
550
- def convert_to_pcm_with_accelerator(
551
- input_file_path: str,
552
- output_file_path: str
553
- ) -> Tuple[bool, str]:
554
- if not is_accelerator_available():
555
- return False, "Accelerator not available"
556
-
557
- client = AcceleratorClient()
558
- return client.convert_to_pcm(input_file_path, output_file_path)
559
-
560
- def resample_audio_with_accelerator(
561
- input_file_path: str,
562
- output_file_path: str,
563
- target_sample_rate: int
564
- ) -> Tuple[bool, str]:
565
- if not is_accelerator_available():
566
- return False, "Accelerator not available"
567
-
568
- client = AcceleratorClient()
569
- return client.resample_audio(input_file_path, output_file_path, target_sample_rate)
570
-
571
- def get_accelerator_memory_stats() -> Optional[Dict[str, int]]:
572
- if not is_accelerator_available():
573
- return None
574
-
575
- client = AcceleratorClient()
576
- return client.get_memory_stats()
577
-
578
- def clear_accelerator_memory_pool() -> bool:
579
- if not is_accelerator_available():
580
- return False
581
-
582
- client = AcceleratorClient()
583
- return client.clear_memory_pool()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/audio/converter.py DELETED
@@ -1,344 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import os
7
- import time
8
- import tempfile
9
- import numpy as np
10
- import scipy.io.wavfile
11
- from config import AUDIO_CONVERSION_QUEUE_TIMEOUT_SECONDS
12
- from ..core.state import (
13
- temporary_files_registry,
14
- temporary_files_lock,
15
- audio_conversion_semaphore,
16
- increment_audio_conversion_active,
17
- decrement_audio_conversion_active,
18
- increment_audio_conversion_waiting,
19
- decrement_audio_conversion_waiting,
20
- is_audio_conversion_queue_busy
21
- )
22
- from ..core.memory import trigger_background_cleanup_check
23
- from ..accelerator.client import (
24
- is_accelerator_available,
25
- convert_to_pcm_with_accelerator,
26
- process_audio_with_accelerator,
27
- log_accelerator_message
28
- )
29
-
30
- class AudioConversionQueueError(Exception):
31
- pass
32
-
33
- class AudioConversionQueueBusyError(AudioConversionQueueError):
34
- pass
35
-
36
- class AudioConversionQueueTimeoutError(AudioConversionQueueError):
37
- pass
38
-
39
- def convert_audio_data_to_pcm_int16(audio_data):
40
- if audio_data.dtype == np.float32 or audio_data.dtype == np.float64:
41
- audio_data_clipped = np.clip(audio_data, -1.0, 1.0)
42
- audio_data_int16 = (audio_data_clipped * 32767).astype(np.int16)
43
- return audio_data_int16
44
-
45
- if audio_data.dtype == np.int32:
46
- audio_data_int16 = (audio_data >> 16).astype(np.int16)
47
- return audio_data_int16
48
-
49
- if audio_data.dtype == np.uint8:
50
- audio_data_int16 = ((audio_data.astype(np.int16) - 128) * 256).astype(np.int16)
51
- return audio_data_int16
52
-
53
- if audio_data.dtype == np.int16:
54
- return audio_data
55
-
56
- if audio_data.dtype == np.int64:
57
- audio_data_int16 = (audio_data >> 48).astype(np.int16)
58
- return audio_data_int16
59
-
60
- return audio_data.astype(np.int16)
61
-
62
- def convert_stereo_to_mono(audio_data):
63
- if len(audio_data.shape) == 1:
64
- return audio_data
65
-
66
- if len(audio_data.shape) == 2:
67
- if audio_data.shape[0] > audio_data.shape[1]:
68
- audio_data = audio_data.T
69
-
70
- if audio_data.shape[0] > 1:
71
- mono_audio = np.mean(audio_data, axis=0)
72
- return mono_audio.astype(audio_data.dtype)
73
-
74
- return audio_data[0]
75
-
76
- return audio_data
77
-
78
- def register_temporary_file(file_path):
79
- with temporary_files_lock:
80
- temporary_files_registry[file_path] = time.time()
81
- trigger_background_cleanup_check()
82
-
83
- def acquire_audio_conversion_slot(wait_for_slot=True):
84
- if is_audio_conversion_queue_busy():
85
- if not wait_for_slot:
86
- raise AudioConversionQueueBusyError(
87
- "Audio conversion is currently in progress for another user. Please wait a moment and try again."
88
- )
89
-
90
- increment_audio_conversion_waiting()
91
-
92
- try:
93
- acquired = audio_conversion_semaphore.acquire(
94
- blocking=True,
95
- timeout=AUDIO_CONVERSION_QUEUE_TIMEOUT_SECONDS
96
- )
97
-
98
- if not acquired:
99
- raise AudioConversionQueueTimeoutError(
100
- "Audio conversion queue timed out. The server is busy processing other requests. Please try again in a moment."
101
- )
102
-
103
- finally:
104
- decrement_audio_conversion_waiting()
105
-
106
- else:
107
- acquired = audio_conversion_semaphore.acquire(blocking=False)
108
-
109
- if not acquired:
110
- if not wait_for_slot:
111
- raise AudioConversionQueueBusyError(
112
- "Audio conversion is currently in progress for another user. Please wait a moment and try again."
113
- )
114
-
115
- increment_audio_conversion_waiting()
116
-
117
- try:
118
- acquired = audio_conversion_semaphore.acquire(
119
- blocking=True,
120
- timeout=AUDIO_CONVERSION_QUEUE_TIMEOUT_SECONDS
121
- )
122
-
123
- if not acquired:
124
- raise AudioConversionQueueTimeoutError(
125
- "Audio conversion queue timed out. The server is busy processing other requests. Please try again in a moment."
126
- )
127
-
128
- finally:
129
- decrement_audio_conversion_waiting()
130
-
131
- increment_audio_conversion_active()
132
-
133
- def release_audio_conversion_slot():
134
- decrement_audio_conversion_active()
135
- audio_conversion_semaphore.release()
136
-
137
- def convert_wav_file_to_pcm_format_with_accelerator(input_path):
138
- output_file = tempfile.NamedTemporaryFile(suffix="_accel_pcm_converted.wav", delete=False)
139
- output_path = output_file.name
140
- output_file.close()
141
-
142
- success, result_message = convert_to_pcm_with_accelerator(input_path, output_path)
143
-
144
- if success:
145
- register_temporary_file(output_path)
146
- return output_path, None
147
-
148
- else:
149
- if os.path.exists(output_path):
150
- try:
151
- os.remove(output_path)
152
-
153
- except Exception:
154
- pass
155
-
156
- return None, result_message
157
-
158
- def convert_wav_file_to_pcm_format(input_path):
159
- if is_accelerator_available():
160
- log_accelerator_message(f"Using accelerator for PCM conversion: {input_path}")
161
- accelerated_result, accelerated_error = convert_wav_file_to_pcm_format_with_accelerator(input_path)
162
- if accelerated_result is not None:
163
- return accelerated_result, None
164
- log_accelerator_message(f"Accelerator PCM conversion failed, falling back to Python: {accelerated_error}")
165
-
166
- try:
167
- sample_rate, audio_data = scipy.io.wavfile.read(input_path)
168
-
169
- if len(audio_data.shape) > 1:
170
- audio_data = convert_stereo_to_mono(audio_data)
171
-
172
- audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)
173
-
174
- output_file = tempfile.NamedTemporaryFile(suffix="_pcm_converted.wav", delete=False)
175
- scipy.io.wavfile.write(output_file.name, sample_rate, audio_data_pcm)
176
-
177
- register_temporary_file(output_file.name)
178
-
179
- return output_file.name, None
180
-
181
- except Exception as conversion_error:
182
- return None, f"Failed to convert WAV to PCM format: {str(conversion_error)}"
183
-
184
- def convert_audio_using_pydub(input_path, target_sample_rate=None):
185
- try:
186
- from pydub import AudioSegment
187
-
188
- audio_segment = AudioSegment.from_file(input_path)
189
-
190
- audio_segment = audio_segment.set_channels(1)
191
- audio_segment = audio_segment.set_sample_width(2)
192
-
193
- if target_sample_rate is not None:
194
- audio_segment = audio_segment.set_frame_rate(target_sample_rate)
195
-
196
- output_file = tempfile.NamedTemporaryFile(suffix="_pydub_converted.wav", delete=False)
197
- audio_segment.export(output_file.name, format="wav")
198
-
199
- register_temporary_file(output_file.name)
200
-
201
- return output_file.name, None
202
-
203
- except ImportError:
204
- return None, "pydub_library_not_available"
205
-
206
- except Exception as conversion_error:
207
- error_message = str(conversion_error)
208
- if "ffmpeg" in error_message.lower() or "ffprobe" in error_message.lower():
209
- return None, "ffmpeg_not_available"
210
- return None, f"Failed to convert audio using pydub: {error_message}"
211
-
212
- def convert_audio_using_soundfile(input_path):
213
- try:
214
- import soundfile
215
-
216
- audio_data, sample_rate = soundfile.read(input_path, dtype='float32')
217
-
218
- if len(audio_data.shape) > 1:
219
- audio_data = np.mean(audio_data, axis=1)
220
-
221
- audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)
222
-
223
- output_file = tempfile.NamedTemporaryFile(suffix="_soundfile_converted.wav", delete=False)
224
- scipy.io.wavfile.write(output_file.name, sample_rate, audio_data_pcm)
225
-
226
- register_temporary_file(output_file.name)
227
-
228
- return output_file.name, None
229
-
230
- except ImportError:
231
- return None, "soundfile_library_not_available"
232
-
233
- except Exception as conversion_error:
234
- return None, f"Failed to convert audio using soundfile: {str(conversion_error)}"
235
-
236
- def convert_audio_using_librosa(input_path):
237
- try:
238
- import librosa
239
-
240
- audio_data, sample_rate = librosa.load(input_path, sr=None, mono=True)
241
-
242
- audio_data_pcm = convert_audio_data_to_pcm_int16(audio_data)
243
-
244
- output_file = tempfile.NamedTemporaryFile(suffix="_librosa_converted.wav", delete=False)
245
- scipy.io.wavfile.write(output_file.name, sample_rate, audio_data_pcm)
246
-
247
- register_temporary_file(output_file.name)
248
-
249
- return output_file.name, None
250
-
251
- except ImportError:
252
- return None, "librosa_library_not_available"
253
-
254
- except Exception as conversion_error:
255
- return None, f"Failed to convert audio using librosa: {str(conversion_error)}"
256
-
257
- def convert_non_wav_audio_to_wav(input_path):
258
- converted_path, pydub_error = convert_audio_using_pydub(input_path)
259
- if converted_path is not None:
260
- return converted_path, None, "pydub"
261
-
262
- converted_path, soundfile_error = convert_audio_using_soundfile(input_path)
263
- if converted_path is not None:
264
- return converted_path, None, "soundfile"
265
-
266
- converted_path, librosa_error = convert_audio_using_librosa(input_path)
267
- if converted_path is not None:
268
- return converted_path, None, "librosa"
269
-
270
- pydub_unavailable = pydub_error in ["pydub_library_not_available", "ffmpeg_not_available"]
271
- soundfile_unavailable = soundfile_error == "soundfile_library_not_available"
272
- librosa_unavailable = librosa_error == "librosa_library_not_available"
273
-
274
- if pydub_unavailable and soundfile_unavailable and librosa_unavailable:
275
- return None, "No audio conversion library is available on the server. Please upload a WAV file directly.", None
276
-
277
- all_errors = []
278
- if not pydub_unavailable and pydub_error:
279
- all_errors.append(f"pydub: {pydub_error}")
280
-
281
- if not soundfile_unavailable and soundfile_error:
282
- all_errors.append(f"soundfile: {soundfile_error}")
283
-
284
- if not librosa_unavailable and librosa_error:
285
- all_errors.append(f"librosa: {librosa_error}")
286
-
287
- if all_errors:
288
- combined_error = " | ".join(all_errors)
289
- return None, f"Audio conversion failed with all available methods. {combined_error}", None
290
-
291
- return None, "Audio conversion failed. Please try uploading a different audio file or use WAV format.", None
292
-
293
- def prepare_audio_file_for_voice_cloning_internal(input_path):
294
- from .validator import perform_comprehensive_audio_validation, get_format_display_name
295
-
296
- is_valid, is_wav_format, detected_format, validation_error = perform_comprehensive_audio_validation(input_path)
297
-
298
- if not is_valid:
299
- return None, validation_error, False, detected_format
300
-
301
- if is_wav_format:
302
- converted_path, conversion_error = convert_wav_file_to_pcm_format(input_path)
303
- if converted_path is not None:
304
- return converted_path, None, False, 'wav'
305
- return None, conversion_error, False, 'wav'
306
-
307
- format_display_name = get_format_display_name(detected_format)
308
-
309
- converted_path, conversion_error, conversion_method = convert_non_wav_audio_to_wav(input_path)
310
- if converted_path is not None:
311
- final_path, pcm_error = convert_wav_file_to_pcm_format(converted_path)
312
- if final_path is not None:
313
- return final_path, None, True, detected_format
314
- return converted_path, None, True, detected_format
315
-
316
- return None, conversion_error, True, detected_format
317
-
318
- def prepare_audio_file_for_voice_cloning(input_path, wait_for_queue=True):
319
- try:
320
- acquire_audio_conversion_slot(wait_for_slot=wait_for_queue)
321
-
322
- except AudioConversionQueueBusyError as queue_busy_error:
323
- return None, str(queue_busy_error), False, None
324
-
325
- except AudioConversionQueueTimeoutError as queue_timeout_error:
326
- return None, str(queue_timeout_error), False, None
327
-
328
- try:
329
- result_path, result_error, was_converted, detected_format = prepare_audio_file_for_voice_cloning_internal(input_path)
330
- return result_path, result_error, was_converted, detected_format
331
-
332
- finally:
333
- release_audio_conversion_slot()
334
-
335
- def convert_audio_to_pcm_wav(input_path):
336
- converted_path, error, was_converted, detected_format = prepare_audio_file_for_voice_cloning(input_path)
337
-
338
- if converted_path is not None:
339
- return converted_path
340
-
341
- if error:
342
- print(f"Warning: Audio conversion failed - {error}")
343
-
344
- return input_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/audio/validator.py DELETED
@@ -1,268 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import os
7
- import wave
8
- from config import (
9
- SUPPORTED_AUDIO_EXTENSIONS,
10
- AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES,
11
- MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES
12
- )
13
-
14
- def build_format_display_names_from_supported_extensions():
15
- format_display_names = {}
16
-
17
- for extension in SUPPORTED_AUDIO_EXTENSIONS:
18
- format_code = extension.lstrip(".")
19
-
20
- if format_code in AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES:
21
- format_display_names[format_code] = AUDIO_FORMAT_DISPLAY_NAME_OVERRIDES[format_code]
22
- else:
23
- format_display_names[format_code] = format_code.upper()
24
-
25
- format_display_names["unknown"] = "Unknown"
26
-
27
- return format_display_names
28
-
29
- FORMAT_DISPLAY_NAMES = build_format_display_names_from_supported_extensions()
30
-
31
- def get_audio_file_extension(file_path):
32
- if not file_path:
33
- return None
34
-
35
- _, extension = os.path.splitext(file_path)
36
-
37
- return extension.lower()
38
-
39
- def is_supported_audio_extension(file_path):
40
- extension = get_audio_file_extension(file_path)
41
-
42
- if extension is None:
43
- return False
44
-
45
- return extension in SUPPORTED_AUDIO_EXTENSIONS
46
-
47
- def format_file_size_for_display(size_bytes):
48
- if size_bytes < 1024:
49
- return f"{size_bytes} bytes"
50
-
51
- elif size_bytes < 1024 * 1024:
52
- return f"{size_bytes / 1024:.1f} KB"
53
-
54
- else:
55
- return f"{size_bytes / (1024 * 1024):.2f} MB"
56
-
57
- def validate_file_size_for_voice_cloning(file_path):
58
- if not file_path:
59
- return False, "No audio file provided."
60
-
61
- try:
62
- file_size = os.path.getsize(file_path)
63
-
64
- except OSError as size_error:
65
- return False, f"Cannot read file size: {str(size_error)}"
66
-
67
- if file_size > MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES:
68
- max_size_display = format_file_size_for_display(MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES)
69
- actual_size_display = format_file_size_for_display(file_size)
70
- return False, f"Audio file size ({actual_size_display}) exceeds the maximum allowed size of {max_size_display}. Please upload a smaller audio file."
71
-
72
- return True, None
73
-
74
- def validate_file_exists_and_readable(file_path):
75
- if not file_path:
76
- return False, "No audio file provided."
77
-
78
- if not os.path.exists(file_path):
79
- return False, "Audio file does not exist."
80
-
81
- if not os.path.isfile(file_path):
82
- return False, "The provided path is not a valid file."
83
-
84
- try:
85
- file_size = os.path.getsize(file_path)
86
-
87
- except OSError as size_error:
88
- return False, f"Cannot read file size: {str(size_error)}"
89
-
90
- if file_size == 0:
91
- return False, "Audio file is empty (0 bytes)."
92
-
93
- if file_size < 44:
94
- return False, "Audio file is too small to be a valid audio file."
95
-
96
- try:
97
- with open(file_path, "rb") as test_file:
98
- test_file.read(1)
99
- except IOError as read_error:
100
- return False, f"Audio file is not readable: {str(read_error)}"
101
-
102
- return True, None
103
-
104
- def detect_audio_format_from_header(file_path):
105
- try:
106
- with open(file_path, "rb") as audio_file:
107
- header_bytes = audio_file.read(32)
108
-
109
- if len(header_bytes) < 4:
110
- return None, "File is too small to determine audio format."
111
-
112
- if len(header_bytes) >= 12:
113
- if header_bytes[:4] == b"RIFF" and header_bytes[8:12] == b"WAVE":
114
- return "wav", None
115
-
116
- if header_bytes[:3] == b"ID3":
117
- return "mp3", None
118
-
119
- if len(header_bytes) >= 2:
120
- first_two_bytes = header_bytes[:2]
121
-
122
- mp3_sync_bytes = [
123
- b"\xff\xfb",
124
- b"\xff\xfa",
125
- b"\xff\xf3",
126
- b"\xff\xf2",
127
- b"\xff\xe0",
128
- b"\xff\xe2",
129
- b"\xff\xe3"
130
- ]
131
-
132
- if first_two_bytes in mp3_sync_bytes:
133
- return "mp3", None
134
-
135
- if header_bytes[:4] == b"fLaC":
136
- return "flac", None
137
-
138
- if header_bytes[:4] == b"OggS":
139
- return "ogg", None
140
-
141
- if len(header_bytes) >= 12:
142
- if header_bytes[:4] == b"FORM" and header_bytes[8:12] in [b"AIFF", b"AIFC"]:
143
- return "aiff", None
144
-
145
- if len(header_bytes) >= 8:
146
- if header_bytes[4:8] == b"ftyp":
147
- return "m4a", None
148
-
149
- if len(header_bytes) >= 4:
150
- if header_bytes[:4] == b"\x1aE\xdf\xa3":
151
- return "webm", None
152
-
153
- if len(header_bytes) >= 8:
154
- if header_bytes[4:8] in [b"mdat", b"moov", b"free", b"skip", b"wide"]:
155
- return "m4a", None
156
-
157
- file_extension = get_audio_file_extension(file_path)
158
-
159
- if file_extension and file_extension in SUPPORTED_AUDIO_EXTENSIONS:
160
- return file_extension.lstrip("."), None
161
-
162
- return "unknown", "Could not determine audio format from file header. The file may be corrupted or in an unsupported format."
163
-
164
- except IOError as io_error:
165
- return None, f"Error reading file header: {str(io_error)}"
166
-
167
- except Exception as detection_error:
168
- return None, f"Unexpected error detecting audio format: {str(detection_error)}"
169
-
170
- def validate_wav_file_structure(file_path):
171
- try:
172
- with wave.open(file_path, "rb") as wav_file:
173
- number_of_channels = wav_file.getnchannels()
174
- sample_width_bytes = wav_file.getsampwidth()
175
- sample_rate = wav_file.getframerate()
176
- number_of_frames = wav_file.getnframes()
177
-
178
- if number_of_channels < 1:
179
- return False, "WAV file has no audio channels."
180
-
181
- if number_of_channels > 16:
182
- return False, f"WAV file has too many channels ({number_of_channels}). Maximum supported is 16."
183
-
184
- if sample_width_bytes < 1:
185
- return False, "WAV file has invalid sample width (less than 1 byte)."
186
-
187
- if sample_width_bytes > 4:
188
- return False, f"WAV file has unsupported sample width ({sample_width_bytes} bytes). Maximum supported is 4 bytes (32-bit)."
189
-
190
- if sample_rate < 100:
191
- return False, f"WAV file has invalid sample rate ({sample_rate} Hz). Minimum supported is 100 Hz."
192
-
193
- if sample_rate > 384000:
194
- return False, f"WAV file has unsupported sample rate ({sample_rate} Hz). Maximum supported is 384000 Hz."
195
-
196
- if number_of_frames < 1:
197
- return False, "WAV file contains no audio frames."
198
-
199
- audio_duration_seconds = number_of_frames / sample_rate
200
-
201
- if audio_duration_seconds < 0.1:
202
- return False, f"Audio is too short ({audio_duration_seconds:.2f} seconds). Minimum duration is 0.1 seconds."
203
-
204
- if audio_duration_seconds > 60:
205
- return False, f"Audio is too long ({audio_duration_seconds:.0f} seconds). Maximum duration is 1 minute."
206
-
207
- return True, None
208
-
209
- except wave.Error as wav_error:
210
- error_message = str(wav_error)
211
-
212
- if "file does not start with RIFF id" in error_message:
213
- return False, "File has .wav extension but is not a valid WAV file. It may be a different audio format renamed to .wav."
214
-
215
- if "unknown format" in error_message.lower():
216
- return False, "WAV file uses an unsupported audio encoding format."
217
-
218
- return False, f"Invalid WAV file structure: {error_message}"
219
-
220
- except EOFError:
221
- return False, "WAV file is truncated or corrupted (unexpected end of file)."
222
-
223
- except Exception as validation_error:
224
- return False, f"Error validating WAV file: {str(validation_error)}"
225
-
226
- def perform_comprehensive_audio_validation(file_path):
227
- file_exists_valid, file_exists_error = validate_file_exists_and_readable(file_path)
228
-
229
- if not file_exists_valid:
230
- return False, False, None, file_exists_error
231
-
232
- file_extension = get_audio_file_extension(file_path)
233
-
234
- if not is_supported_audio_extension(file_path):
235
- supported_formats_list = ", ".join(SUPPORTED_AUDIO_EXTENSIONS)
236
- return False, False, None, f"Unsupported file format '{file_extension}'. Supported formats are: {supported_formats_list}"
237
-
238
- detected_format, detection_error = detect_audio_format_from_header(file_path)
239
-
240
- if detected_format is None:
241
- return False, False, None, detection_error
242
-
243
- is_wav_format = (detected_format == "wav")
244
-
245
- if is_wav_format:
246
- wav_structure_valid, wav_structure_error = validate_wav_file_structure(file_path)
247
-
248
- if not wav_structure_valid:
249
- return False, True, "wav", wav_structure_error
250
-
251
- return True, is_wav_format, detected_format, None
252
-
253
- def perform_voice_clone_file_validation(file_path):
254
- file_size_valid, file_size_error = validate_file_size_for_voice_cloning(file_path)
255
-
256
- if not file_size_valid:
257
- return False, False, None, file_size_error
258
-
259
- return perform_comprehensive_audio_validation(file_path)
260
-
261
- def get_format_display_name(format_code):
262
- if format_code is None:
263
- return "Unknown"
264
-
265
- if format_code in FORMAT_DISPLAY_NAMES:
266
- return FORMAT_DISPLAY_NAMES[format_code]
267
-
268
- return format_code.upper()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/core/authentication.py DELETED
@@ -1,23 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- from config import HF_TOKEN
7
- from huggingface_hub import login
8
-
9
- def authenticate_huggingface():
10
- if HF_TOKEN:
11
- try:
12
- login(token=HF_TOKEN, add_to_git_credential=False)
13
- print("Authenticated with Hugging Face", flush=True)
14
-
15
- except Exception as authentication_error:
16
- print(f"Hugging Face authentication failed: {authentication_error}", flush=True)
17
- print("Voice cloning may not be available", flush=True)
18
-
19
- else:
20
- print("Missing Hugging Face authentication required for the license agreement", flush=True)
21
-
22
- def get_huggingface_token():
23
- return HF_TOKEN
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/core/memory.py DELETED
@@ -1,394 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import os
7
- import gc
8
- import time
9
- import atexit
10
- import threading
11
- import torch
12
- from config import (
13
- TEMPORARY_FILE_LIFETIME_SECONDS,
14
- BACKGROUND_CLEANUP_INTERVAL,
15
- MEMORY_WARNING_THRESHOLD,
16
- MEMORY_CRITICAL_THRESHOLD,
17
- MEMORY_CHECK_INTERVAL,
18
- MEMORY_IDLE_TARGET,
19
- MAXIMUM_MEMORY_USAGE
20
- )
21
- from ..core.state import (
22
- temporary_files_registry,
23
- temporary_files_lock,
24
- memory_enforcement_lock,
25
- background_cleanup_thread,
26
- background_cleanup_stop_event,
27
- background_cleanup_trigger_event,
28
- check_if_generation_is_currently_active,
29
- get_text_to_speech_manager,
30
- is_model_in_use
31
- )
32
-
33
- def get_current_memory_usage():
34
- try:
35
- with open('/proc/self/status', 'r') as status_file:
36
- for line in status_file:
37
- if line.startswith('VmRSS:'):
38
- memory_value_kb = int(line.split()[1])
39
- return memory_value_kb * 1024
40
-
41
- except Exception:
42
- pass
43
-
44
- try:
45
- with open('/proc/self/statm', 'r') as statm_file:
46
- statm_values = statm_file.read().split()
47
- resident_pages = int(statm_values[1])
48
- page_size = os.sysconf('SC_PAGE_SIZE')
49
- return resident_pages * page_size
50
-
51
- except Exception:
52
- pass
53
-
54
- try:
55
- import resource
56
- import platform
57
- memory_usage_kilobytes = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
58
-
59
- if platform.system() == "Darwin":
60
- return memory_usage_kilobytes
61
-
62
- else:
63
- return memory_usage_kilobytes * 1024
64
-
65
- except Exception:
66
- pass
67
-
68
- return 0
69
-
70
- def is_memory_usage_within_limit():
71
- current_memory_usage = get_current_memory_usage()
72
- return current_memory_usage < MAXIMUM_MEMORY_USAGE
73
-
74
- def is_memory_usage_approaching_limit():
75
- current_memory_usage = get_current_memory_usage()
76
- return current_memory_usage >= MEMORY_WARNING_THRESHOLD
77
-
78
- def is_memory_usage_critical():
79
- current_memory_usage = get_current_memory_usage()
80
- return current_memory_usage >= MEMORY_CRITICAL_THRESHOLD
81
-
82
- def is_memory_above_idle_target():
83
- current_memory_usage = get_current_memory_usage()
84
- return current_memory_usage > MEMORY_IDLE_TARGET
85
-
86
- def force_garbage_collection():
87
- gc.collect(0)
88
- gc.collect(1)
89
- gc.collect(2)
90
-
91
- if torch.cuda.is_available():
92
- try:
93
- torch.cuda.empty_cache()
94
- torch.cuda.synchronize()
95
-
96
- except Exception:
97
- pass
98
-
99
- def memory_cleanup():
100
- force_garbage_collection()
101
-
102
- try:
103
- import ctypes
104
- libc = ctypes.CDLL("libc.so.6")
105
- libc.malloc_trim(0)
106
-
107
- except Exception:
108
- pass
109
-
110
- force_garbage_collection()
111
-
112
- def perform_memory_cleanup():
113
- force_garbage_collection()
114
-
115
- tts_manager = get_text_to_speech_manager()
116
- if tts_manager is not None:
117
- try:
118
- tts_manager.evict_least_recently_used_voice_states()
119
-
120
- except Exception:
121
- pass
122
-
123
- memory_cleanup()
124
-
125
- def cleanup_expired_temporary_files():
126
- current_timestamp = time.time()
127
- expired_files = []
128
-
129
- with temporary_files_lock:
130
- for file_path, creation_timestamp in list(temporary_files_registry.items()):
131
- if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS:
132
- expired_files.append(file_path)
133
-
134
- for file_path in expired_files:
135
- try:
136
- if os.path.exists(file_path):
137
- os.remove(file_path)
138
- del temporary_files_registry[file_path]
139
-
140
- except Exception:
141
- pass
142
-
143
- def cleanup_all_temporary_files_immediately():
144
- with temporary_files_lock:
145
- for file_path in list(temporary_files_registry.keys()):
146
- try:
147
- if os.path.exists(file_path):
148
- os.remove(file_path)
149
- del temporary_files_registry[file_path]
150
-
151
- except Exception:
152
- pass
153
-
154
- def has_temporary_files_pending_cleanup():
155
- with temporary_files_lock:
156
- if len(temporary_files_registry) == 0:
157
- return False
158
-
159
- current_timestamp = time.time()
160
-
161
- for file_path, creation_timestamp in temporary_files_registry.items():
162
- if current_timestamp - creation_timestamp > TEMPORARY_FILE_LIFETIME_SECONDS:
163
- return True
164
-
165
- return False
166
-
167
- def has_any_temporary_files_registered():
168
- with temporary_files_lock:
169
- return len(temporary_files_registry) > 0
170
-
171
- def calculate_time_until_next_file_expiration():
172
- with temporary_files_lock:
173
- if len(temporary_files_registry) == 0:
174
- return None
175
-
176
- current_timestamp = time.time()
177
- minimum_time_until_expiration = None
178
-
179
- for file_path, creation_timestamp in temporary_files_registry.items():
180
- time_since_creation = current_timestamp - creation_timestamp
181
- time_until_expiration = TEMPORARY_FILE_LIFETIME_SECONDS - time_since_creation
182
-
183
- if time_until_expiration <= 0:
184
- return 0
185
-
186
- if minimum_time_until_expiration is None or time_until_expiration < minimum_time_until_expiration:
187
- minimum_time_until_expiration = time_until_expiration
188
-
189
- return minimum_time_until_expiration
190
-
191
- def enforce_memory_limit_if_exceeded():
192
- with memory_enforcement_lock:
193
- generation_is_active = check_if_generation_is_currently_active()
194
- model_is_in_use = is_model_in_use()
195
-
196
- current_memory_usage = get_current_memory_usage()
197
-
198
- if current_memory_usage < MEMORY_WARNING_THRESHOLD:
199
- return True
200
-
201
- force_garbage_collection()
202
- current_memory_usage = get_current_memory_usage()
203
-
204
- if current_memory_usage < MEMORY_WARNING_THRESHOLD:
205
- return True
206
-
207
- tts_manager = get_text_to_speech_manager()
208
- if tts_manager is not None:
209
- try:
210
- tts_manager.evict_least_recently_used_voice_states()
211
- except Exception:
212
- pass
213
-
214
- memory_cleanup()
215
- current_memory_usage = get_current_memory_usage()
216
-
217
- if current_memory_usage < MEMORY_CRITICAL_THRESHOLD:
218
- return True
219
-
220
- generation_is_active = check_if_generation_is_currently_active()
221
- model_is_in_use = is_model_in_use()
222
-
223
- if generation_is_active or model_is_in_use:
224
- if tts_manager is not None:
225
- try:
226
- tts_manager.clear_voice_state_cache_completely()
227
-
228
- except Exception:
229
- pass
230
-
231
- cleanup_all_temporary_files_immediately()
232
- memory_cleanup()
233
-
234
- return current_memory_usage < MAXIMUM_MEMORY_USAGE
235
-
236
- if tts_manager is not None:
237
- try:
238
- tts_manager.clear_voice_state_cache_completely()
239
-
240
- except Exception:
241
- pass
242
-
243
- cleanup_all_temporary_files_immediately()
244
- memory_cleanup()
245
-
246
- current_memory_usage = get_current_memory_usage()
247
-
248
- return current_memory_usage < MAXIMUM_MEMORY_USAGE
249
-
250
- def perform_idle_memory_reduction():
251
- if check_if_generation_is_currently_active():
252
- return
253
-
254
- if is_model_in_use():
255
- return
256
-
257
- with memory_enforcement_lock:
258
- current_memory_usage = get_current_memory_usage()
259
-
260
- if current_memory_usage <= MEMORY_IDLE_TARGET:
261
- return
262
-
263
- force_garbage_collection()
264
- current_memory_usage = get_current_memory_usage()
265
-
266
- if current_memory_usage <= MEMORY_IDLE_TARGET:
267
- return
268
-
269
- if check_if_generation_is_currently_active() or is_model_in_use():
270
- return
271
-
272
- tts_manager = get_text_to_speech_manager()
273
- if tts_manager is not None:
274
- try:
275
- tts_manager.evict_least_recently_used_voice_states()
276
-
277
- except Exception:
278
- pass
279
-
280
- memory_cleanup()
281
-
282
- current_memory_usage = get_current_memory_usage()
283
- if current_memory_usage <= MEMORY_IDLE_TARGET:
284
- return
285
-
286
- if check_if_generation_is_currently_active() or is_model_in_use():
287
- return
288
-
289
- if tts_manager is not None:
290
- try:
291
- tts_manager.clear_voice_state_cache_completely()
292
-
293
- except Exception:
294
- pass
295
-
296
- memory_cleanup()
297
-
298
- def perform_background_cleanup_cycle():
299
- last_memory_check_timestamp = 0
300
-
301
- while not background_cleanup_stop_event.is_set():
302
- time_until_next_expiration = calculate_time_until_next_file_expiration()
303
- current_timestamp = time.time()
304
- time_since_last_memory_check = current_timestamp - last_memory_check_timestamp
305
-
306
- if time_until_next_expiration is not None:
307
- if time_until_next_expiration <= 0:
308
- wait_duration = 1
309
-
310
- else:
311
- wait_duration = min(
312
- time_until_next_expiration + 1,
313
- MEMORY_CHECK_INTERVAL,
314
- BACKGROUND_CLEANUP_INTERVAL
315
- )
316
- else:
317
- should_check_memory = (
318
- is_memory_above_idle_target() and
319
- not check_if_generation_is_currently_active() and
320
- not is_model_in_use()
321
- )
322
-
323
- if should_check_memory:
324
- wait_duration = MEMORY_CHECK_INTERVAL
325
-
326
- else:
327
- background_cleanup_trigger_event.clear()
328
- triggered = background_cleanup_trigger_event.wait(timeout=BACKGROUND_CLEANUP_INTERVAL)
329
-
330
- if background_cleanup_stop_event.is_set():
331
- break
332
-
333
- if triggered:
334
- continue
335
-
336
- else:
337
- if not check_if_generation_is_currently_active() and not is_model_in_use():
338
- perform_idle_memory_reduction()
339
- continue
340
-
341
- background_cleanup_stop_event.wait(timeout=wait_duration)
342
-
343
- if background_cleanup_stop_event.is_set():
344
- break
345
-
346
- if has_temporary_files_pending_cleanup():
347
- cleanup_expired_temporary_files()
348
-
349
- current_timestamp = time.time()
350
- time_since_last_memory_check = current_timestamp - last_memory_check_timestamp
351
-
352
- if time_since_last_memory_check >= MEMORY_CHECK_INTERVAL:
353
- generation_active = check_if_generation_is_currently_active()
354
- model_in_use = is_model_in_use()
355
-
356
- if not generation_active and not model_in_use:
357
- if is_memory_usage_critical():
358
- enforce_memory_limit_if_exceeded()
359
-
360
- elif is_memory_above_idle_target():
361
- perform_idle_memory_reduction()
362
-
363
- last_memory_check_timestamp = current_timestamp
364
-
365
- def trigger_background_cleanup_check():
366
- background_cleanup_trigger_event.set()
367
-
368
- def start_background_cleanup_thread():
369
- global background_cleanup_thread
370
-
371
- from ..core import state as global_state
372
-
373
- if global_state.background_cleanup_thread is None or not global_state.background_cleanup_thread.is_alive():
374
- background_cleanup_stop_event.clear()
375
- background_cleanup_trigger_event.clear()
376
-
377
- global_state.background_cleanup_thread = threading.Thread(
378
- target=perform_background_cleanup_cycle,
379
- daemon=True,
380
- name="BackgroundCleanupThread"
381
- )
382
-
383
- global_state.background_cleanup_thread.start()
384
-
385
- def stop_background_cleanup_thread():
386
- from ..core import state as global_state
387
-
388
- background_cleanup_stop_event.set()
389
- background_cleanup_trigger_event.set()
390
-
391
- if global_state.background_cleanup_thread is not None and global_state.background_cleanup_thread.is_alive():
392
- global_state.background_cleanup_thread.join(timeout=5)
393
-
394
- atexit.register(stop_background_cleanup_thread)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/core/state.py DELETED
@@ -1,147 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import threading
7
-
8
- generation_state_lock = threading.Lock()
9
- is_currently_generating = False
10
- stop_generation_requested = False
11
- temporary_files_registry = {}
12
- temporary_files_lock = threading.Lock()
13
- memory_enforcement_lock = threading.Lock()
14
- background_cleanup_thread = None
15
- background_cleanup_stop_event = threading.Event()
16
- background_cleanup_trigger_event = threading.Event()
17
- text_to_speech_manager = None
18
-
19
- audio_conversion_semaphore = threading.Semaphore(1)
20
- audio_conversion_queue_lock = threading.Lock()
21
- audio_conversion_active_count = 0
22
- audio_conversion_waiting_count = 0
23
-
24
- accelerator_log_lock = threading.Lock()
25
- accelerator_log_thread = None
26
- accelerator_log_stop_event = threading.Event()
27
-
28
- model_usage_lock = threading.Lock()
29
- model_usage_count = 0
30
-
31
- generation_protection_lock = threading.RLock()
32
- generation_protection_count = 0
33
-
34
- def set_text_to_speech_manager(manager_instance):
35
- global text_to_speech_manager
36
- text_to_speech_manager = manager_instance
37
-
38
- def get_text_to_speech_manager():
39
- global text_to_speech_manager
40
- return text_to_speech_manager
41
-
42
- def check_if_generation_is_currently_active():
43
- with generation_state_lock:
44
- return is_currently_generating
45
-
46
- def set_generation_active(is_active):
47
- global is_currently_generating
48
- with generation_state_lock:
49
- is_currently_generating = is_active
50
-
51
- def set_stop_generation_requested(requested):
52
- global stop_generation_requested
53
- with generation_state_lock:
54
- stop_generation_requested = requested
55
-
56
- def get_stop_generation_requested():
57
- with generation_state_lock:
58
- return stop_generation_requested
59
-
60
- def increment_audio_conversion_active():
61
- global audio_conversion_active_count
62
- with audio_conversion_queue_lock:
63
- audio_conversion_active_count += 1
64
- return audio_conversion_active_count
65
-
66
- def decrement_audio_conversion_active():
67
- global audio_conversion_active_count
68
- with audio_conversion_queue_lock:
69
- audio_conversion_active_count = max(0, audio_conversion_active_count - 1)
70
- return audio_conversion_active_count
71
-
72
- def get_audio_conversion_active_count():
73
- with audio_conversion_queue_lock:
74
- return audio_conversion_active_count
75
-
76
- def increment_audio_conversion_waiting():
77
- global audio_conversion_waiting_count
78
- with audio_conversion_queue_lock:
79
- audio_conversion_waiting_count += 1
80
- return audio_conversion_waiting_count
81
-
82
- def decrement_audio_conversion_waiting():
83
- global audio_conversion_waiting_count
84
- with audio_conversion_queue_lock:
85
- audio_conversion_waiting_count = max(0, audio_conversion_waiting_count - 1)
86
- return audio_conversion_waiting_count
87
-
88
- def get_audio_conversion_waiting_count():
89
- with audio_conversion_queue_lock:
90
- return audio_conversion_waiting_count
91
-
92
- def is_audio_conversion_queue_busy():
93
- with audio_conversion_queue_lock:
94
- return audio_conversion_active_count > 0
95
-
96
- def increment_model_usage():
97
- global model_usage_count
98
- with model_usage_lock:
99
- model_usage_count += 1
100
- return model_usage_count
101
-
102
- def decrement_model_usage():
103
- global model_usage_count
104
- with model_usage_lock:
105
- model_usage_count = max(0, model_usage_count - 1)
106
- current_count = model_usage_count
107
- return current_count
108
-
109
- def get_model_usage_count():
110
- with model_usage_lock:
111
- return model_usage_count
112
-
113
- def is_model_in_use():
114
- with model_usage_lock:
115
- return model_usage_count > 0
116
-
117
- def acquire_generation_protection():
118
- global generation_protection_count
119
- generation_protection_lock.acquire()
120
- generation_protection_count += 1
121
- return generation_protection_count
122
-
123
- def release_generation_protection():
124
- global generation_protection_count
125
- generation_protection_count = max(0, generation_protection_count - 1)
126
- generation_protection_lock.release()
127
-
128
- def is_generation_protected():
129
- if generation_protection_lock.acquire(blocking=False):
130
- is_protected = generation_protection_count > 0
131
- generation_protection_lock.release()
132
- return is_protected
133
- return True
134
-
135
- def try_acquire_generation_protection_for_cleanup(timeout_seconds=0.1):
136
- acquired = generation_protection_lock.acquire(blocking=True, timeout=timeout_seconds)
137
- if acquired:
138
- if generation_protection_count > 0:
139
- generation_protection_lock.release()
140
- return False
141
-
142
- return True
143
-
144
- return False
145
-
146
- def release_generation_protection_for_cleanup():
147
- generation_protection_lock.release()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/generation/handler.py DELETED
@@ -1,309 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import gradio as gr
7
- from config import VOICE_MODE_CLONE, MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES
8
- from ..core.state import (
9
- generation_state_lock,
10
- get_stop_generation_requested,
11
- set_stop_generation_requested,
12
- is_audio_conversion_queue_busy,
13
- get_audio_conversion_waiting_count,
14
- acquire_generation_protection,
15
- release_generation_protection
16
- )
17
- from ..core.authentication import get_huggingface_token
18
- from ..core.memory import (
19
- has_temporary_files_pending_cleanup,
20
- cleanup_expired_temporary_files,
21
- perform_memory_cleanup,
22
- memory_cleanup,
23
- trigger_background_cleanup_check
24
- )
25
- from ..tts.manager import text_to_speech_manager, ModelNotLoadedError, ModelLoadingError
26
- from ..validation.text import validate_text_input
27
- from ..audio.validator import (
28
- perform_voice_clone_file_validation,
29
- get_format_display_name,
30
- format_file_size_for_display,
31
- validate_file_size_for_voice_cloning
32
- )
33
- from ..audio.converter import (
34
- prepare_audio_file_for_voice_cloning,
35
- AudioConversionQueueBusyError,
36
- AudioConversionQueueTimeoutError
37
- )
38
-
39
def check_if_generating():
    """Return the ``is_currently_generating`` flag from the core state module.

    The flag is imported at call time, so the value bound in the state module
    at that moment is the one returned.
    """
    from ..core.state import is_currently_generating

    with generation_state_lock:
        currently_generating = is_currently_generating
        return currently_generating
43
-
44
def request_generation_stop():
    """Flag the running generation to stop and return a non-interactive update."""
    set_stop_generation_requested(True)
    disabled_update = gr.update(interactive=False)
    return disabled_update
47
-
48
def validate_voice_clone_file_size(voice_clone_audio_file):
    """Check the size of an optional voice-clone upload.

    Returns:
        ``(True, None)`` when no file was supplied or the size check passes,
        otherwise ``(False, error_message)`` from the size validator.
    """
    if voice_clone_audio_file:
        size_ok, size_problem = validate_file_size_for_voice_cloning(voice_clone_audio_file)
        if not size_ok:
            return False, size_problem

    return True, None
58
-
59
def validate_and_prepare_voice_clone_audio(voice_clone_audio_file):
    """Validate an uploaded voice-clone sample and prepare it for the TTS model.

    Args:
        voice_clone_audio_file: Path of the uploaded audio file (may be empty).

    Returns:
        A 4-tuple ``(prepared_path, error_message, was_converted, audio_format)``.
        On success ``prepared_path`` is set and ``error_message`` is ``None``.
        On failure ``prepared_path`` is ``None`` and ``error_message`` holds a
        user-facing explanation.
    """
    if not voice_clone_audio_file:
        return None, "Please upload an audio file for voice cloning.", None, None

    file_size_valid, file_size_error = validate_file_size_for_voice_cloning(voice_clone_audio_file)
    if not file_size_valid:
        return None, file_size_error, None, None

    is_valid, is_wav_format, detected_format, validation_error = perform_voice_clone_file_validation(voice_clone_audio_file)

    if not is_valid:
        format_display_name = get_format_display_name(detected_format) if detected_format else "Unknown"

        if not validation_error:
            return None, "The uploaded file could not be validated as a valid audio file.", None, detected_format

        # Map validator wording onto friendlier messages; first match wins.
        lowered_error = validation_error.lower()

        if "too short" in lowered_error:
            message = "The uploaded audio file is too short. Please upload a longer audio sample for better voice cloning results."
        elif "too long" in lowered_error:
            message = "The uploaded audio file is too long. Please upload a shorter audio sample (maximum 1 hour)."
        elif "empty" in lowered_error or "0 bytes" in lowered_error:
            message = "The uploaded audio file is empty. Please upload a valid audio file."
        elif "corrupted" in lowered_error or "truncated" in lowered_error:
            message = f"The uploaded {format_display_name} file appears to be corrupted or incomplete. Please upload a valid audio file."
        elif "unsupported" in lowered_error or "exceeds" in lowered_error or "maximum" in lowered_error:
            # These validator messages are already suitable for display.
            message = validation_error
        else:
            message = f"Invalid audio file: {validation_error}"

        return None, message, None, detected_format

    format_display_name = get_format_display_name(detected_format)

    if is_audio_conversion_queue_busy():
        waiting_count = get_audio_conversion_waiting_count()

        if waiting_count > 0:
            gr.Warning(f"Audio conversion queue is busy. Your request is queued (position: {waiting_count + 1}). Please wait...")
        else:
            gr.Warning("Audio conversion is in progress for another user. Your request has been queued. Please wait...")

    # WAV and non-WAV uploads go through the same preparation call; only the
    # way the outcome is reported differs.
    try:
        prepared_path, preparation_error, _was_converted, _final_format = prepare_audio_file_for_voice_cloning(
            voice_clone_audio_file,
            wait_for_queue=True
        )
    except (AudioConversionQueueBusyError, AudioConversionQueueTimeoutError) as queue_error:
        return None, str(queue_error), None, detected_format

    if is_wav_format:
        if prepared_path is None:
            return None, f"Failed to process WAV file: {preparation_error}", None, 'wav'
        return prepared_path, None, False, 'wav'

    if prepared_path is not None:
        return prepared_path, None, True, detected_format

    # The converter may report failure without a message; guard against
    # calling .lower() on None.
    lowered_preparation_error = str(preparation_error or "").lower()

    if "no audio conversion library" in lowered_preparation_error:
        return None, f"Cannot convert {format_display_name} format. Please upload a WAV file directly.", None, detected_format

    if "queue" in lowered_preparation_error or "busy" in lowered_preparation_error:
        return None, preparation_error, None, detected_format

    return None, f"Failed to convert {format_display_name} to WAV format: {preparation_error}", None, detected_format
141
-
142
def perform_speech_generation(
    text_input,
    voice_mode_selection,
    voice_preset_selection,
    voice_clone_audio_file,
    model_variant,
    lsd_decode_steps,
    temperature,
    noise_clamp,
    eos_threshold,
    frames_after_eos,
    enable_custom_frames
):
    """Run one text-to-speech request end to end and return the output file path.

    Steps: validate the text, optionally validate and prepare a voice-clone
    sample, mark the global state as "generating", load (or reuse) the model,
    resolve the voice state, generate audio and save it to a file.

    Returns:
        Path of the generated audio file, or ``None`` when a stop was
        requested between stages.

    Raises:
        gr.Error: For every user-visible failure (invalid input, a generation
            already running, model or audio errors).
    """
    from ..core import state as global_state

    def stop_was_requested():
        # Read the stop flag under the state lock.
        with generation_state_lock:
            return global_state.stop_generation_requested

    if has_temporary_files_pending_cleanup():
        cleanup_expired_temporary_files()

    text_ok, cleaned_text_or_message = validate_text_input(text_input)
    if not text_ok:
        raise gr.Error(cleaned_text_or_message or "Please enter valid text to generate speech.")

    prepared_audio_path = None
    cloning_requested = voice_mode_selection == VOICE_MODE_CLONE

    if cloning_requested:
        if not voice_clone_audio_file:
            raise gr.Error("Please upload an audio file for voice cloning.")

        size_ok, size_problem = validate_voice_clone_file_size(voice_clone_audio_file)
        if not size_ok:
            max_size_display = format_file_size_for_display(MAXIMUM_VOICE_CLONE_FILE_SIZE_BYTES)
            raise gr.Error(f"File size exceeds maximum limit of {max_size_display}. {size_problem}")

        if not get_huggingface_token():
            raise gr.Error("Voice cloning is not configured properly at the moment. Please try again later.")

        prepared_audio_path, audio_problem, audio_was_converted, source_audio_format = (
            validate_and_prepare_voice_clone_audio(voice_clone_audio_file)
        )

        if prepared_audio_path is None:
            raise gr.Error(audio_problem)

        if audio_was_converted:
            format_display_name = get_format_display_name(source_audio_format)
            gr.Warning(f"Audio converted from {format_display_name} to WAV format for voice cloning.")

    # Only one generation may run at a time; claim the slot under the lock.
    with generation_state_lock:
        if global_state.is_currently_generating:
            raise gr.Error("A generation is already in progress. Please wait.")

        global_state.is_currently_generating = True
        global_state.stop_generation_requested = False

    acquire_generation_protection()

    audio_tensor = None
    clone_state_tensor = None

    try:
        perform_memory_cleanup()

        model = text_to_speech_manager.load_or_get_model(
            model_variant,
            temperature,
            lsd_decode_steps,
            noise_clamp,
            eos_threshold
        )

        if model is None:
            raise gr.Error("Failed to load TTS model. Please try again.")

        if stop_was_requested():
            return None

        if cloning_requested:
            clone_state_tensor = text_to_speech_manager.get_voice_state_for_clone(
                voice_clone_audio_file,
                prepared_audio_path=prepared_audio_path
            )
            voice_state = clone_state_tensor
        else:
            voice_state = text_to_speech_manager.get_voice_state_for_preset(voice_preset_selection)

        if stop_was_requested():
            return None

        audio_tensor = text_to_speech_manager.generate_audio(
            cleaned_text_or_message,
            voice_state,
            frames_after_eos,
            enable_custom_frames
        )

        if stop_was_requested():
            return None

        return text_to_speech_manager.save_audio_to_file(audio_tensor)

    except gr.Error:
        # Already user-facing; pass through untouched.
        raise

    except ModelNotLoadedError as model_not_loaded_error:
        raise gr.Error(str(model_not_loaded_error))

    except ModelLoadingError as model_loading_error:
        raise gr.Error(f"Failed to load TTS model: {str(model_loading_error)}")

    except RuntimeError as runtime_error:
        error_message = str(runtime_error)

        if "not loaded" in error_message.lower():
            if text_to_speech_manager.ensure_model_loaded():
                raise gr.Error("Model was temporarily unavailable. Please try again.")
            raise gr.Error("TTS model could not be loaded. Please try again later.")

        raise gr.Error(error_message)

    except Exception as generation_error:
        error_message = str(generation_error)

        if "file does not start with RIFF id" in error_message:
            raise gr.Error("The audio file format is not supported. Please upload a valid WAV file or a common audio format (MP3, FLAC, OGG, M4A).")

        if "unknown format" in error_message.lower():
            raise gr.Error("The audio file uses an unsupported encoding format. Please convert it to a standard format and try again.")

        raise gr.Error(f"Speech generation failed: {error_message}")

    finally:
        release_generation_protection()

        with generation_state_lock:
            global_state.is_currently_generating = False
            global_state.stop_generation_requested = False

        # Drop the local references to the tensors before the cleanup pass.
        audio_tensor = None
        clone_state_tensor = None

        memory_cleanup()
        trigger_background_cleanup_check()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/tts/manager.py DELETED
@@ -1,341 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import time
7
- import tempfile
8
- import threading
9
- import torch
10
- import scipy.io.wavfile
11
- from pocket_tts import TTSModel
12
- from config import (
13
- AVAILABLE_VOICES,
14
- DEFAULT_VOICE,
15
- DEFAULT_MODEL_VARIANT,
16
- DEFAULT_TEMPERATURE,
17
- DEFAULT_LSD_DECODE_STEPS,
18
- DEFAULT_EOS_THRESHOLD,
19
- VOICE_STATE_CACHE_MAXIMUM_SIZE,
20
- VOICE_STATE_CACHE_CLEANUP_THRESHOLD,
21
- MODEL_LOAD_RETRY_ATTEMPTS
22
- )
23
- from ..core.state import (
24
- temporary_files_registry,
25
- temporary_files_lock,
26
- set_text_to_speech_manager,
27
- increment_model_usage,
28
- decrement_model_usage,
29
- is_model_in_use
30
- )
31
- from ..core.memory import (
32
- force_garbage_collection,
33
- memory_cleanup,
34
- perform_memory_cleanup,
35
- trigger_background_cleanup_check,
36
- is_memory_usage_approaching_limit
37
- )
38
-
39
class ModelNotLoadedError(RuntimeError):
    """Raised when an operation needs the TTS model but none is loaded."""
41
-
42
class ModelLoadingError(RuntimeError):
    """Raised when loading the TTS model fails."""
44
-
45
class TextToSpeechManager:
    """Owns the loaded TTS model and a cache of per-voice model states.

    ``model_lock`` is re-entrant and guards the model, its configuration and
    the loading flag. ``voice_state_cache_lock`` is a plain (non-re-entrant)
    lock guarding the voice-state cache and its access timestamps, so methods
    that take it must never be called while it is already held.
    """

    def __init__(self):
        self.loaded_model = None
        self.current_configuration = {}
        self.voice_state_cache = {}
        self.voice_state_cache_access_timestamps = {}
        self.voice_state_cache_lock = threading.Lock()
        self.model_lock = threading.RLock()
        self.model_loading_in_progress = False
        self.last_successful_configuration = None

    def is_model_loaded(self):
        """Return ``True`` when a model instance is currently held."""
        with self.model_lock:
            return self.loaded_model is not None

    def is_model_loading(self):
        """Return ``True`` while a load attempt is in progress."""
        with self.model_lock:
            return self.model_loading_in_progress

    def _clear_voice_state_cache_internal(self):
        """Empty the voice-state cache together with its access timestamps."""
        with self.voice_state_cache_lock:
            self.voice_state_cache.clear()
            self.voice_state_cache_access_timestamps.clear()

    def _require_loaded_model(self):
        """Return the loaded model, reloading it if necessary.

        Raises:
            ModelNotLoadedError: If no model is loaded and reloading failed.
        """
        with self.model_lock:
            if self.loaded_model is None and not self.ensure_model_loaded():
                raise ModelNotLoadedError("TTS model is not loaded and could not be reloaded. Please try again.")
            return self.loaded_model

    def load_or_get_model(
        self,
        model_variant,
        temperature,
        lsd_decode_steps,
        noise_clamp,
        eos_threshold
    ):
        """Return a model matching the requested settings, loading it if needed.

        Missing values fall back to the configured defaults. A falsy or
        non-positive ``noise_clamp`` is passed to the model as ``None``. The
        currently loaded model is reused when its configuration is identical.

        Raises:
            ModelLoadingError: If every load attempt fails.
        """
        processed_variant = str(model_variant or DEFAULT_MODEL_VARIANT).strip()
        processed_temperature = float(temperature) if temperature is not None else DEFAULT_TEMPERATURE
        processed_lsd_steps = int(lsd_decode_steps) if lsd_decode_steps is not None else DEFAULT_LSD_DECODE_STEPS
        processed_noise_clamp = float(noise_clamp) if noise_clamp and float(noise_clamp) > 0 else None
        processed_eos_threshold = float(eos_threshold) if eos_threshold is not None else DEFAULT_EOS_THRESHOLD

        requested_configuration = {
            "variant": processed_variant,
            "temp": processed_temperature,
            "lsd_decode_steps": processed_lsd_steps,
            "noise_clamp": processed_noise_clamp,
            "eos_threshold": processed_eos_threshold
        }

        with self.model_lock:
            if self.loaded_model is not None and self.current_configuration == requested_configuration:
                return self.loaded_model

            return self._load_model_with_retry(requested_configuration)

    def _load_model_with_retry(self, requested_configuration):
        """Load the model, retrying up to ``MODEL_LOAD_RETRY_ATTEMPTS`` times.

        Any previously loaded model and every cached voice state are dropped
        first, because cached states belong to the old model instance.

        Raises:
            ModelLoadingError: When all attempts fail.
        """
        last_exception = None
        self.model_loading_in_progress = True

        try:
            for attempt_number in range(MODEL_LOAD_RETRY_ATTEMPTS):
                try:
                    if self.loaded_model is not None:
                        self._clear_voice_state_cache_internal()
                        self.loaded_model = None
                        memory_cleanup()

                    perform_memory_cleanup()

                    self.loaded_model = TTSModel.load_model(**requested_configuration)

                    if self.loaded_model is None:
                        raise ModelLoadingError("TTSModel.load_model returned None")

                    self.current_configuration = requested_configuration.copy()
                    self.last_successful_configuration = requested_configuration.copy()
                    # Clear cache AND timestamps so the two never disagree.
                    self._clear_voice_state_cache_internal()

                    return self.loaded_model

                except Exception as loading_exception:
                    last_exception = loading_exception
                    self.loaded_model = None

                    if attempt_number < MODEL_LOAD_RETRY_ATTEMPTS - 1:
                        memory_cleanup()
        finally:
            # Reset the flag on every exit path (success, failure, exhaustion).
            self.model_loading_in_progress = False

        raise ModelLoadingError(f"Failed to load TTS model after {MODEL_LOAD_RETRY_ATTEMPTS} attempts: {str(last_exception)}")

    def ensure_model_loaded(self):
        """Make sure a model is loaded, reloading one if necessary.

        Tries the last successful configuration first, then the defaults.

        Returns:
            ``True`` when a model is available afterwards, ``False`` otherwise.
        """
        with self.model_lock:
            if self.loaded_model is not None:
                return True

            if self.last_successful_configuration is not None:
                try:
                    self._load_model_with_retry(self.last_successful_configuration)
                    return self.loaded_model is not None
                except Exception:
                    # Fall through to the default configuration below.
                    pass

            default_configuration = {
                "variant": DEFAULT_MODEL_VARIANT,
                "temp": DEFAULT_TEMPERATURE,
                "lsd_decode_steps": DEFAULT_LSD_DECODE_STEPS,
                "noise_clamp": None,
                "eos_threshold": DEFAULT_EOS_THRESHOLD
            }

            try:
                self._load_model_with_retry(default_configuration)
                return self.loaded_model is not None
            except Exception:
                return False

    def clear_voice_state_cache_completely(self):
        """Drop every cached voice state and force a garbage collection."""
        with self.model_lock:
            self._clear_voice_state_cache_internal()

        force_garbage_collection()

    def evict_least_recently_used_voice_states(self):
        """Remove the least recently used entries from the voice-state cache.

        At or below ``VOICE_STATE_CACHE_CLEANUP_THRESHOLD`` entries, half of
        the cache (at least one entry) is removed. Above it, the cache is
        trimmed back down to the threshold. Nothing happens for an empty cache.

        Takes ``voice_state_cache_lock``; must not be called with it held.
        """
        with self.voice_state_cache_lock:
            cache_size = len(self.voice_state_cache)
            if cache_size == 0:
                return

            if cache_size <= VOICE_STATE_CACHE_CLEANUP_THRESHOLD:
                number_of_entries_to_remove = max(1, cache_size // 2)
            else:
                number_of_entries_to_remove = cache_size - VOICE_STATE_CACHE_CLEANUP_THRESHOLD

            voice_names_oldest_first = sorted(
                self.voice_state_cache_access_timestamps.keys(),
                key=lambda voice_name: self.voice_state_cache_access_timestamps.get(voice_name, 0)
            )

            for voice_name_to_remove in voice_names_oldest_first[:number_of_entries_to_remove]:
                self.voice_state_cache.pop(voice_name_to_remove, None)
                self.voice_state_cache_access_timestamps.pop(voice_name_to_remove, None)

        force_garbage_collection()

    def get_voice_state_for_preset(self, voice_name):
        """Return the model state for a preset voice, computing and caching it on a miss.

        Names not listed in ``AVAILABLE_VOICES`` fall back to ``DEFAULT_VOICE``.

        Raises:
            ModelNotLoadedError: If no model is loaded and reloading failed.
        """
        validated_voice = voice_name if voice_name in AVAILABLE_VOICES else DEFAULT_VOICE

        with self.voice_state_cache_lock:
            if validated_voice in self.voice_state_cache:
                self.voice_state_cache_access_timestamps[validated_voice] = time.time()
                return self.voice_state_cache[validated_voice]

        # Eviction acquires voice_state_cache_lock itself; that lock is not
        # re-entrant, so eviction has to run after the lock has been released.
        if is_memory_usage_approaching_limit():
            self.evict_least_recently_used_voice_states()

        with self.voice_state_cache_lock:
            cache_is_full = len(self.voice_state_cache) >= VOICE_STATE_CACHE_MAXIMUM_SIZE

        if cache_is_full:
            self.evict_least_recently_used_voice_states()

        increment_model_usage()

        try:
            with self.model_lock:
                model = self._require_loaded_model()

                computed_voice_state = model.get_state_for_audio_prompt(
                    audio_conditioning=validated_voice,
                    truncate=False
                )

            with self.voice_state_cache_lock:
                self.voice_state_cache[validated_voice] = computed_voice_state
                self.voice_state_cache_access_timestamps[validated_voice] = time.time()

            return computed_voice_state

        finally:
            decrement_model_usage()

    def get_voice_state_for_clone(self, audio_file_path, prepared_audio_path=None):
        """Compute a model state from a user-supplied audio sample (never cached).

        ``prepared_audio_path`` takes precedence over ``audio_file_path`` when given.

        Raises:
            ModelNotLoadedError: If no model is loaded and reloading failed.
        """
        audio_path_to_use = prepared_audio_path if prepared_audio_path is not None else audio_file_path

        increment_model_usage()

        try:
            with self.model_lock:
                model = self._require_loaded_model()

                return model.get_state_for_audio_prompt(
                    audio_conditioning=audio_path_to_use,
                    truncate=False
                )

        finally:
            decrement_model_usage()

    def generate_audio(self, text_content, voice_state, frames_after_eos, enable_custom_frames):
        """Generate audio for ``text_content`` using ``voice_state``.

        ``frames_after_eos`` is only forwarded (as an int) when
        ``enable_custom_frames`` is truthy; otherwise ``None`` is passed.

        Raises:
            ModelNotLoadedError: If no model is loaded and reloading failed.
        """
        increment_model_usage()

        try:
            with self.model_lock:
                model = self._require_loaded_model()

                processed_frames = int(frames_after_eos) if enable_custom_frames else None

                generated_audio = model.generate_audio(
                    model_state=voice_state,
                    text_to_generate=text_content,
                    frames_after_eos=processed_frames,
                    copy_state=True
                )

            force_garbage_collection()

            return generated_audio

        finally:
            decrement_model_usage()

    def save_audio_to_file(self, audio_tensor):
        """Write ``audio_tensor`` to a temporary ``.wav`` file and return its path.

        The file is registered in ``temporary_files_registry`` with the
        current time so it can be cleaned up later.

        Raises:
            ModelNotLoadedError: If no model is loaded (sample rate unknown).
        """
        with self.model_lock:
            if self.loaded_model is None:
                raise ModelNotLoadedError("TTS model is not loaded. Cannot determine sample rate.")

            audio_sample_rate = self.loaded_model.sample_rate

        audio_numpy_data = audio_tensor.numpy()

        # Reserve a unique path and close the handle right away: the WAV data
        # is written by name, and an open handle would leak a descriptor.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as output_file:
            output_file_path = output_file.name

        scipy.io.wavfile.write(output_file_path, audio_sample_rate, audio_numpy_data)

        with temporary_files_lock:
            temporary_files_registry[output_file_path] = time.time()

        trigger_background_cleanup_check()

        return output_file_path
339
-
340
# Create the module-level manager instance and register it with the core state module.
text_to_speech_manager = TextToSpeechManager()
set_text_to_speech_manager(text_to_speech_manager)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ui/handlers.py DELETED
@@ -1,58 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import gradio as gr
7
- from config import VOICE_MODE_PRESET, DEFAULT_VOICE
8
- from ..validation.text import validate_text_input
9
-
10
def switch_to_generating_state(ui_state):
    """Build the component updates used while a generation is running.

    Returns:
        A 4-tuple of three ``gr.update`` objects (hidden; visible and
        interactive; hidden) followed by a fresh state dict with
        ``"generating"`` set to ``True``. The incoming ``ui_state`` is not read.
    """
    first_update = gr.update(visible=False)
    second_update = gr.update(visible=True, interactive=True)
    third_update = gr.update(visible=False)
    return first_update, second_update, third_update, {"generating": True}
19
-
20
def switch_to_idle_state(text_content, ui_state):
    """Build the component updates used once a generation has finished.

    Returns:
        A 4-tuple of three ``gr.update`` objects followed by a fresh state
        dict with ``"generating"`` set to ``False``. The first update is
        visible and interactive only if the text validates; the second is
        hidden; the third is visible only if the text is non-blank. The
        incoming ``ui_state`` is not read.
    """
    idle_state = {"generating": False}

    should_show_clear = bool(text_content and text_content.strip())
    text_is_valid, _unused = validate_text_input(text_content)

    first_update = gr.update(visible=True, interactive=text_is_valid)
    second_update = gr.update(visible=False)
    third_update = gr.update(visible=should_show_clear)
    return first_update, second_update, third_update, idle_state
34
-
35
def perform_clear_action():
    """Return the six reset values applied when the user clears the form.

    In order: empty text, ``None``, a hidden ``gr.update``, the preset voice
    mode, the default voice and ``None``.
    """
    hidden_update = gr.update(visible=False)
    reset_values = ("", None, hidden_update, VOICE_MODE_PRESET, DEFAULT_VOICE, None)
    return reset_values
44
-
45
def create_example_handler(example_text, example_voice):
    """Return a zero-argument callback bound to one example.

    The callback yields ``(example_text, VOICE_MODE_PRESET, example_voice)``.
    """
    return lambda: (example_text, VOICE_MODE_PRESET, example_voice)
50
-
51
def format_example_button_label(example_text, example_voice, max_text_length=40):
    """Build a button label of the form ``"[voice] text"``.

    Text longer than ``max_text_length`` characters is cut to that length
    and suffixed with ``"..."``.
    """
    if len(example_text) > max_text_length:
        label_text = example_text[:max_text_length] + "..."
    else:
        label_text = example_text

    return f"[{example_voice}] {label_text}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ui/state.py DELETED
@@ -1,43 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- import gradio as gr
7
- from config import MAXIMUM_INPUT_LENGTH, VOICE_MODE_CLONE
8
- from ..validation.text import validate_text_input
9
-
10
def check_generate_button_state(text_content, ui_state):
    """Return a ``gr.update`` setting interactivity from the current input.

    The update is non-interactive while ``ui_state["generating"]`` is truthy;
    otherwise it is interactive exactly when the text validates.
    """
    if not ui_state.get("generating", False):
        text_is_valid, _unused = validate_text_input(text_content)
        return gr.update(interactive=text_is_valid)

    return gr.update(interactive=False)
17
-
18
def calculate_character_count_display(text_content):
    """Render the ``current / maximum`` character counter as an HTML snippet.

    The counter uses the error colour once the text is longer than
    ``MAXIMUM_INPUT_LENGTH`` and the subdued body colour otherwise.
    """
    character_count = len(text_content) if text_content else 0

    if character_count > MAXIMUM_INPUT_LENGTH:
        display_color = "var(--error-text-color)"
    else:
        display_color = "var(--body-text-color-subdued)"

    return f"<div style='text-align: right; padding: 4px 0;'><span style='color: {display_color}; font-size: 0.85em;'>{character_count} / {MAXIMUM_INPUT_LENGTH}</span></div>"
28
-
29
def determine_clear_button_visibility(text_content, ui_state):
    """Return a ``gr.update`` setting visibility from the current input.

    The update is hidden while ``ui_state["generating"]`` is truthy;
    otherwise it is visible exactly when the text is non-blank.
    """
    if not ui_state.get("generating", False):
        should_show_clear = bool(text_content and text_content.strip())
        return gr.update(visible=should_show_clear)

    return gr.update(visible=False)
37
-
38
def update_voice_mode_visibility(voice_mode_value):
    """Return two visibility updates that depend on the selected voice mode.

    When the mode equals ``VOICE_MODE_CLONE`` the first update is hidden and
    the second is visible; for any other mode it is the other way round.
    """
    clone_selected = voice_mode_value == VOICE_MODE_CLONE
    first_update = gr.update(visible=False) if clone_selected else gr.update(visible=True)
    second_update = gr.update(visible=True) if clone_selected else gr.update(visible=False)
    return first_update, second_update
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/validation/text.py DELETED
@@ -1,20 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Hadad <hadad@linuxmail.org>
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
-
6
- from config import MAXIMUM_INPUT_LENGTH
7
-
8
def validate_text_input(text_content):
    """Validate user text for speech generation.

    Returns:
        ``(True, cleaned_text)`` when the stripped text is non-empty and no
        longer than ``MAXIMUM_INPUT_LENGTH``. ``(False, "")`` for empty,
        blank or non-string input. ``(False, message)`` when the stripped
        text is too long.
    """
    if isinstance(text_content, str) and text_content:
        cleaned_text = text_content.strip()

        if cleaned_text:
            if len(cleaned_text) > MAXIMUM_INPUT_LENGTH:
                return False, f"Input exceeds maximum length of {MAXIMUM_INPUT_LENGTH} characters."
            return True, cleaned_text

    return False, ""