Manny Hernandez committed
Commit 12da80a · 0 Parent(s)

Initial commit
.gitattributes ADDED
@@ -0,0 +1,2 @@
+ onnx-*.onnx filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 0N Labs
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,66 @@
+ # Onyx-TTS
+
+ Onyx-TTS is a high-performance, multilingual text-to-speech system developed by 0N Labs. It's built on ONNX Runtime, delivering fast and efficient speech synthesis with minimal resource requirements.
+
+ ## Model Details
+
+ - **Developed by**: 0N Labs
+ - **Model type**: Text-to-Speech
+ - **Languages**: Multilingual (see [VOICES.md](VOICES.md))
+ - **License**: MIT (library code), Apache 2.0 (model weights)
+ - **Model size**: ~300MB (quantized: ~80MB)
+
+ ## How to Use
+
+ ### Installation
+
+ ```bash
+ pip install onyx-tts
+ ```
+
+ ### Basic Usage
+
+ ```python
+ import soundfile as sf
+ from onyx_tts import OnyxTTS
+
+ # Initialize the TTS engine
+ onyx = OnyxTTS("onyx-v1.0.onnx", "voices-v1.0.bin")
+
+ # Generate speech
+ samples, sample_rate = onyx.create(
+     "Hello! This is Onyx TTS by 0N Labs.",
+     voice="af_sarah",
+     speed=1.0,
+     lang="en-us"
+ )
+
+ # Save to file
+ sf.write("output.wav", samples, sample_rate)
+ ```
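+
+ ### Streaming Usage
+
+ The engine also exposes an async generator, `create_stream` (see `onyx_tts/__init__.py`), which yields audio chunks as they are synthesized. A minimal sketch; the text and output file names are illustrative:
+
+ ```python
+ import asyncio
+
+ import soundfile as sf
+ from onyx_tts import OnyxTTS
+
+ async def main():
+     onyx = OnyxTTS("onyx-v1.0.onnx", "voices-v1.0.bin")
+     i = 0
+     # Chunks arrive per phoneme batch, so playback can start before synthesis finishes
+     async for samples, sample_rate in onyx.create_stream(
+         "Streaming lets you start playback before synthesis finishes.",
+         voice="af_sarah",
+         speed=1.0,
+         lang="en-us",
+     ):
+         sf.write(f"chunk_{i}.wav", samples, sample_rate)
+         i += 1
+
+ asyncio.run(main())
+ ```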
+
+ ## Available Voices
+
+ See the latest voices and languages in the [VOICES.md](VOICES.md) file.
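+
+ You can also enumerate the voices bundled in a given `voices-v1.0.bin` at runtime with `get_voices()`; a small sketch:
+
+ ```python
+ from onyx_tts import OnyxTTS
+
+ onyx = OnyxTTS("onyx-v1.0.onnx", "voices-v1.0.bin")
+ print(onyx.get_voices())  # sorted list of voice names, e.g. "af_sarah"
+ ```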
+
+ ## License
+
+ - Onyx-TTS: MIT
+ - Onyx model: Apache 2.0
+
+ ## Citation
+
+ ```bibtex
+ @software{onyx-tts,
+   author = {0N Labs},
+   title = {Onyx-TTS: High-performance Text-to-Speech},
+   year = {2025},
+   publisher = {GitHub},
+   howpublished = {\url{https://github.com/0N-Labs/onyx-tts}}
+ }
+ ```
+
+ ## Contact
+
+ For questions and support, please contact: contact@0nlabs.ai
onyx_tts/__init__.py ADDED
@@ -0,0 +1,263 @@
+ import asyncio
+ import importlib.metadata
+ import json
+ import os
+ import platform
+ import re
+ import time
+ from collections.abc import AsyncGenerator
+
+ import numpy as np
+ import onnxruntime as rt
+ from numpy.typing import NDArray
+
+ from .config import MAX_PHONEME_LENGTH, SAMPLE_RATE, EspeakConfig, OnyxConfig
+ from .log import log
+ from .tokenizer import Tokenizer
+ from .trim import trim as trim_audio
+
+
+ class OnyxTTS:
+     def __init__(
+         self,
+         model_path: str,
+         voices_path: str,
+         espeak_config: EspeakConfig | None = None,
+         vocab_config: dict | str | None = None,
+     ):
+         # Show useful information for bug reports
+         log.debug(
+             f"Onyx-TTS version 1.0.0 on {platform.platform()} {platform.version()}"
+         )
+         self.config = OnyxConfig(model_path, voices_path, espeak_config)
+         self.config.validate()
+
+         # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377
+         providers = ["CPUExecutionProvider"]
+
+         # Check if onyx-tts was installed with the onyx-tts[gpu] extra (Windows/Linux).
+         # The distribution is named onnxruntime-gpu while the importable module is still
+         # "onnxruntime", so query the package metadata rather than importlib.util.find_spec.
+         try:
+             importlib.metadata.distribution("onnxruntime-gpu")
+             providers = rt.get_available_providers()
+         except importlib.metadata.PackageNotFoundError:
+             pass
+
+         # Check if the ONNX_PROVIDER environment variable was set
+         env_provider = os.getenv("ONNX_PROVIDER")
+         if env_provider:
+             providers = [env_provider]
+
+         log.debug(f"Providers: {providers}")
+         self.sess = rt.InferenceSession(model_path, providers=providers)
+         self.voices: np.ndarray = np.load(voices_path)
+
+         vocab = self._load_vocab(vocab_config)
+         self.tokenizer = Tokenizer(espeak_config, vocab=vocab)
+
+     @classmethod
+     def from_session(
+         cls,
+         session: rt.InferenceSession,
+         voices_path: str,
+         espeak_config: EspeakConfig | None = None,
+         vocab_config: dict | str | None = None,
+     ):
+         instance = cls.__new__(cls)
+         instance.sess = session
+         instance.config = OnyxConfig(session._model_path, voices_path, espeak_config)
+         instance.config.validate()
+         instance.voices = np.load(voices_path)
+
+         vocab = instance._load_vocab(vocab_config)
+         instance.tokenizer = Tokenizer(espeak_config, vocab=vocab)
+         return instance
+
+     def _load_vocab(self, vocab_config: dict | str | None) -> dict:
+         """Load vocabulary from a config file or dictionary.
+
+         Args:
+             vocab_config: Path to a vocab config file or a dictionary containing the vocab.
+
+         Returns:
+             Loaded vocabulary dictionary, or an empty dictionary if no config was provided.
+         """
+
+         if isinstance(vocab_config, str):
+             with open(vocab_config, encoding="utf-8") as fp:
+                 config = json.load(fp)
+             return config["vocab"]
+         if isinstance(vocab_config, dict):
+             return vocab_config["vocab"]
+         return {}
+
+     def _create_audio(
+         self, phonemes: str, voice: NDArray[np.float32], speed: float
+     ) -> tuple[NDArray[np.float32], int]:
+         log.debug(f"Phonemes: {phonemes}")
+         if len(phonemes) > MAX_PHONEME_LENGTH:
+             log.warning(
+                 f"Phonemes are too long, truncating to {MAX_PHONEME_LENGTH} phonemes"
+             )
+             phonemes = phonemes[:MAX_PHONEME_LENGTH]
+         start_t = time.time()
+         tokens = np.array(self.tokenizer.tokenize(phonemes), dtype=np.int64)
+         assert len(tokens) <= MAX_PHONEME_LENGTH, (
+             f"Context length is {MAX_PHONEME_LENGTH}, leaving room for the pad token 0 at the start and end"
+         )
+
+         voice = voice[len(tokens)]
+         tokens = [[0, *tokens, 0]]
+         if "input_ids" in [i.name for i in self.sess.get_inputs()]:
+             # Newer export versions
+             inputs = {
+                 "input_ids": tokens,
+                 "style": np.array(voice, dtype=np.float32),
+                 "speed": np.array([speed], dtype=np.int32),
+             }
+         else:
+             inputs = {
+                 "tokens": tokens,
+                 "style": voice,
+                 "speed": np.ones(1, dtype=np.float32) * speed,
+             }
+
+         audio = self.sess.run(None, inputs)[0]
+         audio_duration = len(audio) / SAMPLE_RATE
+         create_duration = time.time() - start_t
+         rtf = create_duration / audio_duration
+         log.debug(
+             f"Created audio of length {audio_duration:.2f}s for {len(phonemes)} phonemes in {create_duration:.2f}s (RTF: {rtf:.2f})"
+         )
+         return audio, SAMPLE_RATE
+
+     def get_voice_style(self, name: str) -> NDArray[np.float32]:
+         return self.voices[name]
+
+     def _split_phonemes(self, phonemes: str) -> list[str]:
+         """
+         Split phonemes into batches of at most MAX_PHONEME_LENGTH characters.
+         Prefer splitting at punctuation marks.
+         """
+         # Regular expression to split by punctuation and keep it
+         words = re.split(r"([.,!?;])", phonemes)
+         batched_phonemes: list[str] = []
+         current_batch = ""
+
+         for part in words:
+             # Remove leading/trailing whitespace
+             part = part.strip()
+
+             if part:
+                 # If adding the part would exceed the max length, start a new batch
+                 # TODO: make it more accurate
+                 if len(current_batch) + len(part) + 1 >= MAX_PHONEME_LENGTH:
+                     batched_phonemes.append(current_batch.strip())
+                     current_batch = part
+                 else:
+                     if part in ".,!?;":
+                         current_batch += part
+                     else:
+                         if current_batch:
+                             current_batch += " "
+                         current_batch += part
+
+         # Append the last batch if it contains any phonemes
+         if current_batch:
+             batched_phonemes.append(current_batch.strip())
+
+         return batched_phonemes
+
+     def create(
+         self,
+         text: str,
+         voice: str | NDArray[np.float32],
+         speed: float = 1.0,
+         lang: str = "en-us",
+         is_phonemes: bool = False,
+         trim: bool = True,
+     ) -> tuple[NDArray[np.float32], int]:
+         """
+         Create audio from text using the specified voice and speed.
+         """
+         assert 0.5 <= speed <= 2.0, "Speed should be between 0.5 and 2.0"
+
+         if isinstance(voice, str):
+             assert voice in self.voices, f"Voice {voice} not found in available voices"
+             voice = self.get_voice_style(voice)
+
+         start_t = time.time()
+         if is_phonemes:
+             phonemes = text
+         else:
+             phonemes = self.tokenizer.phonemize(text, lang)
+         # Split the phonemes into batches of at most MAX_PHONEME_LENGTH
+         batched_phonemes = self._split_phonemes(phonemes)
+
+         audio = []
+         log.debug(
+             f"Creating audio for {len(batched_phonemes)} batches of {len(phonemes)} phonemes"
+         )
+         for phonemes in batched_phonemes:
+             audio_part, _ = self._create_audio(phonemes, voice, speed)
+             if trim:
+                 # Trim leading and trailing silence for more natural concatenation
+                 # (initial ~2s, subsequent ~0.02s)
+                 audio_part, _ = trim_audio(audio_part)
+             audio.append(audio_part)
+         audio = np.concatenate(audio)
+         log.debug(f"Created audio in {time.time() - start_t:.2f}s")
+         return audio, SAMPLE_RATE
+
+     async def create_stream(
+         self,
+         text: str,
+         voice: str | NDArray[np.float32],
+         speed: float = 1.0,
+         lang: str = "en-us",
+         is_phonemes: bool = False,
+         trim: bool = True,
+     ) -> AsyncGenerator[tuple[NDArray[np.float32], int], None]:
+         """
+         Stream audio creation asynchronously in the background, yielding chunks as they are processed.
+         """
+         assert 0.5 <= speed <= 2.0, "Speed should be between 0.5 and 2.0"
+
+         if isinstance(voice, str):
+             assert voice in self.voices, f"Voice {voice} not found in available voices"
+             voice = self.get_voice_style(voice)
+
+         if is_phonemes:
+             phonemes = text
+         else:
+             phonemes = self.tokenizer.phonemize(text, lang)
+
+         batched_phonemes = self._split_phonemes(phonemes)
+         queue: asyncio.Queue[tuple[NDArray[np.float32], int] | None] = asyncio.Queue()
+
+         async def process_batches():
+             """Process phoneme batches in the background."""
+             loop = asyncio.get_event_loop()
+             for i, phonemes in enumerate(batched_phonemes):
+                 # Run in a separate thread since inference is a blocking operation
+                 audio_part, sample_rate = await loop.run_in_executor(
+                     None, self._create_audio, phonemes, voice, speed
+                 )
+                 if trim:
+                     # Trim leading and trailing silence for more natural concatenation
+                     # (initial ~2s, subsequent ~0.02s)
+                     audio_part, _ = trim_audio(audio_part)
+                 log.debug(f"Processed chunk {i} of stream")
+                 await queue.put((audio_part, sample_rate))
+             await queue.put(None)  # Signal the end of the stream
+
+         # Start processing in the background
+         asyncio.create_task(process_batches())
+
+         while True:
+             chunk = await queue.get()
+             if chunk is None:
+                 break
+             yield chunk
+
+     def get_voices(self) -> list[str]:
+         return sorted(self.voices.keys())
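
Note: as the constructor above shows, the execution provider can be pinned without code changes via the `ONNX_PROVIDER` environment variable. A minimal sketch; the provider name must be one your onnxruntime build actually ships, and the file names are illustrative:

```python
import os

# Must be set before constructing OnyxTTS, which reads it in __init__
os.environ["ONNX_PROVIDER"] = "CUDAExecutionProvider"

from onyx_tts import OnyxTTS

onyx = OnyxTTS("onyx-v1.0.onnx", "voices-v1.0.bin")
```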
onyx_tts/config.json ADDED
@@ -0,0 +1,150 @@
+ {
+   "istftnet": {
+     "upsample_kernel_sizes": [20, 12],
+     "upsample_rates": [10, 6],
+     "gen_istft_hop_size": 5,
+     "gen_istft_n_fft": 20,
+     "resblock_dilation_sizes": [
+       [1, 3, 5],
+       [1, 3, 5],
+       [1, 3, 5]
+     ],
+     "resblock_kernel_sizes": [3, 7, 11],
+     "upsample_initial_channel": 512
+   },
+   "dim_in": 64,
+   "dropout": 0.2,
+   "hidden_dim": 512,
+   "max_conv_dim": 512,
+   "max_dur": 50,
+   "multispeaker": true,
+   "n_layer": 3,
+   "n_mels": 80,
+   "n_token": 178,
+   "style_dim": 128,
+   "text_encoder_kernel_size": 5,
+   "plbert": {
+     "hidden_size": 768,
+     "num_attention_heads": 12,
+     "intermediate_size": 2048,
+     "max_position_embeddings": 512,
+     "num_hidden_layers": 12,
+     "dropout": 0.1
+   },
+   "vocab": {
+     ";": 1,
+     ":": 2,
+     ",": 3,
+     ".": 4,
+     "!": 5,
+     "?": 6,
+     "—": 9,
+     "…": 10,
+     "\"": 11,
+     "(": 12,
+     ")": 13,
+     "“": 14,
+     "”": 15,
+     " ": 16,
+     "\u0303": 17,
+     "ʣ": 18,
+     "ʥ": 19,
+     "ʦ": 20,
+     "ʨ": 21,
+     "ᵝ": 22,
+     "\uAB67": 23,
+     "A": 24,
+     "I": 25,
+     "O": 31,
+     "Q": 33,
+     "S": 35,
+     "T": 36,
+     "W": 39,
+     "Y": 41,
+     "ᵊ": 42,
+     "a": 43,
+     "b": 44,
+     "c": 45,
+     "d": 46,
+     "e": 47,
+     "f": 48,
+     "h": 50,
+     "i": 51,
+     "j": 52,
+     "k": 53,
+     "l": 54,
+     "m": 55,
+     "n": 56,
+     "o": 57,
+     "p": 58,
+     "q": 59,
+     "r": 60,
+     "s": 61,
+     "t": 62,
+     "u": 63,
+     "v": 64,
+     "w": 65,
+     "x": 66,
+     "y": 67,
+     "z": 68,
+     "ɑ": 69,
+     "ɐ": 70,
+     "ɒ": 71,
+     "æ": 72,
+     "β": 75,
+     "ɔ": 76,
+     "ɕ": 77,
+     "ç": 78,
+     "ɖ": 80,
+     "ð": 81,
+     "ʤ": 82,
+     "ə": 83,
+     "ɚ": 85,
+     "ɛ": 86,
+     "ɜ": 87,
+     "ɟ": 90,
+     "ɡ": 92,
+     "ɥ": 99,
+     "ɨ": 101,
+     "ɪ": 102,
+     "ʝ": 103,
+     "ɯ": 110,
+     "ɰ": 111,
+     "ŋ": 112,
+     "ɳ": 113,
+     "ɲ": 114,
+     "ɴ": 115,
+     "ø": 116,
+     "ɸ": 118,
+     "θ": 119,
+     "œ": 120,
+     "ɹ": 123,
+     "ɾ": 125,
+     "ɻ": 126,
+     "ʁ": 128,
+     "ɽ": 129,
+     "ʂ": 130,
+     "ʃ": 131,
+     "ʈ": 132,
+     "ʧ": 133,
+     "ʊ": 135,
+     "ʋ": 136,
+     "ʌ": 138,
+     "ɣ": 139,
+     "ɤ": 140,
+     "χ": 142,
+     "ʎ": 143,
+     "ʒ": 147,
+     "ʔ": 148,
+     "ˈ": 156,
+     "ˌ": 157,
+     "ː": 158,
+     "ʰ": 162,
+     "ʲ": 164,
+     "↓": 169,
+     "→": 171,
+     "↗": 172,
+     "↘": 173,
+     "ᵻ": 177
+   }
+ }
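
Note: `OnyxTTS` accepts this file (or any dict with the same top-level `"vocab"` shape) through its `vocab_config` parameter, handled by `_load_vocab` above. A minimal sketch; paths and file names are illustrative:

```python
import json

from onyx_tts import OnyxTTS

# Pass the bundled config by path, or load it yourself and pass a dict
with open("onyx_tts/config.json", encoding="utf-8") as fp:
    cfg = json.load(fp)  # must contain a top-level "vocab" mapping

onyx = OnyxTTS("onyx-v1.0.onnx", "voices-v1.0.bin", vocab_config=cfg)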
onyx_tts/config.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ MAX_PHONEME_LENGTH = 510
+ SAMPLE_RATE = 24000
+
+
+ @dataclass
+ class EspeakConfig:
+     lib_path: str | None = None
+     data_path: str | None = None
+
+
+ class OnyxConfig:
+     def __init__(
+         self,
+         model_path: str,
+         voices_path: str,
+         espeak_config: EspeakConfig | None = None,
+     ):
+         self.model_path = model_path
+         self.voices_path = voices_path
+         self.espeak_config = espeak_config
+
+     def validate(self):
+         if not Path(self.voices_path).exists():
+             error_msg = f"Voices file not found at {self.voices_path}"
+             error_msg += (
+                 "\nYou can download the voices file using the following command:"
+             )
+             error_msg += "\nwget https://github.com/0N-Labs/onyx-tts/releases/download/v1.0.0/voices-v1.0.bin"
+             raise FileNotFoundError(error_msg)
+
+         if not Path(self.model_path).exists():
+             error_msg = f"Model file not found at {self.model_path}"
+             error_msg += "\nYou can download the model file from https://github.com/0N-Labs/onyx-tts/releases"
+             raise FileNotFoundError(error_msg)
+
+
+ def get_vocab():
+     with open(Path(__file__).parent / "config.json", encoding="utf-8") as fp:
+         config = json.load(fp)
+     return config["vocab"]
+
+
+ DEFAULT_VOCAB = get_vocab()
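
Note: `EspeakConfig` is how callers point the tokenizer at a specific espeak-ng build instead of the one bundled by espeakng-loader. A minimal sketch; the paths shown are illustrative:

```python
from onyx_tts import OnyxTTS
from onyx_tts.config import EspeakConfig

# Point at a specific espeak-ng shared library and data directory
espeak = EspeakConfig(
    lib_path="/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1",  # illustrative path
    data_path="/usr/share/espeak-ng-data",                   # illustrative path
)
onyx = OnyxTTS("onyx-v1.0.onnx", "voices-v1.0.bin", espeak_config=espeak)
```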
onyx_tts/log.py ADDED
@@ -0,0 +1,40 @@
+ """
+ Provide a way to enable logging by setting the LOG_LEVEL environment variable
+ """
+
+ import logging
+ import os
+
+ import colorlog
+
+
+ def _create_logger():
+     """
+     Create a logger with colorized output
+     Usage: LOG_LEVEL=DEBUG python <script.py>
+     """
+
+     handler = colorlog.StreamHandler()
+     fmt = "%(log_color)s%(levelname)-8s%(reset)s [%(filename)s:%(lineno)d] %(message)s"
+     handler.setFormatter(
+         colorlog.ColoredFormatter(
+             fmt=fmt,
+             log_colors={
+                 "DEBUG": "blue",
+                 "INFO": "green",
+                 "WARNING": "yellow",
+                 "ERROR": "red",
+                 "CRITICAL": "red",
+             },
+         )
+     )
+     # Get the log level from the LOG_LEVEL environment variable
+     log_level = os.getenv("LOG_LEVEL", "WARNING").upper()
+     logger = colorlog.getLogger(__package__)
+     logger.setLevel(level=getattr(logging, log_level, logging.WARNING))
+     # Set up logging to stdout
+     logger.addHandler(handler)
+     return logger
+
+
+ log = _create_logger()
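
Note: since `log` is created at import time, `LOG_LEVEL` must be set before `onyx_tts` is first imported. A minimal sketch:

```python
import os

os.environ["LOG_LEVEL"] = "DEBUG"  # read once, when onyx_tts is imported

from onyx_tts import OnyxTTS  # noqa: E402 - import after setting the env var

onyx = OnyxTTS("onyx-v1.0.onnx", "voices-v1.0.bin")  # debug logs now visible
```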
onyx_tts/tokenizer.py ADDED
@@ -0,0 +1,78 @@
+ import ctypes
+ import ctypes.util
+ import os
+ import platform
+ import sys
+
+ import espeakng_loader
+ import phonemizer
+ from phonemizer.backend.espeak.wrapper import EspeakWrapper
+
+ from .config import DEFAULT_VOCAB, MAX_PHONEME_LENGTH, EspeakConfig
+ from .log import log
+
+
+ class Tokenizer:
+     def __init__(
+         self, espeak_config: EspeakConfig | None = None, vocab: dict | None = None
+     ):
+         self.vocab = vocab or DEFAULT_VOCAB
+
+         if not espeak_config:
+             espeak_config = EspeakConfig()
+         if not espeak_config.data_path:
+             espeak_config.data_path = espeakng_loader.get_data_path()
+         if not espeak_config.lib_path:
+             espeak_config.lib_path = espeakng_loader.get_library_path()
+
+         # Check if PHONEMIZER_ESPEAK_LIBRARY was set
+         if os.getenv("PHONEMIZER_ESPEAK_LIBRARY"):
+             espeak_config.lib_path = os.getenv("PHONEMIZER_ESPEAK_LIBRARY")
+
+         # Check that the espeak-ng library can be loaded
+         try:
+             ctypes.cdll.LoadLibrary(espeak_config.lib_path)
+         except Exception as e:
+             log.error(f"Failed to load espeak shared library: {e}")
+             log.warning("Falling back to the system-wide espeak-ng library")
+
+             # Fallback: system-wide load
+             error_info = (
+                 "Failed to load espeak-ng from fallback. Please install espeak-ng system wide.\n"
+                 "\tSee https://github.com/espeak-ng/espeak-ng/blob/master/docs/guide.md\n"
+                 "\tNote: you can specify the shared library path using the PHONEMIZER_ESPEAK_LIBRARY environment variable.\n"
+                 f"Environment:\n\t{platform.platform()} ({platform.release()}) | {sys.version}"
+             )
+             espeak_config.lib_path = ctypes.util.find_library(
+                 "espeak-ng"
+             ) or ctypes.util.find_library("espeak")
+             if not espeak_config.lib_path:
+                 raise RuntimeError(error_info)
+             try:
+                 ctypes.cdll.LoadLibrary(espeak_config.lib_path)
+             except Exception as e:
+                 raise RuntimeError(f"{e}: {error_info}")
+
+         EspeakWrapper.set_data_path(espeak_config.data_path)
+         EspeakWrapper.set_library(espeak_config.lib_path)
+
+     @staticmethod
+     def normalize_text(text) -> str:
+         return text.strip()
+
+     def tokenize(self, phonemes):
+         if len(phonemes) > MAX_PHONEME_LENGTH:
+             raise ValueError(
+                 f"text is too long, must be less than {MAX_PHONEME_LENGTH} phonemes"
+             )
+         return [i for i in map(self.vocab.get, phonemes) if i is not None]
+
+     def phonemize(self, text, lang="en-us", norm=True) -> str:
+         """
+         lang can be 'en-us' or 'en-gb'
+         """
+         if norm:
+             text = Tokenizer.normalize_text(text)
+
+         phonemes = phonemizer.phonemize(
+             text, lang, preserve_punctuation=True, with_stress=True
+         )
+         phonemes = "".join(filter(lambda p: p in self.vocab, phonemes))
+         return phonemes.strip()
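
Note: a quick round trip through `Tokenizer` shows what the model actually consumes (this requires a working espeak-ng, per the loader logic above). A minimal sketch:

```python
from onyx_tts.tokenizer import Tokenizer

tokenizer = Tokenizer()  # uses the bundled DEFAULT_VOCAB and espeakng-loader paths

phonemes = tokenizer.phonemize("Hello world!", lang="en-us")
print(phonemes)  # IPA string with punctuation and stress marks preserved

token_ids = tokenizer.tokenize(phonemes)
print(token_ids)  # list of vocab ids; symbols missing from the vocab are dropped
```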
onyx_tts/trim.py ADDED
@@ -0,0 +1,743 @@
+ """
+ Copyright (c) 2013--2023, librosa development team.
+
+ Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+
+ ***This file was extracted from the librosa package since we use only the trim() function and librosa requires many dependencies***
+
+ Reference:
+ - https://gist.github.com/evq/82e95a363eeeb75d15dd62abc1eb1bde
+ - https://github.com/librosa/librosa/blob/894942673d55aa2206df1296b6c4c50827c7f1d6/librosa/effects.py#L612
+ """
+
+ import warnings
+ from collections.abc import Callable
+ from typing import Any
+
+ import numpy as np
+ from numpy.lib.stride_tricks import as_strided
+
+
+ class LibrosaError(Exception):
+     """The root librosa exception class"""
+
+     pass
+
+
+ class ParameterError(LibrosaError):
+     """Exception class for mal-formed inputs"""
+
+     pass
+
+
+ # @numba.vectorize(
+ #     ["float32(complex64)", "float64(complex128)"], nopython=True, cache=True, identity=0
+ # )  # type: ignore
+ def _cabs2(x):  # pragma: no cover
+     """Efficiently compute abs2 on complex inputs"""
+     return x.real**2 + x.imag**2
+
+
+ def abs2(x, dtype=None):
+     """Compute the squared magnitude of a real or complex array.
+
+     This function is equivalent to calling `np.abs(x)**2` but it
+     is slightly more efficient.
+
+     Parameters
+     ----------
+     x : np.ndarray or scalar, real or complex typed
+         The input data, either real (float32, float64) or complex (complex64, complex128) typed
+     dtype : np.dtype, optional
+         The data type of the output array.
+         If not provided, it will be inferred from `x`
+
+     Returns
+     -------
+     p : np.ndarray or scalar, real
+         squared magnitude of `x`
+
+     Examples
+     --------
+     >>> librosa.util.abs2(3 + 4j)
+     25.0
+
+     >>> librosa.util.abs2((0.5j)**np.arange(8))
+     array([1.000e+00, 2.500e-01, 6.250e-02, 1.562e-02, 3.906e-03, 9.766e-04,
+            2.441e-04, 6.104e-05])
+     """
+     if np.iscomplexobj(x):
+         # suppress type check, mypy doesn't like vectorization
+         y = _cabs2(x)
+         if dtype is None:
+             return y  # type: ignore
+         else:
+             return y.astype(dtype)  # type: ignore
+     else:
+         # suppress type check, mypy doesn't know this is real
+         return np.square(x, dtype=dtype)  # type: ignore
+
+
+ def amplitude_to_db(
+     S,
+     *,
+     ref: float | Callable = 1.0,
+     amin: float = 1e-5,
+     top_db: float | None = 80.0,
+ ) -> np.floating[Any] | np.ndarray:
+     """Convert an amplitude spectrogram to dB-scaled spectrogram.
+
+     This is equivalent to ``power_to_db(S**2, ref=ref**2, amin=amin**2, top_db=top_db)``,
+     but is provided for convenience.
+
+     Parameters
+     ----------
+     S : np.ndarray
+         input amplitude
+
+     ref : scalar or callable
+         If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``:
+         ``20 * log10(S / ref)``.
+         Zeros in the output correspond to positions where ``S == ref``.
+
+         If callable, the reference value is computed as ``ref(S)``.
+
+     amin : float > 0 [scalar]
+         minimum threshold for ``S`` and ``ref``
+
+     top_db : float >= 0 [scalar]
+         threshold the output at ``top_db`` below the peak:
+         ``max(20 * log10(S/ref)) - top_db``
+
+     Returns
+     -------
+     S_db : np.ndarray
+         ``S`` measured in dB
+
+     See Also
+     --------
+     power_to_db, db_to_amplitude
+
+     Notes
+     -----
+     This function caches at level 30.
+     """
+     S = np.asarray(S)
+
+     if np.issubdtype(S.dtype, np.complexfloating):
+         warnings.warn(
+             "amplitude_to_db was called on complex input so phase "
+             "information will be discarded. To suppress this warning, "
+             "call amplitude_to_db(np.abs(S)) instead.",
+             stacklevel=2,
+         )
+
+     magnitude = np.abs(S)
+
+     if callable(ref):
+         # User supplied a function to calculate reference power
+         ref_value = ref(magnitude)
+     else:
+         ref_value = np.abs(ref)
+
+     out_array = magnitude if isinstance(magnitude, np.ndarray) else None
+     power = np.square(magnitude, out=out_array)
+
+     db: np.ndarray = power_to_db(power, ref=ref_value**2, amin=amin**2, top_db=top_db)
+     return db
+
+
+ def _signal_to_frame_nonsilent(
+     y: np.ndarray,
+     frame_length: int = 2048,
+     hop_length: int = 512,
+     top_db: float = 60,
+     ref: Callable | float = np.max,
+     aggregate: Callable = np.max,
+ ) -> np.ndarray:
+     """Frame-wise non-silent indicator for audio input.
+
+     This is a helper function for `trim` and `split`.
+
+     Parameters
+     ----------
+     y : np.ndarray
+         Audio signal, mono or stereo
+
+     frame_length : int > 0
+         The number of samples per frame
+
+     hop_length : int > 0
+         The number of samples between frames
+
+     top_db : number
+         The threshold (in decibels) below reference to consider as
+         silence.
+         You can also use a negative value for `top_db` to treat any value
+         below `ref + |top_db|` as silent. This will only make sense if
+         `ref` is not `np.max`.
+
+     ref : callable or float
+         The reference amplitude
+
+     aggregate : callable [default: np.max]
+         Function to aggregate dB measurements across channels (if y.ndim > 1)
+
+         Note: for multiple leading axes, this is performed using ``np.apply_over_axes``.
+
+     Returns
+     -------
+     non_silent : np.ndarray, shape=(m,), dtype=bool
+         Indicator of non-silent frames
+     """
+     # Compute the MSE for the signal
+     mse = rms(y=y, frame_length=frame_length, hop_length=hop_length)
+
+     # Convert to decibels and slice out the mse channel
+     db: np.ndarray = amplitude_to_db(mse[..., 0, :], ref=ref, top_db=None)
+
+     # Aggregate everything but the time dimension
+     if db.ndim > 1:
+         db = np.apply_over_axes(aggregate, db, range(db.ndim - 1))
+         # Squeeze out leading singleton dimensions here
+         # We always want to keep the trailing dimension though
+         db = np.squeeze(db, axis=tuple(range(db.ndim - 1)))
+
+     return db > -top_db
+
+
+ def trim(
+     y: np.ndarray,
+     *,
+     top_db: float = 60,
+     ref: float | Callable = np.max,
+     frame_length: int = 2048,
+     hop_length: int = 512,
+     aggregate: Callable = np.max,
+ ) -> tuple[np.ndarray, np.ndarray]:
+     """Trim leading and trailing silence from an audio signal.
+
+     Silence is defined as segments of the audio signal that are `top_db`
+     decibels (or more) quieter than a reference level, `ref`.
+     By default, `ref` is set to the signal's maximum RMS value.
+     It's important to note that if the entire signal maintains a uniform
+     RMS value, there will be no segments considered quieter than the maximum,
+     leading to no trimming.
+     This implies that a completely silent signal will remain untrimmed with the default `ref` setting.
+     In these situations, an explicit value for `ref` (in decibels) should be used instead.
+
+     Parameters
+     ----------
+     y : np.ndarray, shape=(..., n)
+         Audio signal. Multi-channel is supported.
+     top_db : number
+         The threshold (in decibels) below reference to consider as
+         silence.
+         You can also use a negative value for `top_db` to treat any value
+         below `ref + |top_db|` as silent. This will only make sense if
+         `ref` is not `np.max`.
+     ref : number or callable
+         The reference amplitude. By default, it uses `np.max` and compares
+         to the peak amplitude in the signal.
+     frame_length : int > 0
+         The number of samples per analysis frame
+     hop_length : int > 0
+         The number of samples between analysis frames
+     aggregate : callable [default: np.max]
+         Function to aggregate across channels (if y.ndim > 1)
+
+     Returns
+     -------
+     y_trimmed : np.ndarray, shape=(..., m)
+         The trimmed signal
+     index : np.ndarray, shape=(2,)
+         the interval of ``y`` corresponding to the non-silent region:
+         ``y_trimmed = y[index[0]:index[1]]`` (for mono) or
+         ``y_trimmed = y[:, index[0]:index[1]]`` (for stereo).
+
+     Examples
+     --------
+     >>> # Load some audio
+     >>> y, sr = librosa.load(librosa.ex('choice'))
+     >>> # Trim the beginning and ending silence
+     >>> yt, index = librosa.effects.trim(y)
+     >>> # Print the durations
+     >>> print(librosa.get_duration(y, sr=sr), librosa.get_duration(yt, sr=sr))
+     25.025986394557822 25.007891156462584
+     """
+     non_silent = _signal_to_frame_nonsilent(
+         y,
+         frame_length=frame_length,
+         hop_length=hop_length,
+         ref=ref,
+         top_db=top_db,
+         aggregate=aggregate,
+     )
+
+     nonzero = np.flatnonzero(non_silent)
+
+     if nonzero.size > 0:
+         # Compute the start and end positions
+         # End position goes one frame past the last non-zero
+         start = int(frames_to_samples(nonzero[0], hop_length=hop_length))
+         end = min(
+             y.shape[-1],
+             int(frames_to_samples(nonzero[-1] + 1, hop_length=hop_length)),
+         )
+     else:
+         # The entire signal is trimmed here: nothing is above the threshold
+         start, end = 0, 0
+
+     # Slice the buffer and return the corresponding interval
+     return y[..., start:end], np.asarray([start, end])
+
+
+ def rms(
+     *,
+     y: np.ndarray | None = None,
+     S: np.ndarray | None = None,
+     frame_length: int = 2048,
+     hop_length: int = 512,
+     center: bool = True,
+     pad_mode="constant",
+     dtype=np.float32,
+ ) -> np.ndarray:
+     """Compute root-mean-square (RMS) value for each frame, either from the
+     audio samples ``y`` or from a spectrogram ``S``.
+
+     Computing the RMS value from audio samples is faster as it doesn't require
+     a STFT calculation. However, using a spectrogram will give a more accurate
+     representation of energy over time because its frames can be windowed,
+     thus prefer using ``S`` if it's already available.
+
+     Parameters
+     ----------
+     y : np.ndarray [shape=(..., n)] or None
+         (optional) audio time series. Required if ``S`` is not input.
+         Multi-channel is supported.
+     S : np.ndarray [shape=(..., d, t)] or None
+         (optional) spectrogram magnitude. Required if ``y`` is not input.
+     frame_length : int > 0 [scalar]
+         length of analysis frame (in samples) for energy calculation
+     hop_length : int > 0 [scalar]
+         hop length for STFT. See `librosa.stft` for details.
+     center : bool
+         If `True` and operating on time-domain input (``y``), pad the signal
+         by ``frame_length//2`` on either side.
+         If operating on spectrogram input, this has no effect.
+     pad_mode : str
+         Padding mode for centered analysis. See `numpy.pad` for valid
+         values.
+     dtype : np.dtype, optional
+         Data type of the output array. Defaults to float32.
+
+     Returns
+     -------
+     rms : np.ndarray [shape=(..., 1, t)]
+         RMS value for each frame
+
+     Examples
+     --------
+     >>> y, sr = librosa.load(librosa.ex('trumpet'))
+     >>> librosa.feature.rms(y=y)
+     array([[1.248e-01, 1.259e-01, ..., 1.845e-05, 1.796e-05]],
+           dtype=float32)
+
+     Or from spectrogram input
+
+     >>> S, phase = librosa.magphase(librosa.stft(y))
+     >>> rms = librosa.feature.rms(S=S)
+
+     >>> import matplotlib.pyplot as plt
+     >>> fig, ax = plt.subplots(nrows=2, sharex=True)
+     >>> times = librosa.times_like(rms)
+     >>> ax[0].semilogy(times, rms[0], label='RMS Energy')
+     >>> ax[0].set(xticks=[])
+     >>> ax[0].legend()
+     >>> ax[0].label_outer()
+     >>> librosa.display.specshow(librosa.amplitude_to_db(S, ref=np.max),
+     ...                          y_axis='log', x_axis='time', ax=ax[1])
+     >>> ax[1].set(title='log Power spectrogram')
+
+     Use a STFT window of constant ones and no frame centering to get consistent
+     results with the RMS computed from the audio samples ``y``
+
+     >>> S = librosa.magphase(librosa.stft(y, window=np.ones, center=False))[0]
+     >>> librosa.feature.rms(S=S)
+     >>> plt.show()
+
+     """
+     if y is not None:
+         if center:
+             padding = [(0, 0) for _ in range(y.ndim)]
+             padding[-1] = (int(frame_length // 2), int(frame_length // 2))
+             y = np.pad(y, padding, mode=pad_mode)
+
+         x = frame(y, frame_length=frame_length, hop_length=hop_length)
+
+         # Calculate power
+         power = np.mean(abs2(x, dtype=dtype), axis=-2, keepdims=True)
+     elif S is not None:
+         # Check the frame length
+         if S.shape[-2] != frame_length // 2 + 1:
+             raise ParameterError(
+                 f"Since S.shape[-2] is {S.shape[-2]}, "
+                 f"frame_length is expected to be {S.shape[-2] * 2 - 2} or {S.shape[-2] * 2 - 1}; "
+                 f"found {frame_length}"
+             )
+
+         # power spectrogram
+         x = abs2(S, dtype=dtype)
+
+         # Adjust the DC and sr/2 component
+         x[..., 0, :] *= 0.5
+         if frame_length % 2 == 0:
+             x[..., -1, :] *= 0.5
+
+         # Calculate power
+         power = 2 * np.sum(x, axis=-2, keepdims=True) / frame_length**2
+     else:
+         raise ParameterError("Either `y` or `S` must be input.")
+
+     rms_result: np.ndarray = np.sqrt(power)
+     return rms_result
+
+
+ def frame(
+     x: np.ndarray,
+     *,
+     frame_length: int,
+     hop_length: int,
+     axis: int = -1,
+     writeable: bool = False,
+     subok: bool = False,
+ ) -> np.ndarray:
+     """Slice a data array into (overlapping) frames.
+
+     This implementation uses low-level stride manipulation to avoid
+     making a copy of the data. The resulting frame representation
+     is a new view of the same input data.
+
+     For example, a one-dimensional input ``x = [0, 1, 2, 3, 4, 5, 6]``
+     can be framed with frame length 3 and hop length 2 in two ways.
+     The first (``axis=-1``), results in the array ``x_frames``::
+
+         [[0, 2, 4],
+          [1, 3, 5],
+          [2, 4, 6]]
+
+     where each column ``x_frames[:, i]`` contains a contiguous slice of
+     the input ``x[i * hop_length : i * hop_length + frame_length]``.
+
+     The second way (``axis=0``) results in the array ``x_frames``::
+
+         [[0, 1, 2],
+          [2, 3, 4],
+          [4, 5, 6]]
+
+     where each row ``x_frames[i]`` contains a contiguous slice of the input.
+
+     This generalizes to higher dimensional inputs, as shown in the examples below.
+     In general, the framing operation increments by 1 the number of dimensions,
+     adding a new "frame axis" either before the framing axis (if ``axis < 0``)
+     or after the framing axis (if ``axis >= 0``).
+
+     Parameters
+     ----------
+     x : np.ndarray
+         Array to frame
+     frame_length : int > 0 [scalar]
+         Length of the frame
+     hop_length : int > 0 [scalar]
+         Number of steps to advance between frames
+     axis : int
+         The axis along which to frame.
+     writeable : bool
+         If ``False``, then the framed view of ``x`` is read-only.
+         If ``True``, then the framed view is read-write. Note that writing to the framed view
+         will also write to the input array ``x`` in this case.
+     subok : bool
+         If True, sub-classes will be passed-through, otherwise the returned array will be
+         forced to be a base-class array (default).
+
+     Returns
+     -------
+     x_frames : np.ndarray [shape=(..., frame_length, N_FRAMES, ...)]
+         A framed view of ``x``, for example with ``axis=-1`` (framing on the last dimension)::
+
+             x_frames[..., j] == x[..., j * hop_length : j * hop_length + frame_length]
+
+         If ``axis=0`` (framing on the first dimension), then::
+
+             x_frames[j] = x[j * hop_length : j * hop_length + frame_length]
+
+     Raises
+     ------
+     ParameterError
+         If ``x.shape[axis] < frame_length``, there is not enough data to fill one frame.
+
+         If ``hop_length < 1``, frames cannot advance.
+
+     See Also
+     --------
+     numpy.lib.stride_tricks.as_strided
+
+     Examples
+     --------
+     Extract 2048-sample frames from monophonic signal with a hop of 64 samples per frame
+
+     >>> y, sr = librosa.load(librosa.ex('trumpet'))
+     >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64)
+     >>> frames
+     array([[-1.407e-03, -2.604e-02, ..., -1.795e-05, -8.108e-06],
+            [-4.461e-04, -3.721e-02, ..., -1.573e-05, -1.652e-05],
+            ...,
+            [ 7.960e-02, -2.335e-01, ..., -6.815e-06,  1.266e-05],
+            [ 9.568e-02, -1.252e-01, ...,  7.397e-06, -1.921e-05]],
+           dtype=float32)
+     >>> y.shape
+     (117601,)
+
+     >>> frames.shape
+     (2048, 1806)
+
+     Or frame along the first axis instead of the last:
+
+     >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64, axis=0)
+     >>> frames.shape
+     (1806, 2048)
+
+     Frame a stereo signal:
+
+     >>> y, sr = librosa.load(librosa.ex('trumpet', hq=True), mono=False)
+     >>> y.shape
+     (2, 117601)
+     >>> frames = librosa.util.frame(y, frame_length=2048, hop_length=64)
+     >>> frames.shape
+     (2, 2048, 1806)
+
+     Carve an STFT into fixed-length patches of 32 frames with 50% overlap
+
+     >>> y, sr = librosa.load(librosa.ex('trumpet'))
+     >>> S = np.abs(librosa.stft(y))
+     >>> S.shape
+     (1025, 230)
+     >>> S_patch = librosa.util.frame(S, frame_length=32, hop_length=16)
+     >>> S_patch.shape
+     (1025, 32, 13)
+     >>> # The first patch contains the first 32 frames of S
+     >>> np.allclose(S_patch[:, :, 0], S[:, :32])
+     True
+     >>> # The second patch contains frames 16 to 16+32=48, and so on
+     >>> np.allclose(S_patch[:, :, 1], S[:, 16:48])
+     True
+     """
+     # This implementation is derived from numpy.lib.stride_tricks.sliding_window_view (1.20.0)
+     # https://numpy.org/doc/stable/reference/generated/numpy.lib.stride_tricks.sliding_window_view.html
+
+     # copy=None is the NumPy 2 equivalent of the old copy=False: copy only when required
+     x = np.array(x, copy=None, subok=subok)
+
+     if x.shape[axis] < frame_length:
+         raise ParameterError(
+             f"Input is too short (n={x.shape[axis]:d}) for frame_length={frame_length:d}"
+         )
+
+     if hop_length < 1:
+         raise ParameterError(f"Invalid hop_length: {hop_length:d}")
+
+     # put our new within-frame axis at the end for now
+     out_strides = x.strides + tuple([x.strides[axis]])
+
+     # Reduce the shape on the framing axis
+     x_shape_trimmed = list(x.shape)
+     x_shape_trimmed[axis] -= frame_length - 1
+
+     out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
+     xw = as_strided(
+         x, strides=out_strides, shape=out_shape, subok=subok, writeable=writeable
+     )
+
+     if axis < 0:
+         target_axis = axis - 1
+     else:
+         target_axis = axis + 1
+
+     xw = np.moveaxis(xw, -1, target_axis)
+
+     # Downsample along the target axis
+     slices = [slice(None)] * xw.ndim
+     slices[axis] = slice(0, None, hop_length)
+     return xw[tuple(slices)]
+
+
+ def power_to_db(
+     S,
+     *,
+     ref: float | Callable = 1.0,
+     amin: float = 1e-10,
+     top_db: float | None = 80.0,
+ ) -> np.floating[Any] | np.ndarray:
+     """Convert a power spectrogram (amplitude squared) to decibel (dB) units
+
+     This computes the scaling ``10 * log10(S / ref)`` in a numerically
+     stable way.
+
+     Parameters
+     ----------
+     S : np.ndarray
+         input power
+
+     ref : scalar or callable
+         If scalar, the amplitude ``abs(S)`` is scaled relative to ``ref``::
+
+             10 * log10(S / ref)
+
+         Zeros in the output correspond to positions where ``S == ref``.
+
+         If callable, the reference value is computed as ``ref(S)``.
+
+     amin : float > 0 [scalar]
+         minimum threshold for ``abs(S)`` and ``ref``
+
+     top_db : float >= 0 [scalar]
+         threshold the output at ``top_db`` below the peak:
+         ``max(10 * log10(S/ref)) - top_db``
+
+     Returns
+     -------
+     S_db : np.ndarray
+         ``S_db ~= 10 * log10(S) - 10 * log10(ref)``
+
+     See Also
+     --------
+     perceptual_weighting
+     db_to_power
+     amplitude_to_db
+     db_to_amplitude
+
+     Notes
+     -----
+     This function caches at level 30.
+
+     Examples
+     --------
+     Get a power spectrogram from a waveform ``y``
+
+     >>> y, sr = librosa.load(librosa.ex('trumpet'))
+     >>> S = np.abs(librosa.stft(y))
+     >>> librosa.power_to_db(S**2)
+     array([[-41.809, -41.809, ..., -41.809, -41.809],
+            [-41.809, -41.809, ..., -41.809, -41.809],
+            ...,
+            [-41.809, -41.809, ..., -41.809, -41.809],
+            [-41.809, -41.809, ..., -41.809, -41.809]], dtype=float32)
+
+     Compute dB relative to peak power
+
+     >>> librosa.power_to_db(S**2, ref=np.max)
+     array([[-80., -80., ..., -80., -80.],
+            [-80., -80., ..., -80., -80.],
+            ...,
+            [-80., -80., ..., -80., -80.],
+            [-80., -80., ..., -80., -80.]], dtype=float32)
+
+     Or compare to median power
+
+     >>> librosa.power_to_db(S**2, ref=np.median)
+     array([[16.578, 16.578, ..., 16.578, 16.578],
+            [16.578, 16.578, ..., 16.578, 16.578],
+            ...,
+            [16.578, 16.578, ..., 16.578, 16.578],
+            [16.578, 16.578, ..., 16.578, 16.578]], dtype=float32)
+
+     And plot the results
+
+     >>> import matplotlib.pyplot as plt
+     >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
+     >>> imgpow = librosa.display.specshow(S**2, sr=sr, y_axis='log', x_axis='time',
+     ...                                   ax=ax[0])
+     >>> ax[0].set(title='Power spectrogram')
+     >>> ax[0].label_outer()
+     >>> imgdb = librosa.display.specshow(librosa.power_to_db(S**2, ref=np.max),
+     ...                                  sr=sr, y_axis='log', x_axis='time', ax=ax[1])
+     >>> ax[1].set(title='Log-Power spectrogram')
+     >>> fig.colorbar(imgpow, ax=ax[0])
+     >>> fig.colorbar(imgdb, ax=ax[1], format="%+2.0f dB")
+     """
+     S = np.asarray(S)
+
+     if amin <= 0:
+         raise ParameterError("amin must be strictly positive")
+
+     if np.issubdtype(S.dtype, np.complexfloating):
+         warnings.warn(
+             "power_to_db was called on complex input so phase "
+             "information will be discarded. To suppress this warning, "
+             "call power_to_db(np.abs(D)**2) instead.",
+             stacklevel=2,
+         )
+         magnitude = np.abs(S)
+     else:
+         magnitude = S
+
+     if callable(ref):
+         # User supplied a function to calculate reference power
+         ref_value = ref(magnitude)
+     else:
+         ref_value = np.abs(ref)
+
+     log_spec: np.ndarray = 10.0 * np.log10(np.maximum(amin, magnitude))
+     log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
+
+     if top_db is not None:
+         if top_db < 0:
+             raise ParameterError("top_db must be non-negative")
+         log_spec = np.maximum(log_spec, log_spec.max() - top_db)
+
+     return log_spec
+
+
+ def frames_to_samples(
+     frames,
+     *,
+     hop_length: int = 512,
+     n_fft: int | None = None,
+ ) -> np.integer[Any] | np.ndarray:
+     """Convert frame indices to audio sample indices.
+
+     Parameters
+     ----------
+     frames : number or np.ndarray [shape=(n,)]
+         frame index or vector of frame indices
+     hop_length : int > 0 [scalar]
+         number of samples between successive frames
+     n_fft : None or int > 0 [scalar]
+         Optional: length of the FFT window.
+         If given, time conversion will include an offset of ``n_fft // 2``
+         to counteract windowing effects when using a non-centered STFT.
+
+     Returns
+     -------
+     times : number or np.ndarray
+         time (in samples) of each given frame number::
+
+             times[i] = frames[i] * hop_length
+
+     See Also
+     --------
+     frames_to_time : convert frame indices to time values
+     samples_to_frames : convert sample indices to frame indices
+
+     Examples
+     --------
+     >>> y, sr = librosa.load(librosa.ex('choice'))
+     >>> tempo, beats = librosa.beat.beat_track(y=y, sr=sr)
+     >>> beat_samples = librosa.frames_to_samples(beats)
+     """
+     offset = 0
+     if n_fft is not None:
+         offset = int(n_fft // 2)
+
+     return (np.asanyarray(frames) * hop_length + offset).astype(int)
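
Note: for this package the module boils down to one entry point, `trim()`. A minimal self-contained sketch on synthetic audio (a tone padded with silence; 24000 Hz matches SAMPLE_RATE in config.py):

```python
import numpy as np

from onyx_tts.trim import trim

sr = 24000  # matches SAMPLE_RATE in onyx_tts/config.py
tone = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(sr) / sr).astype(np.float32)
silence = np.zeros(sr // 2, dtype=np.float32)
y = np.concatenate([silence, tone, silence])  # 0.5s silence, 1s tone, 0.5s silence

y_trimmed, (start, end) = trim(y, top_db=60)
print(len(y) / sr, len(y_trimmed) / sr)  # ~2.0s in, roughly the ~1.0s of tone kept
```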
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ numpy>=2.0.2
+ onnxruntime>=1.20.1
+ colorlog>=6.9.0
+ espeakng-loader>=0.2.4
+ phonemizer-fork>=3.3.2