Commit ·
db53ed7
1
Parent(s): dea7d40
update voice sets
Browse files
- aux_lm_residual_projection.safetensors +2 -2
- chute_config.yml +7 -6
- miner.py +15 -15
aux_lm_residual_projection.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ef937d2f3d9390e0bfee2978a6805f3cd251326b1091355e1129250e11d020c
|
| 3 |
+
size 76094690
|
chute_config.yml
CHANGED
|
@@ -1,20 +1,21 @@
|
|
| 1 |
-
# Image + node + Chute for Vocence deploy. Required in the HF repo at build time.
|
| 2 |
-
|
| 3 |
Image:
|
| 4 |
from_base: parachutes/python:3.12
|
| 5 |
run_command:
|
| 6 |
-
- pip install torch torchaudio
|
|
|
|
|
|
|
| 7 |
set_workdir: /app
|
| 8 |
|
| 9 |
NodeSelector:
|
| 10 |
gpu_count: 1
|
| 11 |
min_vram_gb_per_gpu: 64
|
| 12 |
-
include:
|
|
|
|
| 13 |
exclude: []
|
| 14 |
|
| 15 |
Chute:
|
| 16 |
-
tagline: Vocence TTS —
|
| 17 |
-
readme:
|
| 18 |
shutdown_after_seconds: 86400
|
| 19 |
concurrency: 1
|
| 20 |
max_instances: 1
|
|
|
|
|
|
|
|
|
|
| 1 |
Image:
|
| 2 |
from_base: parachutes/python:3.12
|
| 3 |
run_command:
|
| 4 |
+
- pip install torch torchaudio
|
| 5 |
+
- pip install transformers==4.51.3
|
| 6 |
+
- pip install accelerate huggingface_hub pyyaml soundfile librosa datasets peft llvmlite numba diffusers tqdm numpy scipy ml-collections absl-py gradio av aiortc
|
| 7 |
set_workdir: /app
|
| 8 |
|
| 9 |
NodeSelector:
|
| 10 |
gpu_count: 1
|
| 11 |
min_vram_gb_per_gpu: 64
|
| 12 |
+
include:
|
| 13 |
+
- pro_6000
|
| 14 |
exclude: []
|
| 15 |
|
| 16 |
Chute:
|
| 17 |
+
tagline: Vocence TTS — QWEN3Vox (weights + miner.py in repo)
|
| 18 |
+
readme: Repo-root miner.py, config.json, weights, aux_lm_residual_projection.safetensors; optional voices/*.wav for discrete conditioning (VOCENCE_PREFER_DISCRETE_COEFF_DIR).
|
| 19 |
shutdown_after_seconds: 86400
|
| 20 |
concurrency: 1
|
| 21 |
max_instances: 1
|
miner.py
CHANGED
|
@@ -76,7 +76,7 @@ class QWEN3VoxDataset:
|
|
| 76 |
data["voice_prompts"] = user_provided_prompt
|
| 77 |
else:
|
| 78 |
try:
|
| 79 |
-
target_sr =
|
| 80 |
wav_array = _load_audio_to_24k(
|
| 81 |
item[self.audio_column], target_sr=target_sr
|
| 82 |
)
|
|
@@ -157,7 +157,7 @@ def _apply_silence_with_crossfade(
|
|
| 157 |
def _load_audio_to_24k(
|
| 158 |
audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]],
|
| 159 |
*,
|
| 160 |
-
target_sr: int =
|
| 161 |
augment_with_silence: bool = False,
|
| 162 |
) -> np.ndarray:
|
| 163 |
if isinstance(audio, np.ndarray):
|
|
@@ -241,7 +241,7 @@ class QWEN3VoxCollator:
|
|
| 241 |
)
|
| 242 |
speech_input_mask_list = speech_input_mask[0].tolist()
|
| 243 |
wav_target = _load_audio_to_24k(
|
| 244 |
-
target_audio, target_sr=
|
| 245 |
)
|
| 246 |
target_latent_len = None
|
| 247 |
try:
|
|
@@ -579,7 +579,7 @@ class QWEN3VoxDiffusionHeadConfig(PretrainedConfig):
|
|
| 579 |
prediction_type="v_prediction",
|
| 580 |
diffusion_type="ddpm",
|
| 581 |
ddpm_num_steps=1000,
|
| 582 |
-
ddpm_num_inference_steps=
|
| 583 |
ddpm_beta_schedule="cosine",
|
| 584 |
ddpm_batch_mul=4,
|
| 585 |
**kwargs,
|
|
@@ -2119,7 +2119,7 @@ class QWEN3VoxTokenizerProcessor(FeatureExtractionMixin):
|
|
| 2119 |
|
| 2120 |
def __init__(
|
| 2121 |
self,
|
| 2122 |
-
sampling_rate: int =
|
| 2123 |
normalize_audio: bool = True,
|
| 2124 |
target_dB_FS: float = -25,
|
| 2125 |
eps: float = 1e-06,
|
|
@@ -3920,7 +3920,7 @@ class QWEN3VoxASRProcessor:
|
|
| 3920 |
tokenizer=None,
|
| 3921 |
audio_processor=None,
|
| 3922 |
speech_tok_compress_ratio=320,
|
| 3923 |
-
target_sample_rate=
|
| 3924 |
normalize_audio=True,
|
| 3925 |
**kwargs,
|
| 3926 |
):
|
|
@@ -3982,7 +3982,7 @@ class QWEN3VoxASRProcessor:
|
|
| 3982 |
logger.warning(f"Could not load preprocessor_config.json: {e }")
|
| 3983 |
logger.warning("Using default configuration")
|
| 3984 |
speech_tok_compress_ratio = config.get("speech_tok_compress_ratio", 3200)
|
| 3985 |
-
target_sample_rate = config.get("target_sample_rate",
|
| 3986 |
normalize_audio = config.get("normalize_audio", True)
|
| 3987 |
language_model_pretrained_name = config.get(
|
| 3988 |
"language_model_pretrained_name", None
|
|
@@ -4377,7 +4377,7 @@ class QWEN3VoxProcessor:
|
|
| 4377 |
if "audio_processor" in config:
|
| 4378 |
audio_config = config["audio_processor"]
|
| 4379 |
audio_processor = QWEN3VoxTokenizerProcessor(
|
| 4380 |
-
sampling_rate=audio_config.get("sampling_rate",
|
| 4381 |
normalize_audio=audio_config.get("normalize_audio", True),
|
| 4382 |
target_dB_FS=audio_config.get("target_dB_FS", -25),
|
| 4383 |
eps=audio_config.get("eps", 1e-06),
|
|
@@ -4402,7 +4402,7 @@ class QWEN3VoxProcessor:
|
|
| 4402 |
"db_normalize": self.db_normalize,
|
| 4403 |
"audio_processor": {
|
| 4404 |
"feature_extractor_type": "QWEN3VoxTokenizerProcessor",
|
| 4405 |
-
"sampling_rate": getattr(self.audio_processor, "sampling_rate",
|
| 4406 |
"normalize_audio": getattr(
|
| 4407 |
self.audio_processor, "normalize_audio", True
|
| 4408 |
),
|
|
@@ -4899,7 +4899,7 @@ class QWEN3VoxStreamingProcessor:
|
|
| 4899 |
if "audio_processor" in config:
|
| 4900 |
audio_config = config["audio_processor"]
|
| 4901 |
audio_processor = QWEN3VoxTokenizerProcessor(
|
| 4902 |
-
sampling_rate=audio_config.get("sampling_rate",
|
| 4903 |
normalize_audio=audio_config.get("normalize_audio", True),
|
| 4904 |
target_dB_FS=audio_config.get("target_dB_FS", -25),
|
| 4905 |
eps=audio_config.get("eps", 1e-06),
|
|
@@ -4924,7 +4924,7 @@ class QWEN3VoxStreamingProcessor:
|
|
| 4924 |
"db_normalize": self.db_normalize,
|
| 4925 |
"audio_processor": {
|
| 4926 |
"feature_extractor_type": "QWEN3VoxTokenizerProcessor",
|
| 4927 |
-
"sampling_rate": getattr(self.audio_processor, "sampling_rate",
|
| 4928 |
"normalize_audio": getattr(
|
| 4929 |
self.audio_processor, "normalize_audio", True
|
| 4930 |
),
|
|
@@ -7932,7 +7932,7 @@ class QWEN3VoxASRForConditionalGeneration(QWEN3VoxASRPreTrainedModel, Generation
|
|
| 7932 |
if speech_tensors.ndim == 1:
|
| 7933 |
speech_tensors = speech_tensors.unsqueeze(0)
|
| 7934 |
batch_size, total_samples = speech_tensors.shape
|
| 7935 |
-
sample_rate =
|
| 7936 |
segment_samples = int(streaming_segment_duration * sample_rate)
|
| 7937 |
use_streaming = total_samples > segment_samples
|
| 7938 |
with torch.no_grad():
|
|
@@ -8975,7 +8975,7 @@ def convert_q3_nnscaler_checkpoint_to_hf(
|
|
| 8975 |
"db_normalize": True,
|
| 8976 |
"audio_processor": {
|
| 8977 |
"feature_extractor_type": "QWEN3VoxTokenizerProcessor",
|
| 8978 |
-
"sampling_rate":
|
| 8979 |
"normalize_audio": True,
|
| 8980 |
"target_dB_FS": -25,
|
| 8981 |
"eps": 1e-06,
|
|
@@ -9835,7 +9835,7 @@ class Miner:
|
|
| 9835 |
torch.manual_seed(s)
|
| 9836 |
if torch.cuda.is_available():
|
| 9837 |
torch.cuda.manual_seed_all(s)
|
| 9838 |
-
self._cfg_scale = float(os.environ.get("VOCENCE_CFG_SCALE", "1.
|
| 9839 |
self._disable_prefill = os.environ.get(
|
| 9840 |
"VOCENCE_DISABLE_PREFILL", ""
|
| 9841 |
).lower() in ("1", "true", "yes")
|
|
@@ -9865,7 +9865,7 @@ class Miner:
|
|
| 9865 |
self._model.eval()
|
| 9866 |
self._model.set_ddpm_inference_steps(num_steps=20)
|
| 9867 |
self._sample_rate = int(
|
| 9868 |
-
getattr(self._processor.audio_processor, "sampling_rate",
|
| 9869 |
)
|
| 9870 |
|
| 9871 |
def _load_model_weights(
|
|
|
|
| 76 |
data["voice_prompts"] = user_provided_prompt
|
| 77 |
else:
|
| 78 |
try:
|
| 79 |
+
target_sr = 22050
|
| 80 |
wav_array = _load_audio_to_24k(
|
| 81 |
item[self.audio_column], target_sr=target_sr
|
| 82 |
)
|
|
|
|
| 157 |
def _load_audio_to_24k(
|
| 158 |
audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]],
|
| 159 |
*,
|
| 160 |
+
target_sr: int = 22050,
|
| 161 |
augment_with_silence: bool = False,
|
| 162 |
) -> np.ndarray:
|
| 163 |
if isinstance(audio, np.ndarray):
|
|
|
|
| 241 |
)
|
| 242 |
speech_input_mask_list = speech_input_mask[0].tolist()
|
| 243 |
wav_target = _load_audio_to_24k(
|
| 244 |
+
target_audio, target_sr=22050, augment_with_silence=True
|
| 245 |
)
|
| 246 |
target_latent_len = None
|
| 247 |
try:
|
|
|
|
| 579 |
prediction_type="v_prediction",
|
| 580 |
diffusion_type="ddpm",
|
| 581 |
ddpm_num_steps=1000,
|
| 582 |
+
ddpm_num_inference_steps=30,
|
| 583 |
ddpm_beta_schedule="cosine",
|
| 584 |
ddpm_batch_mul=4,
|
| 585 |
**kwargs,
|
|
|
|
| 2119 |
|
| 2120 |
def __init__(
|
| 2121 |
self,
|
| 2122 |
+
sampling_rate: int = 22050,
|
| 2123 |
normalize_audio: bool = True,
|
| 2124 |
target_dB_FS: float = -25,
|
| 2125 |
eps: float = 1e-06,
|
|
|
|
| 3920 |
tokenizer=None,
|
| 3921 |
audio_processor=None,
|
| 3922 |
speech_tok_compress_ratio=320,
|
| 3923 |
+
target_sample_rate=22050,
|
| 3924 |
normalize_audio=True,
|
| 3925 |
**kwargs,
|
| 3926 |
):
|
|
|
|
| 3982 |
logger.warning(f"Could not load preprocessor_config.json: {e }")
|
| 3983 |
logger.warning("Using default configuration")
|
| 3984 |
speech_tok_compress_ratio = config.get("speech_tok_compress_ratio", 3200)
|
| 3985 |
+
target_sample_rate = config.get("target_sample_rate", 22050)
|
| 3986 |
normalize_audio = config.get("normalize_audio", True)
|
| 3987 |
language_model_pretrained_name = config.get(
|
| 3988 |
"language_model_pretrained_name", None
|
|
|
|
| 4377 |
if "audio_processor" in config:
|
| 4378 |
audio_config = config["audio_processor"]
|
| 4379 |
audio_processor = QWEN3VoxTokenizerProcessor(
|
| 4380 |
+
sampling_rate=audio_config.get("sampling_rate", 22050),
|
| 4381 |
normalize_audio=audio_config.get("normalize_audio", True),
|
| 4382 |
target_dB_FS=audio_config.get("target_dB_FS", -25),
|
| 4383 |
eps=audio_config.get("eps", 1e-06),
|
|
|
|
| 4402 |
"db_normalize": self.db_normalize,
|
| 4403 |
"audio_processor": {
|
| 4404 |
"feature_extractor_type": "QWEN3VoxTokenizerProcessor",
|
| 4405 |
+
"sampling_rate": getattr(self.audio_processor, "sampling_rate", 22050),
|
| 4406 |
"normalize_audio": getattr(
|
| 4407 |
self.audio_processor, "normalize_audio", True
|
| 4408 |
),
|
|
|
|
| 4899 |
if "audio_processor" in config:
|
| 4900 |
audio_config = config["audio_processor"]
|
| 4901 |
audio_processor = QWEN3VoxTokenizerProcessor(
|
| 4902 |
+
sampling_rate=audio_config.get("sampling_rate", 22050),
|
| 4903 |
normalize_audio=audio_config.get("normalize_audio", True),
|
| 4904 |
target_dB_FS=audio_config.get("target_dB_FS", -25),
|
| 4905 |
eps=audio_config.get("eps", 1e-06),
|
|
|
|
| 4924 |
"db_normalize": self.db_normalize,
|
| 4925 |
"audio_processor": {
|
| 4926 |
"feature_extractor_type": "QWEN3VoxTokenizerProcessor",
|
| 4927 |
+
"sampling_rate": getattr(self.audio_processor, "sampling_rate", 22050),
|
| 4928 |
"normalize_audio": getattr(
|
| 4929 |
self.audio_processor, "normalize_audio", True
|
| 4930 |
),
|
|
|
|
| 7932 |
if speech_tensors.ndim == 1:
|
| 7933 |
speech_tensors = speech_tensors.unsqueeze(0)
|
| 7934 |
batch_size, total_samples = speech_tensors.shape
|
| 7935 |
+
sample_rate = 22050
|
| 7936 |
segment_samples = int(streaming_segment_duration * sample_rate)
|
| 7937 |
use_streaming = total_samples > segment_samples
|
| 7938 |
with torch.no_grad():
|
|
|
|
| 8975 |
"db_normalize": True,
|
| 8976 |
"audio_processor": {
|
| 8977 |
"feature_extractor_type": "QWEN3VoxTokenizerProcessor",
|
| 8978 |
+
"sampling_rate": 22050,
|
| 8979 |
"normalize_audio": True,
|
| 8980 |
"target_dB_FS": -25,
|
| 8981 |
"eps": 1e-06,
|
|
|
|
| 9835 |
torch.manual_seed(s)
|
| 9836 |
if torch.cuda.is_available():
|
| 9837 |
torch.cuda.manual_seed_all(s)
|
| 9838 |
+
self._cfg_scale = float(os.environ.get("VOCENCE_CFG_SCALE", "1.3"))
|
| 9839 |
self._disable_prefill = os.environ.get(
|
| 9840 |
"VOCENCE_DISABLE_PREFILL", ""
|
| 9841 |
).lower() in ("1", "true", "yes")
|
|
|
|
| 9865 |
self._model.eval()
|
| 9866 |
self._model.set_ddpm_inference_steps(num_steps=20)
|
| 9867 |
self._sample_rate = int(
|
| 9868 |
+
getattr(self._processor.audio_processor, "sampling_rate", 22050)
|
| 9869 |
)
|
| 9870 |
|
| 9871 |
def _load_model_weights(
|