matthewliu0302 committed on
Commit
db53ed7
·
1 Parent(s): dea7d40

update voice sets

Browse files
aux_lm_residual_projection.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:228ee1401f2ce80378f007b6a5fef3f80f7350252d5ee0c1efa3f370692b0351
3
- size 76258956
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ef937d2f3d9390e0bfee2978a6805f3cd251326b1091355e1129250e11d020c
3
+ size 76094690
chute_config.yml CHANGED
@@ -1,20 +1,21 @@
1
- # Image + node + Chute for Vocence deploy. Required in the HF repo at build time.
2
-
3
  Image:
4
  from_base: parachutes/python:3.12
5
  run_command:
6
- - pip install torch torchaudio transformers accelerate huggingface_hub pyyaml soundfile librosa datasets peft llvmlite numba diffusers tqdm numpy scipy librosa ml-collections absl-py gradio av aiortc
 
 
7
  set_workdir: /app
8
 
9
  NodeSelector:
10
  gpu_count: 1
11
  min_vram_gb_per_gpu: 64
12
- include: ["pro_6000"]
 
13
  exclude: []
14
 
15
  Chute:
16
- tagline: Vocence TTS — Qwen3 PromptTTS (weights in repo)
17
- readme: Qwen3 12Hz TTS snapshot + miner.py for Vocence
18
  shutdown_after_seconds: 86400
19
  concurrency: 1
20
  max_instances: 1
 
 
 
1
  Image:
2
  from_base: parachutes/python:3.12
3
  run_command:
4
+ - pip install torch torchaudio
5
+ - pip install transformers==4.51.3
6
+ - pip install accelerate huggingface_hub pyyaml soundfile librosa datasets peft llvmlite numba diffusers tqdm numpy scipy librosa ml-collections absl-py gradio av aiortc
7
  set_workdir: /app
8
 
9
  NodeSelector:
10
  gpu_count: 1
11
  min_vram_gb_per_gpu: 64
12
+ include:
13
+ - pro_6000
14
  exclude: []
15
 
16
  Chute:
17
+ tagline: Vocence TTS — QWEN3Vox (weights + miner.py in repo)
18
+ readme: Repo-root miner.py, config.json, weights, aux_lm_residual_projection.safetensors; optional voices/*.wav for discrete conditioning (VOCENCE_PREFER_DISCRETE_COEFF_DIR).
19
  shutdown_after_seconds: 86400
20
  concurrency: 1
21
  max_instances: 1
miner.py CHANGED
@@ -76,7 +76,7 @@ class QWEN3VoxDataset:
76
  data["voice_prompts"] = user_provided_prompt
77
  else:
78
  try:
79
- target_sr = 24000
80
  wav_array = _load_audio_to_24k(
81
  item[self.audio_column], target_sr=target_sr
82
  )
@@ -157,7 +157,7 @@ def _apply_silence_with_crossfade(
157
  def _load_audio_to_24k(
158
  audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]],
159
  *,
160
- target_sr: int = 24000,
161
  augment_with_silence: bool = False,
162
  ) -> np.ndarray:
163
  if isinstance(audio, np.ndarray):
@@ -241,7 +241,7 @@ class QWEN3VoxCollator:
241
  )
242
  speech_input_mask_list = speech_input_mask[0].tolist()
243
  wav_target = _load_audio_to_24k(
244
- target_audio, target_sr=24000, augment_with_silence=True
245
  )
246
  target_latent_len = None
247
  try:
@@ -579,7 +579,7 @@ class QWEN3VoxDiffusionHeadConfig(PretrainedConfig):
579
  prediction_type="v_prediction",
580
  diffusion_type="ddpm",
581
  ddpm_num_steps=1000,
582
- ddpm_num_inference_steps=20,
583
  ddpm_beta_schedule="cosine",
584
  ddpm_batch_mul=4,
585
  **kwargs,
@@ -2119,7 +2119,7 @@ class QWEN3VoxTokenizerProcessor(FeatureExtractionMixin):
2119
 
2120
  def __init__(
2121
  self,
2122
- sampling_rate: int = 24000,
2123
  normalize_audio: bool = True,
2124
  target_dB_FS: float = -25,
2125
  eps: float = 1e-06,
@@ -3920,7 +3920,7 @@ class QWEN3VoxASRProcessor:
3920
  tokenizer=None,
3921
  audio_processor=None,
3922
  speech_tok_compress_ratio=320,
3923
- target_sample_rate=24000,
3924
  normalize_audio=True,
3925
  **kwargs,
3926
  ):
@@ -3982,7 +3982,7 @@ class QWEN3VoxASRProcessor:
3982
  logger.warning(f"Could not load preprocessor_config.json: {e }")
3983
  logger.warning("Using default configuration")
3984
  speech_tok_compress_ratio = config.get("speech_tok_compress_ratio", 3200)
3985
- target_sample_rate = config.get("target_sample_rate", 24000)
3986
  normalize_audio = config.get("normalize_audio", True)
3987
  language_model_pretrained_name = config.get(
3988
  "language_model_pretrained_name", None
@@ -4377,7 +4377,7 @@ class QWEN3VoxProcessor:
4377
  if "audio_processor" in config:
4378
  audio_config = config["audio_processor"]
4379
  audio_processor = QWEN3VoxTokenizerProcessor(
4380
- sampling_rate=audio_config.get("sampling_rate", 24000),
4381
  normalize_audio=audio_config.get("normalize_audio", True),
4382
  target_dB_FS=audio_config.get("target_dB_FS", -25),
4383
  eps=audio_config.get("eps", 1e-06),
@@ -4402,7 +4402,7 @@ class QWEN3VoxProcessor:
4402
  "db_normalize": self.db_normalize,
4403
  "audio_processor": {
4404
  "feature_extractor_type": "QWEN3VoxTokenizerProcessor",
4405
- "sampling_rate": getattr(self.audio_processor, "sampling_rate", 24000),
4406
  "normalize_audio": getattr(
4407
  self.audio_processor, "normalize_audio", True
4408
  ),
@@ -4899,7 +4899,7 @@ class QWEN3VoxStreamingProcessor:
4899
  if "audio_processor" in config:
4900
  audio_config = config["audio_processor"]
4901
  audio_processor = QWEN3VoxTokenizerProcessor(
4902
- sampling_rate=audio_config.get("sampling_rate", 24000),
4903
  normalize_audio=audio_config.get("normalize_audio", True),
4904
  target_dB_FS=audio_config.get("target_dB_FS", -25),
4905
  eps=audio_config.get("eps", 1e-06),
@@ -4924,7 +4924,7 @@ class QWEN3VoxStreamingProcessor:
4924
  "db_normalize": self.db_normalize,
4925
  "audio_processor": {
4926
  "feature_extractor_type": "QWEN3VoxTokenizerProcessor",
4927
- "sampling_rate": getattr(self.audio_processor, "sampling_rate", 24000),
4928
  "normalize_audio": getattr(
4929
  self.audio_processor, "normalize_audio", True
4930
  ),
@@ -7932,7 +7932,7 @@ class QWEN3VoxASRForConditionalGeneration(QWEN3VoxASRPreTrainedModel, Generation
7932
  if speech_tensors.ndim == 1:
7933
  speech_tensors = speech_tensors.unsqueeze(0)
7934
  batch_size, total_samples = speech_tensors.shape
7935
- sample_rate = 24000
7936
  segment_samples = int(streaming_segment_duration * sample_rate)
7937
  use_streaming = total_samples > segment_samples
7938
  with torch.no_grad():
@@ -8975,7 +8975,7 @@ def convert_q3_nnscaler_checkpoint_to_hf(
8975
  "db_normalize": True,
8976
  "audio_processor": {
8977
  "feature_extractor_type": "QWEN3VoxTokenizerProcessor",
8978
- "sampling_rate": 24000,
8979
  "normalize_audio": True,
8980
  "target_dB_FS": -25,
8981
  "eps": 1e-06,
@@ -9835,7 +9835,7 @@ class Miner:
9835
  torch.manual_seed(s)
9836
  if torch.cuda.is_available():
9837
  torch.cuda.manual_seed_all(s)
9838
- self._cfg_scale = float(os.environ.get("VOCENCE_CFG_SCALE", "1.2"))
9839
  self._disable_prefill = os.environ.get(
9840
  "VOCENCE_DISABLE_PREFILL", ""
9841
  ).lower() in ("1", "true", "yes")
@@ -9865,7 +9865,7 @@ class Miner:
9865
  self._model.eval()
9866
  self._model.set_ddpm_inference_steps(num_steps=20)
9867
  self._sample_rate = int(
9868
- getattr(self._processor.audio_processor, "sampling_rate", 24000)
9869
  )
9870
 
9871
  def _load_model_weights(
 
76
  data["voice_prompts"] = user_provided_prompt
77
  else:
78
  try:
79
+ target_sr = 22050
80
  wav_array = _load_audio_to_24k(
81
  item[self.audio_column], target_sr=target_sr
82
  )
 
157
  def _load_audio_to_24k(
158
  audio: Union[str, np.ndarray, torch.Tensor, Dict[str, Any]],
159
  *,
160
+ target_sr: int = 22050,
161
  augment_with_silence: bool = False,
162
  ) -> np.ndarray:
163
  if isinstance(audio, np.ndarray):
 
241
  )
242
  speech_input_mask_list = speech_input_mask[0].tolist()
243
  wav_target = _load_audio_to_24k(
244
+ target_audio, target_sr=22050, augment_with_silence=True
245
  )
246
  target_latent_len = None
247
  try:
 
579
  prediction_type="v_prediction",
580
  diffusion_type="ddpm",
581
  ddpm_num_steps=1000,
582
+ ddpm_num_inference_steps=30,
583
  ddpm_beta_schedule="cosine",
584
  ddpm_batch_mul=4,
585
  **kwargs,
 
2119
 
2120
  def __init__(
2121
  self,
2122
+ sampling_rate: int = 22050,
2123
  normalize_audio: bool = True,
2124
  target_dB_FS: float = -25,
2125
  eps: float = 1e-06,
 
3920
  tokenizer=None,
3921
  audio_processor=None,
3922
  speech_tok_compress_ratio=320,
3923
+ target_sample_rate=22050,
3924
  normalize_audio=True,
3925
  **kwargs,
3926
  ):
 
3982
  logger.warning(f"Could not load preprocessor_config.json: {e }")
3983
  logger.warning("Using default configuration")
3984
  speech_tok_compress_ratio = config.get("speech_tok_compress_ratio", 3200)
3985
+ target_sample_rate = config.get("target_sample_rate", 22050)
3986
  normalize_audio = config.get("normalize_audio", True)
3987
  language_model_pretrained_name = config.get(
3988
  "language_model_pretrained_name", None
 
4377
  if "audio_processor" in config:
4378
  audio_config = config["audio_processor"]
4379
  audio_processor = QWEN3VoxTokenizerProcessor(
4380
+ sampling_rate=audio_config.get("sampling_rate", 22050),
4381
  normalize_audio=audio_config.get("normalize_audio", True),
4382
  target_dB_FS=audio_config.get("target_dB_FS", -25),
4383
  eps=audio_config.get("eps", 1e-06),
 
4402
  "db_normalize": self.db_normalize,
4403
  "audio_processor": {
4404
  "feature_extractor_type": "QWEN3VoxTokenizerProcessor",
4405
+ "sampling_rate": getattr(self.audio_processor, "sampling_rate", 22050),
4406
  "normalize_audio": getattr(
4407
  self.audio_processor, "normalize_audio", True
4408
  ),
 
4899
  if "audio_processor" in config:
4900
  audio_config = config["audio_processor"]
4901
  audio_processor = QWEN3VoxTokenizerProcessor(
4902
+ sampling_rate=audio_config.get("sampling_rate", 22050),
4903
  normalize_audio=audio_config.get("normalize_audio", True),
4904
  target_dB_FS=audio_config.get("target_dB_FS", -25),
4905
  eps=audio_config.get("eps", 1e-06),
 
4924
  "db_normalize": self.db_normalize,
4925
  "audio_processor": {
4926
  "feature_extractor_type": "QWEN3VoxTokenizerProcessor",
4927
+ "sampling_rate": getattr(self.audio_processor, "sampling_rate", 22050),
4928
  "normalize_audio": getattr(
4929
  self.audio_processor, "normalize_audio", True
4930
  ),
 
7932
  if speech_tensors.ndim == 1:
7933
  speech_tensors = speech_tensors.unsqueeze(0)
7934
  batch_size, total_samples = speech_tensors.shape
7935
+ sample_rate = 22050
7936
  segment_samples = int(streaming_segment_duration * sample_rate)
7937
  use_streaming = total_samples > segment_samples
7938
  with torch.no_grad():
 
8975
  "db_normalize": True,
8976
  "audio_processor": {
8977
  "feature_extractor_type": "QWEN3VoxTokenizerProcessor",
8978
+ "sampling_rate": 22050,
8979
  "normalize_audio": True,
8980
  "target_dB_FS": -25,
8981
  "eps": 1e-06,
 
9835
  torch.manual_seed(s)
9836
  if torch.cuda.is_available():
9837
  torch.cuda.manual_seed_all(s)
9838
+ self._cfg_scale = float(os.environ.get("VOCENCE_CFG_SCALE", "1.3"))
9839
  self._disable_prefill = os.environ.get(
9840
  "VOCENCE_DISABLE_PREFILL", ""
9841
  ).lower() in ("1", "true", "yes")
 
9865
  self._model.eval()
9866
  self._model.set_ddpm_inference_steps(num_steps=20)
9867
  self._sample_rate = int(
9868
+ getattr(self._processor.audio_processor, "sampling_rate", 22050)
9869
  )
9870
 
9871
  def _load_model_weights(