smartwang committed on
Commit
652ae4a
·
1 Parent(s): e8a5955
app.py CHANGED
@@ -216,7 +216,7 @@ def infer_voice_design(part, language, voice_description):
216
  def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
217
  """Single segment inference for Voice Clone using reference audio."""
218
  # tts = BASE_MODELS[model_size]
219
- seed_everything(42)
220
  tts = load_model("Base", "0.6B")
221
  voice_clone_prompt = tts.create_voice_clone_prompt(
222
  ref_audio=audio_tuple,
@@ -289,25 +289,25 @@ def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
289
  logger.info(f"开始 Whisper 语音识别任务。模型: {model_size}, 音频路径: {ref_audio}")
290
  r_text = ref_text
291
  uxo = use_xvector_only
292
- try:
293
- whisper_model = load_whisper_model(model_size)
294
- # 使用 transcribe 方法进行转录
295
- # whisper 会自动处理音频加载和重采样
296
- audio, sr = audio_tuple
297
 
298
- # Whisper 模型期望 16000Hz 采样率的音频
299
- if sr != 16000:
300
- logger.info(f"重采样音频: {sr}Hz -> 16000Hz")
301
- audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
302
 
303
- result = whisper_model.transcribe(audio)
304
- text = result["text"]
305
- logger.info(f"Whisper 识别完成。文本长度: {len(text)}")
306
- r_text = text.strip()
307
- logger.error(f"Whisper 识别成功:{r_text}")
308
- uxo = False
309
- except Exception as e:
310
- logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
311
 
312
  voice_clone_prompt_items = tts.create_voice_clone_prompt(
313
  ref_audio=audio_tuple,
 
216
  def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
217
  """Single segment inference for Voice Clone using reference audio."""
218
  # tts = BASE_MODELS[model_size]
219
+ # seed_everything(42)
220
  tts = load_model("Base", "0.6B")
221
  voice_clone_prompt = tts.create_voice_clone_prompt(
222
  ref_audio=audio_tuple,
 
289
  logger.info(f"开始 Whisper 语音识别任务。模型: {model_size}, 音频路径: {ref_audio}")
290
  r_text = ref_text
291
  uxo = use_xvector_only
292
+ # try:
293
+ # whisper_model = load_whisper_model(model_size)
294
+ # # 使用 transcribe 方法进行转录
295
+ # # whisper 会自动处理音频加载和重采样
296
+ # audio, sr = audio_tuple
297
 
298
+ # # Whisper 模型期望 16000Hz 采样率的音频
299
+ # if sr != 16000:
300
+ # logger.info(f"重采样音频: {sr}Hz -> 16000Hz")
301
+ # audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
302
 
303
+ # result = whisper_model.transcribe(audio)
304
+ # text = result["text"]
305
+ # logger.info(f"Whisper 识别完成。文本长度: {len(text)}")
306
+ # r_text = text.strip()
307
+ # logger.error(f"Whisper 识别成功:{r_text}")
308
+ # uxo = False
309
+ # except Exception as e:
310
+ # logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
311
 
312
  voice_clone_prompt_items = tts.create_voice_clone_prompt(
313
  ref_audio=audio_tuple,
qwen_tts/core/models/modeling_qwen3_tts.py CHANGED
@@ -1956,7 +1956,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
1956
  @torch.inference_mode()
1957
  def generate_speaker_prompt(
1958
  self,
1959
- voice_clone_prompt: list[dict]
1960
  ):
1961
  voice_clone_spk_embeds = []
1962
  for index in range(len(voice_clone_prompt['ref_spk_embedding'])):
@@ -2024,7 +2024,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
2024
  input_ids: Optional[list[torch.Tensor]] = None,
2025
  instruct_ids: Optional[list[torch.Tensor]] = None,
2026
  ref_ids: Optional[list[torch.Tensor]] = None,
2027
- voice_clone_prompt: list[dict] = None,
2028
  languages: list[str] = None,
2029
  speakers: list[str] = None,
2030
  non_streaming_mode = False,
 
1956
  @torch.inference_mode()
1957
  def generate_speaker_prompt(
1958
  self,
1959
+ voice_clone_prompt: dict
1960
  ):
1961
  voice_clone_spk_embeds = []
1962
  for index in range(len(voice_clone_prompt['ref_spk_embedding'])):
 
2024
  input_ids: Optional[list[torch.Tensor]] = None,
2025
  instruct_ids: Optional[list[torch.Tensor]] = None,
2026
  ref_ids: Optional[list[torch.Tensor]] = None,
2027
+ voice_clone_prompt: dict = None,
2028
  languages: list[str] = None,
2029
  speakers: list[str] = None,
2030
  non_streaming_mode = False,