Spaces:
Running on Zero
- app.py +18 -18
- qwen_tts/core/models/modeling_qwen3_tts.py +2 -2
app.py
CHANGED
|
@@ -216,7 +216,7 @@ def infer_voice_design(part, language, voice_description):
|
|
| 216 |
def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
| 217 |
"""Single segment inference for Voice Clone using reference audio."""
|
| 218 |
# tts = BASE_MODELS[model_size]
|
| 219 |
-
seed_everything(42)
|
| 220 |
tts = load_model("Base", "0.6B")
|
| 221 |
voice_clone_prompt = tts.create_voice_clone_prompt(
|
| 222 |
ref_audio=audio_tuple,
|
|
@@ -289,25 +289,25 @@ def extract_voice_clone_prompt(ref_audio,ref_text,use_xvector_only):
|
|
| 289 |
logger.info(f"开始 Whisper 语音识别任务。模型: {model_size}, 音频路径: {ref_audio}")
|
| 290 |
r_text = ref_text
|
| 291 |
uxo = use_xvector_only
|
| 292 |
-
try:
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
except Exception as e:
|
| 310 |
-
|
| 311 |
|
| 312 |
voice_clone_prompt_items = tts.create_voice_clone_prompt(
|
| 313 |
ref_audio=audio_tuple,
|
|
|
|
| 216 |
def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
| 217 |
"""Single segment inference for Voice Clone using reference audio."""
|
| 218 |
# tts = BASE_MODELS[model_size]
|
| 219 |
+
# seed_everything(42)
|
| 220 |
tts = load_model("Base", "0.6B")
|
| 221 |
voice_clone_prompt = tts.create_voice_clone_prompt(
|
| 222 |
ref_audio=audio_tuple,
|
|
|
|
| 289 |
logger.info(f"开始 Whisper 语音识别任务。模型: {model_size}, 音频路径: {ref_audio}")
|
| 290 |
r_text = ref_text
|
| 291 |
uxo = use_xvector_only
|
| 292 |
+
# try:
|
| 293 |
+
# whisper_model = load_whisper_model(model_size)
|
| 294 |
+
# # 使用 transcribe 方法进行转录
|
| 295 |
+
# # whisper 会自动处理音频加载和重采样
|
| 296 |
+
# audio, sr = audio_tuple
|
| 297 |
|
| 298 |
+
# # Whisper 模型期望 16000Hz 采样率的音频
|
| 299 |
+
# if sr != 16000:
|
| 300 |
+
# logger.info(f"重采样音频: {sr}Hz -> 16000Hz")
|
| 301 |
+
# audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
|
| 302 |
|
| 303 |
+
# result = whisper_model.transcribe(audio)
|
| 304 |
+
# text = result["text"]
|
| 305 |
+
# logger.info(f"Whisper 识别完成。文本长度: {len(text)}")
|
| 306 |
+
# r_text = text.strip()
|
| 307 |
+
# logger.error(f"Whisper 识别成功:{r_text}")
|
| 308 |
+
# uxo = False
|
| 309 |
+
# except Exception as e:
|
| 310 |
+
# logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
|
| 311 |
|
| 312 |
voice_clone_prompt_items = tts.create_voice_clone_prompt(
|
| 313 |
ref_audio=audio_tuple,
|
qwen_tts/core/models/modeling_qwen3_tts.py
CHANGED
|
@@ -1956,7 +1956,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
|
|
| 1956 |
@torch.inference_mode()
|
| 1957 |
def generate_speaker_prompt(
|
| 1958 |
self,
|
| 1959 |
-
voice_clone_prompt:
|
| 1960 |
):
|
| 1961 |
voice_clone_spk_embeds = []
|
| 1962 |
for index in range(len(voice_clone_prompt['ref_spk_embedding'])):
|
|
@@ -2024,7 +2024,7 @@ class Qwen3TTSForConditionalGeneration(Qwen3TTSPreTrainedModel, GenerationMixin)
|
|
| 2024 |
input_ids: Optional[list[torch.Tensor]] = None,
|
| 2025 |
instruct_ids: Optional[list[torch.Tensor]] = None,
|
| 2026 |
ref_ids: Optional[list[torch.Tensor]] = None,
|
| 2027 |
-
voice_clone_prompt:
|
| 2028 |
languages: list[str] = None,
|
| 2029 |
speakers: list[str] = None,
|
| 2030 |
non_streaming_mode = False,
|
|
|
|
| 1956 |
@torch.inference_mode()
|
| 1957 |
def generate_speaker_prompt(
|
| 1958 |
self,
|
| 1959 |
+
voice_clone_prompt: dict
|
| 1960 |
):
|
| 1961 |
voice_clone_spk_embeds = []
|
| 1962 |
for index in range(len(voice_clone_prompt['ref_spk_embedding'])):
|
|
|
|
| 2024 |
input_ids: Optional[list[torch.Tensor]] = None,
|
| 2025 |
instruct_ids: Optional[list[torch.Tensor]] = None,
|
| 2026 |
ref_ids: Optional[list[torch.Tensor]] = None,
|
| 2027 |
+
voice_clone_prompt: dict = None,
|
| 2028 |
languages: list[str] = None,
|
| 2029 |
speakers: list[str] = None,
|
| 2030 |
non_streaming_mode = False,
|