Spaces:
Sleeping
Sleeping
# Smoke test for the fine-tuned XTTS-v2 "C3PO" voice model: loads the model,
# conditions it on a reference clip and writes a few synthesized WAV files.
import os
import subprocess

# Set environment variables.
# These must be in place BEFORE the first `TTS` import: Numba reads its
# configuration from the environment when it is imported, and the TTS package
# presumably pulls Numba in transitively (via librosa) -- setting the flag
# afterwards would have no effect.
os.environ['COQUI_TOS_AGREED'] = '1'  # presumably skips the interactive Coqui licence prompt
os.environ['NUMBA_DISABLE_JIT'] = '1'

# The imports below intentionally follow the environment setup (E402).
import torch
import torch.serialization
import torchaudio

from TTS.api import TTS
from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts, XttsArgs, XttsAudioConfig
from TTS.utils.generic_utils import get_user_data_dir

# Fix PyTorch weights_only issue for XTTS.
# Newer PyTorch releases default `torch.load` to weights_only=True, which
# refuses to unpickle classes that are not allow-listed. XTTS checkpoints
# reference several config classes, so all of them are registered here.
# Older PyTorch builds have no `add_safe_globals` (and no such restriction),
# hence the guard.
if hasattr(torch.serialization, "add_safe_globals"):
    torch.serialization.add_safe_globals(
        [XttsConfig, XttsAudioConfig, BaseDatasetConfig, XttsArgs]
    )
print("Testing XTTS C3PO voice cloning...")

# C3PO model path
model_path = "XTTS-v2_C3PO/"
config_path = "XTTS-v2_C3PO/config.json"

# Check if model files exist, if not download them.
# The presence of config.json is used as the marker for "model is available".
if not os.path.exists(config_path):
    print("C3PO model not found locally, downloading...")
    try:
        # NOTE(review): model.pth is presumably stored via Git LFS; without
        # git-lfs installed the clone may only contain a pointer file -- verify.
        subprocess.run(
            [
                "git", "clone",
                "https://huggingface.co/Borcherding/XTTS-v2_C3PO",
                "XTTS-v2_C3PO",
            ],
            check=True,
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: git ran but exited non-zero.
        # OSError (e.g. FileNotFoundError): git could not be started at all.
        print(f"Failed to download C3PO model: {e}")
        # `exit()` is injected by the `site` module and is not guaranteed to
        # exist; raising SystemExit is the equivalent that always works.
        raise SystemExit(1)
    else:
        print("C3PO model downloaded successfully")
# Build the model from its JSON configuration
config = XttsConfig()
config.load_json(config_path)
model = Xtts.init_from_config(config)

# Load the checkpoint weights and the vocabulary shipped in the model directory
checkpoint_file = os.path.join(model_path, "model.pth")
vocab_file = os.path.join(model_path, "vocab.json")
model.load_checkpoint(
    config,
    checkpoint_path=checkpoint_file,
    vocab_path=vocab_file,
    eval=True,
)

# Move the model to the GPU when CUDA is available; otherwise it stays on the CPU
if torch.cuda.is_available():
    device = "cuda"
    model.cuda()
else:
    device = "cpu"
print(f"C3PO model loaded on {device}")
# Text to convert to speech
text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"

# Look for reference audio in the C3PO model directory.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order -- without it, which clip gets picked could differ between runs.
# The extension check is case-insensitive so files like "REF.WAV" match too.
reference_audio_path = None
for file in sorted(os.listdir(model_path)):
    if file.lower().endswith(('.wav', '.mp3', '.m4a')):
        reference_audio_path = os.path.join(model_path, file)
        print(f"Found C3PO reference audio: {file}")
        break

# If no reference audio found, create a simple test reference
if reference_audio_path is None:
    print("No reference audio found in C3PO model, creating test reference...")
    reference_audio_path = "test_reference.wav"

    # Generate a simple sine wave as placeholder; it only keeps the rest of
    # the pipeline runnable when the model directory ships no audio clip.
    import numpy as np

    sample_rate = 24000
    duration = 3  # seconds
    frequency = 440  # Hz
    # endpoint=False keeps the sample spacing at exactly 1 / sample_rate.
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
    audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)

    # Save as WAV. NumPy produces float64 samples, which not every torchaudio
    # backend accepts, so the waveform is cast to float32 first. unsqueeze(0)
    # adds the leading dimension so the tensor is 2-D when handed to save().
    waveform = torch.tensor(audio_data, dtype=torch.float32).unsqueeze(0)
    torchaudio.save(reference_audio_path, waveform, sample_rate)
    print(f"Test reference audio created: {reference_audio_path}")
# Settings shared by every inference call and every saved clip below
sampling_options = {"repetition_penalty": 5.0, "temperature": 0.75}
output_rate = 24000

try:
    # Derive the conditioning latents and speaker embedding from the reference clip
    print("Processing reference audio...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=reference_audio_path,
        gpt_cond_len=30,
        gpt_cond_chunk_len=4,
        max_ref_length=60,
    )

    # English sample
    print("Generating C3PO speech...")
    out = model.inference(
        text,
        "en",  # language
        gpt_cond_latent,
        speaker_embedding,
        **sampling_options,
    )
    output_path = "c3po_test_output.wav"
    english_wave = torch.tensor(out["wav"]).unsqueeze(0)
    torchaudio.save(output_path, english_wave, output_rate)
    print(f"C3PO speech generated successfully! Saved as: {output_path}")

    # Same voice conditioning, other languages
    print("\nTesting multilingual C3PO...")
    multilingual_tests = [
        ("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
        ("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
        ("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
    ]
    for language_code, sentence in multilingual_tests:
        language_label = language_code.upper()
        print(f"Generating {language_label} speech...")
        out = model.inference(
            sentence,
            language_code,
            gpt_cond_latent,
            speaker_embedding,
            **sampling_options,
        )
        output_path = f"c3po_test_{language_code}.wav"
        translated_wave = torch.tensor(out["wav"]).unsqueeze(0)
        torchaudio.save(output_path, translated_wave, output_rate)
        print(f"C3PO {language_label} speech saved as: {output_path}")
except Exception as e:
    # Top-level boundary of the script: report the failure with its traceback
    # and fall through to the summary instead of crashing.
    import traceback

    print(f"Error during speech generation: {e}")
    traceback.print_exc()
print("XTTS C3PO test completed!")
print("\nGenerated files:")

# List every WAV in the working directory whose name starts with "c3po_test"
generated_names = [
    entry
    for entry in os.listdir(".")
    if entry.startswith("c3po_test") and entry.endswith(".wav")
]
for entry in generated_names:
    print(f" - {entry}")