| | """ |
| | Example usage of Romanian Matcha-TTS models with HuggingFace integration |
| | |
| | This script shows how to use the HuggingFace model loader with the original |
| | Matcha-TTS repository for inference. |
| | """ |
| |
|
| | import sys |
| | import os |
| | import torch |
| | import soundfile as sf |
| | from pathlib import Path |
| |
|
| | |
| | sys.path.insert(0, str(Path(__file__).parent.parent / "src")) |
| |
|
| | |
| | from model_loader import ModelLoader |
| |
|
def load_matcha_dependencies():
    """
    Attempt to import the Matcha-TTS runtime components.

    Make sure you have the main repository installed:
    pip install git+https://github.com/adrianstanea/Matcha-TTS.git

    Returns:
        dict mapping component names ('MatchaTTS', 'HiFiGAN', 'v1',
        'AttrDict', 'Denoiser', 'text_to_sequence', 'intersperse') to the
        imported objects, or None when the package is not available.
    """
    try:
        from matcha.hifigan.config import v1
        from matcha.hifigan.denoiser import Denoiser
        from matcha.hifigan.env import AttrDict
        from matcha.hifigan.models import Generator as HiFiGAN
        from matcha.models.matcha_tts import MatchaTTS
        from matcha.text import text_to_sequence
        from matcha.utils.utils import intersperse
    except ImportError as e:
        # Report the missing dependency and signal failure to the caller.
        print(f"Error importing Matcha-TTS dependencies: {e}")
        print("Please install the main repository:")
        print("pip install git+https://github.com/adrianstanea/Matcha-TTS.git")
        return None

    return {
        'MatchaTTS': MatchaTTS,
        'HiFiGAN': HiFiGAN,
        'v1': v1,
        'AttrDict': AttrDict,
        'Denoiser': Denoiser,
        'text_to_sequence': text_to_sequence,
        'intersperse': intersperse,
    }

def synthesize_romanian(text: str, model: str = "bas_950", repo_path: "str | None" = None):
    """
    Synthesize Romanian speech using the HuggingFace model loader.

    Args:
        text: Romanian text to synthesize.
        model: Model name (swara, bas_10, bas_950, sgs_10, sgs_950).
        repo_path: Path to HuggingFace repo (local path or repo ID).
            Defaults to the parent of this script's directory.

    Returns:
        Tuple of (audio as a numpy array, sample rate) on success, or
        None on any failure; each failure is reported on stdout.
    """
    matcha_deps = load_matcha_dependencies()
    if matcha_deps is None:
        return None

    if repo_path is None:
        repo_path = str(Path(__file__).parent.parent)

    try:
        loader = ModelLoader.from_pretrained(repo_path)
        print(f"✓ Loaded model configuration from {repo_path}")
    except Exception as e:
        print(f"✗ Failed to load model configuration: {e}")
        return None

    # Consistency fix: this step was previously unguarded even though every
    # other fallible step reports a message and returns None on failure.
    try:
        model_info = loader.load_models(model=model)
    except Exception as e:
        print(f"✗ Failed to load model info: {e}")
        return None
    print(f"✓ Model info loaded: {model_info['model_name']}")
    print(f"  Description: {model_info['model_info']['description']}")
    print(f"  Training data: {model_info['model_info'].get('training_data', 'N/A')}")

    device = torch.device(model_info['device'])
    print(f"✓ Using device: {device}")

    # Load the acoustic model. Use a distinct name instead of shadowing the
    # `model` parameter, which holds the model *name* string.
    try:
        tts_model = matcha_deps['MatchaTTS'].load_from_checkpoint(
            model_info['model_path'],
            map_location=device,
            weights_only=False
        )
        tts_model.eval()
        print(f"✓ Loaded TTS model from {model_info['model_path']}")
    except Exception as e:
        print(f"✗ Failed to load TTS model: {e}")
        return None

    # Build the HiFi-GAN vocoder from the v1 config plus a denoiser over it.
    try:
        h = matcha_deps['AttrDict'](matcha_deps['v1'])
        vocoder = matcha_deps['HiFiGAN'](h).to(device)
        checkpoint = torch.load(model_info['vocoder_path'], map_location=device, weights_only=False)
        vocoder.load_state_dict(checkpoint['generator'])
        vocoder.eval()
        vocoder.remove_weight_norm()
        denoiser = matcha_deps['Denoiser'](vocoder, mode='zeros')
        print(f"✓ Loaded vocoder from {model_info['vocoder_path']}")
    except Exception as e:
        print(f"✗ Failed to load vocoder: {e}")
        return None

    print(f"Processing text: '{text}'")
    try:
        # Clean the text to a symbol-id sequence, then intersperse blank
        # token 0 between symbols (standard Matcha-TTS input format).
        x = torch.tensor(
            matcha_deps['intersperse'](
                matcha_deps['text_to_sequence'](text, ['romanian_cleaners'])[0], 0
            ),
            dtype=torch.long,
            device=device
        )[None]
        x_lengths = torch.tensor([x.shape[-1]], dtype=torch.long, device=device)
        print("✓ Text processed successfully")
    except Exception as e:
        print(f"✗ Failed to process text: {e}")
        return None

    print("Generating speech...")
    try:
        with torch.inference_mode():
            params = model_info['inference_params']

            output = tts_model.synthesise(
                x, x_lengths,
                n_timesteps=params['n_timesteps'],
                temperature=params['temperature'],
                length_scale=params['length_scale']
            )

            mel = output['mel']
            audio = vocoder(mel).clamp(-1, 1)
            # Very light denoising; strength presumably matches the upstream
            # Matcha-TTS demo default — confirm against the main repository.
            audio = denoiser(audio.squeeze(0), strength=0.00025).cpu().squeeze()

            print("✓ Speech generated successfully")
            return audio.numpy(), model_info['config']['sample_rate']

    except Exception as e:
        print(f"✗ Failed to generate speech: {e}")
        return None

def main():
    """Run the example sweep: every test model against every test sentence."""
    repo_path = str(Path(__file__).parent.parent)

    # Romanian sample sentences to synthesize.
    test_texts = [
        "Bună ziua! Acesta este un test de sinteză vocală.",
        "România are o cultură bogată și o istorie fascinantă.",
        "Limba română face parte din familia limbilor romanice."
    ]

    # Models to exercise (see synthesize_romanian for the full list).
    test_models = ["bas_10", "bas_950", "sgs_10", "sgs_950"]

    output_dir = Path("generated_samples")
    output_dir.mkdir(exist_ok=True)

    banner = "=" * 50
    for model in test_models:
        print(f"\n{banner}")
        print(f"Testing model: {model}")
        print(f"{banner}")

        for idx, text in enumerate(test_texts, start=1):
            print(f"\nText {idx}: {text}")

            result = synthesize_romanian(
                text=text,
                model=model,
                repo_path=repo_path
            )

            if result is None:
                print(f"✗ Failed to generate audio for {model}")
                continue

            audio, sr = result
            output_file = output_dir / f"sample_{model}_{idx}.wav"
            sf.write(output_file, audio, sr)
            print(f"✓ Saved audio to {output_file}")

| | if __name__ == "__main__": |
| | main() |