Spaces:

Vaishnavi0404
/

Text2Sing-DiffSinger

Running

App Files Community

Vaishnavi0404 committed on Apr 14, 2025

Commit

475a0f9

verified ·

1 Parent(s): 1f65ac7

Update voice_synthesizer.py

Browse files

Files changed (1) hide show

voice_synthesizer.py +186 -67

voice_synthesizer.py CHANGED Viewed

@@ -1,4 +1,7 @@
 import torch
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import scipy
 import numpy as np
@@ -6,27 +9,122 @@ import soundfile as sf
 class VoiceSynthesizer:
     def __init__(self):
-        """Initialize the voice synthesizer with the SpeechT5 model"""
-        # Load models
-        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-        self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
         # Load speaker embeddings
-        self.speaker_embeddings = {
-            "neutral": torch.load("speaker_embeddings/neutral.pt") if torch.cuda.is_available() else
-                       torch.load("speaker_embeddings/neutral.pt", map_location=torch.device('cpu')),
-            "feminine": torch.load("speaker_embeddings/feminine.pt") if torch.cuda.is_available() else
-                        torch.load("speaker_embeddings/feminine.pt", map_location=torch.device('cpu')),
-            "masculine": torch.load("speaker_embeddings/masculine.pt") if torch.cuda.is_available() else
-                         torch.load("speaker_embeddings/masculine.pt", map_location=torch.device('cpu'))
-        }
         # Set device
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
         self.vocoder.to(self.device)
     def synthesize(self, text, output_path, voice_type="neutral", speed=1.0):
         """
         Synthesize speech from text
@@ -42,62 +140,76 @@ class VoiceSynthesizer:
             voice_type = "neutral"
             print(f"Invalid voice type. Using default 'neutral' voice.")
-        # Process input text
-        inputs = self.processor(text=text, return_tensors="pt").to(self.device)
-        # Get speaker embeddings and ensure proper shape
-        speaker_embeddings = self.speaker_embeddings[voice_type].to(self.device)
-        # Print shape for debugging
-        print(f"Speaker embeddings shape before: {speaker_embeddings.shape}")
-        # Fix the dimension issue - ensure it's a 2D tensor with shape [1, embedding_dim]
-        if len(speaker_embeddings.shape) == 1:
-            speaker_embeddings = speaker_embeddings.unsqueeze(0)  # Add batch dimension
-        print(f"Speaker embeddings shape after: {speaker_embeddings.shape}")
         try:
-            # Generate speech
-            speech = self.model.generate_speech(
-                inputs["input_ids"],
-                speaker_embeddings,
-                vocoder=self.vocoder
-            )
-        except IndexError as e:
-            # Alternative approach if the above fails
-            print(f"Error in generate_speech: {e}")
-            print("Trying alternative approach...")
-            # If the first approach fails, try reshaping the embeddings differently
-            if len(speaker_embeddings.shape) == 2:
-                if speaker_embeddings.shape[0] > 1 and speaker_embeddings.shape[0] > speaker_embeddings.shape[1]:
-                    speaker_embeddings = speaker_embeddings.mean(dim=0, keepdim=True)
-                elif speaker_embeddings.shape[0] == 1:
-                    # Ensure correct normalization dimension
-                    speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=1)
-            # Try generation again
-            speech = self.model.generate_speech(
-                inputs["input_ids"],
-                speaker_embeddings,
-                vocoder=self.vocoder
-            )
-        # Convert to numpy array
-        speech = speech.cpu().numpy()
-        # Adjust speed if needed
-        if speed != 1.0:
-            import librosa
-            speech = librosa.effects.time_stretch(speech, rate=speed)
-        # Save audio file
-        sf.write(output_path, speech, samplerate=16000)
-        print(f"Speech synthesized and saved to {output_path}")
-        return output_path
     def create_speaker_embedding(self, reference_file, output_path):
         """
@@ -112,7 +224,14 @@ class VoiceSynthesizer:
         print("Creating speaker embeddings requires a speaker encoder model")
         print("Using default embeddings instead")
         # For now, we'll just copy one of the existing embeddings
-        torch.save(self.speaker_embeddings["neutral"], output_path)
         return output_path

+import os
+import time
 import torch
+import requests
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import scipy
 import numpy as np
 class VoiceSynthesizer:
     def __init__(self):
+        """Initialize the voice synthesizer with the SpeechT5 model with local caching"""
+        # Create cache directory
+        model_cache_dir = os.path.join(os.path.dirname(__file__), "model_cache")
+        os.makedirs(model_cache_dir, exist_ok=True)
+        # Initialize models with retry mechanism
+        self.processor, self.model, self.vocoder = self._initialize_models(model_cache_dir)
         # Load speaker embeddings
+        self.speaker_embeddings = self._load_speaker_embeddings()
         # Set device
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
         self.vocoder.to(self.device)
+    def _initialize_models(self, cache_dir, max_retries=3):
+        """Initialize models with retry mechanism"""
+        for attempt in range(max_retries):
+            try:
+                print(f"Loading models (attempt {attempt+1}/{max_retries})...")
+                # Try to load from local cache first
+                try:
+                    processor = SpeechT5Processor.from_pretrained(
+                        "microsoft/speecht5_tts",
+                        local_files_only=True,
+                        cache_dir=cache_dir
+                    )
+                    model = SpeechT5ForTextToSpeech.from_pretrained(
+                        "microsoft/speecht5_tts",
+                        local_files_only=True,
+                        cache_dir=cache_dir
+                    )
+                    vocoder = SpeechT5HifiGan.from_pretrained(
+                        "microsoft/speecht5_hifigan",
+                        local_files_only=True,
+                        cache_dir=cache_dir
+                    )
+                    print("Successfully loaded models from local cache.")
+                    return processor, model, vocoder
+                except Exception as local_err:
+                    print(f"Could not load models from local cache: {local_err}")
+                    # If loading from cache fails, try downloading
+                    print("Downloading models from Hugging Face Hub...")
+                    # Increase timeout for downloads
+                    import huggingface_hub
+                    huggingface_hub.constants.HF_HUB_DOWNLOAD_TIMEOUT = 30  # Increase timeout to 30 seconds
+                    processor = SpeechT5Processor.from_pretrained(
+                        "microsoft/speecht5_tts",
+                        cache_dir=cache_dir
+                    )
+                    model = SpeechT5ForTextToSpeech.from_pretrained(
+                        "microsoft/speecht5_tts",
+                        cache_dir=cache_dir
+                    )
+                    vocoder = SpeechT5HifiGan.from_pretrained(
+                        "microsoft/speecht5_hifigan",
+                        cache_dir=cache_dir
+                    )
+                    print("Successfully downloaded and cached models.")
+                    return processor, model, vocoder
+            except (OSError, requests.exceptions.ReadTimeout) as e:
+                if attempt < max_retries - 1:
+                    wait_time = 5 * (attempt + 1)  # Linear backoff: 5s, 10s, 15s
+                    print(f"Attempt {attempt+1} failed: {e}")
+                    print(f"Retrying in {wait_time} seconds...")
+                    time.sleep(wait_time)
+                else:
+                    print(f"Failed to load models after {max_retries} attempts.")
+                    raise e
+    def _load_speaker_embeddings(self):
+        """Load speaker embeddings with error handling"""
+        embeddings_dir = os.path.join(os.path.dirname(__file__), "speaker_embeddings")
+        os.makedirs(embeddings_dir, exist_ok=True)
+        # Create mapping for speaker embeddings
+        embedding_files = {
+            "neutral": os.path.join(embeddings_dir, "neutral.pt"),
+            "feminine": os.path.join(embeddings_dir, "feminine.pt"),
+            "masculine": os.path.join(embeddings_dir, "masculine.pt")
+        }
+        # Load embeddings with proper error handling
+        speaker_embeddings = {}
+        for voice_type, file_path in embedding_files.items():
+            try:
+                if os.path.exists(file_path):
+                    if torch.cuda.is_available():
+                        speaker_embeddings[voice_type] = torch.load(file_path)
+                    else:
+                        speaker_embeddings[voice_type] = torch.load(file_path, map_location=torch.device('cpu'))
+                    print(f"Loaded {voice_type} speaker embedding")
+                else:
+                    print(f"Warning: Speaker embedding file {file_path} not found")
+                    # Create a fallback embedding if file doesn't exist
+                    # This is placeholder - in production you'd want real speaker embeddings
+                    if not speaker_embeddings:  # Only create placeholder for first missing file
+                        print("Creating placeholder speaker embedding")
+                        placeholder = torch.ones(1, 512) / 512  # Typical embedding dimension is 512
+                        speaker_embeddings[voice_type] = placeholder
+                    else:
+                        # Reuse existing embedding for missing voices
+                        speaker_embeddings[voice_type] = next(iter(speaker_embeddings.values()))
+            except Exception as e:
+                print(f"Error loading {voice_type} speaker embedding: {e}")
+                # Create fallback embedding on error
+                placeholder = torch.ones(1, 512) / 512
+                speaker_embeddings[voice_type] = placeholder
+        return speaker_embeddings
     def synthesize(self, text, output_path, voice_type="neutral", speed=1.0):
         """
         Synthesize speech from text
             voice_type = "neutral"
             print(f"Invalid voice type. Using default 'neutral' voice.")
         try:
+            # Process input text
+            inputs = self.processor(text=text, return_tensors="pt").to(self.device)
+            # Get speaker embeddings and ensure proper shape
+            speaker_embeddings = self.speaker_embeddings[voice_type].to(self.device)
+            # Print shape for debugging
+            print(f"Speaker embeddings shape before: {speaker_embeddings.shape}")
+            # Fix the dimension issue - ensure it's a 2D tensor with shape [1, embedding_dim]
+            if len(speaker_embeddings.shape) == 1:
+                speaker_embeddings = speaker_embeddings.unsqueeze(0)  # Add batch dimension
+            print(f"Speaker embeddings shape after: {speaker_embeddings.shape}")
+            try:
+                # Generate speech
+                speech = self.model.generate_speech(
+                    inputs["input_ids"],
+                    speaker_embeddings,
+                    vocoder=self.vocoder
+                )
+            except IndexError as e:
+                # Alternative approach if the above fails
+                print(f"Error in generate_speech: {e}")
+                print("Trying alternative approach...")
+                # If the first approach fails, try reshaping the embeddings differently
+                if len(speaker_embeddings.shape) == 2:
+                    if speaker_embeddings.shape[0] > 1 and speaker_embeddings.shape[0] > speaker_embeddings.shape[1]:
+                        speaker_embeddings = speaker_embeddings.mean(dim=0, keepdim=True)
+                    elif speaker_embeddings.shape[0] == 1:
+                        # Ensure correct normalization dimension
+                        speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=1)
+                # Try generation again
+                speech = self.model.generate_speech(
+                    inputs["input_ids"],
+                    speaker_embeddings,
+                    vocoder=self.vocoder
+                )
+            # Convert to numpy array
+            speech = speech.cpu().numpy()
+            # Adjust speed if needed
+            if speed != 1.0:
+                try:
+                    import librosa
+                    speech = librosa.effects.time_stretch(speech, rate=speed)
+                except Exception as e:
+                    print(f"Error adjusting speed: {e}")
+                    # Continue with original speed
+                    pass
+            # Create output directory if it doesn't exist
+            os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+            # Save audio file
+            sf.write(output_path, speech, samplerate=16000)
+            print(f"Speech synthesized and saved to {output_path}")
+            return output_path
+        except Exception as e:
+            print(f"Error in speech synthesis: {e}")
+            # Return error placeholder or raise exception based on your error handling strategy
+            raise
     def create_speaker_embedding(self, reference_file, output_path):
         """
         print("Creating speaker embeddings requires a speaker encoder model")
         print("Using default embeddings instead")
+        # Create output directory if it doesn't exist
+        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
         # For now, we'll just copy one of the existing embeddings
+        try:
+            torch.save(self.speaker_embeddings["neutral"], output_path)
+            print(f"Saved placeholder speaker embedding to {output_path}")
+        except Exception as e:
+            print(f"Error saving speaker embedding: {e}")
         return output_path