Update yarngpt/generate.py
yarngpt/generate.py (CHANGED: +118 -80)
@@ -1,114 +1,152 @@
-import torch
-import torchaudio
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from yarngpt.audiotokenizer import AudioTokenizerV2
 import os
 import logging

 # Configure logging
-logging.basicConfig(

 class TextToSpeech:
-    """
-    def __init__(self):
-        """
-        # Set paths
         try:
-            base_path = "."
-            wav_tokenizer_config_path = os.path.join(base_path, "wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml")
-            wav_tokenizer_model_path = os.path.join(base_path, "wavtokenizer_large_speech_320_24k.ckpt")
-            # Load model
-            ).to(self.
         except Exception as e:
             import traceback
             traceback.print_exc()
             raise

-    def tts(self, text
         Args:
-            text: Text to convert to speech
         Returns:
         """
         try:
-            logging.info(f"Generating audio for text: '{text[:50]}...'")
-            logging.info(f"Using speaker: {speaker}, language: {language}")
-                temperature=0.1,
-                repetition_penalty=1.1,
-                max_length=4000,
-            )
-            # Save
         except Exception as e:
             import traceback
             traceback.print_exc()
             raise
 import os
+import sys
 import logging
+import torch
+import numpy as np
+import warnings
+from transformers import AutoTokenizer, AutoProcessor, AutoModel
+import soundfile as sf
+from typing import Optional, Tuple

 # Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Suppress irrelevant warnings
+warnings.filterwarnings("ignore", category=UserWarning, message=".*The attention mask and the pad token.*")
+warnings.filterwarnings("ignore", category=UserWarning, message=".*torch.nn.utils.weight_norm is deprecated.*")

 class TextToSpeech:
+    """Nigerian Text-to-Speech synthesizer using YarnGPT models."""

+    def __init__(self, model_name_or_path, processor_name_or_path=None, disable_playback=True):
+        """
+        Initialize the TextToSpeech class.
+
+        Args:
+            model_name_or_path (str): Path or name of the YarnGPT model
+            processor_name_or_path (str, optional): Path or name of the processor
+            disable_playback (bool, optional): Whether to disable audio playback
+        """
+        self.model_name_or_path = model_name_or_path
+        self.processor_name_or_path = processor_name_or_path or model_name_or_path
+        self.disable_playback = disable_playback
+
+        # Set environment variable to disable PortAudio
+        if disable_playback:
+            os.environ["OUTETTS_NO_PORTAUDIO"] = "1"
+
+        logger.info(f"Initializing TextToSpeech with model: {model_name_or_path}")

         try:
+            # Initialize tokenizer
+            logger.info("Loading tokenizer...")
+            self.tokenizer = AutoTokenizer.from_pretrained(self.processor_name_or_path)
+            logger.info("Tokenizer loaded successfully")

+            # Initialize processor
+            logger.info("Loading processor...")
+            self.processor = AutoProcessor.from_pretrained(self.processor_name_or_path)
+            logger.info("Processor loaded successfully")

+            # Initialize model with appropriate device
+            logger.info("Loading model...")
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            logger.info(f"Using device: {self.device}")
+
+            # Select appropriate torch dtype based on device and available memory
+            if self.device == "cuda":
+                # Try to use float16 on CUDA devices for better performance
+                try:
+                    dummy_tensor = torch.zeros(1, device=self.device, dtype=torch.float16)
+                    self.dtype = torch.float16
+                    logger.info("Using torch.float16 for better performance")
+                except Exception:
+                    self.dtype = torch.float32
+                    logger.info("Failed to use torch.float16, falling back to torch.float32")
+            else:
+                # Use float32 on CPU
+                self.dtype = torch.float32
+                logger.info("Using torch.float32 on CPU device")

+            # Load model with trust_remote_code=True for custom models
+            self.model = AutoModel.from_pretrained(
+                self.model_name_or_path,
+                torch_dtype=self.dtype,
+                trust_remote_code=True
+            ).to(self.device)

+            logger.info("Model loaded successfully")

         except Exception as e:
+            logger.error(f"Error initializing TextToSpeech: {e}")
             import traceback
             traceback.print_exc()
             raise

+    def tts(self, text: str, accent: str = "nigerian", save_path: Optional[str] = None,
+            speed: float = 1.0, get_array: bool = False) -> Optional[Tuple[np.ndarray, int]]:
+        """
+        Generate speech from text.

         Args:
+            text (str): Text to convert to speech
+            accent (str, optional): Accent for the speech. Defaults to "nigerian".
+            save_path (str, optional): Path to save the audio file. Defaults to None.
+            speed (float, optional): Speed factor for speech. Defaults to 1.0.
+            get_array (bool, optional): Return audio as numpy array. Defaults to False.
+
         Returns:
+            Tuple[numpy.ndarray, int] or None: Audio data and sample rate if get_array=True
         """
+        logger.info(f"Generating speech for text: '{text[:50]}...' with accent '{accent}'")
+
         try:
+            # Prepare input
+            inputs = self.processor(
+                text=text,
+                accent=accent,
+                return_tensors="pt",
+                padding=True,
+            )
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}

+            # Generate speech
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    max_new_tokens=1000
+                )

+            # Process outputs
+            audio_data = outputs.generated_wavs.cpu().numpy().squeeze()
+            sample_rate = self.model.config.sampling_rate

+            # Adjust speed if needed
+            if speed != 1.0 and speed > 0:
+                try:
+                    import librosa
+                    audio_data = librosa.effects.time_stretch(audio_data, rate=speed)
+                    logger.info(f"Adjusted audio speed by factor {speed}")
+                except ImportError:
+                    logger.warning("librosa not available, speed adjustment skipped")
+                except Exception as e:
+                    logger.warning(f"Speed adjustment failed: {e}")

+            # Save if path is provided
+            if save_path:
+                logger.info(f"Saving audio to {save_path}")
+                sf.write(save_path, audio_data, sample_rate)

+            # Return the audio data and sample rate if requested
+            if get_array:
+                return audio_data, sample_rate
+            return None

         except Exception as e:
+            logger.error(f"Error generating speech: {e}")
             import traceback
             traceback.print_exc()
             raise
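
For reference, here is a minimal usage sketch of the interface introduced by this change. The model id and output filename below are assumptions for illustration only (the diff does not name a checkpoint); the constructor and tts() arguments follow the new version of yarngpt/generate.py above.

    # Usage sketch, assuming a YarnGPT checkpoint id; replace with the actual model path.
    from yarngpt.generate import TextToSpeech

    synth = TextToSpeech("saheedniyi/YarnGPT")  # hypothetical model id

    # Write a WAV file via soundfile ("output.wav" is a hypothetical output name)
    synth.tts("How you dey? Welcome to the demo.", accent="nigerian", save_path="output.wav")

    # Or get the raw samples and sample rate back instead of saving
    audio, sample_rate = synth.tts("Second sentence.", get_array=True, speed=1.1)
    print(f"Generated {audio.shape[0]} samples at {sample_rate} Hz")

Calling tts() with get_array=True returns the numpy array and sample rate; without it, the method returns None and only writes a file when save_path is given.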