Spaces: Build error

Update yarngpt/generate.py

yarngpt/generate.py  +175 -49  CHANGED

@@ -1,56 +1,182 @@
-
        """
-

        Args:
-
-
        """
-        self.model_name_or_path = model_name_or_path
-        self.processor_name_or_path = processor_name_or_path or model_name_or_path
-        self.init_time = INIT_TIMESTAMP
-        self.user = CURRENT_USER
-
-        logger.info(f"Initializing TextToSpeech with model: {model_name_or_path}")
-        logger.info(f"Initialization time: {self.init_time}")
-        logger.info(f"User: {self.user}")
-
        try:
-
-            logger.info("
-
-
-
-
-            )
-
-
-
-
-
-                self.processor_name_or_path,
-                token=os.getenv('HF_TOKEN'),
-                trust_remote_code=True
-            )
-            logger.info("Processor loaded successfully")
-
-            # Initialize model
-            logger.info("Loading model...")
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
-            logger.info(f"Using device: {self.device}")
-
-            self.dtype = torch.float16 if self.device == "cuda" else torch.float32
-            logger.info(f"Using torch dtype: {self.dtype}")
-
-            self.model = AutoModel.from_pretrained(
-                self.model_name_or_path,
-                torch_dtype=self.dtype,
-                trust_remote_code=True,
-                token=os.getenv('HF_TOKEN')
-            ).to(self.device)
-
-            logger.info("Model loaded successfully")
-
        except Exception as e:
-            logger.error(f"Error
            raise
+import os
+import sys
+import logging
+import torch
+import torchaudio
+import numpy as np
+from transformers import AutoTokenizer, AutoProcessor, AutoModel
+from huggingface_hub import hf_hub_download
+import warnings
+import scipy.io.wavfile as wav
+from datetime import datetime
+
+# Configure logging
+logging.basicConfig(level=logging.INFO,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Suppress irrelevant warnings
+warnings.filterwarnings("ignore", category=UserWarning, message=".*The attention mask and the pad token.*")
+warnings.filterwarnings("ignore", category=UserWarning, message=".*torch.nn.utils.weight_norm is deprecated.*")
+
+# Constants
+INIT_TIMESTAMP = "2025-05-21 01:36:55"
+CURRENT_USER = "Abdulhameed556"
+
+class TextToSpeech:
+    def __init__(self, model_name_or_path, processor_name_or_path=None):
+        """
+        Initialize the TextToSpeech class.
+
+        Args:
+            model_name_or_path (str): Path or name of the YarnGPT model
+            processor_name_or_path (str, optional): Path or name of the processor
+        """
+        self.model_name_or_path = model_name_or_path
+        self.processor_name_or_path = processor_name_or_path or model_name_or_path
+        self.init_time = INIT_TIMESTAMP
+        self.user = CURRENT_USER
+
+        logger.info(f"Initializing TextToSpeech with model: {model_name_or_path}")
+        logger.info(f"Initialization time: {self.init_time}")
+        logger.info(f"User: {self.user}")
+
+        try:
+            # Initialize tokenizer using the repository ID
+            logger.info("Loading tokenizer...")
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.processor_name_or_path,
+                token=os.getenv('HF_TOKEN'),
+                trust_remote_code=True
+            )
+            logger.info("Tokenizer loaded successfully")
+
+            # Initialize processor
+            logger.info("Loading processor...")
+            self.processor = AutoProcessor.from_pretrained(
+                self.processor_name_or_path,
+                token=os.getenv('HF_TOKEN'),
+                trust_remote_code=True
+            )
+            logger.info("Processor loaded successfully")
+
+            # Initialize model
+            logger.info("Loading model...")
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            logger.info(f"Using device: {self.device}")
+
+            self.dtype = torch.float16 if self.device == "cuda" else torch.float32
+            logger.info(f"Using torch dtype: {self.dtype}")
+
+            self.model = AutoModel.from_pretrained(
+                self.model_name_or_path,
+                torch_dtype=self.dtype,
+                trust_remote_code=True,
+                token=os.getenv('HF_TOKEN')
+            ).to(self.device)
+
+            logger.info("Model loaded successfully")
+
+        except Exception as e:
+            logger.error(f"Error initializing TextToSpeech: {e}")
+            raise
+
+    def get_status(self):
+        """Return the current status of the TTS system."""
+        return {
+            "initialized_at": self.init_time,
+            "user": self.user,
+            "device": self.device,
+            "dtype": str(self.dtype),
+            "model_name": self.model_name_or_path,
+            "processor_name": self.processor_name_or_path,
+            "model_loaded": hasattr(self, 'model'),
+            "tokenizer_loaded": hasattr(self, 'tokenizer'),
+            "processor_loaded": hasattr(self, 'processor')
+        }
+
+    def tts(self, text, accent="nigerian", save_path=None, speed=1.0):
+        """
+        Generate speech from text.
+
+        Args:
+            text (str): Text to convert to speech
+            accent (str, optional): Accent for the speech. Defaults to "nigerian".
+            save_path (str, optional): Path to save the audio file. Defaults to None.
+            speed (float, optional): Speed factor for speech. Defaults to 1.0.
+
+        Returns:
+            numpy.ndarray: Audio data as a numpy array
+        """
+        logger.info(f"Generating speech for text: '{text[:50]}...' with accent '{accent}'")
+
+        try:
+            # Prepare input
+            inputs = self.processor(
+                text=text,
+                accent=accent,
+                return_tensors="pt",
+                padding=True,
+            )
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            # Generate speech
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                    max_new_tokens=1000
+                )
+
+            # Process outputs
+            audio_data = outputs.generated_wavs.cpu().numpy().squeeze()
+
+            # Adjust speed if needed
+            if speed != 1.0:
+                import librosa
+                audio_data = librosa.effects.time_stretch(audio_data, rate=speed)
+
+            # Save if path is provided
+            if save_path:
+                logger.info(f"Saving audio to {save_path}")
+                sample_rate = self.model.config.sampling_rate
+                wav.write(save_path, sample_rate, audio_data.astype(np.float32))
+
+            return audio_data
+
+        except Exception as e:
+            logger.error(f"Error generating speech: {e}")
+            raise
+
+def generate_audio(text, checkpoint_path, config_path=None, temperature=0.2, top_p=0.7, top_k=50, speed=1.0):
    """
+    Convenience function to generate audio from text.

    Args:
+        text (str): The text to convert to speech
+        checkpoint_path (str): Path to the model checkpoint
+        config_path (str, optional): Path to model config
+        temperature (float, optional): Temperature for generation. Defaults to 0.2.
+        top_p (float, optional): Top-p sampling parameter. Defaults to 0.7.
+        top_k (int, optional): Top-k sampling parameter. Defaults to 50.
+        speed (float, optional): Speed factor for speech. Defaults to 1.0.
+
+    Returns:
+        numpy.ndarray: Generated audio data
    """
    try:
+        start_time = datetime.utcnow()
+        logger.info(f"Starting audio generation at {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
+
+        tts = TextToSpeech(checkpoint_path)
+        audio_data = tts.tts(text, speed=speed)
+
+        end_time = datetime.utcnow()
+        duration = (end_time - start_time).total_seconds()
+        logger.info(f"Audio generation completed in {duration:.2f} seconds")
+
+        return audio_data
+
    except Exception as e:
+        logger.error(f"Error in generate_audio: {e}")
        raise
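
A minimal usage sketch of the API this commit lands, for wiring the Space's app code to it. The repo ID and token below are placeholders, not values from this commit; the one real requirement the code imposes is that HF_TOKEN be set in the environment, since the tokenizer, processor, and model loaders all read os.getenv('HF_TOKEN').

# Usage sketch -- assumes this module imports as yarngpt.generate and that
# "your-org/yarngpt-checkpoint" is replaced with the Space's real model repo.
import os
os.environ.setdefault("HF_TOKEN", "hf_xxx")  # placeholder token for illustration

from yarngpt.generate import TextToSpeech, generate_audio

# One-shot helper: loads the model, synthesizes, returns a numpy array.
audio = generate_audio(
    text="How far? Welcome to YarnGPT.",
    checkpoint_path="your-org/yarngpt-checkpoint",  # hypothetical repo ID
    speed=1.0,
)

# For a serving loop, build one TextToSpeech and reuse it across requests.
tts = TextToSpeech("your-org/yarngpt-checkpoint")   # hypothetical repo ID
tts.tts("Good morning!", accent="nigerian", save_path="greeting.wav")
print(tts.get_status())  # reports device, dtype, and which components loaded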
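Two details of this revision worth flagging for follow-up: generate_audio builds a fresh TextToSpeech on every call, so each request re-downloads and reloads the checkpoint, which is another reason to hold one instance as sketched above; and the temperature, top_p, top_k, and config_path parameters are accepted but not yet forwarded into model.generate(). The librosa dependency is only imported when speed != 1.0, so it can stay out of the base requirements if speed adjustment is unused.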