tonyshark committed on
Commit ffaba3a · verified · 1 Parent(s): 820a522

Upload 2 files

Files changed (2):
  1. app.py +245 -617
  2. requirements.txt +1 -138
app.py CHANGED
@@ -1,3 +1,9 @@
 import gradio as gr
 import torch
 import soundfile as sf
@@ -10,64 +16,6 @@ import sys
 import json
 import time
 from huggingface_hub import hf_hub_download, login
-try:
-    from modelscope import snapshot_download
-    MODEL_SCOPE_AVAILABLE = True
-except ImportError:
-    MODEL_SCOPE_AVAILABLE = False
-
-# Add the src directory to Python path
-current_dir = os.path.dirname(__file__)
-src_path = os.path.join(current_dir, 'src')
-examples_path = os.path.join(current_dir, 'examples')
-
-# Add paths to sys.path
-if src_path not in sys.path:
-    sys.path.append(src_path)
-if examples_path not in sys.path:
-    sys.path.append(examples_path)
-
-# Try to import slam_llm modules
-try:
-    from slam_llm.utils.model_utils import get_custom_model_factory
-    from slam_llm.utils.dataset_utils import get_preprocessed_dataset
-    from examples.tts.utils.codec_utils import audio_decode_cosyvoice
-    from examples.tts.tts_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig, DecodeConfig
-    SLAM_LLM_AVAILABLE = True
-except ImportError as e:
-    logger.warning(f"SLAM-LLM modules not available: {e}")
-    logger.warning("Running in demo mode with placeholder functions")
-    SLAM_LLM_AVAILABLE = False
-
-    # Create placeholder classes and functions
-    class ModelConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    class TrainConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    class DataConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    class DecodeConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    def get_custom_model_factory(*args, **kwargs):
-        return lambda *args, **kwargs: (None, None)
-
-    def get_preprocessed_dataset(*args, **kwargs):
-        return None
-
-    def audio_decode_cosyvoice(*args, **kwargs):
-        return None
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -75,31 +23,14 @@ logger = logging.getLogger(__name__)
 
 class EmoVoiceHFDemo:
     def __init__(self):
-        self.model = None
-        self.tokenizer = None
-        self.codec_decoder = None
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model_loaded = False
-        self.model_config = None
-        self.train_config = None
-        self.dataset_config = None
-        self.decode_config = None
-
-        # Hugging Face token
         self.hf_token = "hf_iJNtPHacizBjWZAuvAvgHwWNd"+"WUkcwXLzE"
-
-        # Model paths with alternatives
-        self.hf_model_paths = {
-            "llm_path": "Qwen/Qwen2.5-0.5B",
-            "llm_alternatives": ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-0.5B-Chat"],
-            "codec_path": "iic/CosyVoice-300M",  # ModelScope
-            "codec_alternatives": ["iic/CosyVoice-300M-SFT", "iic/CosyVoice-300M-Instruct"],
-            "ckpt_path": "yhaha/EmoVoice",
-            "ckpt_alternatives": ["yhaha/EmoVoice-0.5B", "yhaha/EmoVoice-1.5B"]
-        }
 
         # Auto login to Hugging Face
         self._auto_login()
 
     def _auto_login(self):
         """Auto login to Hugging Face with token"""
@@ -110,568 +41,265 @@ class EmoVoiceHFDemo:
         except Exception as e:
             logger.warning(f"⚠️ Failed to auto-login to Hugging Face: {e}")
             logger.warning("App will continue in demo mode")
-
-    def download_models(self) -> str:
-        """Download models from Hugging Face Hub"""
-        try:
-            logger.info("Downloading models from Hugging Face Hub...")
-
-            # Download Qwen2.5-0.5B with fallback
-            qwen_path = None
-            for repo_id in [self.hf_model_paths["llm_path"]] + self.hf_model_paths["llm_alternatives"]:
-                try:
-                    logger.info(f"Trying to download Qwen from {repo_id}...")
-                    qwen_path = hf_hub_download(
-                        repo_id=repo_id,
-                        filename="config.json",
-                        cache_dir="./models",
-                        token=self.hf_token
-                    )
-                    logger.info(f"✅ Successfully downloaded Qwen from {repo_id}")
-                    break
-                except Exception as e:
-                    logger.warning(f"Failed to download Qwen from {repo_id}: {e}")
-                    continue
-
-            if qwen_path is None:
-                return "❌ Failed to download Qwen model from any repository"
-
-            qwen_dir = os.path.dirname(qwen_path)
-
-            # Download CosyVoice with fallback (try ModelScope first, then Hugging Face)
-            cosyvoice_path = None
-            cosyvoice_dir = None
-
-            # Try ModelScope first
-            if MODEL_SCOPE_AVAILABLE:
-                for repo_id in [self.hf_model_paths["codec_path"]] + self.hf_model_paths["codec_alternatives"]:
-                    try:
-                        logger.info(f"Trying to download CosyVoice from ModelScope: {repo_id}...")
-                        cosyvoice_dir = snapshot_download(repo_id, cache_dir="./models")
-                        logger.info(f"✅ Successfully downloaded CosyVoice from ModelScope: {repo_id}")
-                        break
-                    except Exception as e:
-                        logger.warning(f"Failed to download CosyVoice from ModelScope {repo_id}: {e}")
-                        continue
-
-            # If ModelScope failed, try Hugging Face
-            if cosyvoice_dir is None:
-                for repo_id in [self.hf_model_paths["codec_path"]] + self.hf_model_paths["codec_alternatives"]:
-                    try:
-                        logger.info(f"Trying to download CosyVoice from Hugging Face: {repo_id}...")
-                        cosyvoice_path = hf_hub_download(
-                            repo_id=repo_id,
-                            filename="config.json",
-                            cache_dir="./models",
-                            token=self.hf_token
-                        )
-                        cosyvoice_dir = os.path.dirname(cosyvoice_path)
-                        logger.info(f"✅ Successfully downloaded CosyVoice from Hugging Face: {repo_id}")
-                        break
-                    except Exception as e:
-                        logger.warning(f"Failed to download CosyVoice from Hugging Face {repo_id}: {e}")
-                        continue
-
-            if cosyvoice_dir is None:
-                return "❌ Failed to download CosyVoice model from any repository"
-
-            # Download EmoVoice checkpoint with fallback
-            emovoice_path = None
-            for repo_id in [self.hf_model_paths["ckpt_path"]] + self.hf_model_paths["ckpt_alternatives"]:
-                try:
-                    logger.info(f"Trying to download EmoVoice from {repo_id}...")
-                    emovoice_path = hf_hub_download(
-                        repo_id=repo_id,
-                        filename="EmoVoice.pt",
-                        cache_dir="./models",
-                        token=self.hf_token
-                    )
-                    logger.info(f"✅ Successfully downloaded EmoVoice from {repo_id}")
-                    break
-                except Exception as e:
-                    logger.warning(f"Failed to download EmoVoice from {repo_id}: {e}")
-                    continue
-
-            if emovoice_path is None:
-                return "❌ Failed to download EmoVoice checkpoint from any repository"
-
-            return f"✅ Models downloaded successfully!\nQwen: {qwen_dir}\nCosyVoice: {cosyvoice_dir}\nEmoVoice: {emovoice_path}"
-
-        except Exception as e:
-            logger.error(f"Error downloading models: {str(e)}")
-            return f"❌ Error downloading models: {str(e)}"
-
-    def load_model(self) -> str:
-        """Load the EmoVoice model with Hugging Face paths"""
-        try:
-            logger.info("Loading EmoVoice model...")
-
-            if not SLAM_LLM_AVAILABLE:
-                return "⚠️ SLAM-LLM modules not available. Running in demo mode with placeholder functionality."
-
-            # Get model paths
-            qwen_dir = os.path.join("./models", "models--Qwen--Qwen2.5-0.5B", "snapshots")
-            if os.path.exists(qwen_dir):
-                qwen_path = os.path.join(qwen_dir, os.listdir(qwen_dir)[0])
-            else:
-                return "❌ Qwen model not found. Please download models first."
-
-            # Try different possible CosyVoice paths (ModelScope and Hugging Face)
-            cosyvoice_paths = [
-                # ModelScope paths
-                os.path.join("./models", "iic--CosyVoice-300M"),
-                os.path.join("./models", "iic--CosyVoice-300M-SFT"),
-                os.path.join("./models", "iic--CosyVoice-300M-Instruct"),
-                # Hugging Face paths
-                os.path.join("./models", "models--FunAudioLLM--CosyVoice", "snapshots"),
-                os.path.join("./models", "models--iic--CosyVoice-300M-SFT", "snapshots"),
-                os.path.join("./models", "models--FunAudioLLM--CosyVoice-300M-SFT", "snapshots")
-            ]
-
-            cosyvoice_path = None
-            for cosyvoice_dir in cosyvoice_paths:
-                if os.path.exists(cosyvoice_dir):
-                    # Check if it's a ModelScope path (direct directory) or Hugging Face path (with snapshots)
-                    if "snapshots" in cosyvoice_dir:
-                        # Hugging Face path
-                        cosyvoice_path = os.path.join(cosyvoice_dir, os.listdir(cosyvoice_dir)[0])
-                    else:
-                        # ModelScope path
-                        cosyvoice_path = cosyvoice_dir
-                    break
-
-            if cosyvoice_path is None:
-                return "❌ CosyVoice model not found. Please download models first."
-
-            emovoice_path = os.path.join("./models", "models--yhaha--EmoVoice", "snapshots")
-            if os.path.exists(emovoice_path):
-                emovoice_snapshot = os.path.join(emovoice_path, os.listdir(emovoice_path)[0])
-                ckpt_path = os.path.join(emovoice_snapshot, "EmoVoice.pt")
-            else:
-                return "❌ EmoVoice checkpoint not found. Please download models first."
-
-            # Model configuration
-            self.model_config = ModelConfig(
-                llm_name="qwen2.5-0.5b",
-                llm_path=qwen_path,
-                llm_dim=896,
-                codec_decoder_path=cosyvoice_path,
-                codec_decode=True,
-                vocab_config={
-                    "code_layer": 3,
-                    "total_audio_vocabsize": 4160,
-                    "total_vocabsize": 156160
-                },
-                codec_decoder_type="CosyVoice",
-                group_decode=True,
-                group_decode_adapter_type="linear",
-                use_text_stream=False
-            )
-
-            # Training configuration
-            self.train_config = TrainConfig(
-                model_name="tts",
-                freeze_encoder=True,
-                freeze_llm=True,
-                freeze_group_decode_adapter=True,
-                batching_strategy="custom",
-                num_epochs=1,
-                val_batch_size=1,
-                num_workers_dataloader=0,  # Use 0 for HF Space
-                seed=42
-            )
-
-            # Dataset configuration
-            self.dataset_config = DataConfig(
-                dataset="speech_dataset_tts",
-                inference_mode=True,
-                vocab_config={
-                    "code_layer": 3,
-                    "total_audio_vocabsize": 4160,
-                    "total_vocabsize": 156160
-                },
-                num_latency_tokens=0,
-                do_layershift=False,
-                use_emo=True
-            )
-
-            # Decode configuration
-            self.decode_config = DecodeConfig(
-                text_repetition_penalty=1.2,
-                audio_repetition_penalty=1.2,
-                max_new_tokens=3000,
-                do_sample=False,
-                top_p=1.0,
-                top_k=0,
-                temperature=1.0,
-                decode_text_only=False,
-                num_latency_tokens=0,
-                do_layershift=False
-            )
-
-            # Initialize model factory
-            model_factory = get_custom_model_factory(self.model_config, logger)
-            self.model, self.tokenizer = model_factory(self.train_config, self.model_config)
-            self.codec_decoder = self.model.codec_decoder
-
-            # Load checkpoint
-            if os.path.exists(ckpt_path):
-                checkpoint = torch.load(ckpt_path, map_location=self.device)
-                if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-                    self.model.load_state_dict(checkpoint['model_state_dict'], strict=False)
-                else:
-                    self.model.load_state_dict(checkpoint, strict=False)
-            else:
-                logger.warning(f"Checkpoint not found at {ckpt_path}, using random weights")
-
-            self.model.to(self.device)
-            self.model.eval()
-            self.model_loaded = True
-
-            logger.info("Model loaded successfully!")
-            return "✅ Model loaded successfully!"
-
-        except Exception as e:
-            logger.error(f"Error loading model: {str(e)}")
-            return f"❌ Error loading model: {str(e)}"
 
-    def create_demo_dataset(self, text: str, emotion: str, reference_audio: Optional[str] = None) -> str:
-        """Create a temporary dataset file for inference"""
-        demo_data = {
-            "source_text": text,
-            "target_text": text,
-            "emotion": emotion,
-            "reference_audio": reference_audio or "",
-            "key": "demo_sample"
         }
 
-        # Create temporary file
-        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)
-        temp_file.write(json.dumps(demo_data) + '\n')
-        temp_file.close()
 
-        return temp_file.name
 
-    def _generate_demo_audio(self, text: str, emotion: str) -> Tuple[Optional[str], str]:
-        """Generate demo audio when SLAM-LLM is not available"""
         try:
-            logger.info(f"Generating demo audio for text: '{text}' with emotion: '{emotion}'")
 
-            # Create temporary directory for output
-            with tempfile.TemporaryDirectory() as temp_dir:
-                # Generate different tones based on emotion
-                emotion_frequencies = {
-                    "neutral": 440,     # A note
-                    "happy": 523,       # C note (higher, brighter)
-                    "sad": 349,         # F note (lower, darker)
-                    "angry": 659,       # E note (higher, sharper)
-                    "fearful": 311,     # D# note (lower, tense)
-                    "disgusted": 392,   # G note (mid, harsh)
-                    "surprised": 554    # C# note (higher, excited)
-                }
-
-                frequency = emotion_frequencies.get(emotion, 440)
-                sample_rate = 22050
-                duration = min(len(text) * 0.1, 5.0)  # Duration based on text length, max 5 seconds
-
-                # Generate audio with emotion-based characteristics
-                t = np.linspace(0, duration, int(sample_rate * duration), False)
-
-                if emotion == "happy":
-                    # Bright, upbeat tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
-                    audio_data += np.sin(2 * np.pi * frequency * 2 * t) * 0.1  # Harmonic
-                elif emotion == "sad":
-                    # Slow, melancholic tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.2
-                    audio_data *= np.exp(-t * 0.5)  # Fade out
-                elif emotion == "angry":
-                    # Sharp, aggressive tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.4
-                    audio_data += np.sin(2 * np.pi * frequency * 1.5 * t) * 0.2  # Distortion
-                elif emotion == "fearful":
-                    # Tense, trembling tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.25
-                    audio_data *= (1 + 0.1 * np.sin(2 * np.pi * 10 * t))  # Tremolo
-                elif emotion == "disgusted":
-                    # Harsh, grating tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
-                    audio_data += np.sin(2 * np.pi * frequency * 3 * t) * 0.15  # Harsh harmonic
-                elif emotion == "surprised":
-                    # Quick, excited tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.35
-                    audio_data += np.sin(2 * np.pi * frequency * 2.5 * t) * 0.1  # Excitement
-                else:  # neutral
-                    # Clean, balanced tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
-
-                # Add some variation to make it more interesting
-                audio_data += np.random.normal(0, 0.01, len(audio_data))  # Slight noise
-
-                # Save to temporary file
-                output_path = os.path.join(temp_dir, "demo_speech.wav")
-                sf.write(output_path, audio_data, sample_rate)
-
-                return output_path, f"🎵 Demo audio generated!\nText: '{text}'\nEmotion: {emotion}\nNote: This is a demo audio. For real speech synthesis, please ensure SLAM-LLM modules are available."
 
         except Exception as e:
-            logger.error(f"Error generating demo audio: {str(e)}")
-            return None, f"❌ Error generating demo audio: {str(e)}"
 
-    def generate_speech(self,
-                        text: str,
-                        emotion: str = "neutral",
-                        reference_audio: Optional[str] = None) -> Tuple[Optional[str], str]:
-        """Generate speech from text with specified emotion"""
-        if not self.model_loaded and SLAM_LLM_AVAILABLE:
-            return None, "❌ Model not loaded. Please load the model first."
-
-        if not text.strip():
-            return None, "❌ Please enter some text to convert."
-
-        # Demo mode - generate placeholder audio
-        if not SLAM_LLM_AVAILABLE:
-            return self._generate_demo_audio(text, emotion)
-
-        try:
-            logger.info(f"Generating speech for text: '{text}' with emotion: '{emotion}'")
-            start_time = time.time()
-
-            # Create temporary dataset file
-            dataset_file = self.create_demo_dataset(text, emotion, reference_audio)
 
-            try:
-                # Update dataset config with the temporary file
-                self.dataset_config.val_data_path = dataset_file
-                self.dataset_config.train_data_path = dataset_file
-
-                # Get dataset
-                dataset = get_preprocessed_dataset(
-                    self.tokenizer,
-                    self.dataset_config,
-                    split="test",
-                )
-
-                # Create dataloader
-                dataloader = torch.utils.data.DataLoader(
-                    dataset,
-                    num_workers=0,  # Use 0 for HF Space
-                    pin_memory=True,
-                    shuffle=False,
-                    batch_size=1,
-                    drop_last=False,
-                    collate_fn=dataset.collator
-                )
-
-                # Generate speech
-                with torch.no_grad():
-                    for batch in dataloader:
-                        # Move batch to device
-                        for key in batch.keys():
-                            if isinstance(batch[key], torch.Tensor):
-                                batch[key] = batch[key].to(self.device)
-
-                        # Get audio prompt path
-                        audio_prompt_path = batch.get("neutral_speaker_wav", [None])[0]
-
-                        # Generate using the model
-                        model_outputs = self.model.generate(**batch, **self.decode_config)
-
-                        # Extract outputs
-                        code_layer = self.model_config.vocab_config.code_layer
-                        text_outputs = model_outputs[code_layer]
-                        audio_outputs = model_outputs[:code_layer]
-
-                        # Decode text
-                        output_text = self.tokenizer.decode(text_outputs, add_special_tokens=False, skip_special_tokens=True)
-                        logger.info(f"Generated text: {output_text}")
-
-                        # Decode audio
-                        if not self.decode_config.decode_text_only:
-                            audio_tokens = [audio_outputs[layer] for layer in range(code_layer)] if code_layer > 0 else audio_outputs
 
-                            # Create temporary directory for output
-                            with tempfile.TemporaryDirectory() as temp_dir:
-                                audio_hat = audio_decode_cosyvoice(
-                                    audio_tokens,
-                                    self.model_config,
-                                    self.codec_decoder,
-                                    audio_prompt_path,
-                                    code_layer,
-                                    self.dataset_config.num_latency_tokens,
-                                    speed=1.0
-                                )
-
-                                if audio_hat is not None:
-                                    # Save audio
-                                    output_path = os.path.join(temp_dir, "generated_speech.wav")
-                                    sf.write(output_path, audio_hat.squeeze().cpu().numpy(), 22050)
-
-                                    end_time = time.time()
-                                    generation_time = end_time - start_time
-                                    audio_length = audio_hat.shape[1] / 22050
-
-                                    return output_path, f"✅ Generated speech successfully!\nText: {output_text}\nGeneration time: {generation_time:.2f}s\nAudio length: {audio_length:.2f}s"
-                                else:
-                                    return None, "❌ Failed to decode audio tokens"
-                        else:
-                            end_time = time.time()
-                            generation_time = end_time - start_time
-                            return None, f"✅ Generated text successfully!\nText: {output_text}\nGeneration time: {generation_time:.2f}s"
-
-                        break  # Only process first batch
 
-            finally:
-                # Clean up temporary dataset file
-                if os.path.exists(dataset_file):
-                    os.unlink(dataset_file)
-
-        except Exception as e:
-            logger.error(f"Error generating speech: {str(e)}")
-            return None, f"❌ Error generating speech: {str(e)}"
-
-# Initialize the demo
-demo_app = EmoVoiceHFDemo()
-
-def download_models_interface():
-    """Interface for downloading models"""
-    return demo_app.download_models()
-
-def load_model_interface():
-    """Interface for loading the model"""
-    return demo_app.load_model()
-
-def generate_speech_interface(text, emotion):
-    """Interface for generating speech"""
-    audio_path, message = demo_app.generate_speech(text, emotion, None)
-    return audio_path, message
-
-# Create Gradio interface
-def create_demo():
-    with gr.Blocks(title="Voice AI ", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # 🎭 Voice AI
-        **LLM-based Emotional Text-To-Speech Model with Freestyle Text Prompting**
-
-        This demo allows you to generate emotional speech from text using the EmoVoice model.
-        """)
-
-        with gr.Tab("Setup"):
-            gr.Markdown("### Model Setup")
-            gr.Markdown("""
-            **Step 1:** Download models from Hugging Face Hub (Auto-login enabled)
-            **Step 2:** Load the EmoVoice model
-            **Note:** This may take a few minutes on first run.
-            """)
-
-            with gr.Row():
-                download_btn = gr.Button("📥 Download Models", variant="secondary", size="lg")
-                load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
-
-            setup_status = gr.Textbox(label="Setup Status", interactive=False, lines=4)
-
-            download_btn.click(
-                fn=download_models_interface,
-                outputs=[setup_status]
-            )
-
-            load_btn.click(
-                fn=load_model_interface,
-                outputs=[setup_status]
-            )
-
-        with gr.Tab("Generate Speech"):
-            gr.Markdown("### Generate Emotional Speech")
-
-            with gr.Row():
-                with gr.Column():
-                    text_input = gr.Textbox(
-                        label="Text to Convert",
-                        placeholder="Enter the text you want to convert to speech...",
-                        lines=4,
-                        info="Enter the text you want to convert to speech"
-                    )
-
-                    emotion = gr.Dropdown(
-                        choices=["neutral", "happy", "sad", "angry", "fearful", "disgusted", "surprised"],
-                        value="neutral",
-                        label="Emotion",
-                        info="Select the emotion for the generated speech"
                     )
 
-                    # reference_audio = gr.Audio(
-                    #     label="Reference Audio (Optional)",
-                    #     type="filepath",
-                    #     info="Optional reference audio for voice cloning"
-                    # )
 
-                    generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
 
-                with gr.Column():
-                    output_audio = gr.Audio(label="Generated Speech", type="filepath")
-                    output_message = gr.Textbox(label="Generation Status", interactive=False, lines=5)
 
             generate_btn.click(
-                fn=generate_speech_interface,
-                inputs=[text_input, emotion],
-                outputs=[output_audio, output_message]
             )
-
-        with gr.Tab("Examples"):
-            gr.Markdown("### Example Texts and Emotions")
-
-            examples = [
-                ["Hello, how are you today? I hope you're having a wonderful day!", "happy"],
-                ["I'm feeling really sad about this situation. It's been very difficult.", "sad"],
-                ["This is absolutely ridiculous! I can't believe what's happening!", "angry"],
-                ["Oh my goodness, I can't believe it! This is amazing!", "surprised"],
-                ["I'm so scared right now. I don't know what to do.", "fearful"],
-                ["That's disgusting! I can't stand the smell.", "disgusted"],
-                ["The weather is nice today. It's a beautiful day for a walk.", "neutral"]
-            ]
 
-            gr.Examples(
-                examples=examples,
-                inputs=[text_input, emotion],
-                label="Click to use example"
            )
 
-        with gr.Tab("About"):
-            gr.Markdown("""
-            ## About EmoVoice
-
-            EmoVoice is an advanced LLM-based Emotional Text-To-Speech model that can generate high-quality speech with various emotions from text input.
-
-            ### Key Features:
-            - **🎯 Freestyle Text Prompting**: Generate speech from any text input
-            - **😊 Multiple Emotions**: Support for 7 different emotions
-            - **🎵 High Quality**: Based on advanced LLM and codec technologies
-            - **🎤 Reference Audio**: Optional reference audio for voice cloning
-            - **⚡ Real-time**: Fast generation with RTF < 1.0
-
-            ### Citation:
-            ```
-            @article{yang2025emovoice,
-              title={EmoVoice: LLM-based Emotional Text-To-Speech Model with Freestyle Text Prompting},
-              author={Yang, Guanrou and Yang, Chen and Chen, Qian and Ma, Ziyang and Chen, Wenxi and Wang, Wen and Wang, Tianrui and Yang, Yifan and Niu, Zhikang and Liu, Wenrui and others},
-              journal={arXiv preprint arXiv:2504.12867},
-              year={2025}
-            }
-            ```
-
-            **Paper**: https://arxiv.org/abs/2504.12867
-            **GitHub**: https://github.com/yhaha0908/EmoVoice
-            **Hugging Face**: https://huggingface.co/yhaha/EmoVoice
-            """)
-
-    return demo
 
 if __name__ == "__main__":
-    demo = create_demo()
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        debug=True,
-        show_error=True
-    )
 
+#!/usr/bin/env python3
+"""
+EmoVoice Hugging Face Space Demo
+LLM-based Emotional Text-To-Speech Model
+"""
+
 import gradio as gr
 import torch
 import soundfile as sf
 import json
 import time
 from huggingface_hub import hf_hub_download, login
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 
 class EmoVoiceHFDemo:
     def __init__(self):
         self.hf_token = "hf_iJNtPHacizBjWZAuvAvgHwWNd"+"WUkcwXLzE"
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
         # Auto login to Hugging Face
         self._auto_login()
+
+        logger.info("🎭 EmoVoice Hugging Face Demo initialized")
+        logger.info(f"🔧 Device: {self.device}")
 
     def _auto_login(self):
         """Auto login to Hugging Face with token"""
         except Exception as e:
             logger.warning(f"⚠️ Failed to auto-login to Hugging Face: {e}")
             logger.warning("App will continue in demo mode")
 
+    def generate_demo_audio(self, text: str, emotion: str) -> Tuple[np.ndarray, int]:
+        """Generate demo audio with emotion (placeholder function)"""
+        emotion_frequencies = {
+            "neutral": 440,    # A note
+            "happy": 523,      # C note
+            "sad": 349,        # F note
+            "angry": 659,      # E note
+            "fearful": 311,    # D# note
+            "disgusted": 392,  # G note
+            "surprised": 554   # C# note
         }
 
+        frequency = emotion_frequencies.get(emotion, 440)
+        sample_rate = 22050
+        duration = min(len(text) * 0.1, 3.0)  # Max 3 seconds
+
+        # Generate sine wave
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
+        audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
+
+        # Add emotion-specific characteristics
+        if emotion == "happy":
+            # Add harmonics for happy
+            audio_data += 0.1 * np.sin(2 * np.pi * frequency * 2 * t)
+        elif emotion == "sad":
+            # Lower amplitude for sad
+            audio_data *= 0.7
+        elif emotion == "angry":
+            # Add noise for angry
+            audio_data += 0.05 * np.random.normal(0, 1, len(audio_data))
+        elif emotion == "fearful":
+            # Tremolo effect for fearful
+            audio_data *= (1 + 0.3 * np.sin(2 * np.pi * 5 * t))
+        elif emotion == "disgusted":
+            # Lower frequency for disgusted
+            audio_data *= 0.8
+        elif emotion == "surprised":
+            # Quick attack for surprised
+            attack_samples = int(0.1 * sample_rate)
+            audio_data[:attack_samples] *= np.linspace(0, 1, attack_samples)
 
+        return audio_data, sample_rate
 
+    def generate_speech(self, text: str, emotion: str) -> Optional[str]:
+        """Generate speech and return audio file path"""
         try:
+            logger.info(f"Generating speech: '{text}' with emotion '{emotion}'")
 
+            if not text.strip():
+                return None
+
+            # Generate demo audio
+            audio_data, sample_rate = self.generate_demo_audio(text, emotion)
+
+            # Save to temporary file
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                sf.write(tmp.name, audio_data, sample_rate)
+                logger.info(f"Generated audio: {tmp.name}")
+                return tmp.name
 
         except Exception as e:
+            logger.error(f"Error generating speech: {e}")
+            return None
 
+    def create_interface(self):
+        """Create the Gradio interface"""
+        with gr.Blocks(
+            title="EmoVoice Demo",
+            theme=gr.themes.Soft(),
+            css="""
+            .gradio-container {
+                max-width: 1200px !important;
+                margin: auto !important;
+            }
+            .main-header {
+                text-align: center;
+                margin-bottom: 2rem;
+            }
+            .emotion-grid {
+                display: grid;
+                grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
+                gap: 10px;
+                margin: 1rem 0;
+            }
+            """
+        ) as demo:
+
+            # Header
+            gr.HTML("""
+            <div class="main-header">
+                <h1>🎭 EmoVoice Demo</h1>
+                <p><strong>LLM-based Emotional Text-To-Speech Model</strong></p>
+                <p>Generate emotional speech from text with 7 different emotions</p>
+            </div>
+            """)
 
+            with gr.Tabs():
+                # Main Generation Tab
+                with gr.Tab("🎵 Generate Speech"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            text_input = gr.Textbox(
+                                label="Text to Convert",
+                                placeholder="Enter text to convert to speech...",
+                                value="Hello world! This is a demo of EmoVoice emotional text-to-speech.",
+                                lines=4,
+                                max_lines=10
+                            )
+
+                            # Emotion selection with better UI
+                            gr.Markdown("### Choose Emotion")
+                            emotion_radio = gr.Radio(
+                                choices=[
+                                    ("😊 Happy", "happy"),
+                                    ("😢 Sad", "sad"),
+                                    ("😠 Angry", "angry"),
+                                    ("😨 Fearful", "fearful"),
+                                    ("🤢 Disgusted", "disgusted"),
+                                    ("😲 Surprised", "surprised"),
+                                    ("😐 Neutral", "neutral")
+                                ],
+                                value="neutral",
+                                label="Emotion",
+                                info="Select the emotional tone for your speech"
+                            )
 
+                            generate_btn = gr.Button(
+                                "🎵 Generate Speech",
+                                variant="primary",
+                                size="lg"
+                            )
 
+                        with gr.Column(scale=1):
+                            audio_output = gr.Audio(
+                                label="Generated Audio",
+                                type="filepath",
+                                interactive=False
+                            )
+
+                            # Audio info
+                            audio_info = gr.Textbox(
+                                label="Audio Info",
+                                interactive=False,
+                                visible=False
+                            )
 
+                # Examples Tab
+                with gr.Tab("📝 Examples"):
+                    gr.Markdown("### Try these examples:")
 
+                    examples = [
+                        ["Hello world! How are you today?", "happy"],
+                        ["I'm feeling really sad about this situation.", "sad"],
+                        ["I can't believe this happened! This is outrageous!", "angry"],
+                        ["This is so surprising and unexpected!", "surprised"],
+                        ["I'm scared of what might happen next.", "fearful"],
+                        ["That's disgusting and repulsive.", "disgusted"],
+                        ["The weather is nice today.", "neutral"]
+                    ]
+
+                    gr.Examples(
+                        examples=examples,
+                        inputs=[text_input, emotion_radio],
+                        label="Click any example to try it"
                     )
+
+                # About Tab
+                with gr.Tab("ℹ️ About"):
+                    gr.Markdown("""
+                    ## 🎭 EmoVoice Demo
+
+                    **LLM-based Emotional Text-To-Speech Model**
+
+                    This demo showcases the EmoVoice model's ability to generate emotional speech from text.
+                    The model can express 7 different emotions:
+
+                    - 😊 **Happy**: Cheerful and upbeat tone
+                    - 😢 **Sad**: Melancholic and somber tone
+                    - 😠 **Angry**: Intense and aggressive tone
+                    - 😨 **Fearful**: Anxious and worried tone
+                    - 🤢 **Disgusted**: Repulsed and revolted tone
+                    - 😲 **Surprised**: Excited and astonished tone
+                    - 😐 **Neutral**: Calm and balanced tone
+
+                    ### How to Use:
+                    1. Enter your text in the input box
+                    2. Select an emotion from the options
+                    3. Click "Generate Speech" to create audio
+                    4. Play the generated audio
 
+                    ### Technical Details:
+                    - **Model**: EmoVoice (LLM-based TTS)
+                    - **Sample Rate**: 22050 Hz
+                    - **Format**: WAV
+                    - **Max Duration**: 3 seconds
 
+                    ### Note:
+                    This is a demo version with placeholder audio generation.
+                    The actual EmoVoice model would provide more realistic speech synthesis.
+                    """)
+
+            # Generate function
+            def generate_speech_wrapper(text, emotion):
+                if not text.strip():
+                    return None, "Please enter some text to convert."
 
+                audio_path = self.generate_speech(text, emotion)
+                if audio_path:
+                    # Get audio info
+                    try:
+                        audio_data, sample_rate = sf.read(audio_path)
+                        duration = len(audio_data) / sample_rate
+                        info = f"Duration: {duration:.2f}s | Sample Rate: {sample_rate}Hz | Samples: {len(audio_data)}"
+                        return audio_path, info
+                    except:
+                        return audio_path, "Audio generated successfully"
+                else:
+                    return None, "Error generating audio. Please try again."
 
+            # Connect the generate button
             generate_btn.click(
+                fn=generate_speech_wrapper,
+                inputs=[text_input, emotion_radio],
+                outputs=[audio_output, audio_info]
             )
 
+            # Show audio info when audio is generated
+            audio_output.change(
+                fn=lambda x: gr.Textbox(visible=True) if x else gr.Textbox(visible=False),
+                inputs=[audio_output],
+                outputs=[audio_info]
+            )
 
+        return demo
+
+def main():
+    """Main function to run the demo"""
+    try:
+        # Create demo instance
+        demo_app = EmoVoiceHFDemo()
+
+        # Create interface
+        demo = demo_app.create_interface()
+
+        # Launch the demo
+        logger.info("🚀 Launching EmoVoice Demo...")
+        demo.launch(
+            share=False,
+            server_name="0.0.0.0",
+            server_port=7860,
+            show_error=True,
+            show_tips=True
+        )
+
+    except Exception as e:
+        logger.error(f"Error launching demo: {e}")
+        import traceback
+        traceback.print_exc()
 
 if __name__ == "__main__":
+    main()
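The heart of the rewrite is the placeholder synthesizer: instead of the removed SLAM-LLM/CosyVoice pipeline, the new `app.py` maps each emotion to a base pitch, synthesizes a short sine tone, and writes it to a temporary WAV that `gr.Audio(type="filepath")` can play. Below is a minimal standalone sketch of that idea, runnable without Gradio; the `make_demo_tone` name and the trimmed set of emotion branches are illustrative, not part of the commit.

```python
# Condensed sketch of the commit's generate_demo_audio logic; the function
# name and the reduced branch set are illustrative, not the committed code.
import numpy as np
import soundfile as sf

EMOTION_FREQS = {"neutral": 440, "happy": 523, "sad": 349, "angry": 659,
                 "fearful": 311, "disgusted": 392, "surprised": 554}

def make_demo_tone(text: str, emotion: str, sample_rate: int = 22050) -> np.ndarray:
    freq = EMOTION_FREQS.get(emotion, 440)          # base pitch per emotion
    duration = min(len(text) * 0.1, 3.0)            # ~0.1 s per character, capped at 3 s
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    audio = np.sin(2 * np.pi * freq * t) * 0.3      # base sine tone
    if emotion == "happy":
        audio += 0.1 * np.sin(2 * np.pi * freq * 2 * t)   # add a second harmonic
    elif emotion == "fearful":
        audio *= 1 + 0.3 * np.sin(2 * np.pi * 5 * t)      # 5 Hz tremolo
    return audio

if __name__ == "__main__":
    # Same WAV format and 22050 Hz sample rate as the diff above.
    sf.write("demo.wav", make_demo_tone("Hello world!", "happy"), 22050)
```

Writing to a file rather than returning the array mirrors the diff's design choice: `gr.Audio(type="filepath")` expects a path, so the app keeps `delete=False` on the temp file and hands Gradio the name.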
requirements.txt CHANGED
@@ -1,144 +1,7 @@
-# Core dependencies for EmoVoice Hugging Face Space
 gradio>=4.0.0
 torch>=2.0.0
 torchaudio>=2.0.0
 soundfile>=0.12.0
 numpy>=1.21.0
-librosa>=0.10.0
+huggingface-hub>=0.16.0
 scipy>=1.9.0
-matplotlib>=3.5.0
-
-# Hugging Face dependencies
-huggingface-hub>=0.25.0
-transformers>=4.43.0
-tokenizers>=0.19.0
-safetensors>=0.4.0
-
-# Audio processing
-pydub>=0.25.0
-webrtcvad>=2.0.0
-
-# Additional dependencies
-accelerate>=0.34.0
-datasets>=3.3.0
-omegaconf>=2.3.0
-hydra-core>=1.3.0
-einops>=0.8.0
-rotary-embedding-torch>=0.8.0
-
-# Web interface
-fastapi>=0.100.0
-uvicorn>=0.20.0
-starlette>=0.37.0
-
-# Utilities
-tqdm>=4.66.0
-requests>=2.32.0
-packaging>=24.0
-typing-extensions>=4.12.0
-
-# EmoVoice specific dependencies
-absl-py>=2.1.0
-addict>=2.4.0
-aiofiles>=23.2.1
-anyio>=4.6.2.post1
-asttokens>=2.4.1
-async-timeout>=4.0.3
-attrs>=24.2.0
-bitstring>=4.2.3
-blinker>=1.8.2
-Brotli>=1.1.0
-cachetools>=4.2.4
-cffi>=1.17.1
-charset-normalizer>=3.4.0
-click>=8.1.7
-colorama>=0.4.6
-coloredlogs>=15.0.1
-conformer>=0.3.2
-cycler>=0.12.1
-Cython>=3.0.11
-debugpy>=1.8.8
-decorator>=5.1.1
-deepspeed>=0.14.2
-diffusers>=0.27.2
-dill>=0.3.8
-editdistance>=0.8.1
-exceptiongroup>=1.2.2
-executing>=2.1.0
-ffmpy>=0.4.0
-filelock>=3.16.1
-fire>=0.6.0
-frozenlist>=1.4.1
-fsspec>=2024.2.0
-gdown>=5.1.0
-gitdb>=4.0.11
-GitPython>=3.1.43
-HyperPyYAML>=1.2.2
-importlib_metadata>=8.5.0
-jsonschema>=4.23.0
-kaldiio>=2.18.0
-lightning>=2.2.4
-lightning-utilities>=0.11.8
-loralib>=0.1.2
-Mako>=1.3.5
-Markdown>=3.7
-modelscope>=1.18.1
-more-itertools>=10.5.0
-mpmath>=1.3.0
-msgpack>=1.1.0
-multidict>=6.1.0
-multiprocess>=0.70.16
-networkx>=3.1
-ninja>=1.11.1.1
-numba>=0.60.0
-oauthlib>=3.2.2
-onnx>=1.16.0
-onnxruntime-gpu>=1.16.0
-openai-whisper
-orjson>=3.10.11
-pandas>=2.2.2
-protobuf>=4.25.5
-pydantic>=2.0.0,<2.4.0
-pydantic_core>=2.0.0,<2.14.0
-pydash>=8.0.3
-Pygments>=2.18.0
-PyJWT>=2.9.0
-pynini>=2.1.5
-pynndescent>=0.5.13
-pynvml>=11.5.3
-pyparsing>=3.2.0
-python-dateutil>=2.9.0.post0
-python-dotenv>=1.0.1
-python-multipart>=0.0.17
-pytorch-lightning>=2.4.0
-pytorch-wpe>=0.0.1
-pytz>=2024.2
-PyYAML>=6.0.2
-rdflib>=7.1.3
-referencing>=0.35.1
-regex>=2024.9.11
-rotary-embedding-torch>=0.8.6
-scikit-learn>=1.5.2
-segments>=2.3.0
-sentence-transformers>=4.0.1
-sentencepiece>=0.2.0
-stack-data>=0.6.3
-tensorboard>=2.14.0
-tensorboardX>=2.6.2.2
-torchmetrics>=1.6.0
-torchvision>=0.19.1
-triton>=3.0.0
-typeguard>=4.4.1
-typer>=0.13.0
-ujson>=5.10.0
-urllib3>=2.2.3
-uvloop>=0.21.0
-wandb>=0.18.1
-websockets>=11.0.3
-WeTextProcessing>=1.0.3
-wget>=3.2
-whisper_normalizer>=0.0.10
-wrapt>=1.16.0
-zipp>=3.21.0
-peft
-funasr
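Since the commit trims requirements.txt from 144 pinned packages to the seven the demo actually uses, a quick sanity check is to install the file and import each remaining runtime dependency. The snippet below is a hypothetical smoke test, not part of the commit.

```python
# Hypothetical smoke test for the trimmed requirements.txt (not in the commit).
# Run after: pip install -r requirements.txt
import importlib

for name in ("gradio", "torch", "torchaudio", "soundfile",
             "numpy", "huggingface_hub", "scipy"):
    importlib.import_module(name)  # raises ImportError if a dependency is missing
print("All runtime dependencies import cleanly.")
```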