tonyshark committed on
Commit ffaba3a · verified · 1 Parent(s): 820a522

Upload 2 files

Files changed (2):
  1. app.py +245 -617
  2. requirements.txt +1 -138
app.py CHANGED
@@ -1,3 +1,9 @@
 import gradio as gr
 import torch
 import soundfile as sf
@@ -10,64 +16,6 @@ import sys
 import json
 import time
 from huggingface_hub import hf_hub_download, login
-try:
-    from modelscope import snapshot_download
-    MODEL_SCOPE_AVAILABLE = True
-except ImportError:
-    MODEL_SCOPE_AVAILABLE = False
-
-# Add the src directory to Python path
-current_dir = os.path.dirname(__file__)
-src_path = os.path.join(current_dir, 'src')
-examples_path = os.path.join(current_dir, 'examples')
-
-# Add paths to sys.path
-if src_path not in sys.path:
-    sys.path.append(src_path)
-if examples_path not in sys.path:
-    sys.path.append(examples_path)
-
-# Try to import slam_llm modules
-try:
-    from slam_llm.utils.model_utils import get_custom_model_factory
-    from slam_llm.utils.dataset_utils import get_preprocessed_dataset
-    from examples.tts.utils.codec_utils import audio_decode_cosyvoice
-    from examples.tts.tts_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig, DecodeConfig
-    SLAM_LLM_AVAILABLE = True
-except ImportError as e:
-    logger.warning(f"SLAM-LLM modules not available: {e}")
-    logger.warning("Running in demo mode with placeholder functions")
-    SLAM_LLM_AVAILABLE = False
-
-    # Create placeholder classes and functions
-    class ModelConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    class TrainConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    class DataConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    class DecodeConfig:
-        def __init__(self, **kwargs):
-            for key, value in kwargs.items():
-                setattr(self, key, value)
-
-    def get_custom_model_factory(*args, **kwargs):
-        return lambda *args, **kwargs: (None, None)
-
-    def get_preprocessed_dataset(*args, **kwargs):
-        return None
-
-    def audio_decode_cosyvoice(*args, **kwargs):
-        return None
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -75,31 +23,14 @@ logger = logging.getLogger(__name__)
 
 class EmoVoiceHFDemo:
     def __init__(self):
-        self.model = None
-        self.tokenizer = None
-        self.codec_decoder = None
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model_loaded = False
-        self.model_config = None
-        self.train_config = None
-        self.dataset_config = None
-        self.decode_config = None
-
-        # Hugging Face token
         self.hf_token = "hf_iJNtPHacizBjWZAuvAvgHwWNd"+"WUkcwXLzE"
-
-        # Model paths with alternatives
-        self.hf_model_paths = {
-            "llm_path": "Qwen/Qwen2.5-0.5B",
-            "llm_alternatives": ["Qwen/Qwen2.5-0.5B-Instruct", "Qwen/Qwen2.5-0.5B-Chat"],
-            "codec_path": "iic/CosyVoice-300M",  # ModelScope
-            "codec_alternatives": ["iic/CosyVoice-300M-SFT", "iic/CosyVoice-300M-Instruct"],
-            "ckpt_path": "yhaha/EmoVoice",
-            "ckpt_alternatives": ["yhaha/EmoVoice-0.5B", "yhaha/EmoVoice-1.5B"]
-        }
 
         # Auto login to Hugging Face
         self._auto_login()
 
     def _auto_login(self):
         """Auto login to Hugging Face with token"""
@@ -110,568 +41,265 @@ class EmoVoiceHFDemo:
         except Exception as e:
             logger.warning(f"⚠️ Failed to auto-login to Hugging Face: {e}")
             logger.warning("App will continue in demo mode")
-
-    def download_models(self) -> str:
-        """Download models from Hugging Face Hub"""
-        try:
-            logger.info("Downloading models from Hugging Face Hub...")
-
-            # Download Qwen2.5-0.5B with fallback
-            qwen_path = None
-            for repo_id in [self.hf_model_paths["llm_path"]] + self.hf_model_paths["llm_alternatives"]:
-                try:
-                    logger.info(f"Trying to download Qwen from {repo_id}...")
-                    qwen_path = hf_hub_download(
-                        repo_id=repo_id,
-                        filename="config.json",
-                        cache_dir="./models",
-                        token=self.hf_token
-                    )
-                    logger.info(f"✅ Successfully downloaded Qwen from {repo_id}")
-                    break
-                except Exception as e:
-                    logger.warning(f"Failed to download Qwen from {repo_id}: {e}")
-                    continue
-
-            if qwen_path is None:
-                return "❌ Failed to download Qwen model from any repository"
-
-            qwen_dir = os.path.dirname(qwen_path)
-
-            # Download CosyVoice with fallback (try ModelScope first, then Hugging Face)
-            cosyvoice_path = None
-            cosyvoice_dir = None
-
-            # Try ModelScope first
-            if MODEL_SCOPE_AVAILABLE:
-                for repo_id in [self.hf_model_paths["codec_path"]] + self.hf_model_paths["codec_alternatives"]:
-                    try:
-                        logger.info(f"Trying to download CosyVoice from ModelScope: {repo_id}...")
-                        cosyvoice_dir = snapshot_download(repo_id, cache_dir="./models")
-                        logger.info(f"✅ Successfully downloaded CosyVoice from ModelScope: {repo_id}")
-                        break
-                    except Exception as e:
-                        logger.warning(f"Failed to download CosyVoice from ModelScope {repo_id}: {e}")
-                        continue
-
-            # If ModelScope failed, try Hugging Face
-            if cosyvoice_dir is None:
-                for repo_id in [self.hf_model_paths["codec_path"]] + self.hf_model_paths["codec_alternatives"]:
-                    try:
-                        logger.info(f"Trying to download CosyVoice from Hugging Face: {repo_id}...")
-                        cosyvoice_path = hf_hub_download(
-                            repo_id=repo_id,
-                            filename="config.json",
-                            cache_dir="./models",
-                            token=self.hf_token
-                        )
-                        cosyvoice_dir = os.path.dirname(cosyvoice_path)
-                        logger.info(f"✅ Successfully downloaded CosyVoice from Hugging Face: {repo_id}")
-                        break
-                    except Exception as e:
-                        logger.warning(f"Failed to download CosyVoice from Hugging Face {repo_id}: {e}")
-                        continue
-
-            if cosyvoice_dir is None:
-                return "❌ Failed to download CosyVoice model from any repository"
-
-            # Download EmoVoice checkpoint with fallback
-            emovoice_path = None
-            for repo_id in [self.hf_model_paths["ckpt_path"]] + self.hf_model_paths["ckpt_alternatives"]:
-                try:
-                    logger.info(f"Trying to download EmoVoice from {repo_id}...")
-                    emovoice_path = hf_hub_download(
-                        repo_id=repo_id,
-                        filename="EmoVoice.pt",
-                        cache_dir="./models",
-                        token=self.hf_token
-                    )
-                    logger.info(f"✅ Successfully downloaded EmoVoice from {repo_id}")
-                    break
-                except Exception as e:
-                    logger.warning(f"Failed to download EmoVoice from {repo_id}: {e}")
-                    continue
-
-            if emovoice_path is None:
-                return "❌ Failed to download EmoVoice checkpoint from any repository"
-
-            return f"✅ Models downloaded successfully!\nQwen: {qwen_dir}\nCosyVoice: {cosyvoice_dir}\nEmoVoice: {emovoice_path}"
-
-        except Exception as e:
-            logger.error(f"Error downloading models: {str(e)}")
-            return f"❌ Error downloading models: {str(e)}"
-
-    def load_model(self) -> str:
-        """Load the EmoVoice model with Hugging Face paths"""
-        try:
-            logger.info("Loading EmoVoice model...")
-
-            if not SLAM_LLM_AVAILABLE:
-                return "⚠️ SLAM-LLM modules not available. Running in demo mode with placeholder functionality."
-
-            # Get model paths
-            qwen_dir = os.path.join("./models", "models--Qwen--Qwen2.5-0.5B", "snapshots")
-            if os.path.exists(qwen_dir):
-                qwen_path = os.path.join(qwen_dir, os.listdir(qwen_dir)[0])
-            else:
-                return "❌ Qwen model not found. Please download models first."
-
-            # Try different possible CosyVoice paths (ModelScope and Hugging Face)
-            cosyvoice_paths = [
-                # ModelScope paths
-                os.path.join("./models", "iic--CosyVoice-300M"),
-                os.path.join("./models", "iic--CosyVoice-300M-SFT"),
-                os.path.join("./models", "iic--CosyVoice-300M-Instruct"),
-                # Hugging Face paths
-                os.path.join("./models", "models--FunAudioLLM--CosyVoice", "snapshots"),
-                os.path.join("./models", "models--iic--CosyVoice-300M-SFT", "snapshots"),
-                os.path.join("./models", "models--FunAudioLLM--CosyVoice-300M-SFT", "snapshots")
-            ]
-
-            cosyvoice_path = None
-            for cosyvoice_dir in cosyvoice_paths:
-                if os.path.exists(cosyvoice_dir):
-                    # Check if it's a ModelScope path (direct directory) or Hugging Face path (with snapshots)
-                    if "snapshots" in cosyvoice_dir:
-                        # Hugging Face path
-                        cosyvoice_path = os.path.join(cosyvoice_dir, os.listdir(cosyvoice_dir)[0])
-                    else:
-                        # ModelScope path
-                        cosyvoice_path = cosyvoice_dir
-                    break
-
-            if cosyvoice_path is None:
-                return "❌ CosyVoice model not found. Please download models first."
-
-            emovoice_path = os.path.join("./models", "models--yhaha--EmoVoice", "snapshots")
-            if os.path.exists(emovoice_path):
-                emovoice_snapshot = os.path.join(emovoice_path, os.listdir(emovoice_path)[0])
-                ckpt_path = os.path.join(emovoice_snapshot, "EmoVoice.pt")
-            else:
-                return "❌ EmoVoice checkpoint not found. Please download models first."
-
-            # Model configuration
-            self.model_config = ModelConfig(
-                llm_name="qwen2.5-0.5b",
-                llm_path=qwen_path,
-                llm_dim=896,
-                codec_decoder_path=cosyvoice_path,
-                codec_decode=True,
-                vocab_config={
-                    "code_layer": 3,
-                    "total_audio_vocabsize": 4160,
-                    "total_vocabsize": 156160
-                },
-                codec_decoder_type="CosyVoice",
-                group_decode=True,
-                group_decode_adapter_type="linear",
-                use_text_stream=False
-            )
-
-            # Training configuration
-            self.train_config = TrainConfig(
-                model_name="tts",
-                freeze_encoder=True,
-                freeze_llm=True,
-                freeze_group_decode_adapter=True,
-                batching_strategy="custom",
-                num_epochs=1,
-                val_batch_size=1,
-                num_workers_dataloader=0,  # Use 0 for HF Space
-                seed=42
-            )
-
-            # Dataset configuration
-            self.dataset_config = DataConfig(
-                dataset="speech_dataset_tts",
-                inference_mode=True,
-                vocab_config={
-                    "code_layer": 3,
-                    "total_audio_vocabsize": 4160,
-                    "total_vocabsize": 156160
-                },
-                num_latency_tokens=0,
-                do_layershift=False,
-                use_emo=True
-            )
-
-            # Decode configuration
-            self.decode_config = DecodeConfig(
-                text_repetition_penalty=1.2,
-                audio_repetition_penalty=1.2,
-                max_new_tokens=3000,
-                do_sample=False,
-                top_p=1.0,
-                top_k=0,
-                temperature=1.0,
-                decode_text_only=False,
-                num_latency_tokens=0,
-                do_layershift=False
-            )
-
-            # Initialize model factory
-            model_factory = get_custom_model_factory(self.model_config, logger)
-            self.model, self.tokenizer = model_factory(self.train_config, self.model_config)
-            self.codec_decoder = self.model.codec_decoder
-
-            # Load checkpoint
-            if os.path.exists(ckpt_path):
-                checkpoint = torch.load(ckpt_path, map_location=self.device)
-                if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
-                    self.model.load_state_dict(checkpoint['model_state_dict'], strict=False)
-                else:
-                    self.model.load_state_dict(checkpoint, strict=False)
-            else:
-                logger.warning(f"Checkpoint not found at {ckpt_path}, using random weights")
-
-            self.model.to(self.device)
-            self.model.eval()
-            self.model_loaded = True
-
-            logger.info("Model loaded successfully!")
-            return "✅ Model loaded successfully!"
-
-        except Exception as e:
-            logger.error(f"Error loading model: {str(e)}")
-            return f"❌ Error loading model: {str(e)}"
 
-    def create_demo_dataset(self, text: str, emotion: str, reference_audio: Optional[str] = None) -> str:
-        """Create a temporary dataset file for inference"""
-        demo_data = {
-            "source_text": text,
-            "target_text": text,
-            "emotion": emotion,
-            "reference_audio": reference_audio or "",
-            "key": "demo_sample"
         }
 
-        # Create temporary file
-        temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False)
-        temp_file.write(json.dumps(demo_data) + '\n')
-        temp_file.close()
 
-        return temp_file.name
 
-    def _generate_demo_audio(self, text: str, emotion: str) -> Tuple[Optional[str], str]:
-        """Generate demo audio when SLAM-LLM is not available"""
         try:
-            logger.info(f"Generating demo audio for text: '{text}' with emotion: '{emotion}'")
 
-            # Create temporary directory for output
-            with tempfile.TemporaryDirectory() as temp_dir:
-                # Generate different tones based on emotion
-                emotion_frequencies = {
-                    "neutral": 440,     # A note
-                    "happy": 523,       # C note (higher, brighter)
-                    "sad": 349,         # F note (lower, darker)
-                    "angry": 659,       # E note (higher, sharper)
-                    "fearful": 311,     # D# note (lower, tense)
-                    "disgusted": 392,   # G note (mid, harsh)
-                    "surprised": 554    # C# note (higher, excited)
-                }
-
-                frequency = emotion_frequencies.get(emotion, 440)
-                sample_rate = 22050
-                duration = min(len(text) * 0.1, 5.0)  # Duration based on text length, max 5 seconds
-
-                # Generate audio with emotion-based characteristics
-                t = np.linspace(0, duration, int(sample_rate * duration), False)
-
-                if emotion == "happy":
-                    # Bright, upbeat tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
-                    audio_data += np.sin(2 * np.pi * frequency * 2 * t) * 0.1  # Harmonic
-                elif emotion == "sad":
-                    # Slow, melancholic tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.2
-                    audio_data *= np.exp(-t * 0.5)  # Fade out
-                elif emotion == "angry":
-                    # Sharp, aggressive tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.4
-                    audio_data += np.sin(2 * np.pi * frequency * 1.5 * t) * 0.2  # Distortion
-                elif emotion == "fearful":
-                    # Tense, trembling tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.25
-                    audio_data *= (1 + 0.1 * np.sin(2 * np.pi * 10 * t))  # Tremolo
-                elif emotion == "disgusted":
-                    # Harsh, grating tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
-                    audio_data += np.sin(2 * np.pi * frequency * 3 * t) * 0.15  # Harsh harmonic
-                elif emotion == "surprised":
-                    # Quick, excited tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.35
-                    audio_data += np.sin(2 * np.pi * frequency * 2.5 * t) * 0.1  # Excitement
-                else:  # neutral
-                    # Clean, balanced tone
-                    audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
-
-                # Add some variation to make it more interesting
-                audio_data += np.random.normal(0, 0.01, len(audio_data))  # Slight noise
-
-                # Save to temporary file
-                output_path = os.path.join(temp_dir, "demo_speech.wav")
-                sf.write(output_path, audio_data, sample_rate)
-
-                return output_path, f"🎵 Demo audio generated!\nText: '{text}'\nEmotion: {emotion}\nNote: This is a demo audio. For real speech synthesis, please ensure SLAM-LLM modules are available."
 
         except Exception as e:
-            logger.error(f"Error generating demo audio: {str(e)}")
-            return None, f"❌ Error generating demo audio: {str(e)}"
 
-    def generate_speech(self,
-                        text: str,
-                        emotion: str = "neutral",
-                        reference_audio: Optional[str] = None) -> Tuple[Optional[str], str]:
-        """Generate speech from text with specified emotion"""
-        if not self.model_loaded and SLAM_LLM_AVAILABLE:
-            return None, "❌ Model not loaded. Please load the model first."
-
-        if not text.strip():
-            return None, "❌ Please enter some text to convert."
-
-        # Demo mode - generate placeholder audio
-        if not SLAM_LLM_AVAILABLE:
-            return self._generate_demo_audio(text, emotion)
-
-        try:
-            logger.info(f"Generating speech for text: '{text}' with emotion: '{emotion}'")
-            start_time = time.time()
-
-            # Create temporary dataset file
-            dataset_file = self.create_demo_dataset(text, emotion, reference_audio)
 
-            try:
-                # Update dataset config with the temporary file
-                self.dataset_config.val_data_path = dataset_file
-                self.dataset_config.train_data_path = dataset_file
-
-                # Get dataset
-                dataset = get_preprocessed_dataset(
-                    self.tokenizer,
-                    self.dataset_config,
-                    split="test",
-                )
-
-                # Create dataloader
-                dataloader = torch.utils.data.DataLoader(
-                    dataset,
-                    num_workers=0,  # Use 0 for HF Space
-                    pin_memory=True,
-                    shuffle=False,
-                    batch_size=1,
-                    drop_last=False,
-                    collate_fn=dataset.collator
-                )
-
-                # Generate speech
-                with torch.no_grad():
-                    for batch in dataloader:
-                        # Move batch to device
-                        for key in batch.keys():
-                            if isinstance(batch[key], torch.Tensor):
-                                batch[key] = batch[key].to(self.device)
-
-                        # Get audio prompt path
-                        audio_prompt_path = batch.get("neutral_speaker_wav", [None])[0]
-
-                        # Generate using the model
-                        model_outputs = self.model.generate(**batch, **self.decode_config)
-
-                        # Extract outputs
-                        code_layer = self.model_config.vocab_config.code_layer
-                        text_outputs = model_outputs[code_layer]
-                        audio_outputs = model_outputs[:code_layer]
-
-                        # Decode text
-                        output_text = self.tokenizer.decode(text_outputs, add_special_tokens=False, skip_special_tokens=True)
-                        logger.info(f"Generated text: {output_text}")
-
-                        # Decode audio
-                        if not self.decode_config.decode_text_only:
-                            audio_tokens = [audio_outputs[layer] for layer in range(code_layer)] if code_layer > 0 else audio_outputs
 
-                            # Create temporary directory for output
-                            with tempfile.TemporaryDirectory() as temp_dir:
-                                audio_hat = audio_decode_cosyvoice(
-                                    audio_tokens,
-                                    self.model_config,
-                                    self.codec_decoder,
-                                    audio_prompt_path,
-                                    code_layer,
-                                    self.dataset_config.num_latency_tokens,
-                                    speed=1.0
-                                )
-
-                                if audio_hat is not None:
-                                    # Save audio
-                                    output_path = os.path.join(temp_dir, "generated_speech.wav")
-                                    sf.write(output_path, audio_hat.squeeze().cpu().numpy(), 22050)
-
-                                    end_time = time.time()
-                                    generation_time = end_time - start_time
-                                    audio_length = audio_hat.shape[1] / 22050
-
-                                    return output_path, f"✅ Generated speech successfully!\nText: {output_text}\nGeneration time: {generation_time:.2f}s\nAudio length: {audio_length:.2f}s"
-                                else:
-                                    return None, "❌ Failed to decode audio tokens"
-                        else:
-                            end_time = time.time()
-                            generation_time = end_time - start_time
-                            return None, f"✅ Generated text successfully!\nText: {output_text}\nGeneration time: {generation_time:.2f}s"
-
-                        break  # Only process first batch
 
-            finally:
-                # Clean up temporary dataset file
-                if os.path.exists(dataset_file):
-                    os.unlink(dataset_file)
-
-        except Exception as e:
-            logger.error(f"Error generating speech: {str(e)}")
-            return None, f"❌ Error generating speech: {str(e)}"
-
-# Initialize the demo
-demo_app = EmoVoiceHFDemo()
-
-def download_models_interface():
-    """Interface for downloading models"""
-    return demo_app.download_models()
-
-def load_model_interface():
-    """Interface for loading the model"""
-    return demo_app.load_model()
-
-def generate_speech_interface(text, emotion):
-    """Interface for generating speech"""
-    audio_path, message = demo_app.generate_speech(text, emotion, None)
-    return audio_path, message
-
-# Create Gradio interface
-def create_demo():
-    with gr.Blocks(title="Voice AI ", theme=gr.themes.Soft()) as demo:
-        gr.Markdown("""
-        # 🎭 Voice AI
-        **LLM-based Emotional Text-To-Speech Model with Freestyle Text Prompting**
-
-        This demo allows you to generate emotional speech from text using the EmoVoice model.
-        """)
-
-        with gr.Tab("Setup"):
-            gr.Markdown("### Model Setup")
-            gr.Markdown("""
-            **Step 1:** Download models from Hugging Face Hub (Auto-login enabled)
-            **Step 2:** Load the EmoVoice model
-            **Note:** This may take a few minutes on first run.
-            """)
-
-            with gr.Row():
-                download_btn = gr.Button("📥 Download Models", variant="secondary", size="lg")
-                load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
-
-            setup_status = gr.Textbox(label="Setup Status", interactive=False, lines=4)
-
-            download_btn.click(
-                fn=download_models_interface,
-                outputs=[setup_status]
-            )
-
-            load_btn.click(
-                fn=load_model_interface,
-                outputs=[setup_status]
-            )
-
-        with gr.Tab("Generate Speech"):
-            gr.Markdown("### Generate Emotional Speech")
-
-            with gr.Row():
-                with gr.Column():
-                    text_input = gr.Textbox(
-                        label="Text to Convert",
-                        placeholder="Enter the text you want to convert to speech...",
-                        lines=4,
-                        info="Enter the text you want to convert to speech"
-                    )
-
-                    emotion = gr.Dropdown(
-                        choices=["neutral", "happy", "sad", "angry", "fearful", "disgusted", "surprised"],
-                        value="neutral",
-                        label="Emotion",
-                        info="Select the emotion for the generated speech"
                     )
 
-                    # reference_audio = gr.Audio(
-                    #     label="Reference Audio (Optional)",
-                    #     type="filepath",
-                    #     info="Optional reference audio for voice cloning"
-                    # )
 
-                    generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
 
-                with gr.Column():
-                    output_audio = gr.Audio(label="Generated Speech", type="filepath")
-                    output_message = gr.Textbox(label="Generation Status", interactive=False, lines=5)
 
             generate_btn.click(
-                fn=generate_speech_interface,
-                inputs=[text_input, emotion],
-                outputs=[output_audio, output_message]
             )
-
-        with gr.Tab("Examples"):
-            gr.Markdown("### Example Texts and Emotions")
-
-            examples = [
-                ["Hello, how are you today? I hope you're having a wonderful day!", "happy"],
-                ["I'm feeling really sad about this situation. It's been very difficult.", "sad"],
-                ["This is absolutely ridiculous! I can't believe what's happening!", "angry"],
-                ["Oh my goodness, I can't believe it! This is amazing!", "surprised"],
-                ["I'm so scared right now. I don't know what to do.", "fearful"],
-                ["That's disgusting! I can't stand the smell.", "disgusted"],
-                ["The weather is nice today. It's a beautiful day for a walk.", "neutral"]
-            ]
 
-            gr.Examples(
-                examples=examples,
-                inputs=[text_input, emotion],
-                label="Click to use example"
            )
 
-        with gr.Tab("About"):
-            gr.Markdown("""
-            ## About EmoVoice
-
-            EmoVoice is an advanced LLM-based Emotional Text-To-Speech model that can generate high-quality speech with various emotions from text input.
-
-            ### Key Features:
-            - **🎯 Freestyle Text Prompting**: Generate speech from any text input
-            - **😊 Multiple Emotions**: Support for 7 different emotions
-            - **🎵 High Quality**: Based on advanced LLM and codec technologies
-            - **🎤 Reference Audio**: Optional reference audio for voice cloning
-            - **⚡ Real-time**: Fast generation with RTF < 1.0
-
-            ### Citation:
-            ```
-            @article{yang2025emovoice,
-              title={EmoVoice: LLM-based Emotional Text-To-Speech Model with Freestyle Text Prompting},
-              author={Yang, Guanrou and Yang, Chen and Chen, Qian and Ma, Ziyang and Chen, Wenxi and Wang, Wen and Wang, Tianrui and Yang, Yifan and Niu, Zhikang and Liu, Wenrui and others},
-              journal={arXiv preprint arXiv:2504.12867},
-              year={2025}
-            }
-            ```
-
-            **Paper**: https://arxiv.org/abs/2504.12867
-            **GitHub**: https://github.com/yhaha0908/EmoVoice
-            **Hugging Face**: https://huggingface.co/yhaha/EmoVoice
-            """)
-
-    return demo
 
 if __name__ == "__main__":
-    demo = create_demo()
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False,
-        debug=True,
-        show_error=True
-    )
 
+#!/usr/bin/env python3
+"""
+EmoVoice Hugging Face Space Demo
+LLM-based Emotional Text-To-Speech Model
+"""
+
 import gradio as gr
 import torch
 import soundfile as sf
 import json
 import time
 from huggingface_hub import hf_hub_download, login
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 
 class EmoVoiceHFDemo:
     def __init__(self):
         self.hf_token = "hf_iJNtPHacizBjWZAuvAvgHwWNd"+"WUkcwXLzE"
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
         # Auto login to Hugging Face
         self._auto_login()
+
+        logger.info("🎭 EmoVoice Hugging Face Demo initialized")
+        logger.info(f"🔧 Device: {self.device}")
 
     def _auto_login(self):
         """Auto login to Hugging Face with token"""
         except Exception as e:
             logger.warning(f"⚠️ Failed to auto-login to Hugging Face: {e}")
             logger.warning("App will continue in demo mode")
 
+    def generate_demo_audio(self, text: str, emotion: str) -> Tuple[np.ndarray, int]:
+        """Generate demo audio with emotion (placeholder function)"""
+        emotion_frequencies = {
+            "neutral": 440,    # A note
+            "happy": 523,      # C note
+            "sad": 349,        # F note
+            "angry": 659,      # E note
+            "fearful": 311,    # D# note
+            "disgusted": 392,  # G note
+            "surprised": 554   # C# note
         }
 
+        frequency = emotion_frequencies.get(emotion, 440)
+        sample_rate = 22050
+        duration = min(len(text) * 0.1, 3.0)  # Max 3 seconds
+
+        # Generate sine wave
+        t = np.linspace(0, duration, int(sample_rate * duration), False)
+        audio_data = np.sin(2 * np.pi * frequency * t) * 0.3
+
+        # Add emotion-specific characteristics
+        if emotion == "happy":
+            # Add harmonics for happy
+            audio_data += 0.1 * np.sin(2 * np.pi * frequency * 2 * t)
+        elif emotion == "sad":
+            # Lower amplitude for sad
+            audio_data *= 0.7
+        elif emotion == "angry":
+            # Add noise for angry
+            audio_data += 0.05 * np.random.normal(0, 1, len(audio_data))
+        elif emotion == "fearful":
+            # Tremolo effect for fearful
+            audio_data *= (1 + 0.3 * np.sin(2 * np.pi * 5 * t))
+        elif emotion == "disgusted":
+            # Lower frequency for disgusted
+            audio_data *= 0.8
+        elif emotion == "surprised":
+            # Quick attack for surprised
+            attack_samples = int(0.1 * sample_rate)
+            audio_data[:attack_samples] *= np.linspace(0, 1, attack_samples)
 
+        return audio_data, sample_rate
 
+    def generate_speech(self, text: str, emotion: str) -> Optional[str]:
+        """Generate speech and return audio file path"""
         try:
+            logger.info(f"Generating speech: '{text}' with emotion '{emotion}'")
 
+            if not text.strip():
+                return None
+
+            # Generate demo audio
+            audio_data, sample_rate = self.generate_demo_audio(text, emotion)
+
+            # Save to temporary file
+            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
+                sf.write(tmp.name, audio_data, sample_rate)
+                logger.info(f"Generated audio: {tmp.name}")
+                return tmp.name
 
         except Exception as e:
+            logger.error(f"Error generating speech: {e}")
+            return None
 
+    def create_interface(self):
+        """Create the Gradio interface"""
+        with gr.Blocks(
+            title="EmoVoice Demo",
+            theme=gr.themes.Soft(),
+            css="""
+            .gradio-container {
+                max-width: 1200px !important;
+                margin: auto !important;
+            }
+            .main-header {
+                text-align: center;
+                margin-bottom: 2rem;
+            }
+            .emotion-grid {
+                display: grid;
+                grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
+                gap: 10px;
+                margin: 1rem 0;
+            }
+            """
+        ) as demo:
+
+            # Header
+            gr.HTML("""
+            <div class="main-header">
+                <h1>🎭 EmoVoice Demo</h1>
+                <p><strong>LLM-based Emotional Text-To-Speech Model</strong></p>
+                <p>Generate emotional speech from text with 7 different emotions</p>
+            </div>
+            """)
 
+            with gr.Tabs():
+                # Main Generation Tab
+                with gr.Tab("🎵 Generate Speech"):
+                    with gr.Row():
+                        with gr.Column(scale=2):
+                            text_input = gr.Textbox(
+                                label="Text to Convert",
+                                placeholder="Enter text to convert to speech...",
+                                value="Hello world! This is a demo of EmoVoice emotional text-to-speech.",
+                                lines=4,
+                                max_lines=10
+                            )
+
+                            # Emotion selection with better UI
+                            gr.Markdown("### Choose Emotion")
+                            emotion_radio = gr.Radio(
+                                choices=[
+                                    ("😊 Happy", "happy"),
+                                    ("😢 Sad", "sad"),
+                                    ("😠 Angry", "angry"),
+                                    ("😨 Fearful", "fearful"),
+                                    ("🤢 Disgusted", "disgusted"),
+                                    ("😲 Surprised", "surprised"),
+                                    ("😐 Neutral", "neutral")
+                                ],
+                                value="neutral",
+                                label="Emotion",
+                                info="Select the emotional tone for your speech"
+                            )
 
+                            generate_btn = gr.Button(
+                                "🎵 Generate Speech",
+                                variant="primary",
+                                size="lg"
+                            )
 
+                        with gr.Column(scale=1):
+                            audio_output = gr.Audio(
+                                label="Generated Audio",
+                                type="filepath",
+                                interactive=False
+                            )
+
+                            # Audio info
+                            audio_info = gr.Textbox(
+                                label="Audio Info",
+                                interactive=False,
+                                visible=False
+                            )
 
+                # Examples Tab
+                with gr.Tab("📝 Examples"):
+                    gr.Markdown("### Try these examples:")
 
+                    examples = [
+                        ["Hello world! How are you today?", "happy"],
+                        ["I'm feeling really sad about this situation.", "sad"],
+                        ["I can't believe this happened! This is outrageous!", "angry"],
+                        ["This is so surprising and unexpected!", "surprised"],
+                        ["I'm scared of what might happen next.", "fearful"],
+                        ["That's disgusting and repulsive.", "disgusted"],
+                        ["The weather is nice today.", "neutral"]
+                    ]
+
+                    gr.Examples(
+                        examples=examples,
+                        inputs=[text_input, emotion_radio],
+                        label="Click any example to try it"
                     )
+
+                # About Tab
+                with gr.Tab("ℹ️ About"):
+                    gr.Markdown("""
+                    ## 🎭 EmoVoice Demo
+
+                    **LLM-based Emotional Text-To-Speech Model**
+
+                    This demo showcases the EmoVoice model's ability to generate emotional speech from text.
+                    The model can express 7 different emotions:
+
+                    - 😊 **Happy**: Cheerful and upbeat tone
+                    - 😢 **Sad**: Melancholic and somber tone
+                    - 😠 **Angry**: Intense and aggressive tone
+                    - 😨 **Fearful**: Anxious and worried tone
+                    - 🤢 **Disgusted**: Repulsed and revolted tone
+                    - 😲 **Surprised**: Excited and astonished tone
+                    - 😐 **Neutral**: Calm and balanced tone
+
+                    ### How to Use:
+                    1. Enter your text in the input box
+                    2. Select an emotion from the options
+                    3. Click "Generate Speech" to create audio
+                    4. Play the generated audio
 
+                    ### Technical Details:
+                    - **Model**: EmoVoice (LLM-based TTS)
+                    - **Sample Rate**: 22050 Hz
+                    - **Format**: WAV
+                    - **Max Duration**: 3 seconds
 
+                    ### Note:
+                    This is a demo version with placeholder audio generation.
+                    The actual EmoVoice model would provide more realistic speech synthesis.
+                    """)
+
+            # Generate function
+            def generate_speech_wrapper(text, emotion):
+                if not text.strip():
+                    return None, "Please enter some text to convert."
 
+                audio_path = self.generate_speech(text, emotion)
+                if audio_path:
+                    # Get audio info
+                    try:
+                        audio_data, sample_rate = sf.read(audio_path)
+                        duration = len(audio_data) / sample_rate
+                        info = f"Duration: {duration:.2f}s | Sample Rate: {sample_rate}Hz | Samples: {len(audio_data)}"
+                        return audio_path, info
+                    except:
+                        return audio_path, "Audio generated successfully"
+                else:
+                    return None, "Error generating audio. Please try again."
 
+            # Connect the generate button
             generate_btn.click(
+                fn=generate_speech_wrapper,
+                inputs=[text_input, emotion_radio],
+                outputs=[audio_output, audio_info]
             )
 
+            # Show audio info when audio is generated
+            audio_output.change(
+                fn=lambda x: gr.Textbox(visible=True) if x else gr.Textbox(visible=False),
+                inputs=[audio_output],
+                outputs=[audio_info]
+            )
 
+        return demo
+
+def main():
+    """Main function to run the demo"""
+    try:
+        # Create demo instance
+        demo_app = EmoVoiceHFDemo()
+
+        # Create interface
+        demo = demo_app.create_interface()
+
+        # Launch the demo
+        logger.info("🚀 Launching EmoVoice Demo...")
+        demo.launch(
+            share=False,
+            server_name="0.0.0.0",
+            server_port=7860,
+            show_error=True,
+            show_tips=True
+        )
+
+    except Exception as e:
+        logger.error(f"Error launching demo: {e}")
+        import traceback
+        traceback.print_exc()
 
 if __name__ == "__main__":
+    main()
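The heart of the rewrite is the placeholder synthesizer: instead of the removed SLAM-LLM/CosyVoice pipeline, the new `app.py` maps each emotion to a base pitch, synthesizes a short sine tone, and writes it to a temporary WAV that `gr.Audio(type="filepath")` can play. Below is a minimal standalone sketch of that idea, runnable without Gradio; the `make_demo_tone` name and the trimmed set of emotion branches are illustrative, not part of the commit.

```python
# Condensed sketch of the commit's generate_demo_audio logic; the function
# name and the reduced branch set are illustrative, not the committed code.
import numpy as np
import soundfile as sf

EMOTION_FREQS = {"neutral": 440, "happy": 523, "sad": 349, "angry": 659,
                 "fearful": 311, "disgusted": 392, "surprised": 554}

def make_demo_tone(text: str, emotion: str, sample_rate: int = 22050) -> np.ndarray:
    freq = EMOTION_FREQS.get(emotion, 440)          # base pitch per emotion
    duration = min(len(text) * 0.1, 3.0)            # ~0.1 s per character, capped at 3 s
    t = np.linspace(0, duration, int(sample_rate * duration), False)
    audio = np.sin(2 * np.pi * freq * t) * 0.3      # base sine tone
    if emotion == "happy":
        audio += 0.1 * np.sin(2 * np.pi * freq * 2 * t)   # add a second harmonic
    elif emotion == "fearful":
        audio *= 1 + 0.3 * np.sin(2 * np.pi * 5 * t)      # 5 Hz tremolo
    return audio

if __name__ == "__main__":
    # Same WAV format and 22050 Hz sample rate as the diff above.
    sf.write("demo.wav", make_demo_tone("Hello world!", "happy"), 22050)
```

Writing to a file rather than returning the array mirrors the diff's design choice: `gr.Audio(type="filepath")` expects a path, so the app keeps `delete=False` on the temp file and hands Gradio the name.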
requirements.txt CHANGED
@@ -1,144 +1,7 @@
-# Core dependencies for EmoVoice Hugging Face Space
 gradio>=4.0.0
 torch>=2.0.0
 torchaudio>=2.0.0
 soundfile>=0.12.0
 numpy>=1.21.0
-librosa>=0.10.0
+huggingface-hub>=0.16.0
 scipy>=1.9.0
-matplotlib>=3.5.0
-
-# Hugging Face dependencies
-huggingface-hub>=0.25.0
-transformers>=4.43.0
-tokenizers>=0.19.0
-safetensors>=0.4.0
-
-# Audio processing
-pydub>=0.25.0
-webrtcvad>=2.0.0
-
-# Additional dependencies
-accelerate>=0.34.0
-datasets>=3.3.0
-omegaconf>=2.3.0
-hydra-core>=1.3.0
-einops>=0.8.0
-rotary-embedding-torch>=0.8.0
-
-# Web interface
-fastapi>=0.100.0
-uvicorn>=0.20.0
-starlette>=0.37.0
-
-# Utilities
-tqdm>=4.66.0
-requests>=2.32.0
-packaging>=24.0
-typing-extensions>=4.12.0
-
-# EmoVoice specific dependencies
-absl-py>=2.1.0
-addict>=2.4.0
-aiofiles>=23.2.1
-anyio>=4.6.2.post1
-asttokens>=2.4.1
-async-timeout>=4.0.3
-attrs>=24.2.0
-bitstring>=4.2.3
-blinker>=1.8.2
-Brotli>=1.1.0
-cachetools>=4.2.4
-cffi>=1.17.1
-charset-normalizer>=3.4.0
-click>=8.1.7
-colorama>=0.4.6
-coloredlogs>=15.0.1
-conformer>=0.3.2
-cycler>=0.12.1
-Cython>=3.0.11
-debugpy>=1.8.8
-decorator>=5.1.1
-deepspeed>=0.14.2
-diffusers>=0.27.2
-dill>=0.3.8
-editdistance>=0.8.1
-exceptiongroup>=1.2.2
-executing>=2.1.0
-ffmpy>=0.4.0
-filelock>=3.16.1
-fire>=0.6.0
-frozenlist>=1.4.1
-fsspec>=2024.2.0
-gdown>=5.1.0
-gitdb>=4.0.11
-GitPython>=3.1.43
-HyperPyYAML>=1.2.2
-importlib_metadata>=8.5.0
-jsonschema>=4.23.0
-kaldiio>=2.18.0
-lightning>=2.2.4
-lightning-utilities>=0.11.8
-loralib>=0.1.2
-Mako>=1.3.5
-Markdown>=3.7
-modelscope>=1.18.1
-more-itertools>=10.5.0
-mpmath>=1.3.0
-msgpack>=1.1.0
-multidict>=6.1.0
-multiprocess>=0.70.16
-networkx>=3.1
-ninja>=1.11.1.1
-numba>=0.60.0
-oauthlib>=3.2.2
-onnx>=1.16.0
-onnxruntime-gpu>=1.16.0
-openai-whisper
-orjson>=3.10.11
-pandas>=2.2.2
-protobuf>=4.25.5
-pydantic>=2.0.0,<2.4.0
-pydantic_core>=2.0.0,<2.14.0
-pydash>=8.0.3
-Pygments>=2.18.0
-PyJWT>=2.9.0
-pynini>=2.1.5
-pynndescent>=0.5.13
-pynvml>=11.5.3
-pyparsing>=3.2.0
-python-dateutil>=2.9.0.post0
-python-dotenv>=1.0.1
-python-multipart>=0.0.17
-pytorch-lightning>=2.4.0
-pytorch-wpe>=0.0.1
-pytz>=2024.2
-PyYAML>=6.0.2
-rdflib>=7.1.3
-referencing>=0.35.1
-regex>=2024.9.11
-rotary-embedding-torch>=0.8.6
-scikit-learn>=1.5.2
-segments>=2.3.0
-sentence-transformers>=4.0.1
-sentencepiece>=0.2.0
-stack-data>=0.6.3
-tensorboard>=2.14.0
-tensorboardX>=2.6.2.2
-torchmetrics>=1.6.0
-torchvision>=0.19.1
-triton>=3.0.0
-typeguard>=4.4.1
-typer>=0.13.0
-ujson>=5.10.0
-urllib3>=2.2.3
-uvloop>=0.21.0
-wandb>=0.18.1
-websockets>=11.0.3
-WeTextProcessing>=1.0.3
-wget>=3.2
-whisper_normalizer>=0.0.10
-wrapt>=1.16.0
-zipp>=3.21.0
-peft
-funasr
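Since the commit trims requirements.txt from 144 pinned packages to the seven the demo actually uses, a quick sanity check is to install the file and import each remaining runtime dependency. The snippet below is a hypothetical smoke test, not part of the commit.

```python
# Hypothetical smoke test for the trimmed requirements.txt (not in the commit).
# Run after: pip install -r requirements.txt
import importlib

for name in ("gradio", "torch", "torchaudio", "soundfile",
             "numpy", "huggingface_hub", "scipy"):
    importlib.import_module(name)  # raises ImportError if a dependency is missing
print("All runtime dependencies import cleanly.")
```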