Hyprlyf committed on
Commit
f4b2387
·
verified ·
1 Parent(s): 032c086

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -52
app.py CHANGED
@@ -1,56 +1,17 @@
1
- import gradio as gr
2
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
3
  import torch
4
- import soundfile as sf
5
- import numpy as np
6
 
7
- # Device configuration
8
- device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
9
 
10
- # Languages & models
11
- # Note: SpeechT5 primarily English, other languages may require Roman transliteration
12
- languages = {
13
- "English": "microsoft/speecht5_tts",
14
- "Hindi": "microsoft/speecht5_tts",
15
- "Urdu": "microsoft/speecht5_tts",
16
- "Arabic": "microsoft/speecht5_tts",
17
- "Turkish": "microsoft/speecht5_tts",
18
- "Persian": "microsoft/speecht5_tts",
19
- "Malay": "microsoft/speecht5_tts"
20
- }
21
 
22
- # Load models (CPU-friendly)
23
- tts_pipelines = {}
24
- for lang, model_name in languages.items():
25
- processor = SpeechT5Processor.from_pretrained(model_name)
26
- model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)
27
- tts_pipelines[lang] = {"processor": processor, "model": model}
28
-
29
- # Text-to-Speech function
30
- def text_to_speech(text, language):
31
- processor = tts_pipelines[language]["processor"]
32
- model = tts_pipelines[language]["model"]
33
-
34
- inputs = processor(text=text, return_tensors="pt").to(device)
35
- with torch.no_grad():
36
- speech = model.generate_speech(inputs["input_ids"], speaker_embeddings=None)
37
-
38
- audio_np = speech.squeeze().cpu().numpy()
39
- samplerate = processor.feature_extractor.sampling_rate
40
-
41
- sf.write("output.wav", audio_np, samplerate)
42
- return (audio_np, samplerate)
43
-
44
- # Gradio Interface
45
- iface = gr.Interface(
46
- fn=text_to_speech,
47
- inputs=[
48
- gr.Textbox(lines=2, placeholder="Type your text here..."),
49
- gr.Dropdown(list(languages.keys()), label="Select Language")
50
- ],
51
- outputs=gr.Audio(type="numpy", autoplay=True),
52
- title="Multi-Language TTS (SpeechT5)",
53
- description="Type text, select language, and get speech output. Roman transliteration recommended for non-English languages."
54
- )
55
-
56
- iface.launch()
 
1
+ from transformers import SpeechT5HifiGan
 
2
  import torch
 
 
3
 
4
+ # Load TTS model
5
+ model_name = "microsoft/speecht5_tts"
6
+ processor = SpeechT5Processor.from_pretrained(model_name)
7
+ model = SpeechT5ForTextToSpeech.from_pretrained(model_name).to(device)
8
 
9
+ # Load default HiFi-GAN vocoder
10
+ vocoder = SpeechT5HifiGan.from_pretrained(model_name)
 
 
 
 
 
 
 
 
 
11
 
12
+ # Example speaker embedding
13
+ # Hugging Face dataset example: cmu-arctic-xvectors
14
+ # Here we use the first speaker x-vector from 'Matthijs/cmu-arctic-xvectors'
15
+ from datasets import load_dataset
16
+ dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="train[:1]")
17
+ speaker_embedding = torch.tensor(dataset[0]["xvector"]).unsqueeze(0).to(device)