Bhavibond committed on
Commit
d1caf5e
·
verified ·
1 Parent(s): b9e1e85

Load the speaker embeddings directly from the dataset

Browse files
Files changed (1) hide show
  1. app.py +25 -6
app.py CHANGED
@@ -1,12 +1,22 @@
1
  import gradio as gr
2
- from transformers import pipeline
 
3
  import torch
 
4
  import os
5
 
6
- # Initialize models
7
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul", device=torch.device('cpu'))
8
  asr = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=torch.device('cpu'))
9
- tts = pipeline("text-to-speech", model="microsoft/speecht5_tts", device=torch.device('cpu'))
 
 
 
 
 
 
 
 
10
 
11
  # Ensure cache directory for output files
12
  os.makedirs("output", exist_ok=True)
@@ -25,9 +35,14 @@ def process_audio(audio, target_language):
25
  translated_text = translated_text.get('translation_text', '')
26
 
27
  # Step 3: Generate speech from translated text
28
- speech = tts(translated_text)
 
 
 
 
 
29
  output_audio_path = "output/generated_speech.wav"
30
- speech["audio"].save(output_audio_path)
31
 
32
  # Step 4: Create Braille-compatible file
33
  braille_output_path = "output/braille.txt"
@@ -46,7 +61,11 @@ with gr.Blocks() as demo:
46
 
47
  with gr.Row():
48
  audio_input = gr.Audio(type="filepath", label="Upload Audio")
49
- target_language = gr.Dropdown(choices=["en", "hi", "kn", "ta", "te", "es", "de", "fr", "hu"], value="en", label="Target Language")
 
 
 
 
50
 
51
  with gr.Row():
52
  submit_button = gr.Button("Submit")
 
import gradio as gr
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech
import torchaudio
import torch
from datasets import load_dataset
import os

# Load ASR and Translation models (CPU-only pipelines).
# opus-mt-en-mul is a one-to-many English->multilingual translation model;
# whisper-small transcribes the uploaded audio.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul", device=torch.device('cpu'))
asr = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=torch.device('cpu'))

# Load TTS model and processor.
# NOTE(review): the downstream generate_speech call passes no SpeechT5HifiGan
# vocoder; without one, generate_speech returns a mel spectrogram rather than a
# waveform, which torchaudio.save cannot write as audio — confirm and fix there.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Load speaker embeddings from dataset.
# Downloads the whole validation split just to pick a single x-vector;
# index 7306 is the commonly-used example speaker from the HF SpeechT5 docs.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = embeddings_dataset[7306]["xvector"]  # Example speaker embedding
speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)  # Reshape for the model: (1, embedding_dim)

# Ensure cache directory for output files
os.makedirs("output", exist_ok=True)
 
35
  translated_text = translated_text.get('translation_text', '')
36
 
37
  # Step 3: Generate speech from translated text
38
+ inputs = processor(translated_text, return_tensors="pt")
39
+
40
+ with torch.no_grad():
41
+ speech = tts.generate_speech(inputs["input_ids"], speaker_embeddings)
42
+
43
+ # Save generated speech
44
  output_audio_path = "output/generated_speech.wav"
45
+ torchaudio.save(output_audio_path, speech, 24000)
46
 
47
  # Step 4: Create Braille-compatible file
48
  braille_output_path = "output/braille.txt"
 
61
 
62
  with gr.Row():
63
  audio_input = gr.Audio(type="filepath", label="Upload Audio")
64
+ target_language = gr.Dropdown(
65
+ choices=["en", "hi", "kn", "ta", "te", "es", "de", "fr", "hu"],
66
+ value="en",
67
+ label="Target Language"
68
+ )
69
 
70
  with gr.Row():
71
  submit_button = gr.Button("Submit")