# Multi-language voice translator (Hugging Face Space demo).
# NOTE: page-scrape chrome (Space status, file size, commit hashes, line-number
# gutter) removed from the top of this file.
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, AutoModelForSeq2SeqLM, AutoTokenizer
import torchaudio
import torch
from datasets import load_dataset
import os
# ---------------------------------------------------------------------------
# Model setup (runs once at import time).
# ---------------------------------------------------------------------------
ASR_MODEL = "openai/whisper-tiny"  # Faster ASR model
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-en-mul"  # Lightweight translation model

# Load ASR pipeline; use GPU 0 when available, otherwise CPU (-1).
from transformers import pipeline
_DEVICE = 0 if torch.cuda.is_available() else -1
asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, device=_DEVICE)

# Load translation model and tokenizer.
translator_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL)
translator_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL)

# TTS processor and model. Half precision is only a win on CUDA; fp16
# inference on CPU is unsupported for many ops and slower overall, so the
# dtype is chosen per device instead of unconditionally casting to float16.
_TTS_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(_TTS_DTYPE)

# Cache speaker embeddings once so they are not reloaded on every request.
# The embedding dtype must match the TTS model's dtype.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(_TTS_DTYPE)

# Ensure the output directory for generated audio / braille files exists.
os.makedirs("output", exist_ok=True)
# opus-mt-en-mul is a multilingual model: the target language is selected by
# prefixing the source text with a ">>xxx<<" token (ISO 639-3 codes). The UI
# exposes two-letter codes, so map them here.
_MUL_LANG_TOKENS = {
    "en": ">>eng<<", "hi": ">>hin<<", "kn": ">>kan<<",
    "ta": ">>tam<<", "te": ">>tel<<", "es": ">>spa<<",
    "de": ">>deu<<", "fr": ">>fra<<", "hu": ">>hun<<",
}

def process_audio(audio, target_language):
    """Transcribe an audio file, translate the text, and synthesize speech.

    Args:
        audio: Path to the uploaded audio file (Gradio ``type="filepath"``).
        target_language: Two-letter language code chosen in the UI dropdown.

    Returns:
        Tuple of (translated text, path to generated WAV, path to braille
        text file); on failure the text slot carries an error message and
        the file slots are ``None``.
    """
    if not audio:
        return "Error: No audio file provided.", None, None
    try:
        # Step 1: Transcribe the audio.
        result = asr(audio)["text"]
        if not result:
            return "Error: Failed to transcribe audio.", None, None
        # Step 2: Translate the text. The original code ignored
        # target_language entirely; the ">>xxx<<" prefix is what actually
        # selects the output language for this model.
        lang_token = _MUL_LANG_TOKENS.get(target_language, "")
        tagged_text = f"{lang_token} {result}".strip()
        inputs = translator_tokenizer(tagged_text, return_tensors="pt", padding=True)
        outputs = translator_model.generate(**inputs)
        translated_text = translator_tokenizer.decode(outputs[0], skip_special_tokens=True)
        if not translated_text:
            return "Error: Translation failed.", None, None
        # Step 3: Generate speech from the translated text. For text input
        # the SpeechT5 processor produces token ids (int64) under
        # "input_ids" — there is no "input_features" attribute, and ids
        # must NOT be cast to float16.
        inputs = processor(text=translated_text, return_tensors="pt")
        input_ids = inputs["input_ids"]
        with torch.no_grad():
            # Match the speaker embedding dtype to the model's dtype so this
            # works whether the model was loaded in fp16 or fp32.
            speech = tts.generate_speech(input_ids, speaker_embeddings.to(tts.dtype))
        # Save generated speech. SpeechT5 synthesizes 16 kHz audio (not
        # 24 kHz), and torchaudio.save expects a (channels, time) tensor.
        output_audio_path = "output/generated_speech.wav"
        torchaudio.save(output_audio_path, speech.unsqueeze(0).float().cpu(), 16000)
        # Step 4: Create a Braille-compatible (plain UTF-8 text) file.
        braille_output_path = "output/braille.txt"
        with open(braille_output_path, "w", encoding="utf-8") as f:
            f.write(translated_text)
        return translated_text, output_audio_path, braille_output_path
    except Exception as e:
        # Top-level boundary for the Gradio callback: surface the error in
        # the UI instead of crashing the app.
        return f"Error: {str(e)}", None, None
# ---------------------------------------------------------------------------
# Gradio interface. Layout is built top-to-bottom: input row, action row,
# output row. (A stray " |" page-scrape artifact after demo.launch() was a
# SyntaxError and has been removed.)
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Language Voice Translator")
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload Audio")
        target_language = gr.Dropdown(
            choices=["en", "hi", "kn", "ta", "te", "es", "de", "fr", "hu"],
            value="en",
            label="Target Language"
        )
    with gr.Row():
        submit_button = gr.Button("Translate & Synthesize")
        clear_button = gr.Button("Clear")
    with gr.Row():
        translated_text = gr.Textbox(label="Translated Text")
        generated_speech = gr.Audio(label="Generated Speech", interactive=False)
        braille_file = gr.File(label="Download Braille File")
    # Wire the buttons: submit runs the full pipeline, clear resets outputs.
    submit_button.click(
        fn=process_audio,
        inputs=[audio_input, target_language],
        outputs=[translated_text, generated_speech, braille_file],
    )
    clear_button.click(
        fn=lambda: ("", None, None),
        inputs=[],
        outputs=[translated_text, generated_speech, braille_file],
    )
# Launch the app
demo.launch()