File size: 3,876 Bytes
8faecdb
b623c58
d1caf5e
8faecdb
d1caf5e
27c4742
8faecdb
b623c58
 
 
d1caf5e
b623c58
 
 
 
 
 
 
 
 
d1caf5e
b623c58
d1caf5e
b623c58
d1caf5e
b623c58
8faecdb
b623c58
27c4742
8faecdb
b623c58
27c4742
b623c58
 
 
8faecdb
27c4742
 
b623c58
e6265b7
 
27c4742
 
b623c58
 
 
 
e6265b7
 
8faecdb
27c4742
e6265b7
b623c58
d1caf5e
 
4a54047
b623c58
d1caf5e
27c4742
b623c58
8faecdb
27c4742
 
 
8faecdb
 
27c4742
 
8faecdb
 
 
27c4742
 
 
 
 
 
d1caf5e
 
 
 
 
27c4742
 
b623c58
27c4742
 
 
 
 
b623c58
27c4742
b623c58
27c4742
 
 
 
 
b623c58
27c4742
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, AutoModelForSeq2SeqLM, AutoTokenizer
import torchaudio
import torch
from datasets import load_dataset
import os

# Load lightweight models
# Model identifiers pinned as module-level constants.
ASR_MODEL = "openai/whisper-tiny"  # Faster ASR model
TRANSLATION_MODEL = "Helsinki-NLP/opus-mt-en-mul"  # Lightweight translation model

# Load ASR model
# NOTE(review): mid-file import — conventionally this belongs with the other
# imports at the top of the file.
from transformers import pipeline
# device=0 selects the first CUDA GPU when available; -1 forces CPU.
asr = pipeline("automatic-speech-recognition", model=ASR_MODEL, device=0 if torch.cuda.is_available() else -1)

# Load translation model and tokenizer
translator_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_MODEL)
translator_tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_MODEL)

# Load TTS processor and model (use float16 for better speed)
# NOTE(review): the cast to float16 is unconditional — on CPU-only hosts
# half-precision inference can be slow or unsupported for some ops; confirm
# this runs on the intended hardware.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(torch.float16)

# Cache speaker embeddings to avoid reloading every time
# Index 7306 selects one fixed speaker x-vector from the CMU Arctic set; its
# dtype is matched to the TTS model's float16 weights.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0).to(torch.float16)

# Ensure output directory exists
os.makedirs("output", exist_ok=True)

# Map the UI's 2-letter language codes to the ">>xxx<<" target-language
# tokens that Helsinki-NLP/opus-mt-en-mul expects as a source-text prefix.
# "en" intentionally has no entry: no token is prepended for English.
_LANG_TOKENS = {
    "hi": ">>hin<<",
    "kn": ">>kan<<",
    "ta": ">>tam<<",
    "te": ">>tel<<",
    "es": ">>spa<<",
    "de": ">>deu<<",
    "fr": ">>fra<<",
    "hu": ">>hun<<",
}

# Lazily-loaded HiFi-GAN vocoder. Without a vocoder, generate_speech()
# returns a mel spectrogram rather than a playable waveform.
_vocoder = None


def _get_vocoder():
    """Load (once) and return the SpeechT5 HiFi-GAN vocoder, matching the TTS model's dtype."""
    global _vocoder
    if _vocoder is None:
        from transformers import SpeechT5HifiGan
        _vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(tts.dtype)
    return _vocoder


# Processing function
def process_audio(audio, target_language):
    """Transcribe *audio*, translate it into *target_language*, and synthesize speech.

    Args:
        audio: filesystem path of the uploaded clip (falsy when nothing was uploaded).
        target_language: 2-letter code from the UI dropdown (e.g. "hi", "fr").

    Returns:
        (translated_text, wav_path, braille_txt_path); on failure the first
        element is an "Error: ..." message and both paths are None.
    """
    if not audio:
        return "Error: No audio file provided.", None, None

    try:
        # Step 1: Transcribe the audio
        transcript = asr(audio)["text"]
        if not transcript:
            return "Error: Failed to transcribe audio.", None, None

        # Step 2: Translate the text
        # BUG FIX: target_language was previously ignored — the multilingual
        # Marian model selects its output language from a ">>xxx<<" prefix.
        lang_token = _LANG_TOKENS.get(target_language, "")
        source_text = f"{lang_token} {transcript}".strip()
        enc = translator_tokenizer(source_text, return_tensors="pt", padding=True)
        generated = translator_model.generate(**enc)
        translated_text = translator_tokenizer.decode(generated[0], skip_special_tokens=True)

        if not translated_text:
            return "Error: Translation failed.", None, None

        # Step 3: Generate speech from translated text
        # BUG FIX: for text input the SpeechT5 processor returns `input_ids`
        # (there is no `input_features` key), and token ids must stay
        # integer-typed — the old float16 cast broke the embedding lookup.
        # A vocoder is passed so we get a waveform, not a spectrogram.
        tts_inputs = processor(text=translated_text, return_tensors="pt")
        with torch.no_grad():
            speech = tts.generate_speech(
                tts_inputs["input_ids"], speaker_embeddings, vocoder=_get_vocoder()
            )

        # Save generated speech
        # BUG FIX: torchaudio.save expects a 2-D (channels, frames) tensor,
        # and SpeechT5 synthesizes at 16 kHz — the old 24000 made playback
        # run at the wrong pitch/speed.
        output_audio_path = "output/generated_speech.wav"
        torchaudio.save(output_audio_path, speech.detach().float().cpu().unsqueeze(0), 16000)

        # Step 4: Create Braille-compatible file
        braille_output_path = "output/braille.txt"
        with open(braille_output_path, "w", encoding="utf-8") as f:
            f.write(translated_text)

        return translated_text, output_audio_path, braille_output_path

    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, and any failure
        # should surface as a message in the textbox instead of crashing.
        return f"Error: {str(e)}", None, None

# Build the Gradio UI: upload audio, pick a target language, and receive the
# translated text, the synthesized speech, and a Braille-ready text file.
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Language Voice Translator")

    with gr.Row():
        audio_in = gr.Audio(type="filepath", label="Upload Audio")
        lang_choice = gr.Dropdown(
            choices=["en", "hi", "kn", "ta", "te", "es", "de", "fr", "hu"],
            value="en",
            label="Target Language",
        )

    with gr.Row():
        run_btn = gr.Button("Translate & Synthesize")
        reset_btn = gr.Button("Clear")

    with gr.Row():
        text_out = gr.Textbox(label="Translated Text")
        speech_out = gr.Audio(label="Generated Speech", interactive=False)
        braille_out = gr.File(label="Download Braille File")

    # Run the full transcribe→translate→synthesize pipeline on click.
    run_btn.click(
        fn=process_audio,
        inputs=[audio_in, lang_choice],
        outputs=[text_out, speech_out, braille_out],
    )

    # Reset all three output widgets.
    reset_btn.click(
        fn=lambda: ("", None, None),
        inputs=[],
        outputs=[text_out, speech_out, braille_out],
    )

# Launch the app
demo.launch()