# File size: 4,449 Bytes
# 18ab614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import os
import numpy as np
import gradio as gr
import assemblyai as aai
from translate import Translator
import uuid
from gtts import gTTS
import tempfile
from pathlib import Path

# Target language codes, in the order the UI's output components are wired:
# Russian, Turkish, Swedish, German, Spanish, Japanese.
TARGET_LANGUAGES = ("ru", "tr", "sv", "de", "es", "ja")


def voice_to_voice(audio_file):
    """Transcribe a recording, translate it, and voice every translation.

    Args:
        audio_file: Path of the recorded audio file, or ``None`` when
            nothing was recorded.

    Returns:
        A flat 12-tuple: six ``Path`` objects pointing at the generated
        MP3 files, followed by the six translated strings. Both halves are
        in ``TARGET_LANGUAGES`` order.

    Raises:
        gr.Error: If no audio was supplied, transcription failed, or the
            transcript contains no text.
    """
    if audio_file is None:
        raise gr.Error("Please record some audio before submitting.")

    # Transcribe speech
    transcript = transcribe_audio(audio_file)
    if transcript.status == aai.TranscriptStatus.error:
        raise gr.Error(transcript.error)

    text = transcript.text
    if not text:
        # Nothing to translate; fail with a readable message instead of
        # passing an empty value to the translator and the TTS engine.
        raise gr.Error("No speech was detected in the recording.")

    # Translate text
    translations = translate_text(text)

    # Generate speech from each translation, using the voice that matches
    # the translation's language rather than always the English one.
    audio_paths = [
        Path(text_to_speech(translation, lang=lang))
        for lang, translation in zip(TARGET_LANGUAGES, translations)
    ]

    return (*audio_paths, *translations)


# Function to transcribe audio using AssemblyAI
def transcribe_audio(audio_file):
    """Transcribe *audio_file* with AssemblyAI and return the transcript object.

    The API key is read from the ``ASSEMBLYAI_API_KEY`` environment variable
    when it is set.

    Args:
        audio_file: Audio input handed to ``aai.Transcriber.transcribe``.

    Returns:
        The transcript object returned by AssemblyAI; callers inspect its
        ``status``, ``error`` and ``text`` attributes.
    """
    # NOTE(review): the fallback below is a credential committed to source
    # control. It is kept only so existing deployments keep working; rotate
    # the key and remove the literal once the environment variable is set.
    aai.settings.api_key = os.environ.get(
        "ASSEMBLYAI_API_KEY", "21f30361d02543cca65707e8f71721d8"
    )

    transcriber = aai.Transcriber()
    return transcriber.transcribe(audio_file)


# Function to translate text
def translate_text(text: str) -> list:
    """Translate English *text* into every language in ``TARGET_LANGUAGES``.

    Args:
        text: English source text.

    Returns:
        A list with one translation per entry of ``TARGET_LANGUAGES``, in
        the same order.
    """
    return [
        Translator(from_lang="en", to_lang=lang).translate(text)
        for lang in TARGET_LANGUAGES
    ]


# Function to generate speech with gTTS (Google Text-to-Speech)
def text_to_speech(text: str, lang: str = "en") -> str:
    """Synthesize *text* to an MP3 file with gTTS and return the file path.

    Args:
        text: Text to speak.
        lang: gTTS language code selecting the voice. Defaults to ``"en"``
            so existing single-argument calls behave as before.

    Returns:
        Path of a newly created temporary ``.mp3`` file. The file is not
        deleted automatically (``delete=False``).
    """
    tts = gTTS(text=text, lang=lang, slow=True)

    # Reserve a unique file name, then close the handle before gTTS writes
    # to it: reopening a still-open NamedTemporaryFile fails on Windows.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        audio_path = tmp_file.name
    tts.save(audio_path)

    return audio_path

# Stand-alone microphone recorder that hands the recording over as a file
# path. It is created outside the Blocks layout and is not referenced by any
# other code visible in this file.
input_audio = gr.Audio(
    type="filepath",
    sources=["microphone"],
    waveform_options=gr.WaveformOptions(
        show_controls=False,
        skip_length=2,
        waveform_progress_color="#0066B4",
        waveform_color="#01C6FF",
    ),
    show_download_button=True,
)

with gr.Blocks() as demo:
    # Page header.
    gr.Markdown("## Echo: Voice Translation App")
    gr.Markdown("## Record yourself in English and immediately receive voice translations.")

    # Input area: microphone recorder plus Submit / Clear buttons.
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                type="filepath",
                sources=["microphone"],
                waveform_options=gr.WaveformOptions(
                    show_controls=False,
                    skip_length=2,
                    waveform_progress_color="#0066B4",
                    waveform_color="#01C6FF",
                ),
                show_download_button=True,
            )
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
                btn = gr.ClearButton(audio_input, "Clear")

    # First row of results: Turkish, Swedish, Russian.
    with gr.Row():
        with gr.Group() as turkish:
            tr_output = gr.Audio(interactive=False, label="Turkish")
            tr_text = gr.Markdown()

        with gr.Group() as swedish:
            sv_output = gr.Audio(interactive=False, label="Swedish")
            sv_text = gr.Markdown()

        with gr.Group() as russian:
            ru_output = gr.Audio(interactive=False, label="Russian")
            ru_text = gr.Markdown()

    # Second row of results: German, Spanish, Japanese.
    with gr.Row():
        with gr.Group():
            de_output = gr.Audio(interactive=False, label="German")
            de_text = gr.Markdown()

        with gr.Group():
            es_output = gr.Audio(interactive=False, label="Spanish")
            es_text = gr.Markdown()

        with gr.Group():
            jp_output = gr.Audio(interactive=False, label="Japanese")
            jp_text = gr.Markdown()

    # The order here must match the tuple returned by voice_to_voice:
    # six audio players first, then the six matching text panels.
    output_components = [
        # audio players
        ru_output,
        tr_output,
        sv_output,
        de_output,
        es_output,
        jp_output,
        # translated text
        ru_text,
        tr_text,
        sv_text,
        de_text,
        es_text,
        jp_text,
    ]
    submit.click(
        fn=voice_to_voice,
        inputs=audio_input,
        outputs=output_components,
        show_progress=True,
    )

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()