bunnyyss09 committed on
Commit
6a06834
·
0 Parent(s):

Initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
.gitignore ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore environment files (to protect API keys and credentials)
2
+ .env
3
+
4
+ # Ignore OS-generated files
5
+ .DS_Store
6
+ Thumbs.db
7
+
8
+ # Ignore Python-related files
9
+ __pycache__/
10
+ *.pyc
11
+ *.pyo
12
+
13
+ # Ignore virtual environments (if you're using one)
14
+ venv/
15
+ env/
16
+
17
+ # Ignore logs and temporary files
18
+ *.log
19
+ *.tmp
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Voice-to-Voice Translator
2
+
3
+ ## Overview
4
+
5
+ This project is a voice-to-voice translator that allows users to speak in English and receive real-time translations in multiple languages, along with audio playback. The application leverages advanced AI-based speech recognition, translation, and text-to-speech technologies.
6
+
7
+ ## Technologies Used
8
+
9
+ - **Gradio** – Provides an easy-to-use web interface.
10
+ - **AssemblyAI** – Converts speech to text.
11
+ - **Python Translate Module** – Translates text into multiple languages.
12
+ - **ElevenLabs** – Converts translated text into spoken audio using AI voices.
13
+
14
+ ## Installation
15
+
16
+ 1. Clone this repository:
17
+ ```sh
18
+ git clone https://github.com/yourusername/voice-to-voice-translator.git
19
+ cd voice-to-voice-translator
20
+ ```
21
+ 2. Install dependencies:
22
+ ```sh
23
+ pip install -r requirements.txt
24
+ ```
25
+ 3. Set up your API keys by creating a `.env` file in the root directory:
26
+ ```ini
27
+ ASSEMBLYAI_API_KEY=your_assemblyai_api_key
28
+ ELEVENLABS_API_KEY=your_elevenlabs_api_key
29
+ ```
30
+ 4. Run the application:
31
+ ```sh
32
+ python src/voice_translator.py
33
+ ```
34
+
35
+ ## API Keys Required
36
+
37
+ You need API keys for the following services:
38
+
39
+ - [AssemblyAI API Key](https://www.assemblyai.com/)
40
+ - [ElevenLabs API Key](https://elevenlabs.io/)
41
+
42
+ ## Features
43
+
44
+ - **Speech Recognition**: Converts spoken words into text using AssemblyAI.
45
+ - **Translation**: Uses Python’s `translate` module to support multiple languages.
46
+ - **Text-to-Speech**: ElevenLabs API generates AI-powered speech from translated text.
47
+ - **Real-time Streaming**: Supports streaming audio generation for faster output.
48
+ - **User-Friendly Interface**: Built using Gradio for a simple and interactive UI.
49
+
50
+ ## Sample Images
51
+
52
+  
53
+
54
+ ## Usage
55
+
56
+ - Click on the **Record** button to capture your speech.
57
+ - The system will transcribe, translate, and generate speech output in multiple languages.
58
+ - Download or listen to the translated audio directly from the interface.
requirements.txt ADDED
Binary file (130 Bytes). View file
 
sample_img1.png ADDED
sample_img2.png ADDED
src/images/Germany.png ADDED
src/images/Japanese.png ADDED
src/images/Russia.png ADDED
src/images/Spain.png ADDED
src/images/Sweden.png ADDED
src/images/Turkey.png ADDED
src/voice_translator.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import uuid
3
+ import gradio as gr
4
+ import assemblyai as aai
5
+ from translate import Translator
6
+ from elevenlabs import VoiceSettings
7
+ from elevenlabs.client import ElevenLabs
8
+ from pathlib import Path
9
+ from dotenv import load_dotenv
10
+ from PIL import Image
11
+
12
# Load API credentials from a .env file so keys never live in source control.
load_dotenv()

assemblyai_api_key = os.getenv("ASSEMBLYAI_API_KEY")
elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")

# Configure the AssemblyAI SDK and build the ElevenLabs client.
aai.settings.api_key = assemblyai_api_key
client = ElevenLabs(api_key=elevenlabs_api_key)

# Resolve asset paths relative to this script so the app works from any CWD.
script_dir = Path(__file__).parent
images_dir = script_dir / "images"


def _load_flag(filename):
    """Open a flag image from the images folder, scaled to 80x40 for the UI."""
    return Image.open(images_dir / filename).resize((80, 40))


ru_img = _load_flag("Russia.png")
tr_img = _load_flag("Turkey.png")
sv_img = _load_flag("Sweden.png")
de_img = _load_flag("Germany.png")
es_img = _load_flag("Spain.png")
ja_img = _load_flag("Japanese.png")
34
def voice_to_voice(audio_file, progress=gr.Progress()):
    """Transcribe English speech, translate it into six languages, and
    synthesize audio for each translation.

    Args:
        audio_file: Path to the recorded audio (Gradio ``filepath`` input).
        progress: Gradio progress tracker (injected by Gradio).

    Returns:
        list: Six audio file ``Path`` objects followed by the six translated
        strings, in the order Russian, Turkish, Swedish, German, Spanish,
        Japanese — matching the UI's ``output_components`` order.

    Raises:
        gr.Error: If transcription fails or any other step errors out.
    """
    # Display names aligned with the language codes used by translate_text().
    language_names = ("Russian", "Turkish", "Swedish", "German", "Spanish", "Japanese")
    try:
        # Transcribe speech
        progress(0.1, desc="Transcribing audio...")
        transcript = transcribe_audio(audio_file)

        if transcript.status == aai.TranscriptStatus.error:
            raise gr.Error(f"Transcription failed: {transcript.error}")
        transcript_text = transcript.text

        # Translate text
        progress(0.4, desc="Translating text...")
        list_translations = translate_text(transcript_text)

        # Generate speech for each translation.
        generated_audio_paths = []
        for i, translation in enumerate(list_translations):
            progress(0.5 + (i * 0.1), desc=f"Generating audio for {language_names[i]}...")
            translated_audio_file_name = text_to_speech(translation)
            generated_audio_paths.append(Path(translated_audio_file_name))

        return generated_audio_paths + list_translations

    except gr.Error:
        # Already a user-facing error (e.g. the transcription failure above);
        # re-raise as-is instead of double-wrapping it in "An error occurred: ...".
        raise
    except Exception as e:
        raise gr.Error(f"An error occurred: {str(e)}")
62
def transcribe_audio(audio_file):
    """Send the audio file to AssemblyAI and return the resulting Transcript."""
    return aai.Transcriber().transcribe(audio_file)
68
def translate_text(text: str) -> list:
    """Translate English *text* into six target languages.

    Returns a list of translated strings in the order ru, tr, sv, de, es, ja.
    A failed translation is replaced by a placeholder message so the list
    always contains exactly six entries.
    """
    target_languages = ("ru", "tr", "sv", "de", "es", "ja")
    results = []
    for code in target_languages:
        try:
            translator = Translator(from_lang="en", to_lang=code)
            results.append(translator.translate(text))
        except Exception as err:
            print(f"Translation to {code} failed: {str(err)}")
            results.append(f"Translation to {code} failed.")
    return results
82
def text_to_speech(text: str) -> str:
    """Synthesize *text* with ElevenLabs and save it to a uniquely named MP3.

    Args:
        text: The (translated) text to speak.

    Returns:
        str: Path of the saved MP3 file (written to the current working
        directory; one new file per call, named with a random UUID).
    """
    # Stream the synthesized audio from the ElevenLabs API.
    response = client.text_to_speech.convert(
        voice_id="Xb7hH8MSUJpSbSDYk0k2",  # Choose a voice on the ElevenLabs dashboard and copy its id
        optimize_streaming_latency="0",
        output_format="mp3_22050_32",
        text=text,
        # eleven_multilingual_v2 is required here: the six target languages
        # are non-English. (The previous comment wrongly called it "turbo".)
        model_id="eleven_multilingual_v2",
        voice_settings=VoiceSettings(
            stability=0.5,
            similarity_boost=0.75,
            style=0.0,
            use_speaker_boost=True,
        ),
    )

    # Random name avoids collisions when several translations are generated.
    save_file_path = f"{uuid.uuid4()}.mp3"

    # Write the streamed chunks to disk; skip any empty keep-alive chunks.
    with open(save_file_path, "wb") as f:
        for chunk in response:
            if chunk:
                f.write(chunk)

    print(f"{save_file_path}: A new audio file was saved successfully!")

    # Return the path of the saved audio file
    return save_file_path
111
# Gradio UI
# NOTE(review): component creation order inside each Row/Group defines the
# on-screen layout — do not reorder without checking the rendered page.
with gr.Blocks() as demo:
    gr.Markdown("## Record yourself in English and immediately receive voice translations.")

    with gr.Row():
        with gr.Column():
            # Microphone recorder; type="filepath" hands voice_to_voice a path on disk.
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                show_download_button=True,
                waveform_options=gr.WaveformOptions(
                    waveform_color="#01C6FF",
                    waveform_progress_color="#0066B4",
                    skip_length=2,
                    show_controls=False,
                ),
            )
            with gr.Row():
                submit = gr.Button("Submit", variant="primary")
                btn = gr.ClearButton(audio_input, "Clear")

    # First row of results: Russian, Turkish, Swedish.
    with gr.Row():
        with gr.Group() as russian:
            gr.Markdown("### Russian 🇷🇺")
            gr.Image(ru_img)
            ru_output = gr.Audio(label="Russian", interactive=False)
            ru_text = gr.Markdown()

        with gr.Group() as turkish:
            gr.Markdown("### Turkish 🇹🇷")
            gr.Image(tr_img)
            tr_output = gr.Audio(label="Turkish", interactive=False)
            tr_text = gr.Markdown()

        with gr.Group() as swedish:
            gr.Markdown("### Swedish 🇸🇪")
            gr.Image(sv_img)
            sv_output = gr.Audio(label="Swedish", interactive=False)
            sv_text = gr.Markdown()

    # Second row of results: German, Spanish, Japanese.
    with gr.Row():
        with gr.Group() as german:
            gr.Markdown("### German 🇩🇪")
            gr.Image(de_img)
            de_output = gr.Audio(label="German", interactive=False)
            de_text = gr.Markdown()

        with gr.Group() as spanish:
            gr.Markdown("### Spanish 🇪🇸")
            gr.Image(es_img)
            es_output = gr.Audio(label="Spanish", interactive=False)
            es_text = gr.Markdown()

        with gr.Group() as japanese:
            gr.Markdown("### Japanese 🇯🇵")
            gr.Image(ja_img)
            jp_output = gr.Audio(label="Japanese", interactive=False)
            jp_text = gr.Markdown()

    # Order is load-bearing: it must mirror voice_to_voice's return value —
    # six audio paths first, then the six translated texts, in the same
    # language order (ru, tr, sv, de, es, ja).
    output_components = [ru_output, tr_output, sv_output, de_output, es_output, jp_output, ru_text, tr_text, sv_text, de_text, es_text, jp_text]
    submit.click(fn=voice_to_voice, inputs=audio_input, outputs=output_components, show_progress=True)

if __name__ == "__main__":
    demo.launch()