Spaces:

bunnyyss09
/

Voice-to-Voice-Translator

Sleeping

App Files Files Community

bunnyyss09 committed on Jan 30, 2025

Commit

6a06834

0 Parent(s):

Initial commit

Browse files

Files changed (13) hide show

.gitattributes +2 -0
.gitignore +19 -0
README.md +58 -0
requirements.txt +0 -0
sample_img1.png +0 -0
sample_img2.png +0 -0
src/images/Germany.png +0 -0
src/images/Japanese.png +0 -0
src/images/Russia.png +0 -0
src/images/Spain.png +0 -0
src/images/Sweden.png +0 -0
src/images/Turkey.png +0 -0
src/voice_translator.py +174 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Auto detect text files and perform LF normalization
2	+ * text=auto

.gitignore ADDED Viewed

	@@ -0,0 +1,19 @@

+# Ignore environment files (to protect API keys and credentials)
+.env
+# Ignore OS-generated files
+.DS_Store
+Thumbs.db
+# Ignore Python-related files
+__pycache__/
+*.pyc
+*.pyo
+# Ignore virtual environments (if you're using one)
+venv/
+env/
+# Ignore logs and temporary files
+*.log
+*.tmp

README.md ADDED Viewed

	@@ -0,0 +1,58 @@

+# Voice-to-Voice Translator
+## Overview
+This project is a voice-to-voice translator that allows users to speak in English and receive real-time translations in multiple languages, along with audio playback. The application leverages advanced AI-based speech recognition, translation, and text-to-speech technologies.
+## Technologies Used
+- **Gradio** – Provides an easy-to-use web interface.
+- **AssemblyAI** – Converts speech to text.
+- **Python Translate Module** – Translates text into multiple languages.
+- **ElevenLabs** – Converts translated text into spoken audio using AI voices.
+## Installation
+1. Clone this repository:
+   ```sh
+   git clone https://github.com/yourusername/voice-to-voice-translator.git
+   cd voice-to-voice-translator
+   ```
+2. Install dependencies:
+   ```sh
+   pip install -r requirements.txt
+   ```
+3. Set up your API keys by creating a `.env` file in the root directory:
+   ```ini
+   ASSEMBLYAI_API_KEY=your_assemblyai_api_key
+   ELEVENLABS_API_KEY=your_elevenlabs_api_key
+   ```
+4. Run the application:
+   ```sh
+   python src/voice_translator.py
+   ```
+## API Keys Required
+You need API keys for the following services:
+- [AssemblyAI API Key](https://www.assemblyai.com/?utm_source=youtube&utm_medium=referral&utm_campaign=yt_mis_66)
+- [ElevenLabs API Key](https://elevenlabs.io/)
+## Features
+- **Speech Recognition**: Converts spoken words into text using AssemblyAI.
+- **Translation**: Uses Python’s `translate` module to support multiple languages.
+- **Text-to-Speech**: ElevenLabs API generates AI-powered speech from translated text.
+- **Real-time Streaming**: Supports streaming audio generation for faster output.
+- **User-Friendly Interface**: Built using Gradio for a simple and interactive UI.
+## Sample Images
+![Sample screenshot 1](sample_img1.png)
+![Sample screenshot 2](sample_img2.png)
+## Usage
+- Click on the **Record** button to capture your speech.
+- The system will transcribe, translate, and generate speech output in multiple languages.
+- Download or listen to the translated audio directly from the interface.

requirements.txt ADDED Viewed

Binary file (130 Bytes). View file

sample_img1.png ADDED Viewed

sample_img2.png ADDED Viewed

src/images/Germany.png ADDED Viewed

src/images/Japanese.png ADDED Viewed

src/images/Russia.png ADDED Viewed

src/images/Spain.png ADDED Viewed

src/images/Sweden.png ADDED Viewed

src/images/Turkey.png ADDED Viewed

src/voice_translator.py ADDED Viewed

	@@ -0,0 +1,174 @@

+import os
+import uuid
+import gradio as gr
+import assemblyai as aai
+from translate import Translator
+from elevenlabs import VoiceSettings
+from elevenlabs.client import ElevenLabs
+from pathlib import Path
+from dotenv import load_dotenv
+from PIL import Image
+# Load environment variables
+load_dotenv()
+assemblyai_api_key = os.getenv("ASSEMBLYAI_API_KEY")
+elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
+# Initialize AssemblyAI and ElevenLabs clients
+aai.settings.api_key = assemblyai_api_key
+client = ElevenLabs(api_key=elevenlabs_api_key)
+# Use relative paths
+script_dir = Path(__file__).parent  # Gets the directory where the script is running
+images_dir = script_dir / "images"  # Points to the 'images' folder in your project
+ru_img = Image.open(images_dir / "Russia.png").resize((80, 40))
+tr_img = Image.open(images_dir / "Turkey.png").resize((80, 40))
+sv_img = Image.open(images_dir / "Sweden.png").resize((80, 40))
+de_img = Image.open(images_dir / "Germany.png").resize((80, 40))
+es_img = Image.open(images_dir / "Spain.png").resize((80, 40))
+ja_img = Image.open(images_dir / "Japanese.png").resize((80, 40))
+def voice_to_voice(audio_file, progress=gr.Progress()):
+    try:
+        # Transcribe speech
+        progress(0.1, desc="Transcribing audio...")
+        transcript = transcribe_audio(audio_file)
+        if transcript.status == aai.TranscriptStatus.error:
+            raise gr.Error(f"Transcription failed: {transcript.error}")
+        else:
+            transcript_text = transcript.text
+        # Translate text
+        progress(0.4, desc="Translating text...")
+        list_translations = translate_text(transcript_text)
+        # Generate speech from text
+        generated_audio_paths = []
+        for i, translation in enumerate(list_translations):
+            progress(0.5 + (i * 0.1), desc=f"Generating audio for {['Russian', 'Turkish', 'Swedish', 'German', 'Spanish', 'Japanese'][i]}...")
+            translated_audio_file_name = text_to_speech(translation)
+            path = Path(translated_audio_file_name)
+            generated_audio_paths.append(path)
+        return generated_audio_paths + list_translations
+    except Exception as e:
+        raise gr.Error(f"An error occurred: {str(e)}")
+# Function to transcribe audio using AssemblyAI
+def transcribe_audio(audio_file):
+    transcriber = aai.Transcriber()
+    transcript = transcriber.transcribe(audio_file)
+    return transcript
+# Function to translate text
+def translate_text(text: str) -> list:
+    languages = ["ru", "tr", "sv", "de", "es", "ja"]
+    list_translations = []
+    for lan in languages:
+        try:
+            translator = Translator(from_lang="en", to_lang=lan)
+            translation = translator.translate(text)
+            list_translations.append(translation)
+        except Exception as e:
+            print(f"Translation to {lan} failed: {str(e)}")
+            list_translations.append(f"Translation to {lan} failed.")
+    return list_translations
+# Function to generate speech
+def text_to_speech(text: str) -> str:
+    response = client.text_to_speech.convert(
+        voice_id="Xb7hH8MSUJpSbSDYk0k2",  # Choose a voice on ElevenLabs dashboard and copy the id
+        optimize_streaming_latency="0",
+        output_format="mp3_22050_32",
+        text=text,
+        model_id="eleven_multilingual_v2",  # Use the turbo model for low latency, for other languages use the `eleven_multilingual_v2`
+        voice_settings=VoiceSettings(
+            stability=0.5,
+            similarity_boost=0.75,
+            style=0.0,
+            use_speaker_boost=True,
+        ),
+    )
+    save_file_path = f"{uuid.uuid4()}.mp3"
+    # Writing the audio to a file
+    with open(save_file_path, "wb") as f:
+        for chunk in response:
+            if chunk:
+                f.write(chunk)
+    print(f"{save_file_path}: A new audio file was saved successfully!")
+    # Return the path of the saved audio file
+    return save_file_path
+# Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## Record yourself in English and immediately receive voice translations.")
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                sources=["microphone"],
+                type="filepath",
+                show_download_button=True,
+                waveform_options=gr.WaveformOptions(
+                    waveform_color="#01C6FF",
+                    waveform_progress_color="#0066B4",
+                    skip_length=2,
+                    show_controls=False,
+                ),
+            )
+            with gr.Row():
+                submit = gr.Button("Submit", variant="primary")
+                btn = gr.ClearButton(audio_input, "Clear")
+    with gr.Row():
+        with gr.Group() as russian:
+            gr.Markdown("### Russian 🇷🇺")
+            gr.Image(ru_img)
+            ru_output = gr.Audio(label="Russian", interactive=False)
+            ru_text = gr.Markdown()
+        with gr.Group() as turkish:
+            gr.Markdown("### Turkish 🇹🇷")
+            gr.Image(tr_img)
+            tr_output = gr.Audio(label="Turkish", interactive=False)
+            tr_text = gr.Markdown()
+        with gr.Group() as swedish:
+            gr.Markdown("### Swedish 🇸🇪")
+            gr.Image(sv_img)
+            sv_output = gr.Audio(label="Swedish", interactive=False)
+            sv_text = gr.Markdown()
+    with gr.Row():
+        with gr.Group() as german:
+            gr.Markdown("### German 🇩🇪")
+            gr.Image(de_img)
+            de_output = gr.Audio(label="German", interactive=False)
+            de_text = gr.Markdown()
+        with gr.Group() as spanish:
+            gr.Markdown("### Spanish 🇪🇸")
+            gr.Image(es_img)
+            es_output = gr.Audio(label="Spanish", interactive=False)
+            es_text = gr.Markdown()
+        with gr.Group() as japanese:
+            gr.Markdown("### Japanese 🇯🇵")
+            gr.Image(ja_img)
+            jp_output = gr.Audio(label="Japanese", interactive=False)
+            jp_text = gr.Markdown()
+    output_components = [ru_output, tr_output, sv_output, de_output, es_output, jp_output, ru_text, tr_text, sv_text, de_text, es_text, jp_text]
+    submit.click(fn=voice_to_voice, inputs=audio_input, outputs=output_components, show_progress=True)
+if __name__ == "__main__":
+    demo.launch()