Spaces:
Runtime error
Upload folder using huggingface_hub
- .gitattributes +1 -0
- .github/workflows/update_space.yml +28 -0
- .gitignore +3 -0
- README.md +134 -8
- audio_to_video.py +77 -0
- demo.py +166 -0
- demonstration/demo.mp4 +3 -0
- gradio_app.py +661 -0
- media_ingestion.py +135 -0
- requirements.txt +21 -0
- speech_diarization.py +223 -0
- speech_recognition.py +19 -0
- text_to_speech.py +619 -0
- translate.py +391 -0
.gitattributes
CHANGED
```diff
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+demonstration/demo.mp4 filter=lfs diff=lfs merge=lfs -text
```
.github/workflows/update_space.yml
ADDED
@@ -0,0 +1,28 @@
```yaml
name: Run Python script

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.9'

      - name: Install Gradio
        run: python -m pip install gradio

      - name: Log in to Hugging Face
        run: python -c 'import huggingface_hub; huggingface_hub.login(token="${{ secrets.hf_token }}")'

      - name: Deploy to Spaces
        run: gradio deploy
```
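The login step above passes the `hf_token` GitHub secret to `huggingface_hub.login`. A minimal sketch of the equivalent authentication on a local machine before running `gradio deploy`, assuming the token is exported as an `HF_TOKEN` environment variable (that variable name is an assumption, not part of this workflow):

```python
# Local stand-in for the workflow's "Log in to Hugging Face" step.
# HF_TOKEN is an assumed environment variable; the CI job reads the
# token from the `hf_token` GitHub secret instead.
import os
import huggingface_hub

huggingface_hub.login(token=os.environ["HF_TOKEN"])
```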
.gitignore
ADDED
@@ -0,0 +1,3 @@
```
.env
__pycache__
.gradio
```
README.md
CHANGED
```diff
@@ -1,14 +1,140 @@
 ---
 title: SyncDub
-
-colorFrom: green
-colorTo: blue
+app_file: gradio_app.py
 sdk: gradio
 sdk_version: 5.29.0
-app_file: app.py
-pinned: false
-license: mit
-short_description: Synchronised dubbing with voice cloning
 ---
```

The remainder of the new README (lines 7-140) is entirely added content:

# SyncDub

SyncDub is a Python application designed to automatically translate and dub videos into various languages. It leverages speech recognition, speaker diarization, machine translation, and text-to-speech (TTS) technologies to create dubbed versions of input videos while attempting to preserve background audio and optionally clone speaker voices.

## Features

* **Video Input:** Accepts YouTube URLs or local video file uploads.
* **Audio Extraction & Separation:** Extracts audio from video and separates speech from background noise using Demucs.
* **Speech Recognition:** Transcribes the spoken content using Whisper.
* **Speaker Diarization:** Identifies different speakers in the audio using `pyannote.audio`.
* **Machine Translation:** Translates the transcribed text into multiple target languages using `deep-translator` (with options for batch, iterative, or Groq API methods).
* **Text-to-Speech (TTS):**
    * **Simple Dubbing:** Generates dubbed audio using Microsoft Edge TTS, allowing gender selection per speaker.
    * **Voice Cloning:** Uses Coqui XTTS to clone the original speakers' voices for a more natural dub (requires reference audio extraction).
* **Audio Mixing:** Combines the generated dubbed speech with the original background audio.
* **Video Reassembly:** Creates the final dubbed video file.
* **Subtitle Generation:** Outputs translated subtitles in `.srt` format.
* **Web Interface:** Provides an easy-to-use Gradio interface for processing videos.
* **Command-Line Interface:** Includes a `demo.py` script for terminal-based usage.

## Requirements

* Python 3.8+
* FFmpeg (must be installed and accessible in your system's PATH)
* Key Python packages (see `requirements.txt`):
    * `yt-dlp`
    * `moviepy`
    * `pyannote.audio`
    * `transformers`
    * `torch` & `torchaudio` (often dependencies of the above)
    * `deep-translator`
    * `TTS` (Coqui TTS for voice cloning)
    * `edge-tts`
    * `gradio`
    * `python-dotenv`
    * `demucs`
* **API Keys:**
    * `HUGGINGFACE_TOKEN`: Required for `pyannote.audio` speaker diarization models. Obtain from [Hugging Face](https://huggingface.co/settings/tokens).
    * `GROQ_API_KEY` (Optional): Required if using the "groq" translation method. Obtain from [GroqCloud](https://console.groq.com/keys).

## Setup

1. **Clone the repository:**
    ```bash
    git clone https://github.com/PranavInani/SyncDub
    cd SyncDub
    ```
2. **Create and activate a virtual environment (recommended):**
    ```bash
    python -m venv venv
    source venv/bin/activate  # On Windows use `venv\Scripts\activate`
    ```
3. **Install FFmpeg:** Follow the instructions for your operating system ([https://ffmpeg.org/download.html](https://ffmpeg.org/download.html)). Ensure it's added to your system's PATH.
4. **Install Python dependencies:**
    ```bash
    pip install -r requirements.txt
    # Depending on your system, you might need to install PyTorch separately
    # See https://pytorch.org/get-started/locally/
    ```
5. **Create a `.env` file:** In the root directory of the project, create a file named `.env` and add your Hugging Face token:
    ```dotenv
    # filepath: .env
    HUGGINGFACE_TOKEN=hf_YOUR_HUGGINGFACE_TOKEN_HERE
    # Add GROQ_API_KEY if you plan to use the Groq translation method
    # GROQ_API_KEY=gsk_YOUR_GROQ_API_KEY_HERE
    ```
    Replace `hf_YOUR_HUGGINGFACE_TOKEN_HERE` with your actual token.

## Usage

### Gradio Web Interface

Launch the web application:

```bash
python gradio_app.py
```

This will start a local web server. Open the provided URL (usually `http://127.0.0.1:7860`) in your browser.

1. **Input:** Enter a video URL (e.g., YouTube) or upload a local video file.
2. **Target Language:** Select the desired output language.
3. **TTS Method:** Choose between "Simple dubbing (Edge TTS)" or "Voice cloning (XTTS)".
4. **Translation Method:** Select "batch", "iterative", or "groq".
5. **Maximum Speakers:** Optionally specify the number of speakers to detect. Click "Update Speaker Options" to configure genders if needed (especially for Edge TTS).
6. **Process:** Click "Process Video".
7. **Output:** Download links for the dubbed video and subtitle file will appear upon completion. Use the "Reset Everything" button to clear temporary files before processing a new video.

### Command-Line Demo

Run the demo script:

```bash
python demo.py
```

The script will prompt you for:

1. Video URL or local file path.
2. Target language code (e.g., `en`, `es`, `hi`).
3. TTS engine choice (1 for Edge TTS, 2 for XTTS).
4. Maximum number of speakers (optional).
5. Speaker genders (if using Edge TTS or as a fallback for XTTS).

The processed files will be saved in the `temp` directory, with the final video typically named `output_video.mp4`.

## Configuration

* **API Keys:** Configure `HUGGINGFACE_TOKEN` and optionally `GROQ_API_KEY` in the `.env` file.
* **Models:** Model sizes and specific checkpoints can be adjusted within the Python scripts (`speech_recognition.py`, `speech_diarization.py`, etc.) if needed.

## Directory Structure

* `temp/`: Stores intermediate files such as the downloaded video, extracted audio, separated sources, and the final output video.
* `audio/`: Often used for initial audio extraction outputs.
* `audio2/`: Stores the generated TTS audio segments and the final mixed dubbed audio.
* `reference_audio/`: Stores extracted audio snippets for each speaker when using XTTS voice cloning.
* `outputs/`: Stores the final video and subtitle files made available for download in the Gradio interface.

## Troubleshooting

* **Errors during processing:**
    * Ensure `ffmpeg` is installed correctly and accessible in your PATH.
    * Verify that the `HUGGINGFACE_TOKEN` in your `.env` file is correct and valid.
    * Check that you have sufficient disk space and memory, especially for large videos or voice cloning.
    * Ensure all dependencies from `requirements.txt` are installed correctly in your virtual environment.
* **Voice cloning issues:** XTTS quality depends heavily on the quality and duration of the extracted reference audio for each speaker. If diarization is poor or speakers have very little unique speech, cloning may fail or produce poor results. Consider using "Simple dubbing" as an alternative.
* **Slow processing:** Video processing, especially diarization, translation, and TTS (XTTS), can be computationally intensive and time-consuming.
* **Reset:** Use the "Reset Everything" button in the Gradio app to clear temporary directories if you encounter persistent issues or before starting a new video.

## Contributing

Everyone is encouraged to contribute.
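As a quick check that the `.env` file from Setup step 5 is actually picked up, a minimal sketch using `python-dotenv`, the same mechanism `demo.py` and `gradio_app.py` in this commit use:

```python
# Verifies the token configured in Setup step 5 is visible to Python.
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory
token = os.getenv("HUGGINGFACE_TOKEN")
print("HUGGINGFACE_TOKEN loaded" if token else "HUGGINGFACE_TOKEN missing")
```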
audio_to_video.py
ADDED
@@ -0,0 +1,77 @@
```python
import os
import subprocess
from IPython.display import FileLink, display

def create_video_with_mixed_audio(
    main_video_path,
    background_music_path,
    main_audio_path,
    temp_dir="temp",  # Directory for temporary files
    bg_volume=0.3,
    main_audio_volume=1.0
):
    """
    Create a video with mixed audio (main audio + background music)

    Parameters:
        main_video_path (str): Path to the main video file
        background_music_path (str): Path to the background music file
        main_audio_path (str): Path to the main audio (dubbed speech)
        temp_dir (str): Directory for temporary files
        bg_volume (float): Volume level for background music (0.0-1.0)
        main_audio_volume (float): Volume level for main audio (0.0-1.0)

    Returns:
        bool: True if successful, False otherwise
    """

    try:
        # Ensure the temporary directory exists
        os.makedirs(temp_dir, exist_ok=True)

        # Define paths for temporary and output files
        temp_audio_path = os.path.join(temp_dir, "mixed_audio.wav")
        output_video_path = os.path.join(temp_dir, "output_video.mp4")

        # Step 1: Mix the background audio and main audio with volume control
        print("Step 1: Mixing audio tracks...")
        mix_command = f'''ffmpeg -i "{background_music_path}" -i "{main_audio_path}" -filter_complex \
            "[0:a]volume={bg_volume}[bg]; \
             [1:a]volume={main_audio_volume}[main]; \
             [bg][main]amix=inputs=2:duration=first:dropout_transition=2" \
            "{temp_audio_path}" -y'''

        subprocess.run(mix_command, shell=True, check=True)

        # Step 2: Replace the original audio in the video with the mixed audio
        print("Step 2: Creating final video with mixed audio...")
        video_command = f'''ffmpeg -i "{main_video_path}" -i "{temp_audio_path}" \
            -c:v copy -map 0:v:0 -map 1:a:0 -shortest -c:a aac \
            "{output_video_path}" -y'''

        subprocess.run(video_command, shell=True, check=True)

        # Check that the output file exists and has a reasonable size
        if os.path.exists(output_video_path) and os.path.getsize(output_video_path) > 1000:
            print(f"✅ Success! Video created at: {output_video_path}")
            print(f"File size: {os.path.getsize(output_video_path) / (1024*1024):.2f} MB")

            # Display a download link in Jupyter/Colab
            try:
                display(FileLink(output_video_path))
            except:
                pass

            return True
        else:
            print("❌ Something went wrong. Output file is missing or too small.")
            return False

    except Exception as e:
        print(f"❌ Error during processing: {str(e)}")
        return False
    finally:
        # Optional: Clean up temporary files
        # if os.path.exists(temp_audio_path):
        #     os.remove(temp_audio_path)
        pass
```
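A minimal sketch of calling this mixer directly. The paths are hypothetical and assume the rest of the pipeline has already produced a separated background track and a dubbed speech track:

```python
# Hypothetical paths for illustration; in the pipeline these come from
# MediaIngester.separate_audio_sources() and generate_tts().
from audio_to_video import create_video_with_mixed_audio

ok = create_video_with_mixed_audio(
    main_video_path="temp/downloaded_video.mp4",
    background_music_path="temp/separated/music.wav",
    main_audio_path="audio2/dubbed_conversation.wav",
    bg_volume=0.3,           # keep the background quieter than the speech
    main_audio_volume=1.0,
)
print("wrote temp/output_video.mp4" if ok else "mixing failed")
```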
demo.py
ADDED
@@ -0,0 +1,166 @@
```python
import os
import sys
import logging
from dotenv import load_dotenv
import re

# Add the current directory to the path to help with imports
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
    sys.path.append(current_dir)

# Import the required modules
from media_ingestion import MediaIngester
from speech_recognition import SpeechRecognizer
from speech_diarization import SpeakerDiarizer
from translate import translate_text, generate_srt_subtitles
from text_to_speech import generate_tts  # TTS generator (Edge TTS / XTTS)
from audio_to_video import create_video_with_mixed_audio

def create_directories(dirs):
    """Create necessary directories"""
    for directory in dirs:
        os.makedirs(directory, exist_ok=True)

def main():
    # Load environment variables
    load_dotenv()

    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)

    # Create necessary directories
    create_directories(["temp", "audio", "audio2", "reference_audio"])

    # Get API tokens
    hf_token = os.getenv("HUGGINGFACE_TOKEN")

    if not hf_token:
        logger.error("Error: HUGGINGFACE_TOKEN not found in .env file")
        return

    # Get input from the user
    media_source = input("Enter video URL or local file path: ")
    target_language = input("Enter target language code (e.g., en, es, fr, de): ")

    # Choose TTS engine
    print("\nSelect TTS engine:")
    print("1. Simple dubbing (Edge TTS)")
    print("2. Voice cloning (XTTS)")
    tts_choice = input("Enter choice (1/2): ").strip()
    use_voice_cloning = tts_choice == "2"

    # Initialize components
    logger.info("Initializing pipeline components...")
    ingester = MediaIngester(output_dir="temp")
    recognizer = SpeechRecognizer(model_size="base")
    diarizer = SpeakerDiarizer(hf_token=hf_token)

    # Step 1: Process input and extract audio
    logger.info("Processing media source...")
    video_path = ingester.process_input(media_source)
    audio_path = ingester.extract_audio(video_path)
    clean_audio_path, bg_audio_path = ingester.separate_audio_sources(audio_path)
    logger.info("Extracted audio: %s", audio_path)
    logger.info("Cleaned audio: %s", clean_audio_path)
    logger.info("Background audio: %s", bg_audio_path)
    logger.info("Audio processing completed.")

    # Step 2: Perform speech recognition
    logger.info("Transcribing audio...")
    segments = recognizer.transcribe(clean_audio_path)

    # Step 3: Perform speaker diarization
    logger.info("Identifying speakers...")

    # Ask the user for the maximum number of speakers
    max_speakers_str = input("Maximum number of speakers to detect (leave blank for auto): ")
    max_speakers = int(max_speakers_str) if max_speakers_str.strip() else None

    # Then call diarize with this parameter
    speakers = diarizer.diarize(clean_audio_path, max_speakers=max_speakers)

    # Step 4: Assign speakers to segments
    logger.info("Assigning speakers to segments...")
    final_segments = diarizer.assign_speakers_to_segments(segments, speakers)

    # Step 5: Translate the segments
    logger.info(f"Translating to {target_language}...")
    translated_segments = translate_text(
        final_segments,
        target_lang=target_language,
        translation_method="batch"  # Can be "batch", "iterative", or "groq"
    )

    # Generate the subtitle file
    subtitle_file = f"temp/{os.path.basename(video_path).split('.')[0]}_{target_language}.srt"
    generate_srt_subtitles(translated_segments, output_file=subtitle_file)
    logger.info(f"Generated subtitle file: {subtitle_file}")

    # Step 6: Configure voice characteristics for speakers
    voice_config = {}  # Map of speaker_id to gender or voice config

    # Detect the number of unique speakers
    unique_speakers = set()
    for segment in translated_segments:
        if 'speaker' in segment:
            unique_speakers.add(segment['speaker'])

    logger.info(f"Detected {len(unique_speakers)} speakers")

    if use_voice_cloning:
        # Extract reference audio for voice cloning
        logger.info("Extracting speaker reference audio for voice cloning...")
        reference_files = diarizer.extract_speaker_references(
            clean_audio_path,
            speakers,
            output_dir="reference_audio"
        )

        # Create the voice config for XTTS
        for speaker in sorted(list(unique_speakers)):
            match = re.search(r'SPEAKER_(\d+)', speaker)
            if match:
                speaker_id = int(match.group(1))
                if speaker in reference_files:
                    voice_config[speaker_id] = {
                        'engine': 'xtts',
                        'reference_audio': reference_files[speaker],
                        'language': target_language
                    }
                    logger.info(f"Using voice cloning for Speaker {speaker_id+1} with reference file: {os.path.basename(reference_files[speaker])}")
                else:
                    # Fall back to Edge TTS if no reference audio
                    logger.warning(f"No reference audio found for Speaker {speaker_id+1}, falling back to Edge TTS")
                    gender = input(f"Select voice gender for Speaker {speaker_id+1} (m/f): ").lower()
                    voice_config[speaker_id] = {
                        'engine': 'edge_tts',
                        'gender': "female" if gender.startswith("f") else "male"
                    }
    else:
        # Standard Edge TTS configuration
        if len(unique_speakers) > 0:
            for speaker in sorted(list(unique_speakers)):
                match = re.search(r'SPEAKER_(\d+)', speaker)
                if match:
                    speaker_id = int(match.group(1))
                    gender = input(f"Select voice gender for Speaker {speaker_id+1} (m/f): ").lower()
                    voice_config[speaker_id] = {
                        'engine': 'edge_tts',
                        'gender': "female" if gender.startswith("f") else "male"
                    }

    # Step 7: Generate speech in the target language
    logger.info("Generating speech...")

    dubbed_audio_path = generate_tts(translated_segments, target_language, voice_config, output_dir="audio2")

    # Step 8: Create the video with mixed audio
    logger.info("Creating video with translated audio...")
    create_video_with_mixed_audio(video_path, bg_audio_path, dubbed_audio_path)

    logger.info("Process completed successfully!")

if __name__ == "__main__":
    main()
```
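For reference, the `voice_config` mapping this script hands to `generate_tts` has one entry per diarized speaker. A sketch of its shape, with illustrative values (the reference filename is hypothetical):

```python
# Shape of the voice_config dict built above (values are illustrative).
voice_config = {
    0: {  # SPEAKER_00: cloned with XTTS from its extracted reference audio
        'engine': 'xtts',
        'reference_audio': 'reference_audio/SPEAKER_00.wav',  # hypothetical filename
        'language': 'hi',
    },
    1: {  # SPEAKER_01: simple dubbing via Edge TTS
        'engine': 'edge_tts',
        'gender': 'female',
    },
}
```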
demonstration/demo.mp4
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:5cbbcaef260aaf1256756f6ffa67d7a6fc8b25a71daeba4a22052d4773f8e767
size 13703532
```
gradio_app.py
ADDED
@@ -0,0 +1,661 @@
```python
import os
import sys
import logging
import tempfile
import re
import gradio as gr
from dotenv import load_dotenv
import threading
import shutil

# Add the current directory to the path to help with imports
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
    sys.path.append(current_dir)

# Import the required modules
from media_ingestion import MediaIngester
from speech_recognition import SpeechRecognizer
from speech_diarization import SpeakerDiarizer
from translate import translate_text, generate_srt_subtitles
from text_to_speech import generate_tts
from audio_to_video import create_video_with_mixed_audio

# Load environment variables
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Create necessary directories
os.makedirs("temp", exist_ok=True)
os.makedirs("audio", exist_ok=True)
os.makedirs("audio2", exist_ok=True)
os.makedirs("reference_audio", exist_ok=True)
os.makedirs("outputs", exist_ok=True)  # Directory for downloadable outputs

# Global variables for process tracking
processing_status = {}

def create_session_id():
    """Create a unique session ID for tracking progress"""
    import uuid
    return str(uuid.uuid4())[:8]

def reset_application():
    """
    Reset the application state by thoroughly cleaning all temporary files and directories
    """
    import os
    import shutil
    import time

    # Directories to completely clean
    directories_to_clean = ["temp", "audio", "audio2", "reference_audio", "outputs"]

    try:
        # First attempt - delete individual files
        for directory in directories_to_clean:
            if os.path.exists(directory):
                logger.info(f"Cleaning directory: {directory}")
                for filename in os.listdir(directory):
                    file_path = os.path.join(directory, filename)
                    try:
                        if os.path.isfile(file_path):
                            os.unlink(file_path)
                            logger.info(f"Deleted file: {file_path}")
                        elif os.path.isdir(file_path):
                            shutil.rmtree(file_path)
                            logger.info(f"Deleted subdirectory: {file_path}")
                    except Exception as e:
                        logger.warning(f"Failed to delete {file_path}: {e}")

        # Double-check whether any files remain (for stubborn files)
        for directory in directories_to_clean:
            if os.path.exists(directory) and any(os.scandir(directory)):
                # Try a more aggressive approach - recreate the directory
                try:
                    # Rename the old directory
                    temp_dir = f"{directory}_old_{int(time.time())}"
                    os.rename(directory, temp_dir)
                    # Create a new empty directory
                    os.makedirs(directory, exist_ok=True)
                    # Try to remove the old directory in the background
                    try:
                        shutil.rmtree(temp_dir)
                    except:
                        pass  # Ignore if we can't delete it immediately
                except Exception as e:
                    logger.warning(f"Failed to recreate directory {directory}: {e}")

        # Clear out individual files that might be in the root directory
        root_files_to_check = [
            "dubbed_conversation.wav",
            "downloaded_video.mp4",
            "downloaded_video_hi.srt",
            "extracted_audio.wav",
            "mixed_audio.wav",
            "output_video.mp4"
        ]

        for filename in root_files_to_check:
            if os.path.exists(filename):
                try:
                    os.unlink(filename)
                    logger.info(f"Deleted root file: {filename}")
                except Exception as e:
                    logger.warning(f"Failed to delete root file {filename}: {e}")

        # Reset the global processing status
        global processing_status
        processing_status = {}

        # Generate a new session ID
        new_session_id = create_session_id()

        # For visual confirmation to the user, return the timestamp of the reset
        timestamp = time.strftime("%H:%M:%S", time.localtime())
        success_message = f"⚠️ Application reset complete at {timestamp}. All temporary files cleared. Ready for new processing."

        return {
            "new_status": success_message,
            "session_id": new_session_id,
            "media_input": None,  # Clear the media input field
            "output": None,       # Clear the output file
            "subtitle": None,     # Clear the subtitle file
            "message": ""         # Clear the output message
        }

    except Exception as e:
        logger.exception("Error during application reset")
        timestamp = time.strftime("%H:%M:%S", time.localtime())
        return {
            "new_status": f"⚠️ Reset attempted at {timestamp} but encountered errors: {str(e)}",
            "session_id": None,
            "media_input": None,
            "output": None,
            "subtitle": None,
            "message": ""
        }

def process_video(media_source, target_language, tts_choice, max_speakers, speaker_genders, session_id, translation_method="batch", progress=gr.Progress()):
    """Main processing function that handles the complete pipeline"""
    global processing_status
    processing_status[session_id] = {"status": "Starting", "progress": 0}

    try:
        # Get API tokens
        hf_token = os.getenv("HUGGINGFACE_TOKEN")

        if not hf_token:
            return {"error": True, "message": "Error: HUGGINGFACE_TOKEN not found in .env file"}

        # Determine whether the input is a URL or a file
        is_url = media_source.startswith(("http://", "https://"))

        # Initialize components
        progress(0.05, desc="Initializing components")
        processing_status[session_id] = {"status": "Initializing components", "progress": 0.05}

        ingester = MediaIngester(output_dir="temp")
        recognizer = SpeechRecognizer(model_size="base")
        diarizer = SpeakerDiarizer(hf_token=hf_token)

        # Step 1: Process input and extract audio
        progress(0.1, desc="Processing media source")
        processing_status[session_id] = {"status": "Processing media source", "progress": 0.1}

        video_path = ingester.process_input(media_source)
        audio_path = ingester.extract_audio(video_path)

        progress(0.15, desc="Separating audio sources")
        processing_status[session_id] = {"status": "Separating audio sources", "progress": 0.15}

        clean_audio_path, bg_audio_path = ingester.separate_audio_sources(audio_path)

        # Step 2: Perform speech recognition
        progress(0.2, desc="Transcribing audio")
        processing_status[session_id] = {"status": "Transcribing audio", "progress": 0.2}

        segments = recognizer.transcribe(clean_audio_path)

        # Step 3: Perform speaker diarization
        progress(0.3, desc="Identifying speakers")
        processing_status[session_id] = {"status": "Identifying speakers", "progress": 0.3}

        # Convert max_speakers to int or None
        max_speakers_val = int(max_speakers) if max_speakers and max_speakers.strip() else None

        # Diarize the audio to identify speakers
        speakers = diarizer.diarize(clean_audio_path, max_speakers=max_speakers_val)

        # Step 4: Assign speakers to segments
        progress(0.4, desc="Assigning speakers to segments")
        processing_status[session_id] = {"status": "Assigning speakers to segments", "progress": 0.4}

        final_segments = diarizer.assign_speakers_to_segments(segments, speakers)

        # Step 5: Translate the segments
        progress(0.5, desc=f"Translating to {target_language}")
        processing_status[session_id] = {"status": f"Translating to {target_language}", "progress": 0.5}

        # Validate the target language
        valid_languages = ["en", "es", "fr", "de", "it", "ja", "ko", "pt", "ru", "zh", "hi"]
        if target_language not in valid_languages:
            logger.warning(f"Unsupported language: {target_language}, falling back to English")
            target_language = "en"

        translated_segments = translate_text(
            final_segments,
            target_lang=target_language,
            translation_method=translation_method
        )

        # Generate the subtitle file
        subtitle_file = f"temp/{os.path.basename(video_path).split('.')[0]}_{target_language}.srt"
        generate_srt_subtitles(translated_segments, output_file=subtitle_file)

        # Step 6: Configure voice characteristics for speakers
        progress(0.6, desc="Configuring voices")
        processing_status[session_id] = {"status": "Configuring voices", "progress": 0.6}

        # Detect the number of unique speakers
        unique_speakers = set()
        for segment in translated_segments:
            if 'speaker' in segment:
                unique_speakers.add(segment['speaker'])

        logger.info(f"Detected {len(unique_speakers)} speakers")

        # Use the provided speaker genders
        use_voice_cloning = tts_choice == "Voice cloning (XTTS)"
        voice_config = {}  # Map of speaker_id to gender or voice config

        if use_voice_cloning:
            # Extract reference audio for voice cloning
            logger.info("Extracting speaker reference audio for voice cloning...")
            reference_files = diarizer.extract_speaker_references(
                clean_audio_path,
                speakers,
                output_dir="reference_audio"
            )

            # Create the voice config for XTTS
            for speaker in sorted(list(unique_speakers)):
                match = re.search(r'SPEAKER_(\d+)', speaker)
                if match:
                    speaker_id = int(match.group(1))
                    if speaker in reference_files:
                        voice_config[speaker_id] = {
                            'engine': 'xtts',
                            'reference_audio': reference_files[speaker],
                            'language': target_language  # Use the validated target language
                        }
                        logger.info(f"Using voice cloning for Speaker {speaker_id+1} with reference file: {os.path.basename(reference_files[speaker])}")
                    else:
                        # Fall back to Edge TTS if no reference audio
                        logger.warning(f"No reference audio found for Speaker {speaker_id+1}, falling back to Edge TTS")
                        gender = "female"  # Default fallback
                        if str(speaker_id) in speaker_genders and speaker_genders[str(speaker_id)]:
                            gender = speaker_genders[str(speaker_id)]

                        voice_config[speaker_id] = {
                            'engine': 'edge_tts',
                            'gender': gender
                        }
        else:
            # Standard Edge TTS configuration
            if len(unique_speakers) > 0:
                for speaker in sorted(list(unique_speakers)):
                    match = re.search(r'SPEAKER_(\d+)', speaker)
                    if match:
                        speaker_id = int(match.group(1))
                        gender = "female" if speaker_id % 2 == 0 else "male"  # Default fallback

                        # Use the selected gender if available
                        if str(speaker_id) in speaker_genders and speaker_genders[str(speaker_id)]:
                            gender = speaker_genders[str(speaker_id)]

                        voice_config[speaker_id] = gender

        # Step 7: Generate speech in the target language
        progress(0.7, desc=f"Generating speech in {target_language}")
        processing_status[session_id] = {"status": f"Generating speech in {target_language}", "progress": 0.7}

        dubbed_audio_path = generate_tts(translated_segments, target_language, voice_config, output_dir="audio2")

        # Step 8: Create the video with mixed audio
        progress(0.85, desc="Creating final video")
        processing_status[session_id] = {"status": "Creating final video", "progress": 0.85}

        success = create_video_with_mixed_audio(
            main_video_path=video_path,
            background_music_path=bg_audio_path,
            main_audio_path=dubbed_audio_path
        )

        if not success:
            raise RuntimeError("Failed to create final video with audio")

        # Use the known output path since the function returns a boolean
        output_video_path = os.path.join("temp", "output_video.mp4")

        # Verify the output video exists
        if not os.path.exists(output_video_path):
            raise FileNotFoundError(f"Output video not found at expected path: {output_video_path}")

        # Create downloadable copies with unique names
        file_basename = os.path.basename(video_path).split('.')[0]
        downloadable_video = f"outputs/{file_basename}_{target_language}_{session_id}.mp4"
        downloadable_subtitle = f"outputs/{file_basename}_{target_language}_{session_id}.srt"

        # Copy files to the outputs directory for download
        shutil.copy2(output_video_path, downloadable_video)
        shutil.copy2(subtitle_file, downloadable_subtitle)

        # Complete
        progress(1.0, desc="Process completed")
        processing_status[session_id] = {"status": "Completed", "progress": 1.0}

        return {
            "error": False,
            "video": downloadable_video,
            "subtitle": downloadable_subtitle,
            "message": "Process completed successfully! Click on the files to download."
        }

    except Exception as e:
        logger.exception("Error in processing pipeline")
        processing_status[session_id] = {"status": f"Error: {str(e)}", "progress": -1}
        return {"error": True, "message": f"Error: {str(e)}"}

def get_processing_status(session_id):
    """Get the current processing status for the given session"""
    global processing_status
    if session_id in processing_status:
        return processing_status[session_id]["status"]
    return "No status available"

def check_api_tokens():
    """Check whether the required API tokens are set"""
    missing_tokens = []

    if not os.getenv("HUGGINGFACE_TOKEN"):
        missing_tokens.append("HUGGINGFACE_TOKEN")

    if missing_tokens:
        return f"Warning: Missing API tokens: {', '.join(missing_tokens)}. Please set them in your .env file."
    else:
        return "All required API tokens are set."

def check_system_state():
    """
    Check the state of all directories and report what files still exist
    """
    import os

    # Directories to check
    directories_to_check = ["temp", "audio", "audio2", "reference_audio", "outputs"]
    report = []

    for directory in directories_to_check:
        if os.path.exists(directory):
            files = os.listdir(directory)
            if files:
                report.append(f"Directory '{directory}' contains {len(files)} files: {', '.join(files[:5])}")
                if len(files) > 5:
                    report.append(f"... and {len(files) - 5} more files")
            else:
                report.append(f"Directory '{directory}' is empty")
        else:
            report.append(f"Directory '{directory}' does not exist")

    # Check root files
    root_files = [f for f in os.listdir('.') if os.path.isfile(f)]
    if root_files:
        report.append(f"Root directory contains {len(root_files)} files")

    return "\n".join(report)

# Define the Gradio interface
def create_interface():
    with gr.Blocks(title="SyncDub - Video Translation and Dubbing") as app:
        gr.Markdown("# SyncDub - Video Translation and Dubbing")
        gr.Markdown("Translate and dub videos to different languages with speaker diarization")

        session_id = create_session_id()  # Create a session ID for tracking progress

        with gr.Tab("Process Video"):
            with gr.Row():
                with gr.Column(scale=2):
                    media_input = gr.Textbox(label="Video URL or File Upload", placeholder="Enter a YouTube URL or upload a video file")

                    with gr.Row():
                        # Enhanced language dropdown with full language names
                        target_language = gr.Dropdown(
                            choices=[
                                ("English", "en"),
                                ("Spanish", "es"),
                                ("French", "fr"),
                                ("German", "de"),
                                ("Hindi", "hi"),
                                ("Italian", "it"),
                                ("Japanese", "ja"),
                                ("Korean", "ko"),
                                ("Portuguese", "pt"),
                                ("Russian", "ru"),
                                ("Chinese", "zh")
                            ],
                            label="Target Language",
                            value="hi"
                        )
                        tts_choice = gr.Radio(
                            choices=["Simple dubbing (Edge TTS)", "Voice cloning (XTTS)"],
                            label="TTS Method",
                            value="Simple dubbing (Edge TTS)"
                        )

                    # Translation method selection
                    with gr.Row():
                        translation_method = gr.Radio(
                            choices=["batch", "iterative", "groq"],
                            label="Translation Method",
                            value="batch",
                            info="Batch: Faster for longer content. Iterative: May be more accurate for short content. Groq: Uses Groq LLM API."
                        )

                    # Speaker count input and update button
                    with gr.Row():
                        max_speakers = gr.Textbox(label="Maximum number of speakers", placeholder="Leave blank for auto")
                        update_speakers_btn = gr.Button("Update Speaker Options")

                    # Speaker gender container
                    with gr.Group(visible=False) as speaker_genders_container:
                        gr.Markdown("### Speaker Gender Selection")
                        speaker_genders = {}
                        for i in range(8):  # Support up to 8 speakers
                            speaker_genders[str(i)] = gr.Radio(
                                choices=["male", "female"],
                                value="male" if i % 2 == 1 else "female",
                                label=f"Speaker {i} Gender",
                                visible=False  # Initially hidden
                            )

                    process_btn = gr.Button("Process Video", variant="primary")
                    status_text = gr.Textbox(label="Status", value="Ready", interactive=False)

                with gr.Column(scale=3):
                    # File downloads instead of an inline video display
                    gr.Markdown("### Output Files")
                    output_message = gr.Textbox(label="Status", interactive=False)
                    with gr.Row():
                        output = gr.File(label="Download Video")
                        subtitle_output = gr.File(label="Download Subtitles")

            # Function to update the speaker gender options
            def update_speaker_options(max_speakers_value):
                updates = {}

                try:
                    num_speakers = int(max_speakers_value) if max_speakers_value.strip() else 0

                    if num_speakers > 0:
                        # Show the speaker gender container
                        updates[speaker_genders_container] = gr.Group(visible=True)

                        # Show only the relevant number of speaker options
                        for i in range(8):
                            updates[speaker_genders[str(i)]] = gr.Radio(
                                visible=(i < num_speakers)
                            )
                    else:
                        # Hide all if there is no valid number
                        updates[speaker_genders_container] = gr.Group(visible=False)
                except ValueError:
                    # Hide all if the number is invalid
                    updates[speaker_genders_container] = gr.Group(visible=False)

                return updates

            # Connect the update button to show/hide the speaker options
            update_speakers_btn.click(
                fn=update_speaker_options,
                inputs=[max_speakers],
                outputs=[speaker_genders_container] + [speaker_genders[str(i)] for i in range(8)]
            )

            # Function to pass the gender values through to process_video
            def process_with_genders(media_source, target_language, tts_choice, max_speakers, translation_method, *gender_values):
                # Convert the gender values into a dictionary to pass to process_video
                speaker_genders_dict = {str(i): gender for i, gender in enumerate(gender_values) if gender}
                result = process_video(media_source, target_language, tts_choice, max_speakers,
                                       speaker_genders_dict, session_id, translation_method=translation_method)

                # Return the output values based on whether there was an error
                if result.get("error", False):
                    return None, None, result.get("message", "An error occurred")
                else:
                    return result.get("video"), result.get("subtitle"), result.get("message")

            # Connect the process button
            process_btn.click(
                fn=process_with_genders,
                inputs=[
                    media_input,
                    target_language,
                    tts_choice,
                    max_speakers,
                    translation_method,
                    # Pass individual radio components, not a Group
                    *[speaker_genders[str(i)] for i in range(8)]
                ],
                outputs=[output, subtitle_output, output_message]
            )

            # Update the status periodically
            status_timer = gr.Timer(2, lambda: get_processing_status(session_id), None, status_text)

            # A more compatible approach for status updates
            def start_status_updates(session_id):
                def update_status_thread():
                    import time
                    while session_id in processing_status and processing_status[session_id]["progress"] < 1.0:
                        try:
                            time.sleep(1)  # Update the status every second
                            # This is a workaround since we can't use JavaScript directly
                        except:
                            break

                thread = threading.Thread(target=update_status_thread)
                thread.daemon = True  # Thread will exit when the main program exits
                thread.start()
                return "Processing started"

            # Status checking function
            def check_status(session_id):
                status = get_processing_status(session_id)
                return status

            # Reset handler
            def handle_reset():
                """Handle the reset button click by calling reset_application()"""
                try:
                    result = reset_application()
                    # Inform the user that the reset is in progress
                    gr.Info("Resetting application...")
                    # Return values in the order expected by the outputs list
                    return (
                        "Application reset successful. Ready for new video processing.",
                        None,  # media_input
                        None,  # output
                        None,  # subtitle_output
                        ""     # output_message
                    )
                except Exception as e:
                    logger.exception("Error in reset handler")
                    return (
                        f"Reset failed: {str(e)}",
                        None, None, None, ""
                    )

            # A single row containing both action buttons
            with gr.Row():
                reset_btn = gr.Button("🗑️ Reset Everything", variant="stop")
                refresh_btn = gr.Button("Refresh Status")

            # Connect the refresh button to the status check
            refresh_btn.click(
                fn=check_status,
                inputs=[gr.State(session_id)],
                outputs=[status_text]
            )

            # Wire the reset button to the handler defined above
            reset_btn.click(
                fn=handle_reset,
                inputs=[],
                outputs=[
                    status_text,
                    media_input,
                    output,
                    subtitle_output,
                    output_message
                ]
            )

            # Simple auto-refresh: a JS poller that clicks the refresh button
            gr.HTML("""
            <script>
            // Simple poller to update status
            document.addEventListener('DOMContentLoaded', function() {
                let refreshInterval;

                // Look for the primary button (Process Video)
                const processButton = document.querySelector('button.primary');

                if (processButton) {
                    // When processing starts, begin polling
                    processButton.addEventListener('click', function() {
                        if (refreshInterval) clearInterval(refreshInterval);

                        // Find the refresh button
                        const refreshButtons = Array.from(document.querySelectorAll('button'));
                        const refreshButton = refreshButtons.find(btn => btn.textContent.includes('Refresh Status'));

                        if (refreshButton) {
                            // Start auto-polling every 2 seconds
                            refreshInterval = setInterval(function() {
                                refreshButton.click();
                            }, 2000);

                            // Stop polling after 30 minutes (safety)
                            setTimeout(function() {
                                if (refreshInterval) clearInterval(refreshInterval);
                            }, 30*60*1000);
                        }
                    });
                }
            });
            </script>
            """)

            # Debug button
            with gr.Row():
                check_btn = gr.Button("Check System State", variant="secondary")
                check_output = gr.Textbox(label="System State")

            check_btn.click(fn=check_system_state, inputs=[], outputs=[check_output])

        with gr.Tab("Help"):
            gr.Markdown("""
            ## How to use SyncDub

            1. **Input**: Enter a YouTube URL or path to a local video file, or upload a video
            2. **Target Language**: Select the language you want to translate and dub into
            3. **TTS Engine**:
               - **Simple dubbing**: Uses Edge TTS (faster but less natural sounding)
               - **Voice cloning**: Uses XTTS to clone the original speakers' voices (slower but more natural)
            4. **Maximum Speakers**: Optionally specify the maximum number of speakers to detect
            5. **Translation Method**: Choose the translation method (Batch, Iterative, or Groq)
            6. **Process**: Click the Process Video button to start

            ## Requirements

            Make sure you have the following API tokens in your `.env` file:
            - `HUGGINGFACE_TOKEN`: Required for speech diarization

            ## Troubleshooting

            - If you encounter errors, check that all API tokens are set correctly
            - For large videos, the process may take several minutes
            - If voice cloning doesn't sound right, try simple dubbing instead
            """)

    return app

# Launch the interface
if __name__ == "__main__":
    app = create_interface()
    app.launch(share=True)
```
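`app.launch(share=True)` requests a public Gradio share link. A minimal sketch of a local-only launch, assuming the default Gradio host and port (the values below are illustrative, not part of this file):

```python
# Local-only alternative to the share link used above.
from gradio_app import create_interface

app = create_interface()
app.launch(server_name="127.0.0.1", server_port=7860, share=False)
```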
media_ingestion.py
ADDED
@@ -0,0 +1,135 @@
import yt_dlp
import os
from moviepy.editor import VideoFileClip
import subprocess
import shutil

class MediaIngester:
    def __init__(self, output_dir="temp"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def process_input(self, source):
        """Process various input types and extract audio"""
        if source.startswith(("http://", "https://")):
            # Handle URL (including YouTube)
            return self.download_from_url(source)
        elif os.path.isfile(source):
            # Handle local file
            return self.process_local_file(source)
        else:
            raise ValueError("Input source not recognized")

    def download_from_url(self, url):
        """Download media from URL (including YouTube)"""
        output_path = os.path.join(self.output_dir, "downloaded_video.mp4")
        ydl_opts = {
            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
            'outtmpl': output_path,
            # Add additional options to make downloads more robust
            'retries': 10,
            'fragment_retries': 10,
            'ignoreerrors': False,
            'no_warnings': False,
            'geo_bypass': True,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        return output_path

    def process_local_file(self, file_path):
        """Process local video or audio file"""
        # For simplicity, just return the path if it's a valid file
        return file_path

    def extract_audio(self, video_path):
        """Extract audio from video file"""
        audio_path = os.path.join(self.output_dir, "extracted_audio.wav")
        video = VideoFileClip(video_path)
        video.audio.write_audiofile(audio_path)
        return audio_path

    def separate_audio_sources(self, audio_path):
        """
        Separate voice and background music from an audio file using Demucs

        Parameters:
            audio_path (str): Path to the input audio file

        Returns:
            tuple: (voice_audio_path, background_music_path)
        """
        # Create output directory for separated audio
        separation_dir = os.path.join(self.output_dir, "separated")
        os.makedirs(separation_dir, exist_ok=True)

        # Final output paths
        voice_path = os.path.join(separation_dir, "voice.wav")
        music_path = os.path.join(separation_dir, "music.wav")

        try:
            # Method 1: Using Demucs as a command-line tool
            cmd = [
                "demucs", "--two-stems=vocals",
                "-o", separation_dir,
                audio_path
            ]

            print(f"Separating audio sources from {os.path.basename(audio_path)}...")
            result = subprocess.run(cmd, check=True, capture_output=True, text=True)
            print("Separation complete.")

            # Demucs creates a subdirectory with model name and then the base name
            base_name = os.path.splitext(os.path.basename(audio_path))[0]
            model_name = "htdemucs"  # default model
            demucs_output_dir = os.path.join(separation_dir, model_name, base_name)

            # Get the paths to the separated files
            actual_voice_path = os.path.join(demucs_output_dir, "vocals.wav")
            actual_music_path = os.path.join(demucs_output_dir, "no_vocals.wav")

            # Move files to their final locations
            shutil.copy2(actual_voice_path, voice_path)
            shutil.copy2(actual_music_path, music_path)

            # Clean up if needed
            shutil.rmtree(os.path.join(separation_dir, model_name))

            return voice_path, music_path

        except Exception as e:
            print(f"Error during audio separation: {e}")

            # Method 2: Fall back to Python API
            try:
                print("Attempting separation using Python API...")
                import torch
                from demucs.pretrained import get_model
                from demucs.apply import apply_model
                import torchaudio

                # Load audio
                audio, sr = torchaudio.load(audio_path)

                # Convert to mono if needed
                if audio.shape[0] > 1:
                    audio = audio.mean(0, keepdim=True)

                # Load model
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model = get_model('htdemucs')
                model.to(device)

                # Apply separation. apply_model expects a batch dimension and
                # returns a tensor of stems ordered as in model.sources
                # (for htdemucs: drums, bass, other, vocals), not a dictionary.
                sources = apply_model(model, audio.unsqueeze(0), device=device)[0]

                # Save the vocals stem as the voice track, and the sum of the
                # remaining stems as the background music track
                vocals_idx = model.sources.index("vocals")
                torchaudio.save(voice_path, sources[vocals_idx].cpu(), sr)
                music = sum(sources[i] for i in range(len(model.sources)) if i != vocals_idx)
                torchaudio.save(music_path, music.cpu(), sr)

                return voice_path, music_path

            except Exception as e2:
                print(f"Python API separation also failed: {e2}")
                return None, None
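For orientation, a minimal usage sketch of `MediaIngester` (the URL and paths are illustrative, not part of the repository; Demucs separation requires the `demucs` pin from requirements.txt):

```python
from media_ingestion import MediaIngester

ingester = MediaIngester(output_dir="temp")

# Works with a YouTube/HTTP URL or a local file path
video_path = ingester.process_input("https://www.youtube.com/watch?v=EXAMPLE")

# Extract the full audio track, then split it into voice and background
audio_path = ingester.extract_audio(video_path)
voice_path, music_path = ingester.separate_audio_sources(audio_path)
if voice_path is None:
    print("Separation failed; falling back to the unseparated audio")
```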
requirements.txt
ADDED
@@ -0,0 +1,21 @@
yt-dlp==2025.3.25
moviepy==1.0.3
pyannote.audio==3.3.2
librosa==0.10.0
soundfile==0.12.1
scipy==1.11.4
numpy==1.22.0
pydub==0.25.1
ffmpeg-python==0.2.0
python-dotenv==1.1.0
transformers==4.36.2
deep-translator==1.11.4
langchain-groq==0.3.1
tqdm==4.66.1
edge-tts==7.0.0
openai==1.12.0
gtts==2.5.4
demucs==4.0.1
TTS==0.22.0
langchain==0.3.21
logger==1.4
speech_diarization.py
ADDED
@@ -0,0 +1,223 @@
from pyannote.audio import Pipeline
import torch
import os
import time

class SpeakerDiarizer:
    def __init__(self, hf_token, device=None):
        """Initialize speaker diarization with HuggingFace token"""
        self.diarization_pipeline = None
        try:
            print("Loading diarization pipeline...")
            # Check available devices
            if device is None:
                device = "cuda:0" if torch.cuda.is_available() else "cpu"
            print(f"Using device: {device}")

            # Use the newer version that's compatible with your libraries
            self.diarization_pipeline = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.1",
                use_auth_token=hf_token
            )
            self.diarization_pipeline.to(torch.device(device))
            print("Diarization model loaded successfully!")
        except Exception as e:
            print(f"Error loading diarization model: {e}")

    def diarize(self, audio_path, min_speakers=1, max_speakers=None, device=None):
        """Identify speakers in audio file"""
        if not self.diarization_pipeline:
            print("Diarization pipeline not available")
            return []

        try:
            print("Starting speaker diarization (this may take several minutes for longer files)...")
            start_time = time.time()

            # Set parameters for diarization
            params = {}
            if min_speakers is not None:
                params["min_speakers"] = min_speakers
            if max_speakers is not None:
                params["max_speakers"] = max_speakers

            # Set device if specified (cuda:0, cpu, etc.)
            if device:
                print(f"Using device: {device}")
                self.diarization_pipeline.to(torch.device(device))

            # Add progress updates
            print("Running diarization model...")
            print("This process may take several minutes with no visible progress...")
            print("Consider using a smaller audio segment for testing")

            # Use the diarization pipeline
            diarization = self.diarization_pipeline(audio_path, **params)

            print("Processing diarization results...")
            speakers = []
            for turn, _, speaker in diarization.itertracks(yield_label=True):
                speakers.append({'start': turn.start, 'end': turn.end, 'speaker': speaker})

            duration = time.time() - start_time
            print(f"Diarization completed in {duration:.1f} seconds")
            print(f"Detected {len(set(s['speaker'] for s in speakers))} unique speakers")

            return speakers
        except Exception as e:
            print(f"Error during diarization: {e}")
            return []

    def assign_speakers_to_segments(self, segments, speakers):
        """
        Assign speaker labels to transcript segments based on timing overlap

        Args:
            segments: List of transcript segments with start/end times
            speakers: List of speaker segments from diarization

        Returns:
            Updated segments with speaker information
        """
        # If no speakers found, assign everything to SPEAKER_0
        if not speakers:
            for segment in segments:
                segment["speaker"] = "SPEAKER_0"
            return segments

        # For single speaker, optimize by assigning all to same speaker
        if len(set(s["speaker"] for s in speakers)) == 1:
            speaker_id = speakers[0]["speaker"]
            for segment in segments:
                segment["speaker"] = speaker_id
            return segments

        # Process each segment
        for segment in segments:
            segment_start = segment.get("start", 0)
            segment_end = segment.get("end", 0)
            segment_duration = segment_end - segment_start

            # Find overlapping speakers
            speaker_overlaps = []

            for speaker_turn in speakers:
                # Fast check for any overlap
                if not (speaker_turn["end"] <= segment_start or speaker_turn["start"] >= segment_end):
                    # Calculate overlap duration
                    overlap_start = max(speaker_turn["start"], segment_start)
                    overlap_end = min(speaker_turn["end"], segment_end)
                    overlap_duration = overlap_end - overlap_start

                    # Calculate overlap percentage relative to segment duration
                    overlap_percentage = overlap_duration / segment_duration if segment_duration > 0 else 0

                    speaker_overlaps.append((speaker_turn["speaker"], overlap_duration, overlap_percentage))

            # Assign speaker with the most overlap
            if speaker_overlaps:
                # Sort by overlap duration (descending)
                speaker_overlaps.sort(key=lambda x: x[1], reverse=True)
                segment["speaker"] = speaker_overlaps[0][0]

                # Add confidence score if desired
                # segment["speaker_confidence"] = speaker_overlaps[0][2]
            else:
                # Find nearest speaker if no overlap
                segment_mid = (segment_start + segment_end) / 2

                closest_speaker = min(
                    speakers,
                    key=lambda s: min(
                        abs(s["start"] - segment_mid),
                        abs(s["end"] - segment_mid)
                    )
                )
                segment["speaker"] = closest_speaker["speaker"]

                # You can log this if logging is set up
                # print(f"No speaker overlap found for segment at {segment_start:.2f}s, using nearest speaker")

        return segments

    def extract_speaker_references(self, audio_path, speakers, output_dir="reference_audio", min_duration=3.0, max_duration=10.0):
        """
        Extract reference audio clips for each unique speaker.

        Args:
            audio_path: Path to the original audio file
            speakers: List of speaker segments from diarization
            output_dir: Directory to save reference audio clips
            min_duration: Minimum duration for a reference clip (seconds)
            max_duration: Maximum duration for a reference clip (seconds)

        Returns:
            Dictionary mapping speaker IDs to reference audio file paths
        """
        import os
        from pydub import AudioSegment

        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)

        # Load the original audio file
        try:
            full_audio = AudioSegment.from_file(audio_path)
        except Exception as e:
            print(f"Error loading audio file: {e}")
            return {}

        # Get unique speaker IDs
        unique_speakers = set(segment["speaker"] for segment in speakers)
        reference_files = {}

        print(f"Extracting reference audio for {len(unique_speakers)} speakers...")

        for speaker in unique_speakers:
            # Find all segments for this speaker
            speaker_segments = [s for s in speakers if s["speaker"] == speaker]

            # Sort segments by duration (descending)
            speaker_segments.sort(key=lambda s: s["end"] - s["start"], reverse=True)

            # Find a segment with suitable duration
            selected_segment = None
            for segment in speaker_segments:
                duration = segment["end"] - segment["start"]
                if duration >= min_duration:
                    # If longer than max_duration, trim it
                    if duration > max_duration:
                        mid_point = (segment["start"] + segment["end"]) / 2
                        half_max = max_duration / 2
                        segment = {
                            "start": mid_point - half_max,
                            "end": mid_point + half_max,
                            "speaker": speaker
                        }
                    selected_segment = segment
                    break

            # If no segment is long enough, take the longest one
            if selected_segment is None and speaker_segments:
                selected_segment = speaker_segments[0]

            # Extract the audio segment
            if selected_segment:
                start_ms = int(selected_segment["start"] * 1000)
                end_ms = int(selected_segment["end"] * 1000)

                # Extract audio segment
                speaker_audio = full_audio[start_ms:end_ms]

                # Save to file
                speaker_id = speaker.replace("SPEAKER_", "")
                output_path = os.path.join(output_dir, f"speaker_{speaker_id}_reference.wav")
                speaker_audio.export(output_path, format="wav")

                reference_files[speaker] = output_path

                print(f"  Extracted {selected_segment['end'] - selected_segment['start']:.2f}s reference audio for {speaker}")
            else:
                print(f"  No suitable audio segment found for {speaker}")

        return reference_files
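A sketch of how these methods compose in the pipeline (assuming a valid `HUGGINGFACE_TOKEN` in the environment, an extracted audio file, and transcript `segments` produced by the recognizer in the next file; paths are illustrative):

```python
import os
from speech_diarization import SpeakerDiarizer

diarizer = SpeakerDiarizer(hf_token=os.getenv("HUGGINGFACE_TOKEN"))

# Who spoke when
speakers = diarizer.diarize("temp/extracted_audio.wav", max_speakers=2)

# Attach a speaker label to each transcript segment by timing overlap
segments = diarizer.assign_speakers_to_segments(segments, speakers)

# Save one reference clip per speaker for XTTS voice cloning
references = diarizer.extract_speaker_references("temp/extracted_audio.wav", speakers)
```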
speech_recognition.py
ADDED
@@ -0,0 +1,19 @@
import whisper
import os

class SpeechRecognizer:
    def __init__(self, model_size="base"):
        self.model = whisper.load_model(model_size)

    def transcribe(self, audio_path, language="en"):
        """Transcribe audio file with timestamps"""
        # Get segment-level results (word-level timestamps disabled)
        result = self.model.transcribe(
            audio_path,
            language=language,
            word_timestamps=False,
            verbose=False
        )

        # Return segments with timestamps
        return result["segments"]
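Usage is a single call; a quick sketch (the path is illustrative, and each returned Whisper segment carries `start`, `end`, and `text` keys):

```python
from speech_recognition import SpeechRecognizer

recognizer = SpeechRecognizer(model_size="base")
segments = recognizer.transcribe("temp/extracted_audio.wav", language="en")

for seg in segments[:3]:
    print(f"[{seg['start']:.2f} - {seg['end']:.2f}] {seg['text']}")
```

Note that a local module named `speech_recognition.py` will shadow the PyPI `SpeechRecognition` package if both are on the import path.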
text_to_speech.py
ADDED
@@ -0,0 +1,619 @@
import numpy as np
import os
import re
import tempfile
import logging
import torch
from pydub import AudioSegment
from pathlib import Path
import subprocess
import librosa
import soundfile as sf


# Set up basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Create directory structure
def ensure_directories():
    """Ensure the required directories exist"""
    directories = ["audio", "audio2", "reference_audio"]
    for directory in directories:
        os.makedirs(directory, exist_ok=True)

ensure_directories()  # Call immediately to ensure directories exist

# Setup audio effects for pydub
def setup_audio_effects():
    """Setup custom audio effects"""
    from pydub import effects

    # Add speedup if it's missing
    if not hasattr(AudioSegment, "speedup"):
        def speedup(audio_segment, playback_speed=1.5):
            if playback_speed <= 0 or playback_speed == 1.0:
                return audio_segment
            new_frame_rate = int(audio_segment.frame_rate * playback_speed)
            adjusted = audio_segment._spawn(audio_segment.raw_data,
                                            overrides={'frame_rate': new_frame_rate})
            return adjusted.set_frame_rate(audio_segment.frame_rate)
        AudioSegment.speedup = speedup

    # Add time_stretch if it's missing
    if not hasattr(effects, "time_stretch"):
        def time_stretch(audio_segment, stretch_factor):
            if stretch_factor <= 0 or stretch_factor == 1.0:
                return audio_segment
            original_frame_rate = audio_segment.frame_rate
            new_frame_rate = int(original_frame_rate / stretch_factor)
            stretched = audio_segment._spawn(
                audio_segment.raw_data,
                overrides={'frame_rate': new_frame_rate}
            )
            return stretched.set_frame_rate(original_frame_rate)
        effects.time_stretch = time_stretch

    return effects

effects = setup_audio_effects()

def adjust_audio_duration(audio_segment, target_duration):
    """Adjust audio to target duration by adding silence or trimming"""
    current_duration = len(audio_segment) / 1000  # ms to seconds

    if current_duration < target_duration:
        silence_duration_ms = int((target_duration - current_duration) * 1000)
        silence = AudioSegment.silent(duration=silence_duration_ms)
        return audio_segment + silence
    else:
        return audio_segment[:int(target_duration * 1000)]

# XTTS Model Loader (Singleton pattern)
class XTTSModelLoader:
    _instance = None
    model = None

    @classmethod
    def get_model(cls):
        """Get or initialize the XTTS model"""
        if cls.model is None:
            try:
                from TTS.api import TTS

                # Determine device
                device = "cuda" if torch.cuda.is_available() else "cpu"
                logger.info(f"Loading XTTS model on {device}...")

                # Load the model
                cls.model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
                logger.info("XTTS model loaded successfully")
            except Exception as e:
                logger.error(f"Error loading XTTS model: {e}")
                return None

        return cls.model

def smooth_speed_change(audio_path, target_duration):
    """
    Adjust audio speed with instantaneous time stretching to match target duration

    Args:
        audio_path: Path to audio file to adjust
        target_duration: Target duration in seconds

    Returns:
        Path to adjusted audio file (temporary file)
    """
    try:
        # Debug start
        print(f"\n[DEBUG] Starting audio speed adjustment:")
        print(f"[DEBUG] Input file: {audio_path}")
        print(f"[DEBUG] Target duration: {target_duration:.2f}s")

        # Load audio with librosa
        y, sr = librosa.load(audio_path, sr=None)

        # Calculate current duration and speed factor
        current_duration = librosa.get_duration(y=y, sr=sr)
        speed_factor = current_duration / target_duration

        print(f"[DEBUG] Current duration: {current_duration:.2f}s")
        print(f"[DEBUG] Calculated speed factor: {speed_factor:.3f}")

        # If the difference is minimal, return original path
        if abs(speed_factor - 1) < 0.05:
            print(f"[DEBUG] Speed factor {speed_factor:.3f} is within 5% threshold, skipping adjustment")
            return audio_path

        # Dynamic speed factor limits based on audio duration
        # Allow more aggressive speed factors for short audio
        if current_duration < 10.0:  # Short audio under 10 seconds
            max_speed = 3.0  # More aggressive for short segments
        else:
            max_speed = 2.7  # Standard limit for longer audio

        min_speed = 0.5  # Allow more slowdown when needed

        # Check if extreme speed change is needed
        extreme_adjustment = (speed_factor > max_speed)

        # Limit speed factor to reasonable range
        original_speed_factor = speed_factor
        speed_factor = min(max(speed_factor, min_speed), max_speed)

        if original_speed_factor != speed_factor:
            print(f"[DEBUG] Speed factor clamped from {original_speed_factor:.3f} to {speed_factor:.3f}")
            if extreme_adjustment:
                print(f"[DEBUG] Extreme adjustment needed - will apply max speed and then trim")

        # Track processing time
        import time
        start_time = time.time()

        # SIMPLIFIED: Apply direct time stretching to the entire audio at once
        print(f"[DEBUG] Applying instantaneous time stretching with factor {speed_factor:.3f}")
        stretched_audio = librosa.effects.time_stretch(y=y, rate=speed_factor)

        # Calculate new duration
        expected_duration = len(stretched_audio) / sr

        # Save to temporary file
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        sf.write(temp_file.name, stretched_audio, sr)

        # Calculate processing time
        process_time = time.time() - start_time

        # Verify the actual duration after processing
        y_check, sr_check = librosa.load(temp_file.name, sr=None)
        actual_duration = librosa.get_duration(y=y_check, sr=sr_check)

        method = "direct"

        # For extreme cases, perform additional trimming
        if extreme_adjustment and actual_duration > target_duration:
            print(f"[DEBUG] Performing additional trim for extreme case")
            # Calculate how many samples to keep
            samples_to_keep = int(target_duration * sr_check)

            # Apply a small fade out to avoid clicks
            fade_samples = min(int(0.1 * sr_check), samples_to_keep // 4)  # 100ms fade or less

            # Keep only the needed samples
            trimmed_audio = y_check[:samples_to_keep]

            # Apply fade out to avoid clicks
            if fade_samples > 0:
                fade_env = np.linspace(1.0, 0.0, fade_samples)
                trimmed_audio[-fade_samples:] *= fade_env

            # Save the trimmed version
            sf.write(temp_file.name, trimmed_audio, sr_check)

            # Update actual duration
            actual_duration = librosa.get_duration(y=trimmed_audio, sr=sr_check)
            method += "+trim"

        print(f"[DEBUG] Method used: {method}")
        print(f"[DEBUG] Processing completed in {process_time:.2f} seconds")
        print(f"[DEBUG] Expected new duration: {expected_duration:.2f}s")
        print(f"[DEBUG] Actual new duration: {actual_duration:.2f}s")
        print(f"[DEBUG] Target was: {target_duration:.2f}s")
        print(f"[DEBUG] Difference from target: {abs(actual_duration - target_duration):.3f}s")
        print(f"[DEBUG] Output file: {temp_file.name}")

        return temp_file.name

    except Exception as e:
        import traceback
        print(f"[DEBUG ERROR] Audio speed adjustment failed: {e}")
        print(traceback.format_exc())
        logger.warning(f"Audio speed adjustment failed: {e}")
        return audio_path

def create_segmented_edge_tts(text, pitch, voice, output_path, target_duration=None):
    """Generate speech with a specific voice, pitch, and timing using Edge TTS"""
    # Create a temporary file
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
    temp_filename = temp_file.name  # Store filename before closing
    temp_file.close()

    # Fix pitch formatting
    pitch_param = f"+{pitch}Hz" if pitch >= 0 else f"{pitch}Hz"

    command = [
        "edge-tts",
        f"--pitch={pitch_param}",
        "--voice", voice,
        "--text", text,
        "--write-media", temp_filename
    ]
    subprocess.run(command, check=True)
    # Load audio
    audio = AudioSegment.from_file(temp_filename, format="mp3")

    # Time constraint adjustment
    if target_duration is not None:
        current_duration = len(audio) / 1000  # ms to seconds

        if abs(current_duration - target_duration) > 0.1:  # 100ms threshold
            speed_factor = current_duration / target_duration
            speed_factor = min(max(speed_factor, 0.7), 3)  # Keep within bounds

            logger.info(f"  Adjusting timing: {current_duration:.2f}s → {target_duration:.2f}s (factor: {speed_factor:.2f})")

            # Apply time adjustment
            # Instead of speed adjustments after generation, use Edge TTS rate parameter
            if speed_factor < 1:
                rate_adjustment = f"-{int((1 - speed_factor) * 100)}%"
            else:
                rate_adjustment = f"+{int((speed_factor - 1) * 100)}%"

            # Regenerate with adjusted rate
            os.unlink(temp_file.name)  # Remove the previous temp file

            # Create new command with rate parameter and fixed pitch formatting
            command = [
                "edge-tts",
                f"--pitch={pitch_param}",
                f"--rate={rate_adjustment}",
                "--voice", voice,
                "--text", text,
                "--write-media", temp_filename
            ]
            subprocess.run(command, check=True)

            # Reload audio with rate adjustment
            audio = AudioSegment.from_file(temp_filename, format="mp3")

            # Fine-tune if needed
            new_duration = len(audio) / 1000
            if abs(new_duration - target_duration) > 0.1:
                audio = adjust_audio_duration(audio, target_duration)

    # Save the modified audio
    audio.export(output_path, format="wav")

    # Clean up temporary file
    os.unlink(temp_file.name)

    # Log final duration
    final_audio = AudioSegment.from_file(output_path)
    final_duration = len(final_audio) / 1000
    logger.info(f"  Final duration: {final_duration:.2f}s (target: {target_duration if target_duration else 'None'}s)")

    return output_path

def create_segmented_xtts(text, reference_audio, language, output_path, target_duration=None):
    """Create voice-cloned speech using XTTS with speaker's reference audio and duration control"""
    # Get the model (will be loaded on first call)
    tts_model = XTTSModelLoader.get_model()

    if tts_model is None:
        raise RuntimeError("XTTS model could not be loaded. Ensure TTS is installed.")

    # Verify reference audio exists
    if not os.path.exists(reference_audio):
        raise FileNotFoundError(f"Reference audio file not found: {reference_audio}")

    # Generate speech
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
    temp_filename = temp_file.name
    temp_file.close()

    logger.info(f"Generating XTTS speech using reference: {os.path.basename(reference_audio)}")

    # Step 1: Try to optimize the generation parameters based on text length and target duration
    # Short text might need special handling to avoid excessive padding
    is_short_text = len(text.strip()) < 10

    # XTTS generation options
    generation_kwargs = {}

    # Add text length information for very short text to help the model
    # Note: These are example parameters - actual parameter support depends on the XTTS version
    if is_short_text and target_duration is not None and target_duration < 2.0:
        logger.info(f"  Short text detected, attempting to minimize padding")
        # These parameters may or may not be supported by the TTS model being used
        generation_kwargs = {
            'enable_text_splitting': False,  # Avoid splitting short text
            'no_silence_end': True,  # Reduce trailing silence
        }
        # Some models may support 'speed' parameter
        if hasattr(tts_model, 'tts_with_speed'):
            generation_kwargs['speed'] = 1.2  # Slightly faster for short text

    try:
        # Try generating with optional parameters if supported
        if generation_kwargs:
            try:
                tts_model.tts_to_file(
                    text=text,
                    speaker_wav=reference_audio,
                    language=language,
                    file_path=temp_filename,
                    **generation_kwargs
                )
            except (TypeError, ValueError):
                # If parameters aren't supported, fall back to standard call
                logger.info("  Advanced parameters not supported, using standard generation")
                tts_model.tts_to_file(
                    text=text,
                    speaker_wav=reference_audio,
                    language=language,
                    file_path=temp_filename
                )
        else:
            # Standard generation
            tts_model.tts_to_file(
                text=text,
                speaker_wav=reference_audio,
                language=language,
                file_path=temp_filename
            )

        # Load generated audio
        audio = AudioSegment.from_file(temp_filename)

        # Step 2: Apply duration adjustment if needed
        if target_duration is not None:
            current_duration = len(audio) / 1000  # ms to seconds

            if abs(current_duration - target_duration) > 0.1:  # 100ms threshold
                # Calculate speed factor - inverse of duration ratio
                speed_factor = current_duration / target_duration
                speed_factor = min(max(speed_factor, 0.7), 3)  # Allow wider range for better adjustment

                logger.info(f"  Adjusting timing: {current_duration:.2f}s → {target_duration:.2f}s (speed factor: {speed_factor:.2f})")

                try:
                    # Always attempt smooth speed change since regeneration doesn't work
                    logger.info("  Applying smooth speed adjustment...")
                    adjusted_path = smooth_speed_change(temp_filename, target_duration)

                    if adjusted_path != temp_filename:  # If path is different, adjustment was done
                        # Load the adjusted audio
                        audio = AudioSegment.from_file(adjusted_path)

                        # Check if adjustment was successful
                        new_duration = len(audio) / 1000
                        if abs(new_duration - target_duration) <= 0.15:  # 150ms tolerance
                            logger.info(f"  Smooth adjustment successful: {new_duration:.2f}s")

                            # Clean up original file and use the adjusted one
                            os.unlink(temp_filename)
                            temp_filename = adjusted_path
                        else:
                            # Clean up adjusted file and just use duration adjustment
                            logger.info(f"  Smooth adjustment not precise enough ({new_duration:.2f}s), will fine-tune with duration adjustment")
                            os.unlink(adjusted_path)
                            # We'll fall through to the final duration adjustment step
                except Exception as e:
                    logger.warning(f"  Smooth speed adjustment failed: {str(e)}")
                    # We'll fall through to the final duration adjustment step

                # Always perform final duration adjustment to ensure exact timing
                new_duration = len(audio) / 1000
                if abs(new_duration - target_duration) > 0.1:
                    logger.info(f"  Fine-tuning with duration adjustment: {new_duration:.2f}s → {target_duration:.2f}s")
                    audio = adjust_audio_duration(audio, target_duration)

        # Save the final audio
        audio.export(output_path, format="wav")

        # Clean up
        os.unlink(temp_filename)

        # Log final duration
        final_audio = AudioSegment.from_file(output_path)
        final_duration = len(final_audio) / 1000
        logger.info(f"  Final duration: {final_duration:.2f}s (target: {target_duration if target_duration else 'None'}s)")

        return output_path

    except Exception as e:
        logger.error(f"XTTS generation failed: {e}")
        if os.path.exists(temp_filename):
            os.unlink(temp_filename)
        raise

def process_voice_config(voice_config):
    """
    Process voice configuration to support both Edge TTS and XTTS

    Args:
        voice_config: Dict with speaker_id keys and configuration values
            For Edge TTS: {'engine': 'edge_tts', 'gender': 'male'/'female'} or simply 'male'/'female'
            For XTTS: {'engine': 'xtts', 'reference_audio': '/path/to/audio.wav', 'language': 'hi'}

    Returns:
        Processed configuration dictionary
    """
    processed_config = {}

    # Handle empty config
    if not voice_config:
        return {0: {'engine': 'edge_tts', 'voice': "hi-IN-MadhurNeural", 'pitch': 0}}

    # Track Edge TTS speaker counts for pitch variations
    edge_male_count = 0
    edge_female_count = 0

    # Pitch variations for multiple Edge TTS speakers of same gender
    male_pitches = [0, -30, 40]  # Default, deeper, higher
    female_pitches = [0, 25, -25]  # Default, higher, deeper

    for speaker_id, config in voice_config.items():
        # Convert string speaker_id to int if needed
        if isinstance(speaker_id, str) and speaker_id.isdigit():
            speaker_id = int(speaker_id)

        # Determine which engine to use (default is edge_tts)
        if isinstance(config, dict):
            engine = config.get('engine', 'edge_tts')
        else:
            # Handle simple gender strings for backwards compatibility
            engine = 'edge_tts'
            config = {'gender': config} if config in ['male', 'female'] else {'gender': 'male'}

        if engine == 'xtts':
            # XTTS configuration - each speaker needs their own reference audio
            if 'reference_audio' not in config:
                logger.warning(f"No reference audio provided for XTTS speaker {speaker_id}, falling back to Edge TTS")
                # Fall back to Edge TTS if no reference audio
                engine = 'edge_tts'
                gender = config.get('gender', 'male')
            else:
                # Valid XTTS configuration
                processed_config[speaker_id] = {
                    'engine': 'xtts',
                    'reference_audio': config['reference_audio'],
                    'language': config.get('language', 'hi')  # Default to Hindi
                }
                continue  # Skip the Edge TTS processing below

        # Edge TTS configuration (if engine is edge_tts or XTTS fallback)
        gender = config.get('gender', 'male')

        if gender == 'male':
            # Assign male voice and pitch
            pitch = male_pitches[edge_male_count % len(male_pitches)]
            processed_config[speaker_id] = {
                'engine': 'edge_tts',
                'voice': "hi-IN-MadhurNeural",
                'pitch': pitch
            }
            edge_male_count += 1
        else:
            # Assign female voice and pitch
            pitch = female_pitches[edge_female_count % len(female_pitches)]
            processed_config[speaker_id] = {
                'engine': 'edge_tts',
                'voice': "hi-IN-SwaraNeural",
                'pitch': pitch
            }
            edge_female_count += 1

    return processed_config

def generate_tts(segments, target_language, voice_config=None, output_dir="audio2"):
    """
    Generate speech for all segments using appropriate TTS engine per speaker

    Args:
        segments: List of segments with text, speaker, start and end times
        target_language: Language code for TTS
        voice_config: Dictionary with speaker configurations
            - For Edge TTS: {'gender': 'male'/'female'} or just 'male'/'female'
            - For XTTS: {'engine': 'xtts', 'reference_audio': '/path/to/audio.wav'}
        output_dir: Directory to save the final audio

    Returns:
        Path to the final combined audio file
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Generate the full audio
    output_path = os.path.join(output_dir, "dubbed_conversation.wav")
    max_end_time = max(segment['end'] for segment in segments)

    # Create a silent audio of the total duration
    combined = AudioSegment.silent(duration=int(max_end_time * 1000) + 100)
    ensure_directories()
    audio_files = []

    # Process voice configuration
    processed_config = process_voice_config(voice_config or {})
    print(processed_config)

    # Process each segment
    for i, segment in enumerate(segments):
        # Extract speaker ID
        speaker = segment.get('speaker', 'SPEAKER_00')
        match = re.search(r'SPEAKER_(\d+)', speaker)
        speaker_id = int(match.group(1)) if match else 0

        # Get speaker configuration
        speaker_config = processed_config.get(speaker_id,
                                              {'engine': 'edge_tts', 'voice': "hi-IN-SwaraNeural", 'pitch': 0})

        # Get text and timing information
        text = segment['text']
        start = segment['start']
        end = segment['end']
        duration = end - start

        # Create output filename
        output_file = f"audio/{start}.wav"

        logger.info(f"Processing segment {i+1} (Speaker {speaker_id}, Engine: {speaker_config['engine']}):")
        logger.info(f"  Text: {text[:50]}{'...' if len(text) > 50 else ''}")
        logger.info(f"  Duration: {duration:.2f}s")

        # Choose appropriate TTS engine
        if speaker_config['engine'] == 'xtts':
            # XTTS generation with speaker's reference audio
            try:
                create_segmented_xtts(
                    text=text,
                    reference_audio=speaker_config['reference_audio'],
                    language=speaker_config.get('language', target_language),
                    output_path=output_file,
                    target_duration=duration,
                )
            except Exception as e:
                logger.error(f"Error using XTTS for speaker {speaker_id}: {e}")
                logger.warning(f"Falling back to Edge TTS for this segment")
                # Fallback to Edge TTS
                create_segmented_edge_tts(
                    text=text,
                    pitch=0,
                    voice="hi-IN-SwaraNeural",
                    output_path=output_file,
                    target_duration=duration,
                )
        else:
            # Edge TTS generation
            create_segmented_edge_tts(
                text=text,
                pitch=speaker_config.get('pitch', 0),
                voice=speaker_config.get('voice', "hi-IN-SwaraNeural"),
                output_path=output_file,
                target_duration=duration,
            )

        audio_files.append(output_file)

        # Add segment to combined audio at the exact timestamp
        segment_audio = AudioSegment.from_file(output_file)
        position_ms = int(segment['start'] * 1000)
        combined = combined.overlay(segment_audio, position=position_ms)

    # Export the final combined audio
    combined.export(output_path, format="wav")
    logger.info(f"  Final combined duration: {len(combined) / 1000:.2f}s")

    # Clean up segment files
    for file in audio_files:
        try:
            os.remove(file)
        except OSError:
            pass

    # Verify the final duration
    final_audio = AudioSegment.from_file(output_path)
    final_duration_sec = len(final_audio) / 1000

    print(f"\nTarget duration: {max_end_time:.2f} seconds")
    print(f"Actual duration: {final_duration_sec:.2f} seconds")

    # If the final audio is still too long, trim it
    if final_duration_sec > max_end_time + 0.1:  # Allow 100ms grace
        trimmed = final_audio[:int(max_end_time * 1000)]
        trimmed.export(output_path, format="wav")
        print(f"Trimmed to exactly {max_end_time:.2f} seconds")

    return output_path
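A sketch of calling `generate_tts` with a mixed per-speaker configuration (the segments, text, and paths are illustrative; speaker 0 uses plain Edge TTS while speaker 1 is cloned with XTTS from a reference clip produced by `extract_speaker_references`):

```python
from text_to_speech import generate_tts

# Already-translated (Hindi) segments with speaker labels and timings
segments = [
    {"text": "नमस्ते, आप कैसे हैं?", "speaker": "SPEAKER_00", "start": 0.0, "end": 2.5},
    {"text": "मैं ठीक हूँ, धन्यवाद।", "speaker": "SPEAKER_01", "start": 2.5, "end": 4.8},
]

voice_config = {
    0: {"engine": "edge_tts", "gender": "male"},
    1: {"engine": "xtts",
        "reference_audio": "reference_audio/speaker_01_reference.wav",
        "language": "hi"},
}

dubbed_path = generate_tts(segments, target_language="hi", voice_config=voice_config)
print(f"Dubbed track written to {dubbed_path}")
```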
translate.py
ADDED
@@ -0,0 +1,391 @@
| 1 |
+
import json
|
| 2 |
+
import copy
|
| 3 |
+
import logging
|
| 4 |
+
import time
|
| 5 |
+
from typing import List, Dict, Optional, Any, Union
|
| 6 |
+
from itertools import chain
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
from deep_translator import GoogleTranslator
|
| 9 |
+
from dotenv import load_dotenv
|
| 10 |
+
|
| 11 |
+
# Add this at the top of your translate.py file
|
| 12 |
+
# Load environment variables from .env file
|
| 13 |
+
load_dotenv()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Configure logging
|
| 17 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
# Import Groq
|
| 21 |
+
from langchain_groq import ChatGroq
|
| 22 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 23 |
+
from langchain.chains import LLMChain
|
| 24 |
+
|
| 25 |
+
# Language code constants - simplified for now
|
| 26 |
+
ISO_LANGUAGE_CODES = {
|
| 27 |
+
"en": "english",
|
| 28 |
+
"es": "spanish",
|
| 29 |
+
"fr": "french",
|
| 30 |
+
"de": "german",
|
| 31 |
+
"it": "italian",
|
| 32 |
+
"pt": "portuguese",
|
| 33 |
+
"ru": "russian",
|
| 34 |
+
"zh": "chinese",
|
| 35 |
+
"ja": "japanese",
|
| 36 |
+
"ko": "korean",
|
| 37 |
+
"hi": "hindi",
|
| 38 |
+
"ar": "arabic",
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
def fix_language_code(language_code: Optional[str]) -> str:
|
| 42 |
+
"""Convert language code to format compatible with translator."""
|
| 43 |
+
if not language_code:
|
| 44 |
+
return "auto"
|
| 45 |
+
|
| 46 |
+
# Clean up language code (remove region specifiers)
|
| 47 |
+
language_code = language_code.lower().split('-')[0]
|
| 48 |
+
|
| 49 |
+
# Return the cleaned code if it's in our list, otherwise default to auto
|
| 50 |
+
return language_code if language_code in ISO_LANGUAGE_CODES else "auto"
|
| 51 |
+
|
| 52 |
+
def translate_iterative(segments: List[Dict[str, Any]],
|
| 53 |
+
target_lang: str,
|
| 54 |
+
source_lang: Optional[str] = None) -> List[Dict[str, Any]]:
|
| 55 |
+
"""
|
| 56 |
+
Translate text segments individually to the specified language.
|
| 57 |
+
|
| 58 |
+
Args:
|
| 59 |
+
segments: List of dictionaries with 'text' key containing the text to translate
|
| 60 |
+
target_lang: Target language code
|
| 61 |
+
source_lang: Source language code (defaults to auto-detect)
|
| 62 |
+
|
| 63 |
+
Returns:
|
| 64 |
+
List of segments with translated text
|
| 65 |
+
"""
|
| 66 |
+
segments_copy = copy.deepcopy(segments)
|
| 67 |
+
source = fix_language_code(source_lang)
|
| 68 |
+
target = fix_language_code(target_lang)
|
| 69 |
+
|
| 70 |
+
logger.info(f"Translating {len(segments)} segments from {source} to {target} (iterative)")
|
| 71 |
+
translator = GoogleTranslator(source=source, target=target)
|
| 72 |
+
|
| 73 |
+
for i, segment in enumerate(tqdm(segments_copy, desc="Translating")):
|
| 74 |
+
text = segment["text"].strip()
|
| 75 |
+
try:
|
| 76 |
+
translated_text = translator.translate(text)
|
| 77 |
+
segments_copy[i]["text"] = translated_text
|
| 78 |
+
except Exception as error:
|
| 79 |
+
logger.error(f"Error translating segment {i}: {error}")
|
| 80 |
+
# Keep original text if translation fails
|
| 81 |
+
segments_copy[i]["text"] = text
|
| 82 |
+
|
| 83 |
+
return segments_copy
|
| 84 |
+
|
| 85 |
+
def verify_translation(original_segments: List[Dict[str, Any]],
|
| 86 |
+
segments_copy: List[Dict[str, Any]],
|
| 87 |
+
translated_lines: List[str],
|
| 88 |
+
target_lang: str,
|
| 89 |
+
source_lang: Optional[str] = None) -> List[Dict[str, Any]]:
|
| 90 |
+
"""
|
| 91 |
+
Verify translation integrity and assign translated text to segments.
|
| 92 |
+
Falls back to iterative translation if segment counts don't match.
|
| 93 |
+
"""
|
| 94 |
+
if len(original_segments) == len(translated_lines):
|
| 95 |
+
for i in range(len(segments_copy)):
|
| 96 |
+
segments_copy[i]["text"] = translated_lines[i].replace("\t", " ").replace("\n", " ").strip()
|
| 97 |
+
return segments_copy
|
| 98 |
+
else:
|
| 99 |
+
logger.error(
|
| 100 |
+
f"Translation failed: segment count mismatch. Original: {len(original_segments)}, "
|
| 101 |
+
f"Translated: {len(translated_lines)}. Switching to iterative translation."
|
| 102 |
+
)
|
| 103 |
+
return translate_iterative(original_segments, target_lang, source_lang)
|
| 104 |
+
|
| 105 |
+


def translate_batch(segments: List[Dict[str, Any]],
                    target_lang: str,
                    chunk_size: int = 4000,
                    source_lang: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    Translate a batch of text segments in chunks to respect API limits.

    Args:
        segments: List of dictionaries with 'text' key
        target_lang: Target language code
        chunk_size: Maximum character count per chunk (default: 4000)
        source_lang: Source language code (defaults to auto-detect)

    Returns:
        List of segments with translated text
    """
    segments_copy = copy.deepcopy(segments)
    source = fix_language_code(source_lang)
    target = fix_language_code(target_lang)

    logger.info(f"Translating {len(segments)} segments from {source} to {target} (batch)")

    # Extract text from segments
    text_lines = [segment["text"].strip() for segment in segments]

    # Create chunks respecting the character limit
    text_chunks = []
    current_chunk = ""
    chunk_segments = []
    segment_tracking = []

    for line in text_lines:
        line = " " if not line else line
        if (len(current_chunk) + len(line) + 7) <= chunk_size:  # 7 for the separator
            if current_chunk:
                current_chunk += " ||||| "
            current_chunk += line
            chunk_segments.append(line)
        else:
            text_chunks.append(current_chunk)
            segment_tracking.append(chunk_segments)
            current_chunk = line
            chunk_segments = [line]

    if current_chunk:
        text_chunks.append(current_chunk)
        segment_tracking.append(chunk_segments)
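
    # Worked example (comment added for clarity): the separator " ||||| " is
    # 7 characters long, hence the "+ 7" in the size check above. With the default
    # chunk_size=4000, lines of length 2000 and 1995 cannot share a chunk
    # (2000 + 1995 + 7 = 4002 > 4000), so the second line starts a new chunk.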

    # Translate the chunks
    translator = GoogleTranslator(source=source, target=target)
    translated_segments = []
    progress_bar = tqdm(total=len(segments), desc="Translating")

    try:
        for chunk_text, chunk_segments in zip(text_chunks, segment_tracking):
            translated_chunk = translator.translate(chunk_text.strip())
            split_translations = translated_chunk.split("|||||")

            # Verify chunk integrity
            if len(split_translations) == len(chunk_segments):
                progress_bar.update(len(split_translations))
                translated_segments.extend([t.strip() for t in split_translations])
            else:
                logger.warning(
                    f"Chunk translation mismatch. Expected {len(chunk_segments)}, "
                    f"got {len(split_translations)}. Translating segment by segment."
                )
                for segment in chunk_segments:
                    translated_text = translator.translate(segment.strip())
                    translated_segments.append(translated_text.strip())
                    progress_bar.update(1)

        progress_bar.close()

        # Verify and return
        return verify_translation(segments, segments_copy, translated_segments, target_lang, source_lang)

    except Exception as error:
        progress_bar.close()
        logger.error(f"Batch translation failed: {error}")
        return translate_iterative(segments, target_lang, source_lang)
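
# Example usage (illustrative sketch, not part of the original file; issues real
# network requests through GoogleTranslator):
#   segments = [{"text": "Bonjour"}, {"text": "Merci beaucoup"}]
#   translated = translate_batch(segments, target_lang="en", chunk_size=4000)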


def translate_with_groq(segments: List[Dict[str, Any]],
                        target_lang: str,
                        model_name: str = "llama-3.3-70b-versatile",
                        source_lang: Optional[str] = None,
                        batch_size: int = 10) -> List[Dict[str, Any]]:
    """
    Translate text segments using the Groq API.

    Args:
        segments: List of dictionaries with 'text' key
        target_lang: Target language code
        model_name: Groq model to use (default: "llama-3.3-70b-versatile")
        source_lang: Source language code (optional)
        batch_size: Number of segments to process in each API call

    Returns:
        List of segments with translated text
    """
    segments_copy = copy.deepcopy(segments)

    # Get language names instead of codes for clarity in prompting
    target_language = ISO_LANGUAGE_CODES.get(fix_language_code(target_lang), "the target language")
    source_language = "auto-detected language"
    if source_lang:
        source_language = ISO_LANGUAGE_CODES.get(fix_language_code(source_lang), "the source language")

    logger.info(f"Translating {len(segments)} segments from {source_language} to {target_language} using Groq")

    # Set up the Groq LLM
    llm = ChatGroq(model_name=model_name, temperature=0.2)

    # Process segments in batches
    translated_segments = []
    total_batches = (len(segments) + batch_size - 1) // batch_size

    for batch_idx in tqdm(range(total_batches), desc="Translating batches"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, len(segments))
        batch = segments[start_idx:end_idx]

        # Extract text from the segments
        batch_texts = [segment["text"].strip() for segment in batch]

        # Create a numbered text list for the prompt
        numbered_texts = [f"{i+1}. {text}" for i, text in enumerate(batch_texts)]
        batch_content = "\n".join(numbered_texts)

        # Create a prompt template for translation
        template = """
You are a professional translator. Translate the following text segments from {source_language} to {target_language}.

IMPORTANT INSTRUCTIONS:
1. Preserve the meaning, tone, and style of the original text
2. Only respond with JSON in the exact format shown below
3. Each numbered segment should be translated separately
4. Maintain the original numbering in your response
5. Keep each translated segment short and concise
6. Each translated segment should be of similar length to its input

Text to translate:
{text_segments}

The response should be ONLY a JSON array with this exact structure:
[
    "translated segment 1",
    "translated segment 2",
    ...
]
"""
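
        # Rendered-prompt sketch (comment added for clarity; sample values are
        # hypothetical): for a two-segment batch, {text_segments} becomes
        #   1. Hola
        #   2. Buenos dias
        # and the model is expected to reply with ["Hello", "Good morning"].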

        prompt = ChatPromptTemplate.from_messages([("system", template)])

        try:
            # Create a chain and execute it
            chain = LLMChain(llm=llm, prompt=prompt)
            response = chain.run(
                source_language=source_language,
                target_language=target_language,
                text_segments=batch_content
            )

            # Parse the response: first try to find a JSON array using a regex
            import re
            json_match = re.search(r'\[.*\]', response.strip(), re.DOTALL)

            if json_match:
                try:
                    translated_texts = json.loads(json_match.group(0))
                except json.JSONDecodeError:
                    # If regex JSON extraction fails, try direct parsing
                    translated_texts = json.loads(response.strip())
            else:
                # If no JSON array was found, try to parse directly
                translated_texts = json.loads(response.strip())

            # Verify the count is correct
            if len(translated_texts) != len(batch):
                logger.warning(
                    f"Translation count mismatch. Expected {len(batch)}, "
                    f"got {len(translated_texts)}. Falling back to Google Translate for this batch."
                )
                # Fall back to Google for this batch
                fallback_translations = translate_iterative(batch, target_lang, source_lang)
                translated_texts = [segment["text"] for segment in fallback_translations]

            # Add the translations to the result
            translated_segments.extend(translated_texts)

            # Avoid hitting rate limits
            time.sleep(0.5)

        except Exception as error:
            logger.error(f"Groq translation error for batch {batch_idx+1}/{total_batches}: {error}")
            logger.warning("Falling back to Google Translate for this batch")

            # Fall back to Google for this batch
            fallback_translations = translate_iterative(batch, target_lang, source_lang)
            batch_translations = [segment["text"] for segment in fallback_translations]
            translated_segments.extend(batch_translations)

    # Verify and update the segments
    return verify_translation(segments, segments_copy, translated_segments, target_lang, source_lang)
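
# Example usage (illustrative sketch, not part of the original file; requires a
# valid Groq API key configured in the environment and makes real API calls):
#   segments = [{"text": "Guten Morgen"}, {"text": "Wie geht es dir?"}]
#   translated = translate_with_groq(segments, target_lang="en", batch_size=10)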


def translate_text(segments: List[Dict[str, Any]],
                   target_lang: str,
                   translation_method: str = "batch",
                   chunk_size: int = 4000,
                   source_lang: Optional[str] = None,
                   groq_model: str = "llama-3.3-70b-versatile",
                   groq_batch_size: int = 10) -> List[Dict[str, Any]]:
    """
    Main translation function that dispatches to the different translation methods.

    Args:
        segments: List of dictionaries with 'text' key
        target_lang: Target language code
        translation_method: "batch", "iterative", or "groq" (default: "batch")
        chunk_size: Maximum character count per chunk for batch translation
        source_lang: Source language code (defaults to auto-detect)
        groq_model: Model name for Groq translation
        groq_batch_size: Batch size for Groq translation

    Returns:
        List of segments with translated text
    """
    if not segments:
        logger.warning("No segments to translate")
        return segments

    if translation_method == "batch":
        return translate_batch(segments, target_lang, chunk_size, source_lang)
    elif translation_method == "iterative":
        return translate_iterative(segments, target_lang, source_lang)
    elif translation_method == "groq":
        return translate_with_groq(
            segments,
            target_lang,
            model_name=groq_model,
            source_lang=source_lang,
            batch_size=groq_batch_size
        )
    else:
        logger.error(f"Unknown translation method: {translation_method}. Defaulting to batch translation.")
        return translate_batch(segments, target_lang, chunk_size, source_lang)
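
# Example usage (illustrative sketch, not part of the original file; the segment
# values are hypothetical):
#   segments = [{"start": 0.0, "end": 2.0, "text": "Hello there"}]
#   result = translate_text(segments, target_lang="hi", translation_method="groq")
#   # translation_method may also be "batch" (the default) or "iterative"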


def generate_srt_subtitles(segments, output_file="output.srt"):
    """
    Generate an SRT subtitle file from translated segments.

    Args:
        segments: List of dictionaries with 'start', 'end', and 'text' keys
        output_file: Path to the output SRT file

    Returns:
        Path to the created SRT file
    """
    logger.info(f"Generating SRT subtitle file: {output_file}")

    # Format a time value in seconds as HH:MM:SS,mmm
    def format_time(seconds):
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        seconds = seconds % 60
        milliseconds = int((seconds - int(seconds)) * 1000)
        return f"{hours:02d}:{minutes:02d}:{int(seconds):02d},{milliseconds:03d}"
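
    # Worked example (comment added for clarity):
    #   format_time(83.5) -> "00:01:23,500"
    #   (0 hours, 1 minute, 23 whole seconds; the 0.5 s remainder becomes 500 ms)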

    with open(output_file, "w", encoding="utf-8") as f:
        index = 1
        for segment in segments:
            # Extract timing information
            start_time = segment.get("start", 0)
            end_time = segment.get("end", 0)
            text = segment.get("text", "").strip()

            # Skip empty segments
            if not text:
                continue

            # Write the subtitle entry (a separate counter keeps SRT numbering
            # consecutive even when empty segments are skipped)
            f.write(f"{index}\n")
            f.write(f"{format_time(start_time)} --> {format_time(end_time)}\n")
            f.write(f"{text}\n\n")
            index += 1

    logger.info(f"SRT subtitle file created successfully: {output_file}")
    return output_file
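
# Example usage (illustrative sketch, not part of the original file; values are
# hypothetical):
#   segs = [{"start": 0.0, "end": 2.5, "text": "Hello"},
#           {"start": 2.5, "end": 4.0, "text": "World"}]
#   generate_srt_subtitles(segs, "subs.srt")
# would write entries such as:
#   1
#   00:00:00,000 --> 00:00:02,500
#   Hello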