Spaces:
Sleeping
Sleeping
Deepfake-Audio
Browse filesThis view is limited to 50 files because it contains too many changes. Β See raw diff
- .gitattributes +0 -0
- .gitignore +22 -0
- CITATION.cff +13 -0
- DEEPFAKE-AUDIO.ipynb +923 -0
- DEEPFAKE-AUDIO.py +689 -0
- Dataset/encoder.pt +3 -0
- Dataset/samples/Andrew Tate.wav +3 -0
- Dataset/samples/Barack Obama.wav +3 -0
- Dataset/samples/Bill Gates.wav +3 -0
- Dataset/samples/Donald Trump.wav +3 -0
- Dataset/samples/Elon Musk.wav +3 -0
- Dataset/samples/Greta Thunberg.wav +3 -0
- Dataset/samples/Hillary Clinton.wav +3 -0
- Dataset/samples/J.K. Rowling.wav +3 -0
- Dataset/samples/Jensen Huang.wav +3 -0
- Dataset/samples/Joe Biden.wav +3 -0
- Dataset/samples/Kamala Harris.wav +3 -0
- Dataset/samples/Mark Zuckerberg.wav +3 -0
- Dataset/samples/Oprah Winfrey.wav +3 -0
- Dataset/samples/Steve Jobs.wav +3 -0
- Dataset/synthesizer.pt +3 -0
- Dataset/vocoder.pt +3 -0
- LICENSE +21 -0
- Mega/Filly.jpg +3 -0
- Mega/Mega.png +3 -0
- Mega/Mega_Chair.png +3 -0
- Mega/Mega_Dining.jpg +3 -0
- Mega/Mega_Professional.jpg +3 -0
- Mega/Mega_and_Hetvi.png +3 -0
- README.md +369 -0
- SECURITY.md +41 -0
- Source Code/DEEPFAKE-AUDIO.ipynb +923 -0
- Source Code/Dockerfile +31 -0
- Source Code/app.py +907 -0
- Source Code/app_ui_demo.py +649 -0
- Source Code/demo_cli.py +237 -0
- Source Code/demo_toolbox.py +66 -0
- Source Code/encoder/__init__.py +27 -0
- Source Code/encoder/audio.py +140 -0
- Source Code/encoder/config.py +81 -0
- Source Code/encoder/data_objects/__init__.py +29 -0
- Source Code/encoder/data_objects/random_cycler.py +72 -0
- Source Code/encoder/data_objects/speaker.py +68 -0
- Source Code/encoder/data_objects/speaker_batch.py +46 -0
- Source Code/encoder/data_objects/speaker_verification_dataset.py +96 -0
- Source Code/encoder/data_objects/utterance.py +59 -0
- Source Code/encoder/inference.py +167 -0
- Source Code/encoder/model.py +152 -0
- Source Code/encoder/params_data.py +47 -0
- Source Code/encoder/params_model.py +37 -0
.gitattributes
ADDED
|
File without changes
|
.gitignore
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.pyc
|
| 2 |
+
*.aux
|
| 3 |
+
*.log
|
| 4 |
+
*.out
|
| 5 |
+
*.synctex.gz
|
| 6 |
+
*.suo
|
| 7 |
+
*__pycache__
|
| 8 |
+
*.idea
|
| 9 |
+
*.ipynb_checkpoints
|
| 10 |
+
*.pickle
|
| 11 |
+
*.npy
|
| 12 |
+
*.blg
|
| 13 |
+
*.bbl
|
| 14 |
+
*.bcf
|
| 15 |
+
*.toc
|
| 16 |
+
*.sh
|
| 17 |
+
encoder/saved_models/*
|
| 18 |
+
synthesizer/saved_models/*
|
| 19 |
+
vocoder/saved_models/*
|
| 20 |
+
saved_models/*
|
| 21 |
+
.venv/
|
| 22 |
+
.pytest_cache/
|
CITATION.cff
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
cff-version: 1.2.0
|
| 2 |
+
message: "If you use this project or its associated academic materials, please cite them as below."
|
| 3 |
+
authors:
|
| 4 |
+
- family-names: "Thakur"
|
| 5 |
+
given-names: "Amey"
|
| 6 |
+
orcid: "https://orcid.org/0000-0001-5644-1575"
|
| 7 |
+
- family-names: "Satish"
|
| 8 |
+
given-names: "Mega"
|
| 9 |
+
orcid: "https://orcid.org/0000-0002-1844-9557"
|
| 10 |
+
title: "Deepfake Audio"
|
| 11 |
+
version: 1.0.0
|
| 12 |
+
date-released: 2021-02-06
|
| 13 |
+
url: "https://github.com/Amey-Thakur/DEEPFAKE-AUDIO"
|
DEEPFAKE-AUDIO.ipynb
ADDED
|
@@ -0,0 +1,923 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "view-in-github",
|
| 7 |
+
"colab_type": "text"
|
| 8 |
+
},
|
| 9 |
+
"source": [
|
| 10 |
+
"<a href=\"https://colab.research.google.com/github/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "markdown",
|
| 15 |
+
"metadata": {
|
| 16 |
+
"id": "X7Om-e0ahXrB"
|
| 17 |
+
},
|
| 18 |
+
"source": [
|
| 19 |
+
"#\n",
|
| 20 |
+
"<h1 align=\"center\">\ud83c\udf99\ufe0f Deepfake Audio</h1>\n",
|
| 21 |
+
"<h3 align=\"center\"><i>A neural voice cloning studio powered by SV2TTS technology</i></h3>\n",
|
| 22 |
+
"\n",
|
| 23 |
+
"<div align=\"center\">\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"| **Author** | **Profiles** |\n",
|
| 26 |
+
"|:---:|:---|\n",
|
| 27 |
+
"| **Amey Thakur** | [](https://github.com/Amey-Thakur) [](https://orcid.org/0000-0001-5644-1575) [](https://scholar.google.ca/citations?user=0inooPgAAAAJ&hl=en) [](https://www.kaggle.com/ameythakur20) |\n",
|
| 28 |
+
"| **Mega Satish** | [](https://github.com/msatmod) [](https://orcid.org/0000-0002-1844-9557) [](https://scholar.google.ca/citations?user=7Ajrr6EAAAAJ&hl=en) [](https://www.kaggle.com/megasatish) |\n",
|
| 29 |
+
"\n",
|
| 30 |
+
"---\n",
|
| 31 |
+
"\n",
|
| 32 |
+
"**Attribution:** This project builds upon the foundational work of [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning).\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"\ud83d\ude80 **Live Demo:** [Hugging Face Space](https://huggingface.co/spaces/ameythakur/Deepfake-Audio) | \ud83c\udfa5 **Video Demo:** [YouTube](https://youtu.be/i3wnBcbHDbs) | \ud83d\udcbb **Repository:** [GitHub](https://github.com/Amey-Thakur/DEEPFAKE-AUDIO)\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"<a href=\"https://youtu.be/i3wnBcbHDbs\">\n",
|
| 37 |
+
" <img src=\"https://img.youtube.com/vi/i3wnBcbHDbs/0.jpg\" alt=\"Video Demo\" width=\"60%\">\n",
|
| 38 |
+
"</a>\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"</div>\n",
|
| 41 |
+
"\n",
|
| 42 |
+
"## \ud83d\udcd6 Introduction\n",
|
| 43 |
+
"\n",
|
| 44 |
+
"> **An audio deepfake is when a \u201ccloned\u201d voice that is potentially indistinguishable from the real person\u2019s is used to produce synthetic audio.**\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"This research notebook demonstrates the **SV2TTS (Speaker Verification to Text-to-Speech)** framework, a three-stage deep learning pipeline capable of cloning a voice from a mere 5 seconds of audio.\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"### The Pipeline\n",
|
| 49 |
+
"1. **Speaker Encoder**: Creates a fixed-dimensional embedding (fingerprint) from the reference audio.\n",
|
| 50 |
+
"2. **Synthesizer**: Generates a Mel Spectrogram from text, conditioned on the speaker embedding.\n",
|
| 51 |
+
"3. **Vocoder**: Converts the Mel Spectrogram into a raw time-domain waveform (audible speech)."
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "markdown",
|
| 56 |
+
"metadata": {
|
| 57 |
+
"id": "4kb73A1rhXrE"
|
| 58 |
+
},
|
| 59 |
+
"source": [
|
| 60 |
+
"## \u2601\ufe0f Cloud Environment Setup\n",
|
| 61 |
+
"Execute the following cell **only** if you are running this notebook in a cloud environment like **Google Colab** or **Kaggle**.\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"This script will:\n",
|
| 64 |
+
"1. **Clone the Repository**: Tries GitHub first, then falls back to **Personal Hugging Face Space** (`ameythakur/Deepfake-Audio`) if GitHub fails.\n",
|
| 65 |
+
"2. **Environment Detection**: Automatically detects **Kaggle** vs **Colab**.\n",
|
| 66 |
+
"3. **Data Retrieval**:\n",
|
| 67 |
+
" * **Kaggle**: Links directly from `/kaggle/input/deepfakeaudio` (No download needed).\n",
|
| 68 |
+
" * **Others**: Attempts Git LFS pull.\n",
|
| 69 |
+
"4. **Fallback to Kagglehub**: If LFS budget exceeded, downloads from `ameythakur20/deepfakeaudio`.\n",
|
| 70 |
+
"5. Install all required Python and System dependencies."
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"execution_count": 1,
|
| 76 |
+
"metadata": {
|
| 77 |
+
"colab": {
|
| 78 |
+
"base_uri": "https://localhost:8080/"
|
| 79 |
+
},
|
| 80 |
+
"id": "bDzcxXrzhXrE",
|
| 81 |
+
"outputId": "060d68bb-0b5c-43a8-f0fd-119d6335404e"
|
| 82 |
+
},
|
| 83 |
+
"outputs": [
|
| 84 |
+
{
|
| 85 |
+
"output_type": "stream",
|
| 86 |
+
"name": "stdout",
|
| 87 |
+
"text": [
|
| 88 |
+
"\ud83d\udcbb Detected Google Colab Environment. Initiating setup...\n",
|
| 89 |
+
"\u2b07\ufe0f Cloning DEEPFAKE-AUDIO repository from MAIN (GitHub)...\n",
|
| 90 |
+
"Cloning into 'DEEPFAKE-AUDIO'...\n",
|
| 91 |
+
"remote: Enumerating objects: 682, done.\u001b[K\n",
|
| 92 |
+
"remote: Counting objects: 100% (62/62), done.\u001b[K\n",
|
| 93 |
+
"remote: Compressing objects: 100% (55/55), done.\u001b[K\n",
|
| 94 |
+
"remote: Total 682 (delta 7), reused 7 (delta 7), pack-reused 620 (from 1)\u001b[K\n",
|
| 95 |
+
"Receiving objects: 100% (682/682), 71.95 MiB | 12.37 MiB/s, done.\n",
|
| 96 |
+
"Resolving deltas: 100% (328/328), done.\n",
|
| 97 |
+
"Downloading Dataset/encoder.pt (17 MB)\n",
|
| 98 |
+
"Error downloading object: Dataset/encoder.pt (39373b8): Smudge error: Error downloading Dataset/encoder.pt (39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e): batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.\n",
|
| 99 |
+
"\n",
|
| 100 |
+
"Errors logged to /content/DEEPFAKE-AUDIO/.git/lfs/logs/20260129T113138.869344323.log\n",
|
| 101 |
+
"Use `git lfs logs last` to view the log.\n",
|
| 102 |
+
"error: external filter 'git-lfs filter-process' failed\n",
|
| 103 |
+
"fatal: Dataset/encoder.pt: smudge filter lfs failed\n",
|
| 104 |
+
"warning: Clone succeeded, but checkout failed.\n",
|
| 105 |
+
"You can inspect what was checked out with 'git status'\n",
|
| 106 |
+
"and retry with 'git restore --source=HEAD :/'\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"\ud83d\udd27 Installing dependencies...\n",
|
| 109 |
+
"Reading package lists... Done\n",
|
| 110 |
+
"Building dependency tree... Done\n",
|
| 111 |
+
"Reading state information... Done\n",
|
| 112 |
+
"libsndfile1 is already the newest version (1.0.31-2ubuntu0.2).\n",
|
| 113 |
+
"0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.\n",
|
| 114 |
+
"\ud83d\udce6 Attempting Git LFS pull...\n",
|
| 115 |
+
"Updated git hooks.\n",
|
| 116 |
+
"Git LFS initialized.\n",
|
| 117 |
+
"Error updating the git index:\n",
|
| 118 |
+
"error: Source Code/encoder/__init__.py: cannot add to the index - missing --add option?\n",
|
| 119 |
+
"fatal: Unable to process path Source Code/encoder/__init__.py\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"Errors logged to /content/DEEPFAKE-AUDIO/.git/lfs/logs/20260129T113158.470476122.log\n",
|
| 123 |
+
"Use `git lfs logs last` to view the log.\n",
|
| 124 |
+
"batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.\n",
|
| 125 |
+
"error: failed to fetch some objects from 'https://github.com/Amey-Thakur/DEEPFAKE-AUDIO.git/info/lfs'\n",
|
| 126 |
+
"\u26a0\ufe0f GitHub LFS Budget Exceeded or Pull Failed. Using Kaggle Fallback...\n",
|
| 127 |
+
"Requirement already satisfied: kagglehub in /usr/local/lib/python3.12/dist-packages (0.3.13)\n",
|
| 128 |
+
"Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from kagglehub) (25.0)\n",
|
| 129 |
+
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (from kagglehub) (6.0.3)\n",
|
| 130 |
+
"Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from kagglehub) (2.32.4)\n",
|
| 131 |
+
"Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from kagglehub) (4.67.1)\n",
|
| 132 |
+
"Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (3.4.4)\n",
|
| 133 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (3.11)\n",
|
| 134 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (2.5.0)\n",
|
| 135 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (2026.1.4)\n",
|
| 136 |
+
"\ud83d\ude80 Downloading assets from Kagglehub (ameythakur20/deepfakeaudio)...\n",
|
| 137 |
+
"Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.13), please consider upgrading to the latest version (0.4.1).\n",
|
| 138 |
+
"Downloading from https://www.kaggle.com/api/v1/datasets/download/ameythakur20/deepfakeaudio?dataset_version_number=5...\n"
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"output_type": "stream",
|
| 143 |
+
"name": "stderr",
|
| 144 |
+
"text": [
|
| 145 |
+
"100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 550M/550M [00:25<00:00, 22.7MB/s]"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"output_type": "stream",
|
| 150 |
+
"name": "stdout",
|
| 151 |
+
"text": [
|
| 152 |
+
"Extracting files...\n"
|
| 153 |
+
]
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"output_type": "stream",
|
| 157 |
+
"name": "stderr",
|
| 158 |
+
"text": [
|
| 159 |
+
"\n"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"output_type": "stream",
|
| 164 |
+
"name": "stdout",
|
| 165 |
+
"text": [
|
| 166 |
+
"\u2705 Samples linked from Kaggle.\n",
|
| 167 |
+
"\u2705 Models linked from Kaggle.\n"
|
| 168 |
+
]
|
| 169 |
+
}
|
| 170 |
+
],
|
| 171 |
+
"source": [
|
| 172 |
+
"import os\n",
|
| 173 |
+
"import sys\n",
|
| 174 |
+
"import shutil\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"# Detect Cloud Environment (Colab/Kaggle)\n",
|
| 177 |
+
"try:\n",
|
| 178 |
+
" shell = get_ipython()\n",
|
| 179 |
+
" if 'google.colab' in str(shell):\n",
|
| 180 |
+
" print(\"\ud83d\udcbb Detected Google Colab Environment. Initiating setup...\")\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" # 1. Clone the Repository (GitHub with HF Fallback)\n",
|
| 183 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\"):\n",
|
| 184 |
+
" print(\"\u2b07\ufe0f Cloning DEEPFAKE-AUDIO repository from MAIN (GitHub)...\")\n",
|
| 185 |
+
" clone_status = shell.system(\"git clone https://github.com/Amey-Thakur/DEEPFAKE-AUDIO\")\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" # Fallback to Hugging Face if GitHub clone failed (folder empty or not created)\n",
|
| 188 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\") or not os.listdir(\"DEEPFAKE-AUDIO\"):\n",
|
| 189 |
+
" print(\"\u26a0\ufe0f GitHub Clone Failed. Attempting Fallback: Personal Hugging Face Space...\")\n",
|
| 190 |
+
" if os.path.exists(\"DEEPFAKE-AUDIO\"): shutil.rmtree(\"DEEPFAKE-AUDIO\")\n",
|
| 191 |
+
" shell.system(\"git clone https://huggingface.co/spaces/ameythakur/Deepfake-Audio DEEPFAKE-AUDIO\")\n",
|
| 192 |
+
" print(\"\u2705 Cloned from Hugging Face Space.\")\n",
|
| 193 |
+
"\n",
|
| 194 |
+
" os.chdir(\"/content/DEEPFAKE-AUDIO\")\n",
|
| 195 |
+
"\n",
|
| 196 |
+
" # Install Dependencies (Colab)\n",
|
| 197 |
+
" print(\"\ud83d\udd27 Installing dependencies...\")\n",
|
| 198 |
+
" shell.system(\"apt-get install -y libsndfile1\")\n",
|
| 199 |
+
" shell.system(\"pip install librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub\")\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"\n",
|
| 203 |
+
" # 2. Attempt Git LFS (Colab)\n",
|
| 204 |
+
" print(\"\ud83d\udce6 Attempting Git LFS pull...\")\n",
|
| 205 |
+
" shell.system(\"git lfs install\")\n",
|
| 206 |
+
" lfs_status = shell.system(\"git lfs pull\")\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" # 3. Check for Fallback (If LFS failed or budget exceeded)\n",
|
| 209 |
+
" sample_trigger = \"Dataset/samples/Steve Jobs.wav\"\n",
|
| 210 |
+
" if lfs_status != 0 or not os.path.exists(sample_trigger) or os.path.getsize(sample_trigger) < 1000:\n",
|
| 211 |
+
" print(\"\u26a0\ufe0f GitHub LFS Budget Exceeded or Pull Failed. Using Kaggle Fallback...\")\n",
|
| 212 |
+
" shell.system(\"pip install kagglehub\")\n",
|
| 213 |
+
" import kagglehub\n",
|
| 214 |
+
" print(\"\ud83d\ude80 Downloading assets from Kagglehub (ameythakur20/deepfakeaudio)...\")\n",
|
| 215 |
+
" k_path = kagglehub.dataset_download(\"ameythakur20/deepfakeaudio\")\n",
|
| 216 |
+
"\n",
|
| 217 |
+
" # Link/Copy samples\n",
|
| 218 |
+
" k_samples = os.path.join(k_path, \"samples\")\n",
|
| 219 |
+
" if os.path.exists(k_samples):\n",
|
| 220 |
+
" if os.path.exists(\"Dataset/samples\"):\n",
|
| 221 |
+
" shutil.rmtree(\"Dataset/samples\")\n",
|
| 222 |
+
" os.makedirs(\"Dataset\", exist_ok=True)\n",
|
| 223 |
+
" os.symlink(k_samples, \"Dataset/samples\")\n",
|
| 224 |
+
" print(\"\u2705 Samples linked from Kaggle.\")\n",
|
| 225 |
+
"\n",
|
| 226 |
+
" # Link/Copy models\n",
|
| 227 |
+
" for model in [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]:\n",
|
| 228 |
+
" k_model = os.path.join(k_path, model)\n",
|
| 229 |
+
" if os.path.exists(k_model):\n",
|
| 230 |
+
" target = os.path.join(\"Dataset\", model)\n",
|
| 231 |
+
" if os.path.exists(target): os.remove(target)\n",
|
| 232 |
+
" os.symlink(k_model, target)\n",
|
| 233 |
+
" print(\"\u2705 Models linked from Kaggle.\")\n",
|
| 234 |
+
"\n",
|
| 235 |
+
" elif \"kaggle\" in os.environ.get(\"KAGGLE_KERNEL_RUN_TYPE\", \"\"):\n",
|
| 236 |
+
" print(\"\ud83d\udcbb Detected Kaggle Environment. Initiating setup...\")\n",
|
| 237 |
+
" os.chdir(\"/kaggle/working\")\n",
|
| 238 |
+
"\n",
|
| 239 |
+
" # 1. Clone the Repository (GitHub with HF Fallback)\n",
|
| 240 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\"):\n",
|
| 241 |
+
" print(\"\u2b07\ufe0f Cloning DEEPFAKE-AUDIO repository from MAIN (GitHub)...\")\n",
|
| 242 |
+
" shell.system(\"git clone https://github.com/Amey-Thakur/DEEPFAKE-AUDIO\")\n",
|
| 243 |
+
"\n",
|
| 244 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\") or not os.listdir(\"DEEPFAKE-AUDIO\"):\n",
|
| 245 |
+
" print(\"\u26a0\ufe0f GitHub Clone Failed. Attempting Fallback: Personal Hugging Face Space...\")\n",
|
| 246 |
+
" if os.path.exists(\"DEEPFAKE-AUDIO\"): shutil.rmtree(\"DEEPFAKE-AUDIO\")\n",
|
| 247 |
+
" shell.system(\"git clone https://huggingface.co/spaces/ameythakur/Deepfake-Audio DEEPFAKE-AUDIO\")\n",
|
| 248 |
+
" print(\"\u2705 Cloned from Hugging Face Space.\")\n",
|
| 249 |
+
"\n",
|
| 250 |
+
" os.chdir(\"/kaggle/working/DEEPFAKE-AUDIO\")\n",
|
| 251 |
+
"\n",
|
| 252 |
+
" # 2. Priority: Link Kaggle Dataset (Skip LFS pull if dataset exists)\n",
|
| 253 |
+
" kaggle_input = \"/kaggle/input/deepfakeaudio\"\n",
|
| 254 |
+
" if os.path.exists(kaggle_input):\n",
|
| 255 |
+
" print(f\"\u2705 Kaggle Dataset Detected at {kaggle_input}. Linking assets...\")\n",
|
| 256 |
+
" # Link logic specific to Kaggle structure\n",
|
| 257 |
+
" if os.path.exists(\"Dataset/samples\"):\n",
|
| 258 |
+
" shutil.rmtree(\"Dataset/samples\")\n",
|
| 259 |
+
" if not os.path.exists(\"Dataset\"):\n",
|
| 260 |
+
" os.makedirs(\"Dataset\")\n",
|
| 261 |
+
" # Attempt to symlink folder or copy items\n",
|
| 262 |
+
" try:\n",
|
| 263 |
+
" if os.path.exists(os.path.join(kaggle_input, \"samples\")):\n",
|
| 264 |
+
" os.symlink(os.path.join(kaggle_input, \"samples\"), \"Dataset/samples\")\n",
|
| 265 |
+
" for model in [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]:\n",
|
| 266 |
+
" src = os.path.join(kaggle_input, model)\n",
|
| 267 |
+
" dst = os.path.join(\"Dataset\", model)\n",
|
| 268 |
+
" if os.path.exists(src):\n",
|
| 269 |
+
" if os.path.exists(dst): os.remove(dst)\n",
|
| 270 |
+
" os.symlink(src, dst)\n",
|
| 271 |
+
" except Exception as e: print(f\"Warning during linking: {e}\")\n",
|
| 272 |
+
" print(\"\u2705 Assets linked from Kaggle Input.\")\n",
|
| 273 |
+
" else:\n",
|
| 274 |
+
" print(\"\u26a0\ufe0f Kaggle Input not found. Attempting standard LFS pull...\")\n",
|
| 275 |
+
" shell.system(\"git lfs install\")\n",
|
| 276 |
+
" shell.system(\"git lfs pull\")\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" # Install Dependencies\n",
|
| 279 |
+
" print(\"\ud83d\udd27 Installing dependencies...\")\n",
|
| 280 |
+
" shell.system(\"apt-get install -y libsndfile1\")\n",
|
| 281 |
+
" shell.system(\"pip install librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub\")\n",
|
| 282 |
+
"\n",
|
| 283 |
+
" # 2. Attempt Git LFS\n",
|
| 284 |
+
" print(\"\ud83d\udce6 Attempting Git LFS pull...\")\n",
|
| 285 |
+
" shell.system(\"git lfs install\")\n",
|
| 286 |
+
" lfs_status = shell.system(\"git lfs pull\")\n",
|
| 287 |
+
"\n",
|
| 288 |
+
" # 3. Check for Fallback (If LFS failed or budget exceeded)\n",
|
| 289 |
+
" # Detection: If samples folder is empty or contains small pointer files\n",
|
| 290 |
+
" sample_trigger = \"Dataset/samples/Steve Jobs.wav\"\n",
|
| 291 |
+
" if lfs_status != 0 or not os.path.exists(sample_trigger) or os.path.getsize(sample_trigger) < 1000:\n",
|
| 292 |
+
" print(\"\u26a0\ufe0f GitHub LFS Budget Exceeded or Pull Failed. Using Kaggle Fallback...\")\n",
|
| 293 |
+
" shell.system(\"pip install kagglehub\")\n",
|
| 294 |
+
" import kagglehub\n",
|
| 295 |
+
"\n",
|
| 296 |
+
" # Pull from public Kaggle dataset\n",
|
| 297 |
+
" print(\"\ud83d\ude80 Downloading assets from Kagglehub (ameythakur20/deepfakeaudio)...\")\n",
|
| 298 |
+
" k_path = kagglehub.dataset_download(\"ameythakur20/deepfakeaudio\")\n",
|
| 299 |
+
"\n",
|
| 300 |
+
" # Link/Copy samples\n",
|
| 301 |
+
" k_samples = os.path.join(k_path, \"samples\")\n",
|
| 302 |
+
" if os.path.exists(k_samples):\n",
|
| 303 |
+
" if os.path.exists(\"Dataset/samples\"):\n",
|
| 304 |
+
" shutil.rmtree(\"Dataset/samples\")\n",
|
| 305 |
+
" os.makedirs(\"Dataset\", exist_ok=True)\n",
|
| 306 |
+
" os.symlink(k_samples, \"Dataset/samples\")\n",
|
| 307 |
+
" print(\"\u2705 Samples linked from Kaggle.\")\n",
|
| 308 |
+
"\n",
|
| 309 |
+
" # Link/Copy models\n",
|
| 310 |
+
" for model in [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]:\n",
|
| 311 |
+
" k_model = os.path.join(k_path, model)\n",
|
| 312 |
+
" if os.path.exists(k_model):\n",
|
| 313 |
+
" target = os.path.join(\"Dataset\", model)\n",
|
| 314 |
+
" if os.path.exists(target): os.remove(target)\n",
|
| 315 |
+
" os.symlink(k_model, target)\n",
|
| 316 |
+
" print(\"\u2705 Models linked from Kaggle.\")\n",
|
| 317 |
+
"\n",
|
| 318 |
+
" # 4. Pull Latest Code Changes\n",
|
| 319 |
+
" print(\"\ud83d\udd04 Synchronizing with remote repository...\")\n",
|
| 320 |
+
" shell.system(\"git pull\")\n",
|
| 321 |
+
"\n",
|
| 322 |
+
" # 5. Install System Dependencies\n",
|
| 323 |
+
" print(\"\ud83d\udd27 Installing system dependencies (libsndfile1)...\")\n",
|
| 324 |
+
" shell.system(\"apt-get install -y libsndfile1\")\n",
|
| 325 |
+
"\n",
|
| 326 |
+
" # 6. Install Python Dependencies\n",
|
| 327 |
+
" print(\"\ud83d\udce6 Installing Python libraries...\")\n",
|
| 328 |
+
" shell.system(\"pip install librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub\")\n",
|
| 329 |
+
"\n",
|
| 330 |
+
" print(\"\u2705 Environment setup complete. Ready for cloning.\")\n",
|
| 331 |
+
" else:\n",
|
| 332 |
+
" print(\"\ud83c\udfe0 Running in local or custom environment. Skipping cloud setup.\")\n",
|
| 333 |
+
"except NameError:\n",
|
| 334 |
+
" print(\"\ud83c\udfe0 Running in local or custom environment. Skipping cloud setup.\")"
|
| 335 |
+
]
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"cell_type": "markdown",
|
| 339 |
+
"metadata": {
|
| 340 |
+
"id": "3sJUw_G8hXrG"
|
| 341 |
+
},
|
| 342 |
+
"source": [
|
| 343 |
+
"## 1\ufe0f\u20e3 Model & Data Initialization\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"We prioritize data availability to ensure the notebook runs smoothly regardless of the platform. The system checks for checkpoints in this order:\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"1. **Repository Local** (`Dataset/` / `Source Code/`): Fast local access if cloned.\n",
|
| 348 |
+
"2. **Kaggle Dataset** (`/kaggle/input/deepfakeaudio/`): Pre-loaded environment data.\n",
|
| 349 |
+
" * *Reference*: [Amey Thakur's Kaggle Dataset](https://www.kaggle.com/datasets/ameythakur20/deepfakeaudio)\n",
|
| 350 |
+
"3. **Personal Backup** (Hugging Face Space): `ameythakur/Deepfake-Audio`.\n",
|
| 351 |
+
" * *Reference*: [Amey Thakur's HF Space](https://huggingface.co/spaces/ameythakur/Deepfake-Audio)\n",
|
| 352 |
+
"4. **HuggingFace Auto-Download**: Robust fallback for fresh environments.\n",
|
| 353 |
+
" * *Reference*: [CorentinJ's SV2TTS Repository](https://huggingface.co/CorentinJ/SV2TTS)"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"cell_type": "code",
|
| 358 |
+
"execution_count": 2,
|
| 359 |
+
"metadata": {
|
| 360 |
+
"colab": {
|
| 361 |
+
"base_uri": "https://localhost:8080/"
|
| 362 |
+
},
|
| 363 |
+
"id": "bY79_5WrhXrH",
|
| 364 |
+
"outputId": "c1767603-817a-4f2c-8ce9-a09f2436bc5d"
|
| 365 |
+
},
|
| 366 |
+
"outputs": [
|
| 367 |
+
{
|
| 368 |
+
"output_type": "stream",
|
| 369 |
+
"name": "stdout",
|
| 370 |
+
"text": [
|
| 371 |
+
"\ud83d\udcc2 Working Directory: /content/DEEPFAKE-AUDIO\n",
|
| 372 |
+
"\u2705 Module Path Registered: /content/DEEPFAKE-AUDIO/Source Code\n",
|
| 373 |
+
"\u2b07\ufe0f Verifying Model Availability...\n",
|
| 374 |
+
"\u2705 Found high-priority local models in 'Dataset/'. Verified.\n"
|
| 375 |
+
]
|
| 376 |
+
}
|
| 377 |
+
],
|
| 378 |
+
"source": [
|
| 379 |
+
"import sys\n",
|
| 380 |
+
"import os\n",
|
| 381 |
+
"from pathlib import Path\n",
|
| 382 |
+
"import zipfile\n",
|
| 383 |
+
"import shutil\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"# Determine if running in Google Colab\n",
|
| 386 |
+
"IS_COLAB = 'google.colab' in sys.modules\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"# Register 'Source Code' to Python path for module imports\n",
|
| 389 |
+
"source_path = os.path.abspath(\"Source Code\")\n",
|
| 390 |
+
"if source_path not in sys.path:\n",
|
| 391 |
+
" sys.path.append(source_path)\n",
|
| 392 |
+
"\n",
|
| 393 |
+
"print(f\"\ud83d\udcc2 Working Directory: {os.getcwd()}\")\n",
|
| 394 |
+
"print(f\"\u2705 Module Path Registered: {source_path}\")\n",
|
| 395 |
+
"\n",
|
| 396 |
+
"# Define paths for model checkpoints\n",
|
| 397 |
+
"extract_path = \"pretrained_models\"\n",
|
| 398 |
+
"\n",
|
| 399 |
+
"if not os.path.exists(extract_path):\n",
|
| 400 |
+
" os.makedirs(extract_path)\n",
|
| 401 |
+
"\n",
|
| 402 |
+
"# --- \ud83e\udde0 Checkpoint Verification Strategy ---\n",
|
| 403 |
+
"print(\"\u2b07\ufe0f Verifying Model Availability...\")\n",
|
| 404 |
+
"\n",
|
| 405 |
+
"# Priority 1: Check Local Repository 'Dataset/' folder\n",
|
| 406 |
+
"core_models = [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]\n",
|
| 407 |
+
"\n",
|
| 408 |
+
"def is_valid_pt(p):\n",
|
| 409 |
+
" \"\"\"Checks if a file exists and is not an LFS pointer (typically < 1KB).\"\"\"\n",
|
| 410 |
+
" return os.path.exists(p) and os.path.getsize(p) > 1000\n",
|
| 411 |
+
"\n",
|
| 412 |
+
"dataset_models_present = all([is_valid_pt(os.path.join(\"Dataset\", m)) for m in core_models])\n",
|
| 413 |
+
"\n",
|
| 414 |
+
"if dataset_models_present:\n",
|
| 415 |
+
" print(\"\u2705 Found high-priority local models in 'Dataset/'. Verified.\")\n",
|
| 416 |
+
"else:\n",
|
| 417 |
+
" # Priority 2: Check Kaggle Dataset (Online Pre-loaded environment data)\n",
|
| 418 |
+
" kaggle_path = \"/kaggle/input/deepfakeaudio\"\n",
|
| 419 |
+
" kaggle_models_present = all([is_valid_pt(os.path.join(kaggle_path, m)) for m in core_models])\n",
|
| 420 |
+
"\n",
|
| 421 |
+
" if kaggle_models_present:\n",
|
| 422 |
+
" print(f\"\u2705 Found hardcoded Kaggle Dataset models at {kaggle_path}. Skipping download.\")\n",
|
| 423 |
+
" else:\n",
|
| 424 |
+
" print(\"\u26a0\ufe0f Models not found or are LFS pointers. Attempting fallback download...\")\n",
|
| 425 |
+
"\n",
|
| 426 |
+
" # Priority 3: Personal Hugging Face Space (ameythakur/Deepfake-Audio)\n",
|
| 427 |
+
" personal_hf_success = False\n",
|
| 428 |
+
" try:\n",
|
| 429 |
+
" print(\"\ud83d\ude80 Attempting download from Personal Hugging Face Space (ameythakur/Deepfake-Audio)...\")\n",
|
| 430 |
+
" from huggingface_hub import hf_hub_download\n",
|
| 431 |
+
" os.makedirs(\"pretrained_models\", exist_ok=True)\n",
|
| 432 |
+
" for model in core_models:\n",
|
| 433 |
+
" try:\n",
|
| 434 |
+
" fpath = hf_hub_download(repo_id=\"ameythakur/Deepfake-Audio\", filename=f\"Dataset/{model}\", repo_type=\"space\", local_dir=\"pretrained_models\")\n",
|
| 435 |
+
" except:\n",
|
| 436 |
+
" fpath = hf_hub_download(repo_id=\"ameythakur/Deepfake-Audio\", filename=model, repo_type=\"space\", local_dir=\"pretrained_models\")\n",
|
| 437 |
+
" target = os.path.join(\"pretrained_models\", model)\n",
|
| 438 |
+
" if fpath != target and os.path.exists(fpath): shutil.move(fpath, target)\n",
|
| 439 |
+
" if os.path.exists(os.path.join(\"pretrained_models\", \"Dataset\")): shutil.rmtree(os.path.join(\"pretrained_models\", \"Dataset\"))\n",
|
| 440 |
+
" print(\"\u2705 Models successfully acquired via Personal Hugging Face fallback.\")\n",
|
| 441 |
+
" personal_hf_success = True\n",
|
| 442 |
+
" except Exception as e_hf:\n",
|
| 443 |
+
" print(f\"\u26a0\ufe0f Personal HF Checkpoint failed: {e_hf}. Trying External Fallback...\")\n",
|
| 444 |
+
"\n",
|
| 445 |
+
" # Priority 4 (Fallback): Auto-download from HuggingFace via utils script\n",
|
| 446 |
+
" if not personal_hf_success:\n",
|
| 447 |
+
" try:\n",
|
| 448 |
+
" from utils.default_models import ensure_default_models\n",
|
| 449 |
+
" ensure_default_models(Path(\"pretrained_models\"))\n",
|
| 450 |
+
" print(\"\u2705 Models successfully acquired via External HuggingFace fallback.\")\n",
|
| 451 |
+
" except Exception as e:\n",
|
| 452 |
+
" print(f\"\u26a0\ufe0f Critical: Could not auto-download models. Error: {e}\")"
|
| 453 |
+
]
|
| 454 |
+
},
|
| 455 |
+
{
|
| 456 |
+
"cell_type": "markdown",
|
| 457 |
+
"metadata": {
|
| 458 |
+
"id": "ycvMxmYJhXrI"
|
| 459 |
+
},
|
| 460 |
+
"source": [
|
| 461 |
+
"## 2\ufe0f\u20e3 Architecture Loading\n",
|
| 462 |
+
"\n",
|
| 463 |
+
"We now initialize the three distinct neural networks that comprise the SV2TTS framework. Please ensure you are running on a **GPU Runtime** (e.g., T4 on Colab) for optimal performance."
|
| 464 |
+
]
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"cell_type": "code",
|
| 468 |
+
"execution_count": 3,
|
| 469 |
+
"metadata": {
|
| 470 |
+
"colab": {
|
| 471 |
+
"base_uri": "https://localhost:8080/"
|
| 472 |
+
},
|
| 473 |
+
"id": "yEShAzbfhXrI",
|
| 474 |
+
"outputId": "9d2ce872-19a3-4259-a10b-8afebb394b28"
|
| 475 |
+
},
|
| 476 |
+
"outputs": [
|
| 477 |
+
{
|
| 478 |
+
"output_type": "stream",
|
| 479 |
+
"name": "stdout",
|
| 480 |
+
"text": [
|
| 481 |
+
"\ud83c\udfaf Computation Device: cuda\n",
|
| 482 |
+
"\u23f3 Loading Neural Networks (SV2TTS Pipeline)...\n",
|
| 483 |
+
"\ud83d\udfe2 Loading Encoder from Repository: Dataset/encoder.pt\n",
|
| 484 |
+
"Loaded encoder \"encoder.pt\" trained to step 1564501\n",
|
| 485 |
+
"\ud83d\udfe2 Loading Synthesizer from Repository: Dataset/synthesizer.pt\n",
|
| 486 |
+
"Synthesizer using device: cuda\n",
|
| 487 |
+
"\ud83d\udfe2 Loading Vocoder from Repository: Dataset/vocoder.pt\n",
|
| 488 |
+
"Building Wave-RNN\n",
|
| 489 |
+
"Trainable Parameters: 4.481M\n",
|
| 490 |
+
"Loading model weights at Dataset/vocoder.pt\n",
|
| 491 |
+
"\u2705 Pipeline operational. All components loaded correctly.\n"
|
| 492 |
+
]
|
| 493 |
+
}
|
| 494 |
+
],
|
| 495 |
+
"source": [
|
| 496 |
+
"from encoder import inference as encoder\n",
|
| 497 |
+
"from synthesizer.inference import Synthesizer\n",
|
| 498 |
+
"from vocoder import inference as vocoder\n",
|
| 499 |
+
"import numpy as np\n",
|
| 500 |
+
"import torch\n",
|
| 501 |
+
"from pathlib import Path\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"# Hardware Acceleration Check\n",
|
| 504 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 505 |
+
"print(f\"\ud83c\udfaf Computation Device: {device}\")\n",
|
| 506 |
+
"\n",
|
| 507 |
+
"def resolve_checkpoint(component_name, legacy_path_suffix):\n",
|
| 508 |
+
" \"\"\"\n",
|
| 509 |
+
" Intelligently resolves the path to model checkpoints based on priority.\n",
|
| 510 |
+
" 1. Repository /Dataset/ folder.\n",
|
| 511 |
+
" 2. Kaggle Input directory (Hardcoded: /kaggle/input/deepfakeaudio/).\n",
|
| 512 |
+
" 3. Auto-downloaded 'pretrained_models'.\n",
|
| 513 |
+
" \"\"\"\n",
|
| 514 |
+
"\n",
|
| 515 |
+
" def is_valid(p):\n",
|
| 516 |
+
" return p.exists() and p.stat().st_size > 1000\n",
|
| 517 |
+
"\n",
|
| 518 |
+
" # 1. Repository Local (Dataset/)\n",
|
| 519 |
+
" dataset_p = Path(\"Dataset\") / f\"{component_name.lower()}.pt\"\n",
|
| 520 |
+
" if is_valid(dataset_p):\n",
|
| 521 |
+
" print(f\"\ud83d\udfe2 Loading {component_name} from Repository: {dataset_p}\")\n",
|
| 522 |
+
" return dataset_p\n",
|
| 523 |
+
"\n",
|
| 524 |
+
" # 2. Kaggle Environment (Hardcoded Path: /kaggle/input/deepfakeaudio/)\n",
|
| 525 |
+
" kaggle_p = Path(\"/kaggle/input/deepfakeaudio\") / f\"{component_name.lower()}.pt\"\n",
|
| 526 |
+
" if is_valid(kaggle_p):\n",
|
| 527 |
+
" print(f\"\ud83d\udfe2 Loading {component_name} from Kaggle: {kaggle_p}\")\n",
|
| 528 |
+
" return kaggle_p\n",
|
| 529 |
+
"\n",
|
| 530 |
+
" # 3. Default / Auto-Downloaded Fallback\n",
|
| 531 |
+
" default_p = Path(\"pretrained_models/default\") / f\"{component_name.lower()}.pt\"\n",
|
| 532 |
+
" if is_valid(default_p):\n",
|
| 533 |
+
" print(f\"\ud83d\udfe2 Loading {component_name} from Fallback: {default_p}\")\n",
|
| 534 |
+
" return default_p\n",
|
| 535 |
+
"\n",
|
| 536 |
+
" # 4. Legacy/Manual Paths\n",
|
| 537 |
+
" legacy_p = Path(\"pretrained_models\") / legacy_path_suffix\n",
|
| 538 |
+
" if legacy_p.exists():\n",
|
| 539 |
+
" if legacy_p.is_dir():\n",
|
| 540 |
+
" pts = [f for f in legacy_p.glob(\"*.pt\") if is_valid(f)]\n",
|
| 541 |
+
" if pts: return pts[0]\n",
|
| 542 |
+
" pts_rec = [f for f in legacy_p.rglob(\"*.pt\") if is_valid(f)]\n",
|
| 543 |
+
" if pts_rec: return pts_rec[0]\n",
|
| 544 |
+
" elif is_valid(legacy_p):\n",
|
| 545 |
+
" return legacy_p\n",
|
| 546 |
+
"\n",
|
| 547 |
+
" print(f'\u26a0\ufe0f Warning: {component_name} checkpoint not found! Falling back to dynamic search...')\n",
|
| 548 |
+
" return None\n",
|
| 549 |
+
"\n",
|
| 550 |
+
"print(\"\u23f3 Loading Neural Networks (SV2TTS Pipeline)...\")\n",
|
| 551 |
+
"\n",
|
| 552 |
+
"try:\n",
|
| 553 |
+
" # 1. Encoder: Extract speaker embedding\n",
|
| 554 |
+
" encoder_path = resolve_checkpoint(\"Encoder\", \"encoder/saved_models\")\n",
|
| 555 |
+
" encoder.load_model(encoder_path)\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" # 2. Synthesizer: Generates spectrograms from text\n",
|
| 558 |
+
" synth_path = resolve_checkpoint(\"Synthesizer\", \"synthesizer/saved_models/logs-pretrained/taco_pretrained\")\n",
|
| 559 |
+
" synthesizer = Synthesizer(synth_path)\n",
|
| 560 |
+
"\n",
|
| 561 |
+
" # 3. Vocoder: Converts spectrograms to audio waveforms\n",
|
| 562 |
+
" vocoder_path = resolve_checkpoint(\"Vocoder\", \"vocoder/saved_models/pretrained\")\n",
|
| 563 |
+
" vocoder.load_model(vocoder_path)\n",
|
| 564 |
+
"\n",
|
| 565 |
+
" print(\"\u2705 Pipeline operational. All components loaded correctly.\")\n",
|
| 566 |
+
"except Exception as e:\n",
|
| 567 |
+
" print(f\"\u274c Architecture Error: {e}\")"
|
| 568 |
+
]
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"cell_type": "markdown",
|
| 572 |
+
"metadata": {
|
| 573 |
+
"id": "GjaGAUyVhXrJ"
|
| 574 |
+
},
|
| 575 |
+
"source": [
|
| 576 |
+
"## 3\ufe0f\u20e3 Inference Interface\n",
|
| 577 |
+
"\n",
|
| 578 |
+
"Select your **Input Method** below to begin cloning.\n",
|
| 579 |
+
"\n",
|
| 580 |
+
"* **Presets**: Choose from a high-quality list of celebrity samples.\n",
|
| 581 |
+
"* **Upload**: Use your own `.wav` or `.mp3` file (5-10 seconds recommended).\n",
|
| 582 |
+
"* **Record**: Capture your voice directly in the browser (Colab only)."
|
| 583 |
+
]
|
| 584 |
+
},
|
| 585 |
+
{
|
| 586 |
+
"cell_type": "code",
|
| 587 |
+
"execution_count": 4,
|
| 588 |
+
"metadata": {
|
| 589 |
+
"colab": {
|
| 590 |
+
"base_uri": "https://localhost:8080/",
|
| 591 |
+
"height": 1000,
|
| 592 |
+
"referenced_widgets": [
|
| 593 |
+
"17941275da2c4712a8f7aeee24e985dc",
|
| 594 |
+
"d35d66d0fc6f4956a22315288f75761e",
|
| 595 |
+
"e5738978acde489493cc00b7580b41bf",
|
| 596 |
+
"a1c213e8e9ed4e6a80611f91e37c16ee",
|
| 597 |
+
"2a6b69d651c24930b892c0bb52e9d9e4",
|
| 598 |
+
"39339f0b568c4d5084de7664d336ba50",
|
| 599 |
+
"0923c7a8ae574c4fa2c9eb8c25076b95",
|
| 600 |
+
"b0a46e803d84423cacb55e37ae6c043b",
|
| 601 |
+
"bb70b9f36452494da95ee1033601218f",
|
| 602 |
+
"f3008160ab52453d932a0cb62c711eee",
|
| 603 |
+
"f035a503fafe48038036bdd50bd07b1c",
|
| 604 |
+
"dac9a6ef6db347e08b2268c5e5654008",
|
| 605 |
+
"779855519a5844d3bcc1ed69a8c44608",
|
| 606 |
+
"05f5b43427fe47fbae468c4cec2be72c",
|
| 607 |
+
"98051b53e539482aadf5d4b132dd3021",
|
| 608 |
+
"3ed45fa49669481dbee831dfb2f6ebeb",
|
| 609 |
+
"9d9b4954b5a74bf2b2f0fef974564bda",
|
| 610 |
+
"3753f04d0dbb48a6a9a5c5589425ad91",
|
| 611 |
+
"c294dc1d01e3421097eefe7eb2e5e377",
|
| 612 |
+
"de81a54ce89a4aaeaba52f01c571b212",
|
| 613 |
+
"7dbdc18b94a54cb6a492c7f33a661a56",
|
| 614 |
+
"b9b8b1f96c3d4b89a9bc3e32d6076ad3",
|
| 615 |
+
"91e19a809309432995d435aeba1e8bde",
|
| 616 |
+
"c51b8d7f054b41039f354fce639d6947",
|
| 617 |
+
"20d72122af6f449c90d53871c5a29b7d",
|
| 618 |
+
"f73ef86eeaab452eb11e45f1e17464a8",
|
| 619 |
+
"edf2d4f03fae4c5da75d0e3e1ad60382"
|
| 620 |
+
]
|
| 621 |
+
},
|
| 622 |
+
"id": "3wq8NsHohXrJ",
|
| 623 |
+
"outputId": "b92ea5cd-958b-4f83-9c48-1726f5f821a1"
|
| 624 |
+
},
|
| 625 |
+
"outputs": [
|
| 626 |
+
{
|
| 627 |
+
"output_type": "stream",
|
| 628 |
+
"name": "stdout",
|
| 629 |
+
"text": [
|
| 630 |
+
"Select Input Method:\n",
|
| 631 |
+
"\u2705 Samples located at: /content/DEEPFAKE-AUDIO/Dataset/samples\n"
|
| 632 |
+
]
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"output_type": "display_data",
|
| 636 |
+
"data": {
|
| 637 |
+
"text/plain": [
|
| 638 |
+
"Tab(children=(VBox(children=(Dropdown(description='Preset:', options=('Donald Trump.wav', 'Steve Jobs.wav', 'A\u2026"
|
| 639 |
+
],
|
| 640 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 641 |
+
"version_major": 2,
|
| 642 |
+
"version_minor": 0,
|
| 643 |
+
"model_id": "17941275da2c4712a8f7aeee24e985dc"
|
| 644 |
+
}
|
| 645 |
+
},
|
| 646 |
+
"metadata": {}
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"output_type": "display_data",
|
| 650 |
+
"data": {
|
| 651 |
+
"text/plain": [
|
| 652 |
+
"Textarea(value=\"Hello, I'm Elon Musk. Welcome to Deepfake Audio by Amey Thakur and Mega Satish. Explore AI voi\u2026"
|
| 653 |
+
],
|
| 654 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 655 |
+
"version_major": 2,
|
| 656 |
+
"version_minor": 0,
|
| 657 |
+
"model_id": "de81a54ce89a4aaeaba52f01c571b212"
|
| 658 |
+
}
|
| 659 |
+
},
|
| 660 |
+
"metadata": {}
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"output_type": "display_data",
|
| 664 |
+
"data": {
|
| 665 |
+
"text/plain": [
|
| 666 |
+
"Button(button_style='primary', description='Clone Voice! \ud83d\ude80', style=ButtonStyle())"
|
| 667 |
+
],
|
| 668 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 669 |
+
"version_major": 2,
|
| 670 |
+
"version_minor": 0,
|
| 671 |
+
"model_id": "91e19a809309432995d435aeba1e8bde"
|
| 672 |
+
}
|
| 673 |
+
},
|
| 674 |
+
"metadata": {}
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"output_type": "display_data",
|
| 678 |
+
"data": {
|
| 679 |
+
"text/plain": [
|
| 680 |
+
"Output()"
|
| 681 |
+
],
|
| 682 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 683 |
+
"version_major": 2,
|
| 684 |
+
"version_minor": 0,
|
| 685 |
+
"model_id": "f73ef86eeaab452eb11e45f1e17464a8"
|
| 686 |
+
}
|
| 687 |
+
},
|
| 688 |
+
"metadata": {}
|
| 689 |
+
}
|
| 690 |
+
],
|
| 691 |
+
"source": [
|
| 692 |
+
"import ipywidgets as widgets\n",
|
| 693 |
+
"from IPython.display import display, Javascript, Audio\n",
|
| 694 |
+
"try:\n",
|
| 695 |
+
" from google.colab import output\n",
|
| 696 |
+
" HAS_COLAB = True\n",
|
| 697 |
+
"except ImportError:\n",
|
| 698 |
+
" HAS_COLAB = False\n",
|
| 699 |
+
"from base64 import b64decode\n",
|
| 700 |
+
"import io\n",
|
| 701 |
+
"import librosa\n",
|
| 702 |
+
"import librosa.display\n",
|
| 703 |
+
"import os\n",
|
| 704 |
+
"import soundfile as sf\n",
|
| 705 |
+
"import matplotlib.pyplot as plt\n",
|
| 706 |
+
"import numpy as np\n",
|
| 707 |
+
"import glob\n",
|
| 708 |
+
"\n",
|
| 709 |
+
"RECORD = \"\"\"\n",
|
| 710 |
+
"const sleep = time => new Promise(resolve => setTimeout(resolve, time))\n",
|
| 711 |
+
"const b2text = blob => new Promise(resolve => {\n",
|
| 712 |
+
" const reader = new FileReader()\n",
|
| 713 |
+
" reader.onloadend = e => resolve(e.srcElement.result)\n",
|
| 714 |
+
" reader.readAsDataURL(blob)\n",
|
| 715 |
+
"})\n",
|
| 716 |
+
"var record = time => new Promise(async resolve => {\n",
|
| 717 |
+
" stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n",
|
| 718 |
+
" recorder = new MediaRecorder(stream)\n",
|
| 719 |
+
" chunks = []\n",
|
| 720 |
+
" recorder.ondataavailable = e => chunks.push(e.data)\n",
|
| 721 |
+
" recorder.start()\n",
|
| 722 |
+
" await sleep(time)\n",
|
| 723 |
+
" recorder.onstop = async ()=>{\n",
|
| 724 |
+
" blob = new Blob(chunks)\n",
|
| 725 |
+
" text = await b2text(blob)\n",
|
| 726 |
+
" resolve(text)\n",
|
| 727 |
+
" }\n",
|
| 728 |
+
" recorder.stop()\n",
|
| 729 |
+
"})\"\"\"\n",
|
| 730 |
+
"\n",
|
| 731 |
+
"def record_audio(sec=10):\n",
|
| 732 |
+
" if not HAS_COLAB:\n",
|
| 733 |
+
" raise RuntimeError(\"Recording is only available in a Google Colab environment.\")\n",
|
| 734 |
+
" print(\"\ud83d\udd34 Recording active for %d seconds...\" % sec)\n",
|
| 735 |
+
" display(Javascript(RECORD))\n",
|
| 736 |
+
" s = output.eval_js('record(%d)' % (sec*1000))\n",
|
| 737 |
+
" print(\"\u2705 Recording saved.\")\n",
|
| 738 |
+
" binary = b64decode(s.split(',')[1])\n",
|
| 739 |
+
" with open('recording.wav', 'wb') as f:\n",
|
| 740 |
+
" f.write(binary)\n",
|
| 741 |
+
" return 'recording.wav'\n",
|
| 742 |
+
"\n",
|
| 743 |
+
"def visualize_results(original_wav, generated_wav, spec, embed, title=\"Analysis\"):\n",
|
| 744 |
+
" try:\n",
|
| 745 |
+
" fig, axes = plt.subplots(3, 1, figsize=(10, 12))\n",
|
| 746 |
+
" axes[0].set_title(\"Input Voice vs. Cloned Voice (Waveform)\")\n",
|
| 747 |
+
" try:\n",
|
| 748 |
+
" librosa.display.waveshow(original_wav, alpha=0.5, ax=axes[0], label=\"Original\")\n",
|
| 749 |
+
" librosa.display.waveshow(generated_wav, alpha=0.5, ax=axes[0], label=\"Cloned\", color='r')\n",
|
| 750 |
+
" axes[0].legend()\n",
|
| 751 |
+
" except:\n",
|
| 752 |
+
" axes[0].plot(original_wav, alpha=0.5, label=\"Original\")\n",
|
| 753 |
+
" axes[0].plot(generated_wav, alpha=0.5, label=\"Cloned\", color='r')\n",
|
| 754 |
+
" axes[0].legend()\n",
|
| 755 |
+
"\n",
|
| 756 |
+
" axes[1].set_title(\"Generated Mel Spectrogram\")\n",
|
| 757 |
+
" im = axes[1].imshow(spec, aspect=\"auto\", origin=\"lower\", interpolation='none')\n",
|
| 758 |
+
" fig.colorbar(im, ax=axes[1])\n",
|
| 759 |
+
"\n",
|
| 760 |
+
" axes[2].set_title(\"Speaker Embedding (256-D Heatmap)\")\n",
|
| 761 |
+
" if len(embed) == 256:\n",
|
| 762 |
+
" axes[2].imshow(embed.reshape(16, 16), aspect='auto', cmap='viridis')\n",
|
| 763 |
+
" else:\n",
|
| 764 |
+
" axes[2].plot(embed)\n",
|
| 765 |
+
"\n",
|
| 766 |
+
" plt.tight_layout()\n",
|
| 767 |
+
" plt.show()\n",
|
| 768 |
+
" except Exception as e:\n",
|
| 769 |
+
" print(f\"\u26a0\ufe0f Graphs partially failed: {e}. Audio was successful.\")\n",
|
| 770 |
+
"\n",
|
| 771 |
+
"# --- \ud83d\udee1\ufe0f IMPROVED SAMPLE DISCOVERY ---\n",
|
| 772 |
+
"def find_samples_dir():\n",
|
| 773 |
+
" \"\"\"Locates reference samples with high persistence across all environments.\"\"\"\n",
|
| 774 |
+
" # Priority paths\n",
|
| 775 |
+
" priority_roots = [\n",
|
| 776 |
+
" \"Source Code/samples\",\n",
|
| 777 |
+
" \"Dataset/samples\",\n",
|
| 778 |
+
" \"D:/GitHub/DEEPFAKE-AUDIO/Source Code/samples\",\n",
|
| 779 |
+
" \"D:/GitHub/DEEPFAKE-AUDIO/Dataset/samples\",\n",
|
| 780 |
+
" \"/content/DEEPFAKE-AUDIO/Source Code/samples\",\n",
|
| 781 |
+
" \"/kaggle/input/deepfakeaudio/samples\",\n",
|
| 782 |
+
" \"/kaggle/input/deepfakeaudio\"\n",
|
| 783 |
+
" ]\n",
|
| 784 |
+
"\n",
|
| 785 |
+
" def filter_real_audio(d):\n",
|
| 786 |
+
" if not os.path.exists(d): return []\n",
|
| 787 |
+
" # Check if files are real audio (not small LFS pointers < 1KB)\n",
|
| 788 |
+
" return [f for f in os.listdir(d) if f.lower().endswith((\".wav\", \".mp3\")) and os.path.getsize(os.path.join(d, f)) > 1024]\n",
|
| 789 |
+
"\n",
|
| 790 |
+
" for d in priority_roots:\n",
|
| 791 |
+
" files = filter_real_audio(d)\n",
|
| 792 |
+
" if files:\n",
|
| 793 |
+
" print(f\"\u2705 Samples located at: {os.path.abspath(d)}\")\n",
|
| 794 |
+
" return d, files\n",
|
| 795 |
+
"\n",
|
| 796 |
+
" # More aggressive glob search\n",
|
| 797 |
+
" print(\"\ud83d\udd0d Searching folders for audio samples...\")\n",
|
| 798 |
+
" potential_matches = glob.glob(\"**/samples/*.wav\", recursive=True) + glob.glob(\"**/samples/*.mp3\", recursive=True)\n",
|
| 799 |
+
" valid_matches = [m for m in potential_matches if os.path.getsize(m) > 1024]\n",
|
| 800 |
+
"\n",
|
| 801 |
+
" if valid_matches:\n",
|
| 802 |
+
" root = os.path.dirname(valid_matches[0])\n",
|
| 803 |
+
" files = [os.path.basename(f) for f in glob.glob(os.path.join(root, \"*.*\")) if f.lower().endswith((\".wav\", \".mp3\")) and os.path.getsize(f) > 1024]\n",
|
| 804 |
+
" print(f\"\u2728 Located samples via glob at: {os.path.abspath(root)}\")\n",
|
| 805 |
+
" return root, files\n",
|
| 806 |
+
"\n",
|
| 807 |
+
" return None, []\n",
|
| 808 |
+
"\n",
|
| 809 |
+
"print(\"Select Input Method:\")\n",
|
| 810 |
+
"tab = widgets.Tab()\n",
|
| 811 |
+
"\n",
|
| 812 |
+
"samples_dir, preset_files = find_samples_dir()\n",
|
| 813 |
+
"if samples_dir:\n",
|
| 814 |
+
" preset_files.sort()\n",
|
| 815 |
+
" for name in reversed([\"Donald Trump.wav\", \"Steve Jobs.wav\"]):\n",
|
| 816 |
+
" if name in preset_files:\n",
|
| 817 |
+
" preset_files.insert(0, preset_files.pop(preset_files.index(name)))\n",
|
| 818 |
+
"else:\n",
|
| 819 |
+
" print(\"\u26a0\ufe0f Warning: No reference samples found. Please run the setup cell or upload manually.\")\n",
|
| 820 |
+
"\n",
|
| 821 |
+
"dropdown = widgets.Dropdown(options=preset_files,\n",
|
| 822 |
+
" value=preset_files[0] if preset_files else None,\n",
|
| 823 |
+
" description='Preset:')\n",
|
| 824 |
+
"uploader = widgets.FileUpload(accept='.wav,.mp3', multiple=False)\n",
|
| 825 |
+
"record_btn = widgets.Button(description=\"Start Recording (10s)\", button_style='danger')\n",
|
| 826 |
+
"record_out = widgets.Output()\n",
|
| 827 |
+
"\n",
|
| 828 |
+
"def on_record_click(b):\n",
|
| 829 |
+
" with record_out:\n",
|
| 830 |
+
" record_btn.disabled = True\n",
|
| 831 |
+
" try: record_audio(10)\n",
|
| 832 |
+
" except Exception as e: print(f\"Error: {e}.\")\n",
|
| 833 |
+
" record_btn.disabled = False\n",
|
| 834 |
+
"record_btn.on_click(on_record_click)\n",
|
| 835 |
+
"\n",
|
| 836 |
+
"# Tab assignment MUST use .children attribute\n",
|
| 837 |
+
"tab.children = [\n",
|
| 838 |
+
" widgets.VBox([dropdown]),\n",
|
| 839 |
+
" widgets.VBox([uploader]),\n",
|
| 840 |
+
" widgets.VBox([record_btn, record_out])\n",
|
| 841 |
+
"]\n",
|
| 842 |
+
"tab.set_title(0, '\ud83c\udfb5 Presets')\n",
|
| 843 |
+
"tab.set_title(1, '\ud83d\udcc2 Upload')\n",
|
| 844 |
+
"tab.set_title(2, '\ud83d\udd34 Record')\n",
|
| 845 |
+
"display(tab)\n",
|
| 846 |
+
"\n",
|
| 847 |
+
"text_input = widgets.Textarea(\n",
|
| 848 |
+
" value=\"Hello, I'm Elon Musk. Welcome to Deepfake Audio by Amey Thakur and Mega Satish. Explore AI voice Go!\",\n",
|
| 849 |
+
" placeholder='Enter text to synthesize...',\n",
|
| 850 |
+
" description='Text:',\n",
|
| 851 |
+
" layout=widgets.Layout(width='50%', height='100px')\n",
|
| 852 |
+
")\n",
|
| 853 |
+
"clone_btn = widgets.Button(description=\"Clone Voice! \ud83d\ude80\", button_style='primary')\n",
|
| 854 |
+
"out = widgets.Output()\n",
|
| 855 |
+
"display(text_input, clone_btn, out)\n",
|
| 856 |
+
"\n",
|
| 857 |
+
"def run_cloning(b):\n",
|
| 858 |
+
" with out:\n",
|
| 859 |
+
" out.clear_output()\n",
|
| 860 |
+
" active_tab = tab.selected_index\n",
|
| 861 |
+
" input_path = None\n",
|
| 862 |
+
" try:\n",
|
| 863 |
+
" if active_tab == 0:\n",
|
| 864 |
+
" if not dropdown.value: return print(\"\u274c No preset selected.\")\n",
|
| 865 |
+
" input_path = os.path.join(samples_dir, dropdown.value)\n",
|
| 866 |
+
" print(f\"\ud83c\udf99\ufe0f Source: Preset ({dropdown.value})\")\n",
|
| 867 |
+
" elif active_tab == 1:\n",
|
| 868 |
+
" if not uploader.value: return print(\"\u274c No file uploaded.\")\n",
|
| 869 |
+
" fname = list(uploader.value.keys())[0]\n",
|
| 870 |
+
" content = uploader.value[fname]['content']\n",
|
| 871 |
+
" input_path = \"uploaded_sample.wav\"\n",
|
| 872 |
+
" with open(input_path, \"wb\") as f: f.write(content)\n",
|
| 873 |
+
" print(f\"\ud83c\udf99\ufe0f Source: Upload ({fname})\")\n",
|
| 874 |
+
" elif active_tab == 2:\n",
|
| 875 |
+
" if not os.path.exists(\"recording.wav\"): return print(\"\u274c No recording found.\")\n",
|
| 876 |
+
" input_path = \"recording.wav\"\n",
|
| 877 |
+
" print(\"\ud83c\udf99\ufe0f Source: Microphone\")\n",
|
| 878 |
+
"\n",
|
| 879 |
+
" print(\"\u23f3 Step 1/3: Encoding speaker identity...\")\n",
|
| 880 |
+
" original_wav, sampling_rate = librosa.load(input_path)\n",
|
| 881 |
+
" preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)\n",
|
| 882 |
+
" embed = encoder.embed_utterance(preprocessed_wav)\n",
|
| 883 |
+
" print(\"\u23f3 Step 2/3: Synthesizing speech...\")\n",
|
| 884 |
+
" specs = synthesizer.synthesize_spectrograms([text_input.value], [embed])\n",
|
| 885 |
+
" spec = specs[0]\n",
|
| 886 |
+
" print(\"\u23f3 Step 3/3: Generating waveform...\")\n",
|
| 887 |
+
" generated_wav = vocoder.infer_waveform(spec)\n",
|
| 888 |
+
" print(\"\ud83c\udf89 Synthesis Complete!\")\n",
|
| 889 |
+
" display(Audio(generated_wav, rate=synthesizer.sample_rate))\n",
|
| 890 |
+
" print(\"\\n\ud83d\udcca Generating Analysis...\")\n",
|
| 891 |
+
" visualize_results(original_wav, generated_wav, spec, embed)\n",
|
| 892 |
+
" except Exception as e: print(f\"\u274c Error: {e}\")\n",
|
| 893 |
+
"clone_btn.on_click(run_cloning)"
|
| 894 |
+
]
|
| 895 |
+
}
|
| 896 |
+
],
|
| 897 |
+
"metadata": {
|
| 898 |
+
"kernelspec": {
|
| 899 |
+
"display_name": "Python 3",
|
| 900 |
+
"name": "python3"
|
| 901 |
+
},
|
| 902 |
+
"language_info": {
|
| 903 |
+
"codemirror_mode": {
|
| 904 |
+
"name": "ipython",
|
| 905 |
+
"version": 3
|
| 906 |
+
},
|
| 907 |
+
"file_extension": ".py",
|
| 908 |
+
"mimetype": "text/x-python",
|
| 909 |
+
"name": "python",
|
| 910 |
+
"nbconvert_exporter": "python",
|
| 911 |
+
"pygments_lexer": "ipython3",
|
| 912 |
+
"version": "3.10.12"
|
| 913 |
+
},
|
| 914 |
+
"colab": {
|
| 915 |
+
"provenance": [],
|
| 916 |
+
"gpuType": "T4",
|
| 917 |
+
"include_colab_link": true
|
| 918 |
+
},
|
| 919 |
+
"accelerator": "GPU"
|
| 920 |
+
},
|
| 921 |
+
"nbformat": 4,
|
| 922 |
+
"nbformat_minor": 0
|
| 923 |
+
}
|
DEEPFAKE-AUDIO.py
ADDED
|
@@ -0,0 +1,689 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE-AUDIO - DEEPFAKE-AUDIO.py (Neural Voice Cloning Research Script)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This script is a comprehensive, production-ready Python implementation of the SV2TTS
|
| 7 |
+
# (Speaker Verification to Text-to-Speech) voice cloning pipeline. It is a direct
|
| 8 |
+
# conversion of the 'DEEPFAKE-AUDIO.ipynb' Jupyter notebook, designed to run in both
|
| 9 |
+
# local and cloud (Colab/Kaggle) environments.
|
| 10 |
+
#
|
| 11 |
+
# The pipeline enables cloning a voice from as little as 5 seconds of audio. It consists
|
| 12 |
+
# of three distinct neural networks:
|
| 13 |
+
# 1. Speaker Encoder: Extracts a fixed-dimensional embedding (fingerprint) from
|
| 14 |
+
# the reference audio.
|
| 15 |
+
# 2. Synthesizer (Tacotron 2): Generates a Mel-Spectrogram from text, conditioned
|
| 16 |
+
# on the speaker embedding.
|
| 17 |
+
# 3. Vocoder (WaveRNN): Converts the Mel-Spectrogram into a raw time-domain waveform
|
| 18 |
+
# (audible speech).
|
| 19 |
+
#
|
| 20 |
+
# KEY FEATURES:
|
| 21 |
+
# - Cross-Platform: Runs seamlessly on Windows, Linux, Google Colab, and Kaggle.
|
| 22 |
+
# - Robust Fallbacks: Implements a multi-source data retrieval strategy (Local ->
|
| 23 |
+
# Kaggle -> Hugging Face) to guarantee model availability.
|
| 24 |
+
# - Interactive Mode: Supports preset samples, local file uploads, and microphone
|
| 25 |
+
# recording (Colab only).
|
| 26 |
+
# - Analysis & Visualization: Generates waveform comparisons, Mel-Spectrograms, and
|
| 27 |
+
# speaker embedding heatmaps.
|
| 28 |
+
#
|
| 29 |
+
# π€ AUTHORS
|
| 30 |
+
# - Amey Thakur
|
| 31 |
+
# - GitHub: https://github.com/Amey-Thakur
|
| 32 |
+
# - ORCID: https://orcid.org/0000-0001-5644-1575
|
| 33 |
+
# - Google Scholar: https://scholar.google.ca/citations?user=0inooPgAAAAJ
|
| 34 |
+
# - Mega Satish
|
| 35 |
+
# - GitHub: https://github.com/msatmod
|
| 36 |
+
# - ORCID: https://orcid.org/0000-0002-1844-9557
|
| 37 |
+
# - Google Scholar: https://scholar.google.ca/citations?user=7Ajrr6EAAAAJ
|
| 38 |
+
#
|
| 39 |
+
# π€π» CREDITS
|
| 40 |
+
# This project builds upon the foundational work of the Real-Time Voice Cloning project.
|
| 41 |
+
# Original Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 42 |
+
# Pre-trained Models: https://huggingface.co/CorentinJ/SV2TTS
|
| 43 |
+
#
|
| 44 |
+
# π PROJECT LINKS
|
| 45 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 46 |
+
# Live Demo: https://huggingface.co/spaces/ameythakur/Deepfake-Audio
|
| 47 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 48 |
+
# Kaggle Dataset: https://www.kaggle.com/datasets/ameythakur20/deepfakeaudio
|
| 49 |
+
#
|
| 50 |
+
# π LICENSE
|
| 51 |
+
# Released under the MIT License
|
| 52 |
+
# Release Date: 2021-02-06
|
| 53 |
+
# ==================================================================================================
|
| 54 |
+
|
| 55 |
+
"""
|
| 56 |
+
DEEPFAKE-AUDIO.py: A complete, standalone script for neural voice cloning.
|
| 57 |
+
|
| 58 |
+
This script provides a command-line interface for the SV2TTS voice cloning pipeline.
|
| 59 |
+
It can be run directly or imported as a module.
|
| 60 |
+
|
| 61 |
+
Usage:
|
| 62 |
+
python DEEPFAKE-AUDIO.py [--preset <name>] [--input <path>] [--text <text>]
|
| 63 |
+
|
| 64 |
+
Example:
|
| 65 |
+
python DEEPFAKE-AUDIO.py --preset "Steve Jobs.wav" --text "Hello, this is a cloned voice."
|
| 66 |
+
"""
|
| 67 |
+
|
| 68 |
+
# ==================================================================================================
|
| 69 |
+
# SECTION 1: IMPORTS & GLOBAL CONFIGURATION
|
| 70 |
+
# ==================================================================================================
|
| 71 |
+
# This section imports all necessary libraries and sets up global variables. It handles
|
| 72 |
+
# environment detection and defines paths for data and model discovery.
|
| 73 |
+
|
| 74 |
+
import os
|
| 75 |
+
import sys
|
| 76 |
+
import shutil
|
| 77 |
+
import glob
|
| 78 |
+
import argparse
|
| 79 |
+
from pathlib import Path
|
| 80 |
+
|
| 81 |
+
# --- Cross-Platform Compatibility ---
|
| 82 |
+
# Ensure the 'Source Code' directory is on Python's path for module imports.
|
| 83 |
+
# This is critical for accessing the encoder, synthesizer, and vocoder submodules.
|
| 84 |
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 85 |
+
SOURCE_CODE_PATH = SCRIPT_DIR / "Source Code"
|
| 86 |
+
if str(SOURCE_CODE_PATH) not in sys.path:
|
| 87 |
+
sys.path.insert(0, str(SOURCE_CODE_PATH))
|
| 88 |
+
|
| 89 |
+
# --- Cloud Environment Detection ---
|
| 90 |
+
# These flags help adapt the script's behavior based on the runtime environment.
|
| 91 |
+
IS_COLAB = "google.colab" in sys.modules
|
| 92 |
+
IS_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ==================================================================================================
|
| 96 |
+
# SECTION 2: CLOUD ENVIRONMENT SETUP (COLAB/KAGGLE)
|
| 97 |
+
# ==================================================================================================
|
| 98 |
+
# This function is designed to run exclusively in cloud notebook environments.
|
| 99 |
+
# It handles repository cloning, dependency installation, and data retrieval with
|
| 100 |
+
# multiple fallback sources.
|
| 101 |
+
|
| 102 |
+
def setup_cloud_environment():
    """
    Orchestrates the cloud environment setup for Google Colab and Kaggle.

    This function performs the following steps:
    1. Clones the DEEPFAKE-AUDIO repository from GitHub.
    2. Falls back to Hugging Face if GitHub cloning fails.
    3. Installs necessary system (libsndfile1) and Python dependencies.
    4. Attempts to pull large files via Git LFS.
    5. Falls back to Kagglehub for data if LFS budget is exceeded.

    Outside Colab/Kaggle (i.e. no IPython shell available) this is a no-op.
    """
    try:
        # get_ipython() exists only inside an IPython/Jupyter session; it is
        # the handle used below to run shell commands.
        shell = get_ipython()
    except NameError:
        # Not an interactive IPython environment: plain local execution.
        print("π Running in local or custom environment. Skipping cloud setup.")
        return

    repo_name = "DEEPFAKE-AUDIO"

    # --- Google Colab Specific Setup ---
    if IS_COLAB:
        print("π» Detected Google Colab Environment. Initiating setup...")
        colab_working_dir = f"/content/{repo_name}"

        # Step 1: Clone the Repository (skipped if a previous run already cloned it).
        if not os.path.exists(colab_working_dir):
            print("β¬οΈ Cloning DEEPFAKE-AUDIO repository from GitHub...")
            shell.system(f"git clone https://github.com/Amey-Thakur/{repo_name}")

        # Fallback to Hugging Face if GitHub clone fails (dir missing or empty).
        if not os.path.exists(colab_working_dir) or not os.listdir(colab_working_dir):
            print("β οΈ GitHub Clone Failed. Attempting Fallback: Hugging Face Space...")
            if os.path.exists(colab_working_dir):
                shutil.rmtree(colab_working_dir)
            shell.system(f"git clone https://huggingface.co/spaces/ameythakur/Deepfake-Audio {repo_name}")
            print("β Cloned from Hugging Face Space.")

        os.chdir(colab_working_dir)

        # Step 2: Install Dependencies (libsndfile1 is the system library
        # required by the 'soundfile' Python package).
        print("π§ Installing system and Python dependencies...")
        shell.system("apt-get install -y libsndfile1")
        deps = "librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub"
        shell.system(f"pip install {deps}")

        # Step 3: Attempt Git LFS Pull for the large model checkpoints.
        print("π¦ Attempting Git LFS pull for large model files...")
        shell.system("git lfs install")
        # NOTE(review): IPython's shell.system() typically returns None rather
        # than an exit code, so `lfs_status != 0` below is likely always True;
        # the file-size check is what actually detects LFS failure. Confirm.
        lfs_status = shell.system("git lfs pull")

        # Step 4: Kaggle Fallback if LFS Fails.
        # A real WAV is megabytes; an un-pulled LFS pointer file is < 1 KB,
        # so the sample file's size tells us whether the pull worked.
        sample_trigger = "Dataset/samples/Steve Jobs.wav"
        is_lfs_failed = (
            lfs_status != 0 or
            not os.path.exists(sample_trigger) or
            os.path.getsize(sample_trigger) < 1000  # LFS pointers are < 1KB
        )

        if is_lfs_failed:
            print("β οΈ GitHub LFS failed or exceeded budget. Using Kaggle Fallback...")
            shell.system("pip install kagglehub")
            import kagglehub  # imported lazily: only needed on this fallback path
            print("π Downloading assets from Kagglehub (ameythakur20/deepfakeaudio)...")
            kaggle_path = kagglehub.dataset_download("ameythakur20/deepfakeaudio")
            _link_kaggle_assets(kaggle_path)

    # --- Kaggle Specific Setup ---
    elif IS_KAGGLE:
        print("π» Detected Kaggle Environment. Initiating setup...")
        kaggle_working_dir = f"/kaggle/working/{repo_name}"

        # Step 1: Clone the Repository.
        if not os.path.exists(kaggle_working_dir):
            print("β¬οΈ Cloning DEEPFAKE-AUDIO repository from GitHub...")
            os.chdir("/kaggle/working")
            shell.system(f"git clone https://github.com/Amey-Thakur/{repo_name}")

        os.chdir(kaggle_working_dir)

        # Step 2: Link Kaggle Dataset (Highest Priority on Kaggle).
        # Symlinking the attached input dataset avoids any network download.
        kaggle_input_path = "/kaggle/input/deepfakeaudio"
        if os.path.exists(kaggle_input_path):
            print(f"β Kaggle Dataset Detected at {kaggle_input_path}. Linking assets...")
            _link_kaggle_assets(kaggle_input_path)
        else:
            print("β οΈ Kaggle Input not found. Attempting Git LFS pull...")
            shell.system("git lfs install")
            shell.system("git lfs pull")

        # Step 3: Install Dependencies.
        print("π§ Installing dependencies...")
        shell.system("apt-get install -y libsndfile1")
        deps = "librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub"
        shell.system(f"pip install {deps}")
    print("β Environment setup complete. Ready for cloning.")
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _link_kaggle_assets(source_path):
|
| 203 |
+
"""
|
| 204 |
+
Helper function to create symbolic links from Kaggle data to the expected 'Dataset/' folder.
|
| 205 |
+
|
| 206 |
+
Args:
|
| 207 |
+
source_path: The root path of the downloaded/linked Kaggle data.
|
| 208 |
+
"""
|
| 209 |
+
target_dir = "Dataset"
|
| 210 |
+
os.makedirs(target_dir, exist_ok=True)
|
| 211 |
+
|
| 212 |
+
# Link samples folder
|
| 213 |
+
k_samples = os.path.join(source_path, "samples")
|
| 214 |
+
target_samples = os.path.join(target_dir, "samples")
|
| 215 |
+
if os.path.exists(k_samples):
|
| 216 |
+
if os.path.exists(target_samples):
|
| 217 |
+
shutil.rmtree(target_samples)
|
| 218 |
+
os.symlink(k_samples, target_samples)
|
| 219 |
+
print("β
Samples linked from Kaggle.")
|
| 220 |
+
|
| 221 |
+
# Link model files
|
| 222 |
+
for model_name in ["encoder.pt", "synthesizer.pt", "vocoder.pt"]:
|
| 223 |
+
src = os.path.join(source_path, model_name)
|
| 224 |
+
dst = os.path.join(target_dir, model_name)
|
| 225 |
+
if os.path.exists(src):
|
| 226 |
+
if os.path.exists(dst):
|
| 227 |
+
os.remove(dst)
|
| 228 |
+
os.symlink(src, dst)
|
| 229 |
+
print("β
Models linked from Kaggle.")
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
# ==================================================================================================
|
| 233 |
+
# SECTION 3: MODEL & DATA INITIALIZATION
|
| 234 |
+
# ==================================================================================================
|
| 235 |
+
# This section handles the discovery and validation of model checkpoints (encoder,
|
| 236 |
+
# synthesizer, vocoder). It implements a priority-based search across local paths,
|
| 237 |
+
# Kaggle datasets, and Hugging Face.
|
| 238 |
+
|
| 239 |
+
def is_valid_checkpoint(filepath):
    """
    Check whether *filepath* points at a real model checkpoint.

    Un-pulled Git LFS pointer files are tiny text stubs, so anything that is
    missing or smaller than ~1 KB is treated as invalid.

    Args:
        filepath: Path object or string to the file.

    Returns:
        True if the file exists and is larger than 1KB, False otherwise.
    """
    candidate = Path(filepath)
    if not candidate.exists():
        return False
    return candidate.stat().st_size > 1000
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def resolve_checkpoint_path(component_name: str, legacy_suffix: str) -> Path:
    """
    Resolve the path to a model checkpoint using a priority-based search.

    Search Order:
    1. Local 'Dataset/' folder (part of the repository).
    2. Kaggle Input directory (`/kaggle/input/deepfakeaudio/`).
    3. Auto-downloaded 'pretrained_models/default/' folder.
    4. Legacy/manual paths for advanced users.

    Args:
        component_name: The name of the component ('Encoder', 'Synthesizer', 'Vocoder').
        legacy_suffix: A fallback path suffix for older project structures.

    Returns:
        A Path object to the checkpoint, or None if not found.
    """
    model_filename = f"{component_name.lower()}.pt"

    # Priorities 1-3 share one pattern: the first valid checkpoint wins.
    search_locations = [
        ("Repository", Path("Dataset") / model_filename),
        ("Kaggle", Path("/kaggle/input/deepfakeaudio") / model_filename),
        ("Fallback", Path("pretrained_models/default") / model_filename),
    ]
    for label, candidate in search_locations:
        if is_valid_checkpoint(candidate):
            print(f"π’ Loading {component_name} from {label}: {candidate}")
            return candidate

    # Priority 4: Legacy/Manual Paths — may be a directory of checkpoints
    # (pick the first valid .pt) or a direct file path.
    legacy_path = Path("pretrained_models") / legacy_suffix
    if legacy_path.exists():
        if legacy_path.is_dir():
            usable = [f for f in legacy_path.glob("*.pt") if is_valid_checkpoint(f)]
            if usable:
                return usable[0]
        elif is_valid_checkpoint(legacy_path):
            return legacy_path

    print(f"β οΈ Warning: {component_name} checkpoint not found!")
    return None
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def download_models_from_huggingface():
    """
    Attempts to download model checkpoints from Hugging Face if they are not
    found locally.

    Tries the following sources in order:
    1. Personal Hugging Face Space: `ameythakur/Deepfake-Audio`
    2. External Fallback: Uses the `utils.default_models` script.

    Returns:
        True if the models were acquired by either source, False otherwise.
    """
    # Imported lazily: huggingface_hub is only needed on this download path.
    from huggingface_hub import hf_hub_download

    core_models = ["encoder.pt", "synthesizer.pt", "vocoder.pt"]
    target_dir = Path("pretrained_models")
    target_dir.mkdir(exist_ok=True)

    print("π Attempting download from Hugging Face Space (ameythakur/Deepfake-Audio)...")
    try:
        for model in core_models:
            try:
                # Try nested path first (Dataset/model.pt): the Space stores
                # checkpoints under a 'Dataset/' folder.
                fpath = hf_hub_download(
                    repo_id="ameythakur/Deepfake-Audio",
                    filename=f"Dataset/{model}",
                    repo_type="space",
                    local_dir=str(target_dir)
                )
            except Exception:
                # Fall back to the repository root (model.pt).
                fpath = hf_hub_download(
                    repo_id="ameythakur/Deepfake-Audio",
                    filename=model,
                    repo_type="space",
                    local_dir=str(target_dir)
                )
            # Normalize the on-disk location to pretrained_models/<model>,
            # since local_dir preserves the repo's nested layout.
            target_file = target_dir / model
            if Path(fpath) != target_file and Path(fpath).exists():
                shutil.move(fpath, target_file)

        # Remove the now-unneeded nested 'Dataset/' folder left behind by local_dir.
        nested_folder = target_dir / "Dataset"
        if nested_folder.exists():
            shutil.rmtree(nested_folder)
        print("β Models successfully acquired via Personal Hugging Face fallback.")
        return True
    except Exception as e:
        print(f"β οΈ Personal HF download failed: {e}. Trying external fallback...")

    # External Fallback: use the project's utility downloader (inherited from
    # the upstream Real-Time-Voice-Cloning project).
    try:
        from utils.default_models import ensure_default_models
        ensure_default_models(target_dir)
        print("β Models successfully acquired via External HuggingFace fallback.")
        return True
    except Exception as e:
        print(f"β Critical: Could not auto-download models. Error: {e}")
        return False
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def verify_and_load_pipeline():
    """
    Verifies the availability of model checkpoints and loads the SV2TTS pipeline.

    First checks for locally available models (repo 'Dataset/' folder, then the
    Kaggle input dataset). If neither has all three checkpoints, attempts a
    Hugging Face download. Finally loads the encoder, synthesizer, and vocoder
    into memory.

    Returns:
        A tuple containing (encoder_module, synthesizer_instance, vocoder_module),
        or None if loading fails.
    """
    # Heavy imports are deferred so that merely importing this file stays cheap
    # and works without torch installed.
    import torch
    from encoder import inference as encoder
    from synthesizer.inference import Synthesizer
    from vocoder import inference as vocoder

    print(f"π― Computation Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")
    print("β¬οΈ Verifying Model Availability...")

    core_models = ["encoder.pt", "synthesizer.pt", "vocoder.pt"]
    # All three checkpoints must be real files (not LFS pointer stubs).
    dataset_models_present = all(
        is_valid_checkpoint(Path("Dataset") / m) for m in core_models
    )

    if dataset_models_present:
        print("β Found high-priority local models in 'Dataset/'. Verified.")
    else:
        # Only fall back to downloading when neither the repo nor the Kaggle
        # input dataset has the checkpoints.
        kaggle_path = Path("/kaggle/input/deepfakeaudio")
        kaggle_models_present = all(
            is_valid_checkpoint(kaggle_path / m) for m in core_models
        )
        if not kaggle_models_present:
            print("β οΈ Models not found locally. Attempting fallback download...")
            download_models_from_huggingface()

    print("β³ Loading Neural Networks (SV2TTS Pipeline)...")
    try:
        # 1. Encoder: extracts the speaker embedding from reference audio.
        encoder_path = resolve_checkpoint_path("Encoder", "encoder/saved_models")
        encoder.load_model(encoder_path)

        # 2. Synthesizer: generates mel spectrograms from text + embedding.
        synth_path = resolve_checkpoint_path(
            "Synthesizer", "synthesizer/saved_models/logs-pretrained/taco_pretrained"
        )
        synthesizer_instance = Synthesizer(synth_path)

        # 3. Vocoder: converts mel spectrograms to audio waveforms.
        vocoder_path = resolve_checkpoint_path("Vocoder", "vocoder/saved_models/pretrained")
        vocoder.load_model(vocoder_path)

        print("β Pipeline operational. All components loaded correctly.")
        return encoder, synthesizer_instance, vocoder
    except Exception as e:
        print(f"β Architecture Error: {e}")
        return None
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
# ==================================================================================================
|
| 423 |
+
# SECTION 4: AUDIO SAMPLE DISCOVERY
|
| 424 |
+
# ==================================================================================================
|
| 425 |
+
# This section provides utilities for finding reference audio samples (e.g., celebrity
|
| 426 |
+
# presets) across various locations in the file system.
|
| 427 |
+
|
| 428 |
+
def find_samples_directory():
    """
    Locate the directory containing reference audio samples.

    Walks a prioritized list of candidate directories and returns the first
    one holding real audio files (.wav/.mp3 larger than 1 KB, i.e. not Git
    LFS pointer stubs). If none match, falls back to a recursive glob search
    from the current directory.

    Returns:
        A tuple (samples_dir_path, list_of_filenames) or (None, []) if not found.
    """
    candidate_dirs = [
        "Source Code/samples",
        "Dataset/samples",
        "D:/GitHub/DEEPFAKE-AUDIO/Source Code/samples",
        "D:/GitHub/DEEPFAKE-AUDIO/Dataset/samples",
        "/content/DEEPFAKE-AUDIO/Source Code/samples",
        "/kaggle/input/deepfakeaudio/samples",
        "/kaggle/input/deepfakeaudio",
    ]

    def _real_audio_files(directory):
        """Return audio filenames in *directory* that are not LFS pointers."""
        if not os.path.exists(directory):
            return []
        names = []
        for name in os.listdir(directory):
            if not name.lower().endswith((".wav", ".mp3")):
                continue
            if os.path.getsize(os.path.join(directory, name)) > 1024:
                names.append(name)
        return names

    for candidate in candidate_dirs:
        found = _real_audio_files(candidate)
        if found:
            print(f"β Samples located at: {os.path.abspath(candidate)}")
            return candidate, found

    # Fallback: recursive glob search below the current working directory.
    print("π Searching for audio samples via glob...")
    matches = glob.glob("**/samples/*.wav", recursive=True)
    matches += glob.glob("**/samples/*.mp3", recursive=True)
    usable = [m for m in matches if os.path.getsize(m) > 1024]

    if usable:
        sample_root = os.path.dirname(usable[0])
        names = [os.path.basename(m) for m in usable]
        print(f"β¨ Located samples via glob at: {os.path.abspath(sample_root)}")
        return sample_root, list(set(names))

    return None, []
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
# ==================================================================================================
|
| 482 |
+
# SECTION 5: INFERENCE & VISUALIZATION
|
| 483 |
+
# ==================================================================================================
|
| 484 |
+
# This section contains the core voice cloning logic and functions for visualizing
|
| 485 |
+
# the results (waveforms, spectrograms, embeddings).
|
| 486 |
+
|
| 487 |
+
def clone_voice(
    encoder_module,
    synthesizer_instance,
    vocoder_module,
    input_audio_path: str,
    text_to_synthesize: str
):
    """
    Run the full voice-cloning inference: encode -> synthesize -> vocalize.

    Args:
        encoder_module: The loaded encoder module.
        synthesizer_instance: The loaded Synthesizer instance.
        vocoder_module: The loaded vocoder module.
        input_audio_path: Path to the reference audio file.
        text_to_synthesize: The text string to be spoken by the cloned voice.

    Returns:
        A tuple (generated_waveform, mel_spectrogram, speaker_embedding, original_waveform).
    """
    import numpy as np
    import librosa

    print(f"ποΈ Reference Audio: {input_audio_path}")
    # Truncate long text in the log line to keep the console readable.
    if len(text_to_synthesize) > 80:
        print(f"π Text to Clone: \"{text_to_synthesize[:80]}...\"")
    else:
        print(f"π Text to Clone: \"{text_to_synthesize}\"")

    # --- Step 1: Encode ---
    # Load the reference audio, clean it up, and derive the speaker embedding.
    print("β³ Step 1/3: Encoding speaker identity...")
    reference_wav, sample_rate = librosa.load(input_audio_path)
    cleaned_wav = encoder_module.preprocess_wav(reference_wav, sample_rate)
    speaker_embedding = encoder_module.embed_utterance(cleaned_wav)
    print(f" -> Embedding shape: {speaker_embedding.shape}")

    # --- Step 2: Synthesize ---
    # Text + embedding -> mel spectrogram conditioned on the target voice.
    print("β³ Step 2/3: Synthesizing speech (Mel-Spectrogram)...")
    spectrograms = synthesizer_instance.synthesize_spectrograms([text_to_synthesize], [speaker_embedding])
    mel_spectrogram = spectrograms[0]
    print(f" -> Spectrogram shape: {mel_spectrogram.shape}")

    # --- Step 3: Vocalize ---
    # Mel spectrogram -> raw audio waveform.
    print("β³ Step 3/3: Generating waveform (WaveRNN)...")
    synthesized_wav = vocoder_module.infer_waveform(mel_spectrogram)
    print(f" -> Waveform length: {len(synthesized_wav)} samples")

    print("π Synthesis Complete!")
    return synthesized_wav, mel_spectrogram, speaker_embedding, reference_wav
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
def visualize_results(original_wav, generated_wav, mel_spectrogram, speaker_embedding):
    """
    Render a three-panel summary figure of the cloning results.

    Panels:
    1. Waveform comparison: Original vs. Cloned audio.
    2. Generated Mel-Spectrogram.
    3. Speaker Embedding heatmap (16x16 grid when the vector is 256-D).

    The figure is saved to 'cloning_results.png' and displayed. Plot failures
    are non-fatal: cloning has already succeeded by the time this runs.

    Args:
        original_wav: The original waveform as a NumPy array.
        generated_wav: The generated waveform as a NumPy array.
        mel_spectrogram: The Mel-Spectrogram as a NumPy array.
        speaker_embedding: The 256-D speaker embedding vector.
    """
    import matplotlib.pyplot as plt
    import librosa.display

    try:
        fig, (wave_ax, mel_ax, emb_ax) = plt.subplots(3, 1, figsize=(12, 14))

        # --- Panel 1: Waveform Comparison ---
        wave_ax.set_title("Input Voice vs. Cloned Voice (Waveform)", fontsize=14)
        try:
            # Preferred: librosa's time-aware waveform renderer.
            librosa.display.waveshow(original_wav, alpha=0.6, ax=wave_ax, label="Original", color='blue')
            librosa.display.waveshow(generated_wav, alpha=0.6, ax=wave_ax, label="Cloned", color='red')
        except Exception:
            # Fallback: plain sample-index plot if waveshow is unavailable.
            wave_ax.plot(original_wav, alpha=0.6, label="Original", color='blue')
            wave_ax.plot(generated_wav, alpha=0.6, label="Cloned", color='red')
        wave_ax.legend(loc='upper right')
        wave_ax.set_xlabel("Time")
        wave_ax.set_ylabel("Amplitude")

        # --- Panel 2: Mel-Spectrogram ---
        mel_ax.set_title("Generated Mel Spectrogram", fontsize=14)
        heatmap = mel_ax.imshow(mel_spectrogram, aspect="auto", origin="lower", interpolation="none", cmap='magma')
        fig.colorbar(heatmap, ax=mel_ax, format="%+2.0f dB")
        mel_ax.set_xlabel("Time Frames")
        mel_ax.set_ylabel("Mel Frequency Bins")

        # --- Panel 3: Speaker Embedding ---
        emb_ax.set_title("Speaker Embedding (256-D Identity Fingerprint)", fontsize=14)
        if len(speaker_embedding) == 256:
            emb_ax.imshow(speaker_embedding.reshape(16, 16), aspect='auto', cmap='viridis')
        else:
            emb_ax.plot(speaker_embedding)
        emb_ax.set_xlabel("Dimension")
        emb_ax.set_ylabel("Value")

        plt.tight_layout()
        plt.savefig("cloning_results.png", dpi=150)
        print("π Analysis saved to 'cloning_results.png'.")
        plt.show()
    except Exception as e:
        print(f"β οΈ Visualization partially failed: {e}. Audio cloning was successful.")
|
| 596 |
+
|
| 597 |
+
|
| 598 |
+
# ==================================================================================================
|
| 599 |
+
# SECTION 6: MAIN EXECUTION
|
| 600 |
+
# ==================================================================================================
|
| 601 |
+
# This is the entry point for the script when run from the command line.
|
| 602 |
+
|
| 603 |
+
def main():
    """
    Main entry point for the DEEPFAKE-AUDIO script.

    Parses command-line arguments and runs the voice cloning pipeline:
    cloud setup -> model loading -> input selection -> inference -> save ->
    visualization. Exits with status 1 on any unrecoverable failure.
    """
    parser = argparse.ArgumentParser(
        description="DEEPFAKE-AUDIO: Neural Voice Cloning with SV2TTS",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python DEEPFAKE-AUDIO.py --preset "Steve Jobs.wav" --text "Hello world!"
    python DEEPFAKE-AUDIO.py --input "my_voice.wav" --text "This is a cloned message."
"""
    )
    parser.add_argument(
        "--preset", type=str, default=None,
        help="Name of a preset sample file (e.g., 'Steve Jobs.wav')."
    )
    parser.add_argument(
        "--input", type=str, default=None,
        help="Path to a custom audio file to use as the voice reference."
    )
    parser.add_argument(
        "--text", type=str,
        default="Hello, I'm a cloned voice. Welcome to Deepfake Audio by Amey Thakur and Mega Satish.",
        help="The text to be synthesized by the cloned voice."
    )
    parser.add_argument(
        "--output", type=str, default="cloned_output.wav",
        help="Path to save the generated audio file."
    )

    args = parser.parse_args()

    # --- Step 1: Cloud Setup (no-op outside Colab/Kaggle) ---
    setup_cloud_environment()

    # --- Step 2: Verify and Load Pipeline ---
    pipeline = verify_and_load_pipeline()
    if pipeline is None:
        print("β Failed to load the SV2TTS pipeline. Exiting.")
        sys.exit(1)
    encoder_mod, synthesizer_inst, vocoder_mod = pipeline

    # --- Step 3: Determine Input Audio ---
    # Priority: explicit --input path > named --preset > first available preset.
    input_audio_path = None
    if args.input:
        input_audio_path = args.input
    elif args.preset:
        samples_dir, _ = find_samples_directory()
        if samples_dir:
            input_audio_path = os.path.join(samples_dir, args.preset)
        else:
            print(f"β Could not find samples directory for preset '{args.preset}'.")
            sys.exit(1)
    else:
        # Default to first available preset
        samples_dir, preset_files = find_samples_directory()
        if samples_dir and preset_files:
            input_audio_path = os.path.join(samples_dir, preset_files[0])
            print(f"βΉοΈ No input specified. Using default preset: {preset_files[0]}")
        else:
            print("β No input audio provided and no presets found. Exiting.")
            sys.exit(1)

    if not os.path.exists(input_audio_path):
        print(f"β Audio file not found: {input_audio_path}")
        sys.exit(1)

    # --- Step 4: Run Inference ---
    generated_wav, mel_spec, embedding, original_wav = clone_voice(
        encoder_mod, synthesizer_inst, vocoder_mod,
        input_audio_path, args.text
    )

    # --- Step 5: Save Output ---
    # soundfile is imported lazily; it requires the libsndfile1 system library.
    import soundfile as sf
    sf.write(args.output, generated_wav, synthesizer_inst.sample_rate)
    print(f"πΎ Cloned audio saved to: {args.output}")

    # --- Step 6: Visualize Results ---
    visualize_results(original_wav, generated_wav, mel_spec, embedding)
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
if __name__ == "__main__":
|
| 689 |
+
main()
|
Dataset/encoder.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e
|
| 3 |
+
size 17090379
|
Dataset/samples/Andrew Tate.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:41fad5f1a1a63e6fafaf34c5368c45f7ac3bcd2ecf229a2466da5a47c2d34025
|
| 3 |
+
size 7864398
|
Dataset/samples/Barack Obama.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:42ba473919a79233690b60b3de56bb3eb0e6587173908a4b83841d30c18cdfc8
|
| 3 |
+
size 8454222
|
Dataset/samples/Bill Gates.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2602bec3edeb2a8169714192391a1997581efea2bff4c07eab7233d617c5b2d
|
| 3 |
+
size 6520910
|
Dataset/samples/Donald Trump.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d8b44d73192e9c04dd241f16177e4c5753bcefadde69e6e24b45e278b821f8c
|
| 3 |
+
size 4210766
|
Dataset/samples/Elon Musk.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b589d835fc42525cb352bc11a228f796e92e9848af1245b1d9407b8eb1270637
|
| 3 |
+
size 24199246
|
Dataset/samples/Greta Thunberg.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:97ef868d3962520afbcc9e1c355672f133b116b3f7cdad0c06c4dd99b45a244f
|
| 3 |
+
size 8454222
|
Dataset/samples/Hillary Clinton.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:942a270a80e7ca7dd288e2b20c48b082457139e5476bf46966a231125b20dffb
|
| 3 |
+
size 9011278
|
Dataset/samples/J.K. Rowling.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7f55242ae560bf4b5dcf05b2ea8629188315c47c420a95deec1ffa7fe07a3b57
|
| 3 |
+
size 16121934
|
Dataset/samples/Jensen Huang.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:045dd1bd68ad341a2df5d0990c8a3143c6ed5f4a2322e97dcfa40e8559dbd5fa
|
| 3 |
+
size 20168782
|
Dataset/samples/Joe Biden.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a9fc49ce69a8fb2dfc83a7dad1f8de67738aa65c3d1cb02956e38873d11d4d3a
|
| 3 |
+
size 4030542
|
Dataset/samples/Kamala Harris.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5dbec60bd5be09cb31436ca6652241aa97a05c8187efbfd02df0c45f5c7aa7ea
|
| 3 |
+
size 7487566
|
Dataset/samples/Mark Zuckerberg.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb167c71774a4986b0505c908c29ae010cce5841cf954e9e92bb5dbdfe8b4b64
|
| 3 |
+
size 63357006
|
Dataset/samples/Oprah Winfrey.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c9125a51042f3703b8456ad97e4ab2a25d1af5ba445a325ebe3d9e28156e06d
|
| 3 |
+
size 16891982
|
Dataset/samples/Steve Jobs.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:322ab6e68bc0c3686198b27ba1602bda4788d29879f73edcfc7b879b2c966805
|
| 3 |
+
size 149480654
|
Dataset/synthesizer.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c05e07428f95d0ed8755e1ef54cc8ae251300413d94ce5867a56afe39c499d94
|
| 3 |
+
size 370554559
|
Dataset/vocoder.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d7a6861589e927e0fbdaa5849ca022258fe2b58a20cc7bfb8fb598ccf936169
|
| 3 |
+
size 53845290
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2021 Amey Thakur & Mega Satish
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
Mega/Filly.jpg
ADDED
|
Git LFS Details
|
Mega/Mega.png
ADDED
|
Git LFS Details
|
Mega/Mega_Chair.png
ADDED
|
Git LFS Details
|
Mega/Mega_Dining.jpg
ADDED
|
Git LFS Details
|
Mega/Mega_Professional.jpg
ADDED
|
Git LFS Details
|
Mega/Mega_and_Hetvi.png
ADDED
|
Git LFS Details
|
README.md
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Deepfake Audio
|
| 3 |
+
emoji: ποΈ
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: yellow
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.1
|
| 8 |
+
python_version: 3.11
|
| 9 |
+
app_file: Source Code/app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
<div align="center">
|
| 14 |
+
|
| 15 |
+
<a name="readme-top"></a>
|
| 16 |
+
# Deepfake Audio
|
| 17 |
+
|
| 18 |
+
[](LICENSE)
|
| 19 |
+

|
| 20 |
+
[](https://github.com/Amey-Thakur/DEEPFAKE-AUDIO)
|
| 21 |
+
[](https://github.com/Amey-Thakur/DEEPFAKE-AUDIO)
|
| 22 |
+
|
| 23 |
+
An advanced neural voice synthesis platform implementing Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis (SV2TTS) for high-fidelity zero-shot voice cloning.
|
| 24 |
+
|
| 25 |
+
**[Source Code](Source%20Code/Deepfake-Audio/)** Β· **[Technical Specification](docs/SPECIFICATION.md)** Β· **[Video Demo](https://youtu.be/i3wnBcbHDbs)** Β· **[Live Demo](https://huggingface.co/spaces/ameythakur/Deepfake-Audio)**
|
| 26 |
+
|
| 27 |
+
<br>
|
| 28 |
+
|
| 29 |
+
<a href="https://youtu.be/i3wnBcbHDbs">
|
| 30 |
+
<img src="https://img.youtube.com/vi/i3wnBcbHDbs/hqdefault.jpg" alt="Video Demo" width="70%">
|
| 31 |
+
</a>
|
| 32 |
+
|
| 33 |
+
</div>
|
| 34 |
+
|
| 35 |
+
---
|
| 36 |
+
|
| 37 |
+
<div align="center">
|
| 38 |
+
|
| 39 |
+
[Authors](#authors) Β· [Overview](#overview) Β· [Features](#features) Β· [Structure](#project-structure) Β· [Results](#results) Β· [Quick Start](#quick-start) Β· [Usage Guidelines](#usage-guidelines) Β· [License](#license) Β· [About](#about-this-repository) Β· [Acknowledgments](#acknowledgments)
|
| 40 |
+
|
| 41 |
+
</div>
|
| 42 |
+
|
| 43 |
+
---
|
| 44 |
+
|
| 45 |
+
<!-- AUTHORS -->
|
| 46 |
+
<div align="center">
|
| 47 |
+
|
| 48 |
+
<a name="authors"></a>
|
| 49 |
+
## Authors
|
| 50 |
+
|
| 51 |
+
| <a href="https://github.com/Amey-Thakur"><img src="https://github.com/Amey-Thakur.png" width="150" height="150" alt="Amey Thakur"></a><br>[**Amey Thakur**](https://github.com/Amey-Thakur)<br><br>[](https://orcid.org/0000-0001-5644-1575) | <a href="https://github.com/msatmod"><img src="Mega/Mega.png" width="150" height="150" alt="Mega Satish"></a><br>[**Mega Satish**](https://github.com/msatmod)<br><br>[](https://orcid.org/0000-0002-1844-9557) |
|
| 52 |
+
| :---: | :---: |
|
| 53 |
+
|
| 54 |
+
</div>
|
| 55 |
+
|
| 56 |
+
> [!IMPORTANT]
|
| 57 |
+
> ### π€π» Special Acknowledgement
|
| 58 |
+
> *Special thanks to **[Mega Satish](https://github.com/msatmod)** for her meaningful contributions, guidance, and support that helped shape this work.*
|
| 59 |
+
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
<!-- OVERVIEW -->
|
| 63 |
+
<a name="overview"></a>
|
| 64 |
+
## Overview
|
| 65 |
+
|
| 66 |
+
**Deepfake Audio** is a multi-stage neural voice synthesis architecture designed to clone speaker identities and generate high-fidelity speech from textual input. By implementing the **SV2TTS** framework, this project translates skeletal vocal characteristics into a latent embedding, which then conditions a generative model to produce new vocalizations with strikingly natural prosody and timbre.
|
| 67 |
+
|
| 68 |
+
> [!IMPORTANT]
|
| 69 |
+
> ### Attribution
|
| 70 |
+
> This project builds upon the foundational research and implementation of the **[Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning)** repository by **[Corentin Jemine](https://github.com/CorentinJ)**.
|
| 71 |
+
|
| 72 |
+
> [!NOTE]
|
| 73 |
+
> ### ποΈ Defining Audio Deepfakes
|
| 74 |
+
> An **audio deepfake** is when a "cloned" voice that is potentially indistinguishable from the real person's is used to produce synthetic audio. This process involves utilizing advanced neural architectures, such as the **SV2TTS** framework, to distill high-dimensional vocal identities into latent embeddings. These embeddings then condition a generative model to synthesize new speech that mirrors the original speaker's prosody, timbre, and acoustic nuances with striking fidelity.
|
| 75 |
+
|
| 76 |
+
The repository serves as a digital study into the mechanics of neural cloning and signal processing, brought into a modern context via a **Progressive Web App (PWA)** interface, enabling high-performance voice synthesis through a decoupled engine architecture.
|
| 77 |
+
|
| 78 |
+
### Synthesis Heuristics
|
| 79 |
+
The classification engine is governed by strict **computational design patterns** ensuring fidelity and responsiveness:
|
| 80 |
+
* **Speaker Normalization**: The encoder utilizes a linear speaker verification pipeline, incrementally distilling lexical tokens into a global affective voice state.
|
| 81 |
+
* **Zero-Shot Inference**: Beyond simple playback, the system integrates a **Tacotron 2-based synthesizer** that dynamically refines its accuracy over time, simulating an organic learning curve for complex phonetic structures.
|
| 82 |
+
* **Real-Time Vocoding**: Audio reconstruction supports both streaming and batch generation, ensuring **high-fidelity** waveform response critical for interactive neural study.
|
| 83 |
+
|
| 84 |
+
> [!TIP]
|
| 85 |
+
> **Acoustic Precision Integration**
|
| 86 |
+
>
|
| 87 |
+
> To maximize cloning clarity, the engine employs a **multi-stage neural pipeline**. **Latent filters** refine the embedding stream, and **probabilistic weights** visualize the voice's confidence vector, strictly coupling acoustic flair with state changes. This ensures the user's mental model is constantly synchronized with the underlying neural simulation.
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
<!-- FEATURES -->
|
| 92 |
+
<a name="features"></a>
|
| 93 |
+
## Features
|
| 94 |
+
|
| 95 |
+
| Feature | Description |
|
| 96 |
+
|---------|-------------|
|
| 97 |
+
| **SV2TTS Core** | Combines **LSTM Speaker Encoders** with **Tacotron Synthesizers** for comprehensive voice cloning. |
|
| 98 |
+
| **PWA Architecture** | Implements a robust standalone installable interface for immediate neural vocalization study. |
|
| 99 |
+
| **Academic Clarity** | In-depth and detailed comments integrated throughout the codebase for transparent logic study. |
|
| 100 |
+
| **Neural Topology** | Efficient **Decoupled Engine execution** via Gradio and Torch for native high-performance access. |
|
| 101 |
+
| **Inference Pipeline** | Asynchronous architecture ensuring **stability** and responsiveness on local clients. |
|
| 102 |
+
| **Visual Feedback** | **Interactive Status Monitors** that trigger on synthesis events for sensory reward. |
|
| 103 |
+
| **State Feedback** | **Embedding-Based Indicators** and waveform effects for high-impact acoustic feel. |
|
| 104 |
+
| **Social Persistence** | **Interactive Footer Integration** bridging the analysis to the source repository. |
|
| 105 |
+
|
| 106 |
+
> [!NOTE]
|
| 107 |
+
> ### Interactive Polish: The Acoustic Singularity
|
| 108 |
+
> We have engineered a **Logic-Driven State Manager** that calibrates vocal scores across multiple vectors to simulate human-like identity transfer. The visual language focuses on the minimalist "Neon Mic" aesthetic, ensuring maximum focus on the interactive neural trajectory.
|
| 109 |
+
|
| 110 |
+
### Tech Stack
|
| 111 |
+
- **Languages**: Python 3.9+
|
| 112 |
+
- **Logic**: **Neural Pipelines** (SV2TTS & Signal Processing)
|
| 113 |
+
- **Frameworks**: **PyTorch** & **TensorFlow** (Inference)
|
| 114 |
+
- **UI System**: Modern Design (Gradio & Custom CSS)
|
| 115 |
+
- **Deployment**: Local execution / Hugging Face Spaces
|
| 116 |
+
- **Architecture**: Progressive Web App (PWA)
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
<!-- STRUCTURE -->
|
| 121 |
+
<a name="project-structure"></a>
|
| 122 |
+
## Project Structure
|
| 123 |
+
|
| 124 |
+
```python
|
| 125 |
+
DEEPFAKE-AUDIO/
|
| 126 |
+
β
|
| 127 |
+
βββ Dataset/ # Neural Assets
|
| 128 |
+
β βββ samples/ # Voice Reference Audio
|
| 129 |
+
β βββ encoder.pt # Speaker Verification Model
|
| 130 |
+
β βββ synthesizer.pt # TTS Synthesis Model
|
| 131 |
+
β βββ vocoder.pt # Waveform Reconstruction Model
|
| 132 |
+
β
|
| 133 |
+
βββ docs/ # Academic Documentation
|
| 134 |
+
β βββ SPECIFICATION.md # Technical Architecture
|
| 135 |
+
β
|
| 136 |
+
βββ Mega/ # Attribution Assets
|
| 137 |
+
β βββ Filly.jpg # Companion (Filly)
|
| 138 |
+
β βββ Mega.png # Profile Image (Mega Satish)
|
| 139 |
+
β
|
| 140 |
+
βββ screenshots/ # Visual Gallery
|
| 141 |
+
β βββ 01_landing_page.png
|
| 142 |
+
β βββ 02_landing_page_footer.png
|
| 143 |
+
β βββ 03_example_run_config.png
|
| 144 |
+
β βββ 04_example_run_processing.png
|
| 145 |
+
β βββ 05_example_run_results.png
|
| 146 |
+
β βββ 06_example_run_results_footer.png
|
| 147 |
+
β βββ 07_download_option.png
|
| 148 |
+
β βββ Audio.wav # Sample Output
|
| 149 |
+
β βββ favicon.png # Project Icon
|
| 150 |
+
β
|
| 151 |
+
βββ Source Code/ # Primary Application Layer
|
| 152 |
+
β βββ app.py # Gradio Studio Interface
|
| 153 |
+
β βββ app_ui_demo.py # UI-Only Verification Mode
|
| 154 |
+
β βββ Dockerfile # Containerization Config
|
| 155 |
+
β βββ requirements.txt # Dependency Manifest
|
| 156 |
+
β βββ favicon.png # Application Icon
|
| 157 |
+
β βββ intro_message.wav # Audio Branding
|
| 158 |
+
β
|
| 159 |
+
βββ .gitattributes # Signal Normalization
|
| 160 |
+
βββ .gitignore # Deployment Exclusions
|
| 161 |
+
βββ DEEPFAKE-AUDIO.ipynb # Research Notebook
|
| 162 |
+
βββ DEEPFAKE-AUDIO.py # Research Script (Standalone CLI)
|
| 163 |
+
βββ SECURITY.md # Security Protocols
|
| 164 |
+
βββ CITATION.cff # Academic Citation Manifest
|
| 165 |
+
βββ codemeta.json # Metadata Standard
|
| 166 |
+
βββ LICENSE # MIT License (Verbatim)
|
| 167 |
+
βββ README.md # Project Entrance
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
---
|
| 171 |
+
|
| 172 |
+
<a name="results"></a>
|
| 173 |
+
## Results
|
| 174 |
+
|
| 175 |
+
<div align="center">
|
| 176 |
+
<b>Main Interface: Modern Design</b>
|
| 177 |
+
<br>
|
| 178 |
+
<i>Initial system state with clean aesthetics and synchronized brand identity.</i>
|
| 179 |
+
<br><br>
|
| 180 |
+
<img src="screenshots/01_landing_page.png" alt="Landing Page" width="90%">
|
| 181 |
+
<br>
|
| 182 |
+
<sub><i>π‘ <b>Interactive Element:</b> Engage the title header to activate the system's auditory introduction.</i></sub>
|
| 183 |
+
<br><br><br>
|
| 184 |
+
|
| 185 |
+
<b>Interactive Polish: Footer Integration</b>
|
| 186 |
+
<br>
|
| 187 |
+
<i>Seamlessly integrated authorship and social persistence.</i>
|
| 188 |
+
<br><br>
|
| 189 |
+
<img src="screenshots/02_landing_page_footer.png" alt="Footer UI" width="90%">
|
| 190 |
+
<br><br><br>
|
| 191 |
+
|
| 192 |
+
<b>Synthesis Setup: Adaptive Config</b>
|
| 193 |
+
<br>
|
| 194 |
+
<i>Configuring target text and reference identity for neural cloning.</i>
|
| 195 |
+
<br><br>
|
| 196 |
+
<img src="screenshots/03_example_run_config.png" alt="Configuration" width="90%">
|
| 197 |
+
<br><br><br>
|
| 198 |
+
|
| 199 |
+
<b>Neural Processing: Real-Time Inference</b>
|
| 200 |
+
<br>
|
| 201 |
+
<i>System Distillery extracting acoustic embeddings and synthesizing mel-spectrograms.</i>
|
| 202 |
+
<br><br>
|
| 203 |
+
<img src="screenshots/04_example_run_processing.png" alt="Processing" width="90%">
|
| 204 |
+
<br><br><br>
|
| 205 |
+
|
| 206 |
+
<b>Quantified Output: Generated Results</b>
|
| 207 |
+
<br>
|
| 208 |
+
<i>Successful high-fidelity audio synthesis with precise identity fidelity.</i>
|
| 209 |
+
<br><br>
|
| 210 |
+
<img src="screenshots/05_example_run_results.png" alt="Results" width="90%">
|
| 211 |
+
<br><br><br>
|
| 212 |
+
|
| 213 |
+
<b>Complete User Flow: Result & Footer</b>
|
| 214 |
+
<br>
|
| 215 |
+
<i>Comprehensive view of the post-synthesis state.</i>
|
| 216 |
+
<br><br>
|
| 217 |
+
<img src="screenshots/06_example_run_results_footer.png" alt="Results Footer" width="90%">
|
| 218 |
+
<br><br><br>
|
| 219 |
+
|
| 220 |
+
<b>System Options: Audio Export</b>
|
| 221 |
+
<br>
|
| 222 |
+
<i>Exporting synthesized waveforms for downstream academic reference.</i>
|
| 223 |
+
<br><br>
|
| 224 |
+
<img src="screenshots/07_download_option.png" alt="Download" width="90%">
|
| 225 |
+
<br><br><br>
|
| 226 |
+
|
| 227 |
+
<b>Generated Result Output: Audio Signal</b>
|
| 228 |
+
<br>
|
| 229 |
+
<i>Interactive verified output from the neural synthesis pipeline.</i>
|
| 230 |
+
<br><br>
|
| 231 |
+
<a href="screenshots/Audio.wav"><img src="screenshots/favicon.png" alt="Audio Result" width="100"></a>
|
| 232 |
+
<br>
|
| 233 |
+
<a href="screenshots/Audio.wav"><b>Listen to Generated Sample</b></a>
|
| 234 |
+
</div>
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
<!-- QUICK START -->
|
| 239 |
+
<a name="quick-start"></a>
|
| 240 |
+
## Quick Start
|
| 241 |
+
|
| 242 |
+
### 1. Prerequisites
|
| 243 |
+
- **Python 3.9+**: Required for runtime execution. [Download Python](https://www.python.org/downloads/)
|
| 244 |
+
- **Git**: For version control and cloning. [Download Git](https://git-scm.com/downloads)
|
| 245 |
+
|
| 246 |
+
> [!WARNING]
|
| 247 |
+
> ### Neural Model Acquisition
|
| 248 |
+
>
|
| 249 |
+
> The synthesis engine relies on pre-trained neural models. Ensure you have the weights (`encoder.pt`, `synthesizer.pt`, `vocoder.pt`) placed in the `Dataset/` directory. Failure to synchronize these assets will result in initialization errors.
|
| 250 |
+
|
| 251 |
+
### 2. Installation & Setup
|
| 252 |
+
|
| 253 |
+
#### Step 1: Clone the Repository
|
| 254 |
+
Open your terminal and clone the repository:
|
| 255 |
+
```bash
|
| 256 |
+
git clone https://github.com/Amey-Thakur/DEEPFAKE-AUDIO.git
|
| 257 |
+
cd DEEPFAKE-AUDIO
|
| 258 |
+
```
|
| 259 |
+
|
| 260 |
+
#### Step 2: Configure Virtual Environment
|
| 261 |
+
Prepare an isolated environment to manage dependencies:
|
| 262 |
+
|
| 263 |
+
**Windows (Command Prompt / PowerShell):**
|
| 264 |
+
```bash
|
| 265 |
+
python -m venv venv
|
| 266 |
+
venv\Scripts\activate
|
| 267 |
+
```
|
| 268 |
+
|
| 269 |
+
**macOS / Linux (Terminal):**
|
| 270 |
+
```bash
|
| 271 |
+
python3 -m venv venv
|
| 272 |
+
source venv/bin/activate
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
#### Step 3: Install Core Dependencies
|
| 276 |
+
Ensure your environment is active, then install the required libraries:
|
| 277 |
+
```bash
|
| 278 |
+
pip install -r "Source Code/requirements.txt"
|
| 279 |
+
```
|
| 280 |
+
|
| 281 |
+
### 3. Execution
|
| 282 |
+
|
| 283 |
+
#### A. Interactive Web Studio (PWA)
|
| 284 |
+
Launch the primary Gradio-based studio engine:
|
| 285 |
+
```bash
|
| 286 |
+
python "Source Code/app.py"
|
| 287 |
+
```
|
| 288 |
+
> [!TIP]
|
| 289 |
+
> **PWA Installation**: Once the studio is running, you can click the "Install" icon in your browser's address bar to add the **Deepfake Audio Studio** to your desktop as a standalone application.
|
| 290 |
+
|
| 291 |
+
#### B. Research & Automation Script
|
| 292 |
+
For automated synthesis or command-line research workflows:
|
| 293 |
+
```bash
|
| 294 |
+
# Example: Using a preset identity
|
| 295 |
+
python DEEPFAKE-AUDIO.py --preset "Steve Jobs.wav" --text "Neural cloning active."
|
| 296 |
+
|
| 297 |
+
# Example: Using a custom voice file
|
| 298 |
+
python DEEPFAKE-AUDIO.py --input "my_voice.wav" --text "Synthesizing new speech."
|
| 299 |
+
```
|
| 300 |
+
|
| 301 |
+
---
|
| 302 |
+
|
| 303 |
+
<!-- USAGE GUIDELINES -->
|
| 304 |
+
<a name="usage-guidelines"></a>
|
| 305 |
+
## Usage Guidelines
|
| 306 |
+
|
| 307 |
+
This repository is openly shared to support learning and knowledge exchange across the academic community.
|
| 308 |
+
|
| 309 |
+
**For Students**
|
| 310 |
+
Use this project as reference material for understanding **Neural Voice Synthesis**, **Transfer Learning (SV2TTS)**, and **real-time audio inference**. The source code is available for study to facilitate self-paced learning and exploration of **Python-based deep learning pipelines and PWA integration**.
|
| 311 |
+
|
| 312 |
+
**For Educators**
|
| 313 |
+
This project may serve as a practical lab example or supplementary teaching resource for **Deep Learning**, **Acoustic Science**, and **Interactive System Architecture** courses. Attribution is appreciated when utilizing content.
|
| 314 |
+
|
| 315 |
+
**For Researchers**
|
| 316 |
+
The documentation and architectural approach may provide insights into **academic project structuring**, **neural identity representation**, and **hybrid multi-stage synthesis pipelines**.
|
| 317 |
+
|
| 318 |
+
---
|
| 319 |
+
|
| 320 |
+
<!-- LICENSE -->
|
| 321 |
+
<a name="license"></a>
|
| 322 |
+
## License
|
| 323 |
+
|
| 324 |
+
This repository and all its creative and technical assets are made available under the **MIT License**. See the [LICENSE](LICENSE) file for complete terms.
|
| 325 |
+
|
| 326 |
+
> [!NOTE]
|
| 327 |
+
> **Summary**: You are free to share and adapt this content for any purpose, even commercially, as long as you provide appropriate attribution to the original authors.
|
| 328 |
+
|
| 329 |
+
Copyright © 2021 Amey Thakur & Mega Satish
|
| 330 |
+
|
| 331 |
+
---
|
| 332 |
+
|
| 333 |
+
<!-- ABOUT -->
|
| 334 |
+
<a name="about-this-repository"></a>
|
| 335 |
+
## About This Repository
|
| 336 |
+
|
| 337 |
+
**Created & Maintained by**: [Amey Thakur](https://github.com/Amey-Thakur) & [Mega Satish](https://github.com/msatmod)
|
| 338 |
+
|
| 339 |
+
This project features **Deepfake Audio**, a three-stage neural voice synthesis system. It represents a personal exploration into **Deep Learning**-based identity transfer and high-performance interactive application architecture via **Gradio**.
|
| 340 |
+
|
| 341 |
+
**Connect:** [GitHub](https://github.com/Amey-Thakur) Β· [LinkedIn](https://www.linkedin.com/in/amey-thakur) Β· [ORCID](https://orcid.org/0000-0001-5644-1575)
|
| 342 |
+
|
| 343 |
+
### Acknowledgments
|
| 344 |
+
|
| 345 |
+
Grateful acknowledgment to [**Mega Satish**](https://github.com/msatmod) for her exceptional collaboration and scholarly partnership on this neural voice cloning research. Her constant support, technical clarity, and dedication to software quality were instrumental in achieving the system's functional objectives. Learning alongside her was a transformative experience; her thoughtful approach to problem-solving and steady encouragement turned complex requirements into meaningful learning moments. This work reflects the growth and insights gained from our side-by-side academic journey. Thank you, Mega, for everything you shared and taught along the way.
|
| 346 |
+
|
| 347 |
+
Special thanks to [**Corentin Jemine**](https://github.com/CorentinJ) for the foundational research and open-source implementation of the **Real-Time-Voice-Cloning** repository, which served as the cornerstone for this project's technical architecture.
|
| 348 |
+
|
| 349 |
+
---
|
| 350 |
+
|
| 351 |
+
<div align="center">
|
| 352 |
+
|
| 353 |
+
[β Back to Top](#readme-top)
|
| 354 |
+
|
| 355 |
+
[Authors](#authors) Β· [Overview](#overview) Β· [Features](#features) Β· [Structure](#project-structure) Β· [Results](#results) Β· [Quick Start](#quick-start) Β· [Usage Guidelines](#usage-guidelines) Β· [License](#license) Β· [About](#about-this-repository) Β· [Acknowledgments](#acknowledgments)
|
| 356 |
+
|
| 357 |
+
<br>
|
| 358 |
+
|
| 359 |
+
ποΈ **[Deepfake Audio](https://huggingface.co/spaces/ameythakur/Deepfake-Audio)**
|
| 360 |
+
|
| 361 |
+
---
|
| 362 |
+
|
| 363 |
+
### π [Computer Engineering Repository](https://github.com/Amey-Thakur/COMPUTER-ENGINEERING)
|
| 364 |
+
|
| 365 |
+
**Computer Engineering (B.E.) - University of Mumbai**
|
| 366 |
+
|
| 367 |
+
*Semester-wise curriculum, laboratories, projects, and academic notes.*
|
| 368 |
+
|
| 369 |
+
</div>
|
SECURITY.md
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Security Policy
|
| 2 |
+
|
| 3 |
+
## Maintenance Status
|
| 4 |
+
|
| 5 |
+
This repository is part of a curated collection of academic and engineering projects and is maintained in a finalized and stable state. The project is preserved as a complete and authoritative record, with its scope and contents intentionally fixed to ensure long-term academic and professional reference.
|
| 6 |
+
|
| 7 |
+
## Supported Versions
|
| 8 |
+
|
| 9 |
+
Because this is a finalized project, only the version listed below is authoritative:
|
| 10 |
+
|
| 11 |
+
| Version | Supported |
|
| 12 |
+
| ------- | --------- |
|
| 13 |
+
| 1.0.0 | Yes |
|
| 14 |
+
|
| 15 |
+
## Vulnerability Reporting Protocol
|
| 16 |
+
|
| 17 |
+
In accordance with established academic and professional standards for security disclosure, security-related observations associated with this project are documented through formal scholarly channels.
|
| 18 |
+
|
| 19 |
+
To report a security concern, contact the project curators:
|
| 20 |
+
- **Curator**: [Amey Thakur](https://github.com/Amey-Thakur)
|
| 21 |
+
- **Collaborator**: [Mega Satish](https://github.com/msatmod)
|
| 22 |
+
- **Method**: Reports are submitted via the repository's [GitHub Issues](https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/issues) interface to formally record security-related findings.
|
| 23 |
+
|
| 24 |
+
Submissions include:
|
| 25 |
+
1. A precise and technically accurate description of the identified issue.
|
| 26 |
+
2. Demonstrable steps or technical evidence sufficient to contextualize the finding.
|
| 27 |
+
3. An explanation of the issue's relevance within the defined scope of the project.
|
| 28 |
+
|
| 29 |
+
## Implementation Context: Deep Learning Audio Synthesis
|
| 30 |
+
|
| 31 |
+
This project consists of an advanced Deep Learning platform, utilizing Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis (SV2TTS) and Real-Time Vocoders for high-fidelity audio generation.
|
| 32 |
+
|
| 33 |
+
- **Scope Limitation**: This policy applies exclusively to the documentation, code, and model assets contained within this repository and does not extend to the runtime environment (Google Colab/Python) or third-party libraries.
|
| 34 |
+
|
| 35 |
+
## Technical Integrity Statement
|
| 36 |
+
|
| 37 |
+
This repository is preserved as a fixed academic and engineering project. Security-related submissions are recorded for documentation and contextual reference and do not imply active monitoring, response obligations, or subsequent modification of the repository.
|
| 38 |
+
|
| 39 |
+
---
|
| 40 |
+
|
| 41 |
+
*This document defines the security posture of a finalized Python project.*
|
Source Code/DEEPFAKE-AUDIO.ipynb
ADDED
|
@@ -0,0 +1,923 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "view-in-github",
|
| 7 |
+
"colab_type": "text"
|
| 8 |
+
},
|
| 9 |
+
"source": [
|
| 10 |
+
"<a href=\"https://colab.research.google.com/github/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"cell_type": "markdown",
|
| 15 |
+
"metadata": {
|
| 16 |
+
"id": "X7Om-e0ahXrB"
|
| 17 |
+
},
|
| 18 |
+
"source": [
|
| 19 |
+
"#\n",
|
| 20 |
+
"<h1 align=\"center\">\ud83c\udf99\ufe0f Deepfake Audio</h1>\n",
|
| 21 |
+
"<h3 align=\"center\"><i>A neural voice cloning studio powered by SV2TTS technology</i></h3>\n",
|
| 22 |
+
"\n",
|
| 23 |
+
"<div align=\"center\">\n",
|
| 24 |
+
"\n",
|
| 25 |
+
"| **Author** | **Profiles** |\n",
|
| 26 |
+
"|:---:|:---|\n",
|
| 27 |
+
"| **Amey Thakur** | [](https://github.com/Amey-Thakur) [](https://orcid.org/0000-0001-5644-1575) [](https://scholar.google.ca/citations?user=0inooPgAAAAJ&hl=en) [](https://www.kaggle.com/ameythakur20) |\n",
|
| 28 |
+
"| **Mega Satish** | [](https://github.com/msatmod) [](https://orcid.org/0000-0002-1844-9557) [](https://scholar.google.ca/citations?user=7Ajrr6EAAAAJ&hl=en) [](https://www.kaggle.com/megasatish) |\n",
|
| 29 |
+
"\n",
|
| 30 |
+
"---\n",
|
| 31 |
+
"\n",
|
| 32 |
+
"**Attribution:** This project builds upon the foundational work of [CorentinJ/Real-Time-Voice-Cloning](https://github.com/CorentinJ/Real-Time-Voice-Cloning).\n",
|
| 33 |
+
"\n",
|
| 34 |
+
"\ud83d\ude80 **Live Demo:** [Hugging Face Space](https://huggingface.co/spaces/ameythakur/Deepfake-Audio) | \ud83c\udfa5 **Video Demo:** [YouTube](https://youtu.be/i3wnBcbHDbs) | \ud83d\udcbb **Repository:** [GitHub](https://github.com/Amey-Thakur/DEEPFAKE-AUDIO)\n",
|
| 35 |
+
"\n",
|
| 36 |
+
"<a href=\"https://youtu.be/i3wnBcbHDbs\">\n",
|
| 37 |
+
" <img src=\"https://img.youtube.com/vi/i3wnBcbHDbs/0.jpg\" alt=\"Video Demo\" width=\"60%\">\n",
|
| 38 |
+
"</a>\n",
|
| 39 |
+
"\n",
|
| 40 |
+
"</div>\n",
|
| 41 |
+
"\n",
|
| 42 |
+
"## \ud83d\udcd6 Introduction\n",
|
| 43 |
+
"\n",
|
| 44 |
+
"> **An audio deepfake is when a \u201ccloned\u201d voice that is potentially indistinguishable from the real person\u2019s is used to produce synthetic audio.**\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"This research notebook demonstrates the **SV2TTS (Speaker Verification to Text-to-Speech)** framework, a three-stage deep learning pipeline capable of cloning a voice from a mere 5 seconds of audio.\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"### The Pipeline\n",
|
| 49 |
+
"1. **Speaker Encoder**: Creates a fixed-dimensional embedding (fingerprint) from the reference audio.\n",
|
| 50 |
+
"2. **Synthesizer**: Generates a Mel Spectrogram from text, conditioned on the speaker embedding.\n",
|
| 51 |
+
"3. **Vocoder**: Converts the Mel Spectrogram into a raw time-domain waveform (audible speech)."
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "markdown",
|
| 56 |
+
"metadata": {
|
| 57 |
+
"id": "4kb73A1rhXrE"
|
| 58 |
+
},
|
| 59 |
+
"source": [
|
| 60 |
+
"## \u2601\ufe0f Cloud Environment Setup\n",
|
| 61 |
+
"Execute the following cell **only** if you are running this notebook in a cloud environment like **Google Colab** or **Kaggle**.\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"This script will:\n",
|
| 64 |
+
"1. **Clone the Repository**: Tries GitHub first, then falls back to **Personal Hugging Face Space** (`ameythakur/Deepfake-Audio`) if GitHub fails.\n",
|
| 65 |
+
"2. **Environment Detection**: Automatically detects **Kaggle** vs **Colab**.\n",
|
| 66 |
+
"3. **Data Retrieval**:\n",
|
| 67 |
+
" * **Kaggle**: Links directly from `/kaggle/input/deepfakeaudio` (No download needed).\n",
|
| 68 |
+
" * **Others**: Attempts Git LFS pull.\n",
|
| 69 |
+
"4. **Fallback to Kagglehub**: If LFS budget exceeded, downloads from `ameythakur20/deepfakeaudio`.\n",
|
| 70 |
+
"5. Install all required Python and System dependencies."
|
| 71 |
+
]
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"cell_type": "code",
|
| 75 |
+
"execution_count": 1,
|
| 76 |
+
"metadata": {
|
| 77 |
+
"colab": {
|
| 78 |
+
"base_uri": "https://localhost:8080/"
|
| 79 |
+
},
|
| 80 |
+
"id": "bDzcxXrzhXrE",
|
| 81 |
+
"outputId": "060d68bb-0b5c-43a8-f0fd-119d6335404e"
|
| 82 |
+
},
|
| 83 |
+
"outputs": [
|
| 84 |
+
{
|
| 85 |
+
"output_type": "stream",
|
| 86 |
+
"name": "stdout",
|
| 87 |
+
"text": [
|
| 88 |
+
"\ud83d\udcbb Detected Google Colab Environment. Initiating setup...\n",
|
| 89 |
+
"\u2b07\ufe0f Cloning DEEPFAKE-AUDIO repository from MAIN (GitHub)...\n",
|
| 90 |
+
"Cloning into 'DEEPFAKE-AUDIO'...\n",
|
| 91 |
+
"remote: Enumerating objects: 682, done.\u001b[K\n",
|
| 92 |
+
"remote: Counting objects: 100% (62/62), done.\u001b[K\n",
|
| 93 |
+
"remote: Compressing objects: 100% (55/55), done.\u001b[K\n",
|
| 94 |
+
"remote: Total 682 (delta 7), reused 7 (delta 7), pack-reused 620 (from 1)\u001b[K\n",
|
| 95 |
+
"Receiving objects: 100% (682/682), 71.95 MiB | 12.37 MiB/s, done.\n",
|
| 96 |
+
"Resolving deltas: 100% (328/328), done.\n",
|
| 97 |
+
"Downloading Dataset/encoder.pt (17 MB)\n",
|
| 98 |
+
"Error downloading object: Dataset/encoder.pt (39373b8): Smudge error: Error downloading Dataset/encoder.pt (39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e): batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.\n",
|
| 99 |
+
"\n",
|
| 100 |
+
"Errors logged to /content/DEEPFAKE-AUDIO/.git/lfs/logs/20260129T113138.869344323.log\n",
|
| 101 |
+
"Use `git lfs logs last` to view the log.\n",
|
| 102 |
+
"error: external filter 'git-lfs filter-process' failed\n",
|
| 103 |
+
"fatal: Dataset/encoder.pt: smudge filter lfs failed\n",
|
| 104 |
+
"warning: Clone succeeded, but checkout failed.\n",
|
| 105 |
+
"You can inspect what was checked out with 'git status'\n",
|
| 106 |
+
"and retry with 'git restore --source=HEAD :/'\n",
|
| 107 |
+
"\n",
|
| 108 |
+
"\ud83d\udd27 Installing dependencies...\n",
|
| 109 |
+
"Reading package lists... Done\n",
|
| 110 |
+
"Building dependency tree... Done\n",
|
| 111 |
+
"Reading state information... Done\n",
|
| 112 |
+
"libsndfile1 is already the newest version (1.0.31-2ubuntu0.2).\n",
|
| 113 |
+
"0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.\n",
|
| 114 |
+
"\ud83d\udce6 Attempting Git LFS pull...\n",
|
| 115 |
+
"Updated git hooks.\n",
|
| 116 |
+
"Git LFS initialized.\n",
|
| 117 |
+
"Error updating the git index:\n",
|
| 118 |
+
"error: Source Code/encoder/__init__.py: cannot add to the index - missing --add option?\n",
|
| 119 |
+
"fatal: Unable to process path Source Code/encoder/__init__.py\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"Errors logged to /content/DEEPFAKE-AUDIO/.git/lfs/logs/20260129T113158.470476122.log\n",
|
| 123 |
+
"Use `git lfs logs last` to view the log.\n",
|
| 124 |
+
"batch response: This repository exceeded its LFS budget. The account responsible for the budget should increase it to restore access.\n",
|
| 125 |
+
"error: failed to fetch some objects from 'https://github.com/Amey-Thakur/DEEPFAKE-AUDIO.git/info/lfs'\n",
|
| 126 |
+
"\u26a0\ufe0f GitHub LFS Budget Exceeded or Pull Failed. Using Kaggle Fallback...\n",
|
| 127 |
+
"Requirement already satisfied: kagglehub in /usr/local/lib/python3.12/dist-packages (0.3.13)\n",
|
| 128 |
+
"Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from kagglehub) (25.0)\n",
|
| 129 |
+
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.12/dist-packages (from kagglehub) (6.0.3)\n",
|
| 130 |
+
"Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from kagglehub) (2.32.4)\n",
|
| 131 |
+
"Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from kagglehub) (4.67.1)\n",
|
| 132 |
+
"Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (3.4.4)\n",
|
| 133 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (3.11)\n",
|
| 134 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (2.5.0)\n",
|
| 135 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.12/dist-packages (from requests->kagglehub) (2026.1.4)\n",
|
| 136 |
+
"\ud83d\ude80 Downloading assets from Kagglehub (ameythakur20/deepfakeaudio)...\n",
|
| 137 |
+
"Warning: Looks like you're using an outdated `kagglehub` version (installed: 0.3.13), please consider upgrading to the latest version (0.4.1).\n",
|
| 138 |
+
"Downloading from https://www.kaggle.com/api/v1/datasets/download/ameythakur20/deepfakeaudio?dataset_version_number=5...\n"
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"output_type": "stream",
|
| 143 |
+
"name": "stderr",
|
| 144 |
+
"text": [
|
| 145 |
+
"100%|\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588| 550M/550M [00:25<00:00, 22.7MB/s]"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"output_type": "stream",
|
| 150 |
+
"name": "stdout",
|
| 151 |
+
"text": [
|
| 152 |
+
"Extracting files...\n"
|
| 153 |
+
]
|
| 154 |
+
},
|
| 155 |
+
{
|
| 156 |
+
"output_type": "stream",
|
| 157 |
+
"name": "stderr",
|
| 158 |
+
"text": [
|
| 159 |
+
"\n"
|
| 160 |
+
]
|
| 161 |
+
},
|
| 162 |
+
{
|
| 163 |
+
"output_type": "stream",
|
| 164 |
+
"name": "stdout",
|
| 165 |
+
"text": [
|
| 166 |
+
"\u2705 Samples linked from Kaggle.\n",
|
| 167 |
+
"\u2705 Models linked from Kaggle.\n"
|
| 168 |
+
]
|
| 169 |
+
}
|
| 170 |
+
],
|
| 171 |
+
"source": [
|
| 172 |
+
"import os\n",
|
| 173 |
+
"import sys\n",
|
| 174 |
+
"import shutil\n",
|
| 175 |
+
"\n",
|
| 176 |
+
"# Detect Cloud Environment (Colab/Kaggle)\n",
|
| 177 |
+
"try:\n",
|
| 178 |
+
" shell = get_ipython()\n",
|
| 179 |
+
" if 'google.colab' in str(shell):\n",
|
| 180 |
+
" print(\"\ud83d\udcbb Detected Google Colab Environment. Initiating setup...\")\n",
|
| 181 |
+
"\n",
|
| 182 |
+
" # 1. Clone the Repository (GitHub with HF Fallback)\n",
|
| 183 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\"):\n",
|
| 184 |
+
" print(\"\u2b07\ufe0f Cloning DEEPFAKE-AUDIO repository from MAIN (GitHub)...\")\n",
|
| 185 |
+
" clone_status = shell.system(\"git clone https://github.com/Amey-Thakur/DEEPFAKE-AUDIO\")\n",
|
| 186 |
+
"\n",
|
| 187 |
+
" # Fallback to Hugging Face if GitHub clone failed (folder empty or not created)\n",
|
| 188 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\") or not os.listdir(\"DEEPFAKE-AUDIO\"):\n",
|
| 189 |
+
" print(\"\u26a0\ufe0f GitHub Clone Failed. Attempting Fallback: Personal Hugging Face Space...\")\n",
|
| 190 |
+
" if os.path.exists(\"DEEPFAKE-AUDIO\"): shutil.rmtree(\"DEEPFAKE-AUDIO\")\n",
|
| 191 |
+
" shell.system(\"git clone https://huggingface.co/spaces/ameythakur/Deepfake-Audio DEEPFAKE-AUDIO\")\n",
|
| 192 |
+
" print(\"\u2705 Cloned from Hugging Face Space.\")\n",
|
| 193 |
+
"\n",
|
| 194 |
+
" os.chdir(\"/content/DEEPFAKE-AUDIO\")\n",
|
| 195 |
+
"\n",
|
| 196 |
+
" # Install Dependencies (Colab)\n",
|
| 197 |
+
" print(\"\ud83d\udd27 Installing dependencies...\")\n",
|
| 198 |
+
" shell.system(\"apt-get install -y libsndfile1\")\n",
|
| 199 |
+
" shell.system(\"pip install librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub\")\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"\n",
|
| 202 |
+
"\n",
|
| 203 |
+
" # 2. Attempt Git LFS (Colab)\n",
|
| 204 |
+
" print(\"\ud83d\udce6 Attempting Git LFS pull...\")\n",
|
| 205 |
+
" shell.system(\"git lfs install\")\n",
|
| 206 |
+
" lfs_status = shell.system(\"git lfs pull\")\n",
|
| 207 |
+
"\n",
|
| 208 |
+
" # 3. Check for Fallback (If LFS failed or budget exceeded)\n",
|
| 209 |
+
" sample_trigger = \"Dataset/samples/Steve Jobs.wav\"\n",
|
| 210 |
+
" if lfs_status != 0 or not os.path.exists(sample_trigger) or os.path.getsize(sample_trigger) < 1000:\n",
|
| 211 |
+
" print(\"\u26a0\ufe0f GitHub LFS Budget Exceeded or Pull Failed. Using Kaggle Fallback...\")\n",
|
| 212 |
+
" shell.system(\"pip install kagglehub\")\n",
|
| 213 |
+
" import kagglehub\n",
|
| 214 |
+
" print(\"\ud83d\ude80 Downloading assets from Kagglehub (ameythakur20/deepfakeaudio)...\")\n",
|
| 215 |
+
" k_path = kagglehub.dataset_download(\"ameythakur20/deepfakeaudio\")\n",
|
| 216 |
+
"\n",
|
| 217 |
+
" # Link/Copy samples\n",
|
| 218 |
+
" k_samples = os.path.join(k_path, \"samples\")\n",
|
| 219 |
+
" if os.path.exists(k_samples):\n",
|
| 220 |
+
" if os.path.exists(\"Dataset/samples\"):\n",
|
| 221 |
+
" shutil.rmtree(\"Dataset/samples\")\n",
|
| 222 |
+
" os.makedirs(\"Dataset\", exist_ok=True)\n",
|
| 223 |
+
" os.symlink(k_samples, \"Dataset/samples\")\n",
|
| 224 |
+
" print(\"\u2705 Samples linked from Kaggle.\")\n",
|
| 225 |
+
"\n",
|
| 226 |
+
" # Link/Copy models\n",
|
| 227 |
+
" for model in [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]:\n",
|
| 228 |
+
" k_model = os.path.join(k_path, model)\n",
|
| 229 |
+
" if os.path.exists(k_model):\n",
|
| 230 |
+
" target = os.path.join(\"Dataset\", model)\n",
|
| 231 |
+
" if os.path.exists(target): os.remove(target)\n",
|
| 232 |
+
" os.symlink(k_model, target)\n",
|
| 233 |
+
" print(\"\u2705 Models linked from Kaggle.\")\n",
|
| 234 |
+
"\n",
|
| 235 |
+
" elif \"kaggle\" in os.environ.get(\"KAGGLE_KERNEL_RUN_TYPE\", \"\"):\n",
|
| 236 |
+
" print(\"\ud83d\udcbb Detected Kaggle Environment. Initiating setup...\")\n",
|
| 237 |
+
" os.chdir(\"/kaggle/working\")\n",
|
| 238 |
+
"\n",
|
| 239 |
+
" # 1. Clone the Repository (GitHub with HF Fallback)\n",
|
| 240 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\"):\n",
|
| 241 |
+
" print(\"\u2b07\ufe0f Cloning DEEPFAKE-AUDIO repository from MAIN (GitHub)...\")\n",
|
| 242 |
+
" shell.system(\"git clone https://github.com/Amey-Thakur/DEEPFAKE-AUDIO\")\n",
|
| 243 |
+
"\n",
|
| 244 |
+
" if not os.path.exists(\"DEEPFAKE-AUDIO\") or not os.listdir(\"DEEPFAKE-AUDIO\"):\n",
|
| 245 |
+
" print(\"\u26a0\ufe0f GitHub Clone Failed. Attempting Fallback: Personal Hugging Face Space...\")\n",
|
| 246 |
+
" if os.path.exists(\"DEEPFAKE-AUDIO\"): shutil.rmtree(\"DEEPFAKE-AUDIO\")\n",
|
| 247 |
+
" shell.system(\"git clone https://huggingface.co/spaces/ameythakur/Deepfake-Audio DEEPFAKE-AUDIO\")\n",
|
| 248 |
+
" print(\"\u2705 Cloned from Hugging Face Space.\")\n",
|
| 249 |
+
"\n",
|
| 250 |
+
" os.chdir(\"/kaggle/working/DEEPFAKE-AUDIO\")\n",
|
| 251 |
+
"\n",
|
| 252 |
+
" # 2. Priority: Link Kaggle Dataset (Skip LFS pull if dataset exists)\n",
|
| 253 |
+
" kaggle_input = \"/kaggle/input/deepfakeaudio\"\n",
|
| 254 |
+
" if os.path.exists(kaggle_input):\n",
|
| 255 |
+
" print(f\"\u2705 Kaggle Dataset Detected at {kaggle_input}. Linking assets...\")\n",
|
| 256 |
+
" # Link logic specific to Kaggle structure\n",
|
| 257 |
+
" if os.path.exists(\"Dataset/samples\"):\n",
|
| 258 |
+
" shutil.rmtree(\"Dataset/samples\")\n",
|
| 259 |
+
" if not os.path.exists(\"Dataset\"):\n",
|
| 260 |
+
" os.makedirs(\"Dataset\")\n",
|
| 261 |
+
" # Attempt to symlink folder or copy items\n",
|
| 262 |
+
" try:\n",
|
| 263 |
+
" if os.path.exists(os.path.join(kaggle_input, \"samples\")):\n",
|
| 264 |
+
" os.symlink(os.path.join(kaggle_input, \"samples\"), \"Dataset/samples\")\n",
|
| 265 |
+
" for model in [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]:\n",
|
| 266 |
+
" src = os.path.join(kaggle_input, model)\n",
|
| 267 |
+
" dst = os.path.join(\"Dataset\", model)\n",
|
| 268 |
+
" if os.path.exists(src):\n",
|
| 269 |
+
" if os.path.exists(dst): os.remove(dst)\n",
|
| 270 |
+
" os.symlink(src, dst)\n",
|
| 271 |
+
" except Exception as e: print(f\"Warning during linking: {e}\")\n",
|
| 272 |
+
" print(\"\u2705 Assets linked from Kaggle Input.\")\n",
|
| 273 |
+
" else:\n",
|
| 274 |
+
" print(\"\u26a0\ufe0f Kaggle Input not found. Attempting standard LFS pull...\")\n",
|
| 275 |
+
" shell.system(\"git lfs install\")\n",
|
| 276 |
+
" shell.system(\"git lfs pull\")\n",
|
| 277 |
+
"\n",
|
| 278 |
+
" # Install Dependencies\n",
|
| 279 |
+
" print(\"\ud83d\udd27 Installing dependencies...\")\n",
|
| 280 |
+
" shell.system(\"apt-get install -y libsndfile1\")\n",
|
| 281 |
+
" shell.system(\"pip install librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub\")\n",
|
| 282 |
+
"\n",
|
| 283 |
+
" # 2. Attempt Git LFS\n",
|
| 284 |
+
" print(\"\ud83d\udce6 Attempting Git LFS pull...\")\n",
|
| 285 |
+
" shell.system(\"git lfs install\")\n",
|
| 286 |
+
" lfs_status = shell.system(\"git lfs pull\")\n",
|
| 287 |
+
"\n",
|
| 288 |
+
" # 3. Check for Fallback (If LFS failed or budget exceeded)\n",
|
| 289 |
+
" # Detection: If samples folder is empty or contains small pointer files\n",
|
| 290 |
+
" sample_trigger = \"Dataset/samples/Steve Jobs.wav\"\n",
|
| 291 |
+
" if lfs_status != 0 or not os.path.exists(sample_trigger) or os.path.getsize(sample_trigger) < 1000:\n",
|
| 292 |
+
" print(\"\u26a0\ufe0f GitHub LFS Budget Exceeded or Pull Failed. Using Kaggle Fallback...\")\n",
|
| 293 |
+
" shell.system(\"pip install kagglehub\")\n",
|
| 294 |
+
" import kagglehub\n",
|
| 295 |
+
"\n",
|
| 296 |
+
" # Pull from public Kaggle dataset\n",
|
| 297 |
+
" print(\"\ud83d\ude80 Downloading assets from Kagglehub (ameythakur20/deepfakeaudio)...\")\n",
|
| 298 |
+
" k_path = kagglehub.dataset_download(\"ameythakur20/deepfakeaudio\")\n",
|
| 299 |
+
"\n",
|
| 300 |
+
" # Link/Copy samples\n",
|
| 301 |
+
" k_samples = os.path.join(k_path, \"samples\")\n",
|
| 302 |
+
" if os.path.exists(k_samples):\n",
|
| 303 |
+
" if os.path.exists(\"Dataset/samples\"):\n",
|
| 304 |
+
" shutil.rmtree(\"Dataset/samples\")\n",
|
| 305 |
+
" os.makedirs(\"Dataset\", exist_ok=True)\n",
|
| 306 |
+
" os.symlink(k_samples, \"Dataset/samples\")\n",
|
| 307 |
+
" print(\"\u2705 Samples linked from Kaggle.\")\n",
|
| 308 |
+
"\n",
|
| 309 |
+
" # Link/Copy models\n",
|
| 310 |
+
" for model in [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]:\n",
|
| 311 |
+
" k_model = os.path.join(k_path, model)\n",
|
| 312 |
+
" if os.path.exists(k_model):\n",
|
| 313 |
+
" target = os.path.join(\"Dataset\", model)\n",
|
| 314 |
+
" if os.path.exists(target): os.remove(target)\n",
|
| 315 |
+
" os.symlink(k_model, target)\n",
|
| 316 |
+
" print(\"\u2705 Models linked from Kaggle.\")\n",
|
| 317 |
+
"\n",
|
| 318 |
+
" # 4. Pull Latest Code Changes\n",
|
| 319 |
+
" print(\"\ud83d\udd04 Synchronizing with remote repository...\")\n",
|
| 320 |
+
" shell.system(\"git pull\")\n",
|
| 321 |
+
"\n",
|
| 322 |
+
" # 5. Install System Dependencies\n",
|
| 323 |
+
" print(\"\ud83d\udd27 Installing system dependencies (libsndfile1)...\")\n",
|
| 324 |
+
" shell.system(\"apt-get install -y libsndfile1\")\n",
|
| 325 |
+
"\n",
|
| 326 |
+
" # 6. Install Python Dependencies\n",
|
| 327 |
+
" print(\"\ud83d\udce6 Installing Python libraries...\")\n",
|
| 328 |
+
" shell.system(\"pip install librosa==0.9.2 unidecode webrtcvad inflect umap-learn scikit-learn>=1.3 tqdm scipy 'matplotlib>=3.7,<3.9' Pillow>=10.2 soundfile huggingface_hub\")\n",
|
| 329 |
+
"\n",
|
| 330 |
+
" print(\"\u2705 Environment setup complete. Ready for cloning.\")\n",
|
| 331 |
+
" else:\n",
|
| 332 |
+
" print(\"\ud83c\udfe0 Running in local or custom environment. Skipping cloud setup.\")\n",
|
| 333 |
+
"except NameError:\n",
|
| 334 |
+
" print(\"\ud83c\udfe0 Running in local or custom environment. Skipping cloud setup.\")"
|
| 335 |
+
]
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"cell_type": "markdown",
|
| 339 |
+
"metadata": {
|
| 340 |
+
"id": "3sJUw_G8hXrG"
|
| 341 |
+
},
|
| 342 |
+
"source": [
|
| 343 |
+
"## 1\ufe0f\u20e3 Model & Data Initialization\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"We prioritize data availability to ensure the notebook runs smoothly regardless of the platform. The system checks for checkpoints in this order:\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"1. **Repository Local** (`Dataset/` / `Source Code/`): Fast local access if cloned.\n",
|
| 348 |
+
"2. **Kaggle Dataset** (`/kaggle/input/deepfakeaudio/`): Pre-loaded environment data.\n",
|
| 349 |
+
" * *Reference*: [Amey Thakur's Kaggle Dataset](https://www.kaggle.com/datasets/ameythakur20/deepfakeaudio)\n",
|
| 350 |
+
"3. **Personal Backup** (Hugging Face Space): `ameythakur/Deepfake-Audio`.\n",
|
| 351 |
+
" * *Reference*: [Amey Thakur's HF Space](https://huggingface.co/spaces/ameythakur/Deepfake-Audio)\n",
|
| 352 |
+
"4. **HuggingFace Auto-Download**: Robust fallback for fresh environments.\n",
|
| 353 |
+
" * *Reference*: [CorentinJ's SV2TTS Repository](https://huggingface.co/CorentinJ/SV2TTS)"
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"cell_type": "code",
|
| 358 |
+
"execution_count": 2,
|
| 359 |
+
"metadata": {
|
| 360 |
+
"colab": {
|
| 361 |
+
"base_uri": "https://localhost:8080/"
|
| 362 |
+
},
|
| 363 |
+
"id": "bY79_5WrhXrH",
|
| 364 |
+
"outputId": "c1767603-817a-4f2c-8ce9-a09f2436bc5d"
|
| 365 |
+
},
|
| 366 |
+
"outputs": [
|
| 367 |
+
{
|
| 368 |
+
"output_type": "stream",
|
| 369 |
+
"name": "stdout",
|
| 370 |
+
"text": [
|
| 371 |
+
"\ud83d\udcc2 Working Directory: /content/DEEPFAKE-AUDIO\n",
|
| 372 |
+
"\u2705 Module Path Registered: /content/DEEPFAKE-AUDIO/Source Code\n",
|
| 373 |
+
"\u2b07\ufe0f Verifying Model Availability...\n",
|
| 374 |
+
"\u2705 Found high-priority local models in 'Dataset/'. Verified.\n"
|
| 375 |
+
]
|
| 376 |
+
}
|
| 377 |
+
],
|
| 378 |
+
"source": [
|
| 379 |
+
"import sys\n",
|
| 380 |
+
"import os\n",
|
| 381 |
+
"from pathlib import Path\n",
|
| 382 |
+
"import zipfile\n",
|
| 383 |
+
"import shutil\n",
|
| 384 |
+
"\n",
|
| 385 |
+
"# Determine if running in Google Colab\n",
|
| 386 |
+
"IS_COLAB = 'google.colab' in sys.modules\n",
|
| 387 |
+
"\n",
|
| 388 |
+
"# Register 'Source Code' to Python path for module imports\n",
|
| 389 |
+
"source_path = os.path.abspath(\"Source Code\")\n",
|
| 390 |
+
"if source_path not in sys.path:\n",
|
| 391 |
+
" sys.path.append(source_path)\n",
|
| 392 |
+
"\n",
|
| 393 |
+
"print(f\"\ud83d\udcc2 Working Directory: {os.getcwd()}\")\n",
|
| 394 |
+
"print(f\"\u2705 Module Path Registered: {source_path}\")\n",
|
| 395 |
+
"\n",
|
| 396 |
+
"# Define paths for model checkpoints\n",
|
| 397 |
+
"extract_path = \"pretrained_models\"\n",
|
| 398 |
+
"\n",
|
| 399 |
+
"if not os.path.exists(extract_path):\n",
|
| 400 |
+
" os.makedirs(extract_path)\n",
|
| 401 |
+
"\n",
|
| 402 |
+
"# --- \ud83e\udde0 Checkpoint Verification Strategy ---\n",
|
| 403 |
+
"print(\"\u2b07\ufe0f Verifying Model Availability...\")\n",
|
| 404 |
+
"\n",
|
| 405 |
+
"# Priority 1: Check Local Repository 'Dataset/' folder\n",
|
| 406 |
+
"core_models = [\"encoder.pt\", \"synthesizer.pt\", \"vocoder.pt\"]\n",
|
| 407 |
+
"\n",
|
| 408 |
+
"def is_valid_pt(p):\n",
|
| 409 |
+
" \"\"\"Checks if a file exists and is not an LFS pointer (typically < 1KB).\"\"\"\n",
|
| 410 |
+
" return os.path.exists(p) and os.path.getsize(p) > 1000\n",
|
| 411 |
+
"\n",
|
| 412 |
+
"dataset_models_present = all([is_valid_pt(os.path.join(\"Dataset\", m)) for m in core_models])\n",
|
| 413 |
+
"\n",
|
| 414 |
+
"if dataset_models_present:\n",
|
| 415 |
+
" print(\"\u2705 Found high-priority local models in 'Dataset/'. Verified.\")\n",
|
| 416 |
+
"else:\n",
|
| 417 |
+
" # Priority 2: Check Kaggle Dataset (Online Pre-loaded environment data)\n",
|
| 418 |
+
" kaggle_path = \"/kaggle/input/deepfakeaudio\"\n",
|
| 419 |
+
" kaggle_models_present = all([is_valid_pt(os.path.join(kaggle_path, m)) for m in core_models])\n",
|
| 420 |
+
"\n",
|
| 421 |
+
" if kaggle_models_present:\n",
|
| 422 |
+
" print(f\"\u2705 Found hardcoded Kaggle Dataset models at {kaggle_path}. Skipping download.\")\n",
|
| 423 |
+
" else:\n",
|
| 424 |
+
" print(\"\u26a0\ufe0f Models not found or are LFS pointers. Attempting fallback download...\")\n",
|
| 425 |
+
"\n",
|
| 426 |
+
" # Priority 3: Personal Hugging Face Space (ameythakur/Deepfake-Audio)\n",
|
| 427 |
+
" personal_hf_success = False\n",
|
| 428 |
+
" try:\n",
|
| 429 |
+
" print(\"\ud83d\ude80 Attempting download from Personal Hugging Face Space (ameythakur/Deepfake-Audio)...\")\n",
|
| 430 |
+
" from huggingface_hub import hf_hub_download\n",
|
| 431 |
+
" os.makedirs(\"pretrained_models\", exist_ok=True)\n",
|
| 432 |
+
" for model in core_models:\n",
|
| 433 |
+
" try:\n",
|
| 434 |
+
" fpath = hf_hub_download(repo_id=\"ameythakur/Deepfake-Audio\", filename=f\"Dataset/{model}\", repo_type=\"space\", local_dir=\"pretrained_models\")\n",
|
| 435 |
+
" except:\n",
|
| 436 |
+
" fpath = hf_hub_download(repo_id=\"ameythakur/Deepfake-Audio\", filename=model, repo_type=\"space\", local_dir=\"pretrained_models\")\n",
|
| 437 |
+
" target = os.path.join(\"pretrained_models\", model)\n",
|
| 438 |
+
" if fpath != target and os.path.exists(fpath): shutil.move(fpath, target)\n",
|
| 439 |
+
" if os.path.exists(os.path.join(\"pretrained_models\", \"Dataset\")): shutil.rmtree(os.path.join(\"pretrained_models\", \"Dataset\"))\n",
|
| 440 |
+
" print(\"\u2705 Models successfully acquired via Personal Hugging Face fallback.\")\n",
|
| 441 |
+
" personal_hf_success = True\n",
|
| 442 |
+
" except Exception as e_hf:\n",
|
| 443 |
+
" print(f\"\u26a0\ufe0f Personal HF Checkpoint failed: {e_hf}. Trying External Fallback...\")\n",
|
| 444 |
+
"\n",
|
| 445 |
+
" # Priority 4 (Fallback): Auto-download from HuggingFace via utils script\n",
|
| 446 |
+
" if not personal_hf_success:\n",
|
| 447 |
+
" try:\n",
|
| 448 |
+
" from utils.default_models import ensure_default_models\n",
|
| 449 |
+
" ensure_default_models(Path(\"pretrained_models\"))\n",
|
| 450 |
+
" print(\"\u2705 Models successfully acquired via External HuggingFace fallback.\")\n",
|
| 451 |
+
" except Exception as e:\n",
|
| 452 |
+
" print(f\"\u26a0\ufe0f Critical: Could not auto-download models. Error: {e}\")"
|
| 453 |
+
]
|
| 454 |
+
},
|
| 455 |
+
{
|
| 456 |
+
"cell_type": "markdown",
|
| 457 |
+
"metadata": {
|
| 458 |
+
"id": "ycvMxmYJhXrI"
|
| 459 |
+
},
|
| 460 |
+
"source": [
|
| 461 |
+
"## 2\ufe0f\u20e3 Architecture Loading\n",
|
| 462 |
+
"\n",
|
| 463 |
+
"We now initialize the three distinct neural networks that comprise the SV2TTS framework. Please ensure you are running on a **GPU Runtime** (e.g., T4 on Colab) for optimal performance."
|
| 464 |
+
]
|
| 465 |
+
},
|
| 466 |
+
{
|
| 467 |
+
"cell_type": "code",
|
| 468 |
+
"execution_count": 3,
|
| 469 |
+
"metadata": {
|
| 470 |
+
"colab": {
|
| 471 |
+
"base_uri": "https://localhost:8080/"
|
| 472 |
+
},
|
| 473 |
+
"id": "yEShAzbfhXrI",
|
| 474 |
+
"outputId": "9d2ce872-19a3-4259-a10b-8afebb394b28"
|
| 475 |
+
},
|
| 476 |
+
"outputs": [
|
| 477 |
+
{
|
| 478 |
+
"output_type": "stream",
|
| 479 |
+
"name": "stdout",
|
| 480 |
+
"text": [
|
| 481 |
+
"\ud83c\udfaf Computation Device: cuda\n",
|
| 482 |
+
"\u23f3 Loading Neural Networks (SV2TTS Pipeline)...\n",
|
| 483 |
+
"\ud83d\udfe2 Loading Encoder from Repository: Dataset/encoder.pt\n",
|
| 484 |
+
"Loaded encoder \"encoder.pt\" trained to step 1564501\n",
|
| 485 |
+
"\ud83d\udfe2 Loading Synthesizer from Repository: Dataset/synthesizer.pt\n",
|
| 486 |
+
"Synthesizer using device: cuda\n",
|
| 487 |
+
"\ud83d\udfe2 Loading Vocoder from Repository: Dataset/vocoder.pt\n",
|
| 488 |
+
"Building Wave-RNN\n",
|
| 489 |
+
"Trainable Parameters: 4.481M\n",
|
| 490 |
+
"Loading model weights at Dataset/vocoder.pt\n",
|
| 491 |
+
"\u2705 Pipeline operational. All components loaded correctly.\n"
|
| 492 |
+
]
|
| 493 |
+
}
|
| 494 |
+
],
|
| 495 |
+
"source": [
|
| 496 |
+
"from encoder import inference as encoder\n",
|
| 497 |
+
"from synthesizer.inference import Synthesizer\n",
|
| 498 |
+
"from vocoder import inference as vocoder\n",
|
| 499 |
+
"import numpy as np\n",
|
| 500 |
+
"import torch\n",
|
| 501 |
+
"from pathlib import Path\n",
|
| 502 |
+
"\n",
|
| 503 |
+
"# Hardware Acceleration Check\n",
|
| 504 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 505 |
+
"print(f\"\ud83c\udfaf Computation Device: {device}\")\n",
|
| 506 |
+
"\n",
|
| 507 |
+
"def resolve_checkpoint(component_name, legacy_path_suffix):\n",
|
| 508 |
+
" \"\"\"\n",
|
| 509 |
+
" Intelligently resolves the path to model checkpoints based on priority.\n",
|
| 510 |
+
" 1. Repository /Dataset/ folder.\n",
|
| 511 |
+
" 2. Kaggle Input directory (Hardcoded: /kaggle/input/deepfakeaudio/).\n",
|
| 512 |
+
" 3. Auto-downloaded 'pretrained_models'.\n",
|
| 513 |
+
" \"\"\"\n",
|
| 514 |
+
"\n",
|
| 515 |
+
" def is_valid(p):\n",
|
| 516 |
+
" return p.exists() and p.stat().st_size > 1000\n",
|
| 517 |
+
"\n",
|
| 518 |
+
" # 1. Repository Local (Dataset/)\n",
|
| 519 |
+
" dataset_p = Path(\"Dataset\") / f\"{component_name.lower()}.pt\"\n",
|
| 520 |
+
" if is_valid(dataset_p):\n",
|
| 521 |
+
" print(f\"\ud83d\udfe2 Loading {component_name} from Repository: {dataset_p}\")\n",
|
| 522 |
+
" return dataset_p\n",
|
| 523 |
+
"\n",
|
| 524 |
+
" # 2. Kaggle Environment (Hardcoded Path: /kaggle/input/deepfakeaudio/)\n",
|
| 525 |
+
" kaggle_p = Path(\"/kaggle/input/deepfakeaudio\") / f\"{component_name.lower()}.pt\"\n",
|
| 526 |
+
" if is_valid(kaggle_p):\n",
|
| 527 |
+
" print(f\"\ud83d\udfe2 Loading {component_name} from Kaggle: {kaggle_p}\")\n",
|
| 528 |
+
" return kaggle_p\n",
|
| 529 |
+
"\n",
|
| 530 |
+
" # 3. Default / Auto-Downloaded Fallback\n",
|
| 531 |
+
" default_p = Path(\"pretrained_models/default\") / f\"{component_name.lower()}.pt\"\n",
|
| 532 |
+
" if is_valid(default_p):\n",
|
| 533 |
+
" print(f\"\ud83d\udfe2 Loading {component_name} from Fallback: {default_p}\")\n",
|
| 534 |
+
" return default_p\n",
|
| 535 |
+
"\n",
|
| 536 |
+
" # 4. Legacy/Manual Paths\n",
|
| 537 |
+
" legacy_p = Path(\"pretrained_models\") / legacy_path_suffix\n",
|
| 538 |
+
" if legacy_p.exists():\n",
|
| 539 |
+
" if legacy_p.is_dir():\n",
|
| 540 |
+
" pts = [f for f in legacy_p.glob(\"*.pt\") if is_valid(f)]\n",
|
| 541 |
+
" if pts: return pts[0]\n",
|
| 542 |
+
" pts_rec = [f for f in legacy_p.rglob(\"*.pt\") if is_valid(f)]\n",
|
| 543 |
+
" if pts_rec: return pts_rec[0]\n",
|
| 544 |
+
" elif is_valid(legacy_p):\n",
|
| 545 |
+
" return legacy_p\n",
|
| 546 |
+
"\n",
|
| 547 |
+
" print(f'\u26a0\ufe0f Warning: {component_name} checkpoint not found! Falling back to dynamic search...')\n",
|
| 548 |
+
" return None\n",
|
| 549 |
+
"\n",
|
| 550 |
+
"print(\"\u23f3 Loading Neural Networks (SV2TTS Pipeline)...\")\n",
|
| 551 |
+
"\n",
|
| 552 |
+
"try:\n",
|
| 553 |
+
" # 1. Encoder: Extract speaker embedding\n",
|
| 554 |
+
" encoder_path = resolve_checkpoint(\"Encoder\", \"encoder/saved_models\")\n",
|
| 555 |
+
" encoder.load_model(encoder_path)\n",
|
| 556 |
+
"\n",
|
| 557 |
+
" # 2. Synthesizer: Generates spectrograms from text\n",
|
| 558 |
+
" synth_path = resolve_checkpoint(\"Synthesizer\", \"synthesizer/saved_models/logs-pretrained/taco_pretrained\")\n",
|
| 559 |
+
" synthesizer = Synthesizer(synth_path)\n",
|
| 560 |
+
"\n",
|
| 561 |
+
" # 3. Vocoder: Converts spectrograms to audio waveforms\n",
|
| 562 |
+
" vocoder_path = resolve_checkpoint(\"Vocoder\", \"vocoder/saved_models/pretrained\")\n",
|
| 563 |
+
" vocoder.load_model(vocoder_path)\n",
|
| 564 |
+
"\n",
|
| 565 |
+
" print(\"\u2705 Pipeline operational. All components loaded correctly.\")\n",
|
| 566 |
+
"except Exception as e:\n",
|
| 567 |
+
" print(f\"\u274c Architecture Error: {e}\")"
|
| 568 |
+
]
|
| 569 |
+
},
|
| 570 |
+
{
|
| 571 |
+
"cell_type": "markdown",
|
| 572 |
+
"metadata": {
|
| 573 |
+
"id": "GjaGAUyVhXrJ"
|
| 574 |
+
},
|
| 575 |
+
"source": [
|
| 576 |
+
"## 3\ufe0f\u20e3 Inference Interface\n",
|
| 577 |
+
"\n",
|
| 578 |
+
"Select your **Input Method** below to begin cloning.\n",
|
| 579 |
+
"\n",
|
| 580 |
+
"* **Presets**: Choose from a high-quality list of celebrity samples.\n",
|
| 581 |
+
"* **Upload**: Use your own `.wav` or `.mp3` file (5-10 seconds recommended).\n",
|
| 582 |
+
"* **Record**: Capture your voice directly in the browser (Colab only)."
|
| 583 |
+
]
|
| 584 |
+
},
|
| 585 |
+
{
|
| 586 |
+
"cell_type": "code",
|
| 587 |
+
"execution_count": 4,
|
| 588 |
+
"metadata": {
|
| 589 |
+
"colab": {
|
| 590 |
+
"base_uri": "https://localhost:8080/",
|
| 591 |
+
"height": 1000,
|
| 592 |
+
"referenced_widgets": [
|
| 593 |
+
"17941275da2c4712a8f7aeee24e985dc",
|
| 594 |
+
"d35d66d0fc6f4956a22315288f75761e",
|
| 595 |
+
"e5738978acde489493cc00b7580b41bf",
|
| 596 |
+
"a1c213e8e9ed4e6a80611f91e37c16ee",
|
| 597 |
+
"2a6b69d651c24930b892c0bb52e9d9e4",
|
| 598 |
+
"39339f0b568c4d5084de7664d336ba50",
|
| 599 |
+
"0923c7a8ae574c4fa2c9eb8c25076b95",
|
| 600 |
+
"b0a46e803d84423cacb55e37ae6c043b",
|
| 601 |
+
"bb70b9f36452494da95ee1033601218f",
|
| 602 |
+
"f3008160ab52453d932a0cb62c711eee",
|
| 603 |
+
"f035a503fafe48038036bdd50bd07b1c",
|
| 604 |
+
"dac9a6ef6db347e08b2268c5e5654008",
|
| 605 |
+
"779855519a5844d3bcc1ed69a8c44608",
|
| 606 |
+
"05f5b43427fe47fbae468c4cec2be72c",
|
| 607 |
+
"98051b53e539482aadf5d4b132dd3021",
|
| 608 |
+
"3ed45fa49669481dbee831dfb2f6ebeb",
|
| 609 |
+
"9d9b4954b5a74bf2b2f0fef974564bda",
|
| 610 |
+
"3753f04d0dbb48a6a9a5c5589425ad91",
|
| 611 |
+
"c294dc1d01e3421097eefe7eb2e5e377",
|
| 612 |
+
"de81a54ce89a4aaeaba52f01c571b212",
|
| 613 |
+
"7dbdc18b94a54cb6a492c7f33a661a56",
|
| 614 |
+
"b9b8b1f96c3d4b89a9bc3e32d6076ad3",
|
| 615 |
+
"91e19a809309432995d435aeba1e8bde",
|
| 616 |
+
"c51b8d7f054b41039f354fce639d6947",
|
| 617 |
+
"20d72122af6f449c90d53871c5a29b7d",
|
| 618 |
+
"f73ef86eeaab452eb11e45f1e17464a8",
|
| 619 |
+
"edf2d4f03fae4c5da75d0e3e1ad60382"
|
| 620 |
+
]
|
| 621 |
+
},
|
| 622 |
+
"id": "3wq8NsHohXrJ",
|
| 623 |
+
"outputId": "b92ea5cd-958b-4f83-9c48-1726f5f821a1"
|
| 624 |
+
},
|
| 625 |
+
"outputs": [
|
| 626 |
+
{
|
| 627 |
+
"output_type": "stream",
|
| 628 |
+
"name": "stdout",
|
| 629 |
+
"text": [
|
| 630 |
+
"Select Input Method:\n",
|
| 631 |
+
"\u2705 Samples located at: /content/DEEPFAKE-AUDIO/Dataset/samples\n"
|
| 632 |
+
]
|
| 633 |
+
},
|
| 634 |
+
{
|
| 635 |
+
"output_type": "display_data",
|
| 636 |
+
"data": {
|
| 637 |
+
"text/plain": [
|
| 638 |
+
"Tab(children=(VBox(children=(Dropdown(description='Preset:', options=('Donald Trump.wav', 'Steve Jobs.wav', 'A\u2026"
|
| 639 |
+
],
|
| 640 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 641 |
+
"version_major": 2,
|
| 642 |
+
"version_minor": 0,
|
| 643 |
+
"model_id": "17941275da2c4712a8f7aeee24e985dc"
|
| 644 |
+
}
|
| 645 |
+
},
|
| 646 |
+
"metadata": {}
|
| 647 |
+
},
|
| 648 |
+
{
|
| 649 |
+
"output_type": "display_data",
|
| 650 |
+
"data": {
|
| 651 |
+
"text/plain": [
|
| 652 |
+
"Textarea(value=\"Hello, I'm Elon Musk. Welcome to Deepfake Audio by Amey Thakur and Mega Satish. Explore AI voi\u2026"
|
| 653 |
+
],
|
| 654 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 655 |
+
"version_major": 2,
|
| 656 |
+
"version_minor": 0,
|
| 657 |
+
"model_id": "de81a54ce89a4aaeaba52f01c571b212"
|
| 658 |
+
}
|
| 659 |
+
},
|
| 660 |
+
"metadata": {}
|
| 661 |
+
},
|
| 662 |
+
{
|
| 663 |
+
"output_type": "display_data",
|
| 664 |
+
"data": {
|
| 665 |
+
"text/plain": [
|
| 666 |
+
"Button(button_style='primary', description='Clone Voice! \ud83d\ude80', style=ButtonStyle())"
|
| 667 |
+
],
|
| 668 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 669 |
+
"version_major": 2,
|
| 670 |
+
"version_minor": 0,
|
| 671 |
+
"model_id": "91e19a809309432995d435aeba1e8bde"
|
| 672 |
+
}
|
| 673 |
+
},
|
| 674 |
+
"metadata": {}
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"output_type": "display_data",
|
| 678 |
+
"data": {
|
| 679 |
+
"text/plain": [
|
| 680 |
+
"Output()"
|
| 681 |
+
],
|
| 682 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 683 |
+
"version_major": 2,
|
| 684 |
+
"version_minor": 0,
|
| 685 |
+
"model_id": "f73ef86eeaab452eb11e45f1e17464a8"
|
| 686 |
+
}
|
| 687 |
+
},
|
| 688 |
+
"metadata": {}
|
| 689 |
+
}
|
| 690 |
+
],
|
| 691 |
+
"source": [
|
| 692 |
+
"import ipywidgets as widgets\n",
|
| 693 |
+
"from IPython.display import display, Javascript, Audio\n",
|
| 694 |
+
"try:\n",
|
| 695 |
+
" from google.colab import output\n",
|
| 696 |
+
" HAS_COLAB = True\n",
|
| 697 |
+
"except ImportError:\n",
|
| 698 |
+
" HAS_COLAB = False\n",
|
| 699 |
+
"from base64 import b64decode\n",
|
| 700 |
+
"import io\n",
|
| 701 |
+
"import librosa\n",
|
| 702 |
+
"import librosa.display\n",
|
| 703 |
+
"import os\n",
|
| 704 |
+
"import soundfile as sf\n",
|
| 705 |
+
"import matplotlib.pyplot as plt\n",
|
| 706 |
+
"import numpy as np\n",
|
| 707 |
+
"import glob\n",
|
| 708 |
+
"\n",
|
| 709 |
+
"RECORD = \"\"\"\n",
|
| 710 |
+
"const sleep = time => new Promise(resolve => setTimeout(resolve, time))\n",
|
| 711 |
+
"const b2text = blob => new Promise(resolve => {\n",
|
| 712 |
+
" const reader = new FileReader()\n",
|
| 713 |
+
" reader.onloadend = e => resolve(e.srcElement.result)\n",
|
| 714 |
+
" reader.readAsDataURL(blob)\n",
|
| 715 |
+
"})\n",
|
| 716 |
+
"var record = time => new Promise(async resolve => {\n",
|
| 717 |
+
" stream = await navigator.mediaDevices.getUserMedia({ audio: true })\n",
|
| 718 |
+
" recorder = new MediaRecorder(stream)\n",
|
| 719 |
+
" chunks = []\n",
|
| 720 |
+
" recorder.ondataavailable = e => chunks.push(e.data)\n",
|
| 721 |
+
" recorder.start()\n",
|
| 722 |
+
" await sleep(time)\n",
|
| 723 |
+
" recorder.onstop = async ()=>{\n",
|
| 724 |
+
" blob = new Blob(chunks)\n",
|
| 725 |
+
" text = await b2text(blob)\n",
|
| 726 |
+
" resolve(text)\n",
|
| 727 |
+
" }\n",
|
| 728 |
+
" recorder.stop()\n",
|
| 729 |
+
"})\"\"\"\n",
|
| 730 |
+
"\n",
|
| 731 |
+
"def record_audio(sec=10):\n",
|
| 732 |
+
" if not HAS_COLAB:\n",
|
| 733 |
+
" raise RuntimeError(\"Recording is only available in a Google Colab environment.\")\n",
|
| 734 |
+
" print(\"\ud83d\udd34 Recording active for %d seconds...\" % sec)\n",
|
| 735 |
+
" display(Javascript(RECORD))\n",
|
| 736 |
+
" s = output.eval_js('record(%d)' % (sec*1000))\n",
|
| 737 |
+
" print(\"\u2705 Recording saved.\")\n",
|
| 738 |
+
" binary = b64decode(s.split(',')[1])\n",
|
| 739 |
+
" with open('recording.wav', 'wb') as f:\n",
|
| 740 |
+
" f.write(binary)\n",
|
| 741 |
+
" return 'recording.wav'\n",
|
| 742 |
+
"\n",
|
| 743 |
+
"def visualize_results(original_wav, generated_wav, spec, embed, title=\"Analysis\"):\n",
|
| 744 |
+
" try:\n",
|
| 745 |
+
" fig, axes = plt.subplots(3, 1, figsize=(10, 12))\n",
|
| 746 |
+
" axes[0].set_title(\"Input Voice vs. Cloned Voice (Waveform)\")\n",
|
| 747 |
+
" try:\n",
|
| 748 |
+
" librosa.display.waveshow(original_wav, alpha=0.5, ax=axes[0], label=\"Original\")\n",
|
| 749 |
+
" librosa.display.waveshow(generated_wav, alpha=0.5, ax=axes[0], label=\"Cloned\", color='r')\n",
|
| 750 |
+
" axes[0].legend()\n",
|
| 751 |
+
" except:\n",
|
| 752 |
+
" axes[0].plot(original_wav, alpha=0.5, label=\"Original\")\n",
|
| 753 |
+
" axes[0].plot(generated_wav, alpha=0.5, label=\"Cloned\", color='r')\n",
|
| 754 |
+
" axes[0].legend()\n",
|
| 755 |
+
"\n",
|
| 756 |
+
" axes[1].set_title(\"Generated Mel Spectrogram\")\n",
|
| 757 |
+
" im = axes[1].imshow(spec, aspect=\"auto\", origin=\"lower\", interpolation='none')\n",
|
| 758 |
+
" fig.colorbar(im, ax=axes[1])\n",
|
| 759 |
+
"\n",
|
| 760 |
+
" axes[2].set_title(\"Speaker Embedding (256-D Heatmap)\")\n",
|
| 761 |
+
" if len(embed) == 256:\n",
|
| 762 |
+
" axes[2].imshow(embed.reshape(16, 16), aspect='auto', cmap='viridis')\n",
|
| 763 |
+
" else:\n",
|
| 764 |
+
" axes[2].plot(embed)\n",
|
| 765 |
+
"\n",
|
| 766 |
+
" plt.tight_layout()\n",
|
| 767 |
+
" plt.show()\n",
|
| 768 |
+
" except Exception as e:\n",
|
| 769 |
+
" print(f\"\u26a0\ufe0f Graphs partially failed: {e}. Audio was successful.\")\n",
|
| 770 |
+
"\n",
|
| 771 |
+
"# --- \ud83d\udee1\ufe0f IMPROVED SAMPLE DISCOVERY ---\n",
|
| 772 |
+
"def find_samples_dir():\n",
|
| 773 |
+
" \"\"\"Locates reference samples with high persistence across all environments.\"\"\"\n",
|
| 774 |
+
" # Priority paths\n",
|
| 775 |
+
" priority_roots = [\n",
|
| 776 |
+
" \"Source Code/samples\",\n",
|
| 777 |
+
" \"Dataset/samples\",\n",
|
| 778 |
+
" \"D:/GitHub/DEEPFAKE-AUDIO/Source Code/samples\",\n",
|
| 779 |
+
" \"D:/GitHub/DEEPFAKE-AUDIO/Dataset/samples\",\n",
|
| 780 |
+
" \"/content/DEEPFAKE-AUDIO/Source Code/samples\",\n",
|
| 781 |
+
" \"/kaggle/input/deepfakeaudio/samples\",\n",
|
| 782 |
+
" \"/kaggle/input/deepfakeaudio\"\n",
|
| 783 |
+
" ]\n",
|
| 784 |
+
"\n",
|
| 785 |
+
" def filter_real_audio(d):\n",
|
| 786 |
+
" if not os.path.exists(d): return []\n",
|
| 787 |
+
" # Check if files are real audio (not small LFS pointers < 1KB)\n",
|
| 788 |
+
" return [f for f in os.listdir(d) if f.lower().endswith((\".wav\", \".mp3\")) and os.path.getsize(os.path.join(d, f)) > 1024]\n",
|
| 789 |
+
"\n",
|
| 790 |
+
" for d in priority_roots:\n",
|
| 791 |
+
" files = filter_real_audio(d)\n",
|
| 792 |
+
" if files:\n",
|
| 793 |
+
" print(f\"\u2705 Samples located at: {os.path.abspath(d)}\")\n",
|
| 794 |
+
" return d, files\n",
|
| 795 |
+
"\n",
|
| 796 |
+
" # More aggressive glob search\n",
|
| 797 |
+
" print(\"\ud83d\udd0d Searching folders for audio samples...\")\n",
|
| 798 |
+
" potential_matches = glob.glob(\"**/samples/*.wav\", recursive=True) + glob.glob(\"**/samples/*.mp3\", recursive=True)\n",
|
| 799 |
+
" valid_matches = [m for m in potential_matches if os.path.getsize(m) > 1024]\n",
|
| 800 |
+
"\n",
|
| 801 |
+
" if valid_matches:\n",
|
| 802 |
+
" root = os.path.dirname(valid_matches[0])\n",
|
| 803 |
+
" files = [os.path.basename(f) for f in glob.glob(os.path.join(root, \"*.*\")) if f.lower().endswith((\".wav\", \".mp3\")) and os.path.getsize(f) > 1024]\n",
|
| 804 |
+
" print(f\"\u2728 Located samples via glob at: {os.path.abspath(root)}\")\n",
|
| 805 |
+
" return root, files\n",
|
| 806 |
+
"\n",
|
| 807 |
+
" return None, []\n",
|
| 808 |
+
"\n",
|
| 809 |
+
"print(\"Select Input Method:\")\n",
|
| 810 |
+
"tab = widgets.Tab()\n",
|
| 811 |
+
"\n",
|
| 812 |
+
"samples_dir, preset_files = find_samples_dir()\n",
|
| 813 |
+
"if samples_dir:\n",
|
| 814 |
+
" preset_files.sort()\n",
|
| 815 |
+
" for name in reversed([\"Donald Trump.wav\", \"Steve Jobs.wav\"]):\n",
|
| 816 |
+
" if name in preset_files:\n",
|
| 817 |
+
" preset_files.insert(0, preset_files.pop(preset_files.index(name)))\n",
|
| 818 |
+
"else:\n",
|
| 819 |
+
" print(\"\u26a0\ufe0f Warning: No reference samples found. Please run the setup cell or upload manually.\")\n",
|
| 820 |
+
"\n",
|
| 821 |
+
"dropdown = widgets.Dropdown(options=preset_files,\n",
|
| 822 |
+
" value=preset_files[0] if preset_files else None,\n",
|
| 823 |
+
" description='Preset:')\n",
|
| 824 |
+
"uploader = widgets.FileUpload(accept='.wav,.mp3', multiple=False)\n",
|
| 825 |
+
"record_btn = widgets.Button(description=\"Start Recording (10s)\", button_style='danger')\n",
|
| 826 |
+
"record_out = widgets.Output()\n",
|
| 827 |
+
"\n",
|
| 828 |
+
"def on_record_click(b):\n",
|
| 829 |
+
" with record_out:\n",
|
| 830 |
+
" record_btn.disabled = True\n",
|
| 831 |
+
" try: record_audio(10)\n",
|
| 832 |
+
" except Exception as e: print(f\"Error: {e}.\")\n",
|
| 833 |
+
" record_btn.disabled = False\n",
|
| 834 |
+
"record_btn.on_click(on_record_click)\n",
|
| 835 |
+
"\n",
|
| 836 |
+
"# Tab assignment MUST use .children attribute\n",
|
| 837 |
+
"tab.children = [\n",
|
| 838 |
+
" widgets.VBox([dropdown]),\n",
|
| 839 |
+
" widgets.VBox([uploader]),\n",
|
| 840 |
+
" widgets.VBox([record_btn, record_out])\n",
|
| 841 |
+
"]\n",
|
| 842 |
+
"tab.set_title(0, '\ud83c\udfb5 Presets')\n",
|
| 843 |
+
"tab.set_title(1, '\ud83d\udcc2 Upload')\n",
|
| 844 |
+
"tab.set_title(2, '\ud83d\udd34 Record')\n",
|
| 845 |
+
"display(tab)\n",
|
| 846 |
+
"\n",
|
| 847 |
+
"text_input = widgets.Textarea(\n",
|
| 848 |
+
" value=\"Hello, I'm Elon Musk. Welcome to Deepfake Audio by Amey Thakur and Mega Satish. Explore AI voice Go!\",\n",
|
| 849 |
+
" placeholder='Enter text to synthesize...',\n",
|
| 850 |
+
" description='Text:',\n",
|
| 851 |
+
" layout=widgets.Layout(width='50%', height='100px')\n",
|
| 852 |
+
")\n",
|
| 853 |
+
"clone_btn = widgets.Button(description=\"Clone Voice! \ud83d\ude80\", button_style='primary')\n",
|
| 854 |
+
"out = widgets.Output()\n",
|
| 855 |
+
"display(text_input, clone_btn, out)\n",
|
| 856 |
+
"\n",
|
| 857 |
+
"def run_cloning(b):\n",
|
| 858 |
+
" with out:\n",
|
| 859 |
+
" out.clear_output()\n",
|
| 860 |
+
" active_tab = tab.selected_index\n",
|
| 861 |
+
" input_path = None\n",
|
| 862 |
+
" try:\n",
|
| 863 |
+
" if active_tab == 0:\n",
|
| 864 |
+
" if not dropdown.value: return print(\"\u274c No preset selected.\")\n",
|
| 865 |
+
" input_path = os.path.join(samples_dir, dropdown.value)\n",
|
| 866 |
+
" print(f\"\ud83c\udf99\ufe0f Source: Preset ({dropdown.value})\")\n",
|
| 867 |
+
" elif active_tab == 1:\n",
|
| 868 |
+
" if not uploader.value: return print(\"\u274c No file uploaded.\")\n",
|
| 869 |
+
" fname = list(uploader.value.keys())[0]\n",
|
| 870 |
+
" content = uploader.value[fname]['content']\n",
|
| 871 |
+
" input_path = \"uploaded_sample.wav\"\n",
|
| 872 |
+
" with open(input_path, \"wb\") as f: f.write(content)\n",
|
| 873 |
+
" print(f\"\ud83c\udf99\ufe0f Source: Upload ({fname})\")\n",
|
| 874 |
+
" elif active_tab == 2:\n",
|
| 875 |
+
" if not os.path.exists(\"recording.wav\"): return print(\"\u274c No recording found.\")\n",
|
| 876 |
+
" input_path = \"recording.wav\"\n",
|
| 877 |
+
" print(\"\ud83c\udf99\ufe0f Source: Microphone\")\n",
|
| 878 |
+
"\n",
|
| 879 |
+
" print(\"\u23f3 Step 1/3: Encoding speaker identity...\")\n",
|
| 880 |
+
" original_wav, sampling_rate = librosa.load(input_path)\n",
|
| 881 |
+
" preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)\n",
|
| 882 |
+
" embed = encoder.embed_utterance(preprocessed_wav)\n",
|
| 883 |
+
" print(\"\u23f3 Step 2/3: Synthesizing speech...\")\n",
|
| 884 |
+
" specs = synthesizer.synthesize_spectrograms([text_input.value], [embed])\n",
|
| 885 |
+
" spec = specs[0]\n",
|
| 886 |
+
" print(\"\u23f3 Step 3/3: Generating waveform...\")\n",
|
| 887 |
+
" generated_wav = vocoder.infer_waveform(spec)\n",
|
| 888 |
+
" print(\"\ud83c\udf89 Synthesis Complete!\")\n",
|
| 889 |
+
" display(Audio(generated_wav, rate=synthesizer.sample_rate))\n",
|
| 890 |
+
" print(\"\\n\ud83d\udcca Generating Analysis...\")\n",
|
| 891 |
+
" visualize_results(original_wav, generated_wav, spec, embed)\n",
|
| 892 |
+
" except Exception as e: print(f\"\u274c Error: {e}\")\n",
|
| 893 |
+
"clone_btn.on_click(run_cloning)"
|
| 894 |
+
]
|
| 895 |
+
}
|
| 896 |
+
],
|
| 897 |
+
"metadata": {
|
| 898 |
+
"kernelspec": {
|
| 899 |
+
"display_name": "Python 3",
|
| 900 |
+
"name": "python3"
|
| 901 |
+
},
|
| 902 |
+
"language_info": {
|
| 903 |
+
"codemirror_mode": {
|
| 904 |
+
"name": "ipython",
|
| 905 |
+
"version": 3
|
| 906 |
+
},
|
| 907 |
+
"file_extension": ".py",
|
| 908 |
+
"mimetype": "text/x-python",
|
| 909 |
+
"name": "python",
|
| 910 |
+
"nbconvert_exporter": "python",
|
| 911 |
+
"pygments_lexer": "ipython3",
|
| 912 |
+
"version": "3.10.12"
|
| 913 |
+
},
|
| 914 |
+
"colab": {
|
| 915 |
+
"provenance": [],
|
| 916 |
+
"gpuType": "T4",
|
| 917 |
+
"include_colab_link": true
|
| 918 |
+
},
|
| 919 |
+
"accelerator": "GPU"
|
| 920 |
+
},
|
| 921 |
+
"nbformat": 4,
|
| 922 |
+
"nbformat_minor": 0
|
| 923 |
+
}
|
Source Code/Dockerfile
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use Python 3.11-slim as base
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
# Set environment variables
|
| 5 |
+
ENV PYTHONUNBUFFERED=1
|
| 6 |
+
ENV GRADIO_SERVER_NAME="0.0.0.0"
|
| 7 |
+
|
| 8 |
+
# Set working directory
|
| 9 |
+
WORKDIR /app
|
| 10 |
+
|
| 11 |
+
# Install system dependencies
|
| 12 |
+
RUN apt-get update && apt-get install -y \
|
| 13 |
+
libsndfile1 \
|
| 14 |
+
ffmpeg \
|
| 15 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
+
|
| 17 |
+
# Copy requirements first for better caching
|
| 18 |
+
COPY requirements.txt .
|
| 19 |
+
|
| 20 |
+
# Install Python dependencies
|
| 21 |
+
# Adding gradio explicitly since it was missing from requirements.txt
|
| 22 |
+
RUN pip install --no-cache-dir -r requirements.txt gradio
|
| 23 |
+
|
| 24 |
+
# Copy the rest of the application code
|
| 25 |
+
COPY . .
|
| 26 |
+
|
| 27 |
+
# Expose Gradio port
|
| 28 |
+
EXPOSE 7860
|
| 29 |
+
|
| 30 |
+
# Command to run the application
|
| 31 |
+
CMD ["python", "app.py"]
|
Source Code/app.py
ADDED
|
@@ -0,0 +1,907 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - app.py (The Studio Interface)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This is the primary entry point for the Deepfake Audio Studio. It implements a modern,
|
| 7 |
+
# high-performance Gradio-based web interface designed to facilitate zero-shot voice cloning
|
| 8 |
+
# using the SV2TTS (Speaker Verification to Transfer Learning) framework.
|
| 9 |
+
#
|
| 10 |
+
# The application is architected for "Graceful Degradation": if AI models or hardware-level
|
| 11 |
+
# dependencies (like TensorFlow/CUDA) are unavailable, it falls back to a "UI-Only Demo Mode"
|
| 12 |
+
# to preserve the accessibility of the research documentation.
|
| 13 |
+
#
|
| 14 |
+
# π€ AUTHORS
|
| 15 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 16 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 17 |
+
#
|
| 18 |
+
# π€π» CREDITS
|
| 19 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 20 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 21 |
+
#
|
| 22 |
+
# π PROJECT LINKS
|
| 23 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 24 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 25 |
+
# Demo: https://huggingface.co/spaces/ameythakur/Deepfake-Audio
|
| 26 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 27 |
+
#
|
| 28 |
+
# π LICENSE
|
| 29 |
+
# Released under the MIT License
|
| 30 |
+
# Release Date: 2021-02-06
|
| 31 |
+
# ==================================================================================================
|
| 32 |
+
|
| 33 |
+
import os
|
| 34 |
+
import sys
|
| 35 |
+
from pathlib import Path
|
| 36 |
+
|
| 37 |
+
# --- SYSTEM INITIALIZATION ---
|
| 38 |
+
# We configure the environment variables early to suppress verbose C++ logs from TensorFlow,
|
| 39 |
+
# ensuring a clean terminal output focus on the application state.
|
| 40 |
+
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
|
| 41 |
+
os.environ['PYTHONWARNINGS'] = 'ignore'
|
| 42 |
+
|
| 43 |
+
import warnings
|
| 44 |
+
warnings.filterwarnings('ignore')
|
| 45 |
+
|
| 46 |
+
import logging
|
| 47 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 48 |
+
logger = logging.getLogger(__name__)
|
| 49 |
+
|
| 50 |
+
# --- ARCHITECTURAL DIRECTORIES ---
|
| 51 |
+
# Establish absolute paths to ensure the application resolves assets correctly regardless of
|
| 52 |
+
# the execution context (Source Code vs Docker vs Cloud).
|
| 53 |
+
PROJ_DIR = Path(__file__).parent.absolute()
|
| 54 |
+
ROOT_DIR = PROJ_DIR.parent
|
| 55 |
+
DATASET_DIR = ROOT_DIR / "Dataset"
|
| 56 |
+
SAMPLES_DIR = DATASET_DIR / "samples"
|
| 57 |
+
|
| 58 |
+
# Inject current directory into sys.path to allow internal module resolution for encoder/synthesizer.
|
| 59 |
+
if str(PROJ_DIR) not in sys.path:
|
| 60 |
+
sys.path.insert(0, str(PROJ_DIR))
|
| 61 |
+
|
| 62 |
+
import numpy as np
|
| 63 |
+
import gradio as gr
|
| 64 |
+
import base64
|
| 65 |
+
|
| 66 |
+
# --- AI ENGINE DEPENDENCIES ---
|
| 67 |
+
# The application attempts to load the multi-stage neural pipeline.
|
| 68 |
+
# Stage 1: Encoder (Speaker Verification)
|
| 69 |
+
# Stage 2: Synthesizer (Tacotron 2)
|
| 70 |
+
# Stage 3: Vocoder (WaveRNN)
|
| 71 |
+
TF_AVAILABLE = False
|
| 72 |
+
MODELS_READY = False
|
| 73 |
+
STARTUP_ERROR = ""
|
| 74 |
+
synthesizer = None
|
| 75 |
+
|
| 76 |
+
try:
|
| 77 |
+
import librosa
|
| 78 |
+
LIBROSA_AVAILABLE = True
|
| 79 |
+
except ImportError:
|
| 80 |
+
LIBROSA_AVAILABLE = False
|
| 81 |
+
logger.warning("librosa not available - audio loading will fail.")
|
| 82 |
+
|
| 83 |
+
try:
|
| 84 |
+
# TensorFlow 2.x requires eager execution to be disabled for legacy SV2TTS support elements.
|
| 85 |
+
import tensorflow as tf
|
| 86 |
+
tf.compat.v1.disable_eager_execution()
|
| 87 |
+
tf.get_logger().setLevel('ERROR')
|
| 88 |
+
TF_AVAILABLE = True
|
| 89 |
+
except Exception as e:
|
| 90 |
+
logger.warning(f"TensorFlow unavailable (possibly blocked by policy): {e}")
|
| 91 |
+
TF_AVAILABLE = False
|
| 92 |
+
|
| 93 |
+
try:
|
| 94 |
+
# Import the cloned submodules. They must exist in the PROJ_DIR.
|
| 95 |
+
import encoder.inference
|
| 96 |
+
import encoder.audio
|
| 97 |
+
from synthesizer.inference import Synthesizer
|
| 98 |
+
from vocoder import inference as vocoder
|
| 99 |
+
AI_MODULES_AVAILABLE = True
|
| 100 |
+
except ImportError as e:
|
| 101 |
+
logger.warning(f"AI modules not available: {e}")
|
| 102 |
+
AI_MODULES_AVAILABLE = False
|
| 103 |
+
|
| 104 |
+
# --- Model Paths (Updated to 'Dataset' directory) ---
|
| 105 |
+
ENC_MODEL = DATASET_DIR / "encoder.pt"
|
| 106 |
+
SYN_MODEL = DATASET_DIR / "synthesizer.pt"
|
| 107 |
+
VOC_MODEL = DATASET_DIR / "vocoder.pt"
|
| 108 |
+
|
| 109 |
+
# --- Sample Loading (Dynamic Discovery from Dataset/samples) ---
|
| 110 |
+
SAMPLES = {}
|
| 111 |
+
if SAMPLES_DIR.exists():
|
| 112 |
+
for f in SAMPLES_DIR.glob("*.wav"):
|
| 113 |
+
# Use filename as key directly (e.g., "Barack Obama")
|
| 114 |
+
name = f.stem
|
| 115 |
+
# If user still uses old _preset convention, clean it up just in case
|
| 116 |
+
if "_preset" in name:
|
| 117 |
+
name = name.replace("_preset", "").replace("_", " ").title()
|
| 118 |
+
SAMPLES[name] = str(f)
|
| 119 |
+
|
| 120 |
+
def load_preset(name):
|
| 121 |
+
if name in SAMPLES:
|
| 122 |
+
logger.info(f"Loading preset: {name}")
|
| 123 |
+
return SAMPLES[name]
|
| 124 |
+
return None
|
| 125 |
+
|
| 126 |
+
# --- Model Loading (If Available) ---
|
| 127 |
+
if TF_AVAILABLE and AI_MODULES_AVAILABLE:
|
| 128 |
+
def load_models():
|
| 129 |
+
try:
|
| 130 |
+
if not encoder.inference.is_loaded():
|
| 131 |
+
encoder.inference.load_model(ENC_MODEL)
|
| 132 |
+
synth = Synthesizer(SYN_MODEL)
|
| 133 |
+
vocoder.load_model(VOC_MODEL)
|
| 134 |
+
return synth
|
| 135 |
+
except Exception as e:
|
| 136 |
+
logger.error(f"Load error: {e}")
|
| 137 |
+
raise e
|
| 138 |
+
|
| 139 |
+
try:
|
| 140 |
+
# Check if files exist before trying to load
|
| 141 |
+
if not ENC_MODEL.exists() or not SYN_MODEL.exists() or not VOC_MODEL.exists():
|
| 142 |
+
raise FileNotFoundError(f"Models missing in {DATASET_DIR}")
|
| 143 |
+
|
| 144 |
+
synthesizer = load_models()
|
| 145 |
+
MODELS_READY = True
|
| 146 |
+
logger.info("β
All AI models loaded successfully!")
|
| 147 |
+
except Exception as e:
|
| 148 |
+
MODELS_READY = False
|
| 149 |
+
STARTUP_ERROR = str(e)
|
| 150 |
+
logger.warning(f"Models not loaded: {e}")
|
| 151 |
+
else:
|
| 152 |
+
STARTUP_ERROR = "TensorFlow blocked by system policy" if not TF_AVAILABLE else "AI modules not found"
|
| 153 |
+
logger.warning(f"Running in UI-only mode: {STARTUP_ERROR}")
|
| 154 |
+
|
| 155 |
+
# --- Synthesis Logic ---
|
| 156 |
+
def synthesize(audio_file, text, progress=gr.Progress()):
|
| 157 |
+
logger.info(f"Synthesize called with audio_file: {audio_file}, text: '{text}'")
|
| 158 |
+
|
| 159 |
+
if text is None: text = ""
|
| 160 |
+
text = str(text).strip()
|
| 161 |
+
|
| 162 |
+
if not MODELS_READY:
|
| 163 |
+
return None, f"β οΈ Demo Mode: {STARTUP_ERROR}"
|
| 164 |
+
if not audio_file or not text:
|
| 165 |
+
logger.warning(f"Validation failed. Audio: {bool(audio_file)}, Text: {bool(text)}")
|
| 166 |
+
return None, "Reference audio and text script are required."
|
| 167 |
+
|
| 168 |
+
try:
|
| 169 |
+
progress(0.2, desc="Extracting Voice Identity")
|
| 170 |
+
original_wav, sampling_rate = librosa.load(audio_file, sr=None)
|
| 171 |
+
preprocessed_wav = encoder.audio.preprocess_wav(original_wav, sampling_rate)
|
| 172 |
+
embed = encoder.inference.embed_utterance(preprocessed_wav)
|
| 173 |
+
|
| 174 |
+
progress(0.5, desc="Synthesizing Speech")
|
| 175 |
+
specs = synthesizer.synthesize_spectrograms([text], [embed])
|
| 176 |
+
|
| 177 |
+
progress(0.8, desc="Generating High-Fidelity Audio")
|
| 178 |
+
# Ensure we handle batching correctly
|
| 179 |
+
generated_wav = vocoder.infer_waveform(specs[0], batched=True, target=11000, overlap=1100)
|
| 180 |
+
|
| 181 |
+
# progress(0.85, desc="Removing Vocoder Noise")
|
| 182 |
+
# generated_wav = vocoder.infer_denoised(generated_wav)
|
| 183 |
+
|
| 184 |
+
progress(0.9, desc="Refining Audio Quality")
|
| 185 |
+
generated_wav = encoder.inference.preprocess_wav(generated_wav)
|
| 186 |
+
|
| 187 |
+
if np.abs(generated_wav).max() > 0:
|
| 188 |
+
generated_wav = generated_wav / np.abs(generated_wav).max() * 0.98
|
| 189 |
+
|
| 190 |
+
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
|
| 191 |
+
|
| 192 |
+
progress(1.0, desc="Finalizing")
|
| 193 |
+
return (synthesizer.sample_rate, generated_wav), "β
Synthesis Complete."
|
| 194 |
+
except Exception as e:
|
| 195 |
+
logger.exception("Synthesis failed")
|
| 196 |
+
return None, f"Error: {str(e)}"
|
| 197 |
+
|
| 198 |
+
def play_intro():
|
| 199 |
+
intro_path = PROJ_DIR / "intro_message.wav"
|
| 200 |
+
logger.info(f"Intro button clicked. Searching for: {intro_path}")
|
| 201 |
+
if intro_path.exists():
|
| 202 |
+
logger.info("Intro file found. Returning path.")
|
| 203 |
+
return str(intro_path)
|
| 204 |
+
logger.warning("Intro file NOT found.")
|
| 205 |
+
return None
|
| 206 |
+
|
| 207 |
+
# --- NEON MIC ICON (RUNTIME BASE64) ---
|
| 208 |
+
try:
|
| 209 |
+
with open(PROJ_DIR / "favicon.png", "rb") as f:
|
| 210 |
+
encoded_icon = base64.b64encode(f.read()).decode('utf-8')
|
| 211 |
+
NEON_MIC_ICON = f"data:image/png;base64,{encoded_icon}"
|
| 212 |
+
except Exception as e:
|
| 213 |
+
print(f"Warning: Could not encode favicon: {e}")
|
| 214 |
+
NEON_MIC_ICON = "/file=favicon.png"
|
| 215 |
+
|
| 216 |
+
# --- Minimalist Navy & Orange UI ---
|
| 217 |
+
custom_css = """
|
| 218 |
+
/* General Styles */
|
| 219 |
+
@import url('https://fonts.googleapis.com/css2?family=Play:wght@400;700&display=swap');
|
| 220 |
+
|
| 221 |
+
* {
|
| 222 |
+
font-family: 'Play', sans-serif !important;
|
| 223 |
+
-webkit-user-select: none;
|
| 224 |
+
-moz-user-select: none;
|
| 225 |
+
-ms-user-select: none;
|
| 226 |
+
user-select: none;
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
/* Allow selection in inputs */
|
| 230 |
+
input, textarea, .gr-box, .gr-input, .gr-text-input {
|
| 231 |
+
-webkit-user-select: text !important;
|
| 232 |
+
-moz-user-select: text !important;
|
| 233 |
+
-ms-user-select: text !important;
|
| 234 |
+
user-select: text !important;
|
| 235 |
+
cursor: text !important;
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
+
body {
|
| 239 |
+
background-color: #0a192f !important;
|
| 240 |
+
color: #ccd6f6 !important;
|
| 241 |
+
overflow-x: hidden;
|
| 242 |
+
}
|
| 243 |
+
|
| 244 |
+
.gradio-container {
|
| 245 |
+
background-color: transparent !important;
|
| 246 |
+
position: relative !important;
|
| 247 |
+
z-index: 1 !important;
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
/* Pseudo-element for the pattern to control opacity properly without affecting text */
|
| 251 |
+
body::before {
|
| 252 |
+
content: "";
|
| 253 |
+
position: fixed;
|
| 254 |
+
top: 0;
|
| 255 |
+
left: 0;
|
| 256 |
+
width: 100%;
|
| 257 |
+
height: 100%;
|
| 258 |
+
background-image: url('FAVICON_PLACEHOLDER') !important;
|
| 259 |
+
background-repeat: repeat !important;
|
| 260 |
+
background-size: 60px !important;
|
| 261 |
+
opacity: 0.05 !important; /* Dimmed to 5% - Perfect watermark */
|
| 262 |
+
pointer-events: none;
|
| 263 |
+
z-index: 0;
|
| 264 |
+
}
|
| 265 |
+
""".replace("FAVICON_PLACEHOLDER", NEON_MIC_ICON) + """
|
| 266 |
+
.main-container {
|
| 267 |
+
max-width: 950px !important;
|
| 268 |
+
margin: 0 auto !important;
|
| 269 |
+
padding: 20px !important;
|
| 270 |
+
}
|
| 271 |
+
#header h1 {
|
| 272 |
+
font-size: 2.2rem;
|
| 273 |
+
color: #ff8c00;
|
| 274 |
+
margin-bottom: 0px;
|
| 275 |
+
letter-spacing: -1px;
|
| 276 |
+
}
|
| 277 |
+
#header p { font-size: 0.9rem; color: #8892b0; margin-top: 5px; }
|
| 278 |
+
|
| 279 |
+
#intro-btn {
|
| 280 |
+
background: transparent !important;
|
| 281 |
+
border: none !important;
|
| 282 |
+
font-size: 2.2rem !important;
|
| 283 |
+
color: #ff8c00 !important;
|
| 284 |
+
font-weight: 800 !important;
|
| 285 |
+
cursor: pointer !important;
|
| 286 |
+
box-shadow: none !important;
|
| 287 |
+
padding: 0 !important;
|
| 288 |
+
margin: 0 !important;
|
| 289 |
+
transition: all 0.3s ease !important;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
#intro-btn:hover {
|
| 293 |
+
color: #ccd6f6 !important;
|
| 294 |
+
transform: scale(1.02);
|
| 295 |
+
}
|
| 296 |
+
|
| 297 |
+
#intro-audio { display: none !important; }
|
| 298 |
+
|
| 299 |
+
.studio-card {
|
| 300 |
+
background: #112240 !important;
|
| 301 |
+
border: 1px solid #233554 !important;
|
| 302 |
+
border-radius: 12px !important;
|
| 303 |
+
padding: 10px !important;
|
| 304 |
+
box-shadow: 0 10px 30px -15px rgba(2, 12, 27, 0.7) !important;
|
| 305 |
+
margin-bottom: 6px !important;
|
| 306 |
+
transition: transform 0.2s ease, border-color 0.2s ease !important;
|
| 307 |
+
min-height: 180px !important;
|
| 308 |
+
display: flex !important;
|
| 309 |
+
flex-direction: column !important;
|
| 310 |
+
overflow: visible !important;
|
| 311 |
+
}
|
| 312 |
+
|
| 313 |
+
.studio-card .prose {
|
| 314 |
+
margin-bottom: 5px !important;
|
| 315 |
+
}
|
| 316 |
+
|
| 317 |
+
/* Force Play Font on everything */
|
| 318 |
+
* { font-family: 'Play', sans-serif !important; }
|
| 319 |
+
span, button, input, label, textarea, select { font-family: 'Play', sans-serif !important; }
|
| 320 |
+
|
| 321 |
+
.studio-card:hover {
|
| 322 |
+
transform: translateY(-2px);
|
| 323 |
+
border-color: #ff8c00 !important;
|
| 324 |
+
}
|
| 325 |
+
|
| 326 |
+
/* --- PRO CODE DEFINITIVE "ZERO-BLEED" FIX --- */
|
| 327 |
+
|
| 328 |
+
/* 1. FLATTEN THE WRAPPERS: Make specific outer containers invisible */
|
| 329 |
+
#script-box, #status-box,
|
| 330 |
+
#script-box > .form, #status-box > .form,
|
| 331 |
+
.gray-border, .form {
|
| 332 |
+
background-color: transparent !important;
|
| 333 |
+
background: transparent !important;
|
| 334 |
+
border: none !important;
|
| 335 |
+
box-shadow: none !important;
|
| 336 |
+
padding: 0 !important;
|
| 337 |
+
}
|
| 338 |
+
|
| 339 |
+
/* 2. STYLE THE INNER INPUTS: Apply the theme ONLY to the interactive element */
|
| 340 |
+
#script-box textarea, #status-box input,
|
| 341 |
+
#preset-dropdown-container .gr-dropdown {
|
| 342 |
+
background-color: #112240 !important; /* Perfect Match to Studio Card */
|
| 343 |
+
border: 2px solid #ff8c00 !important;
|
| 344 |
+
border-radius: 12px !important;
|
| 345 |
+
color: #ccd6f6 !important;
|
| 346 |
+
box-shadow: none !important;
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
/* Remove placeholder color interference */
|
| 350 |
+
#script-box textarea::placeholder { color: #555 !important; }
|
| 351 |
+
|
| 352 |
+
/* Focus States */
|
| 353 |
+
#script-box textarea:focus, #status-box input:focus {
|
| 354 |
+
border-color: #ffb347 !important;
|
| 355 |
+
box-shadow: 0 0 15px rgba(255, 140, 0, 0.4) !important;
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
/* --- PRO CODE "VOICE DECK" (SCROLLABLE LIST) --- */
|
| 359 |
+
|
| 360 |
+
/* The Voice Deck Container */
|
| 361 |
+
#voice-deck {
|
| 362 |
+
max-height: 70px !important;
|
| 363 |
+
overflow-y: auto !important;
|
| 364 |
+
background-color: #0d1b2a !important; /* Slightly darker than card for depth */
|
| 365 |
+
border: 2px solid #ff8c00 !important;
|
| 366 |
+
border-radius: 12px !important;
|
| 367 |
+
padding: 8px !important;
|
| 368 |
+
margin-bottom: 10px !important;
|
| 369 |
+
scrollbar-width: thin !important;
|
| 370 |
+
scrollbar-color: #ff8c00 #0a192f !important;
|
| 371 |
+
}
|
| 372 |
+
|
| 373 |
+
/* Radio Item Styling (Premium Chips) */
|
| 374 |
+
#voice-deck label {
|
| 375 |
+
display: flex !important;
|
| 376 |
+
align-items: center !important;
|
| 377 |
+
width: 100% !important;
|
| 378 |
+
background: transparent !important;
|
| 379 |
+
border: 1px solid #233554 !important;
|
| 380 |
+
border-radius: 8px !important;
|
| 381 |
+
margin-bottom: 4px !important;
|
| 382 |
+
padding: 8px 12px !important;
|
| 383 |
+
transition: all 0.2s ease !important;
|
| 384 |
+
cursor: pointer !important;
|
| 385 |
+
color: #8892b0 !important;
|
| 386 |
+
}
|
| 387 |
+
|
| 388 |
+
#voice-deck label:hover {
|
| 389 |
+
background: #112240 !important;
|
| 390 |
+
border-color: #ff8c00 !important;
|
| 391 |
+
color: #ff8c00 !important;
|
| 392 |
+
}
|
| 393 |
+
|
| 394 |
+
#voice-deck label.selected, #voice-deck input:checked + span {
|
| 395 |
+
background: #233554 !important;
|
| 396 |
+
border-color: #ff8c00 !important;
|
| 397 |
+
color: #ff8c00 !important;
|
| 398 |
+
font-weight: bold !important;
|
| 399 |
+
box-shadow: 0 0 10px rgba(255, 140, 0, 0.2) !important;
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
/* Hide the default radio circle */
|
| 403 |
+
#voice-deck input[type="radio"] {
|
| 404 |
+
display: none !important;
|
| 405 |
+
}
|
| 406 |
+
|
| 407 |
+
/* Ensure text is nicely aligned */
|
| 408 |
+
#voice-deck span {
|
| 409 |
+
margin-left: 0 !important;
|
| 410 |
+
font-size: 0.95rem !important;
|
| 411 |
+
}
|
| 412 |
+
|
| 413 |
+
/* Audio Input Transparency Fix */
|
| 414 |
+
#audio-input, #audio-input .gr-input, #audio-input .gr-box, #audio-input .gr-block {
|
| 415 |
+
background: transparent !important;
|
| 416 |
+
background-color: transparent !important;
|
| 417 |
+
border: none !important;
|
| 418 |
+
box-shadow: none !important;
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
|
| 422 |
+
div[role="option"] {
|
| 423 |
+
padding: 10px !important;
|
| 424 |
+
color: #ccd6f6 !important;
|
| 425 |
+
border-bottom: 1px solid #233554 !important;
|
| 426 |
+
}
|
| 427 |
+
|
| 428 |
+
div[role="option"]:hover, div[role="option"][aria-selected="true"] {
|
| 429 |
+
background-color: #233554 !important;
|
| 430 |
+
color: #ff8c00 !important;
|
| 431 |
+
}
|
| 432 |
+
|
| 433 |
+
div[role="listbox"]::-webkit-scrollbar {
|
| 434 |
+
width: 6px !important;
|
| 435 |
+
}
|
| 436 |
+
|
| 437 |
+
div[role="listbox"]::-webkit-scrollbar-track {
|
| 438 |
+
background: #0a192f !important;
|
| 439 |
+
border-radius: 10px !important;
|
| 440 |
+
}
|
| 441 |
+
|
| 442 |
+
div[role="listbox"]::-webkit-scrollbar-thumb {
|
| 443 |
+
background: #ff8c00 !important;
|
| 444 |
+
border-radius: 10px !important;
|
| 445 |
+
border: 1px solid #0a192f !important;
|
| 446 |
+
}
|
| 447 |
+
|
| 448 |
+
div[role="listbox"]::-webkit-scrollbar-thumb:hover {
|
| 449 |
+
background: #ffa500 !important;
|
| 450 |
+
}
|
| 451 |
+
|
| 452 |
+
/* --- PREMIUM SCROLLBARS --- */
|
| 453 |
+
/* Global Scrollbar Styling */
|
| 454 |
+
::-webkit-scrollbar {
|
| 455 |
+
width: 6px !important;
|
| 456 |
+
height: 6px !important;
|
| 457 |
+
}
|
| 458 |
+
|
| 459 |
+
::-webkit-scrollbar-track {
|
| 460 |
+
background: #0a192f !important;
|
| 461 |
+
}
|
| 462 |
+
|
| 463 |
+
::-webkit-scrollbar-thumb {
|
| 464 |
+
background: #ff8c00 !important;
|
| 465 |
+
border-radius: 20px !important;
|
| 466 |
+
border: 1px solid #0a192f !important;
|
| 467 |
+
}
|
| 468 |
+
|
| 469 |
+
::-webkit-scrollbar-thumb:hover {
|
| 470 |
+
background: #ffa500 !important;
|
| 471 |
+
box-shadow: 0 0 10px rgba(255, 140, 0, 0.5) !important;
|
| 472 |
+
}
|
| 473 |
+
|
| 474 |
+
/* Ensure Audio Player Scrollbars match */
|
| 475 |
+
audio::-webkit-scrollbar {
|
| 476 |
+
height: 6px !important;
|
| 477 |
+
}
|
| 478 |
+
|
| 479 |
+
div[role="option"] {
|
| 480 |
+
background-color: transparent !important;
|
| 481 |
+
color: #ccd6f6 !important;
|
| 482 |
+
}
|
| 483 |
+
|
| 484 |
+
div[role="option"]:hover, div[role="option"][aria-selected="true"] {
|
| 485 |
+
background-color: #233554 !important;
|
| 486 |
+
color: #ff8c00 !important;
|
| 487 |
+
}
|
| 488 |
+
|
| 489 |
+
.btn-primary {
|
| 490 |
+
background: #ff8c00 !important;
|
| 491 |
+
border: 2px solid #ff8c00 !important;
|
| 492 |
+
color: #0a192f !important;
|
| 493 |
+
font-weight: 800 !important;
|
| 494 |
+
border-radius: 8px !important;
|
| 495 |
+
padding: 0 20px !important;
|
| 496 |
+
height: 50px !important;
|
| 497 |
+
cursor: pointer !important;
|
| 498 |
+
transition: all 0.3s ease !important;
|
| 499 |
+
width: 100% !important;
|
| 500 |
+
}
|
| 501 |
+
|
| 502 |
+
.btn-primary:hover {
|
| 503 |
+
background: transparent !important;
|
| 504 |
+
color: #ff8c00 !important;
|
| 505 |
+
border-color: #ff8c00 !important;
|
| 506 |
+
}
|
| 507 |
+
|
| 508 |
+
.btn-secondary {
|
| 509 |
+
background: transparent !important;
|
| 510 |
+
border: 1px solid #233554 !important;
|
| 511 |
+
color: #8892b0 !important;
|
| 512 |
+
border-radius: 8px !important;
|
| 513 |
+
height: 50px !important;
|
| 514 |
+
padding: 0 20px !important;
|
| 515 |
+
transition: all 0.3s ease !important;
|
| 516 |
+
width: 100% !important;
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
.btn-secondary:hover {
|
| 520 |
+
background: #ff8c00 !important;
|
| 521 |
+
color: #0a192f !important;
|
| 522 |
+
border-color: #ff8c00 !important;
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
.info-section {
|
| 526 |
+
font-size: 0.85rem;
|
| 527 |
+
color: #8892b0;
|
| 528 |
+
margin-top: 20px;
|
| 529 |
+
padding: 0 15px;
|
| 530 |
+
animation: fadeIn 0.8s ease-out;
|
| 531 |
+
}
|
| 532 |
+
|
| 533 |
+
.info-header {
|
| 534 |
+
color: #ff8c00 !important;
|
| 535 |
+
font-weight: 800 !important;
|
| 536 |
+
margin-bottom: 5px !important;
|
| 537 |
+
display: block !important;
|
| 538 |
+
}
|
| 539 |
+
|
| 540 |
+
.footer {
|
| 541 |
+
text-align: center;
|
| 542 |
+
margin-top: 40px;
|
| 543 |
+
padding-top: 20px;
|
| 544 |
+
border-top: 1px solid #233554;
|
| 545 |
+
font-size: 0.8rem;
|
| 546 |
+
color: #8892b0;
|
| 547 |
+
animation: fadeIn 1s ease-out;
|
| 548 |
+
}
|
| 549 |
+
|
| 550 |
+
.footer a { color: #ff8c00; text-decoration: none; font-weight: 600; transition: opacity 0.2s; }
|
| 551 |
+
.footer a:hover { opacity: 0.8; }
|
| 552 |
+
|
| 553 |
+
.authorship {
|
| 554 |
+
margin-bottom: 10px;
|
| 555 |
+
font-weight: 600;
|
| 556 |
+
}
|
| 557 |
+
|
| 558 |
+
/* Dropdown Polish */
|
| 559 |
+
.gr-dropdown {
|
| 560 |
+
background: #0a192f !important;
|
| 561 |
+
border-color: #233554 !important;
|
| 562 |
+
margin-bottom: 12px !important;
|
| 563 |
+
}
|
| 564 |
+
.gr-dropdown:focus-within {
|
| 565 |
+
border-color: #ff8c00 !important;
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
/* Premium Progress Bar Styling */
|
| 569 |
+
.gr-progress {
|
| 570 |
+
background-color: #0a192f !important;
|
| 571 |
+
border: 1px solid #233554 !important;
|
| 572 |
+
border-radius: 12px !important;
|
| 573 |
+
height: 38px !important;
|
| 574 |
+
overflow: hidden !important;
|
| 575 |
+
box-shadow: inset 0 2px 4px rgba(0,0,0,0.3) !important;
|
| 576 |
+
}
|
| 577 |
+
|
| 578 |
+
.gr-progress .progress-level {
|
| 579 |
+
background: linear-gradient(90deg, #ff8c00, #ffb347) !important;
|
| 580 |
+
border-radius: 10px !important;
|
| 581 |
+
box-shadow: 0 0 20px rgba(255, 140, 0, 0.4) !important;
|
| 582 |
+
height: 100% !important;
|
| 583 |
+
transition: width 0.4s cubic-bezier(0.4, 0, 0.2, 1) !important;
|
| 584 |
+
position: relative !important;
|
| 585 |
+
overflow: hidden !important;
|
| 586 |
+
}
|
| 587 |
+
|
| 588 |
+
.gr-progress .progress-level::after {
|
| 589 |
+
content: "" !important;
|
| 590 |
+
position: absolute !important;
|
| 591 |
+
top: 0 !important;
|
| 592 |
+
left: -150% !important;
|
| 593 |
+
width: 100% !important;
|
| 594 |
+
height: 100% !important;
|
| 595 |
+
background: linear-gradient(
|
| 596 |
+
90deg,
|
| 597 |
+
transparent,
|
| 598 |
+
rgba(255, 255, 255, 0.4),
|
| 599 |
+
transparent
|
| 600 |
+
) !important;
|
| 601 |
+
animation: apple-shimmer 2s infinite linear !important;
|
| 602 |
+
}
|
| 603 |
+
|
| 604 |
+
@keyframes apple-shimmer {
|
| 605 |
+
0% { left: -150%; }
|
| 606 |
+
100% { left: 150%; }
|
| 607 |
+
}
|
| 608 |
+
|
| 609 |
+
.gr-progress .progress-text {
|
| 610 |
+
color: #ffffff !important;
|
| 611 |
+
font-family: 'Play', sans-serif !important;
|
| 612 |
+
font-weight: 700 !important;
|
| 613 |
+
font-size: 0.85rem !important;
|
| 614 |
+
line-height: 38px !important;
|
| 615 |
+
text-shadow: 0 1px 2px rgba(0,0,0,0.5) !important;
|
| 616 |
+
letter-spacing: 0.5px !important;
|
| 617 |
+
position: relative !important;
|
| 618 |
+
z-index: 5 !important;
|
| 619 |
+
}
|
| 620 |
+
|
| 621 |
+
.progress-container {
|
| 622 |
+
padding: 0 !important;
|
| 623 |
+
margin: 0 !important;
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
/* Subtle Animations */
|
| 627 |
+
@keyframes fadeIn {
|
| 628 |
+
from { opacity: 0; transform: translateY(8px); }
|
| 629 |
+
to { opacity: 1; transform: translateY(0); }
|
| 630 |
+
}
|
| 631 |
+
|
| 632 |
+
.main-container { animation: fadeIn 0.6s ease-out; }
|
| 633 |
+
.studio-card { transition: transform 0.2s ease, border-color 0.2s ease !important; }
|
| 634 |
+
.studio-card:hover { transform: translateY(-2px); border-color: #ff8c00 !important; }
|
| 635 |
+
""" + """
|
| 636 |
+
/* --- AUDIO PLAYER FIXES (THEME ALIGNMENT) --- */
|
| 637 |
+
#audio-input, #audio-output {
|
| 638 |
+
background-color: transparent !important;
|
| 639 |
+
border: none !important;
|
| 640 |
+
}
|
| 641 |
+
|
| 642 |
+
/* Force Transparent Backgrounds on ALL Audio Wrappers */
|
| 643 |
+
.gr-audio-wrapper, .wrapper, .audio-container, .controls, .waveform-container {
|
| 644 |
+
background: transparent !important;
|
| 645 |
+
background-color: transparent !important;
|
| 646 |
+
}
|
| 647 |
+
|
| 648 |
+
/* Waveform Canvas - Make it fit theme */
|
| 649 |
+
#audio-input canvas, #audio-output canvas {
|
| 650 |
+
background-color: #0d1b2a !important; /* Darker than card */
|
| 651 |
+
border-radius: 8px !important;
|
| 652 |
+
border: 1px solid #233554 !important;
|
| 653 |
+
}
|
| 654 |
+
|
| 655 |
+
/* --- THE BIG WHITE BAR FIX (Scrollbar/Timeline) --- */
|
| 656 |
+
|
| 657 |
+
/* 1. Target the specific scrollbar/slider container often used in Gradio Waveforms */
|
| 658 |
+
div[class*="scrollbar"], ::-webkit-scrollbar-track {
|
| 659 |
+
background: #0a192f !important;
|
| 660 |
+
background-color: #0a192f !important;
|
| 661 |
+
}
|
| 662 |
+
|
| 663 |
+
/* 2. Target Range Inputs (Progress Bar) */
|
| 664 |
+
input[type=range] {
|
| 665 |
+
-webkit-appearance: none;
|
| 666 |
+
background: transparent !important;
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
/* Chrome/Safari Slider Track */
|
| 670 |
+
input[type=range]::-webkit-slider-runnable-track {
|
| 671 |
+
width: 100%;
|
| 672 |
+
height: 6px;
|
| 673 |
+
cursor: pointer;
|
| 674 |
+
background: #233554 !important; /* Dark Navy Track */
|
| 675 |
+
border-radius: 5px;
|
| 676 |
+
}
|
| 677 |
+
|
| 678 |
+
/* Chrome/Safari Slider Thumb */
|
| 679 |
+
input[type=range]::-webkit-slider-thumb {
|
| 680 |
+
height: 16px;
|
| 681 |
+
width: 16px;
|
| 682 |
+
border-radius: 50%;
|
| 683 |
+
background: #ff8c00 !important; /* Orange Thumb */
|
| 684 |
+
cursor: pointer;
|
| 685 |
+
-webkit-appearance: none;
|
| 686 |
+
margin-top: -5px; /* Centers thumb */
|
| 687 |
+
}
|
| 688 |
+
|
| 689 |
+
/* Firefox Slider Track/Thumb */
|
| 690 |
+
input[type=range]::-moz-range-track {
|
| 691 |
+
background: #233554 !important;
|
| 692 |
+
height: 6px;
|
| 693 |
+
}
|
| 694 |
+
input[type=range]::-moz-range-thumb {
|
| 695 |
+
background: #ff8c00 !important;
|
| 696 |
+
border: none;
|
| 697 |
+
}
|
| 698 |
+
|
| 699 |
+
/* --- TIMESTAMP FIX (Make visible & overlap safe) --- */
|
| 700 |
+
.time, .timestamp, span[class*="time"] {
|
| 701 |
+
color: #ccd6f6 !important;
|
| 702 |
+
font-family: 'Play', monospace !important;
|
| 703 |
+
font-size: 13px !important;
|
| 704 |
+
font-weight: bold !important;
|
| 705 |
+
background-color: #0a192f !important; /* Solid bg to cover waveform if needed */
|
| 706 |
+
padding: 2px 8px !important;
|
| 707 |
+
border-radius: 4px !important;
|
| 708 |
+
z-index: 9999 !important; /* Force on top */
|
| 709 |
+
border: 1px solid #ff8c00 !important;
|
| 710 |
+
position: relative !important;
|
| 711 |
+
top: -2px !important; /* Slight nudge up */
|
| 712 |
+
text-shadow: none !important;
|
| 713 |
+
}
|
| 714 |
+
|
| 715 |
+
/* Play/Pause/Download/Share Icons */
|
| 716 |
+
.controls, .actions, .icon-button, .volume, .playback {
|
| 717 |
+
z-index: 10001 !important;
|
| 718 |
+
position: relative !important;
|
| 719 |
+
}
|
| 720 |
+
|
| 721 |
+
.controls button, .icon {
|
| 722 |
+
color: #ff8c00 !important;
|
| 723 |
+
font-weight: bold !important;
|
| 724 |
+
z-index: 10001 !important;
|
| 725 |
+
pointer-events: auto !important;
|
| 726 |
+
cursor: pointer !important;
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
+
.controls button:hover, .icon:hover {
|
| 730 |
+
color: #ccd6f6 !important;
|
| 731 |
+
background-color: rgba(255, 140, 0, 0.2) !important;
|
| 732 |
+
border-radius: 50% !important;
|
| 733 |
+
}
|
| 734 |
+
|
| 735 |
+
/* Remove any remaining white backgrounds */
|
| 736 |
+
div[style*="background-color: white"], div[style*="background: white"] {
|
| 737 |
+
background-color: transparent !important;
|
| 738 |
+
}
|
| 739 |
+
"""
|
| 740 |
+
|
| 741 |
+
theme = gr.themes.Default(
|
| 742 |
+
primary_hue="orange",
|
| 743 |
+
secondary_hue="slate",
|
| 744 |
+
).set(
|
| 745 |
+
body_background_fill="#0a192f",
|
| 746 |
+
block_background_fill="#112240",
|
| 747 |
+
input_background_fill="#0a192f",
|
| 748 |
+
input_border_color="#233554",
|
| 749 |
+
)
|
| 750 |
+
|
| 751 |
+
# --- Low-Level Security & Easter Egg ---
|
| 752 |
+
custom_js = """
|
| 753 |
+
function() {
|
| 754 |
+
// Security Protocol
|
| 755 |
+
document.addEventListener('contextmenu', event => {
|
| 756 |
+
event.preventDefault();
|
| 757 |
+
alert('Security Protocol Engaged: System protected by Amey Thakur & Mega Satish');
|
| 758 |
+
console.warn('Security Alert: Unauthorized access attempt detected.');
|
| 759 |
+
});
|
| 760 |
+
console.log("%c STOP! %c You are entering a protected zone.", "color: red; font-size: 50px; font-weight: bold;", "color: white; font-size: 20px;");
|
| 761 |
+
|
| 762 |
+
// PWA Service Worker Registration
|
| 763 |
+
if ('serviceWorker' in navigator) {
|
| 764 |
+
window.addEventListener('load', () => {
|
| 765 |
+
navigator.serviceWorker.register('/file=sw.js')
|
| 766 |
+
.then(reg => console.log('PWA Service Worker Registered', reg))
|
| 767 |
+
.catch(err => console.log('PWA Service Worker Failed', err));
|
| 768 |
+
});
|
| 769 |
+
}
|
| 770 |
+
}
|
| 771 |
+
"""
|
| 772 |
+
|
| 773 |
+
# --- Status Message ---
|
| 774 |
+
if MODELS_READY:
|
| 775 |
+
STATUS_MSG = "β
Ready. All AI models loaded."
|
| 776 |
+
else:
|
| 777 |
+
STATUS_MSG = f"β οΈ Demo Mode: {STARTUP_ERROR}"
|
| 778 |
+
|
| 779 |
+
# Inject Favicon via Head (Reliable) and Security JS
|
| 780 |
+
head_tags = f'''
|
| 781 |
+
<link rel="icon" type="image/png" href="{NEON_MIC_ICON}">
|
| 782 |
+
<link rel="manifest" href="/file=manifest.json">
|
| 783 |
+
'''
|
| 784 |
+
with gr.Blocks(title="Deepfake Audio Studio", theme=theme, css=custom_css, js=custom_js, head=head_tags) as demo:
|
| 785 |
+
with gr.Column(elem_classes=["main-container"]):
|
| 786 |
+
|
| 787 |
+
# Minimal Header
|
| 788 |
+
with gr.Column(elem_id="header"):
|
| 789 |
+
with gr.Row():
|
| 790 |
+
intro_btn = gr.Button("ποΈ Deepfake Audio", elem_id="intro-btn")
|
| 791 |
+
gr.Markdown("<div style='text-align: center; margin-top: 5px; margin-bottom: 50px; color: #8892b0;'>A neural voice cloning studio powered by SV2TTS technology</div>")
|
| 792 |
+
|
| 793 |
+
intro_audio = gr.Audio(visible=True, autoplay=True, elem_id="intro-audio")
|
| 794 |
+
|
| 795 |
+
# Compact 2x2 Grid
|
| 796 |
+
with gr.Row():
|
| 797 |
+
# Voice Deck (Scrollable Radio List)
|
| 798 |
+
with gr.Column(elem_classes=["studio-card"]):
|
| 799 |
+
gr.Markdown("<div class='card-title'>01. Voice Reference</div>")
|
| 800 |
+
|
| 801 |
+
# The Voice Deck
|
| 802 |
+
preset_dropdown = gr.Radio(
|
| 803 |
+
choices=["Custom Upload"] + sorted([k for k in SAMPLES.keys()]),
|
| 804 |
+
value="Custom Upload",
|
| 805 |
+
label="Voice Selection",
|
| 806 |
+
show_label=False,
|
| 807 |
+
elem_id="voice-deck",
|
| 808 |
+
interactive=True
|
| 809 |
+
)
|
| 810 |
+
|
| 811 |
+
audio_input = gr.Audio(type="filepath", label="Reference Sample", container=False, show_label=False, elem_id="audio-input")
|
| 812 |
+
|
| 813 |
+
with gr.Column():
|
| 814 |
+
with gr.Column(elem_classes=["studio-card"], elem_id="synthesis-output-card"):
|
| 815 |
+
gr.Markdown("<div class='card-title'>02. Synthesis Output</div>")
|
| 816 |
+
audio_output = gr.Audio(label="Generated Result", interactive=False, container=False, show_label=False, elem_id="audio-output")
|
| 817 |
+
|
| 818 |
+
# Input & Status Row (2x2 Grid Symmetry)
|
| 819 |
+
with gr.Row():
|
| 820 |
+
with gr.Column(elem_classes=["studio-card"]):
|
| 821 |
+
gr.Markdown("<div class='card-title'>03. Target Script</div>")
|
| 822 |
+
text_input = gr.Textbox(
|
| 823 |
+
label="Target text to synthesize",
|
| 824 |
+
placeholder="Enter audio text...",
|
| 825 |
+
lines=3,
|
| 826 |
+
show_label=False,
|
| 827 |
+
elem_id="script-box"
|
| 828 |
+
)
|
| 829 |
+
|
| 830 |
+
with gr.Column(elem_classes=["studio-card"]):
|
| 831 |
+
gr.Markdown("<div class='card-title'>04. System Status</div>")
|
| 832 |
+
status_info = gr.Textbox(
|
| 833 |
+
label="System Status",
|
| 834 |
+
value=STATUS_MSG,
|
| 835 |
+
interactive=False,
|
| 836 |
+
show_label=False,
|
| 837 |
+
elem_id="status-box"
|
| 838 |
+
)
|
| 839 |
+
|
| 840 |
+
# Controls
|
| 841 |
+
with gr.Row():
|
| 842 |
+
with gr.Column(scale=1):
|
| 843 |
+
reset_btn = gr.Button("Reset", variant="secondary", elem_id="reset-btn", elem_classes=["btn-secondary"])
|
| 844 |
+
with gr.Column(scale=1):
|
| 845 |
+
run_btn = gr.Button("Generate Voice Clone", variant="primary", elem_classes=["btn-primary"])
|
| 846 |
+
|
| 847 |
+
# Information Sections (Neat & Compact)
|
| 848 |
+
with gr.Row(elem_classes=["info-section"]):
|
| 849 |
+
with gr.Column():
|
| 850 |
+
gr.Markdown("<span class='info-header'>How it Works</span>")
|
| 851 |
+
gr.Markdown("Extracts speaker identity into a latent embedding to drive neural text-to-speech synthesis.")
|
| 852 |
+
with gr.Column():
|
| 853 |
+
gr.Markdown("<span class='info-header'>Privacy Notice</span>")
|
| 854 |
+
gr.Markdown("Audio is processed in memory and never stored. For educational and research use only.")
|
| 855 |
+
|
| 856 |
+
# Minimal Footer
|
| 857 |
+
with gr.Column(elem_classes=["footer"]):
|
| 858 |
+
gr.HTML("""
|
| 859 |
+
<div class='authorship'>
|
| 860 |
+
Created by <a href='https://github.com/Amey-Thakur' target='_blank'>Amey Thakur</a>
|
| 861 |
+
& <a href='https://github.com/msatmod' target='_blank'>Mega Satish</a>
|
| 862 |
+
</div>
|
| 863 |
+
<div style='margin-top: 12px;'>
|
| 864 |
+
<a href='https://github.com/Amey-Thakur/DEEPFAKE-AUDIO' target='_blank'>GitHub Repository</a> |
|
| 865 |
+
<a href='https://youtu.be/i3wnBcbHDbs' target='_blank'>YouTube Demo</a>
|
| 866 |
+
</div>
|
| 867 |
+
<p style='margin-top: 12px; opacity: 0.6;'>Β© 2021 Deepfake Audio Studio</p>
|
| 868 |
+
""")
|
| 869 |
+
|
| 870 |
+
# Events
|
| 871 |
+
run_btn.click(
|
| 872 |
+
fn=synthesize,
|
| 873 |
+
inputs=[audio_input, text_input],
|
| 874 |
+
outputs=[audio_output, status_info]
|
| 875 |
+
)
|
| 876 |
+
|
| 877 |
+
reset_btn.click(lambda: (None, "Custom Upload", "", None, STATUS_MSG), outputs=[audio_input, preset_dropdown, text_input, audio_output, status_info])
|
| 878 |
+
|
| 879 |
+
# Preset selection logic
|
| 880 |
+
def on_preset_change(name):
|
| 881 |
+
if name == "Custom Upload":
|
| 882 |
+
return None
|
| 883 |
+
return load_preset(name)
|
| 884 |
+
|
| 885 |
+
preset_dropdown.change(fn=on_preset_change, inputs=[preset_dropdown], outputs=[audio_input])
|
| 886 |
+
|
| 887 |
+
# Custom JS to force play because browser autoplay policies are strict
|
| 888 |
+
play_js = "() => { setTimeout(() => { const audio = document.querySelector('#intro-audio audio'); if (audio) audio.play(); }, 300); }"
|
| 889 |
+
intro_btn.click(fn=play_intro, outputs=intro_audio, js=play_js)
|
| 890 |
+
|
| 891 |
+
if __name__ == "__main__":
|
| 892 |
+
import argparse
|
| 893 |
+
parser = argparse.ArgumentParser()
|
| 894 |
+
parser.add_argument("--port", type=int, default=7860)
|
| 895 |
+
args = parser.parse_args()
|
| 896 |
+
|
| 897 |
+
print("=" * 60)
|
| 898 |
+
print("ποΈ DEEPFAKE AUDIO STUDIO")
|
| 899 |
+
print("=" * 60)
|
| 900 |
+
if MODELS_READY:
|
| 901 |
+
print("β
All AI models loaded - Full functionality available")
|
| 902 |
+
else:
|
| 903 |
+
print(f"β οΈ Demo Mode: {STARTUP_ERROR}")
|
| 904 |
+
print(f"π Open: http://localhost:{args.port}")
|
| 905 |
+
print("=" * 60)
|
| 906 |
+
|
| 907 |
+
demo.queue().launch(server_name="0.0.0.0", server_port=args.port, show_error=True, pwa=True, allowed_paths=[str(DATASET_DIR)])
|
Source Code/app_ui_demo.py
ADDED
|
@@ -0,0 +1,649 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - app_ui_demo.py (The UI Sandbox)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This script serves as a lightweight, "UI-Only" verification environment for the Deepfake
|
| 7 |
+
# Audio Studio. It is designed to run without the heavy neural dependencies (TensorFlow, PyTorch)
|
| 8 |
+
# used in the production pipeline, allowing designers and developers to iterate on the
|
| 9 |
+
# Human-Machine Interface (HMI) aesthetics without loading massive model weights.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
| 28 |
+
|
| 29 |
+
import os
|
| 30 |
+
import sys
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
# --- ENVIRONMENT CONFIGURATION ---
# We suppress lower-level system warnings to ensure the focus remains on the HMI presentation.
# These environment variables are set *before* the heavy imports below so that any
# library imported later (directly or transitively) observes them.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # TensorFlow C++ log level: 3 = fatal errors only
os.environ['PYTHONWARNINGS'] = 'ignore'   # also silences warnings in any spawned interpreters

import warnings
warnings.filterwarnings('ignore')  # mute Python-level warnings for a clean demo console
| 40 |
+
|
| 41 |
+
import numpy as np
|
| 42 |
+
import gradio as gr
|
| 43 |
+
import base64
|
| 44 |
+
|
# --- DIRECTORY ARCHITECTURE ---
PROJ_DIR = Path(__file__).parent.absolute()   # folder containing this script
ROOT_DIR = PROJ_DIR.parent                    # repository root
DATASET_DIR = ROOT_DIR / "Dataset"
SAMPLES_DIR = DATASET_DIR / "samples"         # bundled celebrity reference clips

# --- Sample Loading (Dynamic Discovery) ---
# Maps display name -> absolute .wav path. Initialized unconditionally because the
# UI below always reads SAMPLES.keys(); the original code never created the dict,
# which raised NameError on import whenever sample files were present.
SAMPLES = {}
if SAMPLES_DIR.exists():
    for f in SAMPLES_DIR.glob("*.wav"):
        stem = f.stem
        if "_preset" in stem:
            # e.g. "barack_obama_preset" -> "Barack Obama"
            name = stem.replace("_preset", "").replace("_", " ").title()
        else:
            # Curated files are already named for display (e.g. "Barack Obama.wav").
            name = stem
        SAMPLES[name] = str(f)
def load_preset(name):
    """Resolve a preset display name to its sample filepath, or None if unknown."""
    return SAMPLES.get(name)
| 67 |
+
|
# --- Mock Synthesis (UI Demo Only) ---
def synthesize(audio_file, text, progress=gr.Progress()):
    """Simulate the voice-cloning pipeline for UI review.

    Walks the progress bar through the stage names of the real pipeline but
    performs no inference: this sandbox build ships without the neural
    dependencies.

    Args:
        audio_file: Filepath of the reference recording, or None.
        text: Script text to "synthesize".
        progress: Gradio progress tracker (injected by the UI runtime).

    Returns:
        A (audio, status_message) tuple; audio is always None in demo mode.
    """
    if not audio_file or not text:
        return None, "Reference audio and text script are required."

    # Replay the real pipeline's stage labels so the progress bar looks authentic.
    stages = (
        (0.2, "Extracting Voice Identity"),
        (0.5, "Synthesizing Speech"),
        (0.8, "Generating High-Fidelity Audio"),
        (0.9, "Refining Audio Quality"),
        (1.0, "Finalizing"),
    )
    for fraction, label in stages:
        progress(fraction, desc=label)

    return None, "⚠️ UI Demo Mode - TensorFlow blocked by system policy. Use Docker to run full synthesis."
| 80 |
+
|
def play_intro():
    """Return the filepath of the intro narration clip, or None if it is absent."""
    clip = PROJ_DIR / "intro_message.wav"
    return str(clip) if clip.exists() else None
| 86 |
+
|
# --- NEON MIC ICON ---
# Inline the favicon as a base64 data URI so the CSS watermark and the <head>
# icon work without relying on Gradio's static file route.
try:
    with open(PROJ_DIR / "favicon.png", "rb") as f:
        encoded_icon = base64.b64encode(f.read()).decode('utf-8')
    NEON_MIC_ICON = f"data:image/png;base64,{encoded_icon}"
except OSError:
    # Narrowed from a bare `except:` (which also swallowed SystemExit/KeyboardInterrupt):
    # only a failed file read should trigger the fallback to Gradio's file route.
    NEON_MIC_ICON = "/file=favicon.png"
| 94 |
+
|
# --- Minimalist Navy & Orange UI ---
# Entire visual identity of the studio lives in this CSS string. The first segment
# is passed through .replace() to splice the base64 favicon into the body::before
# watermark; the second segment (after the `+`) is appended verbatim. Comments
# cannot be added inside the string without changing the served stylesheet, so all
# notes live here. NOTE(review): several rules are intentionally-or-accidentally
# duplicated (`* { font-family ... }`, the `div[role="option"]` groups, and
# `.studio-card:hover`); CSS merges duplicates, so they are left untouched.
custom_css = """
/* General Styles */
@import url('https://fonts.googleapis.com/css2?family=Play:wght@400;700&display=swap');

* {
    font-family: 'Play', sans-serif !important;
    -webkit-user-select: none;
    -moz-user-select: none;
    -ms-user-select: none;
    user-select: none;
}

/* Allow selection in inputs */
input, textarea, .gr-box, .gr-input, .gr-text-input {
    -webkit-user-select: text !important;
    -moz-user-select: text !important;
    -ms-user-select: text !important;
    user-select: text !important;
    cursor: text !important;
}

body {
    background-color: #0a192f !important;
    color: #ccd6f6 !important;
    overflow-x: hidden;
}

.gradio-container {
    background-color: transparent !important;
    position: relative !important;
    z-index: 1 !important;
}

/* Pseudo-element for the pattern to control opacity properly without affecting text */
body::before {
    content: "";
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    background-image: url('FAVICON_PLACEHOLDER') !important;
    background-repeat: repeat !important;
    background-size: 60px !important;
    opacity: 0.05 !important; /* Dimmed to 5% - Perfect watermark */
    pointer-events: none;
    z-index: 0;
}
""".replace("FAVICON_PLACEHOLDER", NEON_MIC_ICON) + """
.main-container {
    max-width: 950px !important;
    margin: 0 auto !important;
    padding: 20px !important;
}
#header h1 {
    font-size: 2.2rem;
    color: #ff8c00;
    margin-bottom: 0px;
    letter-spacing: -1px;
}
#header p { font-size: 0.9rem; color: #8892b0; margin-top: 5px; }

#intro-btn {
    background: transparent !important;
    border: none !important;
    font-size: 2.2rem !important;
    color: #ff8c00 !important;
    font-weight: 800 !important;
    cursor: pointer !important;
    box-shadow: none !important;
    padding: 0 !important;
    margin: 0 !important;
    transition: all 0.3s ease !important;
}

#intro-btn:hover {
    color: #ccd6f6 !important;
    transform: scale(1.02);
}

#intro-audio { display: none !important; }

.studio-card {
    background: #112240 !important;
    border: 1px solid #233554 !important;
    border-radius: 12px !important;
    padding: 10px !important;
    box-shadow: 0 10px 30px -15px rgba(2, 12, 27, 0.7) !important;
    margin-bottom: 6px !important;
    transition: transform 0.2s ease, border-color 0.2s ease !important;
    min-height: 180px !important;
    display: flex !important;
    flex-direction: column !important;
    overflow: visible !important;
}

.studio-card .prose {
    margin-bottom: 5px !important;
}

/* Force Play Font on everything */
* { font-family: 'Play', sans-serif !important; }
span, button, input, label, textarea, select { font-family: 'Play', sans-serif !important; }

.studio-card:hover {
    transform: translateY(-2px);
    border-color: #ff8c00 !important;
}

/* --- PRO CODE DEFINITIVE "ZERO-BLEED" FIX --- */

/* 1. FLATTEN THE WRAPPERS: Make specific outer containers invisible */
#script-box, #status-box,
#script-box > .form, #status-box > .form,
.gray-border, .form {
    background-color: transparent !important;
    background: transparent !important;
    border: none !important;
    box-shadow: none !important;
    padding: 0 !important;
}

/* 2. STYLE THE INNER INPUTS: Apply the theme ONLY to the interactive element */
#script-box textarea, #status-box input,
#preset-dropdown-container .gr-dropdown {
    background-color: #112240 !important; /* Perfect Match to Studio Card */
    border: 2px solid #ff8c00 !important;
    border-radius: 12px !important;
    color: #ccd6f6 !important;
    box-shadow: none !important;
}

/* Remove placeholder color interference */
#script-box textarea::placeholder { color: #555 !important; }

/* Focus States */
#script-box textarea:focus, #status-box input:focus {
    border-color: #ffb347 !important;
    box-shadow: 0 0 15px rgba(255, 140, 0, 0.4) !important;
}

/* --- PRO CODE "VOICE DECK" (SCROLLABLE LIST) --- */

/* The Voice Deck Container */
#voice-deck {
    max-height: 70px !important;
    overflow-y: auto !important;
    background-color: #0d1b2a !important; /* Slightly darker than card for depth */
    border: 2px solid #ff8c00 !important;
    border-radius: 12px !important;
    padding: 8px !important;
    margin-bottom: 10px !important;
    scrollbar-width: thin !important;
    scrollbar-color: #ff8c00 #0a192f !important;
}

/* Radio Item Styling (Premium Chips) */
#voice-deck label {
    display: flex !important;
    align-items: center !important;
    width: 100% !important;
    background: transparent !important;
    border: 1px solid #233554 !important;
    border-radius: 8px !important;
    margin-bottom: 4px !important;
    padding: 8px 12px !important;
    transition: all 0.2s ease !important;
    cursor: pointer !important;
    color: #8892b0 !important;
}

#voice-deck label:hover {
    background: #112240 !important;
    border-color: #ff8c00 !important;
    color: #ff8c00 !important;
}

#voice-deck label.selected, #voice-deck input:checked + span {
    background: #233554 !important;
    border-color: #ff8c00 !important;
    color: #ff8c00 !important;
    font-weight: bold !important;
    box-shadow: 0 0 10px rgba(255, 140, 0, 0.2) !important;
}

/* Hide the default radio circle */
#voice-deck input[type="radio"] {
    display: none !important;
}

/* Ensure text is nicely aligned */
#voice-deck span {
    margin-left: 0 !important;
    font-size: 0.95rem !important;
}

/* Audio Input Transparency Fix */
#audio-input, #audio-input .gr-input, #audio-input .gr-box, #audio-input .gr-block {
    background: transparent !important;
    background-color: transparent !important;
    border: none !important;
    box-shadow: none !important;
}


div[role="option"] {
    padding: 10px !important;
    color: #ccd6f6 !important;
    border-bottom: 1px solid #233554 !important;
}

div[role="option"]:hover, div[role="option"][aria-selected="true"] {
    background-color: #233554 !important;
    color: #ff8c00 !important;
}

div[role="listbox"]::-webkit-scrollbar {
    width: 6px !important;
}

div[role="listbox"]::-webkit-scrollbar-track {
    background: #0a192f !important;
    border-radius: 10px !important;
}

div[role="listbox"]::-webkit-scrollbar-thumb {
    background: #ff8c00 !important;
    border-radius: 10px !important;
    border: 1px solid #0a192f !important;
}

div[role="listbox"]::-webkit-scrollbar-thumb:hover {
    background: #ffa500 !important;
}

/* --- PREMIUM SCROLLBARS --- */
/* Global Scrollbar Styling */
::-webkit-scrollbar {
    width: 6px !important;
    height: 6px !important;
}

::-webkit-scrollbar-track {
    background: #0a192f !important;
}

::-webkit-scrollbar-thumb {
    background: #ff8c00 !important;
    border-radius: 20px !important;
    border: 1px solid #0a192f !important;
}

::-webkit-scrollbar-thumb:hover {
    background: #ffa500 !important;
    box-shadow: 0 0 10px rgba(255, 140, 0, 0.5) !important;
}

/* Ensure Audio Player Scrollbars match */
audio::-webkit-scrollbar {
    height: 6px !important;
}

div[role="option"] {
    background-color: transparent !important;
    color: #ccd6f6 !important;
}

div[role="option"]:hover, div[role="option"][aria-selected="true"] {
    background-color: #233554 !important;
    color: #ff8c00 !important;
}

.btn-primary {
    background: #ff8c00 !important;
    border: 2px solid #ff8c00 !important;
    color: #0a192f !important;
    font-weight: 800 !important;
    border-radius: 8px !important;
    padding: 0 20px !important;
    height: 50px !important;
    cursor: pointer !important;
    transition: all 0.3s ease !important;
    width: 100% !important;
}

.btn-primary:hover {
    background: transparent !important;
    color: #ff8c00 !important;
    border-color: #ff8c00 !important;
}

.btn-secondary {
    background: transparent !important;
    border: 1px solid #233554 !important;
    color: #8892b0 !important;
    border-radius: 8px !important;
    height: 50px !important;
    padding: 0 20px !important;
    transition: all 0.3s ease !important;
    width: 100% !important;
}

.btn-secondary:hover {
    background: #ff8c00 !important;
    color: #0a192f !important;
    border-color: #ff8c00 !important;
}

.info-section {
    font-size: 0.85rem;
    color: #8892b0;
    margin-top: 20px;
    padding: 0 15px;
    animation: fadeIn 0.8s ease-out;
}

.info-header {
    color: #ff8c00 !important;
    font-weight: 800 !important;
    margin-bottom: 5px !important;
    display: block !important;
}

.footer {
    text-align: center;
    margin-top: 40px;
    padding-top: 20px;
    border-top: 1px solid #233554;
    font-size: 0.8rem;
    color: #8892b0;
    animation: fadeIn 1s ease-out;
}

.footer a { color: #ff8c00; text-decoration: none; font-weight: 600; transition: opacity 0.2s; }
.footer a:hover { opacity: 0.8; }

.authorship {
    margin-bottom: 10px;
    font-weight: 600;
}

/* Dropdown Polish */
.gr-dropdown {
    background: #0a192f !important;
    border-color: #233554 !important;
    margin-bottom: 12px !important;
}
.gr-dropdown:focus-within {
    border-color: #ff8c00 !important;
}

/* Premium Progress Bar Styling */
.gr-progress {
    background-color: #0a192f !important;
    border: 1px solid #233554 !important;
    border-radius: 12px !important;
    height: 38px !important;
    overflow: hidden !important;
    box-shadow: inset 0 2px 4px rgba(0,0,0,0.3) !important;
}

.gr-progress .progress-level {
    background: linear-gradient(90deg, #ff8c00, #ffb347) !important;
    border-radius: 10px !important;
    box-shadow: 0 0 20px rgba(255, 140, 0, 0.4) !important;
    height: 100% !important;
    transition: width 0.4s cubic-bezier(0.4, 0, 0.2, 1) !important;
    position: relative !important;
    overflow: hidden !important;
}

.gr-progress .progress-level::after {
    content: "" !important;
    position: absolute !important;
    top: 0 !important;
    left: -150% !important;
    width: 100% !important;
    height: 100% !important;
    background: linear-gradient(
        90deg,
        transparent,
        rgba(255, 255, 255, 0.4),
        transparent
    ) !important;
    animation: apple-shimmer 2s infinite linear !important;
}

@keyframes apple-shimmer {
    0% { left: -150%; }
    100% { left: 150%; }
}

.gr-progress .progress-text {
    color: #ffffff !important;
    font-family: 'Play', sans-serif !important;
    font-weight: 700 !important;
    font-size: 0.85rem !important;
    line-height: 38px !important;
    text-shadow: 0 1px 2px rgba(0,0,0,0.5) !important;
    letter-spacing: 0.5px !important;
    position: relative !important;
    z-index: 5 !important;
}

.progress-container {
    padding: 0 !important;
    margin: 0 !important;
}

/* Subtle Animations */
@keyframes fadeIn {
    from { opacity: 0; transform: translateY(8px); }
    to { opacity: 1; transform: translateY(0); }
}

.main-container { animation: fadeIn 0.6s ease-out; }
.studio-card { transition: transform 0.2s ease, border-color 0.2s ease !important; }
.studio-card:hover { transform: translateY(-2px); border-color: #ff8c00 !important; }
"""
| 515 |
+
|
# Gradio theme mirroring custom_css: orange accents over a navy backdrop.
# .set() returns the theme instance, so the final object is identical to the
# original chained construction.
_base_theme = gr.themes.Default(
    primary_hue="orange",
    secondary_hue="slate",
)
theme = _base_theme.set(
    body_background_fill="#0a192f",
    block_background_fill="#112240",
    input_background_fill="#0a192f",
    input_border_color="#233554",
)
| 525 |
+
|
# --- Low-Level Security & Easter Egg ---
# Front-end script injected at app load: disables the right-click context menu
# (showing an attribution alert) and prints a console warning banner. This is
# cosmetic deterrence only - it provides no real protection.
custom_js = """
function() {
    document.addEventListener('contextmenu', event => {
        event.preventDefault();
        alert('Security Protocol Engaged: System protected by Amey Thakur & Mega Satish');
        console.warn('Security Alert: Unauthorized access attempt detected.');
    });
    console.log("%c STOP! %c You are entering a protected zone.", "color: red; font-size: 50px; font-weight: bold;", "color: white; font-size: 20px;");
}
"""
| 537 |
+
|
# Inject Favicon via Head (Reliable) and Security JS
# Layout: a 2x2 card grid (voice reference / output / script / status) plus
# controls, info text, and footer, followed by the event wiring. All event
# listeners are registered inside the Blocks context, as Gradio requires.
# NOTE(review): indentation reconstructed from the context-manager structure;
# the subtitle Markdown is placed at header level (sibling of the title Row) -
# confirm against the original layout.
with gr.Blocks(title="Deepfake Audio Studio", theme=theme, css=custom_css, js=custom_js, head='<link rel="icon" type="image/png" href="{}">'.format(NEON_MIC_ICON)) as demo:
    with gr.Column(elem_classes=["main-container"]):

        # Minimal Header
        with gr.Column(elem_id="header"):
            with gr.Row():
                intro_btn = gr.Button("🎙️ Deepfake Audio", elem_id="intro-btn")
            gr.Markdown("<div style='text-align: center; margin-top: 5px; margin-bottom: 50px; color: #8892b0;'>A neural voice cloning studio powered by SV2TTS technology</div>")

        # Hidden audio element; CSS hides it, JS below triggers playback.
        intro_audio = gr.Audio(visible=True, autoplay=True, elem_id="intro-audio")

        # Compact 2x2 Grid
        with gr.Row():
            # Voice Deck (Scrollable Radio List)
            with gr.Column(elem_classes=["studio-card"]):
                gr.Markdown("<div class='card-title'>01. Voice Reference</div>")

                # The Voice Deck
                preset_dropdown = gr.Radio(
                    choices=["Custom Upload"] + sorted([k for k in SAMPLES.keys()]),
                    value="Custom Upload",
                    label="Voice Selection",
                    show_label=False,
                    elem_id="voice-deck",
                    interactive=True
                )

                audio_input = gr.Audio(type="filepath", label="Reference Sample", container=False, show_label=False, elem_id="audio-input")

            with gr.Column():
                with gr.Column(elem_classes=["studio-card"], elem_id="synthesis-output-card"):
                    gr.Markdown("<div class='card-title'>02. Synthesis Output</div>")
                    audio_output = gr.Audio(label="Generated Result", interactive=False, container=False, show_label=False, elem_id="audio-output")

        # Input & Status Row (2x2 Grid Symmetry)
        with gr.Row():
            with gr.Column(elem_classes=["studio-card"]):
                gr.Markdown("<div class='card-title'>03. Target Script</div>")
                text_input = gr.Textbox(
                    label="Target text to synthesize",
                    placeholder="Enter audio text...",
                    lines=3,
                    show_label=False,
                    elem_id="script-box"
                )

            with gr.Column(elem_classes=["studio-card"]):
                gr.Markdown("<div class='card-title'>04. System Status</div>")
                status_info = gr.Textbox(
                    label="System Status",
                    value="⚠️ UI Demo Mode - TensorFlow blocked by system policy.",
                    interactive=False,
                    show_label=False,
                    elem_id="status-box"
                )

        # Controls
        with gr.Row():
            with gr.Column(scale=1):
                reset_btn = gr.Button("Reset", variant="secondary", elem_id="reset-btn", elem_classes=["btn-secondary"])
            with gr.Column(scale=1):
                run_btn = gr.Button("Generate Voice Clone", variant="primary", elem_classes=["btn-primary"])

        # Information Sections (Neat & Compact)
        with gr.Row(elem_classes=["info-section"]):
            with gr.Column():
                gr.Markdown("<span class='info-header'>How it Works</span>")
                gr.Markdown("Extracts speaker identity into a latent embedding to drive neural text-to-speech synthesis.")
            with gr.Column():
                gr.Markdown("<span class='info-header'>Privacy Notice</span>")
                gr.Markdown("Audio is processed in memory and never stored. For educational and research use only.")

        # Minimal Footer
        with gr.Column(elem_classes=["footer"]):
            gr.HTML("""
                <div class='authorship'>
                    Created by <a href='https://github.com/Amey-Thakur' target='_blank'>Amey Thakur</a>
                    & <a href='https://github.com/msatmod' target='_blank'>Mega Satish</a>
                </div>
                <div style='margin-top: 12px;'>
                    <a href='https://github.com/Amey-Thakur/DEEPFAKE-AUDIO' target='_blank'>GitHub Repository</a> |
                    <a href='https://youtu.be/i3wnBcbHDbs' target='_blank'>YouTube Demo</a>
                </div>
                <p style='margin-top: 12px; opacity: 0.6;'>© 2021 Deepfake Audio Studio</p>
            """)

    # Events
    run_btn.click(
        fn=synthesize,
        inputs=[audio_input, text_input],
        outputs=[audio_output, status_info]
    )

    # Reset restores all five output components to their initial demo state.
    reset_btn.click(lambda: (None, "Custom Upload", "", None, "⚠️ UI Demo Mode"), outputs=[audio_input, preset_dropdown, text_input, audio_output, status_info])

    # Preset selection logic
    def on_preset_change(name):
        # "Custom Upload" clears the reference slot; otherwise load the preset wav.
        if name == "Custom Upload":
            return None
        return load_preset(name)

    preset_dropdown.change(fn=on_preset_change, inputs=[preset_dropdown], outputs=[audio_input])

    # Custom JS to force play because browser autoplay policies are strict
    play_js = "() => { setTimeout(() => { const audio = document.querySelector('#intro-audio audio'); if (audio) audio.play(); }, 300); }"
    intro_btn.click(fn=play_intro, outputs=intro_audio, js=play_js)
| 645 |
+
|
if __name__ == "__main__":
    # Local entry point: queue() enables the progress-event stream used by
    # synthesize(); 0.0.0.0 exposes the server inside containers/Spaces;
    # pwa=True lets browsers install the app as a progressive web app.
    print("🎙️ Launching Deepfake Audio Studio (UI Demo Mode)...")
    print("🌐 Open: http://localhost:7860")
    demo.queue().launch(server_name="0.0.0.0", server_port=7860, show_error=True, pwa=True)
Source Code/demo_cli.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - demo_cli.py (Command Line Entry)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This script provides a terminal-based interface to the Deepfake Audio synthesis pipeline.
|
| 7 |
+
# It is designed for researchers and developers who require a GUI-less environment to
|
| 8 |
+
# perform batch inference, integration testing, or low-level parameter exploration
|
| 9 |
+
# of the Speaker Verification to Transfer Learning (SV2TTS) system.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
| 28 |
+
|
| 29 |
+
import argparse
|
| 30 |
+
import os
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
import librosa
|
| 33 |
+
import numpy as np
|
| 34 |
+
import soundfile as sf
|
| 35 |
+
import torch
|
| 36 |
+
|
| 37 |
+
# --- INTERNAL ARCHITECTURE ---
|
| 38 |
+
# These modules encapsulate the multi-stage neural logic.
|
| 39 |
+
from encoder import inference as encoder
|
| 40 |
+
from encoder.params_model import model_embedding_size as speaker_embedding_size
|
| 41 |
+
from synthesizer.inference import Synthesizer
|
| 42 |
+
from utils.argutils import print_args
|
| 43 |
+
from utils.default_models import ensure_default_models
|
| 44 |
+
from vocoder import inference as vocoder
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
if __name__ == '__main__':
|
| 48 |
+
parser = argparse.ArgumentParser(
|
| 49 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
| 50 |
+
)
|
| 51 |
+
parser.add_argument("-e", "--enc_model_fpath", type=Path,
|
| 52 |
+
default="saved_models/default/encoder.pt",
|
| 53 |
+
help="Path to a saved encoder")
|
| 54 |
+
parser.add_argument("-s", "--syn_model_fpath", type=Path,
|
| 55 |
+
default="saved_models/default/synthesizer.pt",
|
| 56 |
+
help="Path to a saved synthesizer")
|
| 57 |
+
parser.add_argument("-v", "--voc_model_fpath", type=Path,
|
| 58 |
+
default="saved_models/default/vocoder.pt",
|
| 59 |
+
help="Path to a saved vocoder")
|
| 60 |
+
parser.add_argument("--cpu", action="store_true", help=\
|
| 61 |
+
"If True, processing is done on CPU, even when a GPU is available.")
|
| 62 |
+
parser.add_argument("--no_sound", action="store_true", help=\
|
| 63 |
+
"If True, audio won't be played.")
|
| 64 |
+
parser.add_argument("--seed", type=int, default=None, help=\
|
| 65 |
+
"Optional random number seed value to make toolbox deterministic.")
|
| 66 |
+
args = parser.parse_args()
|
| 67 |
+
arg_dict = vars(args)
|
| 68 |
+
print_args(args, parser)
|
| 69 |
+
|
| 70 |
+
# Hide GPUs from Pytorch to force CPU processing
|
| 71 |
+
if arg_dict.pop("cpu"):
|
| 72 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
| 73 |
+
|
| 74 |
+
print("Running a test of your configuration...\n")
|
| 75 |
+
|
| 76 |
+
if torch.cuda.is_available():
|
| 77 |
+
device_id = torch.cuda.current_device()
|
| 78 |
+
gpu_properties = torch.cuda.get_device_properties(device_id)
|
| 79 |
+
## Print some environment information (for debugging purposes)
|
| 80 |
+
print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
|
| 81 |
+
"%.1fGb total memory.\n" %
|
| 82 |
+
(torch.cuda.device_count(),
|
| 83 |
+
device_id,
|
| 84 |
+
gpu_properties.name,
|
| 85 |
+
gpu_properties.major,
|
| 86 |
+
gpu_properties.minor,
|
| 87 |
+
gpu_properties.total_memory / 1e9))
|
| 88 |
+
else:
|
| 89 |
+
print("Using CPU for inference.\n")
|
| 90 |
+
|
| 91 |
+
## Load the models one by one.
|
| 92 |
+
print("Preparing the encoder, the synthesizer and the vocoder...")
|
| 93 |
+
ensure_default_models(Path("saved_models"))
|
| 94 |
+
encoder.load_model(args.enc_model_fpath)
|
| 95 |
+
synthesizer = Synthesizer(args.syn_model_fpath)
|
| 96 |
+
vocoder.load_model(args.voc_model_fpath)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
## Run a test
|
| 100 |
+
print("Testing your configuration with small inputs.")
|
| 101 |
+
# Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's
|
| 102 |
+
# sampling rate, which may differ.
|
| 103 |
+
# If you're unfamiliar with digital audio, know that it is encoded as an array of floats
|
| 104 |
+
# (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1.
|
| 105 |
+
# The sampling rate is the number of values (samples) recorded per second, it is set to
|
| 106 |
+
# 16000 for the encoder. Creating an array of length <sampling_rate> will always correspond
|
| 107 |
+
# to an audio of 1 second.
|
| 108 |
+
print("\tTesting the encoder...")
|
| 109 |
+
encoder.embed_utterance(np.zeros(encoder.sampling_rate))
|
| 110 |
+
|
| 111 |
+
# Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance
|
| 112 |
+
# returns, but here we're going to make one ourselves just for the sake of showing that it's
|
| 113 |
+
# possible.
|
| 114 |
+
embed = np.random.rand(speaker_embedding_size)
|
| 115 |
+
# Embeddings are L2-normalized (this isn't important here, but if you want to make your own
|
| 116 |
+
# embeddings it will be).
|
| 117 |
+
embed /= np.linalg.norm(embed)
|
| 118 |
+
# The synthesizer can handle multiple inputs with batching. Let's create another embedding to
|
| 119 |
+
# illustrate that
|
| 120 |
+
embeds = [embed, np.zeros(speaker_embedding_size)]
|
| 121 |
+
texts = ["test 1", "test 2"]
|
| 122 |
+
print("\tTesting the synthesizer... (loading the model will output a lot of text)")
|
| 123 |
+
mels = synthesizer.synthesize_spectrograms(texts, embeds)
|
| 124 |
+
|
| 125 |
+
# The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
|
| 126 |
+
# can concatenate the mel spectrograms to a single one.
|
| 127 |
+
mel = np.concatenate(mels, axis=1)
|
| 128 |
+
# The vocoder can take a callback function to display the generation. More on that later. For
|
| 129 |
+
# now we'll simply hide it like this:
|
| 130 |
+
no_action = lambda *args: None
|
| 131 |
+
print("\tTesting the vocoder...")
|
| 132 |
+
# For the sake of making this test short, we'll pass a short target length. The target length
|
| 133 |
+
# is the length of the wav segments that are processed in parallel. E.g. for audio sampled
|
| 134 |
+
# at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of
|
| 135 |
+
# 0.5 seconds which will all be generated together. The parameters here are absurdly short, and
|
| 136 |
+
# that has a detrimental effect on the quality of the audio. The default parameters are
|
| 137 |
+
# recommended in general.
|
| 138 |
+
vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action)
|
| 139 |
+
|
| 140 |
+
print("All test passed! You can now synthesize speech.\n\n")
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
## Interactive speech generation
|
| 144 |
+
print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to "
|
| 145 |
+
"show how you can interface this project easily with your own. See the source code for "
|
| 146 |
+
"an explanation of what is happening.\n")
|
| 147 |
+
|
| 148 |
+
print("Interactive generation loop")
|
| 149 |
+
num_generated = 0
|
| 150 |
+
while True:
|
| 151 |
+
try:
|
| 152 |
+
# Get the reference audio filepath
|
| 153 |
+
message = "Reference voice: enter an audio filepath of a voice to be cloned (mp3, " \
|
| 154 |
+
"wav, m4a, flac, ...):\n"
|
| 155 |
+
in_fpath = Path(input(message).replace("\"", "").replace("\'", ""))
|
| 156 |
+
|
| 157 |
+
## Computing the embedding
|
| 158 |
+
# First, we load the wav using the function that the speaker encoder provides. This is
|
| 159 |
+
# important: there is preprocessing that must be applied.
|
| 160 |
+
|
| 161 |
+
# The following two methods are equivalent:
|
| 162 |
+
# - Directly load from the filepath:
|
| 163 |
+
preprocessed_wav = encoder.preprocess_wav(in_fpath)
|
| 164 |
+
# - If the wav is already loaded:
|
| 165 |
+
original_wav, sampling_rate = librosa.load(str(in_fpath))
|
| 166 |
+
preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
|
| 167 |
+
print("Loaded file succesfully")
|
| 168 |
+
|
| 169 |
+
# Then we derive the embedding. There are many functions and parameters that the
|
| 170 |
+
# speaker encoder interfaces. These are mostly for in-depth research. You will typically
|
| 171 |
+
# only use this function (with its default parameters):
|
| 172 |
+
embed = encoder.embed_utterance(preprocessed_wav)
|
| 173 |
+
print("Created the embedding")
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
## Generating the spectrogram
|
| 177 |
+
text = input("Write a sentence (+-20 words) to be synthesized:\n")
|
| 178 |
+
|
| 179 |
+
# If seed is specified, reset torch seed and force synthesizer reload
|
| 180 |
+
if args.seed is not None:
|
| 181 |
+
torch.manual_seed(args.seed)
|
| 182 |
+
synthesizer = Synthesizer(args.syn_model_fpath)
|
| 183 |
+
|
| 184 |
+
# The synthesizer works in batch, so you need to put your data in a list or numpy array
|
| 185 |
+
texts = [text]
|
| 186 |
+
embeds = [embed]
|
| 187 |
+
# If you know what the attention layer alignments are, you can retrieve them here by
|
| 188 |
+
# passing return_alignments=True
|
| 189 |
+
specs = synthesizer.synthesize_spectrograms(texts, embeds)
|
| 190 |
+
spec = specs[0]
|
| 191 |
+
print("Created the mel spectrogram")
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
## Generating the waveform
|
| 195 |
+
print("Synthesizing the waveform:")
|
| 196 |
+
|
| 197 |
+
# If seed is specified, reset torch seed and reload vocoder
|
| 198 |
+
if args.seed is not None:
|
| 199 |
+
torch.manual_seed(args.seed)
|
| 200 |
+
vocoder.load_model(args.voc_model_fpath)
|
| 201 |
+
|
| 202 |
+
# Synthesizing the waveform is fairly straightforward. Remember that the longer the
|
| 203 |
+
# spectrogram, the more time-efficient the vocoder.
|
| 204 |
+
generated_wav = vocoder.infer_waveform(spec)
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
## Post-generation
|
| 208 |
+
# There's a bug with sounddevice that makes the audio cut one second earlier, so we
|
| 209 |
+
# pad it.
|
| 210 |
+
generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
|
| 211 |
+
|
| 212 |
+
# Trim excess silences to compensate for gaps in spectrograms (issue #53)
|
| 213 |
+
generated_wav = encoder.preprocess_wav(generated_wav)
|
| 214 |
+
|
| 215 |
+
# Play the audio (non-blocking)
|
| 216 |
+
if not args.no_sound:
|
| 217 |
+
import sounddevice as sd
|
| 218 |
+
try:
|
| 219 |
+
sd.stop()
|
| 220 |
+
sd.play(generated_wav, synthesizer.sample_rate)
|
| 221 |
+
except sd.PortAudioError as e:
|
| 222 |
+
print("\nCaught exception: %s" % repr(e))
|
| 223 |
+
print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")
|
| 224 |
+
except:
|
| 225 |
+
raise
|
| 226 |
+
|
| 227 |
+
# Save it on the disk
|
| 228 |
+
filename = "demo_output_%02d.wav" % num_generated
|
| 229 |
+
print(generated_wav.dtype)
|
| 230 |
+
sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
|
| 231 |
+
num_generated += 1
|
| 232 |
+
print("\nSaved output as %s\n\n" % filename)
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
except Exception as e:
|
| 236 |
+
print("Caught exception: %s" % repr(e))
|
| 237 |
+
print("Restarting\n")
|
Source Code/demo_toolbox.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - demo_toolbox.py (Legacy Research Interface)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This script launches the original Qt5-based Research Toolbox. While the modern Gradio
|
| 7 |
+
# interface is the preferred entry point for general studio use, the Toolbox remains a
|
| 8 |
+
# critical asset for in-depth data visualization, cross-dataset exploration, and
|
| 9 |
+
# laboratory-grade synthesis auditing.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
| 28 |
+
|
| 29 |
+
import argparse
|
| 30 |
+
import os
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
# --- CORE TOOLBOX ENGINE ---
|
| 34 |
+
from toolbox import Toolbox
|
| 35 |
+
from utils.argutils import print_args
|
| 36 |
+
from utils.default_models import ensure_default_models
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
if __name__ == '__main__':
|
| 40 |
+
parser = argparse.ArgumentParser(
|
| 41 |
+
description="Runs the toolbox.",
|
| 42 |
+
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
parser.add_argument("-d", "--datasets_root", type=Path, help= \
|
| 46 |
+
"Path to the directory containing your datasets. See toolbox/__init__.py for a list of "
|
| 47 |
+
"supported datasets.", default=None)
|
| 48 |
+
parser.add_argument("-m", "--models_dir", type=Path, default="saved_models",
|
| 49 |
+
help="Directory containing all saved models")
|
| 50 |
+
parser.add_argument("--cpu", action="store_true", help=\
|
| 51 |
+
"If True, all inference will be done on CPU")
|
| 52 |
+
parser.add_argument("--seed", type=int, default=None, help=\
|
| 53 |
+
"Optional random number seed value to make toolbox deterministic.")
|
| 54 |
+
args = parser.parse_args()
|
| 55 |
+
arg_dict = vars(args)
|
| 56 |
+
print_args(args, parser)
|
| 57 |
+
|
| 58 |
+
# Hide GPUs from Pytorch to force CPU processing
|
| 59 |
+
if arg_dict.pop("cpu"):
|
| 60 |
+
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
|
| 61 |
+
|
| 62 |
+
# Remind the user to download pretrained models if needed
|
| 63 |
+
ensure_default_models(args.models_dir)
|
| 64 |
+
|
| 65 |
+
# Launch the toolbox
|
| 66 |
+
Toolbox(**arg_dict)
|
Source Code/encoder/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/__init__.py (Neural Module Initialization)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This initialization script defines the 'encoder' as a Python package, facilitating
|
| 7 |
+
# structured access to neural speaker identity derivation modules. It ensures that
|
| 8 |
+
# internal utilities like audio preprocessing and inference engines are correctly
|
| 9 |
+
# namespaced within the SV2TTS ecosystem.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
Source Code/encoder/audio.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/audio.py (Acoustic Signal Processing)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module implements the acoustic primitives required for the Speaker Encoder.
|
| 7 |
+
# It handles waveform normalization, resampling, and most importantly, the
|
| 8 |
+
# transformation of raw time-domain signals into frequency-domain Mel-Spectrograms.
|
| 9 |
+
# It also integrates Voice Activity Detection (VAD) via 'webrtcvad' to ensure that
|
| 10 |
+
# only active speech segments are passed to the neural distillation layers.
|
| 11 |
+
#
|
| 12 |
+
# π€ AUTHORS
|
| 13 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 14 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 15 |
+
#
|
| 16 |
+
# π€π» CREDITS
|
| 17 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 18 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 19 |
+
#
|
| 20 |
+
# π PROJECT LINKS
|
| 21 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 22 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 23 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 24 |
+
#
|
| 25 |
+
# π LICENSE
|
| 26 |
+
# Released under the MIT License
|
| 27 |
+
# Release Date: 2021-02-06
|
| 28 |
+
# ==================================================================================================
|
| 29 |
+
|
| 30 |
+
from scipy.ndimage import binary_dilation
|
| 31 |
+
from encoder.params_data import *
|
| 32 |
+
from pathlib import Path
|
| 33 |
+
from typing import Optional, Union
|
| 34 |
+
from warnings import warn
|
| 35 |
+
import numpy as np
|
| 36 |
+
import librosa
|
| 37 |
+
import struct
|
| 38 |
+
|
| 39 |
+
# --- VAD INITIALIZATION ---
|
| 40 |
+
try:
|
| 41 |
+
import webrtcvad
|
| 42 |
+
except:
|
| 43 |
+
warn("β οΈ Scholarly Warning: 'webrtcvad' not detected. Noise removal and silence trimming will be bypassed.")
|
| 44 |
+
webrtcvad = None
|
| 45 |
+
|
| 46 |
+
int16_max = (2 ** 15) - 1
|
| 47 |
+
|
| 48 |
+
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
|
| 49 |
+
source_sr: Optional[int] = None,
|
| 50 |
+
normalize: Optional[bool] = True,
|
| 51 |
+
trim_silence: Optional[bool] = True):
|
| 52 |
+
"""
|
| 53 |
+
Orchestrates the acoustic normalization pipeline.
|
| 54 |
+
1. Loads signal from disk or buffer.
|
| 55 |
+
2. Resamples to training-specific frequencies.
|
| 56 |
+
3. Normalizes volume (dBFS).
|
| 57 |
+
4. Trims non-speech intervals (if VAD is active).
|
| 58 |
+
"""
|
| 59 |
+
# Defensive Input Handling
|
| 60 |
+
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
|
| 61 |
+
wav, source_sr = librosa.load(str(fpath_or_wav), sr=None)
|
| 62 |
+
else:
|
| 63 |
+
wav = fpath_or_wav
|
| 64 |
+
|
| 65 |
+
# Frequency Alignment
|
| 66 |
+
if source_sr is not None and source_sr != sampling_rate:
|
| 67 |
+
wav = librosa.resample(y=wav, orig_sr=source_sr, target_sr=sampling_rate)
|
| 68 |
+
|
| 69 |
+
# Amplitude Normalization
|
| 70 |
+
if normalize:
|
| 71 |
+
wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
|
| 72 |
+
|
| 73 |
+
# Temporal Compression (Silence Removal)
|
| 74 |
+
if webrtcvad and trim_silence:
|
| 75 |
+
wav = trim_long_silences(wav)
|
| 76 |
+
|
| 77 |
+
return wav
|
| 78 |
+
|
| 79 |
+
def wav_to_mel_spectrogram(wav):
|
| 80 |
+
"""
|
| 81 |
+
Distills a time-domain waveform into a frequency-domain Mel-Spectrogram matrix.
|
| 82 |
+
This serves as the primary input for the Speaker Encoder neural network.
|
| 83 |
+
"""
|
| 84 |
+
frames = librosa.feature.melspectrogram(
|
| 85 |
+
y=wav,
|
| 86 |
+
sr=sampling_rate,
|
| 87 |
+
n_fft=int(sampling_rate * mel_window_length / 1000),
|
| 88 |
+
hop_length=int(sampling_rate * mel_window_step / 1000),
|
| 89 |
+
n_mels=mel_n_channels
|
| 90 |
+
)
|
| 91 |
+
return frames.astype(np.float32).T
|
| 92 |
+
|
| 93 |
+
def trim_long_silences(wav):
|
| 94 |
+
"""
|
| 95 |
+
Utilizes WebRTC Voice Activity Detection (VAD) to excise non-semantic silences.
|
| 96 |
+
Ensures the speaker identity is extracted from high-entropy speech segments only.
|
| 97 |
+
"""
|
| 98 |
+
# Spatial Decomposition into temporal windows
|
| 99 |
+
samples_per_window = (vad_window_length * sampling_rate) // 1000
|
| 100 |
+
wav = wav[:len(wav) - (len(wav) % samples_per_window)]
|
| 101 |
+
|
| 102 |
+
# Binary Serialization for VAD compatibility (16-bit PCM)
|
| 103 |
+
pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
|
| 104 |
+
|
| 105 |
+
# Statistical Speech Filtering
|
| 106 |
+
voice_flags = []
|
| 107 |
+
vad = webrtcvad.Vad(mode=3) # Aggressive Filtering
|
| 108 |
+
for window_start in range(0, len(wav), samples_per_window):
|
| 109 |
+
window_end = window_start + samples_per_window
|
| 110 |
+
voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
|
| 111 |
+
sample_rate=sampling_rate))
|
| 112 |
+
voice_flags = np.array(voice_flags)
|
| 113 |
+
|
| 114 |
+
# Temporal Smoothing (Moving Average)
|
| 115 |
+
def moving_average(array, width):
|
| 116 |
+
array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
|
| 117 |
+
ret = np.cumsum(array_padded, dtype=float)
|
| 118 |
+
ret[width:] = ret[width:] - ret[:-width]
|
| 119 |
+
return ret[width - 1:] / width
|
| 120 |
+
|
| 121 |
+
audio_mask = moving_average(voice_flags, vad_moving_average_width)
|
| 122 |
+
audio_mask = np.round(audio_mask).astype(bool)
|
| 123 |
+
|
| 124 |
+
# Morphological Dilation to preserve speech boundaries
|
| 125 |
+
audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
|
| 126 |
+
audio_mask = np.repeat(audio_mask, samples_per_window)
|
| 127 |
+
|
| 128 |
+
return wav[audio_mask == True]
|
| 129 |
+
|
| 130 |
+
def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
|
| 131 |
+
"""Calibrates the signal's energy level to a target Decibel Full Scale (dBFS)."""
|
| 132 |
+
if increase_only and decrease_only:
|
| 133 |
+
raise ValueError("Conflict: Both increase and decrease flags are active.")
|
| 134 |
+
|
| 135 |
+
dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
|
| 136 |
+
|
| 137 |
+
if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
|
| 138 |
+
return wav
|
| 139 |
+
|
| 140 |
+
return wav * (10 ** (dBFS_change / 20))
|
Source Code/encoder/config.py
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/config.py (Dataset Manifest & Corpora Configuration)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module acts as the centralized registry for all speech datasets supported
|
| 7 |
+
# by the Speaker Encoder. It defines the directory structures for LibriSpeech,
|
| 8 |
+
# LibriTTS, VoxCeleb, and other major open-source corpora. These configurations
|
| 9 |
+
# guide the preprocessing scripts in discovering and categorizing audio samples
|
| 10 |
+
# for training and validation.
|
| 11 |
+
#
|
| 12 |
+
# π€ AUTHORS
|
| 13 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 14 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 15 |
+
#
|
| 16 |
+
# π€π» CREDITS
|
| 17 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 18 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 19 |
+
#
|
| 20 |
+
# π PROJECT LINKS
|
| 21 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 22 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 23 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 24 |
+
#
|
| 25 |
+
# π LICENSE
|
| 26 |
+
# Released under the MIT License
|
| 27 |
+
# Release Date: 2021-02-06
|
| 28 |
+
# ==================================================================================================
|
| 29 |
+
|
| 30 |
+
# --- LIBRISPEECH MANIFEST ---
|
| 31 |
+
librispeech_datasets = {
|
| 32 |
+
"train": {
|
| 33 |
+
"clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
|
| 34 |
+
"other": ["LibriSpeech/train-other-500"]
|
| 35 |
+
},
|
| 36 |
+
"test": {
|
| 37 |
+
"clean": ["LibriSpeech/test-clean"],
|
| 38 |
+
"other": ["LibriSpeech/test-other"]
|
| 39 |
+
},
|
| 40 |
+
"dev": {
|
| 41 |
+
"clean": ["LibriSpeech/dev-clean"],
|
| 42 |
+
"other": ["LibriSpeech/dev-other"]
|
| 43 |
+
},
|
| 44 |
+
}
|
| 45 |
+
|
| 46 |
+
# --- LIBRITTS MANIFEST ---
|
| 47 |
+
libritts_datasets = {
|
| 48 |
+
"train": {
|
| 49 |
+
"clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
|
| 50 |
+
"other": ["LibriTTS/train-other-500"]
|
| 51 |
+
},
|
| 52 |
+
"test": {
|
| 53 |
+
"clean": ["LibriTTS/test-clean"],
|
| 54 |
+
"other": ["LibriTTS/test-other"]
|
| 55 |
+
},
|
| 56 |
+
"dev": {
|
| 57 |
+
"clean": ["LibriTTS/dev-clean"],
|
| 58 |
+
"other": ["LibriTTS/dev-other"]
|
| 59 |
+
},
|
| 60 |
+
}
|
| 61 |
+
|
| 62 |
+
# --- VOXCELEB MANIFEST (SPEAKER RECOGNITION) ---
|
| 63 |
+
voxceleb_datasets = {
|
| 64 |
+
"voxceleb1" : {
|
| 65 |
+
"train": ["VoxCeleb1/wav"],
|
| 66 |
+
"test": ["VoxCeleb1/test_wav"]
|
| 67 |
+
},
|
| 68 |
+
"voxceleb2" : {
|
| 69 |
+
"train": ["VoxCeleb2/dev/aac"],
|
| 70 |
+
"test": ["VoxCeleb2/test_wav"]
|
| 71 |
+
}
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
# --- MISCELLANEOUS CORPORA ---
|
| 75 |
+
other_datasets = [
|
| 76 |
+
"LJSpeech-1.1",
|
| 77 |
+
"VCTK-Corpus/wav48",
|
| 78 |
+
]
|
| 79 |
+
|
| 80 |
+
# --- LINGUISTIC CATEGORIZATION ---
|
| 81 |
+
anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
|
Source Code/encoder/data_objects/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/data_objects/__init__.py (Data Layer Initialization)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This initialization script exposes the primary data orchestration classes for
|
| 7 |
+
# speaker verification. It facilitates structured access to datasets and
|
| 8 |
+
# loaders, abstraction layers that underpin the neural training pipeline.
|
| 9 |
+
#
|
| 10 |
+
# π€ AUTHORS
|
| 11 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 12 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 13 |
+
#
|
| 14 |
+
# π€π» CREDITS
|
| 15 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 16 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 17 |
+
#
|
| 18 |
+
# π PROJECT LINKS
|
| 19 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 20 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 21 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 22 |
+
#
|
| 23 |
+
# π LICENSE
|
| 24 |
+
# Released under the MIT License
|
| 25 |
+
# Release Date: 2021-02-06
|
| 26 |
+
# ==================================================================================================
|
| 27 |
+
|
| 28 |
+
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
| 29 |
+
from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataLoader
|
Source Code/encoder/data_objects/random_cycler.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/data_objects/random_cycler.py (Constrained Stochastic Iteration)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This utility provides a 'RandomCycler' class designed for uniform yet stochastic
|
| 7 |
+
# sampling of dataset items. It ensures that every item in a collection is
|
| 8 |
+
# seen with a guaranteed frequency, avoiding potential biases or 'starvation'
|
| 9 |
+
# during neural optimization steps while maintaining sufficient randomness.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
| 28 |
+
|
| 29 |
+
import random
|
| 30 |
+
|
| 31 |
+
class RandomCycler:
    """
    Uniform Stochastic Sampler:
    Maintains a sequence where each item is guaranteed to appear within a controlled
    interval, ensuring balanced categorical exposure during training.
    """

    def __init__(self, source):
        # Cycling over an empty collection is undefined; reject it up front.
        if len(source) == 0:
            raise Exception("Fatal: Cannot initialize RandomCycler with an empty collection.")
        self.all_items = list(source)
        self.next_items = []

    def sample(self, count: int):
        """
        Return `count` items drawn from the pool.

        Whole shuffled copies of the pool are emitted while the request still
        exceeds the pool size; the remainder is served from (and replenishes)
        the pending queue, preserving stochasticity without repetition within
        a cycle.
        """
        drawn = []
        remaining = count
        while remaining > 0:
            pool_size = len(self.all_items)

            # High-volume requests: consume one full shuffled copy of the pool.
            if remaining >= pool_size:
                drawn += random.sample(self.all_items, pool_size)
                remaining -= pool_size
                continue

            # Partial requests: serve from the pending queue, then top it up.
            take = min(remaining, len(self.next_items))
            drawn += self.next_items[:take]
            remaining -= take
            del self.next_items[:take]

            if not self.next_items:
                self.next_items = random.sample(self.all_items, pool_size)

        return drawn

    def __next__(self):
        """Standard Python iterator hook for single-sample acquisition."""
        return self.sample(1)[0]
|
| 72 |
+
|
Source Code/encoder/data_objects/speaker.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/data_objects/speaker.py (Categorical Identity Representation)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module implements the 'Speaker' abstraction, encapsulating all linguistic
|
| 7 |
+
# and acoustic metadata for a single individual. It manages the retrieval
|
| 8 |
+
# and segmented sampling of utterances, acting as a gateway to the serialized
|
| 9 |
+
# Mel-Spectrograms used in neural distillation.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
| 28 |
+
|
| 29 |
+
from encoder.data_objects.random_cycler import RandomCycler
|
| 30 |
+
from encoder.data_objects.utterance import Utterance
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
|
| 33 |
+
class Speaker:
    """
    Categorical Data Container:
    Aggregates all speech samples associated with a unique institutional speaker ID.

    Utterance metadata is loaded lazily on first use, so constructing a
    Speaker is cheap and touches no files.
    """

    def __init__(self, root: Path):
        self.root = root        # Directory holding this speaker's processed data
        self.name = root.name   # Speaker identifier, taken from the directory name
        self.utterances = None  # Populated lazily by _load_utterances()
        self.utterance_cycler = None

    def _load_utterances(self):
        """
        Lazy-loading of utterance metadata from the serialized index (_sources.txt).

        Each non-empty line maps a frames filename to its original wav path.
        Lines are stripped of surrounding whitespace (the raw file iterator would
        otherwise leave the trailing "\\n" embedded in the wav path) and split on
        the first comma only, so wav paths that themselves contain commas
        survive intact.
        """
        with self.root.joinpath("_sources.txt").open("r") as sources_file:
            sources = [line.strip().split(",", 1) for line in sources_file if line.strip()]

        # Identity Mapping: frames_fname -> original_wave_fpath
        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
        self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
        self.utterance_cycler = RandomCycler(self.utterances)

    def random_partial(self, count, n_frames):
        """
        Samples a batch of <count> unique partial utterances.
        Ensures diverse temporal coverage within the speaker's available vocal range.

        :return: list of (utterance, frames, (start, end)) tuples.
        """
        if self.utterances is None:
            self._load_utterances()

        # Stochastic selection of utterances
        utterances = self.utterance_cycler.sample(count)

        # Spatio-temporal cropping: (utterance, frames, crop_range)
        return [(u,) + u.random_partial(n_frames) for u in utterances]
|
Source Code/encoder/data_objects/speaker_batch.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/data_objects/speaker_batch.py (Neural Batch Collation)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module defines the SpeakerBatch class, which aggregates multiple speakers
|
| 7 |
+
# and their respective partial utterances into a unified tensor structure. It
|
| 8 |
+
# facilitates the high-throughput gradient descent cycles required for the
|
| 9 |
+
# GE2E loss optimization.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
from typing import List
|
| 31 |
+
from encoder.data_objects.speaker import Speaker
|
| 32 |
+
|
| 33 |
+
class SpeakerBatch:
    """
    Categorical Batch Orchestrator:
    Collates acoustic data for B speakers, each with M utterances, into a
    consistent [B*M, T, C] matrix for neural ingestion.
    """

    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
        self.speakers = speakers

        # Parallel Identity Sampling: one partial-utterance set per speaker.
        self.partials = {
            spk: spk.random_partial(utterances_per_speaker, n_frames) for spk in speakers
        }

        # Sparse-to-Dense Materialization:
        # (n_speakers * n_utterances, n_frames, mel_channels)
        frame_stack = [
            frames
            for spk in speakers
            for _, frames, _ in self.partials[spk]
        ]
        self.data = np.array(frame_stack)
|
Source Code/encoder/data_objects/speaker_verification_dataset.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/data_objects/speaker_verification_dataset.py (PyTorch Data Layer)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module implements the PyTorch Dataset and DataLoader abstractions tailored
|
| 7 |
+
# for Speaker Verification. It manages the discovery of speaker directories,
|
| 8 |
+
# categorical sampling via RandomCycler, and high-performance batch collation.
|
| 9 |
+
#
|
| 10 |
+
# π€ AUTHORS
|
| 11 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 12 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 13 |
+
#
|
| 14 |
+
# π€π» CREDITS
|
| 15 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 16 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 17 |
+
#
|
| 18 |
+
# π PROJECT LINKS
|
| 19 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 20 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 21 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 22 |
+
#
|
| 23 |
+
# π LICENSE
|
| 24 |
+
# Released under the MIT License
|
| 25 |
+
# Release Date: 2021-02-06
|
| 26 |
+
# ==================================================================================================
|
| 27 |
+
|
| 28 |
+
from encoder.data_objects.random_cycler import RandomCycler
|
| 29 |
+
from encoder.data_objects.speaker_batch import SpeakerBatch
|
| 30 |
+
from encoder.data_objects.speaker import Speaker
|
| 31 |
+
from encoder.params_data import partials_n_frames
|
| 32 |
+
from torch.utils.data import Dataset, DataLoader
|
| 33 |
+
from pathlib import Path
|
| 34 |
+
|
| 35 |
+
class SpeakerVerificationDataset(Dataset):
    """
    Neural Corpus Interface:
    Scans a root directory for processed speaker identities and provides
    an infinite stochastic stream of categorical data.
    """

    def __init__(self, datasets_root: Path):
        self.root = datasets_root

        # Identity Discovery: every sub-directory is treated as one speaker.
        speaker_dirs = [entry for entry in self.root.glob("*") if entry.is_dir()]
        if len(speaker_dirs) == 0:
            raise Exception("⚠️ Technical Alert: No speakers detected in %s." % self.root)

        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
        self.speaker_cycler = RandomCycler(self.speakers)

    def __len__(self):
        """Returns a high constant to simulate an infinite stream for the DataLoader."""
        return int(1e10)

    def __getitem__(self, index):
        """Retrieves the next stochastic categorical identity."""
        return next(self.speaker_cycler)

    def get_logs(self):
        """Aggregates all preprocessing logs into a single analytical string."""
        fragments = []
        for log_fpath in self.root.glob("*.txt"):
            with log_fpath.open("r") as log_file:
                fragments.append(log_file.read())
        return "".join(fragments)
|
| 67 |
+
|
| 68 |
+
class SpeakerVerificationDataLoader(DataLoader):
    """
    High-Throughput Orchestrator:
    Custom DataLoader designed to yield SpeakerBatch objects containing
    diverse identities and utterances.
    """

    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
                 worker_init_fn=None):
        # Remembered so collate() knows how many utterances each speaker contributes.
        self.utterances_per_speaker = utterances_per_speaker

        super().__init__(dataset=dataset,
                         batch_size=speakers_per_batch,
                         shuffle=False,
                         sampler=sampler,
                         batch_sampler=batch_sampler,
                         num_workers=num_workers,
                         collate_fn=self.collate,  # Custom collation for GE2E loss
                         pin_memory=pin_memory,
                         drop_last=False,
                         timeout=timeout,
                         worker_init_fn=worker_init_fn)

    def collate(self, speakers):
        """Constructs a SpeakerBatch from a set of sampled identities."""
        return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
|
| 96 |
+
|
Source Code/encoder/data_objects/utterance.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/data_objects/utterance.py (Vocal Unit Representation)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module defines the 'Utterance' class, representing a single spoken phrase
|
| 7 |
+
# or acoustic segment. It provides mechanisms for loading preprocessed Mel-scale
|
| 8 |
+
# filterbanks from the disk and handles stochastic temporal cropping (random
|
| 9 |
+
# partials) to increase data variety during training.
|
| 10 |
+
#
|
| 11 |
+
# π€ AUTHORS
|
| 12 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 13 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 14 |
+
#
|
| 15 |
+
# π€π» CREDITS
|
| 16 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 17 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 18 |
+
#
|
| 19 |
+
# π PROJECT LINKS
|
| 20 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 21 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 22 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 23 |
+
#
|
| 24 |
+
# π LICENSE
|
| 25 |
+
# Released under the MIT License
|
| 26 |
+
# Release Date: 2021-02-06
|
| 27 |
+
# ==================================================================================================
|
| 28 |
+
|
| 29 |
+
import numpy as np
|
| 30 |
+
|
| 31 |
+
class Utterance:
    """
    Acoustic Data Container:
    Manages the lifecycle of a single vocal sample, from disk retrieval to
    stochastic temporal segmentation.
    """

    def __init__(self, frames_fpath, wave_fpath):
        self.frames_fpath = frames_fpath  # Path to the serialized mel-spectrogram (.npy)
        self.wave_fpath = wave_fpath      # Path to the original source waveform

    def get_frames(self):
        """Deserializes the Mel-Spectrogram matrix from the filesystem."""
        return np.load(self.frames_fpath)

    def random_partial(self, n_frames):
        """
        Spatio-Temporal Cropping:
        Cuts a random segment of 'n_frames' from the full utterance.
        This technique acts as a form of temporal data augmentation.

        :param n_frames: length of the crop, in spectrogram frames.
        :return: (frames[start:end], (start, end))
        :raises ValueError: if the utterance holds fewer than n_frames frames
            (previously this surfaced as an opaque np.random.randint error).
        """
        frames = self.get_frames()
        margin = frames.shape[0] - n_frames
        if margin < 0:
            raise ValueError(
                "Utterance %s has only %d frames; %d requested."
                % (self.frames_fpath, frames.shape[0], n_frames))

        # Stochastic offset selection. Using margin + 1 as the exclusive upper
        # bound makes the exact-length case (margin == 0) fall out naturally
        # with start = 0, and fixes the original off-by-one where the crop
        # could never end flush with the utterance.
        start = np.random.randint(0, margin + 1)
        end = start + n_frames
        return frames[start:end], (start, end)
|
Source Code/encoder/inference.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/inference.py (Neural Identity Distillation Interface)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module provides the high-level API for using the Speaker Encoder in a
|
| 7 |
+
# production environment. It encapsulates the complexities of model loading,
|
| 8 |
+
# tensor orchestration, and d-vector derivation. It is the primary bridge
|
| 9 |
+
# used by the web interface (app.py) to extract speaker identities from
|
| 10 |
+
# uploaded reference audio samples.
|
| 11 |
+
#
|
| 12 |
+
# π€ AUTHORS
|
| 13 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 14 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 15 |
+
#
|
| 16 |
+
# π€π» CREDITS
|
| 17 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 18 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 19 |
+
#
|
| 20 |
+
# π PROJECT LINKS
|
| 21 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 22 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 23 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 24 |
+
#
|
| 25 |
+
# π LICENSE
|
| 26 |
+
# Released under the MIT License
|
| 27 |
+
# Release Date: 2021-02-06
|
| 28 |
+
# ==================================================================================================
|
| 29 |
+
|
| 30 |
+
from encoder.params_data import *
|
| 31 |
+
from encoder.model import SpeakerEncoder
|
| 32 |
+
from encoder.audio import preprocess_wav
|
| 33 |
+
from matplotlib import cm
|
| 34 |
+
from encoder import audio
|
| 35 |
+
from pathlib import Path
|
| 36 |
+
import numpy as np
|
| 37 |
+
import torch
|
| 38 |
+
|
| 39 |
+
# --- INTERNAL STATE (SINGLETON PATTERN) ---
|
| 40 |
+
_model = None # type: SpeakerEncoder
|
| 41 |
+
_device = None # type: torch.device
|
| 42 |
+
|
| 43 |
+
def load_model(weights_fpath: Path, device=None):
    """
    Initializes the Speaker Encoder neural network.
    Deserializes the PyTorch state dictionary and prepares the model for eval mode.

    :param weights_fpath: path to the serialized checkpoint; must contain
        "model_state" and "step" entries.
    :param device: None (auto-select CUDA when available), a device-name string,
        or a torch.device instance. The previous implementation silently dropped
        torch.device instances (no else branch), leaving _device unset; they are
        now honored.
    """
    global _model, _device

    # Precise hardware targeting
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    else:
        # Bug fix: a torch.device passed directly used to be ignored entirely.
        _device = device

    # Constructing the architecture (loss device is irrelevant for inference)
    _model = SpeakerEncoder(_device, torch.device("cpu"))

    # Loading serialized weights
    checkpoint = torch.load(weights_fpath, map_location=_device, weights_only=False)
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()

    print("🤝🏻 Encoder Active: Loaded \"%s\" (Step %d)" % (weights_fpath.name, checkpoint["step"]))
|
| 65 |
+
|
| 66 |
+
def is_loaded():
    """Reports whether load_model() has already initialized the neural engine."""
    loaded = _model is not None
    return loaded
|
| 69 |
+
|
| 70 |
+
def embed_frames_batch(frames_batch):
    """
    Neural Forward Pass: Computes speaker embeddings for a batch of spectrograms.

    :param frames_batch: numpy array of mel frames — presumably shaped
        (batch, n_frames, n_channels), as produced by embed_utterance; verify
        against callers.
    :return: l2-normalized d-vectors as a numpy array.
    :raises Exception: if load_model() has not been called yet.
    """
    if _model is None:
        raise Exception("Fatal: Neural Encoder is not initialized. Invoke load_model().")

    # Move the batch onto the model's device, run the forward pass, and bring
    # the detached result back to host memory as numpy.
    batch_tensor = torch.from_numpy(frames_batch).to(_device)
    embeddings = _model.forward(batch_tensor)
    return embeddings.detach().cpu().numpy()
|
| 81 |
+
|
| 82 |
+
def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
                           min_pad_coverage=0.75, overlap=0.5):
    """
    Spatio-Temporal Segmentation: Defines how a long utterance is sliced into
    overlapping windows for stable embedding derivation.

    :param n_samples: length of the waveform, in audio samples.
    :param partial_utterance_n_frames: window length, in mel frames.
    :param min_pad_coverage: minimum fraction of the last window that must be
        covered by real audio for that window to be kept (when more than one
        window exists).
    :param overlap: fraction of overlap between consecutive windows, in [0, 1).
    :return: (wav_slices, mel_slices) — parallel lists of slice objects over
        the waveform (sample indices) and the spectrogram (frame indices).
        The last wav slice may extend past n_samples; callers are expected to
        zero-pad the waveform accordingly.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

    # Samples covered by one spectrogram frame hop (mel_window_step is in ms).
    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
    # +1 guarantees at least one frame even for a zero-length signal.
    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
    # Stride between window starts; clamped to >= 1 so the loop always advances.
    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)

    # Window Orchestration: each mel range is mirrored into the wav domain by
    # scaling frame indices with samples_per_frame.
    wav_slices, mel_slices = [], []
    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
    for i in range(0, steps, frame_step):
        mel_range = np.array([i, i + partial_utterance_n_frames])
        wav_range = mel_range * samples_per_frame
        mel_slices.append(slice(*mel_range))
        wav_slices.append(slice(*wav_range))

    # Defensive Padding Evaluation: drop the final window if too little of it
    # is backed by real audio — unless it is the only window.
    last_wav_range = wav_slices[-1]
    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
    if coverage < min_pad_coverage and len(mel_slices) > 1:
        mel_slices = mel_slices[:-1]
        wav_slices = wav_slices[:-1]

    return wav_slices, mel_slices
|
| 112 |
+
|
| 113 |
+
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Core Identity Extraction: Distills a processed waveform into a single
    256-dimensional identity vector (d-vector).

    :param wav: preprocessed waveform as a float numpy array.
    :param using_partials: when False, embed the entire utterance in one pass.
    :param return_partials: also return the per-window embeddings and the
        matching wav slices.
    :param kwargs: forwarded to compute_partial_slices().
    """
    # Single-shot path (fallback for short utterances): embed the whole
    # spectrogram at once.
    if not using_partials:
        spectrogram = audio.wav_to_mel_spectrogram(wav)
        full_embed = embed_frames_batch(spectrogram[None, ...])[0]
        return (full_embed, None, None) if return_partials else full_embed

    # Windowed path: slice the signal into overlapping partials, padding the
    # tail with zeros so the last window is fully defined.
    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
    required_length = wave_slices[-1].stop
    if required_length >= len(wav):
        wav = np.pad(wav, (0, required_length - len(wav)), "constant")

    # Batched inference over every window.
    spectrogram = audio.wav_to_mel_spectrogram(wav)
    window_batch = np.array([spectrogram[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(window_batch)

    # Average the window embeddings, then re-project onto the unit hypersphere.
    mean_embed = np.mean(partial_embeds, axis=0)
    unit_embed = mean_embed / np.linalg.norm(mean_embed, 2)

    return (unit_embed, partial_embeds, wave_slices) if return_partials else unit_embed
|
| 144 |
+
|
| 145 |
+
def embed_speaker(wavs, **kwargs):
    """Aggregate identity extraction for multiple utterances from the same speaker.

    Not yet implemented; kept as a placeholder so the public API stays stable.
    """
    raise NotImplementedError("Collaborative development in progress for multi-wav aggregation.")
|
| 148 |
+
|
| 149 |
+
def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    """Visualizes the high-dimensional latent vector as a spatial intensity map.

    :param embed: flat d-vector to display.
    :param ax: matplotlib axes to draw on; defaults to the current axes.
    :param title: axes title.
    :param shape: 2-D layout for the vector; defaults to a roughly square grid.
    :param color_range: (vmin, vmax) applied to the scalar mappable's clim.
    """
    import matplotlib.pyplot as plt
    if ax is None:
        ax = plt.gca()

    # Fold the flat vector into a roughly square matrix unless a shape is given.
    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    grid = embed.reshape(shape)

    cmap = plt.get_cmap()
    mappable = ax.imshow(grid, cmap=cmap)
    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
    sm = cm.ScalarMappable(cmap=cmap)
    sm.set_clim(*color_range)

    ax.set_xticks([]), ax.set_yticks([])
    ax.set_title(title)
|
Source Code/encoder/model.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
|
| 2 |
+
# DEEPFAKE AUDIO - encoder/model.py (Neural Architecture Definition)
|
| 3 |
+
# ==================================================================================================
|
| 4 |
+
#
|
| 5 |
+
# π DESCRIPTION
|
| 6 |
+
# This module defines the SpeakerEncoder class, a three-layer LSTM-based neural
|
| 7 |
+
# network inspired by the 'Generalized End-to-End Loss for Speaker Verification'
|
| 8 |
+
# research. It maps variable-length speech features into fixed-dimensional
|
| 9 |
+
# embeddings (d-vectors) that represent the unique vocal characteristics of the
|
| 10 |
+
# speaker, enabling zero-shot voice cloning.
|
| 11 |
+
#
|
| 12 |
+
# π€ AUTHORS
|
| 13 |
+
# - Amey Thakur (https://github.com/Amey-Thakur)
|
| 14 |
+
# - Mega Satish (https://github.com/msatmod)
|
| 15 |
+
#
|
| 16 |
+
# π€π» CREDITS
|
| 17 |
+
# Original Real-Time Voice Cloning methodology by CorentinJ
|
| 18 |
+
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
|
| 19 |
+
#
|
| 20 |
+
# π PROJECT LINKS
|
| 21 |
+
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
|
| 22 |
+
# Video Demo: https://youtu.be/i3wnBcbHDbs
|
| 23 |
+
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
|
| 24 |
+
#
|
| 25 |
+
# π LICENSE
|
| 26 |
+
# Released under the MIT License
|
| 27 |
+
# Release Date: 2021-02-06
|
| 28 |
+
# ==================================================================================================
|
| 29 |
+
|
| 30 |
+
from encoder.params_model import *
|
| 31 |
+
from encoder.params_data import *
|
| 32 |
+
from scipy.interpolate import interp1d
|
| 33 |
+
from sklearn.metrics import roc_curve
|
| 34 |
+
from torch.nn.utils import clip_grad_norm_
|
| 35 |
+
from scipy.optimize import brentq
|
| 36 |
+
from torch import nn
|
| 37 |
+
import numpy as np
|
| 38 |
+
import torch
|
| 39 |
+
|
| 40 |
+
class SpeakerEncoder(nn.Module):
    """
    Spatio-Temporal Identity Extractor:
    An LSTM architecture designed to condense acoustic feature sequences into
    latent speaker representations (d-vectors), trained with the GE2E loss.
    """
    def __init__(self, device, loss_device):
        """
        :param device: device hosting the LSTM/linear forward pass.
        :param loss_device: device on which the GE2E similarity/loss terms live.
        """
        super().__init__()
        self.loss_device = loss_device

        # --- RECURRENT BACKBONE ---
        # Multi-layer LSTM to capture temporal acoustic dependencies.
        self.lstm = nn.LSTM(input_size=mel_n_channels,
                            hidden_size=model_hidden_size,
                            num_layers=model_num_layers,
                            batch_first=True).to(device)

        # --- PROJECTION LAYER ---
        # Maps the final hidden state to the d-vector space.
        self.linear = nn.Linear(in_features=model_hidden_size,
                                out_features=model_embedding_size).to(device)
        self.relu = torch.nn.ReLU().to(device)

        # --- COSINE SIMILARITY SCALE & BIAS ---
        # Learnable parameters to transform cosine similarities into optimized logit ranges.
        # NOTE(review): `.to()` on a freshly-created nn.Parameter returns a Tensor,
        # not the Parameter — if loss_device differs from the creation device this
        # would detach these from the module's parameter list. Pattern inherited
        # from upstream; confirm intended before changing.
        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)

        # Optimization Criterion
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        """Manages gradient scaling and norm clipping for stable training dynamics.

        Must be called after backward() and before the optimizer step; assumes
        .grad is populated on the similarity parameters.
        """
        # Sensitivity reduction for similarity parameters (in-place grad scaling)
        self.similarity_weight.grad *= 0.01
        self.similarity_bias.grad *= 0.01

        # Global Gradient Constraint: clip the total L2 norm of all gradients to 3.
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Neural Distillation:
        Processes a batch of mel-spectrograms [B, T, C] and returns d-vectors [B, E].

        :param utterances: batch-first input sequences for the LSTM.
        :param hidden_init: optional initial (h, c) state for the LSTM.
        """
        # Sequential temporal extraction
        out, (hidden, cell) = self.lstm(utterances, hidden_init)

        # State aggregation: Extract identity from the final LSTM layer's last state
        embeds_raw = self.relu(self.linear(hidden[-1]))

        # L2-Normalization: Project onto the identity hypersphere (unit length);
        # the 1e-5 epsilon guards against division by zero.
        embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)

        return embeds

    def similarity_matrix(self, embeds):
        """
        Geometric Contrast: Computes the GE2E similarity matrix.
        Quantifies the proximity of d-vectors to speaker centroids.

        :param embeds: tensor shaped (speakers_per_batch, utterances_per_speaker, embed_dim).
        :return: tensor shaped (speakers_per_batch, utterances_per_speaker, speakers_per_batch).
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Inclusive centroids: Mean identity representation per speaker
        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
        centroids_incl = centroids_incl.clone() / (torch.norm(centroids_incl, dim=2, keepdim=True) + 1e-5)

        # Exclusive centroids: LOO (Leave-One-Out) means to avoid biased similarity scoring
        centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
        centroids_excl /= (utterances_per_speaker - 1)
        centroids_excl = centroids_excl.clone() / (torch.norm(centroids_excl, dim=2, keepdim=True) + 1e-5)

        # Similarity calculation via Dot Product (efficient Cosine Similarity
        # equivalent, since all vectors above are unit-normalized)
        sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
                                 speakers_per_batch).to(self.loss_device)
        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)
        for j in range(speakers_per_batch):
            mask = np.where(mask_matrix[j])[0]
            # Off-diagonal: compare other speakers' embeds to speaker j's inclusive centroid
            sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
            # Diagonal: compare speaker j's embeds to their own exclusive (LOO) centroid
            sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)

        # Scaling towards cross-entropy optimization
        sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
        return sim_matrix

    def loss(self, embeds):
        """
        Discriminant Optimization:
        Computes GE2E Softmax Loss and monitors Equal Error Rate (EER).

        :param embeds: tensor shaped (speakers_per_batch, utterances_per_speaker, embed_dim).
        :return: (loss tensor, eer float)
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Global Similarity Awareness: flatten to (B*M, B) logits per utterance
        sim_matrix = self.similarity_matrix(embeds)
        sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
                                         speakers_per_batch))

        # Target Generation (Diagonal Mapping): each utterance's true class is
        # its own speaker index.
        ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
        target = torch.from_numpy(ground_truth).long().to(self.loss_device)
        loss = self.loss_fn(sim_matrix, target)

        # Equal Error Rate (Diagnostic Telemetry; excluded from autograd)
        with torch.no_grad():
            # One-hot row for speaker index i
            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
            labels = np.array([inv_argmax(i) for i in ground_truth])
            preds = sim_matrix.detach().cpu().numpy()

            # Statistical Error Estimation: EER is where FPR equals 1 - TPR on the ROC
            fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
            eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)

        return loss, eer
|
Source Code/encoder/params_data.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
# DEEPFAKE AUDIO - encoder/params_data.py (Acoustic Feature Hyperparameters)
# ==================================================================================================
#
# DESCRIPTION
#   This configuration module defines the signal-processing constants for the
#   speaker encoder. It standardizes window lengths, sampling rates, and VAD
#   sensitivities, ensuring consistency between training data preparation and
#   real-time inference.
#
# AUTHORS
#   - Amey Thakur (https://github.com/Amey-Thakur)
#   - Mega Satish (https://github.com/msatmod)
#
# CREDITS
#   Original Real-Time Voice Cloning methodology by CorentinJ
#   Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
#   Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
#   Video Demo: https://youtu.be/i3wnBcbHDbs
#   Research:   https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
#   Released under the MIT License
#   Release Date: 2021-02-06
# ==================================================================================================

# --- MEL-FILTERBANK CONFIGURATION ---
mel_window_length = 25        # Spectral analysis window (ms)
mel_window_step = 10          # Temporal stride between successive windows (ms)
mel_n_channels = 40           # Number of mel-scale frequency bins

# --- AUDIO TEMPORAL RESOLUTION ---
sampling_rate = 16000         # Global acoustic sampling frequency (Hz)
partials_n_frames = 160       # Frames per training utterance (160 x 10 ms = 1.6 s)
inference_n_frames = 80       # Minimal frames for identity derivation at inference (0.8 s)

# --- VOICE ACTIVITY DETECTION (VAD) ---
# Sensitivity parameters for distinguishing speech from silence.
vad_window_length = 30        # Temporal resolution of VAD decisions (ms)
vad_moving_average_width = 8  # Smoothing factor applied to binary speech decisions
vad_max_silence_length = 6    # Maximum allowed internal silence gap before segmentation

# --- AMPLITUDE NORMALIZATION ---
audio_norm_target_dBFS = -30  # Target waveform level in dBFS prior to feature extraction
|
| 47 |
+
|
Source Code/encoder/params_model.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ==================================================================================================
# DEEPFAKE AUDIO - encoder/params_model.py (Neural Hyperparameters)
# ==================================================================================================
#
# DESCRIPTION
#   This module defines the architectural and optimization hyperparameters for
#   the Speaker Encoder. These values determine the depth of the LSTM backbone,
#   the dimensionality of the identity manifold (d-vector space), and the data
#   orchestration (batch size) required for training stability.
#
# AUTHORS
#   - Amey Thakur (https://github.com/Amey-Thakur)
#   - Mega Satish (https://github.com/msatmod)
#
# CREDITS
#   Original Real-Time Voice Cloning methodology by CorentinJ
#   Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# PROJECT LINKS
#   Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
#   Video Demo: https://youtu.be/i3wnBcbHDbs
#   Research:   https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# LICENSE
#   Released under the MIT License
#   Release Date: 2021-02-06
# ==================================================================================================

# --- ARCHITECTURAL DIMENSIONS ---
model_hidden_size = 256       # LSTM hidden-state capacity per layer
model_embedding_size = 256    # Final identity vector (d-vector) dimensionality
model_num_layers = 3          # Depth of the recurrent (LSTM) stack

# --- OPTIMIZATION ORCHESTRATION ---
learning_rate_init = 1e-4     # Initial optimizer learning rate
speakers_per_batch = 64       # Distinct speakers sampled per optimization step
utterances_per_speaker = 10   # Utterances drawn per speaker in each batch
|