ahmedumeraziz committed on
Commit
708f5fe
·
verified ·
1 Parent(s): 5061ba6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +188 -0
app.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import librosa
4
+ from pydub import AudioSegment
5
+ import soundfile as sf
6
+ import gdown
7
+ from TTS.api import TTS
8
+ from langdetect import detect
9
+ from scipy.spatial.distance import cosine
10
+ import torch
11
+ import matplotlib.pyplot as plt
12
+ import pandas as pd
13
+ import streamlit as st
14
+ from io import BytesIO
15
+
16
+ # === Utility Functions ===
17
def convert_mp3_to_wav(mp3_file, wav_file):
    """Decode an MP3 file and write it back out as a WAV file.

    Args:
        mp3_file: Path (or file-like object) of the MP3 input, handed to
            ``AudioSegment.from_file`` with ``format="mp3"``.
        wav_file: Destination path for the exported WAV data.
    """
    audio = AudioSegment.from_file(mp3_file, format="mp3")
    # ``export`` hands back the output file object still open; close it so
    # the handle is released before other code reads the WAV from disk.
    exported = audio.export(wav_file, format="wav")
    if exported is not None and hasattr(exported, "close"):
        exported.close()
20
+
21
def extract_mfcc(wav_file):
    """Return a 13-element MFCC summary vector for an audio file.

    The file is loaded at its native sampling rate (``sr=None``), 13 MFCCs
    are computed, and the coefficient matrix is averaged along axis 1 so a
    single value per coefficient remains.

    Args:
        wav_file: Path of the audio file to analyse.

    Returns:
        The per-coefficient mean of the MFCC matrix.
    """
    samples, native_rate = librosa.load(wav_file, sr=None)
    coefficients = librosa.feature.mfcc(y=samples, sr=native_rate, n_mfcc=13)
    return np.mean(coefficients, axis=1)
25
+
26
def clone_and_compare(tts, ref_wav, text, language, output_wav="cloned.wav"):
    """Synthesize ``text`` in the reference voice and score the result.

    Args:
        tts: Object exposing ``tts_to_file(text=, speaker_wav=, language=,
            file_path=)``.
        ref_wav: Path of the reference recording used as the speaker sample.
        text: Text to synthesize.
        language: Language identifier forwarded to ``tts_to_file``.
        output_wav: Path the synthesized audio is written to.

    Returns:
        A ``(similarity, output_wav)`` tuple, where ``similarity`` is one
        minus the cosine distance between the mean-MFCC vectors of the
        reference and the synthesized audio.
    """
    tts.tts_to_file(
        text=text,
        speaker_wav=ref_wav,
        language=language,
        file_path=output_wav,
    )
    # Reference first, clone second -- same evaluation order as before.
    reference_features = extract_mfcc(ref_wav)
    cloned_features = extract_mfcc(output_wav)
    distance = cosine(reference_features, cloned_features)
    return 1 - distance, output_wav
32
+
33
def standardize_audio_format(input_file, output_file, sample_rate=22050):
    """Re-save an audio file at a fixed sampling rate.

    The input is loaded through ``librosa.load`` with ``sr=sample_rate``
    and the resulting samples are written with ``soundfile.write`` at that
    same rate.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path the re-saved audio is written to.
        sample_rate: Target sampling rate in Hz (default 22050).
    """
    # The rate returned by librosa is not needed; only the samples are kept.
    samples, _ = librosa.load(input_file, sr=sample_rate)
    sf.write(output_file, samples, sample_rate)
36
+
37
+ # === Streamlit App ===
38
def _wav_path_for(mp3_path):
    """Return a ``.wav`` output path next to ``mp3_path`` that differs from it."""
    root, ext = os.path.splitext(mp3_path)
    if ext.lower() == ".wav":
        # Never hand back the input path itself: exporting would overwrite it.
        return f"{root}_converted.wav"
    return f"{root}.wav"


def _gdrive_file_id(url):
    """Extract the file id from a ``/d/<id>`` or ``?id=<id>`` Google Drive link."""
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url.strip())
    segments = [segment for segment in parsed.path.split("/") if segment]
    if "d" in segments:
        position = segments.index("d")
        if position + 1 < len(segments):
            return segments[position + 1]
    ids = parse_qs(parsed.query).get("id")
    if ids and ids[0]:
        return ids[0]
    raise ValueError("could not find a file id in the Google Drive link")


def _resolve_language(detected, supported):
    """Map a detected language code onto one of the model's language ids.

    Returns ``detected`` unchanged when it is already supported or when no
    supported id shares its primary subtag (e.g. ``fr`` -> ``fr-fr``).
    """
    if not supported or detected in supported:
        return detected
    primary = detected.split("-")[0].lower()
    for candidate in supported:
        if str(candidate).split("-")[0].lower() == primary:
            return candidate
    return detected


def main():
    """Run the Streamlit voice-cloning UI.

    Collects a reference recording and the text to speak through one of four
    input methods, runs five cloning attempts, keeps the attempt with the
    highest MFCC similarity, and offers the similarity table and the final
    audio for download. Uploading an existing CSV skips cloning entirely.
    """
    attempts = 5

    st.title("🎙️ Voice Cloning App")
    st.write("Clone voices and compare similarity with the original")

    # Initialize TTS model once per session; Streamlit re-runs this function
    # on every interaction, so the model is cached in session state.
    if 'tts' not in st.session_state:
        with st.spinner("Loading TTS model..."):
            st.session_state.tts = TTS(
                model_name="tts_models/multilingual/multi-dataset/your_tts",
                progress_bar=False,
                gpu=torch.cuda.is_available()
            )

    # Input method selection
    input_method = st.radio(
        "How do you want to provide the voice/text data?",
        options=[
            "Upload audio and text manually",
            "Enter local paths",
            "Use Google Drive link",
            "Upload existing CSV file"
        ]
    )

    wav_file = None
    input_text = None
    csv_data = None

    if input_method == "Upload audio and text manually":
        audio_file = st.file_uploader("Upload your audio (MP3) file", type=["mp3"])
        text_file = st.file_uploader("Upload your text file", type=["txt"])

        if audio_file and text_file:
            wav_file = "input.wav"
            with open("temp.mp3", "wb") as f:
                f.write(audio_file.getbuffer())
            convert_mp3_to_wav("temp.mp3", wav_file)

            # getvalue() returns the whole upload regardless of the current
            # stream position, so repeated script runs see the same text.
            input_text = text_file.getvalue().decode("utf-8")

    elif input_method == "Enter local paths":
        mp3_path = st.text_input("Enter path to your MP3 file")
        text_path = st.text_input("Enter path to your text file")

        if mp3_path and text_path:
            try:
                wav_file = _wav_path_for(mp3_path)
                convert_mp3_to_wav(mp3_path, wav_file)

                with open(text_path, 'r', encoding="utf-8") as file:
                    input_text = file.read()
            except Exception as e:
                # Do not continue to cloning with half-prepared inputs.
                wav_file = None
                input_text = None
                st.error(f"Error reading local files: {e}")

    elif input_method == "Use Google Drive link":
        gdrive_url = st.text_input("Enter the Google Drive MP3 link")
        input_text = st.text_area("Enter the text to be spoken using cloned voice")

        if gdrive_url and input_text:
            mp3_file = "input.mp3"
            wav_file = "input.wav"
            try:
                file_id = _gdrive_file_id(gdrive_url)
                download_url = f"https://drive.google.com/uc?id={file_id}"
                downloaded = gdown.download(download_url, mp3_file, quiet=False)
                if downloaded is None:
                    raise RuntimeError("download did not produce a file")
                convert_mp3_to_wav(mp3_file, wav_file)
            except Exception as e:
                # Clear the path so a stale or missing input.wav is never cloned.
                wav_file = None
                st.error(f"Error downloading from Google Drive: {e}")

    elif input_method == "Upload existing CSV file":
        csv_file = st.file_uploader("Upload your voice_dataset.csv", type=["csv"])
        if csv_file:
            csv_data = pd.read_csv(csv_file)
            st.write("Uploaded CSV data:")
            st.dataframe(csv_data)

    # Process cloning if we have the required inputs
    if csv_data is not None:
        st.success("✅ You uploaded an existing CSV, skipping voice cloning.")
    elif wav_file and input_text:
        try:
            language = detect(input_text)
            st.write(f"Detected language: {language}")

            # NOTE(review): assumes the TTS object exposes its language ids
            # via ``.languages``; when it does not, the detected code is
            # passed through unchanged.
            supported = getattr(st.session_state.tts, "languages", None) or []
            model_language = _resolve_language(language, list(supported))
            if model_language != language:
                st.write(f"Using model language: {model_language}")

            if st.button("Start Voice Cloning"):
                best_similarity = None
                best_output = None
                results = []

                st.write(f"🔁 Running {attempts} cloning attempts for best match...")
                progress_bar = st.progress(0)

                for i in range(attempts):
                    with st.spinner(f"Running attempt {i+1}/{attempts}..."):
                        sim, out_file = clone_and_compare(
                            st.session_state.tts,
                            wav_file,
                            input_text,
                            model_language,
                            f"clone_try_{i}.wav"
                        )
                    results.append({"Attempt": i + 1, "Similarity": sim})
                    progress_bar.progress((i + 1) / attempts)
                    st.write(f"Attempt {i+1}: Similarity = {sim*100:.2f}%")

                    # Always keep the first attempt, and replace a NaN score
                    # with anything later, so a best file exists even when
                    # every similarity is <= 0 or undefined.
                    if (
                        best_output is None
                        or np.isnan(best_similarity)
                        or sim > best_similarity
                    ):
                        best_similarity = sim
                        best_output = out_file

                # Standardize & Save Final Audio
                standardize_audio_format(best_output, "final_cloned_voice.wav")
                st.success(f"✅ Best voice with similarity {best_similarity*100:.2f}%")

                # Save CSV
                df = pd.DataFrame(results)
                df.to_csv("voice_dataset.csv", index=False)

                # Plot
                fig, ax = plt.subplots()
                ax.plot(df['Attempt'], df['Similarity'] * 100, marker='o')
                ax.set_title("Voice Similarity Over Attempts")
                ax.set_xlabel("Attempt")
                ax.set_ylabel("Similarity (%)")
                ax.set_ylim(0, 100)
                ax.grid(True)
                st.pyplot(fig)
                # Release the figure; pyplot otherwise keeps it alive.
                plt.close(fig)

                # Download options
                st.subheader("📥 Download Results")

                col1, col2 = st.columns(2)

                with col1:
                    with open("voice_dataset.csv", "rb") as f:
                        st.download_button(
                            "Download CSV",
                            f,
                            file_name="voice_dataset.csv",
                            mime="text/csv"
                        )

                with col2:
                    with open("final_cloned_voice.wav", "rb") as f:
                        st.download_button(
                            "Download Audio",
                            f,
                            file_name="final_cloned_voice.wav",
                            mime="audio/wav"
                        )
        except Exception as e:
            # UI boundary: surface any failure to the user instead of a traceback.
            st.error(f"An error occurred: {str(e)}")
186
+
187
# Script entry point: start the app only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()