Spaces:
No application file
No application file
Update src/app.py
Browse files- src/app.py +135 -2
src/app.py
CHANGED
|
@@ -1,4 +1,137 @@
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
st.title("Voice Clone XTTS")
|
| 4 |
-
st.write("App is running")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
import librosa
|
| 3 |
+
import tempfile
|
| 4 |
+
from transformers import pipeline
|
| 5 |
+
from TTS.api import TTS
|
| 6 |
+
|
| 7 |
+
from ui import render_header, render_sidebar ,render_status
|
| 8 |
+
|
| 9 |
+
# Page chrome. st.set_page_config is issued before any other Streamlit
# command in this script; the render_* helpers come from the local ``ui``
# module (their output is not visible from this file).
st.set_page_config(page_title="Voice Clone Translator", layout="wide")
render_header()
render_sidebar()
render_status()
# Title text restored from mis-decoded UTF-8 (microphone emoji and arrow).
st.title("🎙️ Voice Cloning Translator (English → Hindi / French / Japanese)")
|
| 14 |
+
|
| 15 |
+
# -------- Load models --------
|
| 16 |
+
@st.cache_resource
def load_asr():
    """Create the speech-to-text pipeline used to transcribe uploaded audio.

    Wrapped in ``st.cache_resource`` so the pipeline object is built once and
    reused instead of being re-created on every script rerun.

    Returns:
        A ``transformers`` automatic-speech-recognition pipeline backed by
        the ``openai/whisper-small`` checkpoint.
    """
    task = "automatic-speech-recognition"
    checkpoint = "openai/whisper-small"
    # device=-1 is the transformers convention for running on CPU.
    return pipeline(task, model=checkpoint, device=-1)
|
| 23 |
+
|
| 24 |
+
@st.cache_resource
def load_translator(model_name, target_lang):
    """Create a translation pipeline for the given model.

    Wrapped in ``st.cache_resource``, so one pipeline is kept per distinct
    ``(model_name, target_lang)`` argument pair.

    Args:
        model_name: Hugging Face model id to load.
        target_lang: Target language code. Only forwarded to the pipeline
            for ``facebook/m2m100*`` models; other models receive no
            language arguments.

    Returns:
        A ``transformers`` translation pipeline (device=-1).
    """
    options = {"model": model_name, "device": -1}
    if model_name.startswith("facebook/m2m100"):
        # M2M100 checkpoints are given explicit source/target languages;
        # the source side is always English in this app.
        options.update(src_lang="en", tgt_lang=target_lang)
    return pipeline("translation", **options)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
@st.cache_resource
def load_xtts():
    """Create the XTTS v2 text-to-speech model used for voice cloning.

    Wrapped in ``st.cache_resource`` so the model object is built once and
    reused across script reruns.

    Returns:
        A ``TTS`` instance for the multilingual ``xtts_v2`` model, created
        with ``gpu=False``.
    """
    model_id = "tts_models/multilingual/multi-dataset/xtts_v2"
    return TTS(model_id, gpu=False)
|
| 48 |
+
|
| 49 |
+
# Instantiate the speech-recognition and voice-cloning models up front;
# both loaders are cached, so reruns reuse the same objects.
asr, xtts = load_asr(), load_xtts()
|
| 51 |
+
|
| 52 |
+
# -------- Language config --------
|
| 53 |
+
# One row per supported target language:
# (display name, translation model id, language code, download file name).
_LANG_ROWS = (
    ("Hindi", "Helsinki-NLP/opus-mt-en-hi", "hi", "hindi_my_voice.wav"),
    ("French", "Helsinki-NLP/opus-mt-en-fr", "fr", "french_my_voice.wav"),
    ("Japanese", "facebook/m2m100_418M", "ja", "japanese_my_voice.wav"),
)

# Display name -> settings. Insertion order is the order shown in the
# language selector.
LANGS = {
    label: {"translator": model_id, "code": lang_code, "file": download_name}
    for label, model_id, lang_code, download_name in _LANG_ROWS
}
|
| 70 |
+
|
| 71 |
+
# -------- UI --------
|
| 72 |
+
# Input widgets, created in display order: language picker, audio upload,
# free-text box, and the button that triggers a conversion.
language_names = list(LANGS)
target_lang = st.selectbox("Select Target Language", language_names)
uploaded = st.file_uploader("Upload English voice (WAV)", type=["wav"])
text_input = st.text_area("Or type English text")
convert = st.button("Convert to Voice")

# Result tabs: transcription, translation, synthesized audio.
tab_labels = ["π Text", "π Translation", "π Voice"]
tab1, tab2, tab3 = st.tabs(tab_labels)
|
| 77 |
+
# -------- Processing --------
|
| 78 |
+
if convert:
    # Runs only on the rerun triggered by the "Convert to Voice" button.
    # Pipeline: uploaded audio -> English transcript -> translated text ->
    # speech synthesized with the uploaded recording as speaker reference.
    if not uploaded and not text_input.strip():
        st.warning("Upload audio or type text.")
    elif not uploaded:
        # Typed text alone cannot be converted: the uploaded recording is
        # also the speaker reference handed to XTTS further down.
        st.warning("⚠️ Upload a voice sample to clone your voice.")
        st.stop()
    else:
        lang_cfg = LANGS[target_lang]

        with st.spinner("Processing (CPU – slow but working)..."):
            # Scratch audio lives in a per-run temporary directory, so no
            # files are left behind and concurrent sessions never write to
            # the same output path.
            with tempfile.TemporaryDirectory() as workdir:

                # -------- Get English text --------
                with tempfile.NamedTemporaryFile(
                    dir=workdir, suffix=".wav", delete=False
                ) as tmp:
                    # getvalue() returns the whole upload regardless of the
                    # buffer's current read position.
                    tmp.write(uploaded.getvalue())
                    speaker_path = tmp.name

                # Load at 16 kHz; the raw array is passed to the ASR
                # pipeline as-is (assumes 16 kHz is what the model expects).
                audio, _ = librosa.load(speaker_path, sr=16000)
                english_text = asr(audio)["text"]

                if not english_text.strip():
                    # Nothing to translate or speak; stop before the
                    # translation and TTS models are invoked.
                    st.warning("No speech was recognized in the uploaded audio.")
                    st.stop()

                with tab1:
                    st.subheader("Recognized English")
                    st.write(english_text)

                # -------- Translation --------
                translator = load_translator(
                    lang_cfg["translator"],
                    lang_cfg["code"],
                )
                translated_text = translator(english_text)[0]["translation_text"]

                with tab2:
                    st.subheader(f"{target_lang} Text")
                    st.write(translated_text)

                # -------- XTTS (Real Voice Cloning) --------
                # Reserve a unique output path inside the scratch directory;
                # the handle is closed before XTTS writes to it.
                with tempfile.NamedTemporaryFile(
                    dir=workdir, suffix=".wav", delete=False
                ) as out_tmp:
                    out_path = out_tmp.name

                xtts.tts_to_file(
                    text=translated_text,
                    speaker_wav=speaker_path,
                    language=lang_cfg["code"],
                    file_path=out_path,
                    split_sentences=False,
                )

                # Read the result into memory so the widgets below do not
                # depend on a file that is deleted with the directory.
                with open(out_path, "rb") as audio_file:
                    audio_bytes = audio_file.read()

            with tab3:
                st.subheader(f"{target_lang} Voice (Your Voice)")
                st.audio(audio_bytes, format="audio/wav")
                st.download_button(
                    "⬇ Download Audio",
                    audio_bytes,
                    file_name=lang_cfg["file"],
                    mime="audio/wav",
                )
|
| 137 |
|
|
|
|
|
|