Update app.py
Browse files
app.py
CHANGED
|
@@ -8,25 +8,23 @@ import gradio as gr
|
|
| 8 |
import soundfile as sf
|
| 9 |
from transformers import AutoModel
|
| 10 |
from typing import Tuple
|
| 11 |
-
import torchaudio
|
| 12 |
-
import librosa
|
| 13 |
-
import scipy
|
| 14 |
|
| 15 |
-
|
| 16 |
-
# ---------- LANGUAGE DETECTION (11 INDIAN SCRIPTS ONLY) ----------
|
| 17 |
def detect_language_from_text(text: str) -> str:
|
|
|
|
|
|
|
| 18 |
scripts = {
|
| 19 |
'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
| 20 |
'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
| 21 |
'gu': set('અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
|
| 22 |
'hi': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
|
| 23 |
-
'kn': set('
|
| 24 |
-
'ml': set('
|
| 25 |
-
'mr': set('
|
| 26 |
'or': set('ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
|
| 27 |
-
'pa': set('
|
| 28 |
-
'ta': set('
|
| 29 |
-
'te': set('
|
| 30 |
}
|
| 31 |
txt = set(text.replace(' ', ''))
|
| 32 |
for lang, chars in scripts.items():
|
|
@@ -45,6 +43,7 @@ class ProsodyNeutraliser:
|
|
| 45 |
if audio.dtype != np.float32:
|
| 46 |
audio = audio.astype(np.float32)
|
| 47 |
if src_sr != self.sr:
|
|
|
|
| 48 |
audio = torchaudio.functional.resample(torch.from_numpy(audio), src_sr, self.sr).numpy()
|
| 49 |
# very light pitch/energy flattening
|
| 50 |
f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
|
|
@@ -82,35 +81,20 @@ def synthesize_speech(text, ref_audio, ref_text):
|
|
| 82 |
else:
|
| 83 |
return "Error: Invalid reference audio input."
|
| 84 |
|
| 85 |
-
# Detect languages
|
| 86 |
-
tgt_lang = detect_language_from_text(text)
|
| 87 |
-
ref_lang = detect_language_from_text(ref_text)
|
| 88 |
-
|
| 89 |
-
# If target is Odia but reference is not, neutralise prosody
|
| 90 |
-
if tgt_lang == 'or' and ref_lang != 'or':
|
| 91 |
-
neutraliser = ProsodyNeutraliser()
|
| 92 |
-
sample_rate, audio_data = neutraliser.neutralise_prosody(audio_data, sample_rate)
|
| 93 |
-
|
| 94 |
# Save reference audio directly without resampling
|
| 95 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
|
| 96 |
sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
|
| 97 |
temp_audio.flush()
|
| 98 |
|
| 99 |
-
|
| 100 |
-
wav, sr = sf.read(temp_audio.name)
|
| 101 |
-
wav = torch.from_numpy(wav).float()
|
| 102 |
-
if sr != 24000:
|
| 103 |
-
wav = torchaudio.functional.resample(wav, sr, 24000)
|
| 104 |
-
|
| 105 |
-
# Feed tensor directly to model – never touches torchaudio.load
|
| 106 |
-
out = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
|
| 107 |
|
| 108 |
# Normalize output and save
|
| 109 |
-
if
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
|
| 114 |
# Load TTS model
|
| 115 |
repo_id = "ai4bharat/IndicF5"
|
| 116 |
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
|
|
@@ -118,41 +102,40 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
| 118 |
print("Device", device)
|
| 119 |
model = model.to(device)
|
| 120 |
|
| 121 |
-
#
|
| 122 |
EXAMPLES = [
|
| 123 |
{
|
| 124 |
"audio_name": "PAN_F (Happy)",
|
| 125 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
|
| 126 |
-
"ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ
|
| 127 |
-
"synth_text": "
|
| 128 |
},
|
| 129 |
{
|
| 130 |
"audio_name": "TAM_F (Happy)",
|
| 131 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
|
| 132 |
"ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
|
| 133 |
-
"synth_text": "
|
| 134 |
},
|
| 135 |
{
|
| 136 |
"audio_name": "MAR_F (WIKI)",
|
| 137 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
|
| 138 |
"ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
|
| 139 |
-
"synth_text": "
|
| 140 |
},
|
| 141 |
{
|
| 142 |
"audio_name": "MAR_M (WIKI)",
|
| 143 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
|
| 144 |
"ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
|
| 145 |
-
"synth_text": "
|
| 146 |
},
|
| 147 |
{
|
| 148 |
"audio_name": "KAN_F (Happy)",
|
| 149 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
|
| 150 |
-
"ref_text": "ನಮ್
|
| 151 |
-
"synth_text": "
|
| 152 |
},
|
| 153 |
]
|
| 154 |
|
| 155 |
-
|
| 156 |
# Preload all example audios
|
| 157 |
for example in EXAMPLES:
|
| 158 |
sample_rate, audio_data = load_audio_from_url(example["audio_url"])
|
|
|
|
| 8 |
import soundfile as sf
|
| 9 |
from transformers import AutoModel
|
| 10 |
from typing import Tuple
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
# ---------- LANGUAGE DETECTION (11 INDIAN LANGUAGES ONLY) ----------
|
|
|
|
| 13 |
def detect_language_from_text(text: str) -> str:
|
| 14 |
+
"""Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te."""
|
| 15 |
+
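# 11 Indian language codes – Latin (English) is **not** included. NOTE: the 'as'/'bn' entries and the 'hi'/'mr' entries use identical character sets, so the character table alone cannot tell those pairs apart.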
|
| 16 |
scripts = {
|
| 17 |
'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
| 18 |
'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
|
| 19 |
'gu': set('અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
|
| 20 |
'hi': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
|
| 21 |
+
'kn': set('ಅಆಇಈಉಊಋಏಐಓಔಕಖಗಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲಳವಶಷಸಹಕ್ಷಜ್ಞ'),
|
| 22 |
+
'ml': set('അആഇഈഉഊഋഏഐഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലളവശഷസഹക്ഷജ്ഞ'),
|
| 23 |
+
'mr': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
|
| 24 |
'or': set('ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
|
| 25 |
+
'pa': set('ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਲ਼ਵਸ਼ਸਹਕਸ਼ਜ਼'),
|
| 26 |
+
'ta': set('அஆஇஈஉஊஎஐஒஔகஙசஜஞடணதநபமயரலவழளஶஷஸஹக்ஷஜ்ஞ'),
|
| 27 |
+
'te': set('అఆఇఈఉఊఋఎఐఒఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలళవశషసహక్షజ్ఞ'),
|
| 28 |
}
|
| 29 |
txt = set(text.replace(' ', ''))
|
| 30 |
for lang, chars in scripts.items():
|
|
|
|
| 43 |
if audio.dtype != np.float32:
|
| 44 |
audio = audio.astype(np.float32)
|
| 45 |
if src_sr != self.sr:
|
| 46 |
+
import torchaudio
|
| 47 |
audio = torchaudio.functional.resample(torch.from_numpy(audio), src_sr, self.sr).numpy()
|
| 48 |
# very light pitch/energy flattening
|
| 49 |
f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
|
|
|
|
| 81 |
else:
|
| 82 |
return "Error: Invalid reference audio input."
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
# Save reference audio directly without resampling
|
| 85 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
|
| 86 |
sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
|
| 87 |
temp_audio.flush()
|
| 88 |
|
| 89 |
+
audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
# Normalize output and save
|
| 92 |
+
if audio.dtype == np.int16:
|
| 93 |
+
audio = audio.astype(np.float32) / 32768.0
|
| 94 |
+
|
| 95 |
+
return 24000, audio
|
| 96 |
+
|
| 97 |
|
|
|
|
| 98 |
# Load TTS model
|
| 99 |
repo_id = "ai4bharat/IndicF5"
|
| 100 |
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
|
|
|
|
| 102 |
print("Device", device)
|
| 103 |
model = model.to(device)
|
| 104 |
|
| 105 |
+
# ---------- PRE-FETCH EXAMPLES (ONLY ODIA SYNTH TEXT) ----------
|
| 106 |
EXAMPLES = [
|
| 107 |
{
|
| 108 |
"audio_name": "PAN_F (Happy)",
|
| 109 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
|
| 110 |
+
"ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
|
| 111 |
+
"synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
|
| 112 |
},
|
| 113 |
{
|
| 114 |
"audio_name": "TAM_F (Happy)",
|
| 115 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
|
| 116 |
"ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
|
| 117 |
+
"synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
|
| 118 |
},
|
| 119 |
{
|
| 120 |
"audio_name": "MAR_F (WIKI)",
|
| 121 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
|
| 122 |
"ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
|
| 123 |
+
"synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
|
| 124 |
},
|
| 125 |
{
|
| 126 |
"audio_name": "MAR_M (WIKI)",
|
| 127 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
|
| 128 |
"ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
|
| 129 |
+
"synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
|
| 130 |
},
|
| 131 |
{
|
| 132 |
"audio_name": "KAN_F (Happy)",
|
| 133 |
"audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
|
| 134 |
+
"ref_text": "ನಮ್ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂಲಿಂಗ್ ಸಮಸ್ಯೆ ಆಗಿ ನಾನ್ ಭಾಳ ದಿನದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆಕಾನಿಕ್ ಆಗಿರೋ ನಿಮ್ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬೋದು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
|
| 135 |
+
"synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
|
| 136 |
},
|
| 137 |
]
|
| 138 |
|
|
|
|
| 139 |
# Preload all example audios
|
| 140 |
for example in EXAMPLES:
|
| 141 |
sample_rate, audio_data = load_audio_from_url(example["audio_url"])
|