high77 committed on
Commit
5ad586c
·
verified ·
1 Parent(s): 4c5529f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -41
app.py CHANGED
@@ -8,25 +8,23 @@ import gradio as gr
8
  import soundfile as sf
9
  from transformers import AutoModel
10
  from typing import Tuple
11
- import torchaudio
12
- import librosa
13
- import scipy
14
 
15
-
16
- # ---------- LANGUAGE DETECTION (11 INDIAN SCRIPTS ONLY) ----------
17
  def detect_language_from_text(text: str) -> str:
 
 
18
  scripts = {
19
  'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
20
  'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
21
  'gu': set('અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
22
  'hi': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
23
- 'kn': set('ಅಆಇಈଉଊଋಏఐಓಔಕಖಗಘಙಚಛಜಝಞಟಠడಢಣತಥದಧನಪಫಬಭಮయರಲಳವಶಷಸಹಕ್ಷಜ್ಞ'),
24
- 'ml': set('അആഇഈഉഊऋഏఐഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലളവശഷസഹക്ഷജ്ഞ'),
25
- 'mr': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहক্ষज्ञ'),
26
  'or': set('ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
27
- 'pa': set('ਅਆਇਈਉਊਏਐਓऔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਲ਼ਵਸ਼਷ਸਹਕਸ਼ਜ਼'),
28
- 'ta': set('அஆஇஈஉஊऋஎஐஒஔகஙசஜஞடணதநபமயரலவழளஶஷஸஹக்ஷஜ்ஞ'),
29
- 'te': set('అఆఇఈଉஊऋఎஐఒఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలళవశషసహక్షజ్ఞ'),
30
  }
31
  txt = set(text.replace(' ', ''))
32
  for lang, chars in scripts.items():
@@ -45,6 +43,7 @@ class ProsodyNeutraliser:
45
  if audio.dtype != np.float32:
46
  audio = audio.astype(np.float32)
47
  if src_sr != self.sr:
 
48
  audio = torchaudio.functional.resample(torch.from_numpy(audio), src_sr, self.sr).numpy()
49
  # very light pitch/energy flattening
50
  f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
@@ -82,35 +81,20 @@ def synthesize_speech(text, ref_audio, ref_text):
82
  else:
83
  return "Error: Invalid reference audio input."
84
 
85
- # Detect languages
86
- tgt_lang = detect_language_from_text(text)
87
- ref_lang = detect_language_from_text(ref_text)
88
-
89
- # If target is Odia but reference is not, neutralise prosody
90
- if tgt_lang == 'or' and ref_lang != 'or':
91
- neutraliser = ProsodyNeutraliser()
92
- sample_rate, audio_data = neutraliser.neutralise_prosody(audio_data, sample_rate)
93
-
94
  # Save reference audio directly without resampling
95
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
96
  sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
97
  temp_audio.flush()
98
 
99
- # Load with soundfile (not torchaudio) and convert to tensor
100
- wav, sr = sf.read(temp_audio.name)
101
- wav = torch.from_numpy(wav).float()
102
- if sr != 24000:
103
- wav = torchaudio.functional.resample(wav, sr, 24000)
104
-
105
- # Feed tensor directly to model – never touches torchaudio.load
106
- out = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
107
 
108
  # Normalize output and save
109
- if out.dtype == np.int16:
110
- out = out.astype(np.float32) / 32768.0
111
- return 24000, out
 
 
112
 
113
-
114
  # Load TTS model
115
  repo_id = "ai4bharat/IndicF5"
116
  model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
@@ -118,41 +102,40 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
118
  print("Device", device)
119
  model = model.to(device)
120
 
121
- # Example Data (Multiple Examples)
122
  EXAMPLES = [
123
  {
124
  "audio_name": "PAN_F (Happy)",
125
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
126
- "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮਿਸਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
127
- "synth_text": "ରେ ହେବା ଉଚିତ କି, ମୁଁ କାଳେ ନିଜ ଅଳିଆ"
128
  },
129
  {
130
  "audio_name": "TAM_F (Happy)",
131
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
132
  "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
133
- "synth_text": "ୟେ ଭୁଲ,ତଥାପି ବାଛିବାକୁ ଚେଷ୍ଟା କରେ"
134
  },
135
  {
136
  "audio_name": "MAR_F (WIKI)",
137
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
138
  "ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
139
- "synth_text": " ହେଲେ ତାକୁ ଛାଡ଼ି ଦେବ।"
140
  },
141
  {
142
  "audio_name": "MAR_M (WIKI)",
143
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
144
  "ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
145
- "synth_text": "ବୁଝା ପଡ଼ୁଛି, ମାତ୍ର ତଥାପି।"
146
  },
147
  {
148
  "audio_name": "KAN_F (Happy)",
149
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
150
- "ref_text": "ನಮ್‌ ಫ್ರిజ್ಜಲ್ಲಿ ಕೂలಿಂಗ్‌ ಸమస্যೆ ಆಗಿ నాన్‌ భాళ దినదಿಂದ ఒద్దాడ్తిద్దె, ఆద్రె అద్నీగ మెకానిక్ ఆగిరో నిమ్‌ సహాయ్దింద బగెహరిస్కోబోదు అంతాగి నిరాళ ఆయ్తు నంగె.",
151
- "synth_text": "ଦାକ୍ତର ଔଷଧ ଦେଇ ଦିଅନ୍ତି। ଆମେ ଔଷଧ କିଣି ଆଣୁ। ଦାକ୍ତର ପାଖକୁ ବି ଚାଲିଗଲେ।"
152
  },
153
  ]
154
 
155
-
156
  # Preload all example audios
157
  for example in EXAMPLES:
158
  sample_rate, audio_data = load_audio_from_url(example["audio_url"])
 
8
  import soundfile as sf
9
  from transformers import AutoModel
10
  from typing import Tuple
 
 
 
11
 
12
+ # ---------- LANGUAGE DETECTION (11 INDIAN LANGUAGES ONLY) ----------
 
13
  def detect_language_from_text(text: str) -> str:
14
+ """Return one of: as, bn, gu, hi, kn, ml, mr, or, pa, ta, te."""
15
+ # Character sets for the 11 supported Indian languages (as/bn and hi/mr share a script) – Latin (English) is **not** included
16
  scripts = {
17
  'as': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
18
  'bn': set('অআইঈউঊঋএঐওঔকখগঘঙচছজঝঞটঠডঢণতথদধনপফবভমযরলশষসহক্ষজ্ঞড়ঢ়'),
19
  'gu': set('અઆઇઈઉઊઋએઐઓઔકખગઘઙચછજઝઞટઠડઢણતથદધનપફબભમયરલળવશષસહક્ષજ્ઞ'),
20
  'hi': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
21
+ 'kn': set('ಅಆಇಈಉಊಋಏಐಓಔಕಖಗಘಙಚಛಜಝಞಟಠಡಢಣತಥದಧನಪಫಬಭಮಯರಲಳವಶಷಸಹಕ್ಷಜ್ಞ'),
22
+ 'ml': set('അആഇഈഉഊഋഏഐഓഔകഖഗഘങചഛജഝഞടഠഡഢണതഥദധനപഫബഭമയരലളവശഷസഹക്ഷജ്ഞ'),
23
+ 'mr': set('अआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहक्षज्ञ'),
24
  'or': set('ଅଆଇଈଉଊଋୠଌଏଐଓଔକଖଗଘଙଚଛଜଝଞଟଠଡଢଣତଥଦଧନପଫବଭମୟରଳୱଶଷସହକ୍ଷୟଲଵଡ଼ଢ଼'),
25
+ 'pa': set('ਅਆਇਈਉਊਏਐਓਔਕਖਗਘਙਚਛਜਝਞਟਠਡਢਣਤਥਦਧਨਪਫਬਭਮਯਰਲਲ਼ਵਸ਼਷ਸਹਕਸ਼ਜ਼'),
26
+ 'ta': set('அஆஇஈஉஊ஋எஐஒஔகஙசஜஞடணதநபமயரலவழளஶஷஸஹக்ஷஜ்ஞ'),
27
+ 'te': set('అఆఇఈఉఊఋఎఐఒఔకఖగఘఙచఛజఝఞటఠడఢణతథదధనపఫబభమయరలళవశషసహక్షజ్ఞ'),
28
  }
29
  txt = set(text.replace(' ', ''))
30
  for lang, chars in scripts.items():
 
43
  if audio.dtype != np.float32:
44
  audio = audio.astype(np.float32)
45
  if src_sr != self.sr:
46
+ import torchaudio
47
  audio = torchaudio.functional.resample(torch.from_numpy(audio), src_sr, self.sr).numpy()
48
  # very light pitch/energy flattening
49
  f0, voiced_flag, _ = librosa.pyin(audio, fmin=librosa.note_to_hz('C2'),
 
81
  else:
82
  return "Error: Invalid reference audio input."
83
 
 
 
 
 
 
 
 
 
 
84
  # Save reference audio directly without resampling
85
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_audio:
86
  sf.write(temp_audio.name, audio_data, samplerate=sample_rate, format='WAV')
87
  temp_audio.flush()
88
 
89
+ audio = model(text, ref_audio_path=temp_audio.name, ref_text=ref_text)
 
 
 
 
 
 
 
90
 
91
  # Normalize output and save
92
+ if audio.dtype == np.int16:
93
+ audio = audio.astype(np.float32) / 32768.0
94
+
95
+ return 24000, audio
96
+
97
 
 
98
  # Load TTS model
99
  repo_id = "ai4bharat/IndicF5"
100
  model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
 
102
  print("Device", device)
103
  model = model.to(device)
104
 
105
+ # ---------- PRE-FETCH EXAMPLES (ONLY ODIA SYNTH TEXT) ----------
106
  EXAMPLES = [
107
  {
108
  "audio_name": "PAN_F (Happy)",
109
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/PAN_F_HAPPY_00002.wav",
110
+ "ref_text": "ਇੱਕ ਗ੍ਰਾਹਕ ਨੇ ਸਾਡੀ ਬੇਮిసਾਲ ਸੇਵਾ ਬਾਰੇ ਦਿਲੋਂਗਵਾਹੀ ਦਿੱਤੀ ਜਿਸ ਨਾਲ ਸਾਨੂੰ ਅਨੰਦ ਮਹਿਸੂਸ ਹੋਇਆ।",
111
+ "synth_text": "ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି, କେମିତି ଅଛନ୍ତି?"
112
  },
113
  {
114
  "audio_name": "TAM_F (Happy)",
115
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/TAM_F_HAPPY_00001.wav",
116
  "ref_text": "நான் நெனச்ச மாதிரியே அமேசான்ல பெரிய தள்ளுபடி வந்திருக்கு. கம்மி காசுக்கே அந்தப் புது சேம்சங் மாடல வாங்கிடலாம்.",
117
+ "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
118
  },
119
  {
120
  "audio_name": "MAR_F (WIKI)",
121
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_F_WIKI_00001.wav",
122
  "ref_text": "दिगंतराव्दारे अंतराळ कक्षेतला कचरा चिन्हित करण्यासाठी प्रयत्न केले जात आहे.",
123
+ "synth_text": "ଆପଣ କିପରି ଅଛନ୍ତି? ମୁଁ ଆପଣଙ୍କୁ ସ୍ୱାଗତ କରିବାକୁ ଚାହୁଁଛି।"
124
  },
125
  {
126
  "audio_name": "MAR_M (WIKI)",
127
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/MAR_M_WIKI_00001.wav",
128
  "ref_text": "या प्रथाला एकोणीसशे पंचातर ईसवी पासून भारतीय दंड संहिताची धारा चारशे अठ्ठावीस आणि चारशे एकोणतीसच्या अन्तर्गत निषेध केला.",
129
+ "synth_text": "ମୁଁ ଆଜି ବହୁତ ଖୁସି ଅଛି କାରଣ ମୋର କାମ ସଫଳ ହୋଇଛି।"
130
  },
131
  {
132
  "audio_name": "KAN_F (Happy)",
133
  "audio_url": "https://github.com/AI4Bharat/IndicF5/raw/refs/heads/main/prompts/KAN_F_HAPPY_00001.wav",
134
+ "ref_text": "ನಮ್‌ ಫ್ರಿಜ್ಜಲ್ಲಿ ಕೂలಿಂಗ್‌ ಸమಸ്യೆ ಆಗಿ ನಾನ್‌ ಭಾಳ ದినದಿಂದ ಒದ್ದಾಡ್ತಿದ್ದೆ, ಆದ್ರೆ ಅದ್ನೀಗ ಮೆకానిక್ ಆಗಿರೋ ನిమ್‌ ಸಹಾಯ್ದಿಂದ ಬಗೆಹರಿಸ್ಕೋಬోదು ಅಂತಾಗಿ ನಿರಾಳ ಆಯ್ತು ನಂಗೆ.",
135
+ "synth_text": "ନମସ୍କାର, କେମିତି ଅଛନ୍ତି?"
136
  },
137
  ]
138
 
 
139
  # Preload all example audios
140
  for example in EXAMPLES:
141
  sample_rate, audio_data = load_audio_from_url(example["audio_url"])