FILMITO committed on
Commit
b95b3b5
·
verified ·
1 Parent(s): 4692c48

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +301 -488
app.py CHANGED
@@ -1,497 +1,310 @@
1
- import tempfile
2
-
3
- import edge_tts
4
  import gradio as gr
5
- from gradio_client import Client
6
- import pyarabic.araby as araby
7
-
8
- language_dict = {
9
- "English": {
10
- "Jenny": "en-US-JennyNeural",
11
- "Guy": "en-US-GuyNeural",
12
- "Ana": "en-US-AnaNeural",
13
- "Aria": "en-US-AriaNeural",
14
- "Christopher": "en-US-ChristopherNeural",
15
- "Eric": "en-US-EricNeural",
16
- "Michelle": "en-US-MichelleNeural",
17
- "Roger": "en-US-RogerNeural",
18
- "Natasha": "en-AU-NatashaNeural",
19
- "William": "en-AU-WilliamNeural",
20
- "Clara": "en-CA-ClaraNeural",
21
- "Liam": "en-CA-LiamNeural",
22
- "Libby": "en-GB-LibbyNeural",
23
- "Maisie": "en-GB-MaisieNeural",
24
- "Ryan": "en-GB-RyanNeural",
25
- "Sonia": "en-GB-SoniaNeural",
26
- "Thomas": "en-GB-ThomasNeural",
27
- "Sam": "en-HK-SamNeural",
28
- "Yan": "en-HK-YanNeural",
29
- "Connor": "en-IE-ConnorNeural",
30
- "Emily": "en-IE-EmilyNeural",
31
- "Neerja": "en-IN-NeerjaNeural",
32
- "Prabhat": "en-IN-PrabhatNeural",
33
- "Asilia": "en-KE-AsiliaNeural",
34
- "Chilemba": "en-KE-ChilembaNeural",
35
- "Abeo": "en-NG-AbeoNeural",
36
- "Ezinne": "en-NG-EzinneNeural",
37
- "Mitchell": "en-NZ-MitchellNeural",
38
- "James": "en-PH-JamesNeural",
39
- "Rosa": "en-PH-RosaNeural",
40
- "Luna": "en-SG-LunaNeural",
41
- "Wayne": "en-SG-WayneNeural",
42
- "Elimu": "en-TZ-ElimuNeural",
43
- "Imani": "en-TZ-ImaniNeural",
44
- "Leah": "en-ZA-LeahNeural",
45
- "Luke": "en-ZA-LukeNeural"
46
- },
47
- "Spanish": {
48
- "Elena": "es-AR-ElenaNeural",
49
- "Tomas": "es-AR-TomasNeural",
50
- "Marcelo": "es-BO-MarceloNeural",
51
- "Sofia": "es-BO-SofiaNeural",
52
- "Gonzalo": "es-CO-GonzaloNeural",
53
- "Salome": "es-CO-SalomeNeural",
54
- "Juan": "es-CR-JuanNeural",
55
- "Maria": "es-CR-MariaNeural",
56
- "Belkys": "es-CU-BelkysNeural",
57
- "Emilio": "es-DO-EmilioNeural",
58
- "Ramona": "es-DO-RamonaNeural",
59
- "Andrea": "es-EC-AndreaNeural",
60
- "Luis": "es-EC-LuisNeural",
61
- "Alvaro": "es-ES-AlvaroNeural",
62
- "Elvira": "es-ES-ElviraNeural",
63
- "Teresa": "es-GQ-TeresaNeural",
64
- "Andres": "es-GT-AndresNeural",
65
- "Marta": "es-GT-MartaNeural",
66
- "Carlos": "es-HN-CarlosNeural",
67
- "Karla": "es-HN-KarlaNeural",
68
- "Federico": "es-NI-FedericoNeural",
69
- "Yolanda": "es-NI-YolandaNeural",
70
- "Margarita": "es-PA-MargaritaNeural",
71
- "Roberto": "es-PA-RobertoNeural",
72
- "Alex": "es-PE-AlexNeural",
73
- "Camila": "es-PE-CamilaNeural",
74
- "Karina": "es-PR-KarinaNeural",
75
- "Victor": "es-PR-VictorNeural",
76
- "Mario": "es-PY-MarioNeural",
77
- "Tania": "es-PY-TaniaNeural",
78
- "Lorena": "es-SV-LorenaNeural",
79
- "Rodrigo": "es-SV-RodrigoNeural",
80
- "Alonso": "es-US-AlonsoNeural",
81
- "Paloma": "es-US-PalomaNeural",
82
- "Mateo": "es-UY-MateoNeural",
83
- "Valentina": "es-UY-ValentinaNeural",
84
- "Paola": "es-VE-PaolaNeural",
85
- "Sebastian": "es-VE-SebastianNeural"
86
- },
87
- "Arabic": {
88
- "Hamed": "ar-SA-HamedNeural",
89
- "Zariyah": "ar-SA-ZariyahNeural",
90
- "Fatima": "ar-AE-FatimaNeural",
91
- "Hamdan": "ar-AE-HamdanNeural",
92
- "Ali": "ar-BH-AliNeural",
93
- "Laila": "ar-BH-LailaNeural",
94
- "Ismael": "ar-DZ-IsmaelNeural",
95
- "Salma": "ar-EG-SalmaNeural",
96
- "Shakir": "ar-EG-ShakirNeural",
97
- "Bassel": "ar-IQ-BasselNeural",
98
- "Rana": "ar-IQ-RanaNeural",
99
- "Sana": "ar-JO-SanaNeural",
100
- "Taim": "ar-JO-TaimNeural",
101
- "Fahed": "ar-KW-FahedNeural",
102
- "Noura": "ar-KW-NouraNeural",
103
- "Layla": "ar-LB-LaylaNeural",
104
- "Rami": "ar-LB-RamiNeural",
105
- "Iman": "ar-LY-ImanNeural",
106
- "Omar": "ar-LY-OmarNeural",
107
- "Jamal": "ar-MA-JamalNeural",
108
- "Mouna": "ar-MA-MounaNeural",
109
- "Abdullah": "ar-OM-AbdullahNeural",
110
- "Aysha": "ar-OM-AyshaNeural",
111
- "Amal": "ar-QA-AmalNeural",
112
- "Moaz": "ar-QA-MoazNeural",
113
- "Amany": "ar-SY-AmanyNeural",
114
- "Laith": "ar-SY-LaithNeural",
115
- "Hedi": "ar-TN-HediNeural",
116
- "Reem": "ar-TN-ReemNeural",
117
- "Maryam": "ar-YE-MaryamNeural",
118
- "Saleh": "ar-YE-SalehNeural"
119
- },
120
- "Korean": {
121
- "Sun-Hi": "ko-KR-SunHiNeural",
122
- "InJoon": "ko-KR-InJoonNeural"
123
- },
124
- "Thai": {
125
- "Premwadee": "th-TH-PremwadeeNeural",
126
- "Niwat": "th-TH-NiwatNeural"
127
- },
128
- "Vietnamese": {
129
- "HoaiMy": "vi-VN-HoaiMyNeural",
130
- "NamMinh": "vi-VN-NamMinhNeural"
131
- },
132
- "Japanese": {
133
- "Nanami": "ja-JP-NanamiNeural",
134
- "Keita": "ja-JP-KeitaNeural"
135
- },
136
- "French": {
137
- "Denise": "fr-FR-DeniseNeural",
138
- "Eloise": "fr-FR-EloiseNeural",
139
- "Henri": "fr-FR-HenriNeural",
140
- "Sylvie": "fr-CA-SylvieNeural",
141
- "Antoine": "fr-CA-AntoineNeural",
142
- "Jean": "fr-CA-JeanNeural",
143
- "Ariane": "fr-CH-ArianeNeural",
144
- "Fabrice": "fr-CH-FabriceNeural",
145
- "Charline": "fr-BE-CharlineNeural",
146
- "Gerard": "fr-BE-GerardNeural"
147
- },
148
- "Portuguese": {
149
- "Francisca": "pt-BR-FranciscaNeural",
150
- "Antonio": "pt-BR-AntonioNeural",
151
- "Duarte": "pt-PT-DuarteNeural",
152
- "Raquel": "pt-PT-RaquelNeural"
153
- },
154
- "Indonesian": {
155
- "Ardi": "id-ID-ArdiNeural",
156
- "Gadis": "id-ID-GadisNeural"
157
- },
158
- "Hebrew": {
159
- "Avri": "he-IL-AvriNeural",
160
- "Hila": "he-IL-HilaNeural"
161
- },
162
- "Italian": {
163
- "Isabella": "it-IT-IsabellaNeural",
164
- "Diego": "it-IT-DiegoNeural",
165
- "Elsa": "it-IT-ElsaNeural"
166
- },
167
- "Dutch": {
168
- "Colette": "nl-NL-ColetteNeural",
169
- "Fenna": "nl-NL-FennaNeural",
170
- "Maarten": "nl-NL-MaartenNeural",
171
- "Arnaud": "nl-BE-ArnaudNeural",
172
- "Dena": "nl-BE-DenaNeural"
173
- },
174
- "Malay": {
175
- "Osman": "ms-MY-OsmanNeural",
176
- "Yasmin": "ms-MY-YasminNeural"
177
- },
178
- "Norwegian": {
179
- "Pernille": "nb-NO-PernilleNeural",
180
- "Finn": "nb-NO-FinnNeural"
181
- },
182
- "Swedish": {
183
- "Sofie": "sv-SE-SofieNeural",
184
- "Mattias": "sv-SE-MattiasNeural"
185
- },
186
- "Greek": {
187
- "Athina": "el-GR-AthinaNeural",
188
- "Nestoras": "el-GR-NestorasNeural"
189
- },
190
- "German": {
191
- "Katja": "de-DE-KatjaNeural",
192
- "Amala": "de-DE-AmalaNeural",
193
- "Conrad": "de-DE-ConradNeural",
194
- "Killian": "de-DE-KillianNeural",
195
- "Ingrid": "de-AT-IngridNeural",
196
- "Jonas": "de-AT-JonasNeural",
197
- "Jan": "de-CH-JanNeural",
198
- "Leni": "de-CH-LeniNeural"
199
- },
200
- "Afrikaans": {
201
- "Adri": "af-ZA-AdriNeural",
202
- "Willem": "af-ZA-WillemNeural"
203
- },
204
- "Amharic": {
205
- "Ameha": "am-ET-AmehaNeural",
206
- "Mekdes": "am-ET-MekdesNeural"
207
- },
208
- "Azerbaijani": {
209
- "Babek": "az-AZ-BabekNeural",
210
- "Banu": "az-AZ-BanuNeural"
211
- },
212
- "Bulgarian": {
213
- "Borislav": "bg-BG-BorislavNeural",
214
- "Kalina": "bg-BG-KalinaNeural"
215
- },
216
- "Bengali": {
217
- "Nabanita": "bn-BD-NabanitaNeural",
218
- "Pradeep": "bn-BD-PradeepNeural",
219
- "Bashkar": "bn-IN-BashkarNeural",
220
- "Tanishaa": "bn-IN-TanishaaNeural"
221
- },
222
- "Bosnian": {
223
- "Goran": "bs-BA-GoranNeural",
224
- "Vesna": "bs-BA-VesnaNeural"
225
- },
226
- "Catalan": {
227
- "Joana": "ca-ES-JoanaNeural",
228
- "Enric": "ca-ES-EnricNeural"
229
- },
230
- "Czech": {
231
- "Antonin": "cs-CZ-AntoninNeural",
232
- "Vlasta": "cs-CZ-VlastaNeural"
233
- },
234
- "Welsh": {
235
- "Aled": "cy-GB-AledNeural",
236
- "Nia": "cy-GB-NiaNeural"
237
- },
238
- "Danish": {
239
- "Christel": "da-DK-ChristelNeural",
240
- "Jeppe": "da-DK-JeppeNeural"
241
- },
242
- "Estonian": {
243
- "Anu": "et-EE-AnuNeural",
244
- "Kert": "et-EE-KertNeural"
245
- },
246
- "Persian": {
247
- "Dilara": "fa-IR-DilaraNeural",
248
- "Farid": "fa-IR-FaridNeural"
249
- },
250
- "Finnish": {
251
- "Harri": "fi-FI-HarriNeural",
252
- "Noora": "fi-FI-NooraNeural"
253
- },
254
- "Irish": {
255
- "Colm": "ga-IE-ColmNeural",
256
- "Orla": "ga-IE-OrlaNeural"
257
- },
258
- "Galician": {
259
- "Roi": "gl-ES-RoiNeural",
260
- "Sabela": "gl-ES-SabelaNeural"
261
- },
262
- "Gujarati": {
263
- "Dhwani": "gu-IN-DhwaniNeural",
264
- "Niranjan": "gu-IN-NiranjanNeural"
265
- },
266
- "Hindi": {
267
- "Madhur": "hi-IN-MadhurNeural",
268
- "Swara": "hi-IN-SwaraNeural"
269
- },
270
- "Croatian": {
271
- "Gabrijela": "hr-HR-GabrijelaNeural",
272
- "Srecko": "hr-HR-SreckoNeural"
273
- },
274
- "Hungarian": {
275
- "Noemi": "hu-HU-NoemiNeural",
276
- "Tamas": "hu-HU-TamasNeural"
277
- },
278
- "Icelandic": {
279
- "Gudrun": "is-IS-GudrunNeural",
280
- "Gunnar": "is-IS-GunnarNeural"
281
- },
282
- "Javanese": {
283
- "Dimas": "jv-ID-DimasNeural",
284
- "Siti": "jv-ID-SitiNeural"
285
- },
286
- "Georgian": {
287
- "Eka": "ka-GE-EkaNeural",
288
- "Giorgi": "ka-GE-GiorgiNeural"
289
- },
290
- "Kazakh": {
291
- "Aigul": "kk-KZ-AigulNeural",
292
- "Daulet": "kk-KZ-DauletNeural"
293
- },
294
- "Khmer": {
295
- "Piseth": "km-KH-PisethNeural",
296
- "Sreymom": "km-KH-SreymomNeural"
297
- },
298
- "Kannada": {
299
- "Gagan": "kn-IN-GaganNeural",
300
- "Sapna": "kn-IN-SapnaNeural"
301
- },
302
- "Lao": {
303
- "Chanthavong": "lo-LA-ChanthavongNeural",
304
- "Keomany": "lo-LA-KeomanyNeural"
305
- },
306
- "Lithuanian": {
307
- "Leonas": "lt-LT-LeonasNeural",
308
- "Ona": "lt-LT-OnaNeural"
309
- },
310
- "Latvian": {
311
- "Everita": "lv-LV-EveritaNeural",
312
- "Nils": "lv-LV-NilsNeural"
313
- },
314
- "Macedonian": {
315
- "Aleksandar": "mk-MK-AleksandarNeural",
316
- "Marija": "mk-MK-MarijaNeural"
317
- },
318
- "Malayalam": {
319
- "Midhun": "ml-IN-MidhunNeural",
320
- "Sobhana": "ml-IN-SobhanaNeural"
321
- },
322
- "Mongolian": {
323
- "Bataa": "mn-MN-BataaNeural",
324
- "Yesui": "mn-MN-YesuiNeural"
325
- },
326
- "Marathi": {
327
- "Aarohi": "mr-IN-AarohiNeural",
328
- "Manohar": "mr-IN-ManoharNeural"
329
- },
330
- "Maltese": {
331
- "Grace": "mt-MT-GraceNeural",
332
- "Joseph": "mt-MT-JosephNeural"
333
- },
334
- "Burmese": {
335
- "Nilar": "my-MM-NilarNeural",
336
- "Thiha": "my-MM-ThihaNeural"
337
- },
338
- "Nepali": {
339
- "Hemkala": "ne-NP-HemkalaNeural",
340
- "Sagar": "ne-NP-SagarNeural"
341
- },
342
- "Polish": {
343
- "Marek": "pl-PL-MarekNeural",
344
- "Zofia": "pl-PL-ZofiaNeural"
345
- },
346
- "Pashto": {
347
- "Gul Nawaz": "ps-AF-GulNawazNeural",
348
- "Latifa": "ps-AF-LatifaNeural"
349
- },
350
- "Romanian": {
351
- "Alina": "ro-RO-AlinaNeural",
352
- "Emil": "ro-RO-EmilNeural"
353
- },
354
- "Russian": {
355
- "Svetlana": "ru-RU-SvetlanaNeural",
356
- "Dmitry": "ru-RU-DmitryNeural"
357
- },
358
- "Sinhala": {
359
- "Sameera": "si-LK-SameeraNeural",
360
- "Thilini": "si-LK-ThiliniNeural"
361
- },
362
- "Slovak": {
363
- "Lukas": "sk-SK-LukasNeural",
364
- "Viktoria": "sk-SK-ViktoriaNeural"
365
- },
366
- "Slovenian": {
367
- "Petra": "sl-SI-PetraNeural",
368
- "Rok": "sl-SI-RokNeural"
369
- },
370
- "Somali": {
371
- "Muuse": "so-SO-MuuseNeural",
372
- "Ubax": "so-SO-UbaxNeural"
373
- },
374
- "Albanian": {
375
- "Anila": "sq-AL-AnilaNeural",
376
- "Ilir": "sq-AL-IlirNeural"
377
- },
378
- "Serbian": {
379
- "Nicholas": "sr-RS-NicholasNeural",
380
- "Sophie": "sr-RS-SophieNeural"
381
- },
382
- "Sundanese": {
383
- "Jajang": "su-ID-JajangNeural",
384
- "Tuti": "su-ID-TutiNeural"
385
- },
386
- "Swahili": {
387
- "Rafiki": "sw-KE-RafikiNeural",
388
- "Zuri": "sw-KE-ZuriNeural",
389
- "Daudi": "sw-TZ-DaudiNeural",
390
- "Rehema": "sw-TZ-RehemaNeural"
391
- },
392
- "Tamil": {
393
- "Pallavi": "ta-IN-PallaviNeural",
394
- "Valluvar": "ta-IN-ValluvarNeural",
395
- "Kumar": "ta-LK-KumarNeural",
396
- "Saranya": "ta-LK-SaranyaNeural",
397
- "Kani": "ta-MY-KaniNeural",
398
- "Surya": "ta-MY-SuryaNeural",
399
- "Anbu": "ta-SG-AnbuNeural"
400
- },
401
- "Telugu": {
402
- "Mohan": "te-IN-MohanNeural",
403
- "Shruti": "te-IN-ShrutiNeural"
404
- },
405
- "Turkish": {
406
- "Ahmet": "tr-TR-AhmetNeural",
407
- "Emel": "tr-TR-EmelNeural"
408
- },
409
- "Ukrainian": {
410
- "Ostap": "uk-UA-OstapNeural",
411
- "Polina": "uk-UA-PolinaNeural"
412
- },
413
- "Urdu": {
414
- "Gul": "ur-IN-GulNeural",
415
- "Salman": "ur-IN-SalmanNeural",
416
- "Asad": "ur-PK-AsadNeural",
417
- "Uzma": "ur-PK-UzmaNeural"
418
- },
419
- "Uzbek": {
420
- "Madina": "uz-UZ-MadinaNeural",
421
- "Sardor": "uz-UZ-SardorNeural"
422
- },
423
- "Mandarin": {
424
- "Xiaoxiao": "zh-CN-XiaoxiaoNeural",
425
- "Yunyang": "zh-CN-YunyangNeural",
426
- "Yunxi": "zh-CN-YunxiNeural",
427
- "Xiaoyi": "zh-CN-XiaoyiNeural",
428
- "Yunjian": "zh-CN-YunjianNeural",
429
- "Yunxia": "zh-CN-YunxiaNeural",
430
- "Xiaobei": "zh-CN-liaoning-XiaobeiNeural",
431
- "Xiaoni": "zh-CN-shaanxi-XiaoniNeural",
432
- "HiuMaan": "zh-HK-HiuMaanNeural",
433
- "HiuGaai": "zh-HK-HiuGaaiNeural",
434
- "WanLung": "zh-HK-WanLungNeural",
435
- "HsiaoChen": "zh-TW-HsiaoChenNeural",
436
- "HsiaoYu": "zh-TW-HsiaoYuNeural",
437
- "YunJhe": "zh-TW-YunJheNeural"
438
- },
439
- "Zulu": {
440
- "Thando": "zu-ZA-ThandoNeural",
441
- "Themba": "zu-ZA-ThembaNeural"
442
- }
443
- }
444
-
445
- client = Client("MohamedRashad/arabic-auto-tashkeel")
446
-
447
- async def text_to_speech_edge(text, language_code, speaker, tashkeel_checkbox=False):
448
 
449
- # Remove diacritics from Arabic text then add tashkeel
450
- if language_code == "Arabic" and tashkeel_checkbox:
451
- text = client.predict(
452
- input_text=araby.strip_diacritics(text),
453
- api_name="/infer_shakkala"
454
- )
455
 
456
- # Get the voice for the selected language and speaker
457
- voice = language_dict[language_code][speaker]
458
- communicate = edge_tts.Communicate(text, voice)
459
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
460
- tmp_path = tmp_file.name
461
- await communicate.save(tmp_path)
462
-
463
- return text, tmp_path
464
-
465
-
466
- def get_speakers(language):
467
- print(language)
468
- speakers = list(language_dict[language].keys())
469
- return gr.Dropdown(choices=speakers, value=speakers[0], interactive=True), gr.Checkbox(visible=language == "Arabic", interactive=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
470
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
 
472
- default_language = None
473
- default_speaker = None
474
- with gr.Blocks(title="Multilingual TTS") as demo:
475
- gr.HTML("<center><h1>Multilingual TTS (Edge TTS)</h1></center>")
476
- gr.HTML(f"<h2 style='color:Tomato;'> {len(language_dict)} languages supported</h3>")
477
- gr.HTML(f"<p> {', '.join(language_dict.keys())} </h3>")
478
- gr.Markdown("**Note:** A special feature is added for Arabic language only.")
 
 
479
  with gr.Row():
480
- with gr.Column():
481
- input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to convert to speech")
482
- language = gr.Dropdown(
483
- choices=list(language_dict.keys()), value=default_language, label="Languages", interactive=True
 
 
484
  )
485
- speaker = gr.Dropdown(choices=[], value=default_speaker, label="Speakers", interactive=False)
486
- tashkeel_checkbox = gr.Checkbox(label="Tashkeel", value=False, visible=False, interactive=False)
487
- run_btn = gr.Button(value="Generate Audio", variant="primary")
488
-
489
- with gr.Column():
490
- output_text = gr.Textbox(label="Output Text")
491
- output_audio = gr.Audio(type="filepath", label="Audio Output")
492
-
493
- language.change(get_speakers, inputs=[language], outputs=[speaker, tashkeel_checkbox])
494
- run_btn.click(text_to_speech_edge, inputs=[input_text, language, speaker, tashkeel_checkbox], outputs=[output_text, output_audio])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
495
 
496
  if __name__ == "__main__":
497
- demo.queue().launch(share=True)
 
 
 
 
1
  import gradio as gr
2
+ import numpy as np
3
+ import tempfile
4
+ import librosa
5
+ import soundfile as sf
6
+ from scipy import signal
7
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
class AIHumanizer:
    """Post-processes AI-generated songs to blend in human-like imperfections.

    Each stage nudges the signal away from the "too perfect" character of
    synthetic audio (flat dynamics, exact timing, clean spectrum) by
    blending a lightly perturbed copy back into the dry signal. All stages
    are gated on ``intensity`` and fail soft: if a stage errors, the
    unmodified channel is returned.
    """

    def __init__(self):
        # Stateless: methods depend only on their arguments. Kept as a
        # class so the processing stages stay grouped.
        pass

    def humanize_audio(self, audio_path, intensity=0.7):
        """Load an audio file and run the full humanization chain.

        Parameters
        ----------
        audio_path : str
            Path to the input audio file (any format librosa can read).
        intensity : float
            Overall effect strength, expected in (0, 1].

        Returns
        -------
        tuple
            ``(audio, sr)`` where ``audio`` always has shape
            ``(channels, samples)`` — mono input is wrapped in a
            single-channel 2-D array — and ``sr`` is the file's native
            sample rate.

        Raises
        ------
        Exception
            Wraps any underlying failure in a "Humanization failed" message.
        """
        try:
            print(f"Loading audio from: {audio_path}")

            # sr=None keeps the native rate; mono=False preserves channels.
            y, sr = librosa.load(audio_path, sr=None, mono=False)

            print(f"Audio loaded: shape={y.shape if hasattr(y, 'shape') else 'mono'}, sr={sr}")

            if y.ndim > 1:
                # Multichannel: run the chain on each channel independently.
                print("Processing stereo audio...")
                processed_channels = []
                for i in range(y.shape[0]):
                    print(f"Processing channel {i+1}...")
                    processed_channels.append(self.process_channel(y[i], sr, intensity))
                y_processed = np.array(processed_channels)
            else:
                print("Processing mono audio...")
                # Normalize the return shape to (1, samples).
                y_processed = np.array([self.process_channel(y, sr, intensity)])

            print("Audio processing completed successfully")
            return y_processed, sr

        except Exception as e:
            print(f"Error in humanize_audio: {str(e)}")
            raise Exception(f"Humanization failed: {str(e)}")

    def process_channel(self, y, sr, intensity):
        """Run every humanization stage, in order, on one channel."""
        print(f"Processing channel: {len(y)} samples")

        # 1. Soften the harsh mid/high band typical of neural synthesis.
        y = self.reduce_ai_artifacts(y, sr, intensity)
        # 2. Micro time-stretch per one-second segment.
        y = self.add_timing_variations(y, sr, intensity)
        # 3. Blend in a slightly pitch-shifted copy.
        y = self.add_pitch_variations(y, sr, intensity)
        # 4. Light synthetic room reverb.
        y = self.add_room_ambiance(y, sr, intensity)
        # 5. Tape-style saturation with sub-bass cleanup.
        y = self.add_analog_warmth(y, sr, intensity)
        # 6. Slow-LFO + noise amplitude modulation.
        y = self.reduce_perfect_quantization(y, sr, intensity)

        return y

    def reduce_ai_artifacts(self, y, sr, intensity):
        """Notch part of the 1.9-6.1 kHz band, then blend with the dry signal.

        Skipped when the effect is essentially off or the sample rate is
        too low: the Butterworth design requires both band edges below
        Nyquist, so the 6100 Hz upper edge needs sr > 12200. (The old
        ``sr > 4000`` guard let butter() raise for 4000 < sr <= 12200 and
        the error was silently swallowed.)
        """
        if sr > 2 * 6100 and intensity > 0.1:
            try:
                sos = signal.butter(4, [1900, 6100], 'bandstop', fs=sr, output='sos')
                y_filtered = signal.sosfilt(sos, y)

                # Apply the notch only partially so the mix stays natural.
                blend_factor = 0.3 * intensity
                return y * (1 - blend_factor) + y_filtered * blend_factor
            except Exception:
                # Fail soft: keep the unprocessed channel.
                return y
        return y

    def add_timing_variations(self, y, sr, intensity):
        """Randomly stretch/compress each 1 s segment by up to +/-1 %."""
        if intensity < 0.2:
            return y

        try:
            segment_size = int(sr * 1.0)  # 1-second segments
            if len(y) < segment_size * 2:
                # Too short to segment meaningfully.
                return y

            segments = []
            for i in range(0, len(y), segment_size):
                segment = y[i:i + segment_size]
                if len(segment) == segment_size:
                    # Small random stretch factor, scaled by intensity.
                    stretch = 1.0 + np.random.uniform(-0.01, 0.01) * intensity
                    new_len = int(segment_size * stretch)

                    # Linear-interpolation resample to the stretched length.
                    x_old = np.linspace(0, 1, segment_size)
                    x_new = np.linspace(0, 1, new_len)
                    segment_stretched = np.interp(x_new, x_old, segment)

                    # Restore the original segment length so total duration
                    # (and cross-channel alignment) is preserved.
                    if len(segment_stretched) > segment_size:
                        segment_stretched = segment_stretched[:segment_size]
                    else:
                        segment_stretched = np.pad(
                            segment_stretched,
                            (0, segment_size - len(segment_stretched)),
                        )

                    segments.append(segment_stretched)
                else:
                    # Trailing partial segment passes through untouched.
                    segments.append(segment)

            return np.concatenate(segments)
        except Exception:
            return y

    def add_pitch_variations(self, y, sr, intensity):
        """Blend in a copy pitch-shifted by up to +/-0.2 semitones."""
        if intensity < 0.3:
            return y

        try:
            n_steps = np.random.uniform(-0.2, 0.2) * intensity
            y_shifted = librosa.effects.pitch_shift(y, sr=sr, n_steps=n_steps)

            blend_factor = 0.2 * intensity
            return y * (1 - blend_factor) + y_shifted * blend_factor
        except Exception:
            return y

    def add_room_ambiance(self, y, sr, intensity):
        """Convolve with a small synthetic impulse response (150 ms tail)."""
        if intensity < 0.2:
            return y

        try:
            impulse_len = int(0.15 * sr)
            if impulse_len < 10:
                # Sample rate too low for a meaningful impulse.
                return y

            impulse = np.zeros(impulse_len)
            # Single early reflection at ~10 ms.
            early = int(0.01 * sr)
            if early < impulse_len:
                impulse[early] = 0.8
            # Exponentially decaying reverb tail after the reflection.
            tail_start = min(early + 1, impulse_len)
            if tail_start < impulse_len:
                tail_len = impulse_len - tail_start
                decay = np.exp(-np.linspace(0, 6, tail_len))
                impulse[tail_start:] = decay * 0.4

            y_reverb = signal.convolve(y, impulse, mode='same')
            # Re-normalize the wet signal to the dry signal's peak.
            if np.max(np.abs(y_reverb)) > 0:
                y_reverb = y_reverb / np.max(np.abs(y_reverb)) * np.max(np.abs(y))

            blend_factor = 0.1 * intensity
            return y * (1 - blend_factor) + y_reverb * blend_factor
        except Exception:
            return y

    def add_analog_warmth(self, y, sr, intensity):
        """Blend in a tanh-saturated, 100 Hz high-passed copy of the signal."""
        if intensity < 0.1:
            return y

        try:
            # tanh soft clipping; dividing by `saturation` keeps near-unity gain.
            saturation = 1.0 + 0.4 * intensity
            y_warm = np.tanh(y * saturation) / saturation

            # High-pass at 100 Hz to clear sub-bass rumble from the wet copy.
            # (The old comment said "low boost" — the filter is a high-pass.)
            if sr > 1000:
                sos = signal.butter(2, 100, 'high', fs=sr, output='sos')
                y_warm = signal.sosfilt(sos, y_warm)

            blend_factor = 0.15 * intensity
            return y * (1 - blend_factor) + y_warm * blend_factor
        except Exception:
            return y

    def reduce_perfect_quantization(self, y, sr, intensity):
        """Amplitude-modulate with two slow LFOs plus gaussian noise."""
        if intensity < 0.1:
            return y

        t = np.arange(len(y)) / sr
        # 0.3 Hz LFO: slow, song-level dynamic drift.
        lfo1 = 1.0 + np.sin(2 * np.pi * 0.3 * t) * 0.02 * intensity
        # 2 Hz LFO: beat-scale micro-variation.
        lfo2 = 1.0 + np.sin(2 * np.pi * 2.0 * t) * 0.01 * intensity
        # Per-sample gaussian jitter (0.5 % std at full intensity).
        noise = 1.0 + np.random.normal(0, 0.005 * intensity, len(y))

        combined = lfo1 * lfo2 * noise
        return y * combined
211
 
212
def humanize_song(input_audio, intensity):
    """Gradio callback: humanize an uploaded song and save the result.

    Parameters
    ----------
    input_audio : str | None
        Filesystem path of the uploaded audio (``gr.Audio(type="filepath")``),
        or ``None`` when nothing was uploaded.
    intensity : float
        Humanization strength from the UI slider, in [0.1, 1.0].

    Returns
    -------
    tuple
        ``(output_path, status_message)`` on success, or
        ``(None, error_message)`` on failure / missing input.
    """
    if input_audio is None:
        return None, "Please upload an audio file"

    humanizer = AIHumanizer()

    try:
        print("Starting humanization...")

        # gr.Audio(type="filepath") hands us a path string directly.
        audio_path = input_audio

        # Process the audio; returns (channels, samples) and sample rate.
        audio_data, sr = humanizer.humanize_audio(audio_path, intensity)

        print("Processing complete. Saving audio...")

        # NamedTemporaryFile(delete=False) instead of the deprecated,
        # race-prone tempfile.mktemp: the file is created atomically and
        # kept on disk for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix='_humanized.wav', delete=False) as tmp:
            output_path = tmp.name

        # soundfile expects (samples,) for mono and (samples, channels)
        # for multichannel data, hence the transpose.
        if audio_data.shape[0] == 1:
            sf.write(output_path, audio_data[0], sr)
        else:
            sf.write(output_path, audio_data.T, sr)

        print(f"Saved to: {output_path}")
        return output_path, "✅ Success! Your song now sounds human-made. Download below."

    except Exception as e:
        error_msg = f"❌ Error: {str(e)}"
        print(f"Error details: {error_msg}")
        return None, error_msg
248
 
249
# Simple and compatible interface: upload + controls in the left column,
# result + status in the right, with an explanatory accordion below.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Song Humanizer") as demo:
    # Header and tagline.
    gr.Markdown("""
    # 🎵 AI Song Humanizer
    **Remove AI Detection - Make Songs Sound Human-Made**

    *Upload AI song Remove robotic sound Download human version*
    """)

    with gr.Row():
        # Left column: inputs.
        with gr.Column(scale=1):
            gr.Markdown("### 1. Upload AI Song")
            input_audio = gr.Audio(
                label="Upload your AI-generated song",
                sources=["upload"],
                type="filepath",
            )

            gr.Markdown("### 2. Humanization Strength")
            intensity = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                label="How much human feel to add",
            )

            process_btn = gr.Button("✨ Humanize Song", variant="primary", size="lg")

        # Right column: outputs.
        with gr.Column(scale=1):
            gr.Markdown("### 3. Download Result")
            output_audio = gr.Audio(label="Human-Sounding Version", type="filepath")
            status = gr.Textbox(label="Status", interactive=False)

    with gr.Accordion("📖 How It Works", open=False):
        gr.Markdown("""
        **This tool:**
        - Takes your complete AI-generated song
        - Removes robotic/AI artifacts
        - Adds natural human performance variations
        - Keeps your original music intact
        - Makes it sound like humans performed it

        **Perfect for making AI music undetectable!**
        """)

    # Wire the button to the processing callback.
    process_btn.click(
        humanize_song,
        inputs=[input_audio, intensity],
        outputs=[output_audio, status],
    )

if __name__ == "__main__":
    demo.launch()