| import gradio as gr |
| import edge_tts |
| import asyncio |
| import tempfile |
| import random |
| import re |
| from pydub import AudioSegment |
| from pydub.effects import normalize |
|
|
| |
# Display label -> Microsoft Edge neural-voice identifier.
# Keys are shown in the UI dropdown; values are passed to edge_tts.Communicate.
VOICES = {
    "US Female (Best)": "en-US-JennyNeural",
    "Indian Female": "en-IN-NeerjaNeural",
    "Indian Male": "en-IN-PrabhatNeural"
}
|
|
| |
def add_emotion(text, mood, intensity):
    """Prepend an emotion cue word to *text*.

    Args:
        text: The sentence to be spoken.
        mood: One of "Happy", "Sad", "Angry"; any other value adds no cue.
        intensity: 1-10 slider value; values above 6 pick the stronger cue.

    Returns:
        The text with a parenthesised cue in front, or unchanged for an
        unknown mood.
    """
    # (mild cue, strong cue) per supported mood.
    cue_table = {
        "Happy": ("(smiling)", "(excited)"),
        "Sad": ("(softly)", "(low tone)"),
        "Angry": ("(firm)", "(serious)"),
    }
    cues = cue_table.get(mood)
    if cues is None:
        prefix = ""
    else:
        prefix = cues[0] if intensity <= 6 else cues[1]
    return f"{prefix} {text}".strip()
|
|
| |
def smart_pause(text):
    """Insert a breathing space after sentence punctuation.

    Adds exactly one space after '.', ',', '?' and '!' when the mark is
    directly followed by a word character.  The previous implementation
    blindly substituted each mark with "mark + space", which doubled
    spaces in already well-spaced text ("a. b" -> "a.  b") and shredded
    ellipses ("..." -> ". . . "); the lookahead below skips whitespace
    and runs of punctuation, so those inputs pass through unchanged.

    Args:
        text: Raw input text.

    Returns:
        The text with single spaces guaranteed after punctuation marks.
    """
    return re.sub(r'([.,?!])(?=[^\s.,?!])', r'\1 ', text)
|
|
| |
def humanize(text):
    """Make *text* sound less robotic before synthesis.

    Normalises pause spacing via smart_pause(), then — with roughly 40%
    probability — prepends a "hmm... " filler so repeated generations do
    not all start identically.

    Args:
        text: The (possibly emotion-prefixed) text to speak.

    Returns:
        The adjusted text.
    """
    spaced = smart_pause(text)
    # random.random() > 0.6 keeps the filler occasional, not constant.
    if random.random() > 0.6:
        return "hmm... " + spaced
    return spaced
|
|
| |
def enhance_audio(file_path):
    """Post-process a synthesized clip into a cleaner WAV file.

    Pipeline (order matters): loudness-normalize, band-pass to a
    voice-friendly range, layer a quiet copy of the track under itself
    for subtle depth, then apply a small final gain.

    Args:
        file_path: Path to the source audio (expects a ".mp3" suffix,
            which is swapped for "_final.wav" in the output name).

    Returns:
        Path of the exported WAV file.
    """
    track = AudioSegment.from_file(file_path)

    # Normalize first so the filters operate on a consistent level.
    track = normalize(track)

    # Cut rumble below 100 Hz and hiss above 4 kHz.
    track = track.high_pass_filter(100).low_pass_filter(4000)

    # Overlay a -35 dB copy of the track on itself for mild thickness.
    track = track.overlay(track - 35)

    # Final +1 dB lift.
    track = track + 1

    out_path = file_path.replace(".mp3", "_final.wav")
    track.export(out_path, format="wav")
    return out_path
|
|
| |
async def tts(text, voice):
    """Synthesize *text* to an MP3 file with Microsoft Edge TTS.

    Args:
        text: The text to speak.
        voice: An edge-tts voice identifier (e.g. "en-US-JennyNeural").

    Returns:
        Path of the generated MP3 temp file (caller owns the file).
    """
    # Create the temp path via a context manager so the open OS handle is
    # closed before edge-tts writes to it.  The previous
    # NamedTemporaryFile(...).name one-liner leaked the descriptor and,
    # on Windows, kept the path locked against the subsequent write.
    # delete=False keeps the file on disk after the handle closes.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        file_path = tmp.name

    # Slightly randomized prosody so repeated generations don't sound
    # robotically identical.
    rate = random.choice(["-5%", "-6%", "-7%"])
    pitch = random.choice(["+1Hz", "+2Hz"])

    communicate = edge_tts.Communicate(
        text=text,
        voice=voice,
        rate=rate,
        pitch=pitch
    )

    await communicate.save(file_path)
    return file_path
|
|
| |
def generate(text, mood, intensity, voice_name):
    """End-to-end pipeline driven by the Gradio "Generate" button.

    Args:
        text: User-entered text; blank/whitespace input short-circuits.
        mood: Emotion dropdown value ("Normal"/"Happy"/"Sad"/"Angry").
        intensity: Emotion intensity slider value (1-10).
        voice_name: Display key into the VOICES mapping.

    Returns:
        Path of the enhanced WAV file, or None for empty input.
    """
    # Guard clause: nothing to synthesize.
    if not text.strip():
        return None

    voice = VOICES[voice_name]

    # Text shaping: emotion cue, then pause/filler humanization.
    styled = humanize(add_emotion(text, mood, intensity))

    # Synthesis is async; run it to completion, then post-process.
    raw_mp3 = asyncio.run(tts(styled, voice))
    return enhance_audio(raw_mp3)
|
|
| |
# --- Gradio UI -------------------------------------------------------------
# Fix: the heading previously read "## π₯ ..." — a mojibake artifact of the
# 🔥 emoji being decoded with the wrong charset; restored to the emoji.
with gr.Blocks() as demo:
    gr.Markdown("## 🔥 Pro Human-like AI Voice (Free)")

    # Free-form text to synthesize.
    text_input = gr.Textbox(
        label="Enter Text",
        placeholder="Example: kya tum theek ho?"
    )

    # Emotion controls feed add_emotion() via generate().
    mood = gr.Dropdown(
        ["Normal", "Happy", "Sad", "Angry"],
        value="Normal",
        label="Emotion"
    )

    intensity = gr.Slider(1, 10, value=5, label="Emotion Intensity")

    # Keys of VOICES are the user-facing labels.
    voice_select = gr.Dropdown(
        choices=list(VOICES.keys()),
        value="US Female (Best)",
        label="Voice"
    )

    output_audio = gr.Audio(label="Generated Voice")

    btn = gr.Button("Generate")
    btn.click(generate, inputs=[text_input, mood, intensity, voice_select], outputs=output_audio)

demo.launch()