| import gradio as gr |
| import edge_tts |
| import asyncio |
| import tempfile |
| import os |
| import soundfile as sf |
| import numpy as np |
| from scipy import signal |
| from scipy.signal import fftconvolve |
| from openai import OpenAI |
| import groq |
|
|
# API credentials come from the environment; both must be set before
# launch (GROQ_API_KEY for Whisper speech-to-text, DASHSCOPE_API_KEY
# for the Qwen chat model). os.environ.get returns None when unset —
# the clients are still constructed and will fail at first call.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY")


# Groq client: used only for audio transcription (Whisper large v3).
groq_client = groq.Groq(api_key=GROQ_API_KEY)
# Qwen chat completions served through DashScope's OpenAI-compatible
# endpoint, so the standard OpenAI SDK works against it.
qwen_client = OpenAI(
    api_key=DASHSCOPE_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)


# Hinglish system prompt steering the model toward short (2-3 sentence),
# warm, conversational Hindi replies. This string is runtime data sent
# to the model — intentionally not translated.
SYSTEM_PROMPT = """Tu ek helpful Hindi assistant hai.
Hamesha Hindi mein jawab de — short aur natural.
Friendly aur warm tone rakho.
Har jawab 2-3 sentences mein do.
Jaise real insaan baat karta hai waisa bolo."""
|
|
| |
| |
| |
| |
|
|
def enhance_to_amit_profile(audio_path):
    """Post-process a TTS waveform toward a warmer, more present voice.

    Reads the audio at *audio_path*, downmixes to mono, then applies a
    fixed chain: sub-bass and low/mid EQ boosts, a de-esser, a short
    synthetic early-reflection "room", soft compression, a 3-tap
    smoothing filter, RMS loudness matching, and a safety clip. The
    result is written to a new temporary WAV file.

    Parameters
    ----------
    audio_path : str
        Path to an input audio file readable by soundfile.

    Returns
    -------
    str
        Path of the enhanced temporary ``.wav`` file; the caller is
        responsible for deleting it.
    """
    # Local import mirrors the original structure; scipy.ndimage is
    # only needed by this function.
    from scipy.ndimage import uniform_filter1d

    y, sr = sf.read(audio_path)
    if y.ndim > 1:
        # Downmix stereo/multichannel to mono.
        y = y.mean(axis=1)
    y = y.astype(np.float32)

    # Fix: empty input previously produced a NaN RMS warning and a
    # garbage gain; write it through unchanged instead.
    if y.size == 0:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            sf.write(f.name, y, sr)
            return f.name

    nyquist = sr / 2.0

    def _norm(freq_hz):
        # Fix: clamp normalized cutoffs below 1.0 so butter() cannot
        # raise on low sample rates (e.g. 8 kHz input vs the 6 kHz
        # de-esser corner).
        return min(freq_hz / nyquist, 0.99)

    # Sub-bass warmth: +35% below 200 Hz.
    b, a = signal.butter(3, _norm(200), btype='low')
    y = y + signal.filtfilt(b, a, y) * 0.35

    # Low-mid body: +28% in the 200-800 Hz band.
    b, a = signal.butter(2, [_norm(200), _norm(800)], btype='band')
    y = y + signal.filtfilt(b, a, y) * 0.28

    # Mid presence: +12% in the 800-2500 Hz band.
    b, a = signal.butter(2, [_norm(800), _norm(2500)], btype='band')
    y = y + signal.filtfilt(b, a, y) * 0.12

    # De-esser: reduce harsh content above 6 kHz by 22%.
    b, a = signal.butter(2, _norm(6000), btype='high')
    y = y - signal.filtfilt(b, a, y) * 0.22

    # Subtle room: sparse ~18 ms early-reflection impulse mixed at 12%.
    impulse = np.zeros(max(int(sr * 0.018), 1), dtype=np.float32)
    impulse[0] = 1.0
    for delay_s, gain in ((0.007, 0.10), (0.014, 0.05), (0.017, 0.02)):
        idx = int(sr * delay_s)
        # Fix: guard tap indices so very low sample rates cannot index
        # past the impulse buffer.
        if idx < len(impulse):
            impulse[idx] = gain
    room = fftconvolve(y, impulse)[:len(y)]
    y = y * 0.88 + room * 0.12

    # Soft compression: attenuate only the overshoot above threshold.
    threshold, ratio = 0.25, 0.65
    mask = np.abs(y) > threshold
    y[mask] = np.sign(y[mask]) * (threshold + (np.abs(y[mask]) - threshold) * ratio)

    # 3-tap moving average to soften residual harshness.
    y = uniform_filter1d(y, size=3)

    # Match loudness to the target RMS profile.
    current_rms = np.sqrt(np.mean(y ** 2))
    target_rms = 0.0561
    if current_rms > 0:
        y = y * (target_rms / current_rms)

    # Safety clip so the encoder never sees full-scale samples.
    y = np.clip(y, -0.95, 0.95)

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, y, sr)
        return f.name
|
|
async def generate_edge_tts(text, style):
    """Render *text* to a temporary MP3 with the hi-IN Madhur voice.

    *style* selects a (rate, pitch) preset; any unknown style falls
    back to the "Conversational" preset. Returns the MP3 path; the
    caller owns (and should eventually delete) the file.
    """
    presets = {
        "Conversational": ("-10%", "+0Hz"),
        "Warm & Slow": ("-15%", "+0Hz"),
        "Energetic": ("-5%", "+1Hz"),
        "Calm": ("-18%", "-1Hz"),
        "Professional": ("-8%", "+0Hz"),
    }
    rate, pitch = presets.get(style, ("-10%", "+0Hz"))

    # Reserve a temp filename for edge-tts to write into.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    out_path = tmp.name
    tmp.close()

    communicator = edge_tts.Communicate(
        text=text,
        voice="hi-IN-MadhurNeural",
        rate=rate,
        pitch=pitch,
    )
    await communicator.save(out_path)
    return out_path
|
|
def generate_tts(text, style):
    """Synthesize *text* with Edge TTS and run the enhancement chain.

    Returns the path of the enhanced temporary WAV file. The raw
    intermediate MP3 from Edge TTS is removed once enhancement is
    done.
    """
    raw_path = asyncio.run(generate_edge_tts(text, style))
    try:
        return enhance_to_amit_profile(raw_path)
    finally:
        # Fix: the raw MP3 was previously leaked into the temp dir on
        # every request; delete it now that the enhanced copy exists.
        try:
            os.remove(raw_path)
        except OSError:
            pass
|
|
def speech_to_text(audio_path):
    """Transcribe the Hindi recording at *audio_path* via Groq Whisper.

    Returns the transcription text, or "" when no recording was made.
    """
    if audio_path is None:
        return ""

    with open(audio_path, "rb") as audio_file:
        transcript = groq_client.audio.transcriptions.create(
            model="whisper-large-v3",
            file=audio_file,
            language="hi",
            response_format="text",
        )
    return transcript
|
|
def get_response(user_msg, history):
    """Ask the Qwen chat model for a reply, replaying *history* first.

    *history* is a list of (user, assistant) text pairs; the system
    prompt is always sent as the first message.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_turn, assistant_turn in history:
        messages += [
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ]
    messages.append({"role": "user", "content": user_msg})

    completion = qwen_client.chat.completions.create(
        model="qwen-turbo",
        messages=messages,
        temperature=0.8,
        max_tokens=150,
    )
    return completion.choices[0].message.content
|
|
def process_voice(audio, style, history):
    """Handle one spoken turn: transcribe, reply, synthesize, update chat.

    Returns (history, reply_audio_path, status_message) for the
    chatbot, audio player and status components.
    """
    if audio is None:
        return history, None, "Mic se bolo..."

    user_text = speech_to_text(audio)
    if not user_text.strip():
        return history, None, "Kuch suna nahi..."

    bot_response = get_response(user_text, history)
    reply_audio = generate_tts(bot_response, style)
    history.append((user_text, bot_response))
    return history, reply_audio, ""
|
|
def process_text(text, style, history):
    """Handle one typed turn: reply via Qwen, synthesize, update chat.

    Blank input is a no-op; otherwise returns the updated history, the
    synthesized reply audio path and an empty status string.
    """
    if not text.strip():
        return history, None, ""

    bot_response = get_response(text, history)
    reply_audio = generate_tts(bot_response, style)
    history.append((text, bot_response))
    return history, reply_audio, ""
|
|
def preview_voice(text, style):
    """Synthesize *text* for the preview player; None for blank input."""
    return generate_tts(text, style) if text.strip() else None
|
|
# --- UI layout and event wiring --------------------------------------
# NOTE: inside gr.Blocks the statement order defines the rendered
# layout, and .click/.then chains define handler sequencing, so the
# structure below is order-sensitive and left as-is.
with gr.Blocks(title="Hindi Voice Assistant") as demo:
    gr.Markdown("## Hindi Voice Assistant")
    gr.Markdown("Edge TTS — Conversational Hindi Voice")

    with gr.Row():
        with gr.Column(scale=2):
            # Conversation pane: list of (user, assistant) bubbles.
            chatbot = gr.Chatbot(
                label="Conversation",
                height=380,
                bubble_full_width=False
            )

            # Voice-style preset; passed into every TTS call.
            style_select = gr.Dropdown(
                choices=[
                    "Conversational",
                    "Warm & Slow",
                    "Energetic",
                    "Calm",
                    "Professional"
                ],
                value="Conversational",
                label="Voice Style"
            )

            # Microphone input ("speak to ask"); delivered as a file path.
            voice_input = gr.Audio(
                label="Bolkar poochho",
                sources=["microphone"],
                type="filepath"
            )

            with gr.Row():
                # Typed input ("or type to ask") plus its send button.
                text_input = gr.Textbox(
                    label="Ya likhkar poochho",
                    placeholder="Kuch bhi poochho...",
                    scale=4
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

            clear_btn = gr.Button("Clear", variant="secondary")

        with gr.Column(scale=1):
            # Assistant reply playback; autoplays on each new file.
            audio_output = gr.Audio(
                label="Assistant ki Awaaz",
                autoplay=True
            )
            status = gr.Textbox(
                label="Status",
                interactive=False,
                value="Ready!"
            )

            # Stand-alone TTS preview, independent of the chat flow.
            # NOTE(review): original indentation was mangled — this
            # section is assumed to live in the right-hand column;
            # confirm against the deployed layout.
            gr.Markdown("### Voice Preview")
            preview_text = gr.Textbox(
                label="Test karo",
                placeholder="Koi bhi text likho...",
                lines=2
            )
            preview_btn = gr.Button("Preview Voice", variant="secondary")
            preview_audio = gr.Audio(label="Preview", autoplay=True)

            gr.Markdown("""
### Enhancement Profile:
- Bass warmth: +35%
- Low mid boost: +28%
- De-essing: active
- Room presence: subtle
- Natural dynamics: matched

### Test Karo:
```
नमस्ते! कैसे हैं आप?
```
```
मैं आपकी पूरी मदद करूँगा।
```
```
बताइए, क्या जानना है?
```
""")

    # Conversation state: list of (user, assistant) pairs shared by all
    # handlers; Gradio keeps one copy per browser session.
    chat_history = gr.State([])

    # Voice turn: runs when the user stops recording.
    voice_input.stop_recording(
        fn=process_voice,
        inputs=[voice_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    )

    # Text turn via Send button; the .then chain clears the textbox.
    send_btn.click(
        fn=process_text,
        inputs=[text_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: "", outputs=[text_input])

    # Text turn via Enter key; same pipeline as the Send button.
    text_input.submit(
        fn=process_text,
        inputs=[text_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: "", outputs=[text_input])

    # Reset visible components first, then the underlying state.
    clear_btn.click(
        fn=lambda: ([], None, "Ready!"),
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: [], outputs=[chat_history])

    # One-off voice preview, outside the conversation history.
    preview_btn.click(
        fn=preview_voice,
        inputs=[preview_text, style_select],
        outputs=[preview_audio]
    )
|
|
if __name__ == "__main__":
    # Fix: guard the launch so importing this module (for tests or
    # deployment tooling) does not start the Gradio server as a side
    # effect.
    demo.launch()