# hero/app.py — Hindi voice assistant (Gradio + Edge TTS)
# Author: abhiXai — commit e98d9d8 ("Create app.py")
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import soundfile as sf
import numpy as np
from scipy import signal
from scipy.signal import fftconvolve
from openai import OpenAI
import groq
# API keys come from the environment (e.g. Hugging Face Spaces secrets);
# `os.environ.get` returns None when unset, so client calls will fail at
# request time rather than at import time.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY")

# Groq client — used for Whisper speech-to-text (see speech_to_text()).
groq_client = groq.Groq(api_key=GROQ_API_KEY)

# Qwen (Alibaba DashScope) through its OpenAI-compatible endpoint — used
# for chat completions (see get_response()).
qwen_client = OpenAI(
    api_key=DASHSCOPE_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)

# Hinglish system prompt: always answer in Hindi, keep it short (2-3
# sentences), friendly and natural. Runtime string — do not translate.
SYSTEM_PROMPT = """Tu ek helpful Hindi assistant hai.
Hamesha Hindi mein jawab de — short aur natural.
Friendly aur warm tone rakho.
Har jawab 2-3 sentences mein do.
Jaise real insaan baat karta hai waisa bolo."""
# ============================================================
# VOICE PROFILE (Amit ki awaaz se match kiya hua)
# Pitch: 134.8 Hz | Warmth ratio: 2.84 | Centroid: 1916 Hz
# ============================================================
def enhance_to_amit_profile(audio_path):
    """Post-process a synthesized voice file toward the measured "Amit" profile.

    Applies a fixed EQ/dynamics chain (the target numbers — pitch 134.8 Hz,
    warmth ratio 2.84, centroid 1916 Hz, RMS 0.0561 — come from the header
    comment above). Reads ``audio_path`` with soundfile, processes as float32
    mono, and writes the result to a NEW temporary .wav file whose path is
    returned. The input file is neither modified nor deleted.

    NOTE(review): the fixed corner frequencies (200/800/2500/6000 Hz) assume
    a sample rate well above 12 kHz (Edge TTS emits 24 kHz); at low rates the
    normalized frequencies would exceed Nyquist and butter() would raise.
    """
    y, sr = sf.read(audio_path)
    # Downmix multi-channel audio to mono by averaging channels.
    if len(y.shape) > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    # Step 1 — sub-bass warmth: add back 35% of a <200 Hz low-passed copy
    # (the target voice is strong in the 20-200 Hz range).
    b_sub, a_sub = signal.butter(3, 200 / (sr / 2), btype='low')
    sub = signal.filtfilt(b_sub, a_sub, y)
    y = y + sub * 0.35
    # Step 2 — low-mid warmth boost (200-800 Hz band; warmth ratio target 2.84).
    b_lm, a_lm = signal.butter(2, [200/(sr/2), 800/(sr/2)], btype='band')
    low_mid = signal.filtfilt(b_lm, a_lm, y)
    y = y + low_mid * 0.28
    # Step 3 — mid presence (800-2500 Hz) for conversational clarity.
    b_mid, a_mid = signal.butter(2, [800/(sr/2), 2500/(sr/2)], btype='band')
    mid = signal.filtfilt(b_mid, a_mid, y)
    y = y + mid * 0.12
    # Step 4 — de-essing: subtract 22% of the >6 kHz content to soften
    # harsh sibilants (the target voice has none).
    b_ess, a_ess = signal.butter(2, 6000 / (sr / 2), btype='high')
    ess = signal.filtfilt(b_ess, a_ess, y)
    y = y - ess * 0.22
    # Step 5 — room presence: convolve with a tiny ~18 ms synthetic impulse
    # response (direct hit + three decaying echoes) and blend 12% wet.
    impulse = np.zeros(int(sr * 0.018))
    impulse[0] = 1.0
    impulse[int(sr * 0.007)] = 0.10
    impulse[int(sr * 0.014)] = 0.05
    impulse[int(sr * 0.017)] = 0.02
    room = fftconvolve(y, impulse)[:len(y)]
    y = y * 0.88 + room * 0.12
    # Step 6 — soft compression: samples above |0.25| are scaled by 0.65
    # beyond the threshold (target dynamic range ~0.23).
    threshold = 0.25
    ratio = 0.65
    mask = np.abs(y) > threshold
    y[mask] = (np.sign(y[mask]) *
               (threshold + (np.abs(y[mask]) - threshold) * ratio))
    # Step 7 — 3-sample moving average to smooth transitions
    # (target zero-crossing rate 0.1237).
    from scipy.ndimage import uniform_filter1d
    y = uniform_filter1d(y, size=3)
    # Step 8 — scale overall energy to the target RMS of 0.0561
    # (skipped for all-zero audio to avoid division by zero).
    current_rms = np.sqrt(np.mean(y**2))
    target_rms = 0.0561
    if current_rms > 0:
        y = y * (target_rms / current_rms)
    # Safety clip so the boosted signal cannot exceed the wav range.
    y = np.clip(y, -0.95, 0.95)
    # delete=False: the path must outlive this function (Gradio plays it).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, y, sr)
    return f.name
async def generate_edge_tts(text, style):
    """Render ``text`` to an mp3 via Microsoft Edge TTS (Hindi Madhur voice).

    ``style`` selects a (rate, pitch) preset; unrecognized styles fall back
    to the "Conversational" preset. Returns the path of the mp3 file.
    """
    presets = {
        "Conversational": ("-10%", "+0Hz"),
        "Warm & Slow": ("-15%", "+0Hz"),
        "Energetic": ("-5%", "+1Hz"),
        "Calm": ("-18%", "-1Hz"),
        "Professional": ("-8%", "+0Hz"),
    }
    rate, pitch = presets.get(style, presets["Conversational"])

    # Reserve an output path; delete=False because the caller consumes it.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    out_path = tmp.name
    tmp.close()

    communicate = edge_tts.Communicate(
        text=text,
        voice="hi-IN-MadhurNeural",
        rate=rate,
        pitch=pitch,
    )
    await communicate.save(out_path)
    return out_path
def generate_tts(text, style):
    """Synthesize ``text`` with Edge TTS, then apply the Amit voice profile.

    Runs the async Edge TTS call to completion (Gradio invokes sync handlers
    in worker threads, so ``asyncio.run`` is safe here), enhances the result,
    and returns the path of the enhanced .wav file.

    Fix: the intermediate raw mp3 used to be leaked on every call; it is now
    removed best-effort once the enhanced copy (a separate temp file) exists.
    """
    raw_path = asyncio.run(generate_edge_tts(text, style))
    try:
        enhanced_path = enhance_to_amit_profile(raw_path)
    finally:
        # Best-effort cleanup of the intermediate file; never mask the
        # original error (or the result) with a cleanup failure.
        try:
            os.remove(raw_path)
        except OSError:
            pass
    return enhanced_path
def speech_to_text(audio_path):
    """Transcribe a Hindi audio file with Groq Whisper; '' when no audio."""
    if audio_path is None:
        return ""
    with open(audio_path, "rb") as audio_file:
        transcript = groq_client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            language="hi",
            response_format="text",
        )
    return transcript
def get_response(user_msg, history):
    """Ask Qwen for a short Hindi reply.

    ``history`` is a list of (user, assistant) text pairs; it is replayed
    into the message list after the system prompt, followed by the new
    ``user_msg``. Returns the assistant's reply text.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_turn, assistant_turn in history:
        messages.extend((
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ))
    messages.append({"role": "user", "content": user_msg})

    completion = qwen_client.chat.completions.create(
        model="qwen-turbo",
        messages=messages,
        temperature=0.8,
        max_tokens=150,
    )
    return completion.choices[0].message.content
def process_voice(audio, style, history):
    """Handle one mic turn: STT -> LLM -> TTS.

    Returns (history, audio_path, status_text); on missing/empty input the
    history is returned unchanged with a status hint and no audio.
    """
    if audio is None:
        return history, None, "Mic se bolo..."

    user_text = speech_to_text(audio)
    if not user_text.strip():
        return history, None, "Kuch suna nahi..."

    reply_text = get_response(user_text, history)
    reply_audio = generate_tts(reply_text, style)
    # Mutate the shared history list in place so the gr.State keeps it.
    history.append((user_text, reply_text))
    return history, reply_audio, ""
def process_text(text, style, history):
    """Handle one typed turn: LLM -> TTS.

    Returns (history, audio_path, status_text); blank input is a no-op.
    """
    if not text.strip():
        return history, None, ""

    reply_text = get_response(text, history)
    reply_audio = generate_tts(reply_text, style)
    # Mutate the shared history list in place so the gr.State keeps it.
    history.append((text, reply_text))
    return history, reply_audio, ""
def preview_voice(text, style):
    """Synthesize ``text`` for the preview player; None for blank input."""
    return generate_tts(text, style) if text.strip() else None
# --------------------------------------------------------------------------
# UI layout and event wiring (Gradio Blocks).
# NOTE(review): indentation was lost upstream; the nesting below is the
# reconstruction that matches the event wiring — confirm against the
# original layout if available.
# --------------------------------------------------------------------------
with gr.Blocks(title="Hindi Voice Assistant") as demo:
    gr.Markdown("## Hindi Voice Assistant")
    gr.Markdown("Edge TTS — Conversational Hindi Voice")
    with gr.Row():
        # Left column: conversation view, style picker, voice/text inputs.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Conversation",
                height=380,
                # NOTE(review): bubble_full_width is deprecated/removed in
                # newer Gradio releases — verify against the pinned version.
                bubble_full_width=False
            )
            # Style names must match the preset keys in generate_edge_tts.
            style_select = gr.Dropdown(
                choices=[
                    "Conversational",
                    "Warm & Slow",
                    "Energetic",
                    "Calm",
                    "Professional"
                ],
                value="Conversational",
                label="Voice Style"
            )
            # Microphone input; type="filepath" passes a file path to handlers.
            voice_input = gr.Audio(
                label="Bolkar poochho",
                sources=["microphone"],
                type="filepath"
            )
            with gr.Row():
                text_input = gr.Textbox(
                    label="Ya likhkar poochho",
                    placeholder="Kuch bhi poochho...",
                    scale=4
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("Clear", variant="secondary")
        # Right column: assistant audio, status, and voice preview tools.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Assistant ki Awaaz",
                autoplay=True
            )
            status = gr.Textbox(
                label="Status",
                interactive=False,
                value="Ready!"
            )
            gr.Markdown("### Voice Preview")
            preview_text = gr.Textbox(
                label="Test karo",
                placeholder="Koi bhi text likho...",
                lines=2
            )
            preview_btn = gr.Button("Preview Voice", variant="secondary")
            preview_audio = gr.Audio(label="Preview", autoplay=True)
            gr.Markdown("""
### Enhancement Profile:
- Bass warmth: +35%
- Low mid boost: +28%
- De-essing: active
- Room presence: subtle
- Natural dynamics: matched
### Test Karo:
```
नमस्ते! कैसे हैं आप?
```
```
मैं आपकी पूरी मदद करूँगा।
```
```
बताइए, क्या जानना है?
```
""")

    # Conversation history as a list of (user, assistant) pairs. Handlers
    # append to this list in place, so it persists across events.
    chat_history = gr.State([])

    # Voice turn: fires when the user stops recording.
    voice_input.stop_recording(
        fn=process_voice,
        inputs=[voice_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    )
    # Text turn via button; the .then() chain clears the input box after.
    send_btn.click(
        fn=process_text,
        inputs=[text_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: "", outputs=[text_input])
    # Text turn via Enter key — same pipeline as the Send button.
    text_input.submit(
        fn=process_text,
        inputs=[text_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: "", outputs=[text_input])
    # Clear resets the visible widgets, then replaces the history state
    # with a fresh empty list.
    clear_btn.click(
        fn=lambda: ([], None, "Ready!"),
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: [], outputs=[chat_history])
    # Standalone voice preview — does not touch the conversation history.
    preview_btn.click(
        fn=preview_voice,
        inputs=[preview_text, style_select],
        outputs=[preview_audio]
    )

demo.launch()