# hero/app.py — Hindi voice assistant (Gradio + Edge TTS)
# Author: abhiXai — commit e98d9d8 ("Create app.py")
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
import soundfile as sf
import numpy as np
from scipy import signal
from scipy.signal import fftconvolve
from openai import OpenAI
import groq
# API keys come from the environment (e.g. Hugging Face Spaces secrets);
# `os.environ.get` returns None when unset, so client calls will fail at
# request time rather than at import time.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
DASHSCOPE_API_KEY = os.environ.get("DASHSCOPE_API_KEY")

# Groq client — used for Whisper speech-to-text (see speech_to_text()).
groq_client = groq.Groq(api_key=GROQ_API_KEY)

# Qwen (Alibaba DashScope) through its OpenAI-compatible endpoint — used
# for chat completions (see get_response()).
qwen_client = OpenAI(
    api_key=DASHSCOPE_API_KEY,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)

# Hinglish system prompt: always answer in Hindi, keep it short (2-3
# sentences), friendly and natural. Runtime string — do not translate.
SYSTEM_PROMPT = """Tu ek helpful Hindi assistant hai.
Hamesha Hindi mein jawab de — short aur natural.
Friendly aur warm tone rakho.
Har jawab 2-3 sentences mein do.
Jaise real insaan baat karta hai waisa bolo."""
# ============================================================
# VOICE PROFILE (Amit ki awaaz se match kiya hua)
# Pitch: 134.8 Hz | Warmth ratio: 2.84 | Centroid: 1916 Hz
# ============================================================
def enhance_to_amit_profile(audio_path):
    """Post-process a synthesized voice file toward the measured "Amit" profile.

    Applies a fixed EQ/dynamics chain (the target numbers — pitch 134.8 Hz,
    warmth ratio 2.84, centroid 1916 Hz, RMS 0.0561 — come from the header
    comment above). Reads ``audio_path`` with soundfile, processes as float32
    mono, and writes the result to a NEW temporary .wav file whose path is
    returned. The input file is neither modified nor deleted.

    NOTE(review): the fixed corner frequencies (200/800/2500/6000 Hz) assume
    a sample rate well above 12 kHz (Edge TTS emits 24 kHz); at low rates the
    normalized frequencies would exceed Nyquist and butter() would raise.
    """
    y, sr = sf.read(audio_path)
    # Downmix multi-channel audio to mono by averaging channels.
    if len(y.shape) > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    # Step 1 — sub-bass warmth: add back 35% of a <200 Hz low-passed copy
    # (the target voice is strong in the 20-200 Hz range).
    b_sub, a_sub = signal.butter(3, 200 / (sr / 2), btype='low')
    sub = signal.filtfilt(b_sub, a_sub, y)
    y = y + sub * 0.35
    # Step 2 — low-mid warmth boost (200-800 Hz band; warmth ratio target 2.84).
    b_lm, a_lm = signal.butter(2, [200/(sr/2), 800/(sr/2)], btype='band')
    low_mid = signal.filtfilt(b_lm, a_lm, y)
    y = y + low_mid * 0.28
    # Step 3 — mid presence (800-2500 Hz) for conversational clarity.
    b_mid, a_mid = signal.butter(2, [800/(sr/2), 2500/(sr/2)], btype='band')
    mid = signal.filtfilt(b_mid, a_mid, y)
    y = y + mid * 0.12
    # Step 4 — de-essing: subtract 22% of the >6 kHz content to soften
    # harsh sibilants (the target voice has none).
    b_ess, a_ess = signal.butter(2, 6000 / (sr / 2), btype='high')
    ess = signal.filtfilt(b_ess, a_ess, y)
    y = y - ess * 0.22
    # Step 5 — room presence: convolve with a tiny ~18 ms synthetic impulse
    # response (direct hit + three decaying echoes) and blend 12% wet.
    impulse = np.zeros(int(sr * 0.018))
    impulse[0] = 1.0
    impulse[int(sr * 0.007)] = 0.10
    impulse[int(sr * 0.014)] = 0.05
    impulse[int(sr * 0.017)] = 0.02
    room = fftconvolve(y, impulse)[:len(y)]
    y = y * 0.88 + room * 0.12
    # Step 6 — soft compression: samples above |0.25| are scaled by 0.65
    # beyond the threshold (target dynamic range ~0.23).
    threshold = 0.25
    ratio = 0.65
    mask = np.abs(y) > threshold
    y[mask] = (np.sign(y[mask]) *
               (threshold + (np.abs(y[mask]) - threshold) * ratio))
    # Step 7 — 3-sample moving average to smooth transitions
    # (target zero-crossing rate 0.1237).
    from scipy.ndimage import uniform_filter1d
    y = uniform_filter1d(y, size=3)
    # Step 8 — scale overall energy to the target RMS of 0.0561
    # (skipped for all-zero audio to avoid division by zero).
    current_rms = np.sqrt(np.mean(y**2))
    target_rms = 0.0561
    if current_rms > 0:
        y = y * (target_rms / current_rms)
    # Safety clip so the boosted signal cannot exceed the wav range.
    y = np.clip(y, -0.95, 0.95)
    # delete=False: the path must outlive this function (Gradio plays it).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, y, sr)
    return f.name
async def generate_edge_tts(text, style):
    """Render ``text`` to an mp3 via Microsoft Edge TTS (Hindi Madhur voice).

    ``style`` selects a (rate, pitch) preset; unrecognized styles fall back
    to the "Conversational" preset. Returns the path of the mp3 file.
    """
    presets = {
        "Conversational": ("-10%", "+0Hz"),
        "Warm & Slow": ("-15%", "+0Hz"),
        "Energetic": ("-5%", "+1Hz"),
        "Calm": ("-18%", "-1Hz"),
        "Professional": ("-8%", "+0Hz"),
    }
    rate, pitch = presets.get(style, presets["Conversational"])

    # Reserve an output path; delete=False because the caller consumes it.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    out_path = tmp.name
    tmp.close()

    communicate = edge_tts.Communicate(
        text=text,
        voice="hi-IN-MadhurNeural",
        rate=rate,
        pitch=pitch,
    )
    await communicate.save(out_path)
    return out_path
def generate_tts(text, style):
    """Synthesize ``text`` with Edge TTS, then apply the Amit voice profile.

    Runs the async Edge TTS call to completion (Gradio invokes sync handlers
    in worker threads, so ``asyncio.run`` is safe here), enhances the result,
    and returns the path of the enhanced .wav file.

    Fix: the intermediate raw mp3 used to be leaked on every call; it is now
    removed best-effort once the enhanced copy (a separate temp file) exists.
    """
    raw_path = asyncio.run(generate_edge_tts(text, style))
    try:
        enhanced_path = enhance_to_amit_profile(raw_path)
    finally:
        # Best-effort cleanup of the intermediate file; never mask the
        # original error (or the result) with a cleanup failure.
        try:
            os.remove(raw_path)
        except OSError:
            pass
    return enhanced_path
def speech_to_text(audio_path):
    """Transcribe a Hindi audio file with Groq Whisper; '' when no audio."""
    if audio_path is None:
        return ""
    with open(audio_path, "rb") as audio_file:
        transcript = groq_client.audio.transcriptions.create(
            file=audio_file,
            model="whisper-large-v3",
            language="hi",
            response_format="text",
        )
    return transcript
def get_response(user_msg, history):
    """Ask Qwen for a short Hindi reply.

    ``history`` is a list of (user, assistant) text pairs; it is replayed
    into the message list after the system prompt, followed by the new
    ``user_msg``. Returns the assistant's reply text.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_turn, assistant_turn in history:
        messages.extend((
            {"role": "user", "content": user_turn},
            {"role": "assistant", "content": assistant_turn},
        ))
    messages.append({"role": "user", "content": user_msg})

    completion = qwen_client.chat.completions.create(
        model="qwen-turbo",
        messages=messages,
        temperature=0.8,
        max_tokens=150,
    )
    return completion.choices[0].message.content
def process_voice(audio, style, history):
    """Handle one mic turn: STT -> LLM -> TTS.

    Returns (history, audio_path, status_text); on missing/empty input the
    history is returned unchanged with a status hint and no audio.
    """
    if audio is None:
        return history, None, "Mic se bolo..."

    user_text = speech_to_text(audio)
    if not user_text.strip():
        return history, None, "Kuch suna nahi..."

    reply_text = get_response(user_text, history)
    reply_audio = generate_tts(reply_text, style)
    # Mutate the shared history list in place so the gr.State keeps it.
    history.append((user_text, reply_text))
    return history, reply_audio, ""
def process_text(text, style, history):
    """Handle one typed turn: LLM -> TTS.

    Returns (history, audio_path, status_text); blank input is a no-op.
    """
    if not text.strip():
        return history, None, ""

    reply_text = get_response(text, history)
    reply_audio = generate_tts(reply_text, style)
    # Mutate the shared history list in place so the gr.State keeps it.
    history.append((text, reply_text))
    return history, reply_audio, ""
def preview_voice(text, style):
    """Synthesize ``text`` for the preview player; None for blank input."""
    return generate_tts(text, style) if text.strip() else None
# --------------------------------------------------------------------------
# UI layout and event wiring (Gradio Blocks).
# NOTE(review): indentation was lost upstream; the nesting below is the
# reconstruction that matches the event wiring — confirm against the
# original layout if available.
# --------------------------------------------------------------------------
with gr.Blocks(title="Hindi Voice Assistant") as demo:
    gr.Markdown("## Hindi Voice Assistant")
    gr.Markdown("Edge TTS — Conversational Hindi Voice")
    with gr.Row():
        # Left column: conversation view, style picker, voice/text inputs.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(
                label="Conversation",
                height=380,
                # NOTE(review): bubble_full_width is deprecated/removed in
                # newer Gradio releases — verify against the pinned version.
                bubble_full_width=False
            )
            # Style names must match the preset keys in generate_edge_tts.
            style_select = gr.Dropdown(
                choices=[
                    "Conversational",
                    "Warm & Slow",
                    "Energetic",
                    "Calm",
                    "Professional"
                ],
                value="Conversational",
                label="Voice Style"
            )
            # Microphone input; type="filepath" passes a file path to handlers.
            voice_input = gr.Audio(
                label="Bolkar poochho",
                sources=["microphone"],
                type="filepath"
            )
            with gr.Row():
                text_input = gr.Textbox(
                    label="Ya likhkar poochho",
                    placeholder="Kuch bhi poochho...",
                    scale=4
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            clear_btn = gr.Button("Clear", variant="secondary")
        # Right column: assistant audio, status, and voice preview tools.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Assistant ki Awaaz",
                autoplay=True
            )
            status = gr.Textbox(
                label="Status",
                interactive=False,
                value="Ready!"
            )
            gr.Markdown("### Voice Preview")
            preview_text = gr.Textbox(
                label="Test karo",
                placeholder="Koi bhi text likho...",
                lines=2
            )
            preview_btn = gr.Button("Preview Voice", variant="secondary")
            preview_audio = gr.Audio(label="Preview", autoplay=True)
            gr.Markdown("""
### Enhancement Profile:
- Bass warmth: +35%
- Low mid boost: +28%
- De-essing: active
- Room presence: subtle
- Natural dynamics: matched
### Test Karo:
```
नमस्ते! कैसे हैं आप?
```
```
मैं आपकी पूरी मदद करूँगा।
```
```
बताइए, क्या जानना है?
```
""")

    # Conversation history as a list of (user, assistant) pairs. Handlers
    # append to this list in place, so it persists across events.
    chat_history = gr.State([])

    # Voice turn: fires when the user stops recording.
    voice_input.stop_recording(
        fn=process_voice,
        inputs=[voice_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    )
    # Text turn via button; the .then() chain clears the input box after.
    send_btn.click(
        fn=process_text,
        inputs=[text_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: "", outputs=[text_input])
    # Text turn via Enter key — same pipeline as the Send button.
    text_input.submit(
        fn=process_text,
        inputs=[text_input, style_select, chat_history],
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: "", outputs=[text_input])
    # Clear resets the visible widgets, then replaces the history state
    # with a fresh empty list.
    clear_btn.click(
        fn=lambda: ([], None, "Ready!"),
        outputs=[chatbot, audio_output, status]
    ).then(fn=lambda: [], outputs=[chat_history])
    # Standalone voice preview — does not touch the conversation history.
    preview_btn.click(
        fn=preview_voice,
        inputs=[preview_text, style_select],
        outputs=[preview_audio]
    )

demo.launch()