# Hugging Face Spaces app — real-time voice cloning demo
# (Space status badge text removed from this copy of the file.)
| import gradio as gr | |
| from encoder import inference as encoder_inference | |
| from synthesizer.inference import Synthesizer | |
| from vocoder import inference as vocoder_inference | |
| import librosa | |
| import soundfile as sf | |
| import numpy as np | |
| import os | |
# ---------------------------------------------------------------------------
# Model loading (runs once at module import / app startup).
# Each model loads independently so one failure does not prevent the others
# from loading; errors are printed so they show up in the Spaces log.
# ---------------------------------------------------------------------------
print("Loading models...")

try:
    encoder_inference.load_model("saved_models/encoder.pt")
    print("✓ Encoder loaded!")
except Exception as e:  # broad by design: startup must not crash the UI
    print(f"Encoder load error: {e}")

# Pre-bind to None so a failed load surfaces later as a clear AttributeError
# in clone_voice() instead of a NameError on an unbound global.
synthesizer = None
try:
    synthesizer = Synthesizer("saved_models/synthesizer.pt")
    print("✓ Synthesizer loaded!")
except Exception as e:  # broad by design: see above
    print(f"Synthesizer load error: {e}")

try:
    vocoder_inference.load_model("saved_models/vocoder.pt")
    print("✓ Vocoder loaded!")
except Exception as e:  # broad by design: see above
    print(f"Vocoder load error: {e}")

print("Ready for voice cloning!")
def clone_voice(voice_sample, text):
    """Clone a speaker's voice and synthesize *text* in that voice.

    Args:
        voice_sample: Either a ``(sample_rate, np.ndarray)`` tuple as produced
            by the Gradio ``Audio(type="numpy")`` component, or a filepath
            string to an audio file.
        text: The sentence to synthesize.

    Returns:
        ``((sample_rate, int16 ndarray), status_message)`` on success, or
        ``(None, error_message)`` on failure.
    """
    try:
        # --- input validation ------------------------------------------
        if voice_sample is None:
            return None, "❌ Error: No voice sample provided"
        if not text or len(text.strip()) == 0:
            return None, "❌ Error: No text provided"
        print(f"Processing: text='{text}', voice_sample={voice_sample}")

        # --- load reference audio --------------------------------------
        if isinstance(voice_sample, tuple):
            sr, audio_data = voice_sample
            # Gradio may deliver stereo as (n_samples, n_channels); the
            # encoder pipeline expects mono, so downmix by averaging.
            if audio_data.ndim > 1:
                audio_data = audio_data.mean(axis=1)
            if np.issubdtype(audio_data.dtype, np.integer):
                # Integer PCM (typically int16) -> float32 in [-1, 1].
                wav = audio_data.astype(np.float32) / 32768.0
            else:
                # Float input is assumed to already be normalized.
                wav = audio_data.astype(np.float32)
        else:
            wav, sr = librosa.load(voice_sample, sr=16000)
        print(f"Audio loaded: sr={sr}, shape={wav.shape}")

        # The encoder expects 16 kHz input; resample microphone audio
        # (often 44.1/48 kHz) down to it.
        if sr != 16000:
            wav = librosa.resample(wav, orig_sr=sr, target_sr=16000)

        # Trim silence / normalize as the encoder requires.
        wav = encoder_inference.preprocess_wav(wav)
        print(f"Preprocessed audio: {wav.shape}")

        # Fixed-size speaker embedding that conditions the synthesizer.
        embed = encoder_inference.embed_utterance(wav)
        print(f"Speaker embedding: {embed.shape}")

        # Text + embedding -> mel spectrogram (batch of one).
        mels = synthesizer.synthesize_spectrograms([text], [embed])
        print(f"Mel-spectrogram: {mels[0].shape}")

        # Mel spectrogram -> waveform.
        wav_generated = vocoder_inference.vocoder(mels[0])
        print(f"Generated audio: {wav_generated.shape}")

        # Clip before the int16 cast: peaks at or above 1.0 would otherwise
        # wrap around (1.0 * 32768 overflows int16).
        pcm = (np.clip(wav_generated, -1.0, 1.0) * 32767).astype(np.int16)
        # NOTE(review): 22050 Hz is assumed to be the vocoder output rate —
        # confirm against the synthesizer/vocoder configuration.
        return (22050, pcm), "✅ Success! Your voice has been cloned!"
    except Exception as e:
        # Boundary handler: report the failure in the UI instead of crashing.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
# ---------------------------------------------------------------------------
# Gradio interface. Component creation order inside the context managers
# determines on-screen layout, so statements here must not be reordered.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Voice Cloning - Real-Time Test", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎤 Real-Time Voice Cloning")
    gr.Markdown("**Record your voice, enter text, and hear it synthesized in your voice!**")
    gr.Markdown("---")
    with gr.Row():
        # Left column: inputs (voice sample + text to synthesize).
        with gr.Column():
            gr.Markdown("### 📝 Step 1: Record Your Voice")
            gr.Markdown("Record 5-10 seconds of clear audio in **Hindi or Kannada**")
            # type="numpy" delivers a (sample_rate, ndarray) tuple to
            # clone_voice(); a file upload arrives as a filepath instead.
            voice_input = gr.Audio(
                label="🎙️ Voice Sample (Microphone or Upload)",
                type="numpy",
                sources=["microphone", "upload"]
            )
            gr.Markdown("### ✍️ Step 2: Enter Text")
            text_input = gr.Textbox(
                label="📄 Text to Synthesize (Hindi or Kannada)",
                placeholder="नमस्ते, यह एक परीक्षण है",
                lines=3
            )
        # Right column: outputs (generated audio + status message).
        with gr.Column():
            gr.Markdown("### 🔊 Step 3: Generated Speech")
            audio_output = gr.Audio(label="🎧 Cloned Voice Output", type="numpy")
            status_output = gr.Textbox(label="📊 Status", interactive=False, lines=2)
    clone_button = gr.Button("🎯 Clone Voice & Generate Speech", variant="primary", size="lg")
    # Wire the button to the inference function: (audio, text) in,
    # (audio, status string) out.
    clone_button.click(
        clone_voice,
        inputs=[voice_input, text_input],
        outputs=[audio_output, status_output]
    )
    # Static help text rendered below the controls.
    gr.Markdown("""
    ---
    ### 📋 Instructions:
    1. **Record your voice** using the microphone (5-10 seconds) OR upload a WAV/OGG file
       - Speak clearly in Hindi or Kannada
       - Avoid background noise
    2. **Enter text** you want to generate in your voice (same language as recording)
    3. **Click "Clone Voice & Generate Speech"**
    4. **Wait** (10-30 seconds on CPU) and hear the result!
    ### 💡 Tips for Best Results:
    - **Clear voice samples** = better results
    - **10+ seconds** = better voice cloning accuracy
    - **Same language** as input voice works best
    - **Patience** - CPU processing takes time (GPU would be 2-3x faster)
    - **Quality audio** - minimize background noise
    ### ⚠️ Limitations:
    - CPU processing is slower (~10-30 seconds per request)
    - Long texts (500+ characters) may timeout
    - Best results with 10+ second voice samples
    """)
if __name__ == "__main__":
    # Bind on all interfaces so the app is reachable from inside a container.
    launch_options = {
        "share": False,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    demo.launch(**launch_options)