# vocali-1 / app.py — by nikhilhyperneuron (commit 77a6bbb, verified)
# NOTE(review): the four lines above were Hugging Face Hub page chrome pasted
# into the source; preserved as comments so the file remains valid Python.
import torch
import gradio as gr
from transformers import CsmForConditionalGeneration, AutoProcessor
import tempfile
import os
from huggingface_hub import login
# Initialize model and processor
def load_model():
    """Load the CSM text-to-speech model and its processor from the HF Hub.

    Returns:
        tuple: ``(model, processor, device, error)``.  On failure ``model``
        and ``processor`` are None, ``device`` is ``"cpu"`` and ``error`` is
        the exception message; on success ``error`` is None.
    """
    # For Spaces, reference the model by its HF Hub ID.
    model_id = "hyperneuronAILabs/vocali"
    try:
        processor = AutoProcessor.from_pretrained(model_id)

        # Check for available hardware once and reuse the answer below
        # (the original re-queried torch.cuda.is_available() for the dtype).
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # fp16 on GPU / fp32 on CPU; low_cpu_mem_usage keeps peak RAM down on
        # small Spaces instances.  NOTE(review): the original comment claimed
        # 8-bit quantization, but no quantization is actually applied here.
        model = CsmForConditionalGeneration.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            low_cpu_mem_usage=True,
        )
        return model, processor, device, None
    except Exception as e:
        # Surface the failure to the UI instead of crashing the Space.
        return None, None, "cpu", str(e)
# Load model once at import time so the Gradio callbacks can reuse it.
model, processor, device, error_msg = load_model()
# Flag used throughout the UI to gate generation and to show an error banner.
model_loaded = model is not None
# Function to generate speech
def generate_speech(text, max_new_tokens=70):
    """Convert ``text`` to speech with the globally loaded CSM model.

    Args:
        text: Input text (Hindi in this app's UI, but any text the
            processor accepts).
        max_new_tokens: Generation budget; higher values produce longer
            audio and use more memory.

    Returns:
        tuple: ``(wav_path, status_message)``; ``wav_path`` is None on
        failure and ``status_message`` then carries the error text.
    """
    if not model_loaded:
        return None, f"Model failed to load: {error_msg}"
    try:
        # CSM expects a chat-style conversation; "0" is the speaker id.
        conversation = [
            {"role": "0", "content": [{"type": "text", "text": text}]},
        ]
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
        ).to(device)

        # no_grad avoids building an autograd graph during inference,
        # saving memory on the small Spaces instances.
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                output_audio=True,
                max_new_tokens=max_new_tokens,
            )

        # Collision-safe temp file.  The original used hash(text) in the
        # filename, which can collide and is salted per process anyway.
        fd, output_path = tempfile.mkstemp(
            suffix=".wav", prefix="generated_speech_"
        )
        os.close(fd)  # mkstemp opens the file; save_audio reopens by path
        processor.save_audio(audio, output_path)
        return output_path, "Speech generated successfully!"
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
# Build the Gradio interface
with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
    gr.Markdown("# Hindi Text-to-Speech Generator")

    # Show the load error up front instead of failing on the first click.
    if model_loaded:
        gr.Markdown("Enter text in Hindi to convert it to speech")
    else:
        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")

    with gr.Row():
        with gr.Column():
            input_box = gr.Textbox(
                label="Input Text",
                placeholder="मैं आपकी किस प्रकार सहायता कर सकता हूँ",
                lines=5,
            )
            token_slider = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Max New Tokens (higher values may use more memory)",
            )
            generate_button = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            speech_output = gr.Audio(label="Generated Speech", type="filepath")
            status_box = gr.Textbox(label="Status", interactive=False)

    # A single cached example keeps memory usage low on the Space.
    if model_loaded:
        gr.Examples(
            examples=[
                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
            ],
            inputs=[input_box, token_slider],
            outputs=[speech_output, status_box],
            fn=generate_speech,
            cache_examples=True,
        )

    # Wire the button to the generation callback.
    generate_button.click(
        fn=generate_speech,
        inputs=[input_box, token_slider],
        outputs=[speech_output, status_box],
    )

    gr.Markdown("### System Information")
    gr.Markdown(f"- Using device: {device}")
    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")
# Launch the app when run as a script (Spaces imports and runs this module).
if __name__ == "__main__":
    # share=True is unnecessary (and discouraged) on Hugging Face Spaces.
    demo.launch() # Don't use share=True on Spaces