# vocali-1 / app.py — by nikhilhyperneuron (commit 77a6bbb, verified)
# NOTE(review): the four lines above were Hugging Face Hub page chrome pasted
# into the source; preserved as comments so the file remains valid Python.
import torch
import gradio as gr
from transformers import CsmForConditionalGeneration, AutoProcessor
import tempfile
import os
from huggingface_hub import login
# Initialize model and processor
def load_model():
    """Load the CSM text-to-speech model and its processor from the HF Hub.

    Returns:
        tuple: ``(model, processor, device, error)``.  On failure ``model``
        and ``processor`` are None, ``device`` is ``"cpu"`` and ``error`` is
        the exception message; on success ``error`` is None.
    """
    # For Spaces, reference the model by its HF Hub ID.
    model_id = "hyperneuronAILabs/vocali"
    try:
        processor = AutoProcessor.from_pretrained(model_id)

        # Check for available hardware once and reuse the answer below
        # (the original re-queried torch.cuda.is_available() for the dtype).
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # fp16 on GPU / fp32 on CPU; low_cpu_mem_usage keeps peak RAM down on
        # small Spaces instances.  NOTE(review): the original comment claimed
        # 8-bit quantization, but no quantization is actually applied here.
        model = CsmForConditionalGeneration.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            low_cpu_mem_usage=True,
        )
        return model, processor, device, None
    except Exception as e:
        # Surface the failure to the UI instead of crashing the Space.
        return None, None, "cpu", str(e)
# Load model once at import time so the Gradio callbacks can reuse it.
model, processor, device, error_msg = load_model()
# Flag used throughout the UI to gate generation and to show an error banner.
model_loaded = model is not None
# Function to generate speech
def generate_speech(text, max_new_tokens=70):
    """Convert ``text`` to speech with the globally loaded CSM model.

    Args:
        text: Input text (Hindi in this app's UI, but any text the
            processor accepts).
        max_new_tokens: Generation budget; higher values produce longer
            audio and use more memory.

    Returns:
        tuple: ``(wav_path, status_message)``; ``wav_path`` is None on
        failure and ``status_message`` then carries the error text.
    """
    if not model_loaded:
        return None, f"Model failed to load: {error_msg}"
    try:
        # CSM expects a chat-style conversation; "0" is the speaker id.
        conversation = [
            {"role": "0", "content": [{"type": "text", "text": text}]},
        ]
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
        ).to(device)

        # no_grad avoids building an autograd graph during inference,
        # saving memory on the small Spaces instances.
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                output_audio=True,
                max_new_tokens=max_new_tokens,
            )

        # Collision-safe temp file.  The original used hash(text) in the
        # filename, which can collide and is salted per process anyway.
        fd, output_path = tempfile.mkstemp(
            suffix=".wav", prefix="generated_speech_"
        )
        os.close(fd)  # mkstemp opens the file; save_audio reopens by path
        processor.save_audio(audio, output_path)
        return output_path, "Speech generated successfully!"
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
# Build the Gradio interface
with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
    gr.Markdown("# Hindi Text-to-Speech Generator")

    # Show the load error up front instead of failing on the first click.
    if model_loaded:
        gr.Markdown("Enter text in Hindi to convert it to speech")
    else:
        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")

    with gr.Row():
        with gr.Column():
            input_box = gr.Textbox(
                label="Input Text",
                placeholder="मैं आपकी किस प्रकार सहायता कर सकता हूँ",
                lines=5,
            )
            token_slider = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Max New Tokens (higher values may use more memory)",
            )
            generate_button = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            speech_output = gr.Audio(label="Generated Speech", type="filepath")
            status_box = gr.Textbox(label="Status", interactive=False)

    # A single cached example keeps memory usage low on the Space.
    if model_loaded:
        gr.Examples(
            examples=[
                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
            ],
            inputs=[input_box, token_slider],
            outputs=[speech_output, status_box],
            fn=generate_speech,
            cache_examples=True,
        )

    # Wire the button to the generation callback.
    generate_button.click(
        fn=generate_speech,
        inputs=[input_box, token_slider],
        outputs=[speech_output, status_box],
    )

    gr.Markdown("### System Information")
    gr.Markdown(f"- Using device: {device}")
    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")
# Launch the app when run as a script (Spaces imports and runs this module).
if __name__ == "__main__":
    # share=True is unnecessary (and discouraged) on Hugging Face Spaces.
    demo.launch() # Don't use share=True on Spaces