File size: 4,304 Bytes
107cd17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77a6bbb
107cd17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import torch
import gradio as gr
from transformers import CsmForConditionalGeneration, AutoProcessor
import tempfile
import os
from huggingface_hub import login


# Initialize model and processor
def load_model():
    """Load the CSM text-to-speech model and processor from the HF Hub.

    Returns:
        tuple: ``(model, processor, device, error)`` — ``error`` is ``None``
        on success; on failure ``model``/``processor`` are ``None``, the
        device falls back to ``"cpu"``, and ``error`` holds the message.
    """
    # For Spaces, reference your model by its HF Hub ID
    model_id = "hyperneuronAILabs/vocali"  # Replace with your HF model ID

    try:
        processor = AutoProcessor.from_pretrained(model_id)

        # Check for available hardware
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # fp16 on GPU halves memory; fp32 on CPU, where fp16 inference is
        # poorly supported. NOTE: despite the old comment, no 8-bit
        # quantization is configured here — this is plain half precision.
        dtype = torch.float16 if device == "cuda" else torch.float32
        model = CsmForConditionalGeneration.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )

        return model, processor, device, None
    except Exception as e:
        # Broad catch is deliberate: a load failure degrades the app to an
        # error banner in the UI instead of crashing the Space at import.
        return None, None, "cpu", str(e)

# Load model on startup (module import time) so the UI below can report a
# load failure up front instead of erroring on the first request.
model, processor, device, error_msg = load_model()
model_loaded = model is not None  # guards every generation request

# Function to generate speech
def generate_speech(text, max_new_tokens=70):
    """Convert ``text`` to speech using the globally loaded CSM model.

    Args:
        text: Input text (expected to be Hindi; not validated here).
        max_new_tokens: Cap on generated audio tokens — higher values mean
            longer audio and more memory use.

    Returns:
        tuple: ``(wav_path, status)`` — ``wav_path`` is a temp-file path on
        success or ``None`` on failure; ``status`` is a UI message.
    """
    if not model_loaded:
        return None, f"Model failed to load: {error_msg}"

    try:
        # CSM takes chat-format input; the "role" string is the speaker id.
        conversation = [
            {"role": "0", "content": [{"type": "text", "text": text}]},
        ]

        # Tokenize and move the batch to the model's device.
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
        ).to(device)

        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                output_audio=True,
                max_new_tokens=max_new_tokens,
            )

        # Allocate a unique temp file per request. The old scheme used
        # hash(text), which is salted per process, can be negative, and
        # makes concurrent identical requests clobber the same path.
        with tempfile.NamedTemporaryFile(
            prefix="generated_speech_", suffix=".wav", delete=False
        ) as tmp:
            output_path = tmp.name
        processor.save_audio(audio, output_path)

        return output_path, "Speech generated successfully!"

    except Exception as e:
        # Surface the failure in the UI status box rather than crashing.
        return None, f"Error generating speech: {str(e)}"

# Create Gradio interface. NOTE: in Gradio's Blocks DSL, the order in which
# components are instantiated inside the `with` contexts *is* the layout,
# so statement order here is load-bearing.
with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
    gr.Markdown("# Hindi Text-to-Speech Generator")
    
    # Show the startup load error in the page itself so users see why
    # generation will fail, instead of a silently broken button.
    if not model_loaded:
        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")
    else:
        gr.Markdown("Enter text in Hindi to convert it to speech")
    
    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="मैं आपकी किस प्रकार सहायता कर सकता हूँ",
                lines=5
            )
            
            # NOTE(review): slider default is 50 but generate_speech's
            # signature defaults to 70 — presumably intentional (slider
            # always supplies a value), but worth confirming.
            max_tokens = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Max New Tokens (higher values may use more memory)"
            )
            
            submit_btn = gr.Button("Generate Speech", variant="primary")
            
        # Right column: outputs.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="filepath")
            status_text = gr.Textbox(label="Status", interactive=False)
    
    # Example inputs (fewer examples to conserve memory). Only registered
    # when the model loaded, since cache_examples runs the fn at build time.
    if model_loaded:
        gr.Examples(
            examples=[
                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
            ],
            inputs=[text_input, max_tokens],
            outputs=[audio_output, status_text],
            fn=generate_speech,
            cache_examples=True
        )
    
    # Wire the button to the generation function.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, max_tokens],
        outputs=[audio_output, status_text]
    )
    
    gr.Markdown("### System Information")
    gr.Markdown(f"- Using device: {device}")
    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")

# Launch the app only when run directly (Spaces imports and launches too).
if __name__ == "__main__":
    demo.launch()  # Don't use share=True on Spaces