"""Gradio app: Hindi text-to-speech using a CSM model hosted on the HF Hub."""

import os
import tempfile

import gradio as gr
import torch
from huggingface_hub import login  # kept: may be needed for gated/private models
from transformers import AutoProcessor, CsmForConditionalGeneration

# For Spaces, reference your model by its HF Hub ID.
MODEL_ID = "hyperneuronAILabs/vocali"  # Replace with your HF model ID


def load_model():
    """Load the CSM model and processor from the Hub.

    Returns:
        (model, processor, device, error): on failure, model/processor are
        None, device falls back to "cpu", and `error` holds the message.
    """
    try:
        processor = AutoProcessor.from_pretrained(MODEL_ID)

        # Check for available hardware.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # fp16 on GPU halves memory on Spaces; fp32 on CPU avoids
        # unsupported half-precision kernels.
        model = CsmForConditionalGeneration.from_pretrained(
            MODEL_ID,
            device_map=device,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            low_cpu_mem_usage=True,
        )
        return model, processor, device, None
    except Exception as e:
        # Surface the failure in the UI instead of crashing the Space.
        return None, None, "cpu", str(e)


# Load model once at startup so every request reuses the same weights.
model, processor, device, error_msg = load_model()
model_loaded = model is not None


def generate_speech(text, max_new_tokens=70):
    """Synthesize speech audio for `text`.

    Args:
        text: Input text (Hindi) to convert to speech.
        max_new_tokens: Generation budget; higher values use more memory.

    Returns:
        (wav_path_or_None, status_message)
    """
    if not model_loaded:
        return None, f"Model failed to load: {error_msg}"

    # Guard: generating from empty input wastes compute and yields no speech.
    if not text or not text.strip():
        return None, "Error generating speech: input text is empty"

    try:
        # CSM expects a chat-style conversation; speaker id "0".
        conversation = [
            {"role": "0", "content": [{"type": "text", "text": text}]},
        ]

        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
        ).to(device)

        # No gradients needed at inference time — saves memory.
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                output_audio=True,
                max_new_tokens=max_new_tokens,
            )

        # A unique temp file avoids the collisions and negative/unstable
        # names that hash(text) (salted per interpreter run) could produce.
        with tempfile.NamedTemporaryFile(
            prefix="generated_speech_",
            suffix=".wav",
            dir=tempfile.gettempdir(),
            delete=False,
        ) as tmp:
            output_path = tmp.name
        processor.save_audio(audio, output_path)

        return output_path, "Speech generated successfully!"
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"


# ---------------------------------------------------------------------------
# Gradio interface
# ---------------------------------------------------------------------------
with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
    gr.Markdown("# Hindi Text-to-Speech Generator")

    if not model_loaded:
        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")
    else:
        gr.Markdown("Enter text in Hindi to convert it to speech")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="मैं आपकी किस प्रकार सहायता कर सकता हूँ",
                lines=5,
            )
            max_tokens = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Max New Tokens (higher values may use more memory)",
            )
            submit_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="filepath")
            status_text = gr.Textbox(label="Status", interactive=False)

    # Example inputs (fewer examples to conserve memory).
    if model_loaded:
        gr.Examples(
            examples=[
                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
            ],
            inputs=[text_input, max_tokens],
            outputs=[audio_output, status_text],
            fn=generate_speech,
            # Caching would run full generation at startup, which can
            # exhaust memory on small Spaces hardware — generate on demand.
            cache_examples=False,
        )

    # Wire the button to the generation function.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, max_tokens],
        outputs=[audio_output, status_text],
    )

    gr.Markdown("### System Information")
    gr.Markdown(f"- Using device: {device}")
    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")


# Launch the app.
if __name__ == "__main__":
    demo.launch()  # Don't use share=True on Spaces