# Hugging Face Space: Hindi text-to-speech demo (CSM model).
import hashlib
import os
import tempfile

import gradio as gr
import torch
from huggingface_hub import login
from transformers import CsmForConditionalGeneration, AutoProcessor
# Initialize model and processor
def load_model():
    """Load the CSM text-to-speech model and processor from the HF Hub.

    Returns:
        tuple: ``(model, processor, device, error)`` — ``error`` is ``None``
        on success; on failure it is the error message string and
        ``model``/``processor`` are ``None`` with ``device`` set to ``"cpu"``.
    """
    # On Spaces the model is referenced by its Hub ID rather than a local path.
    model_id = "hyperneuronAILabs/vocali"  # Replace with your HF model ID
    try:
        processor = AutoProcessor.from_pretrained(model_id)

        # Prefer the GPU whenever CUDA is available.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # fp16 on GPU keeps the memory footprint low; fall back to fp32 on CPU.
        dtype = torch.float16 if device == "cuda" else torch.float32
        model = CsmForConditionalGeneration.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
        return model, processor, device, None
    except Exception as e:
        # Report the failure to the caller so the UI can display it.
        return None, None, "cpu", str(e)
# Load model once at import time so a failure can be reported in the UI
# instead of crashing the Space on the first request.
model, processor, device, error_msg = load_model()
# True only when the model (and processor) loaded successfully.
model_loaded = model is not None
# Function to generate speech
def generate_speech(text, max_new_tokens=70):
    """Synthesize speech for ``text`` using the globally loaded CSM model.

    Args:
        text: Input text (expected to be Hindi) to convert to speech.
        max_new_tokens: Generation budget; larger values yield longer audio
            at the cost of more memory.

    Returns:
        tuple: ``(wav_path_or_None, status_message)`` — the path to the
        generated .wav on success, ``None`` plus an error message otherwise.
    """
    if not model_loaded:
        return None, f"Model failed to load: {error_msg}"
    try:
        # CSM expects a chat-style conversation; role "0" is the speaker id.
        conversation = [
            {"role": "0", "content": [{"type": "text", "text": text}]},
        ]
        # Tokenize and move the inputs to the model's device.
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
        ).to(device)

        # no_grad avoids building autograd state, saving memory at inference.
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                output_audio=True,
                max_new_tokens=max_new_tokens,
            )

        # Name the output by a stable digest of the text. The builtin hash()
        # used previously is randomized per process (PYTHONHASHSEED), so it
        # produced neither reproducible filenames nor a reliable cache key.
        digest = hashlib.sha1(text.encode("utf-8")).hexdigest()[:16]
        output_path = os.path.join(
            tempfile.gettempdir(), f"generated_speech_{digest}.wav"
        )
        processor.save_audio(audio, output_path)
        return output_path, "Speech generated successfully!"
    except Exception as e:
        # Surface the error in the UI's status box rather than crashing.
        return None, f"Error generating speech: {str(e)}"
# Build the Gradio interface; `demo` is launched from the __main__ guard.
with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
    gr.Markdown("# Hindi Text-to-Speech Generator")

    # Show the load failure prominently instead of a broken app.
    if not model_loaded:
        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")
    else:
        gr.Markdown("Enter text in Hindi to convert it to speech")

    with gr.Row():
        with gr.Column():
            txt_in = gr.Textbox(
                label="Input Text",
                placeholder="मैं आपकी किस प्रकार सहायता कर सकता हूँ",
                lines=5,
            )
            token_slider = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Max New Tokens (higher values may use more memory)",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            speech_out = gr.Audio(label="Generated Speech", type="filepath")
            status_out = gr.Textbox(label="Status", interactive=False)

    # Cached examples invoke generate_speech at build time; register them
    # only when the model loaded. Kept to a single example to conserve memory.
    if model_loaded:
        gr.Examples(
            examples=[
                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
            ],
            inputs=[txt_in, token_slider],
            outputs=[speech_out, status_out],
            fn=generate_speech,
            cache_examples=True,
        )

    # Wire the button to the synthesis function.
    generate_btn.click(
        fn=generate_speech,
        inputs=[txt_in, token_slider],
        outputs=[speech_out, status_out],
    )

    gr.Markdown("### System Information")
    gr.Markdown(f"- Using device: {device}")
    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")
# Launch the app only when executed directly (Spaces imports and runs this).
if __name__ == "__main__":
    demo.launch()  # Don't use share=True on Spaces