import torch
import hashlib
import os
import tempfile

import gradio as gr
from huggingface_hub import login
from transformers import CsmForConditionalGeneration, AutoProcessor
# Initialize model and processor
def load_model():
    """Load the CSM text-to-speech model and its processor from the HF Hub.

    Returns:
        tuple: ``(model, processor, device, error)`` — ``error`` is ``None``
        on success; on failure ``model`` and ``processor`` are ``None``,
        ``device`` falls back to ``"cpu"`` and ``error`` holds the message.
    """
    # For Spaces, reference your model by its HF Hub ID
    model_id = "hyperneuronAILabs/vocali"  # Replace with your HF model ID
    try:
        processor = AutoProcessor.from_pretrained(model_id)

        # Probe the hardware once and reuse the answer below.
        cuda_available = torch.cuda.is_available()
        device = "cuda" if cuda_available else "cpu"
        print(f"Using device: {device}")

        # Half precision on GPU keeps the memory footprint small on Spaces;
        # CPU stays in float32 for numerical safety.
        model = CsmForConditionalGeneration.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=torch.float16 if cuda_available else torch.float32,
            low_cpu_mem_usage=True,
        )
        return model, processor, device, None
    except Exception as e:
        return None, None, "cpu", str(e)
# Load model on startup
# Loaded once at import time so the UI and the handler share one instance.
model, processor, device, error_msg = load_model()
# True only when load_model() returned a model; checked by the UI and handler.
model_loaded = model is not None
# Function to generate speech
def generate_speech(text, max_new_tokens=70):
    """Synthesize speech for *text* with the globally loaded CSM model.

    Args:
        text: Input text (Hindi) to convert to speech.
        max_new_tokens: Upper bound on generated audio tokens; higher values
            may use more memory.

    Returns:
        tuple: ``(wav_path_or_None, status_message)``.
    """
    if not model_loaded:
        return None, f"Model failed to load: {error_msg}"
    # Guard against empty/whitespace-only input before touching the model.
    if not text or not text.strip():
        return None, "Error generating speech: input text is empty"
    try:
        # Create conversation format expected by the CSM chat template;
        # the role string is the speaker id.
        conversation = [
            {"role": "0", "content": [{"type": "text", "text": text}]},
        ]
        # Process the input
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
        ).to(device)

        # Generate audio with memory efficient settings
        with torch.no_grad():  # Save memory during inference
            audio = model.generate(
                **inputs,
                output_audio=True,
                max_new_tokens=max_new_tokens,
            )

        # Derive a stable, filesystem-safe file name from the text. Unlike
        # the builtin hash() — which is salted per process and may be
        # negative — an md5 hex digest is deterministic across runs, so the
        # same input reuses one temp file instead of accumulating new ones.
        digest = hashlib.md5(text.encode("utf-8")).hexdigest()
        output_path = os.path.join(
            tempfile.gettempdir(), f"generated_speech_{digest}.wav"
        )
        processor.save_audio(audio, output_path)
        return output_path, "Speech generated successfully!"
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
    gr.Markdown("# Hindi Text-to-Speech Generator")

    # Surface a load failure prominently instead of the usage hint.
    if not model_loaded:
        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")
    else:
        gr.Markdown("Enter text in Hindi to convert it to speech")

    with gr.Row():
        # Left column: user inputs.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="मैं आपकी किस प्रकार सहायता कर सकता हूँ",
                lines=5,
            )
            max_tokens = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Max New Tokens (higher values may use more memory)",
            )
            submit_btn = gr.Button("Generate Speech", variant="primary")
        # Right column: generated audio and status.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="filepath")
            status_text = gr.Textbox(label="Status", interactive=False)

    # Example inputs (fewer examples to conserve memory); only shown when
    # the model actually loaded, since caching runs the generator.
    if model_loaded:
        gr.Examples(
            examples=[
                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
            ],
            inputs=[text_input, max_tokens],
            outputs=[audio_output, status_text],
            fn=generate_speech,
            cache_examples=True,
        )

    # Wire the button to the generation function.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, max_tokens],
        outputs=[audio_output, status_text],
    )

    gr.Markdown("### System Information")
    gr.Markdown(f"- Using device: {device}")
    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")
# Launch the app
if __name__ == "__main__":
    demo.launch()  # Don't use share=True on Spaces