File size: 4,304 Bytes
107cd17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77a6bbb
107cd17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import torch
import gradio as gr
from transformers import CsmForConditionalGeneration, AutoProcessor
import tempfile
import os
from huggingface_hub import login


# Initialize model and processor
def load_model():
    """Load the CSM text-to-speech model and processor from the HF Hub.

    Returns:
        tuple: ``(model, processor, device, error)`` — ``error`` is ``None``
        on success; on failure ``model``/``processor`` are ``None``, the
        device falls back to ``"cpu"``, and ``error`` holds the message.
    """
    # For Spaces, reference your model by its HF Hub ID
    model_id = "hyperneuronAILabs/vocali"  # Replace with your HF model ID

    try:
        processor = AutoProcessor.from_pretrained(model_id)

        # Check for available hardware
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # fp16 on GPU halves memory; fp32 on CPU, where fp16 inference is
        # poorly supported. NOTE: despite the old comment, no 8-bit
        # quantization is configured here — this is plain half precision.
        dtype = torch.float16 if device == "cuda" else torch.float32
        model = CsmForConditionalGeneration.from_pretrained(
            model_id,
            device_map=device,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )

        return model, processor, device, None
    except Exception as e:
        # Broad catch is deliberate: a load failure degrades the app to an
        # error banner in the UI instead of crashing the Space at import.
        return None, None, "cpu", str(e)

# Load model on startup (module import time) so the UI below can report a
# load failure up front instead of erroring on the first request.
model, processor, device, error_msg = load_model()
model_loaded = model is not None  # guards every generation request

# Function to generate speech
def generate_speech(text, max_new_tokens=70):
    """Convert ``text`` to speech using the globally loaded CSM model.

    Args:
        text: Input text (expected to be Hindi; not validated here).
        max_new_tokens: Cap on generated audio tokens — higher values mean
            longer audio and more memory use.

    Returns:
        tuple: ``(wav_path, status)`` — ``wav_path`` is a temp-file path on
        success or ``None`` on failure; ``status`` is a UI message.
    """
    if not model_loaded:
        return None, f"Model failed to load: {error_msg}"

    try:
        # CSM takes chat-format input; the "role" string is the speaker id.
        conversation = [
            {"role": "0", "content": [{"type": "text", "text": text}]},
        ]

        # Tokenize and move the batch to the model's device.
        inputs = processor.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=True,
        ).to(device)

        # Inference only: skip autograd bookkeeping to save memory.
        with torch.no_grad():
            audio = model.generate(
                **inputs,
                output_audio=True,
                max_new_tokens=max_new_tokens,
            )

        # Allocate a unique temp file per request. The old scheme used
        # hash(text), which is salted per process, can be negative, and
        # makes concurrent identical requests clobber the same path.
        with tempfile.NamedTemporaryFile(
            prefix="generated_speech_", suffix=".wav", delete=False
        ) as tmp:
            output_path = tmp.name
        processor.save_audio(audio, output_path)

        return output_path, "Speech generated successfully!"

    except Exception as e:
        # Surface the failure in the UI status box rather than crashing.
        return None, f"Error generating speech: {str(e)}"

# Create Gradio interface. NOTE: in Gradio's Blocks DSL, the order in which
# components are instantiated inside the `with` contexts *is* the layout,
# so statement order here is load-bearing.
with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
    gr.Markdown("# Hindi Text-to-Speech Generator")
    
    # Show the startup load error in the page itself so users see why
    # generation will fail, instead of a silently broken button.
    if not model_loaded:
        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")
    else:
        gr.Markdown("Enter text in Hindi to convert it to speech")
    
    with gr.Row():
        # Left column: inputs.
        with gr.Column():
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="मैं आपकी किस प्रकार सहायता कर सकता हूँ",
                lines=5
            )
            
            # NOTE(review): slider default is 50 but generate_speech's
            # signature defaults to 70 — presumably intentional (slider
            # always supplies a value), but worth confirming.
            max_tokens = gr.Slider(
                minimum=10,
                maximum=100,
                value=50,
                step=5,
                label="Max New Tokens (higher values may use more memory)"
            )
            
            submit_btn = gr.Button("Generate Speech", variant="primary")
            
        # Right column: outputs.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="filepath")
            status_text = gr.Textbox(label="Status", interactive=False)
    
    # Example inputs (fewer examples to conserve memory). Only registered
    # when the model loaded, since cache_examples runs the fn at build time.
    if model_loaded:
        gr.Examples(
            examples=[
                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
            ],
            inputs=[text_input, max_tokens],
            outputs=[audio_output, status_text],
            fn=generate_speech,
            cache_examples=True
        )
    
    # Wire the button to the generation function.
    submit_btn.click(
        fn=generate_speech,
        inputs=[text_input, max_tokens],
        outputs=[audio_output, status_text]
    )
    
    gr.Markdown("### System Information")
    gr.Markdown(f"- Using device: {device}")
    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")

# Launch the app only when run directly (Spaces imports and launches too).
if __name__ == "__main__":
    demo.launch()  # Don't use share=True on Spaces