Spaces:

nikhilhyperneuron
/

vocali-1

Sleeping

App Files Files Community

nikhilhyperneuron committed on Oct 7, 2025

Commit

107cd17

verified ·

1 Parent(s): 86a4c3a

Create app.py

Browse files

Files changed (1) hide show

app.py +130 -0

app.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import torch
+import gradio as gr
+from transformers import CsmForConditionalGeneration, AutoProcessor
+import tempfile
+import os
+from huggingface_hub import login
+# Initialize model and processor
+def load_model():
+    # For Spaces, reference your model by its HF Hub ID
+    model_id = "hyperneuronAILabs/vocali"  # Replace with your HF model ID
+    try:
+        processor = AutoProcessor.from_pretrained(model_id)
+        # Check for available hardware
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {device}")
+        # Use 8-bit quantization for better memory efficiency on Spaces
+        model = CsmForConditionalGeneration.from_pretrained(
+            model_id,
+            device_map=device,
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            low_cpu_mem_usage=True
+        )
+        return model, processor, device, None
+    except Exception as e:
+        return None, None, "cpu", str(e)
+# Load model on startup
+# load_model() runs once at import time; its results are kept as module-level
+# globals that generate_speech() and the UI definition below read directly.
+model, processor, device, error_msg = load_model()
+# True only when load_model() returned a model object instead of None.
+model_loaded = model is not None
+# Function to generate speech
+def generate_speech(text, max_new_tokens=70):
+    if not model_loaded:
+        return None, f"Model failed to load: {error_msg}"
+    try:
+        # Create conversation format
+        conversation = [
+            {"role": "0", "content": [{"type": "text", "text": text}]},
+        ]
+        # Process the input
+        inputs = processor.apply_chat_template(
+            conversation,
+            tokenize=True,
+            return_dict=True,
+        ).to(device)
+        # Generate audio with memory efficient settings
+        with torch.no_grad():  # Save memory during inference
+            audio = model.generate(
+                **inputs,
+                output_audio=True,
+                max_new_tokens=max_new_tokens
+            )
+        # Save to a temporary file
+        temp_dir = tempfile.gettempdir()
+        output_path = os.path.join(temp_dir, f"generated_speech_{hash(text)}.wav")
+        processor.save_audio(audio, output_path)
+        return output_path, "Speech generated successfully!"
+    except Exception as e:
+        return None, f"Error generating speech: {str(e)}"
+# Create Gradio interface
+with gr.Blocks(title="Hindi Text-to-Speech Generator") as demo:
+    gr.Markdown("# Hindi Text-to-Speech Generator")
+    if not model_loaded:
+        gr.Markdown(f"⚠️ **Error loading model: {error_msg}**")
+    else:
+        gr.Markdown("Enter text in Hindi to convert it to speech")
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Input Text",
+                placeholder="नमस्ते आप कैसे हैं?",
+                lines=5
+            )
+            max_tokens = gr.Slider(
+                minimum=10,
+                maximum=100,
+                value=50,
+                step=5,
+                label="Max New Tokens (higher values may use more memory)"
+            )
+            submit_btn = gr.Button("Generate Speech", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Speech", type="filepath")
+            status_text = gr.Textbox(label="Status", interactive=False)
+    # Example inputs (fewer examples to conserve memory)
+    if model_loaded:
+        gr.Examples(
+            examples=[
+                ["नमस्ते आप कैसे हैं?", 50],
+                ["मैं आपकी किस प्रकार सहायता कर सकता हूँ", 50],
+            ],
+            inputs=[text_input, max_tokens],
+            outputs=[audio_output, status_text],
+            fn=generate_speech,
+            cache_examples=True
+        )
+    # Set up the function call
+    submit_btn.click(
+        fn=generate_speech,
+        inputs=[text_input, max_tokens],
+        outputs=[audio_output, status_text]
+    )
+    gr.Markdown("### System Information")
+    gr.Markdown(f"- Using device: {device}")
+    gr.Markdown(f"- Model loaded: {'Yes' if model_loaded else 'No'}")
+# Launch the app
+# Start the Gradio server only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    demo.launch()  # Don't use share=True on Spaces