Spaces:

jonloporto
/

ImageToSpeechTest

Sleeping

App Files Files Community

jonloporto committed on Jan 5

Commit

b78885b

verified ·

1 Parent(s): f84f0b0

Upload app.py

Browse files

Files changed (1) hide show

app.py +183 -0

app.py ADDED Viewed

	@@ -0,0 +1,183 @@

+# -*- coding: utf-8 -*-
+"""ImageToVoice Hugging Face Space
+Converts images to text using Hugging Face's image-to-text pipeline,
+then converts the text to speech using Supertonic TTS.
+"""
+import gradio as gr
+from supertonic import TTS
+from transformers import pipeline
+from PIL import Image
+import numpy as np
+import traceback
+# Initialize models (load once at startup)
+image_to_text = None
+tts = None
+init_error = None
+# Available voice styles for supertonic
+AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]
+try:
+    print("Initializing image-to-text pipeline...")
+    image_to_text = pipeline("image-to-text")
+    print("Image-to-text pipeline initialized successfully")
+except Exception as e:
+    init_error = f"Failed to initialize image-to-text: {str(e)}"
+    print(init_error)
+    traceback.print_exc()
+try:
+    print("Initializing TTS...")
+    tts = TTS(auto_download=True)
+    print("TTS initialized successfully")
+except Exception as e:
+    if init_error:
+        init_error += f"\nFailed to initialize TTS: {str(e)}"
+    else:
+        init_error = f"Failed to initialize TTS: {str(e)}"
+    print(init_error)
+    traceback.print_exc()
+def image_to_voice(image, voice_name):
+    """Convert image to text, then text to speech."""
+    if image is None:
+        return None, "Please upload an image."
+    if image_to_text is None or tts is None:
+        error_msg = "Error: Models failed to initialize. "
+        if init_error:
+            error_msg += f"\n\nDetails: {init_error}"
+        else:
+            error_msg += "Please check the logs for more information."
+        return None, error_msg
+    # Validate and get voice style
+    if voice_name not in AVAILABLE_VOICES:
+        voice_name = "M5"  # Default fallback
+        print(f"Invalid voice name, using default: M5")
+    try:
+        print(f"Getting voice style: {voice_name}")
+        style = tts.get_voice_style(voice_name=voice_name)
+        print(f"Voice style '{voice_name}' loaded successfully")
+    except Exception as e:
+        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
+        print(error_msg)
+        return None, error_msg
+    try:
+        print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")
+        # Convert PIL Image to format expected by pipeline
+        if isinstance(image, Image.Image):
+            # PIL Image should work directly, but ensure it's RGB
+            if image.mode != 'RGB':
+                image = image.convert('RGB')
+                print(f"Converted image to RGB mode")
+        # Convert image to text
+        print("Running image-to-text pipeline...")
+        result = image_to_text(image)
+        print(f"Image-to-text result: {result}")
+        if not result or len(result) == 0:
+            return None, "Error: Could not extract text from image. The pipeline returned an empty result."
+        generated_text = result[0].get('generated_text', '')
+        if not generated_text:
+            return None, "Error: No text was extracted from the image. The generated text is empty."
+        print(f"Extracted text: {generated_text}")
+        # Convert text to speech
+        print(f"Synthesizing speech with voice '{voice_name}'...")
+        wav, duration = tts.synthesize(generated_text, voice_style=style)
+        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")
+        # Ensure wav is a numpy array
+        if not isinstance(wav, np.ndarray):
+            wav = np.array(wav)
+            print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")
+        # Ensure audio is 1D (mono) format
+        if wav.ndim > 1:
+            wav = wav.squeeze()
+            if wav.ndim > 1:
+                # If still multi-dimensional, take first channel
+                wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
+            print(f"Squeezed wav to 1D: shape={wav.shape}")
+        # Normalize audio to [-1, 1] range if needed
+        if wav.dtype == np.int16:
+            wav = wav.astype(np.float32) / 32768.0
+        elif wav.dtype == np.int32:
+            wav = wav.astype(np.float32) / 2147483648.0
+        elif wav.dtype != np.float32:
+            # If already in a reasonable range, just convert to float32
+            if np.abs(wav).max() > 1.0:
+                wav = wav.astype(np.float32) / np.abs(wav).max()
+            else:
+                wav = wav.astype(np.float32)
+        print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")
+        # Calculate sample rate from duration and audio length
+        # sample_rate = samples / duration_in_seconds
+        if duration > 0:
+            calculated_sample_rate = int(len(wav) / duration)
+            print(f"Calculated sample rate: {calculated_sample_rate} Hz (from {len(wav)} samples / {duration}s)")
+            sample_rate = calculated_sample_rate
+        else:
+            # Fallback: Try common TTS sample rates
+            # Many TTS systems use 24000 Hz or 16000 Hz
+            # If audio sounds slow, try higher sample rate; if fast, try lower
+            sample_rate = 24000  # Common TTS sample rate
+            print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")
+        return (sample_rate, wav), generated_text
+    except Exception as e:
+        error_msg = f"Error processing image: {str(e)}"
+        full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+        print(full_error)  # Print full traceback for debugging
+        return None, error_msg
+# Create Gradio interface
+with gr.Blocks(title="Image to Voice") as demo:
+    gr.Markdown("# Image to Voice Converter")
+    gr.Markdown("Upload an image to convert it to text, then hear it as speech!")
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(type="pil", label="Upload Image")
+            voice_dropdown = gr.Dropdown(
+                choices=AVAILABLE_VOICES,
+                value="M5",
+                label="Voice Style",
+                info="Select a voice style for text-to-speech"
+            )
+            generate_btn = gr.Button("Generate Speech", variant="primary")
+        with gr.Column():
+            audio_output = gr.Audio(label="Generated Speech", type="numpy")
+            text_output = gr.Textbox(label="Extracted Text", lines=5)
+    generate_btn.click(
+        fn=image_to_voice,
+        inputs=[image_input, voice_dropdown],
+        outputs=[audio_output, text_output]
+    )
+    gr.Examples(
+        examples=[],
+        inputs=image_input
+    )
+if __name__ == "__main__":
+    demo.launch()