Upload app.py
app.py ADDED
@@ -0,0 +1,285 @@
# -*- coding: utf-8 -*-
"""ImageToVoice Hugging Face Space

Converts images to text using Hugging Face's image-to-text pipeline,
then converts the text to speech using Supertonic TTS.
"""

import gradio as gr
from supertonic import TTS
from transformers import pipeline
from PIL import Image
import numpy as np
import traceback

# Initialize models (load once at startup)
image_to_text = None
tts = None
init_error = None

# Available voice styles for supertonic
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

try:
    print("Initializing image-to-text pipeline...")
    image_to_text = pipeline("image-to-text")
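    # No model name is given, so transformers selects its default
    # image-captioning checkpoint for the "image-to-text" task and
    # downloads it on first use.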
    print("Image-to-text pipeline initialized successfully")
except Exception as e:
    init_error = f"Failed to initialize image-to-text: {str(e)}"
    print(init_error)
    traceback.print_exc()

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:
    if init_error:
        init_error += f"\nFailed to initialize TTS: {str(e)}"
    else:
        init_error = f"Failed to initialize TTS: {str(e)}"
    print(init_error)
    traceback.print_exc()


def image_to_voice(image, voice_name):
    """Convert image to text, then text to speech."""
    if image is None:
        return None, "Please upload an image."

    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    # Validate and get voice style
    if voice_name not in AVAILABLE_VOICES:
        voice_name = "M5"  # Default fallback
        print("Invalid voice name, using default: M5")

    try:
        print(f"Getting voice style: {voice_name}")
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as e:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
        print(error_msg)
        return None, error_msg

    try:
        print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")

        # Convert PIL Image to format expected by pipeline
        if isinstance(image, Image.Image):
            # PIL Image should work directly, but ensure it's RGB
            if image.mode != 'RGB':
                image = image.convert('RGB')
                print("Converted image to RGB mode")

        # Convert image to text
        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")
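        # The image-to-text pipeline returns a list of dicts, e.g.
        # [{'generated_text': 'a dog running on a beach'}] (caption text is illustrative).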

        if not result or len(result) == 0:
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."

        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."

        print(f"Extracted text: {generated_text}")

        # Convert text to speech
        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")

        # Ensure wav is a numpy array
        if not isinstance(wav, np.ndarray):
            wav = np.array(wav)
            print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")

        # Ensure audio is 1D (mono) format
        if wav.ndim > 1:
            wav = wav.squeeze()
            if wav.ndim > 1:
                # If still multi-dimensional, take first channel
                wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
            print(f"Squeezed wav to 1D: shape={wav.shape}")

        # Normalize audio to [-1, 1] range if needed
        if wav.dtype == np.int16:
            wav = wav.astype(np.float32) / 32768.0
        elif wav.dtype == np.int32:
            wav = wav.astype(np.float32) / 2147483648.0
        elif wav.dtype != np.float32:
            # If already in a reasonable range, just convert to float32
            if np.abs(wav).max() > 1.0:
                wav = wav.astype(np.float32) / np.abs(wav).max()
            else:
                wav = wav.astype(np.float32)
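        # Example of the int16 branch above: a sample value of 16384 becomes
        # 16384 / 32768.0 = 0.5 after conversion.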

        print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")

        # Calculate sample rate from duration and audio length
        # sample_rate = samples / duration_in_seconds
        if duration > 0:
            calculated_sample_rate = int(len(wav) / duration)
            print(f"Calculated sample rate: {calculated_sample_rate} Hz (from {len(wav)} samples / {duration}s)")
            sample_rate = calculated_sample_rate
        else:
            # Fallback: Try common TTS sample rates
            # Many TTS systems use 24000 Hz or 16000 Hz
            # If audio sounds slow, try higher sample rate; if fast, try lower
            sample_rate = 24000  # Common TTS sample rate
            print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")
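        # Worked example for the calculation above: 88,200 samples reported as
        # 2.0 seconds gives int(88200 / 2.0) = 44100 Hz.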

        return (sample_rate, wav), generated_text

    except Exception as e:
        error_msg = f"Error processing image: {str(e)}"
        full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
        print(full_error)  # Print full traceback for debugging
        return None, error_msg


# Create Gradio interface with playful styling
custom_css = """
/* Playful background gradient */
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
    background-size: 400% 400%;
    animation: gradientShift 15s ease infinite;
    min-height: 100vh;
    padding: 20px;
}

@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}

/* Fun title styling */
h1 {
    color: #FFD700 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
    text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
    font-size: 3em !important;
    text-align: center !important;
    margin-bottom: 20px !important;
    animation: bounce 2s infinite;
}

@keyframes bounce {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-10px); }
}

/* Playful paragraph text */
p, .markdown-text {
    color: #FFFFFF !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.2em !important;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}

/* Card/panel styling */
.panel, .block, .gradio-block {
    background: rgba(255, 255, 255, 0.95) !important;
    border-radius: 20px !important;
    padding: 20px !important;
    box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
    border: 3px solid #FFD700 !important;
}

/* Label styling */
label {
    color: #764ba2 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
}

/* Button styling */
button.primary {
    background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
    color: white !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.3em !important;
    font-weight: bold !important;
    border-radius: 25px !important;
    padding: 15px 30px !important;
    border: 3px solid #FFD700 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
    transition: all 0.3s ease !important;
}

button.primary:hover {
    transform: scale(1.1) !important;
    box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
}

/* Input fields */
input, textarea, select {
    border-radius: 15px !important;
    border: 2px solid #4ECDC4 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
}

/* Dropdown styling */
select {
    background: linear-gradient(45deg, #f093fb, #4facfe) !important;
    color: white !important;
    font-weight: bold !important;
}

/* Textbox styling */
textarea {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: bold !important;
}
"""

with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🎨✨ Image to Voice Converter ✨🎨
        ### Upload an image to convert it to text, then hear it as speech! 🎤🎵
        """
    )

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="📸 Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="M5",
                label="🎙 Voice Style",
                info="Select a voice style for text-to-speech 🎪"
            )
            generate_btn = gr.Button("🚀 Generate Speech 🚀", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="🎵 Generated Speech", type="numpy")
            text_output = gr.Textbox(label="📝 Extracted Text", lines=5)

    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )

    gr.Examples(
        examples=[],
        inputs=image_input
    )

if __name__ == "__main__":
    demo.launch()
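A minimal local smoke test for the function above, as a sketch rather than part of the uploaded file: it assumes the packages implied by the imports are installed, that both models initialize, and that an "example.jpg" exists in the working directory (the file name and the separate script are purely illustrative).

# Hypothetical local smoke test (not part of this commit)
from PIL import Image

from app import image_to_voice

audio, text = image_to_voice(Image.open("example.jpg"), "F1")
if audio is not None:
    sample_rate, wav = audio  # the (rate, ndarray) tuple accepted by gr.Audio(type="numpy")
    print(f"{len(wav) / sample_rate:.2f}s of audio for: {text}")
else:
    print(f"Failed: {text}")

Running the Space itself would also require the imported packages to be declared, presumably gradio, transformers (plus a backend such as torch), supertonic, pillow, and numpy in requirements.txt; the exact distribution names are assumptions based on the imports.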