Spaces:

jonloporto
/

ImageToSpeechTest

Sleeping

App Files Files Community

jonloporto committed on Jan 5

Commit

f7f7d8f

verified ·

1 Parent(s): b78885b

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -183

app.py DELETED Viewed

@@ -1,183 +0,0 @@
-# -*- coding: utf-8 -*-
-"""ImageToVoice Hugging Face Space
-Converts images to text using Hugging Face's image-to-text pipeline,
-then converts the text to speech using Supertonic TTS.
-"""
-import gradio as gr
-from supertonic import TTS
-from transformers import pipeline
-from PIL import Image
-import numpy as np
-import traceback
-# Voice styles that can be requested from supertonic
-AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]
-# Both models are loaded once, at import time. A handle that is still None
-# afterwards failed to load; init_error then holds the collected messages.
-image_to_text = None
-tts = None
-init_error = None
-try:
-    print("Initializing image-to-text pipeline...")
-    image_to_text = pipeline("image-to-text")
-    print("Image-to-text pipeline initialized successfully")
-except Exception as exc:
-    init_error = "Failed to initialize image-to-text: " + str(exc)
-    print(init_error)
-    traceback.print_exc()
-try:
-    print("Initializing TTS...")
-    tts = TTS(auto_download=True)
-    print("TTS initialized successfully")
-except Exception as exc:
-    tts_failure = "Failed to initialize TTS: " + str(exc)
-    # Append to an earlier image-to-text failure instead of overwriting it.
-    init_error = f"{init_error}\n{tts_failure}" if init_error else tts_failure
-    print(init_error)
-    traceback.print_exc()
-# Voice used when the requested one is not in AVAILABLE_VOICES
-_FALLBACK_VOICE = "M5"
-# Sample rate (Hz) used when none can be derived from the synthesized clip
-_FALLBACK_SAMPLE_RATE = 24000
-def _to_mono_float32(wav):
-    """Return *wav* as a 1-D float32 array, scaling integer PCM into [-1, 1]."""
-    if not isinstance(wav, np.ndarray):
-        wav = np.array(wav)
-        print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")
-    if wav.ndim != 1:
-        # atleast_1d keeps a single-sample clip from collapsing to a 0-d array.
-        wav = np.atleast_1d(wav.squeeze())
-        while wav.ndim > 1:
-            # Keep the first channel; the shorter outer axis is taken to be
-            # the channel axis.
-            wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
-        print(f"Squeezed wav to 1D: shape={wav.shape}")
-    if wav.size == 0:
-        # Nothing to scale; max() below would raise on an empty array.
-        return wav.astype(np.float32)
-    if wav.dtype == np.int16:
-        wav = wav.astype(np.float32) / 32768.0
-    elif wav.dtype == np.int32:
-        wav = wav.astype(np.float32) / 2147483648.0
-    elif wav.dtype != np.float32:
-        peak = np.abs(wav).max()
-        # Divide first and cast last: dividing a float32 array by a float64
-        # NumPy scalar promotes the result back to float64 on NumPy 2.
-        wav = (wav / peak if peak > 1.0 else wav).astype(np.float32)
-    return wav
-def _infer_sample_rate(num_samples, duration):
-    """Return the sample rate in Hz implied by *num_samples* over *duration*.
-    Returns None when *duration* is not a positive, finite number or the
-    implied rate rounds to zero.
-    """
-    try:
-        # Accept a plain number as well as a one-element array-like.
-        seconds = float(np.asarray(duration).reshape(-1)[0])
-    except (TypeError, ValueError, IndexError):
-        return None
-    if not np.isfinite(seconds) or seconds <= 0:
-        return None
-    # round() rather than int(): float error can leave the quotient a hair
-    # below the true rate, which truncation would turn into rate - 1.
-    rate = int(round(num_samples / seconds))
-    return rate if rate > 0 else None
-def image_to_voice(image, voice_name):
-    """Caption an image, then synthesize the caption as speech.
-    Args:
-        image: Image to describe (the UI passes a PIL image), or None when
-            nothing was uploaded.
-        voice_name: Requested voice style; values outside AVAILABLE_VOICES
-            fall back to "M5".
-    Returns:
-        A tuple ``(audio, text)``. On success ``audio`` is
-        ``(sample_rate, wav)`` with ``wav`` a 1-D float32 array and ``text``
-        is the generated caption. On failure ``audio`` is None and ``text``
-        is an error message for the user.
-    """
-    if image is None:
-        return None, "Please upload an image."
-    if image_to_text is None or tts is None:
-        error_msg = "Error: Models failed to initialize. "
-        if init_error:
-            error_msg += f"\n\nDetails: {init_error}"
-        else:
-            error_msg += "Please check the logs for more information."
-        return None, error_msg
-    # Validate the requested voice, falling back to the default
-    if voice_name not in AVAILABLE_VOICES:
-        print(f"Invalid voice name {voice_name!r}, using default: {_FALLBACK_VOICE}")
-        voice_name = _FALLBACK_VOICE
-    try:
-        print(f"Getting voice style: {voice_name}")
-        style = tts.get_voice_style(voice_name=voice_name)
-        print(f"Voice style '{voice_name}' loaded successfully")
-    except Exception as e:
-        error_msg = f"Error: Failed to load voice style '{voice_name}': {e}"
-        print(error_msg)
-        return None, error_msg
-    try:
-        print(f"Processing image: type={type(image)}, mode={getattr(image, 'mode', 'N/A')}")
-        # The pipeline is fed RGB; other PIL modes are converted first.
-        if isinstance(image, Image.Image) and image.mode != 'RGB':
-            image = image.convert('RGB')
-            print("Converted image to RGB mode")
-        # Image -> text
-        print("Running image-to-text pipeline...")
-        result = image_to_text(image)
-        print(f"Image-to-text result: {result}")
-        if not result:
-            return None, "Error: Could not extract text from image. The pipeline returned an empty result."
-        generated_text = result[0].get('generated_text', '')
-        if not generated_text:
-            return None, "Error: No text was extracted from the image. The generated text is empty."
-        print(f"Extracted text: {generated_text}")
-        # Text -> speech
-        print(f"Synthesizing speech with voice '{voice_name}'...")
-        # NOTE(review): duration is assumed to be seconds, as a number or a
-        # one-element array - confirm against the supertonic API.
-        wav, duration = tts.synthesize(generated_text, voice_style=style)
-        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={getattr(wav, 'shape', 'N/A')}")
-        wav = _to_mono_float32(wav)
-        if wav.size == 0:
-            return None, "Error: Speech synthesis produced no audio."
-        print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")
-        # sample_rate = samples / duration_in_seconds
-        sample_rate = _infer_sample_rate(len(wav), duration)
-        if sample_rate is None:
-            sample_rate = _FALLBACK_SAMPLE_RATE
-            print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")
-        else:
-            print(f"Calculated sample rate: {sample_rate} Hz (from {len(wav)} samples / {duration}s)")
-        return (sample_rate, wav), generated_text
-    except Exception as e:
-        # UI boundary: log the full traceback, show the user the short form.
-        print(f"Error: {e}\n\nTraceback:\n{traceback.format_exc()}")
-        return None, f"Error processing image: {e}"
-# Build the Gradio UI: inputs in the left column, results in the right
-with gr.Blocks(title="Image to Voice") as demo:
-    gr.Markdown("# Image to Voice Converter")
-    gr.Markdown("Upload an image to convert it to text, then hear it as speech!")
-    with gr.Row():
-        # Left column: image upload, voice choice and the trigger button
-        with gr.Column():
-            image_input = gr.Image(label="Upload Image", type="pil")
-            voice_dropdown = gr.Dropdown(
-                label="Voice Style",
-                info="Select a voice style for text-to-speech",
-                choices=AVAILABLE_VOICES,
-                value="M5",
-            )
-            generate_btn = gr.Button("Generate Speech", variant="primary")
-        # Right column: synthesized audio and the text it was generated from
-        with gr.Column():
-            audio_output = gr.Audio(type="numpy", label="Generated Speech")
-            text_output = gr.Textbox(lines=5, label="Extracted Text")
-    # image_to_voice returns (audio, text), matching the order of outputs
-    generate_btn.click(
-        image_to_voice,
-        [image_input, voice_dropdown],
-        [audio_output, text_output],
-    )
-    # No example images are bundled
-    gr.Examples(inputs=image_input, examples=[])
-# Start the app only when executed as a script
-if __name__ == "__main__":
-    demo.launch()