# -*- coding: utf-8 -*-
"""ImageToVoice Hugging Face Space

Converts images to text using Hugging Face's image-to-text pipeline, then
converts the text to speech using Supertonic TTS.
"""
import traceback

import gradio as gr
import numpy as np
from PIL import Image
from supertonic import TTS
from transformers import pipeline

# Models are loaded once at startup. Failures are recorded in `init_error`
# instead of raised, so the UI can still launch and surface the problem.
image_to_text = None
tts = None
init_error = None

# Voice styles accepted by Supertonic (M* / F* names -- presumably male /
# female presets; TODO confirm against the Supertonic docs).
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]
DEFAULT_VOICE = "M5"
# Used only when the TTS engine reports a zero/invalid duration; 24 kHz is a
# common TTS output rate, but this is a guess -- see _infer_sample_rate().
FALLBACK_SAMPLE_RATE = 24000


def _record_init_error(message):
    """Append *message* to the global ``init_error`` and log the traceback."""
    global init_error
    init_error = message if init_error is None else f"{init_error}\n{message}"
    print(init_error)
    traceback.print_exc()


try:
    print("Initializing image-to-text pipeline...")
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as e:
    _record_init_error(f"Failed to initialize image-to-text: {str(e)}")

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:
    _record_init_error(f"Failed to initialize TTS: {str(e)}")


def _to_mono_float32(wav):
    """Coerce raw TTS output into a 1-D float32 numpy array in [-1, 1].

    Handles list/array input, multi-channel audio (keeps one channel),
    and int16/int32 PCM normalization.
    """
    if not isinstance(wav, np.ndarray):
        wav = np.array(wav)
        print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")

    # Collapse to mono: squeeze singleton axes first; if still 2-D, pick the
    # axis that looks like the channel axis (the smaller leading dimension).
    if wav.ndim > 1:
        wav = wav.squeeze()
        if wav.ndim > 1:
            wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
        print(f"Squeezed wav to 1D: shape={wav.shape}")

    # Normalize integer PCM to float32 in [-1, 1]; for other non-float32
    # dtypes, rescale only if the data exceeds the expected range.
    if wav.dtype == np.int16:
        wav = wav.astype(np.float32) / 32768.0
    elif wav.dtype == np.int32:
        wav = wav.astype(np.float32) / 2147483648.0
    elif wav.dtype != np.float32:
        peak = np.abs(wav).max()
        if peak > 1.0:
            wav = wav.astype(np.float32) / peak
        else:
            wav = wav.astype(np.float32)

    print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")
    return wav


def _infer_sample_rate(wav, duration):
    """Derive the sample rate from sample count / reported duration.

    Supertonic's synthesize() returns a duration but (as used here) no
    explicit sample rate, so we reconstruct it; falls back to
    FALLBACK_SAMPLE_RATE when the duration is zero or invalid.
    """
    if duration > 0:
        sample_rate = int(len(wav) / duration)
        print(f"Calculated sample rate: {sample_rate} Hz (from {len(wav)} samples / {duration}s)")
        return sample_rate
    print(f"Using default sample rate: {FALLBACK_SAMPLE_RATE} Hz (duration was 0 or invalid)")
    return FALLBACK_SAMPLE_RATE


def image_to_voice(image, voice_name):
    """Convert an image to a caption, then synthesize the caption as speech.

    Args:
        image: PIL image from the Gradio Image component, or None.
        voice_name: One of AVAILABLE_VOICES; invalid values fall back to
            DEFAULT_VOICE.

    Returns:
        ``((sample_rate, waveform), caption)`` on success, or
        ``(None, error_message)`` on any failure -- errors are reported to
        the user through the text output rather than raised.
    """
    if image is None:
        return None, "Please upload an image."

    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    # Validate the requested voice; fall back rather than fail.
    if voice_name not in AVAILABLE_VOICES:
        print(f"Invalid voice name {voice_name!r}, using default: {DEFAULT_VOICE}")
        voice_name = DEFAULT_VOICE

    try:
        print(f"Getting voice style: {voice_name}")
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as e:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
        print(error_msg)
        return None, error_msg

    try:
        print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")
        # The captioning pipeline expects RGB input.
        if isinstance(image, Image.Image) and image.mode != 'RGB':
            image = image.convert('RGB')
            print("Converted image to RGB mode")

        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")
        if not result:
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."

        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."
        print(f"Extracted text: {generated_text}")

        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")

        wav = _to_mono_float32(wav)
        sample_rate = _infer_sample_rate(wav, duration)
        # Gradio's Audio(type="numpy") output expects (sample_rate, array).
        return (sample_rate, wav), generated_text
    except Exception as e:
        # Full traceback goes to the logs; only a short message to the user.
        print(f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}")
        return None, f"Error processing image: {str(e)}"


# Playful custom styling for the Gradio interface.
custom_css = """
/* Playful background gradient */
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
    background-size: 400% 400%;
    animation: gradientShift 15s ease infinite;
    min-height: 100vh;
    padding: 20px;
}
@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}
/* Fun title styling */
h1 {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
    text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
    font-size: 3em !important;
    text-align: center !important;
    margin-bottom: 20px !important;
    animation: bounce 2s infinite;
}
@keyframes bounce {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-10px); }
}
/* Playful paragraph text */
p, .markdown-text {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.2em !important;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}
/* Card/panel styling */
.panel, .block, .gradio-block {
    background: rgba(255, 255, 255, 0.95) !important;
    border-radius: 20px !important;
    padding: 20px !important;
    box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
    border: 3px solid #FFD700 !important;
}
/* Label styling */
label {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
}
/* Button styling */
button.primary {
    background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
    color: white !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.3em !important;
    font-weight: bold !important;
    border-radius: 25px !important;
    padding: 15px 30px !important;
    border: 3px solid #FFD700 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
    transition: all 0.3s ease !important;
}
button.primary:hover {
    transform: scale(1.1) !important;
    box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
}
/* Input fields */
input, textarea, select {
    border-radius: 15px !important;
    border: 2px solid #4ECDC4 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
}
/* Dropdown styling */
select {
    background: linear-gradient(45deg, #f093fb, #4facfe) !important;
    color: white !important;
    font-weight: bold !important;
}
/* Textbox styling */
textarea {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: bold !important;
}
"""

with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🎨✨ Image to Voice Converter ✨🎨
        ### Upload an image to convert it to text, then hear it as speech! 🎤🎵
        """
    )
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label="📸 Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value=DEFAULT_VOICE,
                label="🎭 Voice Style",
                info="Select a voice style for text-to-speech 🎪"
            )
            generate_btn = gr.Button("🚀 Generate Speech 🚀", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="🎵 Generated Speech", type="numpy")
            text_output = gr.Textbox(label="📝 Extracted Text", lines=5)

    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )
    # NOTE: the original passed gr.Examples(examples=[], ...); an empty
    # examples list adds nothing and raises ValueError on recent Gradio
    # versions, so it was removed.

if __name__ == "__main__":
    demo.launch()