# NOTE: The three lines that preceded this file in the original capture
# ("Spaces:" / "Sleeping" / "Sleeping") were Hugging Face Spaces status-page
# residue from the export, not application source.
| # -*- coding: utf-8 -*- | |
| """ImageToVoice Hugging Face Space | |
| Converts images to text using Hugging Face's image-to-text pipeline, | |
| then converts the text to speech using Supertonic TTS. | |
| """ | |
| import gradio as gr | |
| from supertonic import TTS | |
| from transformers import pipeline | |
| from PIL import Image | |
| import numpy as np | |
| import traceback | |
# --- Model initialization (runs once at import/startup) ---
# The globals stay None when loading fails; image_to_voice() checks them and
# surfaces init_error to the user instead of crashing the UI.
image_to_text = None  # transformers image-to-text pipeline (set below)
tts = None            # Supertonic TTS engine (set below)
init_error = None     # accumulated human-readable init-failure text

# Available voice styles for supertonic
# (M* / F* presumably male/female presets — TODO confirm against Supertonic docs)
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

try:
    print("Initializing image-to-text pipeline...")
    # No model name given: transformers selects its default image-to-text
    # checkpoint and downloads it on first run.
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as e:  # broad by design: any init failure is reported via the UI
    init_error = f"Failed to initialize image-to-text: {str(e)}"
    print(init_error)
    traceback.print_exc()

try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:  # broad by design, same rationale as above
    # Append rather than overwrite so both failures are reported together.
    if init_error:
        init_error += f"\nFailed to initialize TTS: {str(e)}"
    else:
        init_error = f"Failed to initialize TTS: {str(e)}"
    print(init_error)
    traceback.print_exc()
def _coerce_audio(wav):
    """Normalize raw TTS output to a 1-D float32 numpy array.

    Accepts whatever ``tts.synthesize`` returned (list, int16/int32/float
    array, possibly multi-channel) and coerces it to the mono float
    waveform that ``gr.Audio(type="numpy")`` expects.
    """
    if not isinstance(wav, np.ndarray):
        wav = np.array(wav)
        print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")
    # Collapse to mono: drop singleton axes, then pick the axis that looks
    # like the channel axis (the shorter one) if the array is still 2-D.
    if wav.ndim > 1:
        wav = wav.squeeze()
        if wav.ndim > 1:
            wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
        print(f"Squeezed wav to 1D: shape={wav.shape}")
    # Rescale integer PCM to [-1, 1]; pass float32 through unchanged.
    if wav.dtype == np.int16:
        wav = wav.astype(np.float32) / 32768.0
    elif wav.dtype == np.int32:
        wav = wav.astype(np.float32) / 2147483648.0
    elif wav.dtype != np.float32:
        # Hoisted: the peak was previously computed twice.
        peak = float(np.abs(wav).max()) if wav.size else 0.0
        if peak > 1.0:
            wav = wav.astype(np.float32) / peak
        else:
            wav = wav.astype(np.float32)
    return wav


def _infer_sample_rate(num_samples, duration):
    """Derive the sample rate from sample count and reported duration.

    Falls back to 24000 Hz (a common TTS rate) when the duration is
    missing, zero, or negative.
    """
    if duration and duration > 0:
        rate = int(num_samples / duration)
        print(f"Calculated sample rate: {rate} Hz (from {num_samples} samples / {duration}s)")
        return rate
    rate = 24000
    print(f"Using default sample rate: {rate} Hz (duration was 0 or invalid)")
    return rate


def image_to_voice(image, voice_name):
    """Convert image to text, then text to speech.

    Args:
        image: PIL image from the Gradio input (None when nothing was uploaded).
        voice_name: Desired voice; values outside AVAILABLE_VOICES fall back to "M5".

    Returns:
        ((sample_rate, waveform), caption_text) on success, or
        (None, error_message) on any failure. Never raises.
    """
    if image is None:
        return None, "Please upload an image."
    if image_to_text is None or tts is None:
        error_msg = "Error: Models failed to initialize. "
        if init_error:
            error_msg += f"\n\nDetails: {init_error}"
        else:
            error_msg += "Please check the logs for more information."
        return None, error_msg

    # Validate and get voice style
    if voice_name not in AVAILABLE_VOICES:
        voice_name = "M5"  # Default fallback
        print("Invalid voice name, using default: M5")  # fixed: f-string had no placeholder
    try:
        print(f"Getting voice style: {voice_name}")
        style = tts.get_voice_style(voice_name=voice_name)
        print(f"Voice style '{voice_name}' loaded successfully")
    except Exception as e:
        error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
        print(error_msg)
        return None, error_msg

    try:
        print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")
        # The captioning pipeline expects RGB; convert other PIL modes (RGBA, L, ...).
        if isinstance(image, Image.Image) and image.mode != 'RGB':
            image = image.convert('RGB')
            print("Converted image to RGB mode")  # fixed: f-string had no placeholder

        # Image -> caption text
        print("Running image-to-text pipeline...")
        result = image_to_text(image)
        print(f"Image-to-text result: {result}")
        if not result:  # simplified: `len(result) == 0` was redundant with `not result`
            return None, "Error: Could not extract text from image. The pipeline returned an empty result."
        generated_text = result[0].get('generated_text', '')
        if not generated_text:
            return None, "Error: No text was extracted from the image. The generated text is empty."
        print(f"Extracted text: {generated_text}")

        # Caption text -> speech
        print(f"Synthesizing speech with voice '{voice_name}'...")
        wav, duration = tts.synthesize(generated_text, voice_style=style)
        print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")

        wav = _coerce_audio(wav)
        if wav.size == 0:
            # Guard added: min()/max() on an empty array would raise below.
            return None, "Error: TTS returned empty audio."
        print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")

        sample_rate = _infer_sample_rate(len(wav), duration)
        return (sample_rate, wav), generated_text
    except Exception as e:
        error_msg = f"Error processing image: {str(e)}"
        # Print the full traceback for the Space logs; return only the short message.
        print(f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}")
        return None, error_msg
# Create Gradio interface with playful styling.
# This CSS is injected via gr.Blocks(css=...): an animated gradient page
# background, "Comic Sans"-style typography, and rounded/bordered panels,
# buttons, and inputs. Selectors target Gradio's default DOM classes, which
# can change between Gradio versions — NOTE(review): re-verify after upgrades.
custom_css = """
/* Playful background gradient */
.gradio-container {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
    background-size: 400% 400%;
    animation: gradientShift 15s ease infinite;
    min-height: 100vh;
    padding: 20px;
}
@keyframes gradientShift {
    0% { background-position: 0% 50%; }
    50% { background-position: 100% 50%; }
    100% { background-position: 0% 50%; }
}
/* Fun title styling */
h1 {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
    text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
    font-size: 3em !important;
    text-align: center !important;
    margin-bottom: 20px !important;
    animation: bounce 2s infinite;
}
@keyframes bounce {
    0%, 100% { transform: translateY(0); }
    50% { transform: translateY(-10px); }
}
/* Playful paragraph text */
p, .markdown-text {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.2em !important;
    text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}
/* Card/panel styling */
.panel, .block, .gradio-block {
    background: rgba(255, 255, 255, 0.95) !important;
    border-radius: 20px !important;
    padding: 20px !important;
    box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
    border: 3px solid #FFD700 !important;
}
/* Label styling */
label {
    color: #000000 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-weight: bold !important;
    font-size: 1.1em !important;
}
/* Button styling */
button.primary {
    background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
    color: white !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
    font-size: 1.3em !important;
    font-weight: bold !important;
    border-radius: 25px !important;
    padding: 15px 30px !important;
    border: 3px solid #FFD700 !important;
    box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
    transition: all 0.3s ease !important;
}
button.primary:hover {
    transform: scale(1.1) !important;
    box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
}
/* Input fields */
input, textarea, select {
    border-radius: 15px !important;
    border: 2px solid #4ECDC4 !important;
    font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
}
/* Dropdown styling */
select {
    background: linear-gradient(45deg, #f093fb, #4facfe) !important;
    color: white !important;
    font-weight: bold !important;
}
/* Textbox styling */
textarea {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
    color: white !important;
    font-weight: bold !important;
}
"""
# --- Gradio UI (built at import time so `demo` exists for Spaces hosting) ---
# NOTE(review): the emoji in the UI strings below appear mojibake'd in this
# copy of the file (e.g. "π¨β¨"); likely an encoding artifact from export.
# Confirm the intended characters against the deployed Space before changing.
with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    # Header banner
    gr.Markdown(
        """
        # π¨β¨ Image to Voice Converter β¨π¨
        ### Upload an image to convert it to text, then hear it as speech! π€π΅
        """
    )
    with gr.Row():
        with gr.Column():
            # Left column: inputs (image upload + voice preset selector).
            image_input = gr.Image(type="pil", label="πΈ Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="M5",
                label="π Voice Style",
                info="Select a voice style for text-to-speech πͺ"
            )
            generate_btn = gr.Button("π Generate Speech π", variant="primary")
        with gr.Column():
            # Right column: outputs. Audio receives the (sample_rate, waveform)
            # tuple returned by image_to_voice; textbox shows the caption.
            audio_output = gr.Audio(label="π΅ Generated Speech", type="numpy")
            text_output = gr.Textbox(label="π Extracted Text", lines=5)
    # Wire the button to the conversion function.
    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )
    # NOTE(review): an empty examples list renders an empty Examples widget;
    # consider populating it with sample images or removing this call.
    gr.Examples(
        examples=[],
        inputs=image_input
    )

# Local launch; on Spaces the platform serves `demo` directly.
if __name__ == "__main__":
    demo.launch()