# Provenance: Hugging Face Space upload by jonloporto — "Update app.py" (commit 456f8ff, verified)
# -*- coding: utf-8 -*-
"""ImageToVoice Hugging Face Space
Converts images to text using Hugging Face's image-to-text pipeline,
then converts the text to speech using Supertonic TTS.
"""
import gradio as gr
from supertonic import TTS
from transformers import pipeline
from PIL import Image
import numpy as np
import traceback
# Initialize models (load once at startup).  Failures are recorded in
# ``init_error`` instead of raising, so the app still launches and
# image_to_voice() can show the error to the user.
image_to_text = None  # transformers image-to-text pipeline (set in the try below)
tts = None  # Supertonic TTS engine (set in the second try below)
init_error = None  # human-readable description of any startup failure, or None
# Available voice styles for supertonic
# (M*/F* presumably male/female presets — confirm against Supertonic docs)
AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]
try:
    print("Initializing image-to-text pipeline...")
    # No model pinned: uses the transformers default checkpoint for this task.
    image_to_text = pipeline("image-to-text")
    print("Image-to-text pipeline initialized successfully")
except Exception as e:
    # Keep the app alive; the UI surfaces init_error on first use.
    init_error = f"Failed to initialize image-to-text: {str(e)}"
    print(init_error)
    traceback.print_exc()
# Load the TTS engine.  Any failure is appended to init_error (which may
# already describe an image-to-text failure) rather than raised.
try:
    print("Initializing TTS...")
    tts = TTS(auto_download=True)
    print("TTS initialized successfully")
except Exception as e:
    tts_failure = f"Failed to initialize TTS: {str(e)}"
    init_error = f"{init_error}\n{tts_failure}" if init_error else tts_failure
    print(init_error)
    traceback.print_exc()
def image_to_voice(image, voice_name):
"""Convert image to text, then text to speech."""
if image is None:
return None, "Please upload an image."
if image_to_text is None or tts is None:
error_msg = "Error: Models failed to initialize. "
if init_error:
error_msg += f"\n\nDetails: {init_error}"
else:
error_msg += "Please check the logs for more information."
return None, error_msg
# Validate and get voice style
if voice_name not in AVAILABLE_VOICES:
voice_name = "M5" # Default fallback
print(f"Invalid voice name, using default: M5")
try:
print(f"Getting voice style: {voice_name}")
style = tts.get_voice_style(voice_name=voice_name)
print(f"Voice style '{voice_name}' loaded successfully")
except Exception as e:
error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
print(error_msg)
return None, error_msg
try:
print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")
# Convert PIL Image to format expected by pipeline
if isinstance(image, Image.Image):
# PIL Image should work directly, but ensure it's RGB
if image.mode != 'RGB':
image = image.convert('RGB')
print(f"Converted image to RGB mode")
# Convert image to text
print("Running image-to-text pipeline...")
result = image_to_text(image)
print(f"Image-to-text result: {result}")
if not result or len(result) == 0:
return None, "Error: Could not extract text from image. The pipeline returned an empty result."
generated_text = result[0].get('generated_text', '')
if not generated_text:
return None, "Error: No text was extracted from the image. The generated text is empty."
print(f"Extracted text: {generated_text}")
# Convert text to speech
print(f"Synthesizing speech with voice '{voice_name}'...")
wav, duration = tts.synthesize(generated_text, voice_style=style)
print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")
# Ensure wav is a numpy array
if not isinstance(wav, np.ndarray):
wav = np.array(wav)
print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")
# Ensure audio is 1D (mono) format
if wav.ndim > 1:
wav = wav.squeeze()
if wav.ndim > 1:
# If still multi-dimensional, take first channel
wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
print(f"Squeezed wav to 1D: shape={wav.shape}")
# Normalize audio to [-1, 1] range if needed
if wav.dtype == np.int16:
wav = wav.astype(np.float32) / 32768.0
elif wav.dtype == np.int32:
wav = wav.astype(np.float32) / 2147483648.0
elif wav.dtype != np.float32:
# If already in a reasonable range, just convert to float32
if np.abs(wav).max() > 1.0:
wav = wav.astype(np.float32) / np.abs(wav).max()
else:
wav = wav.astype(np.float32)
print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")
# Calculate sample rate from duration and audio length
# sample_rate = samples / duration_in_seconds
if duration > 0:
calculated_sample_rate = int(len(wav) / duration)
print(f"Calculated sample rate: {calculated_sample_rate} Hz (from {len(wav)} samples / {duration}s)")
sample_rate = calculated_sample_rate
else:
# Fallback: Try common TTS sample rates
# Many TTS systems use 24000 Hz or 16000 Hz
# If audio sounds slow, try higher sample rate; if fast, try lower
sample_rate = 24000 # Common TTS sample rate
print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")
return (sample_rate, wav), generated_text
except Exception as e:
error_msg = f"Error processing image: {str(e)}"
full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
print(full_error) # Print full traceback for debugging
return None, error_msg
# CSS injected into the Gradio app (gr.Blocks css=...): animated gradient
# page background, bouncing title, and "playful" styling for panels,
# buttons, labels, and inputs.  Plain CSS — Gradio scopes it to the app.
custom_css = """
/* Playful background gradient */
.gradio-container {
background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
background-size: 400% 400%;
animation: gradientShift 15s ease infinite;
min-height: 100vh;
padding: 20px;
}
@keyframes gradientShift {
0% { background-position: 0% 50%; }
50% { background-position: 100% 50%; }
100% { background-position: 0% 50%; }
}
/* Fun title styling */
h1 {
color: #000000 !important;
font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
font-size: 3em !important;
text-align: center !important;
margin-bottom: 20px !important;
animation: bounce 2s infinite;
}
@keyframes bounce {
0%, 100% { transform: translateY(0); }
50% { transform: translateY(-10px); }
}
/* Playful paragraph text */
p, .markdown-text {
color: #000000 !important;
font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
font-size: 1.2em !important;
text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
}
/* Card/panel styling */
.panel, .block, .gradio-block {
background: rgba(255, 255, 255, 0.95) !important;
border-radius: 20px !important;
padding: 20px !important;
box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
border: 3px solid #FFD700 !important;
}
/* Label styling */
label {
color: #000000 !important;
font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
font-weight: bold !important;
font-size: 1.1em !important;
}
/* Button styling */
button.primary {
background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
color: white !important;
font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
font-size: 1.3em !important;
font-weight: bold !important;
border-radius: 25px !important;
padding: 15px 30px !important;
border: 3px solid #FFD700 !important;
box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
transition: all 0.3s ease !important;
}
button.primary:hover {
transform: scale(1.1) !important;
box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
}
/* Input fields */
input, textarea, select {
border-radius: 15px !important;
border: 2px solid #4ECDC4 !important;
font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
}
/* Dropdown styling */
select {
background: linear-gradient(45deg, #f093fb, #4facfe) !important;
color: white !important;
font-weight: bold !important;
}
/* Textbox styling */
textarea {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
color: white !important;
font-weight: bold !important;
}
"""
# Build the Gradio UI.  Layout is declared inside the Blocks context:
# left column = inputs (image, voice, button), right column = outputs.
with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown(
        """
        # 🎨✨ Image to Voice Converter ✨🎨
        ### Upload an image to convert it to text, then hear it as speech! 🎀🎡
        """
    )
    with gr.Row():
        with gr.Column():
            # type="pil" so image_to_voice receives a PIL.Image.Image
            image_input = gr.Image(type="pil", label="📸 Upload Image")
            voice_dropdown = gr.Dropdown(
                choices=AVAILABLE_VOICES,
                value="M5",
                label="🎭 Voice Style",
                info="Select a voice style for text-to-speech 🎪"
            )
            generate_btn = gr.Button("🚀 Generate Speech 🚀", variant="primary")
        with gr.Column():
            # type="numpy" matches image_to_voice's (sample_rate, wav) return
            audio_output = gr.Audio(label="🎵 Generated Speech", type="numpy")
            text_output = gr.Textbox(label="📝 Extracted Text", lines=5)
    # Wire the button: (image, voice) -> (audio, text)
    generate_btn.click(
        fn=image_to_voice,
        inputs=[image_input, voice_dropdown],
        outputs=[audio_output, text_output]
    )
    # NOTE(review): examples list is empty, so this renders nothing —
    # either populate it with sample images or remove the component.
    gr.Examples(
        examples=[],
        inputs=image_input
    )
# Launch the app when run directly (Spaces executes this module as a script).
if __name__ == "__main__":
    demo.launch()