Spaces:

jonloporto
/

ImageToSpeechTest

Sleeping

App Files Files Community

ImageToSpeechTest / app.py

jonloporto

Update app.py

456f8ff verified 29 days ago

raw

history blame contribute delete

10.2 kB

	# -- coding: utf-8 --
	"""ImageToVoice Hugging Face Space

	Converts images to text using Hugging Face's image-to-text pipeline,
	then converts the text to speech using Supertonic TTS.
	"""

	import gradio as gr
	from supertonic import TTS
	from transformers import pipeline
	from PIL import Image
	import numpy as np
	import traceback

	# Initialize models (load once at startup)
	image_to_text = None
	tts = None
	init_error = None

	# Available voice styles for supertonic
	AVAILABLE_VOICES = ["M1", "M2", "M3", "M4", "M5", "F1", "F2", "F3", "F4"]

	try:
	print("Initializing image-to-text pipeline...")
	image_to_text = pipeline("image-to-text")
	print("Image-to-text pipeline initialized successfully")
	except Exception as e:
	init_error = f"Failed to initialize image-to-text: {str(e)}"
	print(init_error)
	traceback.print_exc()

	try:
	print("Initializing TTS...")
	tts = TTS(auto_download=True)
	print("TTS initialized successfully")
	except Exception as e:
	if init_error:
	init_error += f"\nFailed to initialize TTS: {str(e)}"
	else:
	init_error = f"Failed to initialize TTS: {str(e)}"
	print(init_error)
	traceback.print_exc()


	def image_to_voice(image, voice_name):
	"""Convert image to text, then text to speech."""
	if image is None:
	return None, "Please upload an image."

	if image_to_text is None or tts is None:
	error_msg = "Error: Models failed to initialize. "
	if init_error:
	error_msg += f"\n\nDetails: {init_error}"
	else:
	error_msg += "Please check the logs for more information."
	return None, error_msg

	# Validate and get voice style
	if voice_name not in AVAILABLE_VOICES:
	voice_name = "M5" # Default fallback
	print(f"Invalid voice name, using default: M5")

	try:
	print(f"Getting voice style: {voice_name}")
	style = tts.get_voice_style(voice_name=voice_name)
	print(f"Voice style '{voice_name}' loaded successfully")
	except Exception as e:
	error_msg = f"Error: Failed to load voice style '{voice_name}': {str(e)}"
	print(error_msg)
	return None, error_msg

	try:
	print(f"Processing image: type={type(image)}, mode={image.mode if hasattr(image, 'mode') else 'N/A'}")

	# Convert PIL Image to format expected by pipeline
	if isinstance(image, Image.Image):
	# PIL Image should work directly, but ensure it's RGB
	if image.mode != 'RGB':
	image = image.convert('RGB')
	print(f"Converted image to RGB mode")

	# Convert image to text
	print("Running image-to-text pipeline...")
	result = image_to_text(image)
	print(f"Image-to-text result: {result}")

	if not result or len(result) == 0:
	return None, "Error: Could not extract text from image. The pipeline returned an empty result."

	generated_text = result[0].get('generated_text', '')
	if not generated_text:
	return None, "Error: No text was extracted from the image. The generated text is empty."

	print(f"Extracted text: {generated_text}")

	# Convert text to speech
	print(f"Synthesizing speech with voice '{voice_name}'...")
	wav, duration = tts.synthesize(generated_text, voice_style=style)
	print(f"Speech synthesized: duration={duration}, wav type={type(wav)}, wav shape={wav.shape if hasattr(wav, 'shape') else 'N/A'}")

	# Ensure wav is a numpy array
	if not isinstance(wav, np.ndarray):
	wav = np.array(wav)
	print(f"Converted wav to numpy array: shape={wav.shape}, dtype={wav.dtype}")

	# Ensure audio is 1D (mono) format
	if wav.ndim > 1:
	wav = wav.squeeze()
	if wav.ndim > 1:
	# If still multi-dimensional, take first channel
	wav = wav[0] if wav.shape[0] < wav.shape[-1] else wav[:, 0]
	print(f"Squeezed wav to 1D: shape={wav.shape}")

	# Normalize audio to [-1, 1] range if needed
	if wav.dtype == np.int16:
	wav = wav.astype(np.float32) / 32768.0
	elif wav.dtype == np.int32:
	wav = wav.astype(np.float32) / 2147483648.0
	elif wav.dtype != np.float32:
	# If already in a reasonable range, just convert to float32
	if np.abs(wav).max() > 1.0:
	wav = wav.astype(np.float32) / np.abs(wav).max()
	else:
	wav = wav.astype(np.float32)

	print(f"Final audio: shape={wav.shape}, dtype={wav.dtype}, min={wav.min()}, max={wav.max()}")

	# Calculate sample rate from duration and audio length
	# sample_rate = samples / duration_in_seconds
	if duration > 0:
	calculated_sample_rate = int(len(wav) / duration)
	print(f"Calculated sample rate: {calculated_sample_rate} Hz (from {len(wav)} samples / {duration}s)")
	sample_rate = calculated_sample_rate
	else:
	# Fallback: Try common TTS sample rates
	# Many TTS systems use 24000 Hz or 16000 Hz
	# If audio sounds slow, try higher sample rate; if fast, try lower
	sample_rate = 24000 # Common TTS sample rate
	print(f"Using default sample rate: {sample_rate} Hz (duration was 0 or invalid)")

	return (sample_rate, wav), generated_text

	except Exception as e:
	error_msg = f"Error processing image: {str(e)}"
	full_error = f"Error: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
	print(full_error) # Print full traceback for debugging
	return None, error_msg


	# Create Gradio interface with playful styling
	custom_css = """
	/* Playful background gradient */
	.gradio-container {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 25%, #f093fb 50%, #4facfe 75%, #00f2fe 100%);
	background-size: 400% 400%;
	animation: gradientShift 15s ease infinite;
	min-height: 100vh;
	padding: 20px;
	}

	@keyframes gradientShift {
	0% { background-position: 0% 50%; }
	50% { background-position: 100% 50%; }
	100% { background-position: 0% 50%; }
	}

	/* Fun title styling */
	h1 {
	color: #000000 !important;
	font-family: 'Comic Sans MS', 'Chalkboard SE', 'Marker Felt', cursive !important;
	text-shadow: 3px 3px 0px #FF6B9D, 6px 6px 0px #4ECDC4, 9px 9px 0px #45B7D1 !important;
	font-size: 3em !important;
	text-align: center !important;
	margin-bottom: 20px !important;
	animation: bounce 2s infinite;
	}

	@keyframes bounce {
	0%, 100% { transform: translateY(0); }
	50% { transform: translateY(-10px); }
	}

	/* Playful paragraph text */
	p, .markdown-text {
	color: #000000 !important;
	font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
	font-size: 1.2em !important;
	text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
	}

	/* Card/panel styling */
	.panel, .block, .gradio-block {
	background: rgba(255, 255, 255, 0.95) !important;
	border-radius: 20px !important;
	padding: 20px !important;
	box-shadow: 0 10px 30px rgba(0,0,0,0.3) !important;
	border: 3px solid #FFD700 !important;
	}

	/* Label styling */
	label {
	color: #000000 !important;
	font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
	font-weight: bold !important;
	font-size: 1.1em !important;
	}

	/* Button styling */
	button.primary {
	background: linear-gradient(45deg, #FF6B9D, #4ECDC4) !important;
	color: white !important;
	font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
	font-size: 1.3em !important;
	font-weight: bold !important;
	border-radius: 25px !important;
	padding: 15px 30px !important;
	border: 3px solid #FFD700 !important;
	box-shadow: 0 5px 15px rgba(0,0,0,0.3) !important;
	transition: all 0.3s ease !important;
	}

	button.primary:hover {
	transform: scale(1.1) !important;
	box-shadow: 0 8px 20px rgba(0,0,0,0.4) !important;
	}

	/* Input fields */
	input, textarea, select {
	border-radius: 15px !important;
	border: 2px solid #4ECDC4 !important;
	font-family: 'Comic Sans MS', 'Chalkboard SE', sans-serif !important;
	}

	/* Dropdown styling */
	select {
	background: linear-gradient(45deg, #f093fb, #4facfe) !important;
	color: white !important;
	font-weight: bold !important;
	}

	/* Textbox styling */
	textarea {
	background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
	color: white !important;
	font-weight: bold !important;
	}
	"""

	with gr.Blocks(title="Image to Voice", theme=gr.themes.Soft(), css=custom_css) as demo:
	gr.Markdown(
	"""
	# 🎨✨ Image to Voice Converter ✨🎨
	### Upload an image to convert it to text, then hear it as speech! 🎤🎵
	"""
	)

	with gr.Row():
	with gr.Column():
	image_input = gr.Image(type="pil", label="📸 Upload Image")
	voice_dropdown = gr.Dropdown(
	choices=AVAILABLE_VOICES,
	value="M5",
	label="🎭 Voice Style",
	info="Select a voice style for text-to-speech 🎪"
	)
	generate_btn = gr.Button("🚀 Generate Speech 🚀", variant="primary")

	with gr.Column():
	audio_output = gr.Audio(label="🎵 Generated Speech", type="numpy")
	text_output = gr.Textbox(label="📝 Extracted Text", lines=5)

	generate_btn.click(
	fn=image_to_voice,
	inputs=[image_input, voice_dropdown],
	outputs=[audio_output, text_output]
	)

	gr.Examples(
	examples=[],
	inputs=image_input
	)

	if __name__ == "__main__":
	demo.launch()