import gradio as gr from supertonic import TTS from transformers import pipeline import tempfile import os from PIL import Image import numpy as np # Initialize the image-to-text pipeline image_to_text = pipeline("image-to-text") # Initialize text generation pipeline for story creation text_generation = pipeline("text-generation", model="gpt2") # Initialize Hugging Face image-to-text model for advanced story generation try: from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer image_to_story_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") image_feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224") image_to_story_tokenizer = AutoTokenizer.from_pretrained("gpt2") except: image_to_story_model = None image_feature_extractor = None image_to_story_tokenizer = None # Initialize the TTS model tts = TTS(auto_download=True) # Initialize emotion detection pipeline try: emotion_detection = pipeline("image-classification", model="nateraw/vit-base-beans") except: emotion_detection = None # Available voice styles (common Supertonic voices) VOICE_OPTIONS = [ ("M5 - Male Voice (Default)", "M5"), ("M1 - Male Voice 1", "M1"), ("M2 - Male Voice 2", "M2"), ("M3 - Male Voice 3", "M3"), ("M4 - Male Voice 4", "M4"), ("F1 - Female Voice 1", "F1"), ("F2 - Female Voice 2", "F2"), ("F3 - Female Voice 3", "F3"), ("F4 - Female Voice 4", "F4"), ("F5 - Female Voice 5", "F5"), ] def image_to_voice(image, voice_selection): """ Convert an image to text, then text to speech. Args: image: Input image (PIL Image or numpy array) voice_selection: Selected voice style from dropdown (e.g., "M5 - Male Voice (Default)") Returns: Path to the generated audio file and extracted text """ if image is None: return None, "Please upload an image to get started." try: # Extract voice name from selection (e.g., "M5 - Male Voice (Default)" -> "M5") voice_name = None for opt_label, opt_value in VOICE_OPTIONS: if opt_label == voice_selection: voice_name = opt_value break if voice_name is None: # Fallback: try to extract from the selection if format is unexpected voice_name = voice_selection.split(" - ")[0] if " - " in voice_selection else voice_selection # Convert image to text result = image_to_text(image) generated_text = result[0]['generated_text'] # Get the selected voice style style = tts.get_voice_style(voice_name=voice_name) # Convert text to speech wav, duration = tts.synthesize(generated_text, voice_style=style) # Save to a temporary file temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav") tts.save_audio(wav, temp_file.name) return temp_file.name, generated_text except Exception as e: return None, f"❌ Error: {str(e)}" def analyze_mood_from_image(image): """ Analyze mood/emotions detected in an image and create a mood chart. Args: image: Input image (PIL Image or numpy array) Returns: Chart data and mood analysis text """ if image is None: return "Please upload an image.", {} try: # Simple mood detection based on color analysis img_array = np.array(image) # Calculate average colors avg_brightness = np.mean(img_array) avg_red = np.mean(img_array[:, :, 0]) if img_array.shape[2] > 0 else 0 avg_green = np.mean(img_array[:, :, 1]) if img_array.shape[2] > 1 else 0 avg_blue = np.mean(img_array[:, :, 2]) if img_array.shape[2] > 2 else 0 # Create mood mapping based on color analysis mood_scores = { "Happy": min(100, int((avg_brightness / 255 * 60) + (avg_yellow := (avg_red + avg_green) / 2 - avg_blue) / 2.55 * 40)), "Calm": min(100, int((avg_blue / 255 * 50) + (avg_green / 255 * 50))), "Energetic": min(100, int(avg_red / 255 * 100)), "Peaceful": min(100, int((255 - avg_brightness) / 255 * 70 + avg_blue / 255 * 30)), } # Normalize scores total = sum(mood_scores.values()) mood_scores = {k: int((v / total * 100)) for k, v in mood_scores.items()} if total > 0 else mood_scores mood_text = f""" **Mood Analysis Results:** - 😊 Happy: {mood_scores.get('Happy', 0)}% - 😌 Calm: {mood_scores.get('Calm', 0)}% - ⚡ Energetic: {mood_scores.get('Energetic', 0)}% - 🧘 Peaceful: {mood_scores.get('Peaceful', 0)}% **Interpretation:** Based on color analysis, this image conveys a {max(mood_scores, key=mood_scores.get)} mood. """ return mood_text, mood_scores except Exception as e: return f"❌ Error analyzing mood: {str(e)}", {} def ai_story_generation(image, story_theme): """ Generate a creative story based on the image content and selected theme. Args: image: Input image (PIL Image or numpy array) story_theme: Selected theme for the story Returns: Generated story text """ if image is None: return "Please upload an image to generate a story." try: # Extract text from image first result = image_to_text(image) image_description = result[0]['generated_text'] # Create a prompt for story generation prompt = f"""Based on an image showing: {image_description} Theme: {story_theme} Generate a creative and engaging short story (150-200 words) incorporating elements from the image:""" # Generate story using text generation pipeline story = text_generation(prompt, max_length=250, num_return_sequences=1) generated_story = story[0]['generated_text'] return generated_story except Exception as e: return f"❌ Error generating story: {str(e)}" def huggingface_picture_to_story(image): """ Transform a picture into a story using Hugging Face image-to-text model. Uses the specialized vit-gpt2-image-captioning model. Args: image: Input image (PIL Image or numpy array) Returns: Generated story based on image """ if image is None: return "Please upload an image to generate a story." try: if image_to_story_model is None or image_feature_extractor is None: return "Hugging Face story model not available. Using alternative method..." # Prepare image if isinstance(image, np.ndarray): image = Image.fromarray(image) # Extract features from image pixel_values = image_feature_extractor(images=image, return_tensors="pt").pixel_values # Generate story output_ids = image_to_story_model.generate(pixel_values, max_length=100) # Decode the generated text story = image_to_story_tokenizer.batch_decode(output_ids, skip_special_tokens=True) generated_story = story[0].strip() if story else "No story generated" # Expand the basic caption into a more complete story expanded_story = f""" **AI-Generated Story from Image:** {generated_story} --- **Extended Story:** In this captivating scene, {generated_story.lower()}. The image captures a moment of pure artistry and wonder, where every detail tells a part of a larger narrative. As you observe the composition, your mind fills with possibilities and untold stories waiting to be discovered. The interplay of light and shadow creates an atmosphere that invites contemplation and imagination, transporting you to a world where reality meets fantasy. """ return expanded_story except Exception as e: return f"❌ Error generating story: {str(e)}" def ai_study_helper(image, study_type): """ Provide AI-powered study insights based on image content. Args: image: Input image (PIL Image or numpy array) study_type: Type of study aid requested Returns: Study insights and recommendations """ if image is None: return "Please upload an image for study assistance." try: # Extract text from image result = image_to_text(image) extracted_text = result[0]['generated_text'] study_insights = "" if study_type == "Summary": study_insights = f""" **AI-Generated Summary:** {extracted_text[:200]}... **Key Points:** - Content extracted from image: {extracted_text} - Length: {len(extracted_text.split())} words - Recommended study time: {max(5, len(extracted_text.split()) // 100)} minutes """ elif study_type == "Quiz Questions": study_insights = f""" **AI-Generated Study Questions:** Based on the image content: "{extracted_text[:100]}..." 1. What are the main topics covered in the image? 2. Can you explain the concepts in your own words? 3. How would you apply this information? 4. What are the key takeaways? 5. What additional research would enhance your understanding? """ elif study_type == "Learning Tips": study_insights = f""" **Personalized Learning Tips:** 📚 Study Strategy: - Break down the content: {extracted_text[:50]}... - Use the Feynman Technique to explain concepts simply - Create mind maps for visual learning - Practice active recall with the quiz questions feature - Review regularly (spaced repetition) 🎯 Focus Areas: - Main concept: Extract and understand key terms - Relationships: Connect ideas together - Application: Practice with real-world examples """ else: # Note-Taking study_insights = f""" **AI-Generated Study Notes:** **Original Content:** {extracted_text} **Simplified Notes:** - Main idea: {extracted_text[:80]}... - Key details: Analyze and list important points - Examples: Look for practical applications - Conclusion: What did you learn? **Action Items:** ☐ Review these notes daily ☐ Create flashcards for key terms ☐ Test yourself with quiz questions """ return study_insights except Exception as e: return f"❌ Error generating study insights: {str(e)}" def ai_study_helper_for_kids(image, learning_style): """ Provide AI-powered kid-friendly study assistance based on image content. Uses simple language, fun facts, and gamified learning elements. Args: image: Input image (PIL Image or numpy array) learning_style: Type of kid-friendly learning aid Returns: Kid-friendly study content with fun and engaging format """ if image is None: return "Please upload an image for your learning adventure! 🌟" try: # Extract text from image result = image_to_text(image) extracted_text = result[0]['generated_text'] study_content = "" if learning_style == "Fun Summary": study_content = f""" ✨ **FUN SUMMARY FOR KIDS!** ✨ 📖 What We're Learning About: {extracted_text} 🎯 Super Cool Points to Remember: ⭐ The main idea is: {extracted_text[:60]}... ⭐ This is important because it helps us understand cool stuff! ⭐ You can find examples of this everywhere around you! 💡 Fun Fact: Did you know? Learning by playing is the best way! 🎮 ⏱️ Perfect Study Time: 10-15 minutes is awesome! Then take a break! 🎉 """ elif learning_style == "Interactive Quiz": study_content = f""" 🎮 **SUPER FUN QUIZ TIME!** 🎮 Based on: {extracted_text[:80]}... 📝 Try to Answer These Fun Questions: ❓ Question 1: What's the MAIN thing about this topic? 💭 Think about it... You got this! 💪 ❓ Question 2: Can you tell your friend about this in simple words? 💭 Teaching others is the BEST way to learn! 📚 ❓ Question 3: Where do you see this in real life? 💭 (Hint: Look around you!) 👀 ❓ Question 4: What's the coolest part of this? 💭 Everyone learns what's cool to THEM! 🌟 🏆 YOU'RE AMAZING FOR TRYING! 🏆 """ elif learning_style == "Memory Game": words = extracted_text.split()[:5] study_content = f""" 🧠 **MEMORY CHAMPION CHALLENGE!** 🧠 Let's train your SUPER BRAIN! 🎯 📚 Key Words to Remember: {', '.join([f'✨ {word}' for word in words])} 🎮 MEMORY GAME RULES: 1️⃣ Read the words above carefully (10 seconds) 2️⃣ Close your eyes and think about them 3️⃣ Can you remember them all? Try it! 4️⃣ Repeat this game 3 times to be a MEMORY MASTER! 👑 💪 YOUR BRAIN POWER IS INCREASING! 📊 Track your progress: - Try 1: How many did you remember? ___/5 - Try 2: How many did you remember? ___/5 - Try 3: How many did you remember? ___/5 🎉 AWESOME JOB! Your brain is SUPER POWERFUL! 🌟 """ else: # Learning Tips for Kids study_content = f""" 🌟 **SUPER COOL LEARNING TIPS FOR YOU!** 🌟 Topic: {extracted_text[:100]}... 🎯 AWESOME STUDY TRICKS: 🎨 Make It Colorful! - Use different colored pens or pencils - Draw pictures to remember things - Make it FUN and PRETTY! 🖍️ 🎵 Use Music & Rhythm! - Make up a song about what you're learning - Sing it while you study - Dance while learning = SUPER FUN! 🎶 🎬 Act It Out! - Use hand movements to remember ideas - Tell your friends like you're a teacher - Pretend you're explaining to an alien! 👽 🏃 Move Your Body! - Study for 10 minutes, then play for 5 minutes - Jump, stretch, or dance between lessons - Exercise helps your brain grow BIGGER & STRONGER! 💪 👥 Study with Friends! - Teaching each other is the BEST way to learn - Play learning games together - Make it a FUN GROUP ACTIVITY! 🎉 🏆 YOU'RE A LEARNING SUPERSTAR! ⭐ """ return study_content except Exception as e: return f"🙈 Oops! Something went wrong. Let's try again! Error: {str(e)}" # Custom CSS for professional styling custom_css = """ .gradio-container { font-family: 'Inter', 'Segoe UI', system-ui, sans-serif !important; } .header { text-align: center; padding: 2rem 1rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 12px; margin-bottom: 2rem; color: white; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); } .header h1 { margin: 0; font-size: 2.5rem; font-weight: 700; letter-spacing: -0.02em; } .header p { margin: 0.5rem 0 0 0; opacity: 0.95; font-size: 1.1rem; } .feature-box { background: #f8f9fa; border-radius: 10px; padding: 1.5rem; margin: 1rem 0; border-left: 4px solid #667eea; } .feature-box h3 { margin-top: 0; color: #333; font-size: 1.1rem; } .main-content { max-width: 1200px; margin: 0 auto; } .upload-section { background: white; border-radius: 12px; padding: 2rem; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08); margin-bottom: 1.5rem; } .output-section { background: white; border-radius: 12px; padding: 2rem; box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08); } .generate-btn { width: 100%; padding: 1rem !important; font-size: 1.1rem !important; font-weight: 600 !important; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important; border: none !important; border-radius: 8px !important; transition: transform 0.2s, box-shadow 0.2s !important; } .generate-btn:hover { transform: translateY(-2px); box-shadow: 0 6px 12px rgba(102, 126, 234, 0.4) !important; } .footer { text-align: center; padding: 2rem 1rem; margin-top: 3rem; color: #666; font-size: 0.9rem; } .section-title { margin-top: 1rem; margin-bottom: 1rem; color: #333; font-weight: 600; } select, .gr-dropdown { border-radius: 8px !important; border: 2px solid #e0e0e0 !important; padding: 0.75rem !important; font-size: 1rem !important; transition: border-color 0.2s !important; } select:focus, .gr-dropdown:focus { border-color: #667eea !important; outline: none !important; } """ # Create Gradio interface with gr.Blocks(title="AI Multimedia Studio", theme=gr.themes.Soft(), css=custom_css) as demo: # Header Section gr.HTML("""
Transform images with AI-powered technology: voice, stories, mood analysis & study tools
Upload any image containing text. Our AI will extract it automatically.
Advanced vision-language models analyze and extract text from your image.
Text is converted to natural-sounding speech using Supertonic TTS.