import importlib
import os
import subprocess
import sys
import tempfile

import streamlit as st
from PIL import Image


# Check for required dependencies and install if missing
def check_and_install_dependencies():
    """Verify that the optional third-party packages can be imported.

    When one or more packages are missing, a warning and the matching
    ``pip install`` commands are rendered, together with a button that
    installs them into the current interpreter.

    Returns:
        bool: True when every required package imports cleanly, False
        otherwise (including right after an automatic install, because the
        app must be restarted before the new packages are usable).
    """
    # Maps the importable module name to the name used on PyPI.
    required_packages = {
        "transformers": "transformers",
        "sentencepiece": "sentencepiece",
        "gtts": "gTTS",
    }

    missing_packages = []
    for package, pip_name in required_packages.items():
        try:
            importlib.import_module(package)
        except ImportError:
            missing_packages.append((package, pip_name))

    if not missing_packages:
        return True

    st.warning("Missing required dependencies. Please install them before continuing.")
    for _, pip_name in missing_packages:
        st.code(f"pip install {pip_name}", language="bash")

    if st.button("Install Dependencies Automatically"):
        with st.spinner("Installing dependencies..."):
            for _, pip_name in missing_packages:
                try:
                    # Use the running interpreter so the packages land in
                    # the same environment the app is executing in.
                    subprocess.check_call(
                        [sys.executable, "-m", "pip", "install", pip_name]
                    )
                except (subprocess.CalledProcessError, OSError) as e:
                    st.error(f"Failed to install {pip_name}: {e}")
                else:
                    st.success(f"Successfully installed {pip_name}")
        st.info("Please restart the application after installing dependencies.")

    return False


# function part
# img2text
def img2text(image_path):
    """Extract text from the image stored at ``image_path``.

    Args:
        image_path: Path of an image file readable by Pillow.

    Returns:
        str: The generated text, ``"No text detected"`` when the model
        returns nothing, or ``"Error: <message>"`` when processing fails
        (the error is also shown in the Streamlit UI).
    """
    try:
        # Import here to ensure dependencies are checked first
        from transformers import pipeline

        # Load the image-to-text model
        image_to_text_model = pipeline(
            "image-to-text", model="naver-clova-ix/donut-base"
        )

        # Keep the image open only while the model consumes it so the
        # underlying file handle is always released.
        with Image.open(image_path) as image:
            result = image_to_text_model(image)

        # Get the generated text
        return result[0]["generated_text"] if result else "No text detected"
    except Exception as e:  # UI boundary: report instead of crashing the app
        st.error(f"Error processing image: {e}")
        return f"Error: {e}"


# text2story
def text2story(text):
    """Turn extracted text into a story.

    For now this only wraps the extracted text in a fixed sentence.

    Args:
        text: Text extracted from the image.

    Returns:
        str: The story text.
    """
    return f"Here's a story based on the text: {text}"


# text2audio using Google Text-to-Speech
def text2audio(story_text):
    """Synthesize ``story_text`` to speech with gTTS.

    Args:
        story_text: The text to read aloud.

    Returns:
        str | None: Path of a temporary audio file, or None when synthesis
        fails (the error is shown in the Streamlit UI). The caller is
        responsible for deleting the file.
    """
    temp_audio_path = None
    try:
        from gtts import gTTS

        # gTTS writes MP3 data, so the temporary file gets an .mp3 suffix.
        # delete=False keeps the file on disk after the handle is closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
            temp_audio_path = temp_audio.name

        # Initialize gTTS and write the audio to the temporary file
        tts = gTTS(text=story_text, lang="en", slow=False)
        tts.save(temp_audio_path)
        return temp_audio_path
    except Exception as e:  # UI boundary: report instead of crashing the app
        # Do not leave an empty or partial temp file behind on failure.
        if temp_audio_path is not None:
            try:
                os.remove(temp_audio_path)
            except OSError:
                pass
        st.error(f"Error generating audio: {e}")
        return None


# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
st.subheader("Using Donut model for text extraction")

# Check dependencies before proceeding
dependencies_ok = check_and_install_dependencies()

if dependencies_ok:
    uploaded_file = st.file_uploader(
        "Select an Image...",
        type=['png', 'jpg', 'jpeg', 'gif', 'bmp', 'webp'],
    )

    if uploaded_file is not None:
        # Save the uploaded file temporarily. The file name comes from the
        # client, so strip any directory components to keep the write
        # inside the temp directory.
        bytes_data = uploaded_file.getvalue()
        image_temp_path = os.path.join(
            tempfile.gettempdir(), os.path.basename(uploaded_file.name)
        )
        with open(image_temp_path, "wb") as file:
            file.write(bytes_data)

        # Display the uploaded image
        st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)

        # Stage 1: Image to Text
        with st.spinner('Processing img2text...'):
            extracted_text = img2text(image_temp_path)
        st.subheader("Extracted Text:")