"""Streamlit app: caption an uploaded image, then read the caption aloud."""

import os

import streamlit as st
from transformers import pipeline
from PIL import Image

CAPTION_MODEL = "nlpconnect/vit-gpt2-image-captioning"
TTS_MODEL = "facebook/mms-tts-eng"


@st.cache_resource(show_spinner="Loading models...")
def _load_pipelines(token):
    """Build the captioning and text-to-speech pipelines once per process.

    Streamlit re-executes this script on every widget interaction; caching
    the pipelines as a shared resource avoids re-instantiating both models
    on each rerun.

    Args:
        token: Hugging Face access token, or ``None`` for anonymous access.

    Returns:
        A ``(image_to_text, text_to_speech)`` tuple of pipelines.
    """
    captioner = pipeline("image-to-text", model=CAPTION_MODEL, token=token)
    speaker = pipeline("text-to-speech", model=TTS_MODEL, token=token)
    return captioner, speaker


st.title("Image-to-Text and Text-to-Speech App")

# Use the token from environment variables. A missing variable no longer
# aborts start-up with a KeyError; ``None`` makes the hub client fall back to
# anonymous access (NOTE(review): assumes both models are publicly
# downloadable -- confirm for your deployment).
HF_TOKEN = os.environ.get("HF_TOKEN")

image_to_text, text_to_speech = _load_pipelines(HF_TOKEN)

uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image)

    caption = image_to_text(image)[0]["generated_text"]
    st.write("Caption:", caption)

    audio = text_to_speech(caption)
    # The pipeline returns raw samples plus their sampling rate, not encoded
    # WAV bytes. Hand both to Streamlit so it can build a playable stream;
    # this also avoids a fixed "speech.wav" path shared by every session.
    st.audio(audio["audio"], sample_rate=audio["sampling_rate"])