import os
import streamlit as st
from transformers import pipeline
from PIL import Image

st.title("Image-to-Text and Text-to-Speech App")

# Optional Hugging Face access token. A missing or empty variable becomes
# None, which is `pipeline`'s default, so the app still starts for model
# checkpoints that do not require authentication.
HF_TOKEN = os.environ.get("HF_TOKEN") or None


@st.cache_resource(show_spinner="Loading model...")
def _load_pipeline(task, model):
    """Build a Hugging Face pipeline once and reuse it across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, each rerun would construct the pipeline (and reload its model)
    again.

    Args:
        task: Pipeline task name, e.g. ``"image-to-text"``.
        model: Hugging Face Hub model identifier.

    Returns:
        The pipeline object created by ``transformers.pipeline``.
    """
    return pipeline(task, model=model, token=HF_TOKEN)


# Module-level pipeline handles used by the UI code below.
image_to_text = _load_pipeline(
    "image-to-text",
    "nlpconnect/vit-gpt2-image-captioning",
)

text_to_speech = _load_pipeline(
    "text-to-speech",
    "facebook/mms-tts-eng",
)

# Main UI flow: upload an image, caption it, then speak the caption.
uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

if uploaded_file is not None:
    image = Image.open(uploaded_file)
    st.image(image)

    # The captioning pipeline returns a list of dicts; take the first result.
    caption = image_to_text(image)[0]["generated_text"]
    st.write("Caption:", caption)

    # The text-to-speech pipeline returns a raw waveform array plus its
    # sampling rate, not encoded WAV bytes. Writing the array to a ".wav"
    # file would produce a header-less, unplayable file, so hand both values
    # to st.audio and let Streamlit do the encoding. This also avoids a
    # fixed on-disk path that concurrent sessions would overwrite.
    speech = text_to_speech(caption)
    st.audio(speech["audio"], sample_rate=speech["sampling_rate"])