File size: 2,425 Bytes
d433b9f
f8ee7c1
3ca247c
 
f8ee7c1
d433b9f
28c4939
d433b9f
f8ee7c1
d433b9f
f8ee7c1
 
 
28c4939
d433b9f
 
 
 
 
f8ee7c1
 
 
d433b9f
 
 
3ca247c
f8ee7c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d433b9f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8ee7c1
d433b9f
3ca247c
 
 
 
 
 
d433b9f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
from PIL import Image
from gtts import gTTS
from io import BytesIO
import io
from openai import OpenAI
#from dotenv import load_dotenv
import streamlit as st
from transformers import pipeline

# --- App setup: model pipeline, OpenAI client, and image upload UI ---
# NOTE: Streamlit re-runs this whole script on every interaction, so the
# pipeline below is re-created each run — consider @st.cache_resource.

# Image-captioning pipeline used to describe the uploaded picture.
# For explaining what is going on in the image
img_nar = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")

#load_dotenv()

# OpenAI client; expects OPENAI_API_KEY in the environment (getenv returns
# None if unset, which will fail on the first API call, not here).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

st.header("Image Narrator")

# Temporary
uploaded_image = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])

# Conversation history persists across Streamlit re-runs via session_state.
if 'history' not in st.session_state:
    st.session_state['history'] = []

# Free-text persona (e.g. "pirate") used in the system prompt below.
personality = st.text_input("Enter a personality")
# Fallback caption when no image has been uploaded yet.
image_narration = "No narration given"

# Check if an image has been uploaded
if uploaded_image is not None:
    # Convert the uploaded file to a PIL image
    bytes_data = uploaded_image.getvalue()
    pil_image = Image.open(io.BytesIO(bytes_data))

    # Now, use the PIL image with the pipeline
    image_narration = img_nar(pil_image)

    # Display the uploaded image using the original bytes data
    st.image(pil_image, caption='Uploaded Image.', use_column_width=True)

    # The pipeline returns a list of dicts; keep only the caption text.
    image_narration = image_narration[0]["generated_text"]

#st.write(image_narration)

def update_and_get_narration(personality, user_input):
    """Re-narrate *user_input* in the voice of *personality* via the chat API.

    Appends the user message and the model's reply to
    ``st.session_state['history']`` only after a successful API call, and
    returns the model's reply. If either argument is falsy, returns a
    prompt string asking for both inputs instead.
    """
    if not (personality and user_input):
        return "Please enter both a personality and some image classification text."

    # Build the full message list up front; history is mutated only after the
    # request succeeds, so a raised API error cannot leave an orphaned user
    # message behind (which would also be duplicated on retry).
    pending_user_msg = {"role": "user", "content": user_input}
    messages = (
        [{"role": "system", "content": f"You reiterate what is said to you but narrate it like a {personality}."}]
        + st.session_state['history']
        + [pending_user_msg]
    )

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )

    gpt_response = response.choices[0].message.content
    # Commit both sides of the exchange to history only on success.
    st.session_state['history'].append(pending_user_msg)
    st.session_state['history'].append({"role": "assistant", "content": gpt_response})

    return gpt_response

# --- Narrate button: generate styled narration, show it, and play it as speech ---
if st.button('Narrate'):
    # Ask the chat model for the persona-styled retelling of the caption.
    narration = update_and_get_narration(personality, image_narration)
    st.write(narration)
    # Synthesize speech in-memory (no temp file) and rewind before playback.
    tts = gTTS(text=narration, lang='en')
    audio_buffer = BytesIO()
    tts.write_to_fp(audio_buffer)
    audio_buffer.seek(0)

    st.audio(audio_buffer, format='audio/mp3', start_time=0)
else:
    # On re-runs without a click, show the most recent narration (if any).
    st.write(st.session_state['history'][-1]['content'] if st.session_state['history'] else "Narration will appear here.")