from dotenv import find_dotenv, load_dotenv
from transformers import pipeline
import os
import requests
import streamlit as st

# Load the Hugging Face API token from a .env file so it stays out of source control.
load_dotenv(find_dotenv())
HUGGINGFACE_API_TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")

# Captioning pipeline is built once at import time (model load is expensive).
pipe = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")


# img to text
def img_to_text(url: str) -> str:
    """Return a generated caption for the image at *url* (local path or URL).

    Args:
        url: Path/URL of an image the BLIP captioning pipeline can read.

    Returns:
        The generated caption string.
    """
    text = pipe(url)[0]["generated_text"]
    print(text)
    return text


def text_to_speech(message: str) -> None:
    """Synthesize *message* with the HF inference API and save it to ``audio.flac``.

    Args:
        message: Text to convert to speech.

    Raises:
        requests.HTTPError: If the inference API returns an error status —
            previously the error payload was silently written into the audio
            file, which then failed to play.
    """
    api_url = (
        "https://api-inference.huggingface.co/models/"
        "espnet/kan-bayashi_ljspeech_vits"
    )
    headers = {"Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}"}
    payloads = {"inputs": message}
    # Timeout prevents the Streamlit app from hanging forever on a stalled call.
    response = requests.post(api_url, headers=headers, json=payloads, timeout=60)
    response.raise_for_status()  # fail loudly instead of saving an error body
    with open("audio.flac", "wb") as file:
        file.write(response.content)


def main() -> None:
    """Streamlit app: caption an uploaded image and narrate the caption aloud."""
    st.set_page_config(page_title="Image to Text", page_icon="🎙️")
    st.header("Image to Text")

    # Decorative hero image, centred via the middle of three columns.
    image = "narrator.jpeg"
    left_co, cent_co, last_co = st.columns(3)
    with cent_co:
        st.image(image=image)

    uploaded_file = st.file_uploader("Choose an image: ", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        # Persist the upload to disk so the captioning pipeline can read it by path.
        bytes_data = uploaded_file.getvalue()
        with open(uploaded_file.name, "wb") as file:
            file.write(bytes_data)
        st.image(uploaded_file, caption='Uploaded image', use_column_width=True)

        scenario = img_to_text(uploaded_file.name)
        text_to_speech(scenario)

        # Label typo fixed ("scenatio" -> "scenario").
        with st.expander("scenario"):
            st.write(scenario)
        st.audio("audio.flac")


if __name__ == "__main__":
    main()