File size: 1,678 Bytes
c4ceecb 764cb13 c4ceecb 764cb13 c4ceecb 764cb13 c4ceecb 764cb13 c4ceecb 764cb13 c4ceecb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 |
from dotenv import find_dotenv, load_dotenv
from transformers import pipeline
import os
import requests
import streamlit as st
# Locate a .env file and load its entries before any setting is read below.
_dotenv_path = find_dotenv()
load_dotenv(_dotenv_path)

# API token sent as the Bearer credential by text_to_speech();
# None when the variable is not set.
HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")

# Captioning pipeline shared by every img_to_text() call; built once at import.
pipe = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
# Image to text.
def img_to_text(url):
    """Return the caption generated for the image at *url*.

    Passes *url* to the module-level ``pipe``, takes the ``generated_text``
    field of the first result, prints it to stdout, and returns it.
    """
    results = pipe(url)
    caption = results[0]["generated_text"]
    print(caption)
    return caption
def text_to_speech(message):
    """Convert *message* to speech and save the audio as ``audio.flac``.

    Posts the text to the Hugging Face Inference API endpoint for the
    ``espnet/kan-bayashi_ljspeech_vits`` model, authenticating with
    ``HUGGINGFACE_API_TOKEN``, and writes the response body to
    ``audio.flac`` in the current working directory.

    Args:
        message: Text sent as the request's ``inputs`` field.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within the timeout.
    """
    API_URL = "https://api-inference.huggingface.co/models/espnet/kan-bayashi_ljspeech_vits"
    headers = {"Authorization": f"Bearer {HUGGINGFACE_API_TOKEN}"}
    payloads = {"inputs": message}

    # requests has no default timeout; without one a stalled connection
    # would block the app indefinitely.
    response = requests.post(API_URL, headers=headers, json=payloads, timeout=60)

    # Fail loudly on an error status instead of saving the error body
    # under an audio file name.
    response.raise_for_status()

    with open("audio.flac", "wb") as file:
        file.write(response.content)
def main():
    """Render the Streamlit page: upload an image, caption it, speak the caption.

    Shows a header image in the middle of three columns and a file uploader.
    Once a file is uploaded, it is saved to the working directory, displayed,
    captioned with ``img_to_text``, converted to speech with
    ``text_to_speech``, and the caption and resulting ``audio.flac`` are
    shown on the page.
    """
    st.set_page_config(page_title="Image to Text", page_icon="🎙️")
    st.header("Image to Text")

    # Header image, centred by placing it in the middle of three columns.
    image = "narrator.jpeg"
    _, cent_co, _ = st.columns(3)
    with cent_co:
        st.image(image=image)

    uploaded_file = st.file_uploader("Choose an image: ", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        print(uploaded_file)

        # The file name is supplied by the uploader; keep only its final
        # path component so the write cannot escape the working directory.
        image_path = os.path.basename(uploaded_file.name)

        # img_to_text is given a path, so persist the upload to disk first.
        bytes_data = uploaded_file.getvalue()
        with open(image_path, "wb") as file:
            file.write(bytes_data)

        st.image(uploaded_file, caption='Uploaded image', use_column_width=True)

        scenario = img_to_text(image_path)
        text_to_speech(scenario)

        with st.expander("scenario"):
            st.write(scenario)
        # text_to_speech wrote its output to this fixed file name.
        st.audio("audio.flac")
# Run the app only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()