Spaces:
Sleeping
Sleeping
File size: 5,206 Bytes
bed1057 53b8f4b 28e97a7 bed1057 0cd5480 bed1057 28e97a7 bed1057 0758797 bed1057 511942b bed1057 511942b e384767 b7a3caf 28e97a7 bed1057 b7a3caf 28e97a7 53b8f4b 28e97a7 b7a3caf 28e97a7 b7a3caf 53b8f4b b7a3caf 8f6fd08 511942b d471962 511942b bed1057 b7a3caf bed1057 b7a3caf bed1057 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import streamlit as st
from PIL import Image
import os
import base64
import fitz # PyMuPDF
from helper import (
custom_file_uploader, resize_image, convert_image_to_base64, post_request_and_parse_response,
draw_bounding_boxes_for_textract, extract_text_from_textract_blocks, ChatGPTClient
)
import io
# --- Configuration -------------------------------------------------------
# Credentials and endpoints come from the environment. A missing variable
# raises KeyError immediately — intentional fail-fast at startup.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
TEXTRACT_API_URL = os.environ["TEXTRACT_API_URL"]

st.set_page_config(page_title="💬 Chat with OCR 📝", layout="wide")

# Chat history lives in session state so it survives Streamlit reruns.
st.session_state.setdefault("messages", [])
# --- Sidebar: upload, OCR, and result display ----------------------------
with st.sidebar:
    st.title("🖼️ Upload and Display Images")
    st.warning("Please upload an image or a single-page PDF file!")
    # FIX: the original passed type=['PDF'], which made the uploader reject
    # image files even though the else-branch below explicitly handles them
    # (and the warning above promises image support).
    uploaded_file = st.file_uploader(
        "Upload an Image or PDF",
        type=["png", "jpg", "jpeg", "pdf"],
        label_visibility="collapsed",
    )

    pil_image = None
    if uploaded_file:
        if uploaded_file.type == "application/pdf":
            # Render the first (and only) page of the PDF to a PIL image.
            try:
                pdf_bytes = uploaded_file.read()
                pdf_document = fitz.open(stream=pdf_bytes, filetype="pdf")
                if pdf_document.page_count != 1:
                    st.warning("Please upload a PDF with only one page!")
                else:
                    page = pdf_document.load_page(0)
                    pix = page.get_pixmap()
                    image_bytes = pix.tobytes()  # PNG-encoded by default
                    pil_image = Image.open(io.BytesIO(image_bytes))
            except Exception as e:
                # Surface conversion failures in the UI instead of crashing.
                st.error(f"Failed to convert PDF to image: {e}")
        else:
            # Any non-PDF upload is treated as an image file.
            pil_image = Image.open(uploaded_file)

    if pil_image:
        resized_image = resize_image(pil_image)
        with st.expander("Original Image", expanded=False):
            st.image(pil_image, caption="Uploaded Image", use_column_width=True)

        # OCR: ship the resized image to the Textract endpoint as base64.
        image_base64 = convert_image_to_base64(resized_image)
        payload = {"image": image_base64}
        result_dict = post_request_and_parse_response(TEXTRACT_API_URL, payload)

        # Overlay Textract's bounding boxes on a copy, keeping the original intact.
        image_with_boxes = draw_bounding_boxes_for_textract(resized_image.copy(), result_dict)
        with st.expander("Image with Bounding Boxes", expanded=True):
            st.image(image_with_boxes, caption="Image with Bounding Boxes", use_column_width=True)

        # Flatten Textract's block structure into plain text for the chat context.
        cleaned_up_body = extract_text_from_textract_blocks(result_dict['body'])

        # Raw JSON response, collapsed by default.
        with st.expander("View JSON Body", expanded=False):
            st.json(result_dict)
        # Extracted text, expanded by default so the user can sanity-check it.
        with st.expander("View Cleaned-up Text", expanded=True):
            st.text_area("Cleaned-up Text", cleaned_up_body, height=200, key="cleaned_text_area", help="Drag the bottom right corner to resize")
# Spacer so the "Clear Session" button sits toward the bottom of the sidebar.
st.sidebar.markdown("<br><br><br><br>", unsafe_allow_html=True)

# Wipe the stored conversation when the user asks for a fresh start.
clear_requested = st.sidebar.button("Clear Session")
if clear_requested:
    st.session_state.messages = []
# --- Main chat area ------------------------------------------------------
st.title("Chat with OCR Output")

# Replay the stored conversation so it persists across Streamlit reruns.
for past in st.session_state.messages:
    role, content = past["role"], past["content"]
    with st.chat_message(role):
        st.markdown(content)
# Build the ChatGPT client only once an image has actually been processed.
# Its history is seeded with the stored conversation plus the OCR text.
if uploaded_file and pil_image:
    seeded_history = list(st.session_state.messages)
    if cleaned_up_body:
        # NOTE(review): the OCR text is appended at the END of the history as a
        # system message — confirm ChatGPTClient expects it there, not first.
        seeded_history.append({"role": "system", "content": cleaned_up_body})
    bot = ChatGPTClient(
        api_key=OPENAI_API_KEY,
        protocol="You are fed with the text portion of json file that come out of OCR after scanning an image. User will ask you questions about this json file.",
        body=cleaned_up_body,
    )
    # Hand the combined context to the client.
    bot.history = seeded_history
# Handle a new message from the chat input box.
if prompt := st.chat_input("Ask me about the image"):
    # Echo the user's message and persist it in the session history.
    st.chat_message("user").markdown(prompt)
    st.session_state.messages.append({"role": "user", "content": prompt})

    # Only answer via the bot when an image was uploaded and processed;
    # otherwise prompt the user to upload one first.
    answer = (
        bot.generate_response(prompt)
        if uploaded_file and pil_image
        else "Please upload an image before asking questions."
    )

    # Show and persist the assistant's reply.
    st.chat_message("assistant").markdown(answer)
    st.session_state.messages.append({"role": "assistant", "content": answer})
|