jhonparra18 committed on
Commit
2804e13
·
1 Parent(s): f82ebcc

app definition

Browse files
Files changed (7) hide show
  1. .streamlit/config.toml +5 -0
  2. app.py +65 -0
  3. app_utils.py +14 -0
  4. config.py +3 -0
  5. image_processor.py +100 -0
  6. requirements.txt +8 -0
  7. text_summarizer.py +53 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [theme]
2
+ base = "dark"
3
+
4
+ [server]
5
+ runOnSave = true
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.callbacks import StreamlitCallbackHandler
3
+ import numpy as np
4
+ from PIL import Image
5
+ from app_utils import TEMP_DIR_NAME,save_uploaded_file,reset_chat
6
+ import os
7
+ import sys
8
+ from text_summarizer import agent
9
+
10
+ BOT_DEFAULT_MSG="Hello 👋 I'm a test AI assistant to help you with your questions about an input file, or feel free to ask me anything"
11
+ st.set_page_config(page_title="Invoice|Receipt LLM Summarizer",layout='wide',page_icon=":shark:")
12
+ IMAGE_TMP_PATH=None
13
+
14
+ with st.sidebar:
15
+
16
+ st.markdown(
17
+ f"<h1 style='text-align: center;'> Invoice|Receipt LLM Summarizer using OpenCV+Tesseract+LLM</h1><br><br>",
18
+ unsafe_allow_html=True
19
+ )
20
+ input_image = st.file_uploader(label='Receipt|Invoice Image',help="Upload an image",type=['jpg','png','jpeg'])
21
+
22
+ if input_image is not None:
23
+ save_uploaded_file(input_image)
24
+ IMAGE_TMP_PATH=os.path.join(TEMP_DIR_NAME,input_image.name)
25
+ st.markdown(f"<h1 style='text-align: center;'> Image Uploaded and saved<br>",unsafe_allow_html=True)
26
+ st.image(Image.open(IMAGE_TMP_PATH))
27
+ st.markdown("***")
28
+
29
+ st.button("Reset Chat History", type="secondary", on_click=reset_chat,use_container_width=True)
30
+ st.markdown("[![Foo](https://img.icons8.com/material-outlined/96/000000/github.png)](https://github.com/statscol/invoice-llm-summarizer)")
31
+
32
+
33
+ # Initialize chat history based on streamlit doc for chat applications https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
34
+ if "messages" not in st.session_state:
35
+ st.session_state.messages = []
36
+
37
+
38
+
39
+ # Display chat messages from history on app rerun
40
+ for message in st.session_state.messages:
41
+ with st.chat_message(message["role"]):
42
+ st.markdown(message["content"])
43
+
44
+ # Set default message on chat
45
+ with st.chat_message("assistant"):
46
+ st.write(BOT_DEFAULT_MSG)
47
+
48
+
49
+ if prompt := st.chat_input("Write a message to the AI assistant | Escribe un mensaje para el asistente de IA"):
50
+
51
+ st.chat_message("user").markdown(prompt)
52
+ st.session_state.messages.append({"role": "user", "content": prompt})
53
+ agent.memory.memory_variables()
54
+ prompt_ad=f'{prompt}, image path: {IMAGE_TMP_PATH}' if input_image is not None else prompt
55
+ ##streamlit callback https://python.langchain.com/docs/integrations/callbacks/streamlit
56
+ st_callback = StreamlitCallbackHandler(st.container())
57
+ #hotfix to errors
58
+ try:
59
+ response = response = agent.run(prompt_ad,callbacks=[st_callback])
60
+ except ValueError as e:
61
+ response = "Sorry i could't understand your last question."
62
+ with st.chat_message("assistant"):
63
+ st.markdown(response)
64
+ # Add assistant response to chat history
65
+ st.session_state.messages.append({"role": "assistant", "content": response})
app_utils.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import streamlit as st
4
+
5
+
6
+ TEMP_DIR_NAME="/tmp/"
7
+
8
+ def save_uploaded_file(uploadedfile):
9
+ with open(os.path.join(TEMP_DIR_NAME,uploadedfile.name),"wb") as f:
10
+ f.write(uploadedfile.getbuffer())
11
+ return st.success("Saved File:{} to {}".format(uploadedfile.name,TEMP_DIR_NAME))
12
+
13
+ def reset_chat():
14
+ st.session_state.messages = []
config.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ PYTESSERACT_DEFAULT_CONFIG= r'--psm 4'
2
+ OPEN_AI_MODEL_NAME="gpt-3.5-turbo-0613" #fine-tuned for function detection see https://python.langchain.com/docs/modules/agents/agent_types/openai_functions_agent
3
+ DEBUG_MODE_LLM=False
image_processor.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import pytesseract
3
+ from config import PYTESSERACT_DEFAULT_CONFIG
4
+ from pathlib import Path
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ from langchain.tools import BaseTool
8
+ from typing import Optional, Type
9
+ from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
10
+
11
+
12
+ class ImageProcessor(BaseTool):
13
+
14
+ name = "ImageProcessor"
15
+ description = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."
16
+
17
+ def binarize(self,img_path:str):
18
+ """
19
+ This function is to binarize an input image
20
+ :param img: image in format of (h, w, channel)
21
+ :return: am image in format of (h, w)
22
+ """
23
+ img=cv2.imread(img_path)
24
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
25
+ #gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] #threshold may affect performance for invoices|receipts as seen in our test dataset
26
+ return gray
27
+
28
+ def remove_watermark(self,img,alpha = 1.8,beta = -180):
29
+ """remove watermark from image
30
+ img: cv2 image| np.array"""
31
+ new = alpha * img + beta
32
+ new = np.clip(new, 0, 255).astype(np.uint8)
33
+ return new
34
+
35
+ def deskew(self,image):
36
+ coords = np.column_stack(np.where(image > 0))
37
+ angle = cv2.minAreaRect(coords)[-1]
38
+ if angle < -45:
39
+ angle = -(90 + angle)
40
+ else:
41
+ angle = -angle
42
+ (h, w) = image.shape[:2]
43
+ center = (w // 2, h // 2)
44
+ M = cv2.getRotationMatrix2D(center, angle, 1.0)
45
+ rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
46
+ return rotated
47
+
48
+ def dilate_erode(self,img):
49
+ """
50
+ apply an erosion and dilation kernel
51
+ img: cv2 image| np.array
52
+ """
53
+ kernel = np.ones((2, 1), np.uint8)
54
+ kernel2 = np.ones((1, 1), np.uint8)
55
+ img = cv2.blur(img,(6,5))
56
+ img=cv2.dilate(img, kernel, iterations=3)
57
+ img = cv2.erode(img, (2,1), iterations=1)
58
+ img = cv2.blur(img,(1,1))
59
+ img = cv2.bilateralFilter(img,10,35,30)
60
+ img= cv2.dilate(img, kernel2, iterations=1)
61
+ return img
62
+
63
+ def opening(self,image):
64
+ kernel = np.ones((5,5),np.uint8)
65
+ return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
66
+
67
+ def process_image(self,img_path:str):
68
+ img=self.binarize(img_path)
69
+ img=self.remove_watermark(img)
70
+ return img
71
+
72
+ def img_to_text(self,img,lang="spa"):
73
+ text=pytesseract.image_to_string(img,lang=lang,config=PYTESSERACT_DEFAULT_CONFIG)
74
+ return text
75
+
76
+ def _run(self,img_path):
77
+ img=self.process_image(str(img_path))
78
+ text=self.img_to_text(img)
79
+ return text
80
+
81
+ # as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
82
+ async def _arun(self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
83
+ ) -> str:
84
+ """Use the tool asynchronously."""
85
+ raise NotImplementedError("custom_search does not support async")
86
+
87
+
88
+
89
+ if __name__=="__main__":
90
+ processor=ImageProcessor()
91
+ image_paths=list(Path("images/raw").glob("*.jpg"))
92
+ for img_pth in tqdm(image_paths,desc="Img Preproc+ OCR "):
93
+ img_processed=processor.process_image(str(img_pth))
94
+ text=processor.run(str(img_pth))
95
+ cv2.imwrite(f"images/processed/{img_pth.name}",img_processed)
96
+ with open(f"images/text/{img_pth.name.replace('.jpg','.txt')}",'w') as f:
97
+ f.write(text)
98
+
99
+
100
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ opencv-python
2
+ numpy
3
+ requests
4
+ langchain
5
+ openai
6
+ pytesseract
7
+ python-dotenv
8
+ streamlit
text_summarizer.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import langchain
2
+ from langchain.schema import SystemMessage
3
+ from langchain.agents import OpenAIFunctionsAgent,initialize_agent
4
+ from langchain.agents import AgentType
5
+ from langchain.chat_models import ChatOpenAI
6
+ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
7
+ from dotenv import load_dotenv
8
+ from config import OPEN_AI_MODEL_NAME,DEBUG_MODE_LLM
9
+ from image_processor import ImageProcessor
10
+
11
+ langchain.debug = DEBUG_MODE_LLM
12
+ load_dotenv()
13
+
14
+ #img preproc and ocr helper
15
+ processor=ImageProcessor()
16
+ system_message = SystemMessage(content="""You are an expert invoice, receipt summarizer, you're supposed to analyze every text in english or spanish and return data like restaurant name, items or products bought and its price as well as the total amount, however you cannot read images so you must use a tool to convert and image to text""")
17
+
18
+
19
+ #initial system prompt
20
+ prompt = OpenAIFunctionsAgent.create_prompt(system_message=system_message)
21
+ #define LLM to use
22
+ llm = ChatOpenAI(temperature=0.1, model=OPEN_AI_MODEL_NAME,)
23
+
24
+ #tools to use as functions to trigger from the llm
25
+ tools = [
26
+ ImageProcessor()
27
+ ]
28
+
29
+ #memory placeholder
30
+ conversational_memory = ConversationBufferWindowMemory(
31
+ memory_key='chat_history',
32
+ k=5,
33
+ return_messages=True
34
+ )
35
+
36
+ llm = ChatOpenAI(
37
+ temperature=0,
38
+ model_name=OPEN_AI_MODEL_NAME,
39
+ max_tokens=2048
40
+ )
41
+
42
+
43
+ agent = initialize_agent(
44
+ agent=AgentType.OPENAI_FUNCTIONS, ## does not use memory
45
+ tools=tools,
46
+ llm=llm,
47
+ max_iterations=5,
48
+ verbose=False,
49
+ memory=conversational_memory,
50
+ early_stopping_method='generate',
51
+ prompt=prompt
52
+ )
53
+ ##TO DO, Remove agent and test sequential chain