Spaces:

jhonparra18
/

ocr-LLM-image-summarizer

Runtime error

App Files Files Community

jhonparra18 committed on Aug 30, 2023

Commit

08f01a2

1 Parent(s): cdaaee8

app behaviour

Browse files

Files changed (3) hide show

app.py +17 -17
config.py +1 -1
image_processor.py +26 -9

app.py CHANGED Viewed

@@ -4,24 +4,23 @@ import numpy as np
 from PIL import Image
 from app_utils import TEMP_DIR_NAME,save_uploaded_file,reset_chat
 import os
-import sys
 from text_summarizer import agent,processor
-import logging
-logging.basicConfig(level=logging.INFO)
-BOT_DEFAULT_MSG="Hello 👋 I'm a test AI assistant to help you with your questions about an input file, or feel free to ask me anything"
-st.set_page_config(page_title="Invoice | Receipt LLM Summarizer",layout='wide',page_icon=":shark:")
 #placeholders for a temporary image path and an image processor in case we want to read img text separately
 IMAGE_TMP_PATH=None
 PROCESSOR=processor
 img_text=""
 with st.sidebar:
     st.markdown(
-        f"<h1 style='text-align: center;'> Invoice|Receipt LLM Summarizer using OpenCV+Tesseract+LLM</h1><br><br>",
         unsafe_allow_html=True
     )
     input_image = st.file_uploader(label='Receipt|Invoice Image',help="Upload an image",type=['jpg','png','jpeg'])
@@ -46,7 +45,7 @@ with st.sidebar:
 # Initialize chat history based on streamlit doc for chat applications https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
 if "messages" not in st.session_state:
-    st.session_state.messages = []
     st.session_state.messages.append({"role": "assistant", "content": BOT_DEFAULT_MSG})
@@ -55,22 +54,23 @@ for message in st.session_state.messages:
     with st.chat_message(message["role"]):
         st.markdown(message["content"])
-if prompt := st.chat_input("Write a message to the AI assistant | Escribe un mensaje para el asistente de IA"):
-    st.chat_message("user").markdown(prompt)
     st.session_state.messages.append({"role": "user", "content": prompt})
-    prompt_ad=f'{prompt}, img path: {IMAGE_TMP_PATH}' if (input_image is not None and not inject_text) else f'{prompt} text: {img_text}'
-    logging.info(f'PROMPT: {prompt_ad}')
-    ##streamlit callback https://python.langchain.com/docs/integrations/callbacks/streamlit
-    st_callback = StreamlitCallbackHandler(st.container())
     #hotfix to errors
     try:
         response = agent.run(prompt_ad,callbacks=[st_callback])
     except ValueError as e:
         response = "Sorry i could't understand your last question."
-    with st.chat_message("assistant"):
-        st.markdown(response)
     # Add assistant response to chat history
     st.session_state.messages.append({"role": "assistant", "content": response})

 from PIL import Image
 from app_utils import TEMP_DIR_NAME,save_uploaded_file,reset_chat
 import os
+from pathlib import Path
 from text_summarizer import agent,processor
+BOT_DEFAULT_MSG="Hello 👋 I'm a test AI OCR assistant to help you with your questions about your receipts or similar images containing text. Also feel free to ask me anything"
+st.set_page_config(page_title="OCR+LLM Image summarizer",layout='wide',page_icon=":shark:")
 #placeholders for a temporary image path and an image processor in case we want to read img text separately
 IMAGE_TMP_PATH=None
 PROCESSOR=processor
 img_text=""
+inject_text=False
 with st.sidebar:
     st.markdown(
+        f"<h1 style='text-align: center;'> Invoice|Receipt Summarizer using OpenCV+Tesseract+LLM</h1><br><br>",
         unsafe_allow_html=True
     )
     input_image = st.file_uploader(label='Receipt|Invoice Image',help="Upload an image",type=['jpg','png','jpeg'])
 # Initialize chat history based on streamlit doc for chat applications https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
 if "messages" not in st.session_state:
+    reset_chat()
     st.session_state.messages.append({"role": "assistant", "content": BOT_DEFAULT_MSG})
     with st.chat_message(message["role"]):
         st.markdown(message["content"])
+prompt=st.chat_input("Write a message to the AI assistant | Escribe un mensaje para el asistente de IA")
+if prompt:
     st.session_state.messages.append({"role": "user", "content": prompt})
+    st.chat_message("user").markdown(prompt)
+    prompt_ad=f'{prompt}, img path: {IMAGE_TMP_PATH}' if (input_image is not None and not inject_text) else (f'{prompt} text: {img_text}' if inject_text else prompt)
+    #streamlit callback https://python.langchain.com/docs/integrations/callbacks/streamlit
+    print(f'PROMPT: {prompt_ad}')
+    st_callback = StreamlitCallbackHandler(st.container())
     #hotfix to errors
     try:
         response = agent.run(prompt_ad,callbacks=[st_callback])
     except ValueError as e:
         response = "Sorry i could't understand your last question."
     # Add assistant response to chat history
     st.session_state.messages.append({"role": "assistant", "content": response})
+    st.chat_message("assistant").markdown(response)

config.py CHANGED Viewed

@@ -1,3 +1,3 @@
-PYTESSERACT_DEFAULT_CONFIG= r'--psm 4'
 OPEN_AI_MODEL_NAME="gpt-3.5-turbo-0613" #fine-tuned for function detection see https://python.langchain.com/docs/modules/agents/agent_types/openai_functions_agent
 DEBUG_MODE_LLM=False

+PYTESSERACT_DEFAULT_CONFIG= r'--oem 3 --psm 4'
 OPEN_AI_MODEL_NAME="gpt-3.5-turbo-0613" #fine-tuned for function detection see https://python.langchain.com/docs/modules/agents/agent_types/openai_functions_agent
 DEBUG_MODE_LLM=False

image_processor.py CHANGED Viewed

@@ -7,14 +7,14 @@ import numpy as np
 from langchain.tools import BaseTool
 from typing import Optional, Type
 from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
 class ImageProcessor(BaseTool):
     name = "ImageProcessor"
     description = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."
-    def binarize(self,img_path:str):
         """
         This function is to binarize an input image
         :param img: image in format of (h, w, channel)
@@ -32,17 +32,17 @@ class ImageProcessor(BaseTool):
         new = np.clip(new, 0, 255).astype(np.uint8)
         return new
-    def deskew(self,image):
-        coords = np.column_stack(np.where(image > 0))
         angle = cv2.minAreaRect(coords)[-1]
         if angle < -45:
             angle = -(90 + angle)
         else:
             angle = -angle
-        (h, w) = image.shape[:2]
         center = (w // 2, h // 2)
         M = cv2.getRotationMatrix2D(center, angle, 1.0)
-        rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
         return rotated
     def dilate_erode(self,img):
@@ -60,11 +60,27 @@ class ImageProcessor(BaseTool):
         img= cv2.dilate(img, kernel2, iterations=1)
         return img
     def opening(self,image):
         kernel = np.ones((5,5),np.uint8)
         return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
-    def process_image(self,img_path:str):
         img=self.binarize(img_path)
         img=self.remove_watermark(img)
         return img
@@ -74,17 +90,18 @@ class ImageProcessor(BaseTool):
         return text
     def _run(self,img_path,save_to_disk=False):
-        img=self.process_image(str(img_path))
         text=self.img_to_text(img)
         if save_to_disk:
             with open(f"/tmp/{str(img_path).split('/')[-1].replace('.jpg','.txt')}",'w') as f:
                 f.write(text)
         return text
     # as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
     async def _arun(self, img_path: str,save_to_disk=False, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
     ) -> str:
         """Use the tool asynchronously."""
-        raise NotImplementedError("custom_search does not support async")

 from langchain.tools import BaseTool
 from typing import Optional, Type
 from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
+from PIL import Image
 class ImageProcessor(BaseTool):
     name = "ImageProcessor"
     description = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."
+    def binarize(self,img_path):
         """
         This function is to binarize an input image
         :param img: image in format of (h, w, channel)
         new = np.clip(new, 0, 255).astype(np.uint8)
         return new
+    def deskew(self,img):
+        coords = np.column_stack(np.where(img > 0))
         angle = cv2.minAreaRect(coords)[-1]
         if angle < -45:
             angle = -(90 + angle)
         else:
             angle = -angle
+        (h, w) = img.shape[:2]
         center = (w // 2, h // 2)
         M = cv2.getRotationMatrix2D(center, angle, 1.0)
+        rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
         return rotated
     def dilate_erode(self,img):
         img= cv2.dilate(img, kernel2, iterations=1)
         return img
+    def detect_angle(self,img_path):
+        """detects angle of rotation in the image using the text lines found"""
+        ##taken from https://stackoverflow.com/questions/13872331/rotating-an-image-with-orientation-specified-in-exif-using-python-without-pil-in
+        pil_img=Image.open(img_path)
+        img_exif = pil_img.getexif()
+        if len(img_exif):
+            if img_exif[274] == 3:
+                pil_img = pil_img.transpose(Image.ROTATE_180)
+            elif img_exif[274] == 6:
+                pil_img = pil_img.transpose(Image.ROTATE_270)
+            elif img_exif[274] == 8:
+                pil_img = pil_img.transpose(Image.ROTATE_90)
+        return np.array(pil_img)[:, :, ::-1] #convert to BGR
     def opening(self,image):
         """Apply a morphological opening to ``image`` using a 5x5 kernel of ones."""
         structuring_element = np.ones((5, 5), dtype=np.uint8)
         opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
         return opened
+    def process_image(self,img_path):
         img=self.binarize(img_path)
         img=self.remove_watermark(img)
         return img
         return text
     def _run(self,img_path,save_to_disk=False):
         """
         Preprocess the image at ``img_path``, run OCR on it and return the text.
         :param img_path: location of the image; forwarded to ``process_image``
         :param save_to_disk: when True, also write the text to ``/tmp/<stem>.txt``
             and the processed image to ``images/rotated-<file name>``
         :return: the text produced by ``img_to_text``
         """
         img=self.process_image(img_path)
         text=self.img_to_text(img)
         if save_to_disk:
             # local import keeps this block independent of the module's import list
             from pathlib import Path
             # img_path may arrive as a str or a Path; normalise before taking .name
             source=Path(str(img_path))
             # with_suffix swaps whatever extension was uploaded (.jpg/.jpeg/.png) for .txt
             with open(f"/tmp/{source.with_suffix('.txt').name}",'w',encoding='utf-8') as f:
                 f.write(text)
             # the earlier code referenced the undefined name `img_pth` here (NameError)
             # NOTE(review): cv2.imwrite reports failure via its return value and the
             # images/ directory is not created here - confirm it exists at deploy time
             cv2.imwrite(f"images/rotated-{source.name}",img)
         return text
     # as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
     async def _arun(self, img_path: str,save_to_disk=False, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
     ) -> str:
         """Asynchronous entry point of the tool; not implemented, so it always raises."""
         message = "does not support async"
         raise NotImplementedError(message)