Spaces:
Runtime error
Runtime error
Commit ·
08f01a2
1
Parent(s): cdaaee8
app behaviour
Browse files- app.py +17 -17
- config.py +1 -1
- image_processor.py +26 -9
app.py
CHANGED
|
@@ -4,24 +4,23 @@ import numpy as np
|
|
| 4 |
from PIL import Image
|
| 5 |
from app_utils import TEMP_DIR_NAME,save_uploaded_file,reset_chat
|
| 6 |
import os
|
| 7 |
-
import
|
| 8 |
from text_summarizer import agent,processor
|
| 9 |
-
import logging
|
| 10 |
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
-
st.set_page_config(page_title="Invoice | Receipt LLM Summarizer",layout='wide',page_icon=":shark:")
|
| 15 |
|
| 16 |
#placeholders for the temporary image path and an image processor in case we want to read img text separately
|
| 17 |
IMAGE_TMP_PATH=None
|
| 18 |
PROCESSOR=processor
|
| 19 |
img_text=""
|
|
|
|
| 20 |
|
| 21 |
with st.sidebar:
|
| 22 |
|
| 23 |
st.markdown(
|
| 24 |
-
f"<h1 style='text-align: center;'> Invoice|Receipt
|
| 25 |
unsafe_allow_html=True
|
| 26 |
)
|
| 27 |
input_image = st.file_uploader(label='Receipt|Invoice Image',help="Upload an image",type=['jpg','png','jpeg'])
|
|
@@ -46,7 +45,7 @@ with st.sidebar:
|
|
| 46 |
|
| 47 |
# Initialize chat history based on streamlit doc for chat applications https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
|
| 48 |
if "messages" not in st.session_state:
|
| 49 |
-
|
| 50 |
st.session_state.messages.append({"role": "assistant", "content": BOT_DEFAULT_MSG})
|
| 51 |
|
| 52 |
|
|
@@ -55,22 +54,23 @@ for message in st.session_state.messages:
|
|
| 55 |
with st.chat_message(message["role"]):
|
| 56 |
st.markdown(message["content"])
|
| 57 |
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
-
if prompt := st.chat_input("Write a message to the AI assistant | Escribe un mensaje para el asistente de IA"):
|
| 60 |
-
|
| 61 |
-
st.chat_message("user").markdown(prompt)
|
| 62 |
st.session_state.messages.append({"role": "user", "content": prompt})
|
| 63 |
-
|
| 64 |
-
prompt_ad=f'{prompt}, img path: {IMAGE_TMP_PATH}' if (input_image is not None and not inject_text) else f'{prompt} text: {img_text}'
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
st_callback = StreamlitCallbackHandler(st.container())
|
|
|
|
| 68 |
#hotfix to errors
|
| 69 |
try:
|
| 70 |
response = agent.run(prompt_ad,callbacks=[st_callback])
|
| 71 |
except ValueError as e:
|
| 72 |
response = "Sorry i could't understand your last question."
|
| 73 |
-
|
| 74 |
-
st.markdown(response)
|
| 75 |
# Add assistant response to chat history
|
| 76 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
|
|
|
|
|
| 4 |
from PIL import Image
|
| 5 |
from app_utils import TEMP_DIR_NAME,save_uploaded_file,reset_chat
|
| 6 |
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
from text_summarizer import agent,processor
|
|
|
|
| 9 |
|
| 10 |
+
BOT_DEFAULT_MSG="Hello 👋 I'm a test AI OCR assistant to help you with your questions about your receipts or similar images containing text. Also feel free to ask me anything"
|
| 11 |
|
| 12 |
+
st.set_page_config(page_title="OCR+LLM Image summarizer",layout='wide',page_icon=":shark:")
|
|
|
|
| 13 |
|
| 14 |
#placeholders for the temporary image path and an image processor in case we want to read img text separately
|
| 15 |
IMAGE_TMP_PATH=None
|
| 16 |
PROCESSOR=processor
|
| 17 |
img_text=""
|
| 18 |
+
inject_text=False
|
| 19 |
|
| 20 |
with st.sidebar:
|
| 21 |
|
| 22 |
st.markdown(
|
| 23 |
+
f"<h1 style='text-align: center;'> Invoice|Receipt Summarizer using OpenCV+Tesseract+LLM</h1><br><br>",
|
| 24 |
unsafe_allow_html=True
|
| 25 |
)
|
| 26 |
input_image = st.file_uploader(label='Receipt|Invoice Image',help="Upload an image",type=['jpg','png','jpeg'])
|
|
|
|
| 45 |
|
| 46 |
# Initialize chat history based on streamlit doc for chat applications https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
|
| 47 |
# First script run of a session: no chat history yet. Reset the chat state via
# the app_utils helper, then seed the history with the assistant greeting.
# NOTE(review): presumably reset_chat() creates st.session_state.messages -
# confirm in app_utils, since the append below relies on it.
if "messages" not in st.session_state:
    reset_chat()
    greeting = {"role": "assistant", "content": BOT_DEFAULT_MSG}
    st.session_state.messages.append(greeting)
|
| 50 |
|
| 51 |
|
|
|
|
| 54 |
with st.chat_message(message["role"]):
|
| 55 |
st.markdown(message["content"])
|
| 56 |
|
| 57 |
+
prompt=st.chat_input("Write a message to the AI assistant | Escribe un mensaje para el asistente de IA")

if prompt:
    # Record the user's message in the session history and echo it in the chat.
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").markdown(prompt)

    # Build the prompt actually sent to the agent:
    #  - inject_text set: append the already extracted image text;
    #  - otherwise, if an image was uploaded: append its temporary path so the
    #    agent can call the OCR tool itself;
    #  - otherwise: forward the user's prompt untouched.
    if inject_text:
        prompt_ad=f'{prompt} text: {img_text}'
    elif input_image is not None:
        prompt_ad=f'{prompt}, img path: {IMAGE_TMP_PATH}'
    else:
        prompt_ad=prompt

    #streamlit callback https://python.langchain.com/docs/integrations/callbacks/streamlit
    print(f'PROMPT: {prompt_ad}')
    st_callback = StreamlitCallbackHandler(st.container())

    #hotfix to errors: a ValueError raised by agent.run is turned into an
    #apology message instead of crashing the app
    try:
        response = agent.run(prompt_ad,callbacks=[st_callback])
    except ValueError:
        response = "Sorry, I couldn't understand your last question."

    # Add assistant response to chat history, then render it
    st.session_state.messages.append({"role": "assistant", "content": response})
    st.chat_message("assistant").markdown(response)
|
config.py
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
-
PYTESSERACT_DEFAULT_CONFIG= r'--psm 4'
|
| 2 |
OPEN_AI_MODEL_NAME="gpt-3.5-turbo-0613" #fine-tuned for function detection see https://python.langchain.com/docs/modules/agents/agent_types/openai_functions_agent
|
| 3 |
DEBUG_MODE_LLM=False
|
|
|
|
| 1 |
+
PYTESSERACT_DEFAULT_CONFIG= r'--oem 3 --psm 4'
|
| 2 |
OPEN_AI_MODEL_NAME="gpt-3.5-turbo-0613" #fine-tuned for function detection see https://python.langchain.com/docs/modules/agents/agent_types/openai_functions_agent
|
| 3 |
DEBUG_MODE_LLM=False
|
image_processor.py
CHANGED
|
@@ -7,14 +7,14 @@ import numpy as np
|
|
| 7 |
from langchain.tools import BaseTool
|
| 8 |
from typing import Optional, Type
|
| 9 |
from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
|
| 10 |
-
|
| 11 |
|
| 12 |
class ImageProcessor(BaseTool):
|
| 13 |
|
| 14 |
name = "ImageProcessor"
|
| 15 |
description = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."
|
| 16 |
|
| 17 |
-
def binarize(self,img_path
|
| 18 |
"""
|
| 19 |
This function is to binarize an input image
|
| 20 |
:param img: image in format of (h, w, channel)
|
|
@@ -32,17 +32,17 @@ class ImageProcessor(BaseTool):
|
|
| 32 |
new = np.clip(new, 0, 255).astype(np.uint8)
|
| 33 |
return new
|
| 34 |
|
| 35 |
-
def deskew(self,
|
| 36 |
-
coords = np.column_stack(np.where(
|
| 37 |
angle = cv2.minAreaRect(coords)[-1]
|
| 38 |
if angle < -45:
|
| 39 |
angle = -(90 + angle)
|
| 40 |
else:
|
| 41 |
angle = -angle
|
| 42 |
-
(h, w) =
|
| 43 |
center = (w // 2, h // 2)
|
| 44 |
M = cv2.getRotationMatrix2D(center, angle, 1.0)
|
| 45 |
-
rotated = cv2.warpAffine(
|
| 46 |
return rotated
|
| 47 |
|
| 48 |
def dilate_erode(self,img):
|
|
@@ -60,11 +60,27 @@ class ImageProcessor(BaseTool):
|
|
| 60 |
img= cv2.dilate(img, kernel2, iterations=1)
|
| 61 |
return img
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def opening(self,image):
|
| 64 |
kernel = np.ones((5,5),np.uint8)
|
| 65 |
return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
|
| 66 |
|
| 67 |
-
def process_image(self,img_path
|
| 68 |
img=self.binarize(img_path)
|
| 69 |
img=self.remove_watermark(img)
|
| 70 |
return img
|
|
@@ -74,17 +90,18 @@ class ImageProcessor(BaseTool):
|
|
| 74 |
return text
|
| 75 |
|
| 76 |
def _run(self,img_path,save_to_disk=False):
|
| 77 |
-
img=self.process_image(
|
| 78 |
text=self.img_to_text(img)
|
| 79 |
if save_to_disk:
|
| 80 |
with open(f"/tmp/{str(img_path).split('/')[-1].replace('.jpg','.txt')}",'w') as f:
|
| 81 |
f.write(text)
|
|
|
|
| 82 |
return text
|
| 83 |
|
| 84 |
# as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
|
| 85 |
async def _arun(self, img_path: str,save_to_disk=False, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
|
| 86 |
) -> str:
|
| 87 |
"""Use the tool asynchronously."""
|
| 88 |
-
raise NotImplementedError("
|
| 89 |
|
| 90 |
|
|
|
|
| 7 |
from langchain.tools import BaseTool
|
| 8 |
from typing import Optional, Type
|
| 9 |
from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
|
| 10 |
+
from PIL import Image
|
| 11 |
|
| 12 |
class ImageProcessor(BaseTool):
|
| 13 |
|
| 14 |
name = "ImageProcessor"
|
| 15 |
description = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."
|
| 16 |
|
| 17 |
+
def binarize(self,img_path):
|
| 18 |
"""
|
| 19 |
This function is to binarize an input image
|
| 20 |
:param img: image in format of (h, w, channel)
|
|
|
|
| 32 |
new = np.clip(new, 0, 255).astype(np.uint8)
|
| 33 |
return new
|
| 34 |
|
| 35 |
+
def deskew(self,img):
    """Rotate ``img`` so that its foreground content is axis-aligned.

    The rotation angle is taken from the minimum-area rectangle enclosing
    every pixel whose value is greater than zero.

    :param img: image array; assumes a single-channel (h, w) image so that
        ``np.where`` yields 2-D coordinates - TODO confirm against callers.
    :return: the rotated image (same width/height), or ``img`` unchanged when
        it contains no foreground pixels.
    """
    coords = np.column_stack(np.where(img > 0))
    if coords.size == 0:
        # Nothing to measure: minAreaRect cannot work on an empty point set.
        return img

    # Last element of the minAreaRect result is the rectangle's angle.
    # NOTE(review): the -45 threshold assumes angles reported in [-90, 0);
    # newer OpenCV releases may use a different range - confirm.
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle

    (h, w) = img.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    # Replicate the border so the corners exposed by the rotation are filled
    # with neighbouring pixel values.
    rotated = cv2.warpAffine(img, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated
|
| 47 |
|
| 48 |
def dilate_erode(self,img):
|
|
|
|
| 60 |
img= cv2.dilate(img, kernel2, iterations=1)
|
| 61 |
return img
|
| 62 |
|
| 63 |
+
|
| 64 |
+
def detect_angle(self,img_path):
    """Load the image at ``img_path``, undo its EXIF rotation and return it as BGR.

    Despite the name, no angle is computed from the image content: the only
    information used is the EXIF Orientation tag (274). Orientation values
    3, 6 and 8 are corrected; any other value, or a missing tag, leaves the
    image as stored.

    :param img_path: path (or file object) accepted by ``PIL.Image.open``.
    :return: ``numpy`` array of shape (h, w, 3) with channels in BGR order.
    """
    ##taken from https://stackoverflow.com/questions/13872331/rotating-an-image-with-orientation-specified-in-exif-using-python-without-pil-in
    rotations = {
        3: Image.ROTATE_180,
        6: Image.ROTATE_270,
        8: Image.ROTATE_90,
    }
    with Image.open(img_path) as pil_img:
        # .get() instead of [274]: EXIF data may exist without an Orientation tag.
        orientation = pil_img.getexif().get(274)
        if orientation in rotations:
            pil_img = pil_img.transpose(rotations[orientation])
        # Force three channels so the channel flip below is valid for
        # grayscale/palette/RGBA inputs as well.
        rgb = np.array(pil_img.convert("RGB"))

    return rgb[:, :, ::-1] #convert to BGR
|
| 78 |
+
|
| 79 |
def opening(self,image):
    """Return ``image`` after a morphological opening with a 5x5 all-ones uint8 kernel."""
    structuring_element = np.ones(shape=(5, 5), dtype=np.uint8)
    opened = cv2.morphologyEx(image, cv2.MORPH_OPEN, structuring_element)
    return opened
|
| 82 |
|
| 83 |
+
def process_image(self,img_path):
    """Preprocess the image at ``img_path`` for OCR.

    Runs ``self.binarize`` on the path and passes its result through
    ``self.remove_watermark``.

    :param img_path: path of the image to preprocess.
    :return: whatever ``self.remove_watermark`` returns for the binarized image.
    """
    binarized = self.binarize(img_path)
    return self.remove_watermark(binarized)
|
|
|
|
| 90 |
return text
|
| 91 |
|
| 92 |
def _run(self,img_path,save_to_disk=False):
|
| 93 |
+
img=self.process_image(img_path)
|
| 94 |
text=self.img_to_text(img)
|
| 95 |
if save_to_disk:
|
| 96 |
with open(f"/tmp/{str(img_path).split('/')[-1].replace('.jpg','.txt')}",'w') as f:
|
| 97 |
f.write(text)
|
| 98 |
+
cv2.imwrite(f"images/rotated-{img_pth.name}",img)
|
| 99 |
return text
|
| 100 |
|
| 101 |
# as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
|
| 102 |
async def _arun(
    self,
    img_path: str,
    save_to_disk=False,
    run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
) -> str:
    """Asynchronous counterpart of ``_run``; not implemented, always raises.

    :raises NotImplementedError: on every call.
    """
    raise NotImplementedError("does not support async")
|
| 106 |
|
| 107 |
|