jhonparra18 committed on
Commit
2804e13
·
1 Parent(s): f82ebcc

app definition

Browse files
Files changed (7) hide show
  1. .streamlit/config.toml +5 -0
  2. app.py +65 -0
  3. app_utils.py +14 -0
  4. config.py +3 -0
  5. image_processor.py +100 -0
  6. requirements.txt +8 -0
  7. text_summarizer.py +53 -0
.streamlit/config.toml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [theme]
2
+ base = "dark"
3
+
4
+ [server]
5
+ runOnSave = true
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain.callbacks import StreamlitCallbackHandler
3
+ import numpy as np
4
+ from PIL import Image
5
+ from app_utils import TEMP_DIR_NAME,save_uploaded_file,reset_chat
6
+ import os
7
+ import sys
8
+ from text_summarizer import agent
9
+
10
+ BOT_DEFAULT_MSG="Hello 👋 I'm a test AI assistant to help you with your questions about an input file, or feel free to ask me anything"
11
+ st.set_page_config(page_title="Invoice|Receipt LLM Summarizer",layout='wide',page_icon=":shark:")
12
+ IMAGE_TMP_PATH=None
13
+
14
+ with st.sidebar:
15
+
16
+ st.markdown(
17
+ f"<h1 style='text-align: center;'> Invoice|Receipt LLM Summarizer using OpenCV+Tesseract+LLM</h1><br><br>",
18
+ unsafe_allow_html=True
19
+ )
20
+ input_image = st.file_uploader(label='Receipt|Invoice Image',help="Upload an image",type=['jpg','png','jpeg'])
21
+
22
+ if input_image is not None:
23
+ save_uploaded_file(input_image)
24
+ IMAGE_TMP_PATH=os.path.join(TEMP_DIR_NAME,input_image.name)
25
+ st.markdown(f"<h1 style='text-align: center;'> Image Uploaded and saved<br>",unsafe_allow_html=True)
26
+ st.image(Image.open(IMAGE_TMP_PATH))
27
+ st.markdown("***")
28
+
29
+ st.button("Reset Chat History", type="secondary", on_click=reset_chat,use_container_width=True)
30
+ st.markdown("[![Foo](https://img.icons8.com/material-outlined/96/000000/github.png)](https://github.com/statscol/invoice-llm-summarizer)")
31
+
32
+
33
+ # Initialize chat history based on streamlit doc for chat applications https://docs.streamlit.io/knowledge-base/tutorials/build-conversational-apps
34
+ if "messages" not in st.session_state:
35
+ st.session_state.messages = []
36
+
37
+
38
+
39
+ # Display chat messages from history on app rerun
40
+ for message in st.session_state.messages:
41
+ with st.chat_message(message["role"]):
42
+ st.markdown(message["content"])
43
+
44
+ # Set default message on chat
45
+ with st.chat_message("assistant"):
46
+ st.write(BOT_DEFAULT_MSG)
47
+
48
+
49
+ if prompt := st.chat_input("Write a message to the AI assistant | Escribe un mensaje para el asistente de IA"):
50
+
51
+ st.chat_message("user").markdown(prompt)
52
+ st.session_state.messages.append({"role": "user", "content": prompt})
53
+ agent.memory.memory_variables()
54
+ prompt_ad=f'{prompt}, image path: {IMAGE_TMP_PATH}' if input_image is not None else prompt
55
+ ##streamlit callback https://python.langchain.com/docs/integrations/callbacks/streamlit
56
+ st_callback = StreamlitCallbackHandler(st.container())
57
+ #hotfix to errors
58
+ try:
59
+ response = response = agent.run(prompt_ad,callbacks=[st_callback])
60
+ except ValueError as e:
61
+ response = "Sorry i could't understand your last question."
62
+ with st.chat_message("assistant"):
63
+ st.markdown(response)
64
+ # Add assistant response to chat history
65
+ st.session_state.messages.append({"role": "assistant", "content": response})
app_utils.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import streamlit as st
4
+
5
+
6
+ TEMP_DIR_NAME="/tmp/"
7
+
8
+ def save_uploaded_file(uploadedfile):
9
+ with open(os.path.join(TEMP_DIR_NAME,uploadedfile.name),"wb") as f:
10
+ f.write(uploadedfile.getbuffer())
11
+ return st.success("Saved File:{} to {}".format(uploadedfile.name,TEMP_DIR_NAME))
12
+
13
+ def reset_chat():
14
+ st.session_state.messages = []
config.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ PYTESSERACT_DEFAULT_CONFIG= r'--psm 4'
2
+ OPEN_AI_MODEL_NAME="gpt-3.5-turbo-0613" #fine-tuned for function detection see https://python.langchain.com/docs/modules/agents/agent_types/openai_functions_agent
3
+ DEBUG_MODE_LLM=False
image_processor.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import pytesseract
3
+ from config import PYTESSERACT_DEFAULT_CONFIG
4
+ from pathlib import Path
5
+ from tqdm import tqdm
6
+ import numpy as np
7
+ from langchain.tools import BaseTool
8
+ from typing import Optional, Type
9
+ from langchain.callbacks.manager import AsyncCallbackManagerForToolRun
10
+
11
+
12
+ class ImageProcessor(BaseTool):
13
+
14
+ name = "ImageProcessor"
15
+ description = "useful when you need to extract info from an image in an img_path corresponding to a receipt or invoice and tries to preprocess it returning all the text in the image using an OCR system."
16
+
17
+ def binarize(self,img_path:str):
18
+ """
19
+ This function is to binarize an input image
20
+ :param img: image in format of (h, w, channel)
21
+ :return: am image in format of (h, w)
22
+ """
23
+ img=cv2.imread(img_path)
24
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
25
+ #gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] #threshold may affect performance for invoices|receipts as seen in our test dataset
26
+ return gray
27
+
28
+ def remove_watermark(self,img,alpha = 1.8,beta = -180):
29
+ """remove watermark from image
30
+ img: cv2 image| np.array"""
31
+ new = alpha * img + beta
32
+ new = np.clip(new, 0, 255).astype(np.uint8)
33
+ return new
34
+
35
+ def deskew(self,image):
36
+ coords = np.column_stack(np.where(image > 0))
37
+ angle = cv2.minAreaRect(coords)[-1]
38
+ if angle < -45:
39
+ angle = -(90 + angle)
40
+ else:
41
+ angle = -angle
42
+ (h, w) = image.shape[:2]
43
+ center = (w // 2, h // 2)
44
+ M = cv2.getRotationMatrix2D(center, angle, 1.0)
45
+ rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
46
+ return rotated
47
+
48
+ def dilate_erode(self,img):
49
+ """
50
+ apply an erosion and dilation kernel
51
+ img: cv2 image| np.array
52
+ """
53
+ kernel = np.ones((2, 1), np.uint8)
54
+ kernel2 = np.ones((1, 1), np.uint8)
55
+ img = cv2.blur(img,(6,5))
56
+ img=cv2.dilate(img, kernel, iterations=3)
57
+ img = cv2.erode(img, (2,1), iterations=1)
58
+ img = cv2.blur(img,(1,1))
59
+ img = cv2.bilateralFilter(img,10,35,30)
60
+ img= cv2.dilate(img, kernel2, iterations=1)
61
+ return img
62
+
63
+ def opening(self,image):
64
+ kernel = np.ones((5,5),np.uint8)
65
+ return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)
66
+
67
+ def process_image(self,img_path:str):
68
+ img=self.binarize(img_path)
69
+ img=self.remove_watermark(img)
70
+ return img
71
+
72
+ def img_to_text(self,img,lang="spa"):
73
+ text=pytesseract.image_to_string(img,lang=lang,config=PYTESSERACT_DEFAULT_CONFIG)
74
+ return text
75
+
76
+ def _run(self,img_path):
77
+ img=self.process_image(str(img_path))
78
+ text=self.img_to_text(img)
79
+ return text
80
+
81
+ # as used in langchain documentation https://python.langchain.com/docs/modules/agents/tools/custom_tools
82
+ async def _arun(self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None
83
+ ) -> str:
84
+ """Use the tool asynchronously."""
85
+ raise NotImplementedError("custom_search does not support async")
86
+
87
+
88
+
89
+ if __name__=="__main__":
90
+ processor=ImageProcessor()
91
+ image_paths=list(Path("images/raw").glob("*.jpg"))
92
+ for img_pth in tqdm(image_paths,desc="Img Preproc+ OCR "):
93
+ img_processed=processor.process_image(str(img_pth))
94
+ text=processor.run(str(img_pth))
95
+ cv2.imwrite(f"images/processed/{img_pth.name}",img_processed)
96
+ with open(f"images/text/{img_pth.name.replace('.jpg','.txt')}",'w') as f:
97
+ f.write(text)
98
+
99
+
100
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ opencv-python
2
+ numpy
3
+ requests
4
+ langchain
5
+ openai
6
+ pytesseract
7
+ python-dotenv
8
+ streamlit
text_summarizer.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import langchain
2
+ from langchain.schema import SystemMessage
3
+ from langchain.agents import OpenAIFunctionsAgent,initialize_agent
4
+ from langchain.agents import AgentType
5
+ from langchain.chat_models import ChatOpenAI
6
+ from langchain.chains.conversation.memory import ConversationBufferWindowMemory
7
+ from dotenv import load_dotenv
8
+ from config import OPEN_AI_MODEL_NAME,DEBUG_MODE_LLM
9
+ from image_processor import ImageProcessor
10
+
11
+ langchain.debug = DEBUG_MODE_LLM
12
+ load_dotenv()
13
+
14
+ #img preproc and ocr helper
15
+ processor=ImageProcessor()
16
+ system_message = SystemMessage(content="""You are an expert invoice, receipt summarizer, you're supposed to analyze every text in english or spanish and return data like restaurant name, items or products bought and its price as well as the total amount, however you cannot read images so you must use a tool to convert and image to text""")
17
+
18
+
19
+ #initial system prompt
20
+ prompt = OpenAIFunctionsAgent.create_prompt(system_message=system_message)
21
+ #define LLM to use
22
+ llm = ChatOpenAI(temperature=0.1, model=OPEN_AI_MODEL_NAME,)
23
+
24
+ #tools to use as functions to trigger from the llm
25
+ tools = [
26
+ ImageProcessor()
27
+ ]
28
+
29
+ #memory placeholder
30
+ conversational_memory = ConversationBufferWindowMemory(
31
+ memory_key='chat_history',
32
+ k=5,
33
+ return_messages=True
34
+ )
35
+
36
+ llm = ChatOpenAI(
37
+ temperature=0,
38
+ model_name=OPEN_AI_MODEL_NAME,
39
+ max_tokens=2048
40
+ )
41
+
42
+
43
+ agent = initialize_agent(
44
+ agent=AgentType.OPENAI_FUNCTIONS, ## does not use memory
45
+ tools=tools,
46
+ llm=llm,
47
+ max_iterations=5,
48
+ verbose=False,
49
+ memory=conversational_memory,
50
+ early_stopping_method='generate',
51
+ prompt=prompt
52
+ )
53
+ ##TO DO, Remove agent and test sequential chain