Shrikrishna committed on
Commit
b19d5a1
·
1 Parent(s): ed22f62

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -0
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import multiprocessing
import os
from io import BytesIO
from tempfile import NamedTemporaryFile

import pandas as pd
import pypdfium2 as pdfium
import requests
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from PIL import Image
from pytesseract import image_to_string
15
+
16
# SECURITY: never commit API keys to source control. The key that used to be
# hard-coded on this line is exposed in the repository history and should be
# revoked. The key is now read from the environment; load_dotenv() first loads
# variables from a local .env file, if one exists, so an OPENAI_API_KEY entry
# there is picked up as well. The value is None when the variable is unset.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
17
+
18
+ # 1. Convert PDF file into images via pypdfium2
19
+
20
+
21
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF to JPEG bytes via pypdfium2.

    Args:
        file_path: Path of the PDF file to open.
        scale: Render scale forwarded to pypdfium2 (default 300/72).

    Returns:
        A list with one single-entry dict per page, in page order, mapping
        the zero-based page index to that page's JPEG-encoded bytes.
    """
    print("convert_pdf_to_images:")

    pdf_file = pdfium.PdfDocument(file_path)
    try:
        page_indices = list(range(len(pdf_file)))

        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=page_indices,
            scale=scale,
        )

        final_images = []
        for i, image in zip(page_indices, renderer):
            image_byte_array = BytesIO()
            image.save(image_byte_array, format='jpeg', optimize=True)
            final_images.append({i: image_byte_array.getvalue()})
    finally:
        # Release the document handle even when rendering or encoding fails.
        pdf_file.close()

    print("convert_pdf_to_images Completed!")
    return final_images
45
+
46
+ # 2. Extract text from images via pytesseract
47
+
48
+
49
def extract_text_from_img(list_dict_final_images):
    """Run OCR on each page image and join the page texts.

    Args:
        list_dict_final_images: List of single-entry dicts whose value is the
            encoded image bytes of one page (the shape returned by
            convert_pdf_to_images).

    Returns:
        The text recognised on every page, joined with newlines. An empty
        list yields an empty string.
    """
    print("extract_text_from_img:")

    image_content = []
    for data in list_dict_final_images:
        image_bytes = list(data.values())[0]
        # Use the image as a context manager so its resources are released
        # as soon as the page has been processed.
        with Image.open(BytesIO(image_bytes)) as image:
            image_content.append(image_to_string(image))

    print("extract_text_from_img completed!")
    return "\n".join(image_content)
62
+
63
+
64
def extract_content_from_url(url: str):
    """Return the OCR text of the PDF identified by ``url``.

    Despite the name, ``url`` is passed unchanged to convert_pdf_to_images,
    which opens it with ``pdfium.PdfDocument``; the caller in this file
    passes the name of a local temporary file.

    Args:
        url: Location of the PDF to read.

    Returns:
        The text produced by extract_text_from_img for the rendered pages.
    """
    print("extract_content_from_url:" + url)
    ocr_text = extract_text_from_img(convert_pdf_to_images(url))
    print("Content Extracted from URL!")
    return ocr_text
70
+
71
+ # 3. Extract structured info from text via LLM
72
def extract_structured_data(content: str, data_points):
    """Ask the chat model to extract the requested data points from text.

    Args:
        content: Document text to extract from; substituted into the prompt.
        data_points: Description of the fields to extract; substituted into
            the prompt as-is.

    Returns:
        The value returned by ``LLMChain.run`` for the filled-in prompt. The
        prompt asks the model to answer with a JSON array only.
    """
    template = """
    You are an expert admin people who will extract core information from documents

    {content}

    Above is the content; please try to extract all data points from the content above
    and export in a JSON array format:
    {data_points}

    Now please extract details from the content and export in a JSON array format,
    return ONLY the JSON array:
    """
    extraction_prompt = PromptTemplate(
        template=template,
        input_variables=["content", "data_points"],
    )
    chat_model = ChatOpenAI(
        temperature=0,
        model="gpt-3.5-turbo-0613",
        openai_api_key=OPENAI_API_KEY,
    )
    extraction_chain = LLMChain(llm=chat_model, prompt=extraction_prompt)
    return extraction_chain.run(content=content, data_points=data_points)
97
+
98
+ # 5. Streamlit app
99
def main():
    """Run the Streamlit page: upload PDFs, extract data points, show results.

    Each uploaded file is written to a temporary file, OCR'd with
    extract_content_from_url and passed to extract_structured_data together
    with the data-point description from the text area. The parsed JSON of
    all files is collected into one list and shown as an editable table.
    """
    # Valid JSON: the original literal lacked a comma after "cst_amount" and
    # ended with a trailing comma. Keys are unchanged.
    default_data_points = """{
        "order_id": "what is the order id",
        "Invoice_Number":"what is the invoice number",
        "order_date":"what is the date of the order",
        "bill_to":"what is the bill to details i.e. name and the address",
        "ship_to":"what is the ship to details i.e. name and the address",
        "Product_name":"what is the name of the product",
        "Title":"what is the title of the product",
        "qty": "what is the qty of the product",
        "cst_%":"what is the cst %",
        "cst_amount":"What is the cst amount",
        "taxable value":"what is the taxable value",
        "total":"what is the total of the product",
        "Grand_total":"What is the grand total of the product"
    }"""

    st.set_page_config(page_title="Doc extraction", page_icon=":bird:")

    st.header("Doc extraction :bird:")

    data_points = st.text_area(
        "Data points", value=default_data_points, height=170)

    uploaded_files = st.file_uploader(
        "upload PDFs", accept_multiple_files=True)

    # Nothing to do until at least one file and a data-point description exist.
    if not uploaded_files or not data_points:
        return

    results = []
    for file in uploaded_files:
        # NOTE(review): reopening a NamedTemporaryFile by name while it is
        # still open does not work on Windows - confirm the target platform.
        with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
            f.write(file.getbuffer())
            # Flush so the PDF reader, which reopens the file by name, sees
            # the complete contents rather than a partially buffered write.
            f.flush()
            content = extract_content_from_url(f.name)
        print(content)
        data = extract_structured_data(content, data_points)
        try:
            json_data = json.loads(data)
        except json.JSONDecodeError as e:
            # The model is only asked to return JSON; it may not comply.
            st.error(
                f"Could not parse the model output for {file.name} as JSON: {e}")
            continue
        if isinstance(json_data, list):
            results.extend(json_data)
        else:
            results.append(json_data)

    if not results:
        return

    try:
        df = pd.DataFrame(results)
        st.subheader("Results")
        st.data_editor(df)
    except Exception as e:
        st.error(
            f"An error occurred while creating the DataFrame: {e}")
        st.write(results)  # Show the raw data so the problem can be inspected
        return

    if st.button("Sync to Make"):
        # send_to_make is not defined in this file; look it up at call time so
        # a missing implementation is reported clearly instead of surfacing
        # as a NameError.
        sync_to_make = globals().get("send_to_make")
        if sync_to_make is None:
            st.error("Sync to Make is unavailable: send_to_make is not defined.")
        else:
            sync_to_make(results)
            st.write("Synced to Make!")
152
+
153
+
154
if __name__ == "__main__":
    # Script entry point. freeze_support() is for multiprocessing in frozen
    # Windows executables; the multiprocessing docs describe it as having no
    # effect otherwise.
    multiprocessing.freeze_support()
    main()