File size: 5,242 Bytes
b19d5a1
 
 
 
6515ef9
b19d5a1
 
 
 
 
 
 
 
 
 
6515ef9
b19d5a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8508cc
b19d5a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e348353
 
 
b19d5a1
 
 
 
7ec2f8a
b19d5a1
 
 
 
 
 
 
 
 
 
 
 
 
82616f3
b19d5a1
82616f3
b19d5a1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e348353
82616f3
e348353
 
 
 
 
b19d5a1
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from pytesseract import image_to_string
from dotenv import load_dotenv
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json
import requests

# Load environment variables from a local .env file (presumably the OpenAI
# API key needed by ChatOpenAI below — confirm against deployment setup).
load_dotenv()

# 1. Convert PDF file into images via pypdfium2


def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF to JPEG bytes via pypdfium2.

    Args:
        file_path: Path to the PDF file on disk.
        scale: Render scale factor; the default 300/72 renders at ~300 DPI.

    Returns:
        A list of single-entry dicts, one per page, mapping the page index
        to that page's JPEG-encoded bytes.
    """
    print("convert_pdf_to_images:")

    pdf_file = pdfium.PdfDocument(file_path)

    # Render every page of the document.
    page_indices = list(range(len(pdf_file)))

    renderer = pdf_file.render(
        pdfium.PdfBitmap.to_pil,
        page_indices=page_indices,
        scale=scale,
    )

    final_images = []

    for i, image in zip(page_indices, renderer):
        # Encode the PIL image to JPEG in memory; no temp files needed.
        image_byte_array = BytesIO()
        image.save(image_byte_array, format='jpeg', optimize=True)
        final_images.append({i: image_byte_array.getvalue()})
    print("convert_pdf_to_images Completed!")

    return final_images

# 2. Extract text from images via pytesseract


def extract_text_from_img(list_dict_final_images):
    """OCR each page image with pytesseract and join the results.

    Args:
        list_dict_final_images: Output of convert_pdf_to_images — a list of
            single-entry dicts mapping page index to JPEG bytes.

    Returns:
        The OCR'd text of all pages, joined with newlines in page order.
    """
    print("extract_text_from_img:")

    # Each dict holds exactly one value: the page's image bytes.
    image_list = [list(data.values())[0] for data in list_dict_final_images]
    image_content = []

    for image_bytes in image_list:
        image = Image.open(BytesIO(image_bytes))
        raw_text = str(image_to_string(image))
        image_content.append(raw_text)
    print("extract_text_from_img completed!")
    return "\n".join(image_content)


def extract_content_from_url(url: str):
    """Run the full pipeline for one file path: PDF -> page images -> OCR text."""
    print("extract_content_from_url:" + url)
    page_images = convert_pdf_to_images(url)
    ocr_text = extract_text_from_img(page_images)
    print("Content Extracted from URL!")
    return ocr_text

# 3. Extract structured info from text via LLM
def extract_structured_data(content: str, data_points):
    """Ask the LLM to pull the requested data points out of OCR'd document text.

    Args:
        content: Raw text extracted from the document.
        data_points: Description of the fields to extract (JSON-like string).

    Returns:
        The raw LLM response, expected to be a JSON array string.
    """
    template = """
    You are an expert admin people who will extract core information from documents

    {content}

    Above is the content; please try to extract all data points from the content above 
    and export in a JSON array format:
    {data_points}

    Now please extract details from the content  and export in a JSON array format, 
    return ONLY the JSON array:
    """

    # Deterministic output (temperature=0) for repeatable extraction.
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
    prompt = PromptTemplate(
        template=template,
        input_variables=["content", "data_points"],
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(content=content, data_points=data_points)

def convert_df(df):
    """Serialize a DataFrame to UTF-8 CSV bytes (no index column) for download."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

# 5. Streamlit app
def main():
    """Streamlit UI: upload PDFs, OCR them, LLM-extract data points, offer CSV.

    Side effects: renders Streamlit widgets; writes each upload to a temp
    file on disk (deleted on context exit); prints progress to stdout.
    """
    # Default extraction schema shown in the text area. Keys become output
    # column names; values are natural-language questions for the LLM.
    # (Fixed: missing comma after "cst_amount", trailing comma before '}',
    # and the "grand totalof" typo made this invalid/garbled JSON.)
    default_data_points = """{
        "order_id": "what is the order id",
        "Invoice_Number":"what is the full invoice number after #",
        "order_date":"what is the date of the order",
        "bill_to":"what is the bill to details i.e. name and the address",
        "ship_to":"what is the ship to details i.e. name and the address",
        "Product_name":"what is the name of the product",
        "Title":"what is the title of the product",
        "qty": "what is the qty of the product",
        "cst_%":"what is the cst %",
        "cst_amount":"What is the cst amount",
        "taxable value":"what is the taxable value",
        "total":"what is the total of the product",
        "Grand_total":"What is the grand total of the product"
    }"""

    st.set_page_config(page_title="Data Extraction", page_icon=":technologist:")

    st.header("Data Extraction :technologist:")

    data_points = st.text_area(
        "Data points", value=default_data_points, height=170)

    uploaded_files = st.file_uploader(
        "upload PDFs", accept_multiple_files=True)

    if uploaded_files is not None and data_points is not None:
        results = []
        for file in uploaded_files:
            # Spool the upload to disk because pdfium needs a file path.
            # Suffix is .pdf — the upload is a PDF, not a CSV (was '.csv').
            with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
                f.write(file.getbuffer())
                content = extract_content_from_url(f.name)
                print(content)
                data = extract_structured_data(content, data_points)
                # The LLM may return malformed JSON; surface the error and
                # keep processing the remaining files instead of crashing.
                try:
                    json_data = json.loads(data)
                except json.JSONDecodeError as e:
                    st.error(f"Could not parse LLM output as JSON: {e}")
                    continue
                if isinstance(json_data, list):
                    results.extend(json_data)  # one dict per extracted record
                else:
                    results.append(json_data)  # single record

        if len(results) > 0:
            try:
                df = pd.DataFrame(results)
                st.subheader("Results")
                st.data_editor(df)
                st.download_button(
                   "Download CSV",
                   convert_df(df),
                   "file.csv",
                   "text/csv",
                   key='download-csv'
                )
            except Exception as e:
                st.error(
                    f"An error occurred while creating the DataFrame: {e}")
                st.write(results)  # surface the raw data for debugging


if __name__ == '__main__':
    # Required on Windows for frozen executables so that multiprocessing
    # child processes don't re-execute the main module; a no-op elsewhere.
    multiprocessing.freeze_support()
    main()