PDF_to_CSV / app.py
Shrikrishna's picture
Update app.py
a8508cc
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from pytesseract import image_to_string
from dotenv import load_dotenv
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json
import requests
# Load variables from a local .env file into the process environment
# (presumably the OpenAI credentials picked up by ChatOpenAI -- confirm).
load_dotenv()
# 1. Convert PDF file into images via pypdfium2
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF to JPEG bytes using pypdfium2.

    Args:
        file_path: Location of the PDF, passed straight to
            ``pdfium.PdfDocument``.
        scale: Render scale factor forwarded to pypdfium2. The default of
            ``300/72`` presumably targets 300 DPI (72 PDF units per inch)
            -- confirm against the pypdfium2 documentation.

    Returns:
        A list with one single-entry dict per page, each mapping the
        zero-based page index to that page's JPEG-encoded bytes.
    """
    print("convert_pdf_to_images:")
    pdf_file = pdfium.PdfDocument(file_path)
    try:
        page_indices = list(range(len(pdf_file)))
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=page_indices,
            scale=scale,
        )
        final_images = []
        for page_index, image in zip(page_indices, renderer):
            buffer = BytesIO()
            image.save(buffer, format='jpeg', optimize=True)
            final_images.append({page_index: buffer.getvalue()})
    finally:
        # Release the underlying document handle even when rendering or
        # JPEG encoding fails; the original never closed the document.
        pdf_file.close()
    print("convert_pdf_to_images Completed!")
    return final_images
# 2. Extract text from images via pytesseract
def extract_text_from_img(list_dict_final_images):
    """Run OCR over a list of page images and return the combined text.

    Args:
        list_dict_final_images: List of dicts whose values are encoded
            image bytes (the structure returned by
            ``convert_pdf_to_images``). Every value of every dict is
            processed in order, so an empty dict is simply skipped instead
            of raising ``IndexError``.

    Returns:
        The text recognised in each image, joined with newlines. An empty
        input yields an empty string.
    """
    print("extract_text_from_img:")
    image_content = []
    for page in list_dict_final_images:
        for image_bytes in page.values():
            # Open the image as a context manager so it is closed as soon
            # as the OCR call for that page has finished.
            with Image.open(BytesIO(image_bytes)) as image:
                image_content.append(str(image_to_string(image)))
    print("extract_text_from_img completed!")
    return "\n".join(image_content)
def extract_content_from_url(url: str):
    """Return the OCR text of the PDF found at *url*.

    The value is handed unchanged to ``convert_pdf_to_images`` and the
    rendered pages are then passed to ``extract_text_from_img``.

    Args:
        url: Location of the PDF; it is concatenated into a log line, so it
            must be a ``str``.

    Returns:
        The text produced by ``extract_text_from_img`` for all pages.
    """
    print("extract_content_from_url:" + url)
    extracted_text = extract_text_from_img(convert_pdf_to_images(url))
    print("Content Extracted from URL!")
    return extracted_text
# 3. Extract structured info from text via LLM
def extract_structured_data(content: str, data_points):
    """Ask the chat model to extract the requested data points from text.

    Args:
        content: Document text that is substituted into the prompt.
        data_points: Description of the fields to extract; substituted into
            the prompt as-is.

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt. The
        prompt asks the model to answer with a JSON array only, but nothing
        here validates the reply.
    """
    prompt_text = """
You are an expert admin people who will extract core information from documents
{content}
Above is the content; please try to extract all data points from the content above
and export in a JSON array format:
{data_points}
Now please extract details from the content and export in a JSON array format,
return ONLY the JSON array:
"""
    extraction_chain = LLMChain(
        # temperature=0 is passed to the model together with a pinned
        # model name.
        llm=ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
        prompt=PromptTemplate(
            input_variables=["content", "data_points"],
            template=prompt_text,
        ),
    )
    return extraction_chain.run(content=content, data_points=data_points)
def convert_df(df):
    """Serialize *df* to CSV text without the index column, as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
# 4. Streamlit app
def main():
    """Run the Streamlit UI for turning uploaded PDFs into a CSV table.

    The page shows an editable description of the data points to extract
    and a multi-file uploader. For every uploaded file the function writes
    the bytes to a temporary file, extracts its text with
    ``extract_content_from_url``, asks ``extract_structured_data`` for the
    data points and parses the reply as JSON. All parsed records are then
    shown in an editable table with a CSV download button.

    A reply that is not valid JSON is reported on the page and that file is
    skipped, so one bad answer does not abort the remaining files.
    """
    # Built with json.dumps so the default shown to the user (and sent to
    # the model) is always well-formed JSON. The hand-written literal it
    # replaces was missing a comma after "cst_amount" and ended with a
    # trailing comma.
    default_data_points = json.dumps(
        {
            "order_id": "what is the order id",
            "Invoice_Number": "what is the full invoice number after #",
            "order_date": "what is the date of the order",
            "bill_to": "what is the bill to details i.e. name and the address",
            "ship_to": "what is the ship to details i.e. name and the address",
            "Product_name": "what is the name of the product",
            "Title": "what is the title of the product",
            "qty": "what is the qty of the product",
            "cst_%": "what is the cst %",
            "cst_amount": "What is the cst amount",
            "taxable value": "what is the taxable value",
            "total": "what is the total of the product",
            "Grand_total": "What is the grand total of the product",
        },
        indent=4,
    )

    st.set_page_config(page_title="Data Extraction", page_icon=":technologist:")
    st.header("Data Extraction :technologist:")
    data_points = st.text_area(
        "Data points", value=default_data_points, height=170)
    uploaded_files = st.file_uploader(
        "upload PDFs", accept_multiple_files=True)

    # Nothing to do until at least one file has been uploaded.
    if not uploaded_files or data_points is None:
        return

    results = []
    for file in uploaded_files:
        # NOTE: the file is re-opened by name while still open here, which
        # the tempfile docs say is not possible on Windows with the default
        # delete-on-close behaviour -- confirm if Windows must be supported.
        with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
            f.write(file.getbuffer())
            # Push buffered bytes to disk before the PDF is re-read by
            # path; without this the reader can see a truncated file.
            f.flush()
            content = extract_content_from_url(f.name)
        print(content)

        data = extract_structured_data(content, data_points)
        try:
            json_data = json.loads(data)
        except json.JSONDecodeError as e:
            st.error(
                f"Could not parse the model output for {file.name} as JSON: {e}")
            st.write(data)  # Show the raw reply to help diagnose it
            continue

        if isinstance(json_data, list):
            results.extend(json_data)  # Use extend() for lists
        else:
            results.append(json_data)  # Keep a single record as one row

    if results:
        try:
            df = pd.DataFrame(results)
            st.subheader("Results")
            st.data_editor(df)
            st.download_button(
                "Download CSV",
                convert_df(df),
                "file.csv",
                "text/csv",
                key='download-csv'
            )
        except Exception as e:
            # UI boundary: report the problem on the page instead of
            # crashing the app.
            st.error(
                f"An error occurred while creating the DataFrame: {e}")
            st.write(results)  # Print the data to see its content
# Script entry point: start the app only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # Per the multiprocessing docs this call matters for frozen Windows
    # executables and has no effect otherwise; it runs before the app starts.
    multiprocessing.freeze_support()
    main()