Spaces:
Runtime error
Runtime error
File size: 5,242 Bytes
b19d5a1 6515ef9 b19d5a1 6515ef9 b19d5a1 a8508cc b19d5a1 e348353 b19d5a1 7ec2f8a b19d5a1 82616f3 b19d5a1 82616f3 b19d5a1 e348353 82616f3 e348353 b19d5a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from pytesseract import image_to_string
from dotenv import load_dotenv
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json
import requests
# Load variables from a local .env file into the process environment at import
# time. NOTE(review): presumably this supplies the OpenAI credentials that
# ChatOpenAI reads later -- confirm against the deployment configuration.
load_dotenv()
# 1. Convert PDF file into images via pypdfium2
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF to JPEG bytes using pypdfium2.

    Args:
        file_path: Location of the PDF, passed unchanged to
            ``pdfium.PdfDocument``.
        scale: Render scale factor forwarded to the renderer. The default
            ``300/72`` presumably maps 72-per-inch PDF units to 300 DPI --
            confirm against the pypdfium2 documentation.

    Returns:
        A list with one single-entry dict per page, shaped
        ``{page_index: jpeg_bytes}``, in page order.
    """
    print("convert_pdf_to_images:")
    pdf_file = pdfium.PdfDocument(file_path)
    try:
        page_indices = list(range(len(pdf_file)))
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=page_indices,
            scale=scale,
        )
        final_images = []
        for i, image in zip(page_indices, renderer):
            image_byte_array = BytesIO()
            image.save(image_byte_array, format='jpeg', optimize=True)
            final_images.append({i: image_byte_array.getvalue()})
    finally:
        # Release the native document handle even when rendering or JPEG
        # encoding fails part-way through.
        pdf_file.close()
    print("convert_pdf_to_images Completed!")
    return final_images
# 2. Extract text from images via pytesseract
def extract_text_from_img(list_dict_final_images):
    """OCR a sequence of page images and return their text joined by newlines.

    Args:
        list_dict_final_images: List of dicts such as the ones returned by
            ``convert_pdf_to_images``; only the first value of each dict
            (the encoded image bytes) is used.

    Returns:
        The text pytesseract produced for each image, joined with a newline
        in input order. An empty input yields an empty string.
    """
    print("extract_text_from_img:")
    image_content = []
    for page in list_dict_final_images:
        image_bytes = list(page.values())[0]
        # The context manager closes the decoded image as soon as OCR is done
        # instead of keeping every page alive until garbage collection.
        with Image.open(BytesIO(image_bytes)) as image:
            image_content.append(str(image_to_string(image)))
    print("extract_text_from_img completed!")
    return "\n".join(image_content)
def extract_content_from_url(url: str):
    """Run the PDF -> images -> OCR pipeline and return the recognised text.

    ``url`` is handed unchanged to ``convert_pdf_to_images``; the rendered
    pages are then passed to ``extract_text_from_img``.
    """
    print("extract_content_from_url:" + url)
    ocr_text = extract_text_from_img(convert_pdf_to_images(url))
    print("Content Extracted from URL!")
    return ocr_text
# 3. Extract structured info from text via LLM
def extract_structured_data(content: str, data_points):
    """Ask the chat model to pull the requested data points out of ``content``.

    Args:
        content: Document text placed at the top of the prompt.
        data_points: Description of the fields to extract; inserted verbatim
            into the prompt.

    Returns:
        Whatever ``LLMChain.run`` returns for the filled-in prompt (the
        prompt asks the model for a JSON array only).
    """
    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
    # The prompt lines start at column 0 on purpose: every character between
    # the triple quotes is part of the text sent to the model.
    prompt_text = """
You are an expert admin people who will extract core information from documents
{content}
Above is the content; please try to extract all data points from the content above
and export in a JSON array format:
{data_points}
Now please extract details from the content and export in a JSON array format,
return ONLY the JSON array:
"""
    extraction_prompt = PromptTemplate(
        template=prompt_text,
        input_variables=["content", "data_points"],
    )
    extraction_chain = LLMChain(llm=chat_model, prompt=extraction_prompt)
    return extraction_chain.run(content=content, data_points=data_points)
def convert_df(df):
    """Serialise ``df`` to CSV text without the index column, as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
# 4. Streamlit app
def main():
    """Streamlit entry point: upload PDFs, extract data points, offer a CSV.

    Renders a text area with the data points to extract and a multi-file
    uploader. Each uploaded file is OCR'd and sent to the LLM; the parsed
    records are shown in an editable table with a CSV download button. A file
    whose model output is not valid JSON is reported and skipped instead of
    aborting the whole run.
    """
    # Default field descriptions shown in the text area. Kept as valid JSON
    # (comma after every entry but the last) since it is pasted into the prompt.
    default_data_points = """{
    "order_id": "what is the order id",
    "Invoice_Number":"what is the full invoice number after #",
    "order_date":"what is the date of the order",
    "bill_to":"what is the bill to details i.e. name and the address",
    "ship_to":"what is the ship to details i.e. name and the address",
    "Product_name":"what is the name of the product",
    "Title":"what is the title of the product",
    "qty": "what is the qty of the product",
    "cst_%":"what is the cst %",
    "cst_amount":"What is the cst amount",
    "taxable value":"what is the taxable value",
    "total":"what is the total of the product",
    "Grand_total":"What is the grand total of the product"
}"""
    st.set_page_config(page_title="Data Extraction", page_icon=":technologist:")
    st.header("Data Extraction :technologist:")
    data_points = st.text_area(
        "Data points", value=default_data_points, height=170)
    uploaded_files = st.file_uploader(
        "upload PDFs", accept_multiple_files=True)

    # Nothing to do until at least one file and some data points are present.
    if not uploaded_files or not data_points:
        return

    results = []
    for uploaded_file in uploaded_files:
        try:
            results.extend(_records_from_upload(uploaded_file, data_points))
        except json.JSONDecodeError as e:
            # The model is only asked to return JSON; it may not comply.
            st.error(
                f"Could not parse the model output for "
                f"{uploaded_file.name} as JSON: {e}")

    if not results:
        return

    try:
        df = pd.DataFrame(results)
        st.subheader("Results")
        st.data_editor(df)
        st.download_button(
            "Download CSV",
            convert_df(df),
            "file.csv",
            "text/csv",
            key='download-csv'
        )
    except Exception as e:
        # UI boundary: show the failure and the raw records rather than crash.
        st.error(
            f"An error occurred while creating the DataFrame: {e}")
        st.write(results)  # Print the data to see its content


def _records_from_upload(uploaded_file, data_points):
    """Return the list of records the LLM extracted from one uploaded PDF.

    Raises:
        json.JSONDecodeError: If the model output is not valid JSON.
    """
    # The PDF is opened by path downstream, so spool the upload to disk first.
    with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
        f.write(uploaded_file.getbuffer())
        # Flush so every byte is on disk before the file is re-opened by name.
        f.flush()
        content = extract_content_from_url(f.name)
    print(content)
    data = extract_structured_data(content, data_points)
    json_data = json.loads(data)
    # A single object is wrapped so the caller always receives a list.
    return json_data if isinstance(json_data, list) else [json_data]
if __name__ == '__main__':
    # Needed only when the program is frozen into a Windows executable;
    # otherwise this call has no effect.
    multiprocessing.freeze_support()
    main()