from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from pytesseract import image_to_string
from dotenv import load_dotenv
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json
import requests

load_dotenv()


# 1. Convert PDF file into images via pypdfium2
def convert_pdf_to_images(file_path, scale=300/72):
    """Render each page of the PDF at *file_path* to JPEG bytes.

    Args:
        file_path: Path to a PDF file on disk.
        scale: Render scale relative to the PDF's 72 DPI base; the default
            300/72 yields ~300 DPI pages, which OCR handles well.

    Returns:
        A list of single-entry dicts, one per page, mapping the page index
        to that page's JPEG-encoded bytes.
    """
    print("convert_pdf_to_images:")
    pdf_file = pdfium.PdfDocument(file_path)
    page_indices = list(range(len(pdf_file)))
    renderer = pdf_file.render(
        pdfium.PdfBitmap.to_pil,
        page_indices=page_indices,
        scale=scale,
    )
    final_images = []
    for index, image in zip(page_indices, renderer):
        buffer = BytesIO()
        image.save(buffer, format='jpeg', optimize=True)
        final_images.append({index: buffer.getvalue()})
    print("convert_pdf_to_images Completed!")
    return final_images


# 2. Extract text from images via pytesseract
def extract_text_from_img(list_dict_final_images):
    """OCR every page image and join the per-page text with newlines.

    Args:
        list_dict_final_images: Output of convert_pdf_to_images() — a list
            of {page_index: jpeg_bytes} dicts.

    Returns:
        All recognized text, page after page, separated by newlines.
    """
    print("extract_text_from_img:")
    image_list = [list(data.values())[0] for data in list_dict_final_images]
    image_content = []
    for image_bytes in image_list:
        image = Image.open(BytesIO(image_bytes))
        image_content.append(str(image_to_string(image)))
    print("extract_text_from_img completed!")
    return "\n".join(image_content)


def extract_content_from_url(url: str):
    """Full PDF -> text pipeline: render the pages, then OCR them.

    NOTE(review): despite the name, *url* is used as a local file path —
    it is handed straight to pypdfium2; no download happens here. Name is
    kept unchanged for caller compatibility.
    """
    print("extract_content_from_url:" + url)
    images_list = convert_pdf_to_images(url)
    text_with_pytesseract = extract_text_from_img(images_list)
    print("Content Extracted from URL!")
    return text_with_pytesseract
# 3. Extract structured info from text via LLM
def extract_structured_data(content: str, data_points):
    """Ask the chat model to extract *data_points* from *content*.

    Args:
        content: Raw OCR text of the document.
        data_points: JSON-style description of the fields to extract.

    Returns:
        The model's raw string reply, expected to be a JSON array.
    """
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
    template = """
    You are an expert admin people who will extract core information from documents

    {content}

    Above is the content; please try to extract all data points from the content above
    and export in a JSON array format:
    {data_points}

    Now please extract details from the content and export in a JSON array format,
    return ONLY the JSON array:
    """
    prompt = PromptTemplate(
        input_variables=["content", "data_points"],
        template=template,
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    return chain.run(content=content, data_points=data_points)


def convert_df(df):
    """Serialize a DataFrame to UTF-8 CSV bytes, without the index column."""
    return df.to_csv(index=False).encode('utf-8')


# 5. Streamlit app
def main():
    # Default prompt describing the fields to pull out of each invoice.
    # Fixed here: the original default was invalid JSON — it was missing a
    # comma after the "cst_amount" entry and had a trailing comma after
    # "Grand_total" (plus a "totalof" typo).
    default_data_points = """{
        "order_id": "what is the order id",
        "Invoice_Number": "what is the full invoice number after #",
        "order_date": "what is the date of the order",
        "bill_to": "what is the bill to details i.e. name and the address",
        "ship_to": "what is the ship to details i.e. name and the address",
        "Product_name": "what is the name of the product",
        "Title": "what is the title of the product",
        "qty": "what is the qty of the product",
        "cst_%": "what is the cst %",
        "cst_amount": "What is the cst amount",
        "taxable value": "what is the taxable value",
        "total": "what is the total of the product",
        "Grand_total": "What is the grand total of the product"
    }"""

    st.set_page_config(page_title="Data Extraction", page_icon=":technologist:")
    st.header("Data Extraction :technologist:")

    data_points = st.text_area(
        "Data points", value=default_data_points, height=170)
    uploaded_files = st.file_uploader(
        "upload PDFs", accept_multiple_files=True)

    if uploaded_files is not None and data_points is not None:
        results = []
        for file in uploaded_files:
            # Spool the upload to disk so pypdfium2 can open it by path.
            # Fixed here: the suffix was '.csv' although the payload is a PDF.
            with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
                f.write(file.getbuffer())
                f.flush()  # ensure the OCR pipeline sees the complete file
                content = extract_content_from_url(f.name)
            print(content)
            data = extract_structured_data(content, data_points)
            try:
                json_data = json.loads(data)
            except json.JSONDecodeError as e:
                # The model may ignore the "JSON only" instruction; surface
                # the problem for this file instead of crashing the app.
                st.error(f"Could not parse LLM output for {file.name}: {e}")
                continue
            if isinstance(json_data, list):
                results.extend(json_data)  # one row per extracted record
            else:
                results.append(json_data)  # single record -> one row

        if len(results) > 0:
            try:
                df = pd.DataFrame(results)
                st.subheader("Results")
                st.data_editor(df)
                st.download_button(
                    "Download CSV",
                    convert_df(df),
                    "file.csv",
                    "text/csv",
                    key='download-csv'
                )
            except Exception as e:
                st.error(
                    f"An error occurred while creating the DataFrame: {e}")
                st.write(results)  # show the raw rows for debugging


if __name__ == '__main__':
    multiprocessing.freeze_support()
    main()