# Streamlit app: extract structured data from PDF documents
# (pypdfium2 rendering -> pytesseract OCR -> LangChain LLM extraction).
# Third-party and stdlib dependencies for the PDF -> OCR -> LLM pipeline.
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from pytesseract import image_to_string
from dotenv import load_dotenv
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import streamlit as st
import multiprocessing
from tempfile import NamedTemporaryFile
import pandas as pd
import json
import requests

# Load environment variables from a local .env file (presumably the
# OpenAI API key used by ChatOpenAI — confirm against deployment config).
load_dotenv()
# 1. Convert PDF file into images via pypdfium2
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF to JPEG bytes.

    Args:
        file_path: Path to the PDF file on disk.
        scale: Render scale relative to PDF points (default ~300 DPI).

    Returns:
        A list with one dict per page, each mapping the page index to
        that page's JPEG-encoded bytes, in page order.
    """
    print("convert_pdf_to_images:")
    pdf_file = pdfium.PdfDocument(file_path)
    try:
        page_indices = list(range(len(pdf_file)))
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=page_indices,
            scale=scale,
        )
        final_images = []
        for i, image in zip(page_indices, renderer):
            image_byte_array = BytesIO()
            image.save(image_byte_array, format='jpeg', optimize=True)
            # Keep the original {page_index: jpeg_bytes} shape for callers.
            final_images.append({i: image_byte_array.getvalue()})
    finally:
        # Release the underlying PDFium handle (leaked in the original).
        pdf_file.close()
    print("convert_pdf_to_images Completed!")
    return final_images
# 2. Extract text from images via pytesseract
def extract_text_from_img(list_dict_final_images):
    """OCR each rendered page image and join the per-page text.

    Args:
        list_dict_final_images: list of single-entry {page_index: jpeg_bytes}
            dicts, as produced by convert_pdf_to_images().

    Returns:
        All recognized text, with pages separated by newlines.
    """
    print("extract_text_from_img:")
    page_texts = []
    for page in list_dict_final_images:
        # Each dict holds exactly one page's JPEG bytes.
        jpeg_bytes = next(iter(page.values()))
        page_image = Image.open(BytesIO(jpeg_bytes))
        page_texts.append(str(image_to_string(page_image)))
    print("extract_text_from_img completed!")
    return "\n".join(page_texts)
def extract_content_from_url(url: str):
    """Run the full PDF -> image -> OCR pipeline on one document.

    Args:
        url: Path of the PDF handed straight to pypdfium2.

    Returns:
        The OCR'd text of all pages, pages separated by newlines.
    """
    print("extract_content_from_url:" + url)
    page_images = convert_pdf_to_images(url)
    extracted_text = extract_text_from_img(page_images)
    print("Content Extracted from URL!")
    return extracted_text
# 3. Extract structured info from text via LLM
def extract_structured_data(content: str, data_points):
    """Ask the LLM to pull the requested data points out of OCR'd text.

    Args:
        content: Raw text of the document (OCR output).
        data_points: String describing the fields to extract (JSON-ish).

    Returns:
        The raw LLM response string (expected to be a JSON array).
    """
    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
    template = """
You are an expert admin people who will extract core information from documents
{content}
Above is the content; please try to extract all data points from the content above
and export in a JSON array format:
{data_points}
Now please extract details from the content and export in a JSON array format,
return ONLY the JSON array:
"""
    extraction_prompt = PromptTemplate(
        input_variables=["content", "data_points"],
        template=template,
    )
    extraction_chain = LLMChain(llm=llm, prompt=extraction_prompt)
    return extraction_chain.run(content=content, data_points=data_points)
def convert_df(df):
    """Serialize a DataFrame to UTF-8 CSV bytes without the index column."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
# 5. Streamlit app
def main():
    """Streamlit UI: upload PDFs, OCR + LLM-extract fields, show/download CSV.

    Side effects: renders Streamlit widgets; writes each upload to a
    temporary file in the current directory while processing it.
    """
    # Default extraction schema (user-editable in the text area).
    # Fixed vs. the original literal: a missing comma after "cst_amount"
    # and an illegal trailing comma made the example invalid JSON.
    default_data_points = """{
"order_id": "what is the order id",
"Invoice_Number":"what is the full invoice number after #",
"order_date":"what is the date of the order",
"bill_to":"what is the bill to details i.e. name and the address",
"ship_to":"what is the ship to details i.e. name and the address",
"Product_name":"what is the name of the product",
"Title":"what is the title of the product",
"qty": "what is the qty of the product",
"cst_%":"what is the cst %",
"cst_amount":"What is the cst amount",
"taxable value":"what is the taxable value",
"total":"what is the total of the product",
"Grand_total":"What is the grand total of the product"
}"""
    st.set_page_config(page_title="Data Extraction", page_icon=":technologist:")
    st.header("Data Extraction :technologist:")
    data_points = st.text_area(
        "Data points", value=default_data_points, height=170)
    uploaded_files = st.file_uploader(
        "upload PDFs", accept_multiple_files=True)
    if uploaded_files and data_points:
        results = []
        for file in uploaded_files:
            # Persist the upload so pypdfium2 can open it by path.
            # Fixed: the original created the PDF temp file with suffix='.csv'.
            with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
                f.write(file.getbuffer())
                content = extract_content_from_url(f.name)
            print(content)
            data = extract_structured_data(content, data_points)
            try:
                json_data = json.loads(data)
            except json.JSONDecodeError:
                # LLM output is not guaranteed to be valid JSON; surface
                # the bad response instead of crashing the whole run.
                st.error(f"Could not parse LLM output for {file.name}")
                st.write(data)
                continue
            if isinstance(json_data, list):
                results.extend(json_data)  # JSON array -> many rows
            else:
                results.append(json_data)  # single object -> one row
        if results:
            try:
                df = pd.DataFrame(results)
                st.subheader("Results")
                st.data_editor(df)
                st.download_button(
                    "Download CSV",
                    convert_df(df),
                    "file.csv",
                    "text/csv",
                    key='download-csv'
                )
            except Exception as e:
                st.error(
                    f"An error occurred while creating the DataFrame: {e}")
                st.write(results)  # show raw rows to aid debugging
if __name__ == '__main__':
    # Required for frozen (e.g. PyInstaller) Windows builds so that
    # multiprocessing child processes don't re-run the app on import.
    multiprocessing.freeze_support()
    main()