Spaces:

Shrikrishna
/

PDF_to_CSV

Runtime error

App Files Files Community

PDF_to_CSV / app.py

Shrikrishna

Update app.py

a8508cc over 2 years ago

raw

history blame contribute delete

5.24 kB

	from langchain.chat_models import ChatOpenAI
	from langchain.prompts import PromptTemplate
	from langchain.chains import LLMChain
	from pytesseract import image_to_string
	from dotenv import load_dotenv
	from PIL import Image
	from io import BytesIO
	import pypdfium2 as pdfium
	import streamlit as st
	import multiprocessing
	from tempfile import NamedTemporaryFile
	import pandas as pd
	import json
	import requests

	# Load variables from a local .env file (if one exists) into the environment.
	# NOTE(review): presumably this supplies the OpenAI API key that ChatOpenAI
	# reads later on — confirm against the deployment configuration.
	load_dotenv()

	# 1. Convert PDF file into images via pypdfium2


	def convert_pdf_to_images(file_path, scale=300/72):
	    """Render every page of a PDF to JPEG-encoded bytes.

	    Pages are rendered one at a time with ``PdfPage.render()`` rather than
	    the batch ``PdfDocument.render()`` helper, which pypdfium2 deprecated in
	    favour of per-page rendering. The document handle is closed once all
	    pages have been processed, even if rendering fails part-way.

	    Args:
	        file_path: Path of the PDF to open with pypdfium2.
	        scale: Render scale factor passed to ``PdfPage.render()``
	            (defaults to 300/72).

	    Returns:
	        A list with one single-entry dict per page, mapping the zero-based
	        page index to the JPEG bytes of that page.
	    """
	    print("convert_pdf_to_images:")

	    pdf_file = pdfium.PdfDocument(file_path)
	    final_images = []
	    try:
	        for page_index in range(len(pdf_file)):
	            # Keep the bitmap referenced until the image has been saved:
	            # the PIL image may share memory with the bitmap.
	            bitmap = pdf_file[page_index].render(scale=scale)
	            image = bitmap.to_pil()

	            image_byte_array = BytesIO()
	            image.save(image_byte_array, format='jpeg', optimize=True)
	            final_images.append({page_index: image_byte_array.getvalue()})
	    finally:
	        # Release the PDFium handle even when a page fails to render.
	        pdf_file.close()

	    print("convert_pdf_to_images Completed!")
	    return final_images

	# 2. Extract text from images via pytesseract


	def extract_text_from_img(list_dict_final_images):
	    """Run OCR over rendered page images and join the recognised text.

	    Args:
	        list_dict_final_images: List of dicts such as the ones returned by
	            ``convert_pdf_to_images``; the first value of each dict is
	            treated as encoded image bytes.

	    Returns:
	        The OCR text of all images joined with newlines; an empty string
	        when the list is empty.
	    """
	    print("extract_text_from_img:")

	    image_content = []
	    for page in list_dict_final_images:
	        image_bytes = next(iter(page.values()))
	        # Close each decoded image as soon as its text has been extracted
	        # so handles do not pile up on long documents.
	        with Image.open(BytesIO(image_bytes)) as image:
	            image_content.append(str(image_to_string(image)))

	    print("extract_text_from_img completed!")
	    return "\n".join(image_content)


	def extract_content_from_url(url: str):
	    """Return the OCR text of the PDF identified by ``url``.

	    The value is handed unchanged to ``convert_pdf_to_images`` and the
	    rendered pages are then passed through ``extract_text_from_img``.
	    """
	    print("extract_content_from_url:" + url)
	    extracted_text = extract_text_from_img(convert_pdf_to_images(url))
	    print("Content Extracted from URL!")
	    return extracted_text

	# 3. Extract structured info from text via LLM
	def extract_structured_data(content: str, data_points):
	    """Ask the chat model to pull the requested data points out of ``content``.

	    Args:
	        content: Document text inserted into the prompt.
	        data_points: Description of the fields to extract, inserted into
	            the prompt as given.

	    Returns:
	        The value returned by ``LLMChain.run`` for the filled-in prompt; the
	        prompt asks the model to reply with a JSON array only.
	    """
	    chat_model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")
	    prompt_text = """
	You are an expert admin people who will extract core information from documents

	{content}

	Above is the content; please try to extract all data points from the content above
	and export in a JSON array format:
	{data_points}

	Now please extract details from the content and export in a JSON array format,
	return ONLY the JSON array:
	"""

	    extraction_prompt = PromptTemplate(
	        template=prompt_text,
	        input_variables=["content", "data_points"],
	    )
	    extraction_chain = LLMChain(llm=chat_model, prompt=extraction_prompt)
	    return extraction_chain.run(content=content, data_points=data_points)

	def convert_df(df):
	    """Serialise ``df`` to CSV text without the index and return it as UTF-8 bytes."""
	    csv_text = df.to_csv(index=False)
	    return csv_text.encode('utf-8')

	# 4. Streamlit app
	def main():
	    """Streamlit page: upload PDFs, extract data points and export them as CSV.

	    For every uploaded file the PDF is written to a temporary file, OCR'd via
	    ``extract_content_from_url`` and sent to ``extract_structured_data``.
	    The model replies are parsed as JSON and collected into one table that
	    is shown in an editable grid with a CSV download button.

	    A reply that is not valid JSON is reported with ``st.error`` and skipped
	    so that one bad document does not abort the remaining uploads.
	    """
	    # Shown to the user as the editable default and forwarded to the model,
	    # so it is kept as well-formed JSON.
	    default_data_points = """{
	"order_id": "what is the order id",
	"Invoice_Number":"what is the full invoice number after #",
	"order_date":"what is the date of the order",
	"bill_to":"what is the bill to details i.e. name and the address",
	"ship_to":"what is the ship to details i.e. name and the address",
	"Product_name":"what is the name of the product",
	"Title":"what is the title of the product",
	"qty": "what is the qty of the product",
	"cst_%":"what is the cst %",
	"cst_amount":"What is the cst amount",
	"taxable value":"what is the taxable value",
	"total":"what is the total of the product",
	"Grand_total":"What is the grand total of the product"
	}"""

	    st.set_page_config(page_title="Data Extraction", page_icon=":technologist:")

	    st.header("Data Extraction :technologist:")

	    data_points = st.text_area(
	        "Data points", value=default_data_points, height=170)

	    uploaded_files = st.file_uploader(
	        "upload PDFs", accept_multiple_files=True)

	    if uploaded_files is None or data_points is None:
	        return

	    results = []
	    for uploaded_file in uploaded_files:
	        with NamedTemporaryFile(dir='.', suffix='.pdf') as f:
	            f.write(uploaded_file.getbuffer())
	            # The PDF is reopened by name below; flush first so buffered
	            # bytes are on disk and the reader does not see a partial file.
	            f.flush()
	            content = extract_content_from_url(f.name)
	        print(content)

	        data = extract_structured_data(content, data_points)
	        try:
	            json_data = json.loads(data)
	        except json.JSONDecodeError as e:
	            # The model is only asked to return JSON; it may not comply.
	            st.error(
	                f"Could not parse the model output for "
	                f"{uploaded_file.name} as JSON: {e}")
	            st.write(data)
	            continue

	        if isinstance(json_data, list):
	            results.extend(json_data)  # Several records for this document
	        else:
	            results.append(json_data)  # A single record

	    if results:
	        try:
	            df = pd.DataFrame(results)
	            st.subheader("Results")
	            st.data_editor(df)
	            st.download_button(
	                "Download CSV",
	                convert_df(df),
	                "file.csv",
	                "text/csv",
	                key='download-csv'
	            )
	        except Exception as e:
	            # UI boundary: show the problem and the raw records instead of
	            # letting the page fail.
	            st.error(
	                f"An error occurred while creating the DataFrame: {e}")
	            st.write(results)  # Show the data to help diagnose the failure


	# Script entry point: only runs when the file is executed directly.
	if __name__ == "__main__":
	    # Needed for frozen Windows executables that use multiprocessing;
	    # has no effect otherwise.
	    multiprocessing.freeze_support()
	    main()