Spaces:

amoldwalunj
/

amazon_invoice_extraction

Runtime error

App Files Files Community

amazon_invoice_extraction / app.py

amoldwalunj

Update app.py

0cbedaa almost 3 years ago

raw

history blame contribute delete

2.65 kB

	import pytesseract
	from PIL import Image
	import pandas as pd
	import os
	import gradio as gr
	import numpy as np
	import gradio as gr
	import re
	import pandas as pd


	def ocr_df_using_pytesseract(image):
	    """Run Tesseract on *image* and return the OCR result as a DataFrame.

	    The frame comes from ``pytesseract.image_to_data`` (``data.frame``
	    output).  Rows containing any missing value, and rows whose text is
	    empty or whitespace-only, are dropped.  Four bounding-box corner
	    columns are appended:

	    * ``X1`` / ``Y1`` -- copies of ``left`` / ``top``
	    * ``X2`` / ``Y2`` -- ``left + width`` / ``top + height``

	    Args:
	        image: The image handed to ``pytesseract.image_to_data``
	            (the Gradio interface in this file supplies a PIL image).

	    Returns:
	        pandas.DataFrame with the Tesseract columns plus X1, Y1, X2, Y2.
	    """
	    # Path of the tesseract binary; set on every call (module-global in
	    # pytesseract).
	    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

	    # apply ocr to the image
	    ocr_df = pytesseract.image_to_data(image, output_type='data.frame')

	    # Numeric columns that arrive as float are rounded to int.  'text' is
	    # deliberately excluded: when every recognized token looks numeric,
	    # pandas may infer a float dtype for it, and rounding would corrupt
	    # the OCR'd values (e.g. 12.99 -> 13).
	    float_cols = ocr_df.select_dtypes('float').columns.difference(['text'])

	    ocr_df = ocr_df.dropna().reset_index(drop=True)
	    if len(float_cols):
	        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)

	    # Downstream code joins the tokens with ' '.join, which needs strings.
	    ocr_df['text'] = ocr_df['text'].astype(str)

	    # Treat blank / whitespace-only text as missing and drop those rows.
	    ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
	    ocr_df = ocr_df.dropna().reset_index(drop=True)

	    # Bounding-box corners derived from left/top/width/height.
	    ocr_df['X1'] = ocr_df['left']
	    ocr_df['Y1'] = ocr_df['top']
	    ocr_df['X2'] = ocr_df['left'] + ocr_df['width']
	    ocr_df['Y2'] = ocr_df['top'] + ocr_df['height']

	    return ocr_df


	def image_to_text(image):
	    """OCR *image* and return its text, one line per OCR text line.

	    Words are joined with single spaces into lines.  Every line is
	    right-stripped and terminated with ``'\\n'``; consecutive blocks are
	    separated by an additional ``'\\n\\n'``.

	    Args:
	        image: Image passed through to ``ocr_df_using_pytesseract``.

	    Returns:
	        str: The reconstructed text, or ``''`` when nothing was recognized.
	    """
	    ocr_df = ocr_df_using_pytesseract(image)
	    if ocr_df.empty:
	        return ''

	    # Tesseract's TSV numbering is hierarchical: line_num restarts inside
	    # every paragraph (par_num) of a block.  Grouping on block_num and
	    # line_num alone would therefore merge lines that belong to different
	    # paragraphs, so the full page/block/paragraph/line key is used.
	    line_keys = ['page_num', 'block_num', 'par_num', 'line_num']
	    lines = (
	        ocr_df.groupby(line_keys, sort=True)['text']
	        .agg(lambda words: ' '.join(map(str, words)))
	        .reset_index()
	    )

	    # One chunk per block: each line right-stripped and newline-terminated.
	    block_chunks = [
	        ''.join(line.rstrip() + '\n' for line in block['text'])
	        for _, block in lines.groupby(['page_num', 'block_num'], sort=True)
	    ]

	    # Blocks are separated by two extra newlines.
	    return '\n\n'.join(block_chunks)

	def getting_extractions(image):
	    """Extract invoice line items from *image* into a DataFrame.

	    The OCR text is scanned for entries of the form
	    ``<quantity> of: <description> $<price>``; the description may span
	    several OCR lines.

	    Args:
	        image: Invoice image passed to ``image_to_text``.  ``None`` (no
	            image supplied) yields an empty result instead of an error.

	    Returns:
	        pandas.DataFrame with columns ``quantity`` (int), ``description``
	        (str, internal whitespace collapsed) and ``price`` (float); empty
	        when no item is found.
	    """
	    columns = ["quantity", "description", "price"]

	    # Nothing to OCR -- return an empty table rather than raising.
	    if image is None:
	        return pd.DataFrame(columns=columns)

	    text = image_to_text(image)

	    # "(.*?)" lazily captures the whole description up to the next price
	    # (the previous "(.?)" allowed at most one character, so real items
	    # never matched).  The price group also accepts thousands separators
	    # such as "$1,299.99".
	    item_pattern = r"(\d+)\sof:(.*?)\$(\d[\d,]*\.\d{2})"

	    # re.DOTALL lets the description run across line breaks.
	    items = [
	        {
	            "quantity": int(quantity),
	            "description": " ".join(description.split()),
	            "price": float(price.replace(",", "")),
	        }
	        for quantity, description, price in re.findall(
	            item_pattern, text, re.DOTALL
	        )
	    ]

	    # Explicit columns keep the schema stable even when items is empty.
	    return pd.DataFrame(items, columns=columns)





	# Gradio front end: a PIL image goes in, getting_extractions runs on it,
	# and the resulting table is shown as a dataframe.
	demo = gr.Interface(
	    title="Amazon_invoice_to_text",
	    description="Upload invoice image here",
	    fn=getting_extractions,
	    inputs=gr.Image(type="pil"),
	    outputs=["dataframe"],
	)

	# Start the web app without requesting a public share link.
	demo.launch(share=False)