Spaces:

amoldwalunj
/

amazon_invoice_extraction

Runtime error

File size: 2,649 Bytes

import pytesseract
from PIL import Image
import pandas as pd
import os
import gradio as gr
import numpy as np
import gradio as gr
import re
import pandas as pd


def ocr_df_using_pytesseract(image):
    """Run Tesseract on ``image`` and return its word-level output as a DataFrame.

    Rows with any missing value and rows whose cells are empty or
    whitespace-only are dropped. Float-typed columns are rounded to
    integers. Four bounding-box columns are appended:

    * ``X1`` / ``Y1`` -- copies of ``left`` / ``top``
    * ``X2`` / ``Y2`` -- ``left + width`` / ``top + height``

    Args:
        image: The image handed to ``pytesseract.image_to_data`` (the app
            passes a PIL image).

    Returns:
        pandas.DataFrame: the cleaned OCR table with the extra box columns.

    Raises:
        ValueError: if ``image`` is ``None`` (e.g. nothing was uploaded).
    """
    if image is None:
        raise ValueError("No image provided for OCR")

    pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

    # Force the ``text`` column to str. Otherwise pandas infers its dtype, and
    # an image containing only numeric tokens yields a float ``text`` column
    # that the rounding step below would turn into integers ("12.50" -> 12).
    ocr_df = pytesseract.image_to_data(
        image,
        output_type='data.frame',
        pandas_config={'dtype': {'text': str}},
    )

    # Remember which columns were parsed as floats before the NaN rows are
    # dropped; they can only be cast to int once no NaN remains.
    float_cols = ocr_df.select_dtypes('float').columns
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)

    # Treat empty / whitespace-only cells as missing and drop those rows too.
    ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)

    # Corner coordinates of each bounding box.
    ocr_df['X1'] = ocr_df['left']
    ocr_df['Y1'] = ocr_df['top']
    ocr_df['X2'] = ocr_df['left'] + ocr_df['width']
    ocr_df['Y2'] = ocr_df['top'] + ocr_df['height']

    return ocr_df


def image_to_text(image):
    """OCR ``image`` and return the recognised words reassembled into lines.

    Words are joined with single spaces into their OCR line. Every line is
    right-stripped and terminated with ``'\\n'``; an extra ``'\\n\\n'`` is
    inserted whenever the block number changes, so blocks are visually
    separated in the returned string.

    Args:
        image: The image forwarded to ``ocr_df_using_pytesseract``.

    Returns:
        str: the reconstructed text, or ``''`` when no words were recognised.
    """
    ocr_df = ocr_df_using_pytesseract(image)

    # Tesseract restarts ``line_num`` in every paragraph, so the paragraph
    # number must be part of the key; grouping on (block, line) alone would
    # merge same-numbered lines of different paragraphs into one string.
    line_keys = ['block_num', 'par_num', 'line_num']

    # ``str.join`` needs strings; the OCR text column may have been parsed as
    # numbers when every token on the page is numeric.
    words = ocr_df.assign(text=ocr_df['text'].astype(str))

    # groupby sorts by the keys, which yields reading order within a block.
    lines = words.groupby(line_keys, sort=True)['text'].agg(' '.join).reset_index()

    pieces = []
    previous_block = None
    for block_num, line_text in zip(lines['block_num'], lines['text']):
        # Compare with the previous row positionally rather than by index
        # label, so the separator logic does not depend on the index.
        if previous_block is not None and block_num != previous_block:
            pieces.append('\n\n')
        pieces.append(line_text.rstrip() + '\n')
        previous_block = block_num

    return ''.join(pieces)

def getting_extractions(image):
    """Extract invoice line items from ``image`` into a DataFrame.

    The OCR text is scanned for entries of the form
    ``<quantity> of: <description> $<price>``. The description may span
    several OCR lines, and the price may contain thousands separators
    (``$1,299.99``).

    Args:
        image: The invoice image forwarded to ``image_to_text``.

    Returns:
        pandas.DataFrame: one row per matched item with the columns
        ``quantity`` (int), ``description`` (str) and ``price`` (float).
        The frame is empty, with the same columns, when nothing matches.
    """
    text = image_to_text(image)

    # The price group accepts commas. With a plain ``\d+\.\d{2}`` an amount
    # such as "$1,299.99" cannot match, and the lazy description would run on
    # to the next dollar amount, pairing the item with the wrong price.
    # DOTALL lets the description cross line breaks.
    item_pattern = r"(\d+)\s*of:(.*?)\$(\d[\d,]*\.\d{2})"

    items = [
        {
            "quantity": int(quantity),
            "description": description.strip(),
            "price": float(price.replace(",", "")),
        }
        for quantity, description, price in re.findall(item_pattern, text, re.DOTALL)
    ]

    # Passing ``columns`` keeps the schema stable even when ``items`` is empty.
    return pd.DataFrame(items, columns=["quantity", "description", "price"])





# Gradio UI: the uploaded picture is handed to ``getting_extractions`` as a
# PIL image and the returned DataFrame is rendered as a table.
demo = gr.Interface(
    fn=getting_extractions,
    inputs=gr.Image(type="pil"),
    outputs=["dataframe"],
    title="Amazon_invoice_to_text",
    description="Upload invoice image here",
)

# Start the web server; no public share link is requested.
demo.launch(share=False)