File size: 2,649 Bytes
4c24cb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84ab68f
4c24cb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0cbedaa
4c24cb0
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pytesseract
from PIL import Image
import pandas as pd
import os
import gradio as gr
import numpy as np
import gradio as gr
import re
import pandas as pd


def ocr_df_using_pytesseract(image, *, tesseract_cmd=r'/usr/bin/tesseract'):
    """Run Tesseract on *image* and return its word-level output as a DataFrame.

    Args:
        image: Image object accepted by ``pytesseract.image_to_data`` (the
            Gradio interface in this file passes a PIL image).
        tesseract_cmd: Path of the Tesseract executable to use. Defaults to
            the location this script has always used.

    Returns:
        The DataFrame produced by ``pytesseract.image_to_data`` with rows
        lacking text removed, float columns rounded to integers, and four
        extra columns describing each word's bounding box:
        ``X1``/``Y1`` (``left``/``top``) and ``X2``/``Y2``
        (``left + width``/``top + height``).
    """
    # NOTE: this sets a module-wide pytesseract option on every call.
    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    # Read 'text' as strings and treat only empty fields as missing.
    # With pandas' defaults an all-numeric page is parsed as floats (and then
    # mangled by the rounding below), and genuine words such as "N/A", "NA"
    # or "null" are silently turned into NaN and dropped.
    ocr_df = pytesseract.image_to_data(
        image,
        output_type='data.frame',
        pandas_config={
            'dtype': {'text': str},
            'keep_default_na': False,
            'na_values': [''],
        },
    )

    float_cols = ocr_df.select_dtypes('float').columns

    # Rows that carry no text have NaN in 'text'.
    ocr_df = ocr_df.dropna().reset_index(drop=True)
    ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)

    # Whitespace-only entries are treated as missing as well.
    ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
    ocr_df = ocr_df.dropna().reset_index(drop=True)

    # Bounding-box corners derived from left/top/width/height.
    ocr_df['X1'] = ocr_df['left']
    ocr_df['Y1'] = ocr_df['top']
    ocr_df['X2'] = ocr_df['left'] + ocr_df['width']
    ocr_df['Y2'] = ocr_df['top'] + ocr_df['height']

    return ocr_df


def image_to_text(image):
    """OCR *image* and return the recognised text laid out line by line.

    Words are joined with single spaces into lines; every line ends with a
    newline, and an extra ``'\\n\\n'`` is inserted whenever the block number
    changes.

    Args:
        image: Image passed straight to ``ocr_df_using_pytesseract``.

    Returns:
        The reconstructed text, or ``''`` when no words were recognised.
    """
    ocr_df = ocr_df_using_pytesseract(image)

    # Tesseract restarts line_num in every paragraph, so par_num must be part
    # of the key; otherwise line 1 of each paragraph in a block is merged
    # into a single line.
    line_keys = ['block_num', 'par_num', 'line_num']

    # astype(str) guards against a numeric 'text' column, on which
    # ' '.join would raise.
    lines = (
        ocr_df.assign(text=ocr_df['text'].astype(str))
        .groupby(line_keys)['text']  # groupby sorts by the keys
        .agg(' '.join)
        .reset_index()
    )

    pieces = []
    previous_block = None
    for block_num, line_text in zip(lines['block_num'], lines['text']):
        if previous_block is not None and block_num != previous_block:
            pieces.append('\n\n')
        pieces.append(line_text.rstrip() + '\n')
        previous_block = block_num

    return ''.join(pieces)

def getting_extractions(image):
    """Extract ``<quantity> of: <description> $<price>`` items from an image.

    Args:
        image: Image to OCR, or ``None`` (no image supplied).

    Returns:
        A DataFrame with columns ``quantity`` (int), ``description`` (str)
        and ``price`` (float), one row per matched item. The frame is empty
        when *image* is ``None`` or nothing matches.
    """
    columns = ["quantity", "description", "price"]

    # Nothing to OCR: return an empty table instead of crashing.
    if image is None:
        return pd.DataFrame(columns=columns)

    text = image_to_text(image)

    # The price group accepts thousands separators ("$1,299.99"). Without
    # that, such a price fails to match and the lazy description group runs
    # on to the next "$", swallowing the following item.
    item_pattern = r"(\d+)\s*of:(.*?)\$\s*(\d[\d,]*\.\d{2})"

    items = [
        {
            "quantity": int(quantity),
            # Collapse the line breaks that the OCR layout puts inside a
            # description into single spaces.
            "description": " ".join(description.split()),
            "price": float(price.replace(",", "")),
        }
        for quantity, description, price in re.findall(
            item_pattern, text, re.DOTALL
        )
    ]

    return pd.DataFrame(items, columns=columns)





# Gradio front end: takes an uploaded image (handed to the callback as a PIL
# image) and shows the DataFrame returned by getting_extractions.
demo = gr.Interface(
    fn=getting_extractions,
    inputs=gr.Image(type="pil"),
    outputs=["dataframe"],
    title="Amazon_invoice_to_text",
    description="Upload invoice image here",
)

# Start the app without requesting a public share link.
demo.launch(share=False)