edwinrajeev committed on
Commit
115bcf1
·
verified ·
1 Parent(s): f3b7a56

Upload 6 files

Browse files
Blank Quad Datasheets.xlsx ADDED
Binary file (13.1 kB). View file
 
Line Intercept-BB Transect.xlsx ADDED
Binary file (12.6 kB). View file
 
SAV Survey Datasheet.xlsx ADDED
Binary file (16.1 kB). View file
 
Transect Datasheets.xlsx ADDED
Binary file (11.4 kB). View file
 
extractor.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import boto3
3
+ from pdf2image import convert_from_path
4
+ from io import BytesIO
5
+ import time
6
+ from aws import get_region, get_key_id, get_access_key
7
+ from openpyxl import load_workbook
8
+ import os
9
+
10
# Shared Textract client, created once at import time. The region and the
# credentials come from the helper functions imported from the `aws` module.
_textract_settings = {
    'region_name': get_region(),
    'aws_access_key_id': get_key_id(),
    'aws_secret_access_key': get_access_key(),
}
textract = boto3.client('textract', **_textract_settings)
14
+
15
def analyze_document_local(image_bytes):
    """Send raw image bytes to Textract's analyze_document API.

    The request asks for the LAYOUT and TABLES feature types; the response
    from the module-level ``textract`` client is returned unchanged.
    """
    document = {'Bytes': image_bytes}
    requested_features = ['LAYOUT', 'TABLES']
    return textract.analyze_document(
        Document=document,
        FeatureTypes=requested_features,
    )
22
+
23
def get_text_from_cell(cell, page):
    """Return the text of a table cell, joined from its child WORD blocks.

    Args:
        cell: A block dict. Its ``Relationships`` entries of type ``CHILD``
            list the ids of the blocks the cell contains.
        page: A response dict whose ``Blocks`` list holds the blocks those
            ids refer to.

    Returns:
        The ``Text`` of every child WORD block, in relationship order,
        separated by single spaces and stripped of surrounding whitespace.
        Child ids that are absent from the page, or that refer to blocks of
        another type, are skipped. Returns ``''`` when no words are found.
    """
    child_ids = []
    for relationship in cell.get('Relationships', []):
        if relationship['Type'] == 'CHILD':
            child_ids.extend(relationship['Ids'])
    if not child_ids:
        return ''

    # Index the page once rather than rescanning the whole block list for
    # every child id. setdefault keeps the first block seen for an id, which
    # is the block a front-to-back scan would have returned.
    blocks_by_id = {}
    for block in page['Blocks']:
        blocks_by_id.setdefault(block['Id'], block)

    words = []
    for child_id in child_ids:
        child = blocks_by_id.get(child_id)
        if child and child['BlockType'] == 'WORD':
            words.append(child.get('Text', ''))
    return ' '.join(words).strip()
34
+
35
def _cell_text(cell, blocks_by_id):
    """Join the text of a cell's child WORD blocks with single spaces."""
    words = []
    for relationship in cell.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id.get(child_id)
            if child and child['BlockType'] == 'WORD':
                words.append(child.get('Text', ''))
    return ' '.join(words).strip()


def extract_table_data(page_response):
    """Extract every table from a single Textract response.

    Args:
        page_response: A response dict with a ``Blocks`` list. TABLE blocks
            reference their CELL blocks through ``CHILD`` relationships, and
            each CELL carries ``RowIndex`` and ``ColumnIndex``.

    Returns:
        A list of tables in block order. Each table is a list of rows sorted
        by row index, and each row is a list of cell strings. Every row of a
        table has the same length: columns run from 1 up to the highest
        column index seen in that table, and positions with no CELL block
        are filled with ``''`` so later cells keep their column. Row indices
        that have no cells at all are not padded in.
    """
    blocks = page_response['Blocks']

    # One id -> block index for the whole response, so resolving a child id
    # is a dict lookup instead of a scan over every block.
    blocks_by_id = {}
    for block in blocks:
        blocks_by_id.setdefault(block['Id'], block)

    tables_data = []
    for block in blocks:
        if block['BlockType'] != 'TABLE':
            continue

        # rows[row_index][column_index] -> cell text
        rows = {}
        for relationship in block.get('Relationships', []):
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                cell = blocks_by_id.get(child_id)
                if cell and cell['BlockType'] == 'CELL':
                    row_cells = rows.setdefault(cell['RowIndex'], {})
                    row_cells[cell['ColumnIndex']] = _cell_text(cell, blocks_by_id)

        # Pad against the table-wide column range. Iterating only a row's
        # own keys would shift its values left whenever a cell is missing.
        column_count = max((max(cells) for cells in rows.values()), default=0)
        table = [
            [rows[row_index].get(column, '') for column in range(1, column_count + 1)]
            for row_index in sorted(rows)
        ]
        tables_data.append(table)
    return tables_data
64
+
65
def ocr_to_excel(pdf_file, template_file):
    """Transcribe the tables of a scanned PDF into copies of an Excel template.

    Each PDF page is rendered to a JPEG at 200 dpi and sent to
    ``analyze_document_local``; the tables found by ``extract_table_data``
    are collected across all pages. The template workbook is then loaded,
    its first worksheet is copied once per extracted table (sheets are
    titled ``Page_1``, ``Page_2``, ... in table order), the table's leading
    header rows are dropped, and the remaining rows are written starting in
    column A at the template's first data row. The original template sheet
    is removed before saving.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or a
            file-like object exposing the path as ``.name``.
        template_file: File name of the Excel template; must be one of the
            four known templates listed in ``template_layouts`` below. It is
            opened relative to the current working directory.

    Returns:
        The path of the saved workbook: the PDF's base name with a ``.xlsx``
        extension, in the current working directory. If that name would
        collide with one of the known template files, ``_extracted`` is
        appended to the base name so the template is not overwritten.

    Raises:
        ValueError: If ``template_file`` is not a known template, or if no
            table was detected anywhere in the PDF.
    """
    # template name -> (first worksheet row to write, header rows to skip)
    template_layouts = {
        'Transect Datasheets.xlsx': (12, 1),
        'Line Intercept-BB Transect.xlsx': (7, 2),
        'SAV Survey Datasheet.xlsx': (7, 2),
        'Blank Quad Datasheets.xlsx': (5, 1),
    }
    # Validate before doing any OCR so a bad selection costs no API calls.
    if template_file not in template_layouts:
        raise ValueError(f"Unknown Excel template: {template_file!r}")
    first_data_row, header_rows = template_layouts[template_file]

    # Accept a plain path or an uploaded-file wrapper that carries the path
    # in ``.name``. (Path objects are handled via fspath because their own
    # ``.name`` is only the final path component.)
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    # Poppler location: POPPLER_PATH wins; the original developer's install
    # directory is used only if it exists on this machine; otherwise None
    # lets pdf2image look poppler up on the system PATH.
    legacy_poppler_dir = r"C:\Users\sshivlani\Projects\Inspection-Form-Transcriber\poppler-23.07.0\Library\bin"
    poppler_path = os.environ.get('POPPLER_PATH') or None
    if poppler_path is None and os.path.isdir(legacy_poppler_dir):
        poppler_path = legacy_poppler_dir

    pages = convert_from_path(pdf_path, dpi=200, poppler_path=poppler_path)

    all_tables = []
    for page_number, page_image in enumerate(pages, start=1):
        buffer = BytesIO()
        page_image.save(buffer, format='JPEG')

        # Synchronous Textract call on this page's image bytes.
        response = analyze_document_local(buffer.getvalue())
        all_tables.extend(extract_table_data(response))

        # Pause between requests to avoid throttling; no need after the last.
        if page_number < len(pages):
            time.sleep(1)

    # Without tables no sheet would be added, and removing the template
    # sheet would leave an empty workbook that cannot be saved.
    if not all_tables:
        raise ValueError("No tables were detected in the PDF.")

    wb = load_workbook(template_file)
    template_sheet = wb.worksheets[0]

    for idx, table in enumerate(all_tables, start=1):
        new_sheet = wb.copy_worksheet(template_sheet)
        new_sheet.title = f"Page_{idx}"
        for row_offset, row in enumerate(table[header_rows:]):
            for column, value in enumerate(row, start=1):
                new_sheet.cell(row=first_data_row + row_offset, column=column, value=value)

    wb.remove(template_sheet)

    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = base_name + ".xlsx"
    # The output lands next to the templates; never clobber one of them.
    reserved_names = {os.path.normcase(name) for name in template_layouts}
    if os.path.normcase(output_path) in reserved_names:
        output_path = base_name + "_extracted.xlsx"
    wb.save(output_path)
    return output_path
142
+
143
# Gradio UI: a PDF upload and a template picker feed ocr_to_excel, and the
# workbook path it returns is offered as a downloadable file.
_pdf_input = gr.File(label="PDF for OCR")
_template_input = gr.Dropdown(
    choices=[
        "Transect Datasheets.xlsx",
        "Line Intercept-BB Transect.xlsx",
        "SAV Survey Datasheet.xlsx",
        "Blank Quad Datasheets.xlsx",
    ],
    label="Excel Template",
)
_excel_output = gr.File(label="Output Excel File")

iface = gr.Interface(
    fn=ocr_to_excel,
    inputs=[_pdf_input, _template_input],
    outputs=_excel_output,
    title="Handwritten Datasheets to Excel File (.xlsx)",
    description="Upload a PDF file and select an Excel template. The OCR data will be appended to duplicated copies of the template sheet, and the resulting Excel file will be returned.",
)

# Start the web app as soon as this module is executed.
iface.launch()
requirements (3).txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ pdf2image
3
+ boto3
4
+ openpyxl