Spaces:
Runtime error
Runtime error
Upload 6 files
Browse files- Blank Quad Datasheets.xlsx +0 -0
- Line Intercept-BB Transect.xlsx +0 -0
- SAV Survey Datasheet.xlsx +0 -0
- Transect Datasheets.xlsx +0 -0
- extractor.py +157 -0
- requirements (3).txt +4 -0
Blank Quad Datasheets.xlsx
ADDED
|
Binary file (13.1 kB). View file
|
|
|
Line Intercept-BB Transect.xlsx
ADDED
|
Binary file (12.6 kB). View file
|
|
|
SAV Survey Datasheet.xlsx
ADDED
|
Binary file (16.1 kB). View file
|
|
|
Transect Datasheets.xlsx
ADDED
|
Binary file (11.4 kB). View file
|
|
|
extractor.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import boto3
|
| 3 |
+
from pdf2image import convert_from_path
|
| 4 |
+
from io import BytesIO
|
| 5 |
+
import time
|
| 6 |
+
from aws import get_region, get_key_id, get_access_key
|
| 7 |
+
from openpyxl import load_workbook
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Module-wide Textract client. The region and credential values are supplied
# by the project-local ``aws`` helper module imported at the top of the file.
textract = boto3.client(
    'textract',
    region_name=get_region(),
    aws_access_key_id=get_key_id(),
    aws_secret_access_key=get_access_key(),
)
|
| 14 |
+
|
| 15 |
+
def analyze_document_local(image_bytes):
    """Send raw image bytes to Textract ``analyze_document`` and return its response.

    The request asks for the ``LAYOUT`` and ``TABLES`` feature types and uses
    the module-level ``textract`` client.
    """
    return textract.analyze_document(
        FeatureTypes=['LAYOUT', 'TABLES'],
        Document={'Bytes': image_bytes},
    )
|
| 22 |
+
|
| 23 |
+
def get_text_from_cell(cell, page):
    """Return the text of a table cell, assembled from its child WORD blocks.

    Args:
        cell: A block dict. Its ``Relationships`` entries of type ``CHILD``
            list the ids of the blocks contained in the cell.
        page: A response dict whose ``Blocks`` list holds the blocks that
            ``cell`` refers to.

    Returns:
        The ``Text`` of every child WORD block, in relationship order, joined
        with single spaces and stripped of surrounding whitespace. Children
        that are absent from ``page`` or are not WORD blocks are skipped, so
        a cell without WORD children yields an empty string.
    """
    child_ids = []
    for relationship in cell.get('Relationships', []):
        if relationship['Type'] == 'CHILD':
            child_ids.extend(relationship['Ids'])
    if not child_ids:
        return ''

    # Index the blocks once so each child id is a dictionary lookup instead
    # of a fresh linear scan of the whole page. ``setdefault`` keeps the
    # first block seen for an id, matching a front-to-back search.
    blocks_by_id = {}
    for block in page['Blocks']:
        blocks_by_id.setdefault(block['Id'], block)

    words = []
    for child_id in child_ids:
        word_block = blocks_by_id.get(child_id)
        if word_block and word_block['BlockType'] == 'WORD':
            words.append(word_block.get('Text', ''))
    return ' '.join(words).strip()
|
| 34 |
+
|
| 35 |
+
def extract_table_data(page_response):
    """Extract every table from one Textract response (one image/page).

    Args:
        page_response: A response dict with a ``Blocks`` list. TABLE blocks
            reference their CELL blocks through ``CHILD`` relationships, and
            each CELL carries ``RowIndex`` and ``ColumnIndex``.

    Returns:
        A list of tables in the order their TABLE blocks appear. Each table
        is a list of rows sorted by row index, and each row is a list of cell
        texts sorted by column index. Every row of a table spans the same
        set of columns: a column that is present in some row but missing in
        another is filled with ``''`` so values stay under the right column.
        A TABLE block without CELL children yields an empty table (``[]``).
    """
    blocks = page_response['Blocks']

    # One id -> block index for the whole page replaces a linear scan per
    # child id. ``setdefault`` keeps the first block seen for an id.
    blocks_by_id = {}
    for block in blocks:
        blocks_by_id.setdefault(block['Id'], block)

    tables_data = []
    for block in blocks:
        if block['BlockType'] != 'TABLE':
            continue

        rows = {}        # row index -> {column index -> cell text}
        columns = set()  # every column index seen anywhere in this table
        for relationship in block.get('Relationships', []):
            if relationship['Type'] != 'CHILD':
                continue
            for child_id in relationship['Ids']:
                cell = blocks_by_id.get(child_id)
                if cell and cell['BlockType'] == 'CELL':
                    col_index = cell['ColumnIndex']
                    cell_text = get_text_from_cell(cell, page_response)
                    rows.setdefault(cell['RowIndex'], {})[col_index] = cell_text
                    columns.add(col_index)

        # Lay every row out over the table-wide column list. Iterating only
        # a row's own keys would shift its values left whenever a cell is
        # missing from that row.
        ordered_columns = sorted(columns)
        table = [
            [rows[row_index].get(col, '') for col in ordered_columns]
            for row_index in sorted(rows)
        ]
        tables_data.append(table)
    return tables_data
|
| 64 |
+
|
| 65 |
+
# Per-template layout, keyed by template file name:
#   (first worksheet row that receives data,
#    number of leading rows dropped from every extracted table).
_TEMPLATE_LAYOUTS = {
    'Transect Datasheets.xlsx': (12, 1),
    'Line Intercept-BB Transect.xlsx': (7, 2),
    'SAV Survey Datasheet.xlsx': (7, 2),
    'Blank Quad Datasheets.xlsx': (5, 1),
}

# Poppler location that used to be hard-coded; still honoured when it exists
# so the original development machine keeps working unchanged.
_LEGACY_POPPLER_PATH = r"C:\Users\sshivlani\Projects\Inspection-Form-Transcriber\poppler-23.07.0\Library\bin"


def _resolve_poppler_path():
    """Return the Poppler ``bin`` directory to use, or None to rely on PATH."""
    configured = os.environ.get('POPPLER_PATH')
    if configured:
        return configured
    if os.path.isdir(_LEGACY_POPPLER_PATH):
        return _LEGACY_POPPLER_PATH
    return None


def ocr_to_excel(pdf_file, template_file):
    """Transcribe the tables of a scanned PDF into copies of an Excel template.

    Each PDF page is rendered to a JPEG at 200 dpi and sent to Textract via
    ``analyze_document_local``. For every table found, the template's first
    worksheet is copied, the copy is titled ``Page_<n>`` (numbered per table,
    starting at 1), and the table is written from column A downward.

    The start row and the number of leading table rows that are skipped
    depend on the template (see ``_TEMPLATE_LAYOUTS``):

    * ``Transect Datasheets.xlsx``: skip 1 row, write from row 12
    * ``Line Intercept-BB Transect.xlsx``: skip 2 rows, write from row 7
    * ``SAV Survey Datasheet.xlsx``: skip 2 rows, write from row 7
    * ``Blank Quad Datasheets.xlsx``: skip 1 row, write from row 5

    Poppler is looked up in the ``POPPLER_PATH`` environment variable, then
    in the legacy development path if that directory exists, and otherwise
    left to pdf2image's default (the system PATH).

    Args:
        pdf_file: Path of the PDF to transcribe. An object exposing the path
            as ``.name`` is also accepted.
        template_file: Path of the Excel template. Its file name selects the
            layout and must be one of the names listed above.

    Returns:
        Path of the saved workbook: the PDF's base name with a ``.xlsx``
        extension, relative to the current working directory.

    Raises:
        ValueError: If ``template_file`` is empty or names an unknown
            template. This is checked before any Textract call is made.
    """
    template_name = os.path.basename(template_file or '')
    layout = _TEMPLATE_LAYOUTS.get(template_name)
    if layout is None:
        raise ValueError(f"Unsupported Excel template: {template_file!r}")
    start_row, skipped_rows = layout

    # NOTE(review): some Gradio versions appear to hand uploads over as a
    # temp-file wrapper rather than a path string - confirm against the
    # pinned version. Real paths (str / PathLike) are used untouched.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, 'name', pdf_file)

    pages = convert_from_path(pdf_path, dpi=200,
                              poppler_path=_resolve_poppler_path())

    all_tables = []
    for page_image in pages:
        buffer = BytesIO()
        # No explicit quality is passed, so the imaging library's default
        # JPEG quality applies.
        page_image.save(buffer, format='JPEG')

        # Call Textract synchronously on this image.
        response = analyze_document_local(buffer.getvalue())
        # Delay to avoid throttling.
        time.sleep(1)
        all_tables.extend(extract_table_data(response))

    wb = load_workbook(template_file)
    # The first worksheet is the template that gets duplicated per table.
    template_sheet = wb.worksheets[0]

    for idx, table in enumerate(all_tables, start=1):
        new_sheet = wb.copy_worksheet(template_sheet)
        new_sheet.title = f"Page_{idx}"
        for r_idx, row in enumerate(table[skipped_rows:]):
            for c_idx, value in enumerate(row, start=1):
                new_sheet.cell(row=start_row + r_idx, column=c_idx, value=value)

    # Drop the pristine template only when at least one filled copy exists.
    # Removing it unconditionally could leave a workbook with no sheets.
    if all_tables:
        wb.remove(template_sheet)

    # Output file: the PDF's base name with a .xlsx extension.
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = base_name + ".xlsx"
    # A PDF named like the template must not overwrite the template itself.
    if os.path.exists(output_path) and os.path.samefile(output_path, template_file):
        output_path = base_name + "_output.xlsx"
    wb.save(output_path)
    return output_path
|
| 142 |
+
|
| 143 |
+
# Input widgets, created in the order they are passed to ocr_to_excel.
_pdf_input = gr.File(label="PDF for OCR")
_template_input = gr.Dropdown(
    choices=[
        "Transect Datasheets.xlsx",
        "Line Intercept-BB Transect.xlsx",
        "SAV Survey Datasheet.xlsx",
        "Blank Quad Datasheets.xlsx",
    ],
    label="Excel Template",
)

# Web UI: a PDF upload plus a template choice go in, the filled workbook
# produced by ocr_to_excel comes out.
iface = gr.Interface(
    fn=ocr_to_excel,
    inputs=[_pdf_input, _template_input],
    outputs=gr.File(label="Output Excel File"),
    title="Handwritten Datasheets to Excel File (.xlsx)",
    description="Upload a PDF file and select an Excel template. The OCR data will be appended to duplicated copies of the template sheet, and the resulting Excel file will be returned.",
)

# Start the app as soon as this module is executed.
iface.launch()
|
requirements (3).txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
pdf2image
|
| 3 |
+
boto3
|
| 4 |
+
openpyxl
|