import gradio as gr import boto3 from pdf2image import convert_from_path from io import BytesIO import time from aws import get_region, get_key_id, get_access_key from openpyxl import load_workbook import os textract = boto3.client('textract', region_name=get_region(), aws_access_key_id=get_key_id(), aws_secret_access_key=get_access_key()) def analyze_document_local(image_bytes): """Calls Textract's analyze_document API using the local image bytes.""" response = textract.analyze_document( Document={'Bytes': image_bytes}, FeatureTypes=['LAYOUT', 'TABLES'] ) return response def get_text_from_cell(cell, page): """Extracts the text from a table cell by looking up child WORD blocks.""" text = '' for relationship in cell.get('Relationships', []): if relationship['Type'] == 'CHILD': for child_id in relationship['Ids']: # Find the child block in the page word_block = next((b for b in page['Blocks'] if b['Id'] == child_id), None) if word_block and word_block['BlockType'] == 'WORD': text += word_block.get('Text', '') + ' ' return text.strip() def extract_table_data(page_response): """ Extracts table data from a single Textract response (representing one image/page). Returns a list of tables (each table is represented as a list of rows). """ blocks = page_response['Blocks'] tables_data = [] for block in blocks: if block['BlockType'] == 'TABLE': table = [] rows = {} # Get each cell in the table for relationship in block.get('Relationships', []): if relationship['Type'] == 'CHILD': for child_id in relationship['Ids']: cell = next((b for b in blocks if b['Id'] == child_id), None) if cell and cell['BlockType'] == 'CELL': row_index = cell['RowIndex'] col_index = cell['ColumnIndex'] cell_text = get_text_from_cell(cell, page_response) if row_index not in rows: rows[row_index] = {} rows[row_index][col_index] = cell_text # Create a sorted table by row and column indices for row in sorted(rows.keys()): sorted_row = [rows[row].get(col, '') for col in sorted(rows[row].keys())] table.append(sorted_row) tables_data.append(table) return tables_data def ocr_to_excel(pdf_file, template_file): """ Given a PDF file and an Excel template file, converts the PDF pages to images, extracts table data using AWS Textract (skipping the first two rows of each table), then creates a new Excel file by copying the template sheet for each table and writing the filtered data starting at cell A7. The output file is saved with the same base name as the PDF but with a .xlsx extension. """ pages = convert_from_path(pdf_file, dpi=200, poppler_path=r"C:\Users\sshivlani\Projects\Inspection-Form-Transcriber\poppler-23.07.0\Library\bin") # Adjust later all_tables = [] for i, page_image in enumerate(pages): buffer = BytesIO() # Save image as JPEG (quality=90) page_image.save(buffer, format='JPEG') image_bytes = buffer.getvalue() # Call Textract synchronously on this image response = analyze_document_local(image_bytes) # Delay to avoid throttling time.sleep(1) tables = extract_table_data(response) if tables: all_tables.extend(tables) # Load the Excel template workbook wb = load_workbook(template_file) # Use the first worksheet as the template sheet template_sheet = wb.worksheets[0] if (template_file == 'Transect Datasheets.xlsx'): for idx, table in enumerate(all_tables, start=1): new_sheet = wb.copy_worksheet(template_sheet) new_sheet.title = f"Page_{idx}" start_row = 12 # Data will start at row 7 (i.e. cell A7) # Exclude the first two rows of the extracted table data data_rows = table[1:] for r_idx, row in enumerate(data_rows): for c_idx, value in enumerate(row): new_sheet.cell(row=start_row + r_idx, column=c_idx + 1, value=value) if (template_file == 'Line Intercept-BB Transect.xlsx' or template_file == 'SAV Survey Datasheet.xlsx'): for idx, table in enumerate(all_tables, start=1): new_sheet = wb.copy_worksheet(template_sheet) new_sheet.title = f"Page_{idx}" start_row = 7 # Data will start at row 7 (i.e. cell A7) # Exclude the first two rows of the extracted table data data_rows = table[2:] for r_idx, row in enumerate(data_rows): for c_idx, value in enumerate(row): new_sheet.cell(row=start_row + r_idx, column=c_idx + 1, value=value) if (template_file == 'Blank Quad Datasheets.xlsx'): for idx, table in enumerate(all_tables, start=1): new_sheet = wb.copy_worksheet(template_sheet) new_sheet.title = f"Page_{idx}" start_row = 5 # Data will start at row 7 (i.e. cell A7) # Exclude the first two rows of the extracted table data data_rows = table[1:] for r_idx, row in enumerate(data_rows): for c_idx, value in enumerate(row): new_sheet.cell(row=start_row + r_idx, column=c_idx + 1, value=value) wb.remove(template_sheet) # Construct output filename: same as PDF's base name with .xlsx extension base_name = os.path.splitext(os.path.basename(pdf_file))[0] output_path = base_name + ".xlsx" wb.save(output_path) return output_path iface = gr.Interface( fn=ocr_to_excel, inputs=[ gr.File(label="PDF for OCR"), gr.Dropdown( choices=["Transect Datasheets.xlsx", "Line Intercept-BB Transect.xlsx", "SAV Survey Datasheet.xlsx", "Blank Quad Datasheets.xlsx"], label="Excel Template" ) ], outputs=gr.File(label="Output Excel File"), title="Handwritten Datasheets to Excel File (.xlsx)", description="Upload a PDF file and select an Excel template. The OCR data will be appended to duplicated copies of the template sheet, and the resulting Excel file will be returned." ) iface.launch()