Spaces:

edwinrajeev
/

Environmental_Textract

Runtime error

+import gradio as gr
+import boto3
+from pdf2image import convert_from_path
+from io import BytesIO
+import time
+from aws import get_region, get_key_id, get_access_key
+from openpyxl import load_workbook
+import os
+textract = boto3.client('textract',
+                        region_name=get_region(),
+                        aws_access_key_id=get_key_id(),
+                        aws_secret_access_key=get_access_key())
+def analyze_document_local(image_bytes):
+    """Calls Textract's analyze_document API using the local image bytes."""
+    response = textract.analyze_document(
+        Document={'Bytes': image_bytes},
+        FeatureTypes=['LAYOUT', 'TABLES']
+    )
+    return response
+def get_text_from_cell(cell, page):
+    """Extracts the text from a table cell by looking up child WORD blocks."""
+    text = ''
+    for relationship in cell.get('Relationships', []):
+        if relationship['Type'] == 'CHILD':
+            for child_id in relationship['Ids']:
+                # Find the child block in the page
+                word_block = next((b for b in page['Blocks'] if b['Id'] == child_id), None)
+                if word_block and word_block['BlockType'] == 'WORD':
+                    text += word_block.get('Text', '') + ' '
+    return text.strip()
+def extract_table_data(page_response):
+    """
+    Extracts table data from a single Textract response (representing one image/page).
+    Returns a list of tables (each table is represented as a list of rows).
+    """
+    blocks = page_response['Blocks']
+    tables_data = []
+    for block in blocks:
+        if block['BlockType'] == 'TABLE':
+            table = []
+            rows = {}
+            # Get each cell in the table
+            for relationship in block.get('Relationships', []):
+                if relationship['Type'] == 'CHILD':
+                    for child_id in relationship['Ids']:
+                        cell = next((b for b in blocks if b['Id'] == child_id), None)
+                        if cell and cell['BlockType'] == 'CELL':
+                            row_index = cell['RowIndex']
+                            col_index = cell['ColumnIndex']
+                            cell_text = get_text_from_cell(cell, page_response)
+                            if row_index not in rows:
+                                rows[row_index] = {}
+                            rows[row_index][col_index] = cell_text
+            # Create a sorted table by row and column indices
+            for row in sorted(rows.keys()):
+                sorted_row = [rows[row].get(col, '') for col in sorted(rows[row].keys())]
+                table.append(sorted_row)
+            tables_data.append(table)
+    return tables_data
+def ocr_to_excel(pdf_file, template_file):
+    """
+    Given a PDF file and an Excel template file, converts the PDF pages to images,
+    extracts table data using AWS Textract (skipping the first two rows of each table),
+    then creates a new Excel file by copying the template sheet for each table
+    and writing the filtered data starting at cell A7.
+    The output file is saved with the same base name as the PDF but with a .xlsx extension.
+    """
+    pages = convert_from_path(pdf_file, dpi=200,
+                              poppler_path=r"C:\Users\sshivlani\Projects\Inspection-Form-Transcriber\poppler-23.07.0\Library\bin") # Adjust later
+    all_tables = []
+    for i, page_image in enumerate(pages):
+        buffer = BytesIO()
+        # Save image as JPEG (quality=90)
+        page_image.save(buffer, format='JPEG')
+        image_bytes = buffer.getvalue()
+        # Call Textract synchronously on this image
+        response = analyze_document_local(image_bytes)
+        # Delay to avoid throttling
+        time.sleep(1)
+        tables = extract_table_data(response)
+        if tables:
+            all_tables.extend(tables)
+    # Load the Excel template workbook
+    wb = load_workbook(template_file)
+    # Use the first worksheet as the template sheet
+    template_sheet = wb.worksheets[0]
+    if (template_file == 'Transect Datasheets.xlsx'):
+        for idx, table in enumerate(all_tables, start=1):
+            new_sheet = wb.copy_worksheet(template_sheet)
+            new_sheet.title = f"Page_{idx}"
+            start_row = 12  # Data will start at row 7 (i.e. cell A7)
+            # Exclude the first two rows of the extracted table data
+            data_rows = table[1:]
+            for r_idx, row in enumerate(data_rows):
+                for c_idx, value in enumerate(row):
+                    new_sheet.cell(row=start_row + r_idx, column=c_idx + 1, value=value)
+    if (template_file == 'Line Intercept-BB Transect.xlsx' or template_file == 'SAV Survey Datasheet.xlsx'):
+        for idx, table in enumerate(all_tables, start=1):
+            new_sheet = wb.copy_worksheet(template_sheet)
+            new_sheet.title = f"Page_{idx}"
+            start_row = 7  # Data will start at row 7 (i.e. cell A7)
+            # Exclude the first two rows of the extracted table data
+            data_rows = table[2:]
+            for r_idx, row in enumerate(data_rows):
+                for c_idx, value in enumerate(row):
+                    new_sheet.cell(row=start_row + r_idx, column=c_idx + 1, value=value)
+    if (template_file == 'Blank Quad Datasheets.xlsx'):
+        for idx, table in enumerate(all_tables, start=1):
+            new_sheet = wb.copy_worksheet(template_sheet)
+            new_sheet.title = f"Page_{idx}"
+            start_row = 5  # Data will start at row 7 (i.e. cell A7)
+            # Exclude the first two rows of the extracted table data
+            data_rows = table[1:]
+            for r_idx, row in enumerate(data_rows):
+                for c_idx, value in enumerate(row):
+                    new_sheet.cell(row=start_row + r_idx, column=c_idx + 1, value=value)
+    wb.remove(template_sheet)
+    # Construct output filename: same as PDF's base name with .xlsx extension
+    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
+    output_path = base_name + ".xlsx"
+    wb.save(output_path)
+    return output_path
+iface = gr.Interface(
+    fn=ocr_to_excel,
+    inputs=[
+        gr.File(label="PDF for OCR"),
+        gr.Dropdown(
+            choices=["Transect Datasheets.xlsx", "Line Intercept-BB Transect.xlsx", "SAV Survey Datasheet.xlsx", "Blank Quad Datasheets.xlsx"],
+            label="Excel Template"
+        )
+    ],
+    outputs=gr.File(label="Output Excel File"),
+    title="Handwritten Datasheets to Excel File (.xlsx)",
+    description="Upload a PDF file and select an Excel template. The OCR data will be appended to duplicated copies of the template sheet, and the resulting Excel file will be returned."
+)
+iface.launch()

requirements (3).txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+pdf2image
+boto3
+openpyxl