Spaces:

Marthee
/

Measurement

Paused

File size: 4,250 Bytes

import io
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import pandas as pd
from io import BytesIO

def detect_tables(pdflist, pdfnames):
    """
    - pdflist:  a list of PDF bytes (each element is a bytes or bytearray object).
    - pdfnames: a list of strings, where pdfnames[i] is the path or name for pdflist[i].
    Both lists must have the same length.

    The function:
      1. Calls Azure Form Recognizer (prebuilt-layout) on each PDF bytes.
      2. Extracts all tables, adding columns: 'pdf_name', 'table_id', 'page_number'.
      3. Concatenates everything into one Excel worksheet named "Tables", leaving two blank rows between each PDF’s block.
      4. Returns a BytesIO buffer containing the .xlsx. If no tables are found, returns None.
    """

    # 1. Validate inputs
    if not isinstance(pdflist, (list, tuple)) or not isinstance(pdfnames, (list, tuple)):
        raise ValueError("Both pdflist and pdfnames must be lists (or tuples).")
    if len(pdflist) != len(pdfnames):
        raise ValueError("pdflist and pdfnames must have the same length.")

    # 2. Set up Azure Form Recognizer client
    endpoint = "https://tabledetection2.cognitiveservices.azure.com/"
    key = "5lr94dODMJihbGOMw2Vdz29zXRBiqt528fSGoGmzSJHTrWtHSnRdJQQJ99BEACYeBjFXJ3w3AAALACOGBANH"
    credential = AzureKeyCredential(key)
    client     = DocumentAnalysisClient(endpoint=endpoint, credential=credential)

    tables_by_pdf = []

    # 3. Loop over each PDF-bytes / name pair
    for pdf_bytes, pdf_path in zip(pdflist, pdfnames):
        # Skip anything that isn’t raw bytes or whose name isn’t a string
        if not isinstance(pdf_bytes, (bytes, bytearray)) or not isinstance(pdf_path, str):
            continue

        # Extract the filename from the path
        pdf_name = os.path.basename(pdf_path)

        stream = io.BytesIO(pdf_bytes)
        per_pdf_tables = []

        # Call Form Recognizer on this PDF bytes
        poller = client.begin_analyze_document("prebuilt-layout", document=stream)
        result = poller.result()

        # Extract every table as a DataFrame
        for table_idx, table in enumerate(result.tables, start=1):
            # Determine the grid size
            cols = max(cell.column_index for cell in table.cells) + 1
            rows = max(cell.row_index   for cell in table.cells) + 1
            grid = [["" for _ in range(cols)] for _ in range(rows)]

            for cell in table.cells:
                grid[cell.row_index][cell.column_index] = cell.content

            df = pd.DataFrame(grid)
            df["page_number"] = table.bounding_regions[0].page_number
            df["table_id"]     = table_idx
            df["pdf_name"]     = pdf_name
            df = df.replace(r':+(?:selected|unselected):*', '', regex=True)


            per_pdf_tables.append(df)

        if per_pdf_tables:
            tables_by_pdf.append((pdf_name, per_pdf_tables))

    # If no tables at all, return None
    if not tables_by_pdf:
        return None

    # 4. Write all tables into one sheet, with 2 blank rows between PDFs
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        sheet_name  = "Tables"
        current_row = 0
        first_block = True

        for pdf_name, dfs in tables_by_pdf:
            for df in dfs:
                # Only write headers on the very first table in the sheet
                write_header = first_block and (current_row == 0)

                df.to_excel(
                    writer,
                    sheet_name=sheet_name,
                    index=False,
                    header=write_header,
                    startrow=current_row
                )

                # Advance current_row by the number of rows written:
                #   • df.shape[0] data rows
                #   • +1 if header was written
                rows_written = df.shape[0] + (1 if write_header else 0)
                current_row += rows_written
                first_block = False

            # After finishing this PDF’s tables, insert two blank rows
            current_row += 2

    excel_buffer.seek(0)
    return excel_buffer