File size: 4,250 Bytes
86c797e
 
4446c35
86c797e
4446c35
 
 
86c797e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4446c35
 
86c797e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcdd550
86c797e
 
4446c35
 
86c797e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4446c35
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import io
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
import pandas as pd
from io import BytesIO

def detect_tables(pdflist, pdfnames):
    """
    - pdflist:  a list of PDF bytes (each element is a bytes or bytearray object).
    - pdfnames: a list of strings, where pdfnames[i] is the path or name for pdflist[i].
    Both lists must have the same length.

    The function:
      1. Calls Azure Form Recognizer (prebuilt-layout) on each PDF bytes.
      2. Extracts all tables, adding columns: 'pdf_name', 'table_id', 'page_number'.
      3. Concatenates everything into one Excel worksheet named "Tables", leaving two blank rows between each PDF’s block.
      4. Returns a BytesIO buffer containing the .xlsx. If no tables are found, returns None.
    """

    # 1. Validate inputs
    if not isinstance(pdflist, (list, tuple)) or not isinstance(pdfnames, (list, tuple)):
        raise ValueError("Both pdflist and pdfnames must be lists (or tuples).")
    if len(pdflist) != len(pdfnames):
        raise ValueError("pdflist and pdfnames must have the same length.")

    # 2. Set up Azure Form Recognizer client
    endpoint = "https://tabledetection2.cognitiveservices.azure.com/"
    key = "5lr94dODMJihbGOMw2Vdz29zXRBiqt528fSGoGmzSJHTrWtHSnRdJQQJ99BEACYeBjFXJ3w3AAALACOGBANH"
    credential = AzureKeyCredential(key)
    client     = DocumentAnalysisClient(endpoint=endpoint, credential=credential)

    tables_by_pdf = []

    # 3. Loop over each PDF-bytes / name pair
    for pdf_bytes, pdf_path in zip(pdflist, pdfnames):
        # Skip anything that isn’t raw bytes or whose name isn’t a string
        if not isinstance(pdf_bytes, (bytes, bytearray)) or not isinstance(pdf_path, str):
            continue

        # Extract the filename from the path
        pdf_name = os.path.basename(pdf_path)

        stream = io.BytesIO(pdf_bytes)
        per_pdf_tables = []

        # Call Form Recognizer on this PDF bytes
        poller = client.begin_analyze_document("prebuilt-layout", document=stream)
        result = poller.result()

        # Extract every table as a DataFrame
        for table_idx, table in enumerate(result.tables, start=1):
            # Determine the grid size
            cols = max(cell.column_index for cell in table.cells) + 1
            rows = max(cell.row_index   for cell in table.cells) + 1
            grid = [["" for _ in range(cols)] for _ in range(rows)]

            for cell in table.cells:
                grid[cell.row_index][cell.column_index] = cell.content

            df = pd.DataFrame(grid)
            df["page_number"] = table.bounding_regions[0].page_number
            df["table_id"]     = table_idx
            df["pdf_name"]     = pdf_name
            df = df.replace(r':+(?:selected|unselected):*', '', regex=True)


            per_pdf_tables.append(df)

        if per_pdf_tables:
            tables_by_pdf.append((pdf_name, per_pdf_tables))

    # If no tables at all, return None
    if not tables_by_pdf:
        return None

    # 4. Write all tables into one sheet, with 2 blank rows between PDFs
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        sheet_name  = "Tables"
        current_row = 0
        first_block = True

        for pdf_name, dfs in tables_by_pdf:
            for df in dfs:
                # Only write headers on the very first table in the sheet
                write_header = first_block and (current_row == 0)

                df.to_excel(
                    writer,
                    sheet_name=sheet_name,
                    index=False,
                    header=write_header,
                    startrow=current_row
                )

                # Advance current_row by the number of rows written:
                #   • df.shape[0] data rows
                #   • +1 if header was written
                rows_written = df.shape[0] + (1 if write_header else 0)
                current_row += rows_written
                first_block = False

            # After finishing this PDF’s tables, insert two blank rows
            current_row += 2

    excel_buffer.seek(0)
    return excel_buffer