File size: 1,865 Bytes
4446c35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dcdd550
 
 
4446c35
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import pandas as pd
from io import BytesIO
import io

# Table extraction via Azure Form Recognizer; the Azure endpoint and key are configured inside detect_tables().
def detect_tables(pdf_source):
    """Extract every table from a PDF and return them as an Excel workbook.

    The PDF is analysed with Azure Form Recognizer's ``prebuilt-layout``
    model. Each detected table becomes one worksheet named ``Table_<n>``
    (1-based, in the order the service reports the tables). Cell text is
    placed by its row/column index; positions with no cell stay as empty
    strings. Every sheet gets an extra ``page_number`` column holding the
    page of the table's first bounding region (``None`` if the service
    reported no region).

    The endpoint and key can be overridden with the environment variables
    ``AZURE_FORM_RECOGNIZER_ENDPOINT`` and ``AZURE_FORM_RECOGNIZER_KEY``.

    Args:
        pdf_source: Raw PDF content as ``bytes``/``bytearray``, or a path
            that ``open()`` accepts.

    Returns:
        A ``BytesIO`` rewound to offset 0 containing the ``.xlsx`` workbook,
        or ``None`` when no table with at least one cell was detected.
    """
    import os  # function-scope import so the block stays self-contained

    # SECURITY: the literals below are credentials committed to source
    # control. They are kept only as a fallback so existing deployments keep
    # working; rotate the key and supply both values through the environment.
    endpoint = (
        os.environ.get("AZURE_FORM_RECOGNIZER_ENDPOINT")
        or "https://tableextractiontsa.openai.azure.com"
    )
    key = (
        os.environ.get("AZURE_FORM_RECOGNIZER_KEY")
        or "9JusYnfKj4av5PQDOBpfZOj77NF88r2xcvfXNW9D7Or2Bk0F4dCpJQQJ99BDACYeBjFXJ3w3AAAAACOGVOJr"
    )

    # 1. Load the PDF: in-memory bytes are wrapped, anything else is a path.
    if isinstance(pdf_source, (bytes, bytearray)):
        stream = BytesIO(pdf_source)
    else:
        stream = open(pdf_source, "rb")

    # 2. Call Azure Form Recognizer. One client is enough; both the stream
    #    and the client's HTTP session are released when the block exits,
    #    even if the analysis raises.
    with stream, DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    ) as client:
        poller = client.begin_analyze_document("prebuilt-layout", document=stream)
        result = poller.result()

    # 3. Build one DataFrame per table.
    frames = []
    for table in result.tables or []:
        cells = table.cells
        if not cells:
            # max() over no cells would raise ValueError; an empty table
            # carries no data worth a worksheet.
            continue

        n_rows = max(cell.row_index for cell in cells) + 1
        n_cols = max(cell.column_index for cell in cells) + 1
        grid = [[""] * n_cols for _ in range(n_rows)]
        for cell in cells:
            grid[cell.row_index][cell.column_index] = cell.content

        frame = pd.DataFrame(grid)
        regions = table.bounding_regions
        frame["page_number"] = regions[0].page_number if regions else None
        frames.append(frame)

    if not frames:
        return None

    # 4. Write all tables into an in-memory workbook, one sheet each.
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        for idx, frame in enumerate(frames, start=1):
            frame.to_excel(writer, sheet_name=f"Table_{idx}", index=False)
    excel_buffer.seek(0)
    return excel_buffer