import io
import os
from io import BytesIO
from typing import Optional, Union

import pandas as pd
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Environment variables consulted (in preference to the legacy defaults below)
# when the caller does not pass ``endpoint`` / ``key`` explicitly.
_ENDPOINT_ENV_VAR = "AZURE_FORM_RECOGNIZER_ENDPOINT"
_KEY_ENV_VAR = "AZURE_FORM_RECOGNIZER_KEY"

# SECURITY: these values were hard-coded in the original source and are kept
# ONLY as a last-resort fallback so existing deployments keep working.
# A key committed to source control must be treated as leaked: rotate it,
# supply the new one through the environment variables above, and then delete
# these two constants.
# NOTE(review): the host is an ``openai.azure.com`` resource, which is unusual
# for a Form Recognizer / Document Intelligence endpoint - confirm it is right.
_LEGACY_ENDPOINT = "https://tableextractiontsa.openai.azure.com"
_LEGACY_KEY = "9JusYnfKj4av5PQDOBpfZOj77NF88r2xcvfXNW9D7Or2Bk0F4dCpJQQJ99BDACYeBjFXJ3w3AAAAACOGVOJr"


def _table_to_dataframe(table) -> Optional[pd.DataFrame]:
    """Lay one analysed table out as a DataFrame, or return None if it has no cells."""
    cells = table.cells
    if not cells:
        # max() over an empty sequence would raise ValueError; there is
        # nothing to export for such a table anyway.
        return None

    n_rows = max(cell.row_index for cell in cells) + 1
    n_cols = max(cell.column_index for cell in cells) + 1

    # Start from an all-blank grid so positions the service did not report
    # stay as empty strings.
    grid = [[""] * n_cols for _ in range(n_rows)]
    for cell in cells:
        grid[cell.row_index][cell.column_index] = cell.content

    frame = pd.DataFrame(grid)

    # The page is taken from the first bounding region; fall back to None
    # instead of raising when the table carries no region information.
    regions = table.bounding_regions
    frame["page_number"] = regions[0].page_number if regions else None
    return frame


def _frames_to_excel(frames) -> BytesIO:
    """Write each DataFrame to its own ``Table_<n>`` sheet of an in-memory workbook."""
    buffer = BytesIO()
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        for idx, frame in enumerate(frames, start=1):
            frame.to_excel(writer, sheet_name=f"Table_{idx}", index=False)
    # Rewind so the caller can read the workbook from the beginning.
    buffer.seek(0)
    return buffer


def detect_tables(
    pdf_source: Union[bytes, bytearray, str, os.PathLike],
    *,
    endpoint: Optional[str] = None,
    key: Optional[str] = None,
) -> Optional[BytesIO]:
    """Extract the tables of a PDF into an in-memory Excel workbook.

    The document is analysed with the ``prebuilt-layout`` model through
    ``DocumentAnalysisClient``. Every detected table that has at least one
    cell becomes one sheet named ``Table_<n>`` (numbered from 1 in detection
    order); each sheet holds the cell contents laid out by row/column index
    plus a ``page_number`` column.

    Args:
        pdf_source: Raw PDF content (``bytes`` / ``bytearray``) or a path to
            a PDF file, which is opened in binary mode.
        endpoint: Service endpoint. Defaults to the
            ``AZURE_FORM_RECOGNIZER_ENDPOINT`` environment variable, then to
            the legacy value embedded in this module.
        key: API key. Defaults to the ``AZURE_FORM_RECOGNIZER_KEY``
            environment variable, then to the legacy value embedded in this
            module.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ``.xlsx``
        workbook, or ``None`` when no usable table was found.

    Raises:
        OSError: If ``pdf_source`` is a path that cannot be opened.
        Errors raised by the Azure client are propagated unchanged.
    """
    resolved_endpoint = endpoint or os.environ.get(_ENDPOINT_ENV_VAR) or _LEGACY_ENDPOINT
    resolved_key = key or os.environ.get(_KEY_ENV_VAR) or _LEGACY_KEY

    # Build the client once, and before opening the file, so a failure here
    # cannot leave an open file handle behind.
    client = DocumentAnalysisClient(
        endpoint=resolved_endpoint,
        credential=AzureKeyCredential(resolved_key),
    )

    if isinstance(pdf_source, (bytes, bytearray)):
        stream = io.BytesIO(pdf_source)
    else:
        stream = open(pdf_source, "rb")

    # The stream is closed as soon as the analysis result is available.
    with stream:
        poller = client.begin_analyze_document("prebuilt-layout", document=stream)
        result = poller.result()

    frames = []
    for table in result.tables or []:
        frame = _table_to_dataframe(table)
        if frame is not None:
            frames.append(frame)

    if not frames:
        return None
    return _frames_to_excel(frames)