# MeasurementDUPLICATE / Azure_api.py
# Marthee's picture
# Update Azure_api.py
# dcdd550 verified
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
import pandas as pd
from io import BytesIO
import io
# Set your Azure credentials
def detect_tables(pdf_source, *, endpoint=None, key=None):
    """Extract all tables from a PDF and return them as an Excel workbook.

    The document is analyzed with the Azure Form Recognizer
    ``prebuilt-layout`` model. Each detected table becomes one worksheet
    named ``Table_<n>`` (1-based, in detection order) holding the cell text
    laid out by row/column index plus a ``page_number`` column.

    Args:
        pdf_source: Either the raw PDF content (``bytes`` / ``bytearray``)
            or a filesystem path that can be passed to ``open``.
        endpoint: Optional service endpoint. Defaults to the
            ``AZURE_FORM_RECOGNIZER_ENDPOINT`` environment variable, then
            to the legacy built-in endpoint.
        key: Optional API key. Defaults to the ``AZURE_FORM_RECOGNIZER_KEY``
            environment variable, then to the legacy built-in key.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing the ``.xlsx``
        workbook, or ``None`` when no usable table was detected.
    """
    import os  # local import: only needed for the credential lookup below

    # NOTE(security): the literal fallbacks are kept solely so existing
    # deployments keep working. A key committed to source control should be
    # treated as compromised: rotate it, supply the new one through the
    # environment (or the `key` argument), and then delete these literals.
    endpoint = (
        endpoint
        or os.environ.get("AZURE_FORM_RECOGNIZER_ENDPOINT")
        or "https://tableextractiontsa.openai.azure.com"
    )
    key = (
        key
        or os.environ.get("AZURE_FORM_RECOGNIZER_KEY")
        or "9JusYnfKj4av5PQDOBpfZOj77NF88r2xcvfXNW9D7Or2Bk0F4dCpJQQJ99BDACYeBjFXJ3w3AAAAACOGVOJr"
    )

    # 1. Create the client once (before opening any file, so a failure here
    #    cannot leak an open handle).
    client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )

    # 2. Load the PDF: in-memory bytes are wrapped, anything else is
    #    treated as a path.
    if isinstance(pdf_source, (bytes, bytearray)):
        stream = io.BytesIO(pdf_source)
    else:
        stream = open(pdf_source, "rb")

    # 3. Call Azure Form Recognizer; the stream is closed as soon as the
    #    analysis result is available.
    with stream:
        poller = client.begin_analyze_document(
            "prebuilt-layout",
            document=stream,
        )
        result = poller.result()

    # 4. Build one DataFrame per table.
    tables = []
    for table in result.tables or []:
        cells = table.cells
        if not cells:
            # max() over an empty sequence raises ValueError; a table
            # without cells has nothing to export anyway.
            continue

        n_rows = max(cell.row_index for cell in cells) + 1
        n_cols = max(cell.column_index for cell in cells) + 1
        grid = [[""] * n_cols for _ in range(n_rows)]
        for cell in cells:
            grid[cell.row_index][cell.column_index] = cell.content

        df = pd.DataFrame(grid)
        # bounding_regions may be empty; avoid an IndexError in that case.
        regions = table.bounding_regions
        df["page_number"] = regions[0].page_number if regions else None
        tables.append(df)

    if not tables:
        return None

    # 5. Write every table to its own sheet of an in-memory workbook.
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        for idx, df in enumerate(tables, start=1):
            df.to_excel(writer, sheet_name=f"Table_{idx}", index=False)

    # Rewind so callers can read the workbook from the start.
    excel_buffer.seek(0)
    return excel_buffer