MeasurementOrg

Sleeping

App Files Files Community

Marthee committed on May 2, 2025

Commit

4446c35

verified ·

1 Parent(s): 8cb34de

Upload Azure_api.py

Browse files

Files changed (1) hide show

Azure_api.py +49 -0

Azure_api.py ADDED Viewed

	@@ -0,0 +1,49 @@

+from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.core.credentials import AzureKeyCredential
+import pandas as pd
+from io import BytesIO
+import io
+# Extract tables from a PDF with Azure Form Recognizer; credentials are resolved inside detect_tables.
+def detect_tables(pdf_source):
+    endpoint = "https://tableextractiontsa.openai.azure.com"
+    key = "9JusYnfKj4av5PQDOBpfZOj77NF88r2xcvfXNW9D7Or2Bk0F4dCpJQQJ99BDACYeBjFXJ3w3AAAAACOGVOJr"
+    # Create client
+    client = DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+    # Load your PDF
+    if isinstance(pdf_source, (bytes, bytearray)):
+        stream = io.BytesIO(pdf_source)
+    else:
+        stream = open(pdf_source, "rb")
+    # 2. Call Azure Form Recognizer
+    with stream:
+        client = DocumentAnalysisClient(endpoint=endpoint,
+                                        credential=AzureKeyCredential(key))
+        poller = client.begin_analyze_document("prebuilt-layout",
+                                               document=stream)
+        result = poller.result()
+    # 3. Build DataFrames
+    tables = []
+    for table in result.tables:
+        cols = max(c.column_index for c in table.cells) + 1
+        rows = max(c.row_index for c in table.cells) + 1
+        grid = [["" for _ in range(cols)] for _ in range(rows)]
+        for c in table.cells:
+            grid[c.row_index][c.column_index] = c.content
+        df = pd.DataFrame(grid)
+        df["page_number"] = table.bounding_regions[0].page_number
+        tables.append(df)
+    # 4. Write all sheets into a BytesIO
+    excel_buffer = BytesIO()
+    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
+        for idx, df in enumerate(tables, start=1):
+            sheet = f"Table_{idx}"
+            df.to_excel(writer, sheet_name=sheet, index=False)
+    excel_buffer.seek(0)
+    return excel_buffer