Marthee committed on
Commit
4446c35
·
verified ·
1 Parent(s): 8cb34de

Upload Azure_api.py

Browse files
Files changed (1) hide show
  1. Azure_api.py +49 -0
Azure_api.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from azure.ai.formrecognizer import DocumentAnalysisClient
2
+ from azure.core.credentials import AzureKeyCredential
3
+ import pandas as pd
4
+ from io import BytesIO
5
+ import io
6
+
7
+ # Set your Azure credentials
8
def detect_tables(pdf_source, endpoint=None, key=None):
    """Extract the tables of a PDF into an in-memory Excel workbook.

    The document is sent to Azure Form Recognizer's ``prebuilt-layout``
    model; each table it reports becomes one worksheet named ``Table_<n>``
    (1-based, in the order the service returns them) with an extra
    ``page_number`` column.

    Args:
        pdf_source: Raw PDF content as ``bytes``/``bytearray``, or anything
            accepted by ``open(..., "rb")`` (typically a file path).
        endpoint: Service endpoint URL. Falls back to the
            ``AZURE_FORM_RECOGNIZER_ENDPOINT`` environment variable, then to
            the endpoint this module has always used.
        key: Service API key. Falls back to the
            ``AZURE_FORM_RECOGNIZER_KEY`` environment variable.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the ``.xlsx`` data.

    Raises:
        ValueError: If no API key is supplied by argument or environment.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    import os

    # NOTE(review): the default host is an *.openai.azure.com address; confirm
    # it really serves the Form Recognizer / Document Intelligence API.
    endpoint = endpoint or os.environ.get(
        "AZURE_FORM_RECOGNIZER_ENDPOINT",
        "https://tableextractiontsa.openai.azure.com",
    )
    # Secrets must never live in source control: the key is injected by the
    # caller or the environment instead of being hard-coded here.
    key = key or os.environ.get("AZURE_FORM_RECOGNIZER_KEY")
    if not key:
        raise ValueError(
            "Azure API key missing: pass key=... or set the "
            "AZURE_FORM_RECOGNIZER_KEY environment variable."
        )

    # Build the client once, before any file handle is opened.
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    if isinstance(pdf_source, (bytes, bytearray)):
        stream = io.BytesIO(pdf_source)
    else:
        stream = open(pdf_source, "rb")

    # The with-block closes the stream even if the analysis call fails.
    with stream:
        poller = client.begin_analyze_document("prebuilt-layout", document=stream)
        result = poller.result()

    # `tables` may be absent on the result when nothing was detected.
    frames = [_table_to_dataframe(table) for table in (result.tables or [])]
    return _dataframes_to_excel(frames)


def _table_to_dataframe(table):
    """Lay a detected table's cells out on a grid and return it as a DataFrame."""
    cells = table.cells or []
    # default=-1 yields a 0x0 grid for a table that reports no cells instead
    # of letting max() raise on an empty sequence.
    n_rows = max((cell.row_index for cell in cells), default=-1) + 1
    n_cols = max((cell.column_index for cell in cells), default=-1) + 1

    grid = [[""] * n_cols for _ in range(n_rows)]
    for cell in cells:
        grid[cell.row_index][cell.column_index] = cell.content

    df = pd.DataFrame(grid)
    # Page of the table's first bounding region; None when no region exists.
    regions = table.bounding_regions
    df["page_number"] = regions[0].page_number if regions else None
    return df


def _dataframes_to_excel(frames):
    """Write each DataFrame to its own sheet and return the workbook as BytesIO."""
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        if not frames:
            # openpyxl cannot save a workbook without at least one sheet, so
            # a document with no tables still gets a valid (empty) workbook.
            pd.DataFrame().to_excel(writer, sheet_name="No_Tables", index=False)
        for idx, df in enumerate(frames, start=1):
            df.to_excel(writer, sheet_name=f"Table_{idx}", index=False)
    excel_buffer.seek(0)
    return excel_buffer
49
+