Marthee committed on
Commit
86c797e
·
verified ·
1 Parent(s): 7a36a2b

Update Azure_api.py

Browse files
Files changed (1) hide show
  1. Azure_api.py +97 -42
Azure_api.py CHANGED
@@ -1,53 +1,108 @@
1
- from azure.ai.formrecognizer import DocumentAnalysisClient
 
2
  from azure.core.credentials import AzureKeyCredential
 
3
  import pandas as pd
4
  from io import BytesIO
5
- import io
6
 
7
- # Set your Azure credentials
8
def detect_tables(pdf_source):
    """Extract every table from one PDF into an Excel workbook.

    The PDF is analysed with the Azure Form Recognizer ``prebuilt-layout``
    model. Each detected table becomes its own worksheet (``Table_1``,
    ``Table_2``, ...) with an extra ``page_number`` column.

    Configuration is read from the environment so that no secret lives in
    the source:

    * ``AZURE_FORM_RECOGNIZER_KEY`` (required) - the API key.
    * ``AZURE_FORM_RECOGNIZER_ENDPOINT`` (optional) - overrides the default
      endpoint.

    Args:
        pdf_source: Either the raw PDF content (``bytes``/``bytearray``) or
            a filesystem path to the PDF.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ``.xlsx`` data,
        or ``None`` when no table was found.

    Raises:
        RuntimeError: If ``AZURE_FORM_RECOGNIZER_KEY`` is not set.
        OSError: If ``pdf_source`` is a path that cannot be opened.
    """
    # Function-scope import: ``os`` is not among this module's imports.
    import os

    # SECURITY: the key used to be embedded here as a literal. Any key that
    # was ever committed must be rotated; it is now supplied at runtime.
    key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY")
    if not key:
        raise RuntimeError(
            "Set the AZURE_FORM_RECOGNIZER_KEY environment variable to the "
            "Azure Form Recognizer API key."
        )
    endpoint = os.environ.get(
        "AZURE_FORM_RECOGNIZER_ENDPOINT",
        "https://tableextractiontsa.openai.azure.com",
    )

    # One client is enough; the original built a second, identical one.
    client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    # Accept in-memory bytes as well as a path on disk.
    if isinstance(pdf_source, (bytes, bytearray)):
        stream = io.BytesIO(pdf_source)
    else:
        stream = open(pdf_source, "rb")

    # The ``with`` guarantees the file handle is closed even if the call fails.
    with stream:
        poller = client.begin_analyze_document("prebuilt-layout", document=stream)
        result = poller.result()

    tables = []
    # ``tables`` may be missing on the result, hence the ``or []``.
    for table in result.tables or []:
        cells = table.cells
        if not cells:
            # max() over an empty sequence would raise; nothing to export.
            continue

        n_cols = max(c.column_index for c in cells) + 1
        n_rows = max(c.row_index for c in cells) + 1
        grid = [[""] * n_cols for _ in range(n_rows)]
        for c in cells:
            grid[c.row_index][c.column_index] = c.content

        df = pd.DataFrame(grid)
        # Strip the ":selected:" / ":unselected:" selection-mark tokens that
        # appear in the recognised cell text.
        df = df.replace(r':+(?:selected|unselected):*', '', regex=True)

        regions = table.bounding_regions
        df["page_number"] = regions[0].page_number if regions else None
        tables.append(df)

    if not tables:
        return None

    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        for idx, df in enumerate(tables, start=1):
            df.to_excel(writer, sheet_name=f"Table_{idx}", index=False)
    excel_buffer.seek(0)
    return excel_buffer
53
-
 
1
+ import io
2
+ import os
3
  from azure.core.credentials import AzureKeyCredential
4
+ from azure.ai.formrecognizer import DocumentAnalysisClient
5
  import pandas as pd
6
  from io import BytesIO
 
7
 
8
def _build_layout_client():
    """Create the Form Recognizer client from environment configuration."""
    # SECURITY: the API key used to be embedded here as a literal. Any key
    # that was ever committed must be rotated; it is now supplied at runtime.
    key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY")
    if not key:
        raise RuntimeError(
            "Set the AZURE_FORM_RECOGNIZER_KEY environment variable to the "
            "Azure Form Recognizer API key."
        )
    endpoint = os.environ.get(
        "AZURE_FORM_RECOGNIZER_ENDPOINT",
        "https://tabledetection2.cognitiveservices.azure.com/",
    )
    return DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )


def _layout_table_to_frame(table, table_idx, pdf_name):
    """Convert one recognised table into a DataFrame with metadata columns."""
    cells = table.cells
    n_cols = max(cell.column_index for cell in cells) + 1
    n_rows = max(cell.row_index for cell in cells) + 1
    grid = [[""] * n_cols for _ in range(n_rows)]
    for cell in cells:
        grid[cell.row_index][cell.column_index] = cell.content

    df = pd.DataFrame(grid)
    # Strip the ":selected:" / ":unselected:" selection-mark tokens from the
    # cell text. This runs BEFORE the metadata columns are added so that a
    # file name containing such a token is never altered.
    df = df.replace(r':+(?:selected|unselected):*', '', regex=True)

    regions = table.bounding_regions
    df["page_number"] = regions[0].page_number if regions else None
    df["table_id"] = table_idx
    df["pdf_name"] = pdf_name
    return df


def _write_tables_sheet(frames_by_pdf):
    """Stack all frames in one "Tables" sheet and return the workbook buffer."""
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        current_row = 0
        for frames in frames_by_pdf:
            for df in frames:
                # Only the very first table written gets a header row.
                write_header = current_row == 0
                df.to_excel(
                    writer,
                    sheet_name="Tables",
                    index=False,
                    header=write_header,
                    startrow=current_row,
                )
                current_row += len(df) + (1 if write_header else 0)
            # Leave two blank rows after each PDF's block of tables.
            current_row += 2
    excel_buffer.seek(0)
    return excel_buffer


def detect_tables(pdflist, pdfnames):
    """Extract the tables of several PDFs into one Excel worksheet.

    Each PDF is analysed with the Azure Form Recognizer ``prebuilt-layout``
    model. Every detected table is turned into rows that carry three extra
    columns - ``page_number``, ``table_id`` (1-based index of the table
    within its PDF) and ``pdf_name`` (basename of the supplied name). All
    rows are written to a single sheet called ``"Tables"``, with two blank
    rows after each PDF's block.

    Configuration is read from the environment so that no secret lives in
    the source:

    * ``AZURE_FORM_RECOGNIZER_KEY`` (required) - the API key.
    * ``AZURE_FORM_RECOGNIZER_ENDPOINT`` (optional) - overrides the default
      endpoint.

    Args:
        pdflist: List or tuple of PDF contents (``bytes``/``bytearray``).
        pdfnames: List or tuple of names/paths; ``pdfnames[i]`` belongs to
            ``pdflist[i]``. Must have the same length as ``pdflist``.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ``.xlsx`` data,
        or ``None`` when no table was found (including when there is no
        valid input pair to analyse).

    Raises:
        ValueError: If the arguments are not lists/tuples or differ in length.
        RuntimeError: If there is something to analyse but
            ``AZURE_FORM_RECOGNIZER_KEY`` is not set.
    """
    # 1. Validate inputs
    if not isinstance(pdflist, (list, tuple)) or not isinstance(pdfnames, (list, tuple)):
        raise ValueError("Both pdflist and pdfnames must be lists (or tuples).")
    if len(pdflist) != len(pdfnames):
        raise ValueError("pdflist and pdfnames must have the same length.")

    # 2. Keep only well-formed pairs; anything that is not raw bytes, or
    #    whose name is not a string, is skipped (best-effort, as before).
    jobs = [
        (pdf_bytes, os.path.basename(pdf_path))
        for pdf_bytes, pdf_path in zip(pdflist, pdfnames)
        if isinstance(pdf_bytes, (bytes, bytearray)) and isinstance(pdf_path, str)
    ]
    if not jobs:
        # Nothing to send, so do not require credentials or build a client.
        return None

    client = _build_layout_client()

    # 3. Analyse each PDF and collect its tables
    frames_by_pdf = []
    for pdf_bytes, pdf_name in jobs:
        poller = client.begin_analyze_document(
            "prebuilt-layout", document=BytesIO(pdf_bytes)
        )
        result = poller.result()

        # ``tables`` may be missing on the result, hence the ``or []``.
        # Tables without cells are skipped (max() over nothing would raise),
        # but they still consume an index so ids follow the service's order.
        frames = [
            _layout_table_to_frame(table, table_idx, pdf_name)
            for table_idx, table in enumerate(result.tables or [], start=1)
            if table.cells
        ]
        if frames:
            frames_by_pdf.append(frames)

    if not frames_by_pdf:
        return None

    # 4. Write everything into one sheet
    return _write_tables_sheet(frames_by_pdf)