LatestDuplicate_Working

Paused

App Files Files Community

LatestDuplicate_Working / Azure_api.py

Marthee

Update Azure_api.py

86c797e verified 8 months ago

raw

history blame contribute delete

4.25 kB

	import io
	import os
	from azure.core.credentials import AzureKeyCredential
	from azure.ai.formrecognizer import DocumentAnalysisClient
	import pandas as pd
	from io import BytesIO

	def detect_tables(pdflist, pdfnames):
	"""
	- pdflist: a list of PDF bytes (each element is a bytes or bytearray object).
	- pdfnames: a list of strings, where pdfnames[i] is the path or name for pdflist[i].
	Both lists must have the same length.

	The function:
	1. Calls Azure Form Recognizer (prebuilt-layout) on each PDF bytes.
	2. Extracts all tables, adding columns: 'pdf_name', 'table_id', 'page_number'.
	3. Concatenates everything into one Excel worksheet named "Tables", leaving two blank rows between each PDF’s block.
	4. Returns a BytesIO buffer containing the .xlsx. If no tables are found, returns None.
	"""

	# 1. Validate inputs
	if not isinstance(pdflist, (list, tuple)) or not isinstance(pdfnames, (list, tuple)):
	raise ValueError("Both pdflist and pdfnames must be lists (or tuples).")
	if len(pdflist) != len(pdfnames):
	raise ValueError("pdflist and pdfnames must have the same length.")

	# 2. Set up Azure Form Recognizer client
	endpoint = "https://tabledetection2.cognitiveservices.azure.com/"
	key = "5lr94dODMJihbGOMw2Vdz29zXRBiqt528fSGoGmzSJHTrWtHSnRdJQQJ99BEACYeBjFXJ3w3AAALACOGBANH"
	credential = AzureKeyCredential(key)
	client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)

	tables_by_pdf = []

	# 3. Loop over each PDF-bytes / name pair
	for pdf_bytes, pdf_path in zip(pdflist, pdfnames):
	# Skip anything that isn’t raw bytes or whose name isn’t a string
	if not isinstance(pdf_bytes, (bytes, bytearray)) or not isinstance(pdf_path, str):
	continue

	# Extract the filename from the path
	pdf_name = os.path.basename(pdf_path)

	stream = io.BytesIO(pdf_bytes)
	per_pdf_tables = []

	# Call Form Recognizer on this PDF bytes
	poller = client.begin_analyze_document("prebuilt-layout", document=stream)
	result = poller.result()

	# Extract every table as a DataFrame
	for table_idx, table in enumerate(result.tables, start=1):
	# Determine the grid size
	cols = max(cell.column_index for cell in table.cells) + 1
	rows = max(cell.row_index for cell in table.cells) + 1
	grid = [["" for _ in range(cols)] for _ in range(rows)]

	for cell in table.cells:
	grid[cell.row_index][cell.column_index] = cell.content

	df = pd.DataFrame(grid)
	df["page_number"] = table.bounding_regions[0].page_number
	df["table_id"] = table_idx
	df["pdf_name"] = pdf_name
	df = df.replace(r':+(?:selected\|unselected):*', '', regex=True)


	per_pdf_tables.append(df)

	if per_pdf_tables:
	tables_by_pdf.append((pdf_name, per_pdf_tables))

	# If no tables at all, return None
	if not tables_by_pdf:
	return None

	# 4. Write all tables into one sheet, with 2 blank rows between PDFs
	excel_buffer = BytesIO()
	with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
	sheet_name = "Tables"
	current_row = 0
	first_block = True

	for pdf_name, dfs in tables_by_pdf:
	for df in dfs:
	# Only write headers on the very first table in the sheet
	write_header = first_block and (current_row == 0)

	df.to_excel(
	writer,
	sheet_name=sheet_name,
	index=False,
	header=write_header,
	startrow=current_row
	)

	# Advance current_row by the number of rows written:
	# • df.shape[0] data rows
	# • +1 if header was written
	rows_written = df.shape[0] + (1 if write_header else 0)
	current_row += rows_written
	first_block = False

	# After finishing this PDF’s tables, insert two blank rows
	current_row += 2

	excel_buffer.seek(0)
	return excel_buffer