Spaces:
Sleeping
Sleeping
| from azure.ai.formrecognizer import DocumentAnalysisClient | |
| from azure.core.credentials import AzureKeyCredential | |
| import pandas as pd | |
| from io import BytesIO | |
| import io | |
| # Set your Azure credentials | |
def detect_tables(pdf_source):
    """Extract every table from a PDF into an in-memory Excel workbook.

    The document is sent to Azure Form Recognizer's ``prebuilt-layout``
    model. Each detected table becomes one worksheet named ``Table_<n>``
    (1-based), with an extra ``page_number`` column taken from the table's
    first bounding region.

    Configuration is read from the environment:

    * ``AZURE_FORM_RECOGNIZER_KEY`` (required) -- the API key.
    * ``AZURE_FORM_RECOGNIZER_ENDPOINT`` (optional) -- overrides the
      default endpoint below.

    Args:
        pdf_source: Raw PDF content as ``bytes``/``bytearray``, or a path
            that ``open(..., "rb")`` accepts.

    Returns:
        A ``BytesIO`` positioned at offset 0 that holds the ``.xlsx``
        workbook, or ``None`` when no non-empty table was found.

    Raises:
        RuntimeError: If ``AZURE_FORM_RECOGNIZER_KEY`` is not set.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list being edited.
    import os

    # SECURITY: the API key used to be hard-coded here. It must come from
    # the environment (or a secret store); the previously committed key
    # should be treated as compromised and rotated.
    endpoint = os.environ.get(
        "AZURE_FORM_RECOGNIZER_ENDPOINT",
        "https://tableextractiontsa.openai.azure.com",
    )
    key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY")
    if not key:
        # Fail before opening any file or network connection.
        raise RuntimeError(
            "AZURE_FORM_RECOGNIZER_KEY environment variable is not set"
        )

    # One client is enough; the original built it twice.
    client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
    )

    # 1. Load the PDF: in-memory bytes are wrapped, anything else is
    #    treated as a filesystem path.
    if isinstance(pdf_source, (bytes, bytearray)):
        stream = io.BytesIO(pdf_source)
    else:
        stream = open(pdf_source, "rb")

    # 2. Call Azure Form Recognizer; `with` closes the stream even if the
    #    request fails.
    with stream:
        poller = client.begin_analyze_document(
            "prebuilt-layout",
            document=stream,
        )
        result = poller.result()

    # 3. Build one DataFrame per table.
    tables = []
    for table in result.tables or []:
        cells = table.cells
        if not cells:
            # max() over an empty sequence would raise ValueError.
            continue

        n_cols = max(cell.column_index for cell in cells) + 1
        n_rows = max(cell.row_index for cell in cells) + 1

        # Pre-fill with empty strings so positions without a cell stay blank.
        grid = [[""] * n_cols for _ in range(n_rows)]
        for cell in cells:
            grid[cell.row_index][cell.column_index] = cell.content

        df = pd.DataFrame(grid)
        regions = table.bounding_regions
        # A table without bounding regions gets a null page number
        # instead of raising IndexError/TypeError.
        df["page_number"] = regions[0].page_number if regions else None
        tables.append(df)

    if not tables:
        return None

    # 4. Write all sheets into a BytesIO.
    excel_buffer = BytesIO()
    with pd.ExcelWriter(excel_buffer, engine="openpyxl") as writer:
        for idx, df in enumerate(tables, start=1):
            df.to_excel(writer, sheet_name=f"Table_{idx}", index=False)

    # Rewind so callers can read the workbook from the start.
    excel_buffer.seek(0)
    return excel_buffer