Spaces:
Sleeping
Sleeping
File size: 2,833 Bytes
8882944 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 |
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as OpenPyXLImage
from typing import List, Dict, Any
from PIL import Image
from io import BytesIO
import pytesseract
import os
from typing import List, Dict
def extract_xlsx_with_meta(xlsx_path: str, tesseract_cmd: str = None) -> List[Dict[str, Any]]:
if tesseract_cmd:
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
wb = load_workbook(xlsx_path, data_only=True)
all_sheets_content = []
for sheet in wb.worksheets:
sheet_data = {
"sheet_name": sheet.title,
"content_blocks": []
}
# Extract table data
for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True):
if all(cell is None for cell in row):
continue # skip completely empty rows
row_data = [str(cell).strip() if cell is not None else "" for cell in row]
content_block = {
"type": "table_row",
"content": ",".join(row_data)
}
sheet_data["content_blocks"].append(content_block)
# Extract images from the sheet
if hasattr(sheet, '_images'):
for img in sheet._images:
try:
if hasattr(img, '_data'): # if it's a real OpenPyXL Image
image_data = img._data()
elif hasattr(img, '_ref'):
continue # cell ref-only images; ignore
else:
continue
pil_img = Image.open(BytesIO(image_data))
ocr_text = pytesseract.image_to_string(pil_img).strip()
content_block = {
"type": "image",
"content": ocr_text if ocr_text else "[No OCR text detected]"
}
except Exception as e:
content_block = {
"type": "image",
"content": f"[OCR failed: {str(e)}]"
}
sheet_data["content_blocks"].append(content_block)
all_sheets_content.append(sheet_data)
return all_sheets_content
def extract_xlsx(filepath: str) -> str:
lines = []
for sheet in extract_xlsx_with_meta(filepath):
lines.append(f"### Sheet: {sheet['sheet_name']}")
for block in sheet['content_blocks']:
if block['type'] == "table_row":
lines.append(f"- {block['content']}")
elif block['type'] == "image":
lines.append(f"[Image OCR Content] {block['content']}")
else:
lines.append(f"[Unknown Content Type] {block['content']}")
lines.append("") # newline between sheets
return "\n".join(lines)
|