Spaces:

Rahul-Samedavar
/

ShastraDocs2

Sleeping

File size: 2,833 Bytes
from openpyxl import load_workbook
from openpyxl.drawing.image import Image as OpenPyXLImage
from typing import List, Dict, Any
from PIL import Image
from io import BytesIO
import pytesseract
import os

from typing import List, Dict

def extract_xlsx_with_meta(xlsx_path: str, tesseract_cmd: str = None) -> List[Dict[str, Any]]:
    if tesseract_cmd:
        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    wb = load_workbook(xlsx_path, data_only=True)
    all_sheets_content = []

    for sheet in wb.worksheets:
        sheet_data = {
            "sheet_name": sheet.title,
            "content_blocks": []
        }

        # Extract table data
        for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True):
            if all(cell is None for cell in row):
                continue  # skip completely empty rows
            row_data = [str(cell).strip() if cell is not None else "" for cell in row]
            content_block = {
                "type": "table_row",
                "content": ",".join(row_data)
            }
            sheet_data["content_blocks"].append(content_block)

        # Extract images from the sheet
        if hasattr(sheet, '_images'):
            for img in sheet._images:
                try:
                    if hasattr(img, '_data'):  # if it's a real OpenPyXL Image
                        image_data = img._data()
                    elif hasattr(img, '_ref'):
                        continue  # cell ref-only images; ignore
                    else:
                        continue

                    pil_img = Image.open(BytesIO(image_data))
                    ocr_text = pytesseract.image_to_string(pil_img).strip()

                    content_block = {
                        "type": "image",
                        "content": ocr_text if ocr_text else "[No OCR text detected]"
                    }
                except Exception as e:
                    content_block = {
                        "type": "image",
                        "content": f"[OCR failed: {str(e)}]"
                    }

                sheet_data["content_blocks"].append(content_block)

        all_sheets_content.append(sheet_data)

    return all_sheets_content


def extract_xlsx(filepath: str) -> str:
    lines = []

    for sheet in extract_xlsx_with_meta(filepath):
        lines.append(f"### Sheet: {sheet['sheet_name']}")
        for block in sheet['content_blocks']:
            if block['type'] == "table_row":
                lines.append(f"- {block['content']}")
            elif block['type'] == "image":
                lines.append(f"[Image OCR Content] {block['content']}")
            else:
                lines.append(f"[Unknown Content Type] {block['content']}")
        lines.append("")  # newline between sheets

    return "\n".join(lines)