Spaces:
Sleeping
Sleeping
| from openpyxl import load_workbook | |
| from openpyxl.drawing.image import Image as OpenPyXLImage | |
| from typing import List, Dict, Any | |
| from PIL import Image | |
| from io import BytesIO | |
| import pytesseract | |
| import os | |
| import pandas as pd | |
| def extract_xlsx(xlsx_path: str, tesseract_cmd: str = None) -> str: | |
| """Extract data from Excel files including text and images.""" | |
| if tesseract_cmd: | |
| pytesseract.pytesseract.tesseract_cmd = tesseract_cmd | |
| try: | |
| wb = load_workbook(xlsx_path, data_only=True) | |
| except Exception as e: | |
| return f"Error loading Excel file: {str(e)}" | |
| all_sheets_content: list[str] = [] | |
| preview_text: str | None = None | |
| any_data_found = False | |
| for sheet in wb.worksheets: | |
| sheet_content = [f"\n=== Sheet: {sheet.title} ===\n"] | |
| # Extract table data | |
| has_data = False | |
| non_empty_rows = 0 | |
| for row in sheet.iter_rows(max_row=sheet.max_row, values_only=True): | |
| if row is None or all(cell is None for cell in row): | |
| continue # skip completely empty rows | |
| has_data = True | |
| non_empty_rows += 1 | |
| any_data_found = True | |
| row_data = [str(cell).strip() if cell is not None else "" for cell in row] | |
| joined = " | ".join(row_data) | |
| sheet_content.append(joined) | |
| if preview_text is None and joined.strip(): | |
| preview_text = joined[:15] | |
| if not has_data: | |
| sheet_content.append("[No data in this sheet]") | |
| print(f"ℹ️ XLSX: Sheet '{sheet.title}' has no data (openpyxl)") | |
| else: | |
| print(f"🧾 XLSX: Sheet '{sheet.title}' non-empty rows: {non_empty_rows}") | |
| # Extract images from the sheet | |
| if hasattr(sheet, '_images'): | |
| image_count = 0 | |
| for img in sheet._images: | |
| try: | |
| if hasattr(img, '_data'): # if it's a real OpenPyXL Image | |
| image_data = img._data() | |
| elif hasattr(img, '_ref'): | |
| continue # cell ref-only images; ignore | |
| else: | |
| continue | |
| pil_img = Image.open(BytesIO(image_data)) | |
| try: | |
| ocr_text = pytesseract.image_to_string(pil_img).strip() | |
| if ocr_text: | |
| sheet_content.append(f"[Image {image_count + 1} Text]: {ocr_text}") | |
| else: | |
| sheet_content.append(f"[Image {image_count + 1}]: No text detected") | |
| except Exception as ocr_e: | |
| sheet_content.append(f"[Image {image_count + 1}]: OCR failed - {str(ocr_e)}") | |
| image_count += 1 | |
| except Exception as e: | |
| sheet_content.append(f"[Image extraction error: {str(e)}]") | |
| if image_count == 0: | |
| sheet_content.append("[No images found in this sheet]") | |
| all_sheets_content.append("\n".join(sheet_content)) | |
| # If no data found using openpyxl, try pandas fallback (handles some edge cases better) | |
| if not any_data_found: | |
| print("ℹ️ XLSX: No data via openpyxl, trying pandas fallback…") | |
| try: | |
| xls = pd.ExcelFile(xlsx_path, engine="openpyxl") | |
| pandas_parts = [] | |
| extracted_sheets = 0 | |
| for sheet_name in xls.sheet_names: | |
| df = pd.read_excel(xls, sheet_name=sheet_name, dtype=str) | |
| if not df.empty: | |
| any_data_found = True | |
| header = f"\n=== Sheet: {sheet_name} ===\n" | |
| csv_like = df.fillna("").astype(str).to_csv(index=False) | |
| pandas_parts.append(header + csv_like) | |
| extracted_sheets += 1 | |
| if preview_text is None: | |
| flat = "".join(csv_like.splitlines()) | |
| if flat: | |
| preview_text = flat[:15] | |
| else: | |
| pandas_parts.append(f"\n=== Sheet: {sheet_name} ===\n[No data in this sheet]") | |
| if pandas_parts: | |
| all_sheets_content = pandas_parts | |
| print(f"✅ XLSX: Pandas fallback extracted {extracted_sheets} non-empty sheet(s)") | |
| except Exception as pe: | |
| # If pandas also fails, keep whatever we had | |
| all_sheets_content.append(f"[Pandas fallback failed: {str(pe)}]") | |
| print(f"❌ XLSX: Pandas fallback failed: {pe}") | |
| combined = "\n\n".join(all_sheets_content) | |
| # Print a small preview for verification | |
| if preview_text is None: | |
| # fallback: take from combined text | |
| flat_combined = "".join(combined.splitlines()).strip() | |
| if flat_combined: | |
| preview_text = flat_combined[:15] | |
| if preview_text: | |
| print(f"🔎 XLSX content preview: {preview_text}") | |
| return combined | |