from docx import Document from docx.document import Document as _Document from docx.table import Table from docx.text.paragraph import Paragraph from typing import Union, List, Dict, Any from PIL import Image from io import BytesIO import pytesseract import os from zipfile import ZipFile from lxml import etree from pathlib import Path import io def extract_docx(docx_input) -> str: """Extract text from DOCX files with table and text handling.""" zipf = ZipFile(docx_input) xml_content = zipf.read("word/document.xml") tree = etree.fromstring(xml_content) ns = { "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", "a": "http://schemas.openxmlformats.org/drawingml/2006/main", "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape" } text_blocks = [] # Extract all tables with gridSpan handling tables = tree.xpath("//w:tbl", namespaces=ns) table_elements = set(tables) table_index = 0 for tbl in tables: rows = tbl.xpath("./w:tr", namespaces=ns) sub_tables = [] current_table = [] prev_col_count = None for row in rows: row_texts = [] cells = row.xpath("./w:tc", namespaces=ns) col_count = 0 for cell in cells: cell_text = "" paragraphs = cell.xpath(".//w:p", namespaces=ns) for para in paragraphs: text_nodes = para.xpath(".//w:t", namespaces=ns) para_text = "".join(node.text for node in text_nodes if node.text) if para_text.strip(): cell_text += para_text + " " # Handle gridSpan (merged cells) gridspan_elem = cell.xpath(".//w:gridSpan", namespaces=ns) span = int(gridspan_elem[0].get(ns["w"] + "val", "1")) if gridspan_elem else 1 row_texts.append(cell_text.strip()) col_count += span if row_texts and any(text.strip() for text in row_texts): if prev_col_count is not None and col_count != prev_col_count: # Column count changed, save current table and start new one if current_table: sub_tables.append(current_table) current_table = [] current_table.append(row_texts) prev_col_count = col_count if current_table: sub_tables.append(current_table) # Format tables for sub_table in sub_tables: table_text = f"\\n--- Table {table_index + 1} ---\\n" for row in sub_table: table_text += " | ".join(row) + "\\n" text_blocks.append(table_text) table_index += 1 # Extract non-table paragraphs paragraphs = tree.xpath("//w:p", namespaces=ns) for para in paragraphs: # Check if paragraph is inside a table is_in_table = any(table in para.xpath("ancestor::*") for table in table_elements) if not is_in_table: text_nodes = para.xpath(".//w:t", namespaces=ns) para_text = "".join(node.text for node in text_nodes if node.text) if para_text.strip(): text_blocks.append(para_text.strip()) return "\\n\\n".join(text_blocks)