Spaces:
Sleeping
Sleeping
| from docx import Document | |
| from docx.document import Document as _Document | |
| from docx.table import Table | |
| from docx.text.paragraph import Paragraph | |
| from typing import Union, List, Dict, Any | |
| from PIL import Image | |
| from io import BytesIO | |
| import pytesseract | |
| import os | |
| from zipfile import ZipFile | |
| from lxml import etree | |
| from pathlib import Path | |
| import io | |
def _paragraph_text(para, ns) -> str:
    """Return the concatenated text of every ``w:t`` run under *para*."""
    return "".join(
        node.text for node in para.xpath(".//w:t", namespaces=ns) if node.text
    )


def _cell_grid_span(cell, ns) -> int:
    """Return how many grid columns *cell* occupies (1 when not merged)."""
    # Only look at the cell's own properties so that a gridSpan belonging to
    # a table nested inside this cell is not mistaken for this cell's span.
    span_elems = cell.xpath("./w:tcPr/w:gridSpan", namespaces=ns)
    if not span_elems:
        return 1
    # lxml addresses namespaced attributes in Clark notation: "{uri}local".
    val_attr = "{" + ns["w"] + "}val"
    try:
        return int(span_elems[0].get(val_attr, "1"))
    except ValueError:
        # A malformed span value should not abort the whole extraction.
        return 1


def _split_table_rows(tbl, ns) -> list:
    """Group the non-empty rows of *tbl* into runs sharing one grid width.

    Each row becomes a list of stripped cell strings. A new group starts
    whenever the effective column count (sum of the cells' grid spans)
    differs from that of the previous kept row.
    """
    groups = []
    current = []
    prev_width = None
    for row in tbl.xpath("./w:tr", namespaces=ns):
        row_texts = []
        width = 0
        for cell in row.xpath("./w:tc", namespaces=ns):
            # NOTE: ".//w:p" also reaches paragraphs of tables nested in
            # this cell, so their text appears in the cell as well.
            para_texts = (
                _paragraph_text(para, ns)
                for para in cell.xpath(".//w:p", namespaces=ns)
            )
            cell_text = " ".join(text for text in para_texts if text.strip())
            row_texts.append(cell_text.strip())
            width += _cell_grid_span(cell, ns)

        # Rows whose cells are all blank are dropped and do not influence
        # the column-count tracking.
        if not any(row_texts):
            continue
        if prev_width is not None and width != prev_width and current:
            groups.append(current)
            current = []
        current.append(row_texts)
        prev_width = width

    if current:
        groups.append(current)
    return groups


def extract_docx(docx_input) -> str:
    """Extract text from a DOCX file: all tables first, then body paragraphs.

    Args:
        docx_input: Anything ``zipfile.ZipFile`` accepts, i.e. a path or a
            binary file-like object holding the DOCX archive. A file-like
            object passed in is not closed by this function.

    Returns:
        The extracted blocks joined by blank lines. Every table is rendered
        as a ``--- Table N ---`` header followed by one ``cell | cell`` line
        per non-empty row; a table whose effective column count changes
        part-way through is emitted as several consecutively numbered
        tables. Paragraphs that are not inside any table follow, in
        document order.

    Raises:
        zipfile.BadZipFile: If *docx_input* is not a valid ZIP archive.
        KeyError: If the archive contains no ``word/document.xml``.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    # Read the main document part and release the archive handle right away.
    with ZipFile(docx_input) as zipf:
        xml_content = zipf.read("word/document.xml")
    tree = etree.fromstring(xml_content)

    text_blocks = []

    # Tables (numbered across the whole document, including split parts).
    table_number = 0
    for tbl in tree.xpath("//w:tbl", namespaces=ns):
        for sub_table in _split_table_rows(tbl, ns):
            table_number += 1
            lines = [f"\n--- Table {table_number} ---"]
            lines.extend(" | ".join(row) for row in sub_table)
            text_blocks.append("\n".join(lines) + "\n")

    # Paragraphs that have no table ancestor; the predicate lets the XPath
    # engine do the filtering instead of scanning ancestors per paragraph.
    for para in tree.xpath("//w:p[not(ancestor::w:tbl)]", namespaces=ns):
        para_text = _paragraph_text(para, ns).strip()
        if para_text:
            text_blocks.append(para_text)

    return "\n\n".join(text_blocks)