Spaces:
Sleeping
Sleeping
| from docx import Document | |
| from docx.document import Document as _Document | |
| from docx.table import Table | |
| from docx.text.paragraph import Paragraph | |
| from typing import Union, List, Dict, Any | |
| from PIL import Image | |
| from io import BytesIO | |
| import pytesseract | |
| import os | |
| from zipfile import ZipFile | |
| from lxml import etree | |
| from pathlib import Path | |
| import io | |
| from zipfile import ZipFile | |
| from lxml import etree | |
| from zipfile import ZipFile | |
| from lxml import etree | |
| from zipfile import ZipFile | |
| from lxml import etree | |
| def extract_docx(docx_input) -> str: | |
| zipf = ZipFile(docx_input) | |
| xml_content = zipf.read("word/document.xml") | |
| tree = etree.fromstring(xml_content) | |
| ns = { | |
| "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main", | |
| "a": "http://schemas.openxmlformats.org/drawingml/2006/main", | |
| "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape" | |
| } | |
| text_blocks = [] | |
| # 1. Extract all tables with gridSpan handling (same as before) | |
| tables = tree.xpath("//w:tbl", namespaces=ns) | |
| table_elements = set(tables) # To compare against ancestors | |
| table_index = 0 | |
| for tbl in tables: | |
| rows = tbl.xpath("./w:tr", namespaces=ns) | |
| sub_tables = [] | |
| current_table = [] | |
| prev_col_count = None | |
| for row in rows: | |
| row_texts = [] | |
| cells = row.xpath("./w:tc", namespaces=ns) | |
| col_count = 0 | |
| for cell in cells: | |
| grid_span_el = cell.xpath("./w:tcPr/w:gridSpan", namespaces=ns) | |
| span = int(grid_span_el[0].get(f"{{{ns['w']}}}val")) if grid_span_el else 1 | |
| col_count += span | |
| texts = cell.xpath(".//w:t", namespaces=ns) | |
| cell_text = " ".join(t.text for t in texts if t.text).strip() | |
| row_texts.extend([cell_text] * span) | |
| # Heuristic to split: if row has 1 cell or empty row, or sharp col_count drop | |
| if not any(row_texts) or (prev_col_count and col_count < prev_col_count // 2): | |
| if current_table: | |
| sub_tables.append(current_table) | |
| current_table = [] | |
| prev_col_count = None | |
| continue | |
| current_table.append(row_texts) | |
| prev_col_count = col_count | |
| # Append any remaining rows | |
| if current_table: | |
| sub_tables.append(current_table) | |
| for sub_index, sub_table in enumerate(sub_tables): | |
| table_lines = [] | |
| for row in sub_table: | |
| table_lines.append(", ".join(str(t) for t in row)) | |
| table_csv = f"--- TABLE {table_index} ---\n" + "\n".join(table_lines) | |
| text_blocks.append(table_csv) | |
| table_index += 1 | |
| all_paragraphs = tree.xpath("//w:p", namespaces=ns) | |
| for p in all_paragraphs: | |
| # Check if this paragraph is inside a table by walking up to the root | |
| if not any(ancestor.tag == f"{{{ns['w']}}}tbl" for ancestor in p.iterancestors()): | |
| texts = p.xpath(".//w:t", namespaces=ns) | |
| para_text = "".join(t.text for t in texts if t.text) | |
| if para_text.strip(): | |
| text_blocks.append(para_text.strip()) | |
| # 3. Extract textboxes separately | |
| tb_contents = tree.xpath("//w:txbxContent", namespaces=ns) | |
| for tb in tb_contents: | |
| texts = tb.xpath(".//w:t", namespaces=ns) | |
| tb_text = " ".join(t.text for t in texts if t.text) | |
| if tb_text.strip(): | |
| text_blocks.append(tb_text.strip()) | |
| return "\n\n".join(text_blocks) | |