# Rahul-Samedavar's picture
# made oneshotter faster
# 8882944
from docx import Document
from docx.document import Document as _Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from typing import Union, List, Dict, Any
from PIL import Image
from io import BytesIO
import pytesseract
import os
from zipfile import ZipFile
from lxml import etree
from pathlib import Path
import io
from zipfile import ZipFile
from lxml import etree
from zipfile import ZipFile
from lxml import etree
from zipfile import ZipFile
from lxml import etree
def extract_docx(docx_input) -> str:
    """Extract tables, body paragraphs and text boxes from a .docx file.

    The archive's ``word/document.xml`` part is parsed directly with lxml.
    The output is built in three passes, in this order:

    1. Every ``w:tbl`` element. Each table is split into one or more
       sub-tables (see the heuristic below); every sub-table is emitted as a
       ``--- TABLE <n> ---`` header followed by one line per row, with cell
       texts joined by ``", "``. ``<n>`` counts emitted sub-tables from 0.
    2. Every ``w:p`` paragraph that has no ``w:tbl`` ancestor.
    3. Every ``w:txbxContent`` (text box) element.

    Args:
        docx_input: Whatever ``zipfile.ZipFile`` accepts as its first
            argument (a path or a binary file-like object).

    Returns:
        All non-empty blocks joined by blank lines (``"\\n\\n"``); an empty
        string when the document yields no text.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # Use a context manager so the archive handle is released as soon as the
    # main document part has been read.
    with ZipFile(docx_input) as zipf:
        xml_content = zipf.read("word/document.xml")
    tree = etree.fromstring(xml_content)

    ns = {
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
    }
    # Fully-qualified name of the w:val attribute, built once for all cells.
    val_attr = f"{{{ns['w']}}}val"
    text_blocks = []

    # 1. Extract all tables with gridSpan handling
    table_index = 0
    for tbl in tree.xpath("//w:tbl", namespaces=ns):
        sub_tables = []
        current_table = []
        prev_col_count = None
        for row in tbl.xpath("./w:tr", namespaces=ns):
            row_texts = []
            col_count = 0
            for cell in row.xpath("./w:tc", namespaces=ns):
                grid_span_el = cell.xpath("./w:tcPr/w:gridSpan", namespaces=ns)
                # A gridSpan element without a w:val attribute counts as a
                # single column instead of crashing on int(None).
                span = int(grid_span_el[0].get(val_attr, 1)) if grid_span_el else 1
                col_count += span
                texts = cell.xpath(".//w:t", namespaces=ns)
                cell_text = " ".join(t.text for t in texts if t.text).strip()
                # Repeat the text of a merged cell once per spanned column so
                # that columns stay aligned across rows.
                row_texts.extend([cell_text] * span)

            # Heuristic to split: a row with no text at all, or a row whose
            # column count drops below half (integer division) of the
            # previous kept row's, ends the current sub-table. The row that
            # triggers the split is itself not emitted.
            if not any(row_texts) or (prev_col_count and col_count < prev_col_count // 2):
                if current_table:
                    sub_tables.append(current_table)
                    current_table = []
                prev_col_count = None
                continue

            current_table.append(row_texts)
            prev_col_count = col_count

        # Append any remaining rows
        if current_table:
            sub_tables.append(current_table)

        for sub_table in sub_tables:
            table_lines = [", ".join(row) for row in sub_table]
            table_csv = f"--- TABLE {table_index} ---\n" + "\n".join(table_lines)
            text_blocks.append(table_csv)
            table_index += 1

    # 2. Extract paragraphs that are not inside any table. The XPath
    # predicate performs the ancestor check in a single query.
    # NOTE(review): paragraphs nested in w:txbxContent are not excluded
    # here, so text-box text presumably shows up both in this pass and in
    # pass 3 below - confirm whether that duplication is intended.
    for p in tree.xpath("//w:p[not(ancestor::w:tbl)]", namespaces=ns):
        texts = p.xpath(".//w:t", namespaces=ns)
        # Runs are concatenated without a separator (unlike cells and text
        # boxes, which use a space).
        para_text = "".join(t.text for t in texts if t.text).strip()
        if para_text:
            text_blocks.append(para_text)

    # 3. Extract textboxes separately
    for tb in tree.xpath("//w:txbxContent", namespaces=ns):
        texts = tb.xpath(".//w:t", namespaces=ns)
        tb_text = " ".join(t.text for t in texts if t.text).strip()
        if tb_text:
            text_blocks.append(tb_text)

    return "\n\n".join(text_blocks)