File size: 3,575 Bytes
8882944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from docx import Document
from docx.document import Document as _Document
from docx.table import Table
from docx.text.paragraph import Paragraph
from typing import Union, List, Dict, Any
from PIL import Image
from io import BytesIO
import pytesseract
import os

from zipfile import ZipFile
from lxml import etree
from pathlib import Path
import io
from zipfile import ZipFile
from lxml import etree

from zipfile import ZipFile
from lxml import etree

from zipfile import ZipFile
from lxml import etree

def extract_docx(docx_input) -> str:
    """Extract table, paragraph and text-box text from a .docx archive.

    The main document part (``word/document.xml``) is parsed and three kinds
    of content are collected, in this order (not document order):

    1. Every ``w:tbl`` element, rendered as comma-separated rows under a
       ``--- TABLE n ---`` header. Cells carrying a ``w:gridSpan`` are
       repeated once per spanned column so rows line up. A table is split
       into several numbered sub-tables at empty rows or at rows whose
       column count drops below half of the previous row's.
    2. Every ``w:p`` paragraph that has no ``w:tbl`` ancestor.
    3. The text of every ``w:txbxContent`` element.

    Args:
        docx_input: Anything ``zipfile.ZipFile`` accepts as its first
            argument (it is passed through unchanged).

    Returns:
        All non-empty blocks joined by blank lines.
    """
    # Read the only part we need and release the archive handle right away.
    with ZipFile(docx_input) as zipf:
        xml_content = zipf.read("word/document.xml")
    tree = etree.fromstring(xml_content)

    ns = {
        "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
    }
    # Fully-qualified names used for attribute lookup / tag comparison.
    w_val_attr = f"{{{ns['w']}}}val"
    w_tbl_tag = f"{{{ns['w']}}}tbl"

    text_blocks = []

    # 1. Extract all tables, expanding gridSpan cells to one entry per column.
    table_index = 0
    for tbl in tree.xpath("//w:tbl", namespaces=ns):
        sub_tables = []
        current_table = []
        prev_col_count = None

        for row in tbl.xpath("./w:tr", namespaces=ns):
            row_texts = []
            col_count = 0
            for cell in row.xpath("./w:tc", namespaces=ns):
                grid_span_el = cell.xpath("./w:tcPr/w:gridSpan", namespaces=ns)
                span = 1
                if grid_span_el:
                    # A gridSpan without a usable w:val is treated as a
                    # normal single-column cell instead of aborting.
                    try:
                        span = int(grid_span_el[0].get(w_val_attr))
                    except (TypeError, ValueError):
                        span = 1
                col_count += span

                texts = cell.xpath(".//w:t", namespaces=ns)
                cell_text = " ".join(t.text for t in texts if t.text).strip()
                row_texts.extend([cell_text] * span)

            # Split heuristic: an entirely empty row, or a row whose column
            # count falls below half of the previous row's, ends the current
            # sub-table.
            # NOTE(review): the separator row itself is skipped, so any text
            # in a "sharp drop" row is not emitted - confirm this is intended.
            is_empty_row = not any(row_texts)
            is_sharp_drop = prev_col_count and col_count < prev_col_count // 2
            if is_empty_row or is_sharp_drop:
                if current_table:
                    sub_tables.append(current_table)
                    current_table = []
                prev_col_count = None
                continue

            current_table.append(row_texts)
            prev_col_count = col_count

        # Append any remaining rows
        if current_table:
            sub_tables.append(current_table)

        # Sub-tables are numbered globally across the whole document.
        for sub_table in sub_tables:
            table_lines = [", ".join(str(t) for t in row) for row in sub_table]
            table_csv = f"--- TABLE {table_index} ---\n" + "\n".join(table_lines)
            text_blocks.append(table_csv)
            table_index += 1

    # 2. Extract paragraphs that are not inside any table.
    for p in tree.xpath("//w:p", namespaces=ns):
        # Walk up to the root; skip paragraphs already covered by a table.
        if any(ancestor.tag == w_tbl_tag for ancestor in p.iterancestors()):
            continue
        texts = p.xpath(".//w:t", namespaces=ns)
        para_text = "".join(t.text for t in texts if t.text)
        if para_text.strip():
            text_blocks.append(para_text.strip())

    # 3. Extract textboxes separately
    # NOTE(review): paragraphs inside a text box are also matched by the
    # paragraph pass above (it only filters on table ancestors), so text-box
    # text can appear more than once in the output.
    for tb in tree.xpath("//w:txbxContent", namespaces=ns):
        texts = tb.xpath(".//w:t", namespaces=ns)
        tb_text = " ".join(t.text for t in texts if t.text)
        if tb_text.strip():
            text_blocks.append(tb_text.strip())

    return "\n\n".join(text_blocks)