Spaces:

Rahul-Samedavar
/

ShastraDocs2

Sleeping

App Files Files Community

ShastraDocs2 / preprocessing /preprocessing_modules /docx_extractor.py

Rahul-Samedavar

made onseshotter faster

8882944 7 months ago

raw

history blame contribute delete

3.58 kB

	from docx import Document
	from docx.document import Document as _Document
	from docx.table import Table
	from docx.text.paragraph import Paragraph
	from typing import Union, List, Dict, Any
	from PIL import Image
	from io import BytesIO
	import pytesseract
	import os

	from zipfile import ZipFile
	from lxml import etree
	from pathlib import Path
	import io
	from zipfile import ZipFile
	from lxml import etree

	from zipfile import ZipFile
	from lxml import etree

	from zipfile import ZipFile
	from lxml import etree

	# XML namespaces used when querying word/document.xml.
	_DOCX_NS = {
	    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
	    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
	    "wps": "http://schemas.microsoft.com/office/word/2010/wordprocessingShape",
	}


	def _grid_span(cell) -> int:
	    """Return how many grid columns a ``w:tc`` cell covers (always >= 1)."""
	    span_els = cell.xpath("./w:tcPr/w:gridSpan", namespaces=_DOCX_NS)
	    if not span_els:
	        return 1
	    raw = span_els[0].get(f"{{{_DOCX_NS['w']}}}val")
	    try:
	        # Clamp to 1 so a zero/negative span can never drop the cell's text.
	        return max(1, int(raw))
	    except (TypeError, ValueError):
	        # Missing or non-numeric w:val: treat the cell as unmerged.
	        return 1


	def _row_cells(row) -> List[str]:
	    """Return a row's cell texts, repeating each text once per spanned column."""
	    values = []
	    for cell in row.xpath("./w:tc", namespaces=_DOCX_NS):
	        runs = cell.xpath(".//w:t", namespaces=_DOCX_NS)
	        cell_text = " ".join(t.text for t in runs if t.text).strip()
	        values.extend([cell_text] * _grid_span(cell))
	    return values


	def _split_table(tbl) -> List[List[List[str]]]:
	    """Split one ``w:tbl`` into sub-tables, each a list of rows of cell texts.

	    A new sub-table starts after a row with no text at all, or when a row has
	    fewer than half (integer division) the columns of the previous kept row.
	    """
	    sub_tables = []
	    current = []
	    prev_cols = None
	    for row in tbl.xpath("./w:tr", namespaces=_DOCX_NS):
	        cells = _row_cells(row)
	        is_blank = not any(cells)
	        sharp_drop = bool(prev_cols) and len(cells) < prev_cols // 2
	        if is_blank or sharp_drop:
	            if current:
	                sub_tables.append(current)
	                current = []
	            prev_cols = None
	            if is_blank:
	                # Blank rows are pure separators and carry no content.
	                continue
	            # A narrower row that still has text opens the next sub-table
	            # instead of being discarded, so its content is not lost.
	        current.append(cells)
	        prev_cols = len(cells)

	    # Keep whatever rows are left after the last split.
	    if current:
	        sub_tables.append(current)
	    return sub_tables


	def extract_docx(docx_input) -> str:
	    """Extract the text of a .docx file as blank-line-separated blocks.

	    The result lists, in this order:

	    1. every table (``w:tbl``), split into sub-tables by ``_split_table`` and
	       rendered as ``--- TABLE <n> ---`` followed by one comma-separated line
	       per row; merged cells repeat their text once per spanned grid column;
	    2. every paragraph (``w:p``) that has no table ancestor;
	    3. the text of every text box (``w:txbxContent``).

	    NOTE: the paragraph pass is not filtered for text boxes, so text-box text
	    outside tables is also matched there and can appear more than once.

	    Args:
	        docx_input: Anything ``zipfile.ZipFile`` accepts: a path or a binary
	            file-like object holding the .docx archive.

	    Returns:
	        The extracted blocks joined with ``"\\n\\n"`` (empty string if none).

	    Raises:
	        zipfile.BadZipFile: If the input is not a valid zip archive.
	        KeyError: If the archive has no ``word/document.xml`` member.
	        lxml.etree.XMLSyntaxError: If the document XML is malformed.
	    """
	    # Close the archive as soon as the document part has been read.
	    with ZipFile(docx_input) as archive:
	        xml_content = archive.read("word/document.xml")

	    # Documents may come from untrusted sources: do not expand entities.
	    parser = etree.XMLParser(resolve_entities=False, no_network=True)
	    tree = etree.fromstring(xml_content, parser)

	    text_blocks = []

	    # 1. Tables, with gridSpan handling; numbering runs across all sub-tables.
	    table_index = 0
	    for tbl in tree.xpath("//w:tbl", namespaces=_DOCX_NS):
	        for sub_table in _split_table(tbl):
	            table_lines = [", ".join(row) for row in sub_table]
	            text_blocks.append(
	                f"--- TABLE {table_index} ---\n" + "\n".join(table_lines)
	            )
	            table_index += 1

	    # 2. Paragraphs that are not inside any table (table text is covered above).
	    for para in tree.xpath("//w:p[not(ancestor::w:tbl)]", namespaces=_DOCX_NS):
	        runs = para.xpath(".//w:t", namespaces=_DOCX_NS)
	        para_text = "".join(t.text for t in runs if t.text).strip()
	        if para_text:
	            text_blocks.append(para_text)

	    # 3. Text boxes, extracted separately.
	    for box in tree.xpath("//w:txbxContent", namespaces=_DOCX_NS):
	        runs = box.xpath(".//w:t", namespaces=_DOCX_NS)
	        box_text = " ".join(t.text for t in runs if t.text).strip()
	        if box_text:
	            text_blocks.append(box_text)

	    return "\n\n".join(text_blocks)