Spaces:

Krish-05
/

text-extraction-api

Sleeping

text-extraction-api / extractors /docx_extractor.py

krishnachoudhary-hclguvi

Deploy text extraction API files

52a0fe9 unverified about 2 months ago

3.43 kB

	"""
	DOCX text extraction using python-docx.
	Extracts text preserving paragraph structure, tables, and document properties.
	"""
	import time
	import os
	from docx import Document
	from models.schemas import ExtractionResult, DocumentMetadata


	def _heading_prefix(style_name: str) -> str:
	    """Return the ``#`` prefix for a "Heading N" style name.

	    The text left after removing "Heading " is treated as the level.
	    Falls back to ``##`` when that text is not a positive integer
	    (e.g. plain "Heading" or "Heading 0").
	    """
	    level = style_name.replace("Heading ", "").strip()
	    # isdecimal() only accepts characters that int() can parse, whereas
	    # isdigit() also accepts e.g. superscript digits that make int() raise.
	    if level.isdecimal() and int(level) > 0:
	        return "#" * int(level)
	    return "##"


	def _collect_paragraphs(doc) -> list:
	    """Return the stripped, non-empty paragraph texts of *doc*.

	    Paragraphs whose style name starts with "Heading" are prefixed with
	    ``#`` markers so the heading structure survives in plain text.
	    """
	    lines = []
	    for para in doc.paragraphs:
	        text = para.text.strip()
	        if not text:
	            continue
	        # Guard against a missing style or a style without a name so one
	        # odd paragraph does not fail the whole extraction.
	        style_name = (para.style.name if para.style else None) or ""
	        if style_name.startswith("Heading"):
	            lines.append(f"{_heading_prefix(style_name)} {text}")
	        else:
	            lines.append(text)
	    return lines


	def _collect_tables(doc) -> list:
	    """Return one text section per non-empty table in ``doc.tables``.

	    Each section is a "[Table N]" header (N is the 1-based position in
	    ``doc.tables``) followed by one line per row, cells joined by " | ".
	    """
	    sections = []
	    for number, table in enumerate(doc.tables, start=1):
	        rows = [
	            " | ".join(cell.text.strip() for cell in row.cells)
	            for row in table.rows
	        ]
	        if rows:
	            sections.append(f"\n[Table {number}]\n" + "\n".join(rows))
	    return sections


	def _build_metadata(doc, file_path: str, full_text: str) -> DocumentMetadata:
	    """Build the DocumentMetadata for *doc* from its core properties.

	    Word and character counts are computed from *full_text*; the title
	    falls back to the file's base name and the author to "Unknown".
	    """
	    props = doc.core_properties
	    return DocumentMetadata(
	        title=props.title or os.path.basename(file_path),
	        author=props.author or "Unknown",
	        creation_date=str(props.created) if props.created else "",
	        modification_date=str(props.modified) if props.modified else "",
	        page_count=None,  # DOCX doesn't expose page count easily
	        word_count=len(full_text.split()),
	        character_count=len(full_text),
	        file_type="DOCX",
	        extra={
	            "category": props.category or "",
	            "comments": props.comments or "",
	            "last_modified_by": props.last_modified_by or "",
	            "revision": props.revision,
	            "subject": props.subject or "",
	            "keywords": props.keywords or "",
	            "paragraph_count": len(doc.paragraphs),
	            "table_count": len(doc.tables),
	        },
	    )


	def extract_docx(file_path: str) -> ExtractionResult:
	    """Extract text and metadata from a DOCX file.

	    The resulting text is all non-empty paragraphs (headings prefixed
	    with ``#`` markers) joined by blank lines, followed by every table
	    rendered as a "[Table N]" section with " | "-separated cells.

	    Args:
	        file_path: Path of the DOCX file, passed to ``Document``.

	    Returns:
	        An ExtractionResult. ``success`` is False, with ``raw_text`` empty
	        and ``error_message`` set, when the document contains no text or
	        when any exception is raised during extraction.
	        ``extraction_time_ms`` holds the elapsed time in milliseconds.
	    """
	    # perf_counter() is monotonic, so the measured duration cannot be
	    # skewed by system clock adjustments.
	    start_time = time.perf_counter()

	    try:
	        doc = Document(file_path)

	        paragraphs = _collect_paragraphs(doc)
	        tables_text = _collect_tables(doc)

	        # Tables are appended after the body text, not interleaved with it.
	        full_text = "\n\n".join(paragraphs)
	        if tables_text:
	            full_text += "\n\n" + "\n".join(tables_text)

	        metadata = _build_metadata(doc, file_path, full_text)
	        elapsed = (time.perf_counter() - start_time) * 1000

	        if not full_text.strip():
	            return ExtractionResult(
	                raw_text="",
	                metadata=metadata,
	                success=False,
	                error_message="No text content found in the DOCX file.",
	                extraction_time_ms=elapsed,
	            )

	        return ExtractionResult(
	            raw_text=full_text,
	            metadata=metadata,
	            success=True,
	            extraction_time_ms=elapsed,
	        )

	    except Exception as e:
	        # Deliberate catch-all: failures are reported through the result
	        # object rather than raised to the caller.
	        elapsed = (time.perf_counter() - start_time) * 1000
	        return ExtractionResult(
	            raw_text="",
	            metadata=DocumentMetadata(file_type="DOCX"),
	            success=False,
	            error_message=f"DOCX extraction failed: {str(e)}",
	            extraction_time_ms=elapsed,
	        )