# text-extraction-api/extractors/docx_extractor.py
# krishnachoudhary-hclguvi
# Deploy text extraction API files
# 52a0fe9 (unverified)
"""
DOCX text extraction using python-docx.
Extracts text preserving paragraph structure, tables, and document properties.
"""
import time
import os
from docx import Document
from models.schemas import ExtractionResult, DocumentMetadata
def _heading_prefix(style_name):
    """Return the Markdown-style ``#`` prefix for a heading style name.

    ``"Heading 3"`` yields ``"###"``. A heading style whose suffix is not a
    positive decimal number (for example a bare ``"Heading"``) falls back to
    ``"##"``.
    """
    level = style_name.replace("Heading ", "").strip()
    # isdecimal() only accepts characters int() can parse; isdigit() also
    # accepts e.g. superscript digits, on which int() raises ValueError.
    if level.isdecimal() and int(level) > 0:
        return "#" * int(level)
    return "##"


def _paragraph_lines(paragraphs):
    """Return the non-empty paragraph texts, with headings prefixed by ``#``."""
    lines = []
    for para in paragraphs:
        text = para.text.strip()
        if not text:
            continue
        # Both the style and its name can be missing; treat either case as
        # "not a heading" instead of failing the whole extraction.
        style_name = (para.style.name if para.style else None) or ""
        if style_name.startswith("Heading"):
            lines.append(f"{_heading_prefix(style_name)} {text}")
        else:
            lines.append(text)
    return lines


def _table_blocks(tables):
    """Return one ``[Table N]`` text block per table, rows joined by `` | ``."""
    blocks = []
    for table_idx, table in enumerate(tables):
        # NOTE(review): python-docx appears to report a merged cell once per
        # grid position it spans, so its text may repeat within a row --
        # confirm whether de-duplication is wanted here.
        rows = [
            " | ".join(cell.text.strip() for cell in row.cells)
            for row in table.rows
        ]
        if rows:
            blocks.append(f"\n[Table {table_idx + 1}]\n" + "\n".join(rows))
    return blocks


def extract_docx(file_path: str) -> ExtractionResult:
    """Extract text and metadata from a DOCX file.

    Paragraph text comes first (headings rendered with a ``#`` prefix),
    followed by every table as a ``[Table N]`` block appended after the
    paragraphs.

    Args:
        file_path: Path of the DOCX file to read.

    Returns:
        An ``ExtractionResult``. ``success`` is ``False`` with an
        ``error_message`` when the document contains no text or when any
        exception is raised while reading it; this function does not
        propagate exceptions to the caller.
    """
    # perf_counter() is a monotonic interval clock; time.time() can jump if
    # the system clock is adjusted mid-extraction.
    start_time = time.perf_counter()
    try:
        doc = Document(file_path)

        # Read each collection once; the same lists feed the text and the counts.
        doc_paragraphs = doc.paragraphs
        doc_tables = doc.tables

        paragraphs = _paragraph_lines(doc_paragraphs)
        tables_text = _table_blocks(doc_tables)

        # Combine all text: paragraphs first, then the table blocks.
        full_text = "\n\n".join(paragraphs)
        if tables_text:
            full_text += "\n\n" + "\n".join(tables_text)

        # Extract metadata from core properties.
        props = doc.core_properties
        metadata = DocumentMetadata(
            title=props.title or os.path.basename(file_path),
            author=props.author or "Unknown",
            creation_date=str(props.created) if props.created else "",
            modification_date=str(props.modified) if props.modified else "",
            page_count=None,  # DOCX doesn't expose page count easily
            word_count=len(full_text.split()) if full_text else 0,
            character_count=len(full_text),
            file_type="DOCX",
            extra={
                "category": props.category or "",
                "comments": props.comments or "",
                "last_modified_by": props.last_modified_by or "",
                "revision": props.revision,
                "subject": props.subject or "",
                "keywords": props.keywords or "",
                "paragraph_count": len(doc_paragraphs),
                "table_count": len(doc_tables),
            },
        )

        elapsed = (time.perf_counter() - start_time) * 1000

        if not full_text.strip():
            return ExtractionResult(
                raw_text="",
                metadata=metadata,
                success=False,
                error_message="No text content found in the DOCX file.",
                extraction_time_ms=elapsed,
            )

        return ExtractionResult(
            raw_text=full_text,
            metadata=metadata,
            success=True,
            extraction_time_ms=elapsed,
        )
    except Exception as e:
        # Deliberate catch-all: any failure is reported through the result
        # object rather than raised to the caller.
        elapsed = (time.perf_counter() - start_time) * 1000
        return ExtractionResult(
            raw_text="",
            metadata=DocumentMetadata(file_type="DOCX"),
            success=False,
            error_message=f"DOCX extraction failed: {e}",
            extraction_time_ms=elapsed,
        )