"""
DOCX Format Handler

Extracts text and structure from Word documents using python-docx and mammoth.
Supports paragraph-level coordinate mapping for span annotations.

Usage:
    from potato.format_handlers.docx_handler import DocxHandler

    handler = DocxHandler()
    output = handler.extract("document.docx", {
        "preserve_styles": True,
        "include_headers": True,
    })
"""

from typing import Dict, List, Any, Optional, Tuple
from pathlib import Path
import html
import logging
import uuid

from .base import BaseFormatHandler, FormatOutput
from .coordinate_mapping import CoordinateMapper, DocumentCoordinate

logger = logging.getLogger(__name__)

# Check if dependencies are available
try:
    import docx
    from docx.document import Document
    from docx.oxml.ns import qn
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False
    docx = None

try:
    import mammoth
    MAMMOTH_AVAILABLE = True
except ImportError:
    MAMMOTH_AVAILABLE = False
    mammoth = None


class DocxHandler(BaseFormatHandler):
    """
    Handler for Word documents (.docx).

    Uses python-docx for text extraction with structure preservation
    and mammoth for HTML conversion.
    """

    format_name = "docx"
    supported_extensions = [".docx"]
    description = "Word document extraction with paragraph/section mapping"
    requires_dependencies = ["python-docx", "mammoth"]

    def get_default_options(self) -> Dict[str, Any]:
        """Get default extraction options."""
        return {
            "preserve_styles": True,
            "include_headers": True,
            "include_footers": False,
            "include_tables": True,
            "paragraph_separator": "\n\n",
            "use_mammoth_html": True,  # Use mammoth for rich HTML conversion
        }

    def extract(
        self,
        file_path: str,
        options: Optional[Dict[str, Any]] = None
    ) -> FormatOutput:
        """
        Extract text and structure from a Word document.

        The plain text is assembled in this order: section headers (if
        requested), non-empty body paragraphs, tables (if requested), then
        section footers (if requested). Every emitted piece is registered
        with the coordinate mapper using offsets into that plain text.

        Args:
            file_path: Path to the .docx file
            options: Extraction options:
                - preserve_styles: Keep heading levels and formatting
                - include_headers: Include document headers
                - include_footers: Include document footers
                - include_tables: Extract table content
                - paragraph_separator: Text placed after each body paragraph
                - use_mammoth_html: Prefer mammoth's HTML when it is installed

        Returns:
            FormatOutput with extracted text, HTML, and coordinate mappings

        Raises:
            ImportError: If python-docx is not installed.
        """
        if not DOCX_AVAILABLE:
            raise ImportError(
                "python-docx is required for DOCX extraction. "
                "Install with: pip install python-docx"
            )

        opts = self.merge_options(options)
        mapper = CoordinateMapper()

        # Open document
        doc = docx.Document(file_path)

        # Extract text with structure
        text_parts: List[str] = []
        html_parts: List[str] = []
        current_offset = 0
        metadata = {
            "format": "docx",
            "source_file": str(file_path),
            "paragraphs": [],
            "sections": [],
            "tables": [],
        }

        # Check if mammoth is available for rich HTML
        if opts.get("use_mammoth_html") and MAMMOTH_AVAILABLE:
            rendered_html = self._extract_with_mammoth(file_path, opts)
        else:
            rendered_html = None

        # NOTE(review): container class name follows the file's "docx-*"
        # convention; confirm against the consuming stylesheet.
        html_parts.append('<div class="docx-content">')

        # Extract headers if requested
        if opts.get("include_headers"):
            current_offset = self._append_margin_text(
                doc, "header", text_parts, html_parts,
                mapper, metadata, current_offset,
            )

        # Extract main body
        separator = opts["paragraph_separator"]
        current_section = None
        for i, para in enumerate(doc.paragraphs):
            para_text = para.text
            if not para_text.strip():
                continue

            para_id = f"p_{i}_{uuid.uuid4().hex[:8]}"

            # Detect heading level; a heading also starts a new logical section
            heading_level = self._heading_level(para)
            if heading_level:
                current_section = para_text.strip()

            # Build text
            start_offset = current_offset
            end_offset = start_offset + len(para_text)
            text_parts.append(para_text)

            # Add paragraph separator (not part of the mapped span)
            text_parts.append(separator)
            current_offset = end_offset + len(separator)

            # Build HTML
            if heading_level:
                css_class = f"docx-heading docx-h{heading_level}"
                # HTML only defines h1..h6; deeper Word headings clamp to h6
                html_tag = f"h{min(heading_level, 6)}"
            else:
                css_class = "docx-paragraph"
                html_tag = "p"

            html_parts.append(
                f'<{html_tag} class="{css_class}" '
                f'data-para-id="{para_id}" '
                f'data-start="{start_offset}" '
                f'data-end="{end_offset}">'
                f'{html.escape(para_text)}'
                f'</{html_tag}>'
            )

            # Add coordinate mapping
            mapper.add_mapping(
                start_offset,
                end_offset,
                DocumentCoordinate(
                    paragraph_id=para_id,
                    local_offset=0,
                    section=current_section,
                    heading_level=heading_level,
                )
            )

            # Track paragraph metadata
            metadata["paragraphs"].append({
                "id": para_id,
                "start": start_offset,
                "end": end_offset,
                "heading_level": heading_level,
                "section": current_section,
                "char_count": len(para_text),
            })

        # Extract tables if requested. Tables are read from doc.tables after
        # all paragraphs, so they follow the body text rather than appearing
        # at their position in the document.
        if opts.get("include_tables"):
            for t_idx, table in enumerate(doc.tables):
                table_id = f"table_{t_idx}_{uuid.uuid4().hex[:8]}"

                # A "\n" is emitted before the table text, so the table's
                # first character sits one past the running offset.
                table_start = current_offset + 1
                table_text, table_html = self._extract_table(
                    table, table_id, table_start
                )
                if not table_text:
                    continue

                table_end = table_start + len(table_text)
                text_parts.append("\n")
                text_parts.append(table_text)
                text_parts.append("\n")
                html_parts.append(table_html)

                mapper.add_mapping(
                    table_start,
                    table_end,
                    DocumentCoordinate(
                        paragraph_id=table_id,
                        section=current_section,
                    )
                )
                # Account for the trailing "\n"
                current_offset = table_end + 1
                metadata["tables"].append({
                    "id": table_id,
                    "start": table_start,
                    "end": table_end,
                })

        # Extract footers if requested
        if opts.get("include_footers"):
            current_offset = self._append_margin_text(
                doc, "footer", text_parts, html_parts,
                mapper, metadata, current_offset,
            )

        html_parts.append('</div>')

        full_text = "".join(text_parts)

        # Use mammoth HTML if available, otherwise use our generated HTML
        if rendered_html:
            final_html = rendered_html
        else:
            final_html = "\n".join(html_parts)

        coord_dict = mapper.to_dict()
        coord_dict["get_coords_for_range"] = mapper.get_coords_for_range

        return FormatOutput(
            text=full_text,
            rendered_html=final_html,
            coordinate_map=coord_dict,
            metadata=metadata,
            format_name=self.format_name,
            source_path=str(file_path),
        )

    @staticmethod
    def _heading_level(para) -> Optional[int]:
        """
        Return the heading level encoded in a paragraph's style name.

        A style whose lower-cased name starts with "heading" and is followed
        by an integer (e.g. "Heading 2") yields that integer. Anything else
        yields None.
        """
        if not (para.style and para.style.name):
            return None
        style_name = para.style.name.lower()
        if not style_name.startswith("heading"):
            return None
        try:
            return int(style_name.replace("heading", "").strip())
        except ValueError:
            # e.g. a bare "Heading" style or one with a non-numeric suffix
            return None

    def _append_margin_text(
        self,
        doc,
        kind: str,
        text_parts: List[str],
        html_parts: List[str],
        mapper,
        metadata: Dict[str, Any],
        offset: int,
    ) -> int:
        """
        Append each section's header or footer text to the output buffers.

        Args:
            doc: The opened python-docx document.
            kind: "header" or "footer"; names the section attribute to read
                and is used as the id prefix, CSS class suffix and section tag.
            text_parts: Plain-text buffer, extended in place.
            html_parts: HTML buffer, extended in place.
            mapper: Coordinate mapper that receives one mapping per block.
            metadata: Extraction metadata; its "sections" list is extended.
            offset: Plain-text offset at which the next block starts.

        Returns:
            The plain-text offset after everything appended here.
        """
        for section in doc.sections:
            part = getattr(section, kind)
            if not (part and part.paragraphs):
                continue

            margin_text = "\n".join(
                p.text for p in part.paragraphs if p.text.strip()
            )
            if not margin_text:
                continue

            para_id = f"{kind}_{uuid.uuid4().hex[:8]}"
            text_parts.append(margin_text)
            text_parts.append("\n\n")

            html_parts.append(
                f'<div class="docx-{kind}" data-para-id="{para_id}">'
            )
            html_parts.append(f'{html.escape(margin_text)}')
            html_parts.append('</div>')

            mapper.add_mapping(
                offset,
                offset + len(margin_text),
                DocumentCoordinate(
                    paragraph_id=para_id,
                    local_offset=0,
                    section=kind,
                )
            )
            # +2 for the "\n\n" appended after the block
            offset += len(margin_text) + 2
            metadata["sections"].append({"type": kind, "id": para_id})

        return offset

    def _extract_with_mammoth(
        self, file_path: str, opts: Dict[str, Any]
    ) -> Optional[str]:
        """
        Use mammoth for rich HTML conversion.

        Args:
            file_path: Path to the .docx file
            opts: Merged extraction options (currently not consulted here)

        Returns:
            The converted HTML wrapped in a container element, or None when
            mammoth is not installed or the conversion raises.
        """
        if not MAMMOTH_AVAILABLE:
            return None

        try:
            with open(file_path, "rb") as f:
                result = mammoth.convert_to_html(f)
                html_content = result.value

            # Wrap in container
            # NOTE(review): container class name follows the file's "docx-*"
            # convention; confirm against the consuming stylesheet.
            return f'<div class="docx-content">{html_content}</div>'
        except Exception as e:
            # Best effort: the caller falls back to the generated HTML
            logger.warning("Mammoth conversion failed: %s", e)
            return None

    def _extract_table(
        self,
        table,
        table_id: str,
        base_offset: int
    ) -> Tuple[str, str]:
        """
        Extract text and HTML from a table.

        Cells within a row are joined with tabs and rows with newlines;
        each cell's text is stripped of surrounding whitespace.

        Args:
            table: python-docx table to read
            table_id: Identifier written into the table's HTML element
            base_offset: Offset of the table in the full text (accepted for
                interface compatibility; not used in the computation)

        Returns:
            Tuple of (text, html)
        """
        text_rows = []
        # NOTE(review): table class/attribute names follow the file's
        # "docx-*" / data-* conventions; confirm against the stylesheet.
        html_parts = [f'<table class="docx-table" data-table-id="{table_id}">']

        for row in table.rows:
            row_texts = []
            html_parts.append('<tr>')

            for cell in row.cells:
                cell_text = cell.text.strip()
                row_texts.append(cell_text)
                html_parts.append(f'<td>{html.escape(cell_text)}</td>')

            html_parts.append('</tr>')
            text_rows.append("\t".join(row_texts))

        html_parts.append('</table>')

        return "\n".join(text_rows), "\n".join(html_parts)

    def extract_metadata(self, file_path: str) -> Dict[str, Any]:
        """
        Extract document metadata (author, title, etc.).

        Args:
            file_path: Path to the .docx file

        Returns:
            Dictionary of metadata properties; the created/modified
            timestamps are stringified, or None when unset.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if not DOCX_AVAILABLE:
            raise ImportError("python-docx is required")

        doc = docx.Document(file_path)
        core_props = doc.core_properties

        return {
            "author": core_props.author,
            "title": core_props.title,
            "subject": core_props.subject,
            "keywords": core_props.keywords,
            "created": str(core_props.created) if core_props.created else None,
            "modified": str(core_props.modified) if core_props.modified else None,
            "last_modified_by": core_props.last_modified_by,
            "revision": core_props.revision,
            "category": core_props.category,
            "comments": core_props.comments,
        }