Spaces:

Blablablab
/

codebook

Paused

App Files Files Community

codebook / potato /format_handlers /docx_handler.py

davidjurgens

Deploy: Potato — Codebook Annotation

aceb1b2 verified 11 days ago

Raw

History Blame Contribute Delete

11.3 kB

	"""
	DOCX Format Handler

	Extracts text and structure from Word documents using python-docx and mammoth.
	Supports paragraph-level coordinate mapping for span annotations.

	Usage:
	from potato.format_handlers.docx_handler import DocxHandler

	handler = DocxHandler()
	output = handler.extract("document.docx", {
	"preserve_styles": True,
	"include_headers": True,
	})
	"""

	from typing import Dict, List, Any, Optional
	from pathlib import Path
	import html
	import logging
	import uuid

	from .base import BaseFormatHandler, FormatOutput
	from .coordinate_mapping import CoordinateMapper, DocumentCoordinate

	logger = logging.getLogger(__name__)

	# Check if dependencies are available
	try:
	import docx
	from docx.document import Document
	from docx.oxml.ns import qn
	DOCX_AVAILABLE = True
	except ImportError:
	DOCX_AVAILABLE = False
	docx = None

	try:
	import mammoth
	MAMMOTH_AVAILABLE = True
	except ImportError:
	MAMMOTH_AVAILABLE = False
	mammoth = None


	class DocxHandler(BaseFormatHandler):
	"""
	Handler for Word documents (.docx).

	Uses python-docx for text extraction with structure preservation
	and mammoth for HTML conversion.
	"""

	format_name = "docx"
	supported_extensions = [".docx"]
	description = "Word document extraction with paragraph/section mapping"
	requires_dependencies = ["python-docx", "mammoth"]

	def get_default_options(self) -> Dict[str, Any]:
	    """Return a fresh dict holding the handler's default extraction options."""
	    defaults: Dict[str, Any] = dict(
	        preserve_styles=True,
	        include_headers=True,
	        include_footers=False,
	        include_tables=True,
	        paragraph_separator="\n\n",
	        # Prefer mammoth's rich HTML conversion for the rendered output.
	        use_mammoth_html=True,
	    )
	    return defaults

	def extract(
	    self,
	    file_path: str,
	    options: Optional[Dict[str, Any]] = None
	) -> FormatOutput:
	    """
	    Extract text and structure from a Word document.

	    The plain text is assembled in this order: section headers (optional),
	    non-empty body paragraphs, top-level tables (optional), section footers
	    (optional). Every emitted piece is registered with the coordinate
	    mapper using its character offsets in the returned text.

	    Args:
	        file_path: Path to the .docx file
	        options: Extraction options (merged via ``self.merge_options``):
	            - preserve_styles: Keep heading levels and formatting
	              (accepted; not consulted by this method)
	            - include_headers: Include document headers
	            - include_footers: Include document footers (appended after
	              the body and tables)
	            - include_tables: Extract table content
	            - paragraph_separator: String placed after each body paragraph
	            - use_mammoth_html: Return mammoth's HTML instead of the
	              generated markup when mammoth is installed and succeeds

	    Returns:
	        FormatOutput with extracted text, HTML, and coordinate mappings

	    Raises:
	        ImportError: If python-docx is not installed.
	    """
	    if not DOCX_AVAILABLE:
	        raise ImportError(
	            "python-docx is required for DOCX extraction. "
	            "Install with: pip install python-docx"
	        )

	    opts = self.merge_options(options)
	    mapper = CoordinateMapper()

	    # Open document
	    doc = docx.Document(file_path)

	    # Plain-text pieces, generated-HTML pieces, and the running length of
	    # the text emitted so far (kept equal to len("".join(text_parts))).
	    text_parts = []
	    html_parts = []
	    current_offset = 0

	    metadata = {
	        "format": "docx",
	        "source_file": str(file_path),
	        "paragraphs": [],
	        "sections": [],
	        "tables": [],
	    }

	    # Ask mammoth for rich HTML only when requested and installed.
	    if opts.get("use_mammoth_html") and MAMMOTH_AVAILABLE:
	        rendered_html = self._extract_with_mammoth(file_path, opts)
	    else:
	        rendered_html = None

	    html_parts.append('<div class="docx-content">')

	    def emit_margin_text(kind: str) -> None:
	        """Append each section's header/footer text and map its offsets.

	        ``kind`` is "header" or "footer"; it selects the section attribute
	        and is reused for the id prefix, CSS class, and section label.
	        """
	        nonlocal current_offset
	        # NOTE(review): a header/footer linked to the previous section
	        # presumably resolves to the same content, so multi-section
	        # documents may repeat text here - confirm whether
	        # de-duplication is wanted before changing emitted offsets.
	        for section in doc.sections:
	            part = getattr(section, kind)
	            if not (part and part.paragraphs):
	                continue
	            margin_text = "\n".join(
	                p.text for p in part.paragraphs if p.text.strip()
	            )
	            if not margin_text:
	                continue

	            para_id = f"{kind}_{uuid.uuid4().hex[:8]}"
	            text_parts.append(margin_text)
	            text_parts.append("\n\n")

	            html_parts.append(f'<div class="docx-{kind}" data-para-id="{para_id}">')
	            html_parts.append(f'{html.escape(margin_text)}')
	            html_parts.append('</div>')

	            mapper.add_mapping(
	                current_offset,
	                current_offset + len(margin_text),
	                DocumentCoordinate(
	                    paragraph_id=para_id,
	                    local_offset=0,
	                    section=kind,
	                )
	            )
	            # +2 accounts for the "\n\n" appended after the text.
	            current_offset += len(margin_text) + 2
	            metadata["sections"].append({"type": kind, "id": para_id})

	    # Extract headers if requested
	    if opts.get("include_headers"):
	        emit_margin_text("header")

	    # Extract main body
	    separator = opts.get("paragraph_separator", "\n\n")
	    current_section = None
	    for i, para in enumerate(doc.paragraphs):
	        para_text = para.text
	        if not para_text.strip():
	            continue

	        para_id = f"p_{i}_{uuid.uuid4().hex[:8]}"

	        # Detect heading level from style names such as "Heading 2";
	        # names with a non-numeric remainder are treated as plain text.
	        heading_level = None
	        if para.style and para.style.name:
	            style_name = para.style.name.lower()
	            if style_name.startswith("heading"):
	                try:
	                    heading_level = int(style_name.replace("heading", "").strip())
	                except ValueError:
	                    pass

	        # A heading starts a new section named after its own text.
	        if heading_level:
	            current_section = para_text.strip()

	        # Build text: the paragraph followed by the separator.
	        start_offset = current_offset
	        end_offset = start_offset + len(para_text)
	        text_parts.append(para_text)
	        text_parts.append(separator)
	        current_offset = end_offset + len(separator)

	        # Build HTML (heading tags are capped at <h6>).
	        if heading_level:
	            css_class = f"docx-heading docx-h{heading_level}"
	            html_tag = f"h{min(heading_level, 6)}"
	        else:
	            css_class = "docx-paragraph"
	            html_tag = "p"

	        html_parts.append(
	            f'<{html_tag} class="{css_class}" '
	            f'data-para-id="{para_id}" '
	            f'data-start="{start_offset}" '
	            f'data-end="{end_offset}">'
	            f'{html.escape(para_text)}'
	            f'</{html_tag}>'
	        )

	        # Add coordinate mapping
	        mapper.add_mapping(
	            start_offset,
	            end_offset,
	            DocumentCoordinate(
	                paragraph_id=para_id,
	                local_offset=0,
	                section=current_section,
	                heading_level=heading_level,
	            )
	        )

	        # Track paragraph metadata
	        metadata["paragraphs"].append({
	            "id": para_id,
	            "start": start_offset,
	            "end": end_offset,
	            "heading_level": heading_level,
	            "section": current_section,
	            "char_count": len(para_text),
	        })

	    # Extract tables if requested (emitted after all body paragraphs).
	    if opts.get("include_tables"):
	        for t_idx, table in enumerate(doc.tables):
	            table_id = f"table_{t_idx}_{uuid.uuid4().hex[:8]}"
	            # The table text is written as "\n" + text + "\n", so it begins
	            # one character after the current end of the output.
	            table_start = current_offset + 1
	            table_text, table_html = self._extract_table(
	                table, table_id, table_start
	            )
	            if not table_text:
	                continue

	            text_parts.append("\n")
	            text_parts.append(table_text)
	            text_parts.append("\n")
	            html_parts.append(table_html)

	            table_end = table_start + len(table_text)
	            mapper.add_mapping(
	                table_start,
	                table_end,
	                DocumentCoordinate(
	                    paragraph_id=table_id,
	                    section=current_section,
	                )
	            )
	            # Skip past the trailing "\n".
	            current_offset = table_end + 1
	            metadata["tables"].append({
	                "id": table_id,
	                "start": table_start,
	                "end": table_end,
	            })

	    # Extract footers if requested (off by default).
	    if opts.get("include_footers"):
	        emit_margin_text("footer")

	    html_parts.append('</div>')

	    full_text = "".join(text_parts)

	    # Use mammoth HTML if available, otherwise use our generated HTML
	    if rendered_html:
	        final_html = rendered_html
	    else:
	        final_html = "\n".join(html_parts)

	    coord_dict = mapper.to_dict()
	    # Expose the mapper's range lookup alongside the serialized mappings.
	    coord_dict["get_coords_for_range"] = mapper.get_coords_for_range

	    return FormatOutput(
	        text=full_text,
	        rendered_html=final_html,
	        coordinate_map=coord_dict,
	        metadata=metadata,
	        format_name=self.format_name,
	        source_path=str(file_path),
	    )

	def _extract_with_mammoth(self, file_path: str, opts: Dict[str, Any]) -> Optional[str]:
	    """
	    Use mammoth for rich HTML conversion.

	    Args:
	        file_path: Path to the .docx file to convert.
	        opts: Merged extraction options (not consulted by this method).

	    Returns:
	        The converted HTML wrapped in a ``docx-content docx-mammoth``
	        container, or None when mammoth is not installed or the
	        conversion raises.
	    """
	    if not MAMMOTH_AVAILABLE:
	        return None

	    try:
	        with open(file_path, "rb") as f:
	            html_content = mammoth.convert_to_html(f).value
	    except Exception as e:
	        # Deliberately best-effort: any failure is logged and reported as
	        # None so the caller can fall back to its own generated HTML.
	        logger.warning("Mammoth conversion failed: %s", e)
	        return None

	    # Wrap in container
	    return f'<div class="docx-content docx-mammoth">{html_content}</div>'

	def _extract_table(
	self,
	table,
	table_id: str,
	base_offset: int
	) -> tuple:
	"""
	Extract text and HTML from a table.

	Returns:
	Tuple of (text, html)
	"""
	text_rows = []
	html_parts = []

	html_parts.append(f'<table class="docx-table" data-table-id="{table_id}">')

	for row_idx, row in enumerate(table.rows):
	row_texts = []
	html_parts.append('<tr>')

	for cell_idx, cell in enumerate(row.cells):
	cell_text = cell.text.strip()
	row_texts.append(cell_text)
	html_parts.append(f'<td>{html.escape(cell_text)}</td>')

	html_parts.append('</tr>')
	text_rows.append("\t".join(row_texts))

	html_parts.append('</table>')

	return "\n".join(text_rows), "\n".join(html_parts)

	def extract_metadata(self, file_path: str) -> Dict[str, Any]:
	    """
	    Read the document's core properties (author, title, etc.).

	    Args:
	        file_path: Path to the .docx file

	    Returns:
	        Dictionary of metadata properties. ``created`` and ``modified``
	        are converted with ``str()`` when set and are None otherwise;
	        all other values are passed through unchanged.

	    Raises:
	        ImportError: If python-docx is not installed.
	    """
	    if not DOCX_AVAILABLE:
	        raise ImportError("python-docx is required")

	    props = docx.Document(file_path).core_properties

	    # Output key order matches this tuple.
	    field_names = (
	        "author",
	        "title",
	        "subject",
	        "keywords",
	        "created",
	        "modified",
	        "last_modified_by",
	        "revision",
	        "category",
	        "comments",
	    )
	    stringified = ("created", "modified")

	    info: Dict[str, Any] = {}
	    for field_name in field_names:
	        value = getattr(props, field_name)
	        if field_name in stringified:
	            value = str(value) if value else None
	        info[field_name] = value
	    return info