"""
DOCX Format Handler

Extracts text and structure from Word documents using python-docx and mammoth.
Supports paragraph-level coordinate mapping for span annotations.

Usage:
    from potato.format_handlers.docx_handler import DocxHandler

    handler = DocxHandler()
    output = handler.extract("document.docx", {
        "preserve_styles": True,
        "include_headers": True,
    })
"""

from typing import Dict, List, Any, Optional
from pathlib import Path
import html
import logging
import uuid

from .base import BaseFormatHandler, FormatOutput
from .coordinate_mapping import CoordinateMapper, DocumentCoordinate

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Optional third-party dependencies.
#
# Each import is attempted once at module load. The *_AVAILABLE flags record
# whether the import succeeded so the handler can check them at call time
# instead of failing when this module is imported.
try:
    import docx
    from docx.document import Document
    from docx.oxml.ns import qn
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False
    # Keep the name `docx` bound so references to it do not raise NameError.
    # Note: `Document` and `qn` are NOT bound on this path.
    docx = None

try:
    import mammoth
    MAMMOTH_AVAILABLE = True
except ImportError:
    MAMMOTH_AVAILABLE = False
    # Keep the name `mammoth` bound even when the package is missing.
    mammoth = None


class DocxHandler(BaseFormatHandler):
    """
    Handler for Word documents (.docx).

    Uses python-docx for text extraction with structure preservation
    and mammoth for HTML conversion.

    The plain text produced by :meth:`extract` is assembled in this order:

    1. header blocks (one per document section, if ``include_headers``),
    2. every non-blank body paragraph, each followed by
       ``paragraph_separator``,
    3. tables (if ``include_tables``), appended after all paragraphs,
    4. footer blocks (one per document section, if ``include_footers``).

    Every emitted block is registered with a ``CoordinateMapper`` so that a
    character range in the plain text can be mapped back to the block it
    came from.
    """

    format_name = "docx"
    supported_extensions = [".docx"]
    description = "Word document extraction with paragraph/section mapping"
    requires_dependencies = ["python-docx", "mammoth"]

    # Separator written after each header/footer block. This is fixed and
    # independent of the ``paragraph_separator`` option.
    _MARGIN_SEPARATOR = "\n\n"

    def get_default_options(self) -> Dict[str, Any]:
        """Get default extraction options."""
        return {
            "preserve_styles": True,
            "include_headers": True,
            "include_footers": False,
            "include_tables": True,
            "paragraph_separator": "\n\n",
            "use_mammoth_html": True,  # Use mammoth for rich HTML conversion
        }

    def extract(
        self,
        file_path: str,
        options: Optional[Dict[str, Any]] = None
    ) -> FormatOutput:
        """
        Extract text and structure from a Word document.

        Args:
            file_path: Path to the .docx file
            options: Extraction options:
                - preserve_styles: Accepted for compatibility; this method
                  does not currently read it (heading styles are always
                  detected).
                - include_headers: Include document headers
                - include_footers: Include document footers (appended after
                  the body and tables)
                - include_tables: Extract table content
                - paragraph_separator: String written after each body
                  paragraph in the plain text
                - use_mammoth_html: Use mammoth's HTML as ``rendered_html``
                  when mammoth is installed and the conversion succeeds

        Returns:
            FormatOutput with extracted text, HTML, and coordinate mappings.
            The generated HTML carrying ``data-para-id`` / ``data-start`` /
            ``data-end`` attributes is only used when mammoth HTML is not
            available; otherwise ``rendered_html`` is mammoth's output.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if not DOCX_AVAILABLE:
            raise ImportError(
                "python-docx is required for DOCX extraction. "
                "Install with: pip install python-docx"
            )

        opts = self.merge_options(options)
        mapper = CoordinateMapper()
        separator = opts["paragraph_separator"]

        # Open document
        doc = docx.Document(file_path)

        # Parallel accumulators: plain text pieces and generated HTML pieces.
        # `current_offset` always equals the total length of `text_parts`.
        text_parts = []
        html_parts = ['<div class="docx-content">']
        current_offset = 0

        metadata = {
            "format": "docx",
            "source_file": str(file_path),
            "paragraphs": [],
            "sections": [],
            "tables": [],
        }

        # Rich HTML via mammoth is best-effort; None means "fall back to the
        # HTML generated below".
        rendered_html = None
        if opts.get("use_mammoth_html") and MAMMOTH_AVAILABLE:
            rendered_html = self._extract_with_mammoth(file_path, opts)

        # Extract headers if requested
        if opts.get("include_headers"):
            current_offset = self._append_margin_blocks(
                doc, "header", text_parts, html_parts,
                mapper, metadata, current_offset,
            )

        # Extract main body
        current_section = None
        for i, para in enumerate(doc.paragraphs):
            para_text = para.text
            if not para_text.strip():
                # Blank paragraphs contribute neither text nor offsets.
                continue

            para_id = f"p_{i}_{uuid.uuid4().hex[:8]}"
            heading_level = self._detect_heading_level(para)

            # A heading starts a new logical section; its text names the
            # section for itself and all following blocks.
            if heading_level:
                current_section = para_text.strip()

            # Build text
            start_offset = current_offset
            end_offset = start_offset + len(para_text)
            text_parts.append(para_text)
            text_parts.append(separator)
            current_offset = end_offset + len(separator)

            # Build HTML
            if heading_level:
                css_class = f"docx-heading docx-h{heading_level}"
                # HTML only defines h1..h6; deeper levels collapse to h6.
                html_tag = f"h{min(heading_level, 6)}"
            else:
                css_class = "docx-paragraph"
                html_tag = "p"

            html_parts.append(
                f'<{html_tag} class="{css_class}" '
                f'data-para-id="{para_id}" '
                f'data-start="{start_offset}" '
                f'data-end="{end_offset}">'
                f'{html.escape(para_text)}'
                f'</{html_tag}>'
            )

            # Add coordinate mapping
            mapper.add_mapping(
                start_offset,
                end_offset,
                DocumentCoordinate(
                    paragraph_id=para_id,
                    local_offset=0,
                    section=current_section,
                    heading_level=heading_level,
                )
            )

            # Track paragraph metadata
            metadata["paragraphs"].append({
                "id": para_id,
                "start": start_offset,
                "end": end_offset,
                "heading_level": heading_level,
                "section": current_section,
                "char_count": len(para_text),
            })

        # Extract tables if requested
        if opts.get("include_tables"):
            for t_idx, table in enumerate(doc.tables):
                table_id = f"table_{t_idx}_{uuid.uuid4().hex[:8]}"
                # Each table is written as "\n" + text + "\n", so the table
                # text itself begins one character past the current offset.
                table_start = current_offset + 1
                table_text, table_html = self._extract_table(
                    table, table_id, table_start
                )

                if not table_text:
                    continue

                text_parts.extend(("\n", table_text, "\n"))
                html_parts.append(table_html)

                table_end = table_start + len(table_text)
                mapper.add_mapping(
                    table_start,
                    table_end,
                    DocumentCoordinate(
                        paragraph_id=table_id,
                        section=current_section,
                    )
                )
                # Account for the trailing newline.
                current_offset = table_end + 1
                metadata["tables"].append({
                    "id": table_id,
                    "start": table_start,
                    "end": table_end,
                })

        # Extract footers if requested
        if opts.get("include_footers"):
            self._append_margin_blocks(
                doc, "footer", text_parts, html_parts,
                mapper, metadata, current_offset,
            )

        html_parts.append('</div>')

        full_text = "".join(text_parts)

        # Use mammoth HTML if available, otherwise use our generated HTML
        final_html = rendered_html if rendered_html else "\n".join(html_parts)

        coord_dict = mapper.to_dict()
        # Expose the mapper's range lookup alongside the serialized mappings.
        coord_dict["get_coords_for_range"] = mapper.get_coords_for_range

        return FormatOutput(
            text=full_text,
            rendered_html=final_html,
            coordinate_map=coord_dict,
            metadata=metadata,
            format_name=self.format_name,
            source_path=str(file_path),
        )

    @staticmethod
    def _detect_heading_level(para) -> Optional[int]:
        """
        Return the heading level encoded in a paragraph's style name.

        A style whose lower-cased name starts with ``"heading"`` and whose
        remainder parses as an integer (e.g. ``"Heading 2"``) yields that
        integer. Any other paragraph yields ``None``.
        """
        style = para.style
        if not (style and style.name):
            return None

        style_name = style.name.lower()
        if not style_name.startswith("heading"):
            return None

        try:
            return int(style_name.replace("heading", "").strip())
        except ValueError:
            # e.g. a custom style such as "Heading Title" - not a numbered heading.
            return None

    def _append_margin_blocks(
        self,
        doc,
        kind: str,
        text_parts: list,
        html_parts: list,
        mapper,
        metadata: Dict[str, Any],
        offset: int,
    ) -> int:
        """
        Append the header or footer text of every document section.

        Args:
            doc: The opened python-docx document.
            kind: ``"header"`` or ``"footer"``; selects the section attribute
                to read and is used in the block id, CSS class, coordinate
                ``section`` and metadata ``type``.
            text_parts: Plain-text accumulator (mutated in place).
            html_parts: HTML accumulator (mutated in place).
            mapper: Coordinate mapper that receives one mapping per block.
            metadata: Extraction metadata; ``metadata["sections"]`` gets one
                entry per emitted block.
            offset: Plain-text offset at which the first block will start.

        Returns:
            The plain-text offset after all emitted blocks.
        """
        for section in doc.sections:
            part = getattr(section, kind)
            if not (part and part.paragraphs):
                continue

            block_text = "\n".join(
                p.text for p in part.paragraphs if p.text.strip()
            )
            if not block_text:
                continue

            para_id = f"{kind}_{uuid.uuid4().hex[:8]}"
            text_parts.append(block_text)
            text_parts.append(self._MARGIN_SEPARATOR)

            html_parts.append(f'<div class="docx-{kind}" data-para-id="{para_id}">')
            html_parts.append(html.escape(block_text))
            html_parts.append('</div>')

            block_end = offset + len(block_text)
            mapper.add_mapping(
                offset,
                block_end,
                DocumentCoordinate(
                    paragraph_id=para_id,
                    local_offset=0,
                    section=kind,
                )
            )
            offset = block_end + len(self._MARGIN_SEPARATOR)
            metadata["sections"].append({"type": kind, "id": para_id})

        return offset

    def _extract_with_mammoth(
        self,
        file_path: str,
        opts: Dict[str, Any]
    ) -> Optional[str]:
        """
        Use mammoth for rich HTML conversion.

        Args:
            file_path: Path to the .docx file.
            opts: Merged extraction options (not read by this method).

        Returns:
            Mammoth's HTML wrapped in a ``docx-content docx-mammoth`` div, or
            ``None`` when mammoth is not installed or the conversion raises.
        """
        if not MAMMOTH_AVAILABLE:
            return None

        # Deliberately best-effort: any failure is logged and the caller
        # falls back to the HTML it generates itself.
        try:
            with open(file_path, "rb") as f:
                result = mammoth.convert_to_html(f)
        except Exception as e:
            logger.warning("Mammoth conversion failed: %s", e)
            return None

        # Wrap in container
        return f'<div class="docx-content docx-mammoth">{result.value}</div>'

    def _extract_table(
        self,
        table,
        table_id: str,
        base_offset: int
    ) -> tuple:
        """
        Extract text and HTML from a table.

        Cell text is stripped of surrounding whitespace. In the plain text,
        cells are joined with tabs and rows with newlines; in the HTML, each
        cell becomes an escaped ``<td>`` inside a ``<tr>``.

        Args:
            table: A python-docx table object (anything exposing ``rows``
                whose items expose ``cells`` with a ``text`` attribute).
            table_id: Identifier written to the ``data-table-id`` attribute.
            base_offset: Plain-text offset at which the table text starts.
                Currently unused; kept for interface compatibility.

        Returns:
            Tuple of (text, html)
        """
        text_rows = []
        html_parts = [f'<table class="docx-table" data-table-id="{table_id}">']

        for row in table.rows:
            cell_texts = [cell.text.strip() for cell in row.cells]

            html_parts.append('<tr>')
            html_parts.extend(
                f'<td>{html.escape(cell_text)}</td>' for cell_text in cell_texts
            )
            html_parts.append('</tr>')

            text_rows.append("\t".join(cell_texts))

        html_parts.append('</table>')

        return "\n".join(text_rows), "\n".join(html_parts)

    def extract_metadata(self, file_path: str) -> Dict[str, Any]:
        """
        Extract document metadata (author, title, etc.).

        Args:
            file_path: Path to the .docx file

        Returns:
            Dictionary of metadata properties. ``created`` and ``modified``
            are converted with ``str()`` when set and are ``None`` otherwise;
            all other values are passed through from the document's core
            properties unchanged.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if not DOCX_AVAILABLE:
            raise ImportError("python-docx is required")

        doc = docx.Document(file_path)
        core_props = doc.core_properties

        return {
            "author": core_props.author,
            "title": core_props.title,
            "subject": core_props.subject,
            "keywords": core_props.keywords,
            "created": str(core_props.created) if core_props.created else None,
            "modified": str(core_props.modified) if core_props.modified else None,
            "last_modified_by": core_props.last_modified_by,
            "revision": core_props.revision,
            "category": core_props.category,
            "comments": core_props.comments,
        }