import os
from typing import List
from langchain_core.documents import Document
from docx import Document as DocxDocument
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from ingestion.loaders.normalization import normalize_text

def table_to_text(table) -> str:
    """Render a DOCX table as pipe-separated lines of plain text.

    Each row becomes one line whose normalized cell texts are joined with
    ``" | "``; rows in which every cell is empty after normalization are
    dropped. Returns ``""`` when no row survives, or when any error occurs
    while reading the table (the error is printed, not raised).
    """
    try:
        kept_rows = []
        for row in table.rows:
            cells = [normalize_text(cell.text) for cell in row.cells]
            if not any(cells):
                # Nothing but blank cells in this row: leave it out.
                continue
            kept_rows.append(cells)

        # Joining zero rows yields "", which covers the "no data" case.
        rendered = "\n".join(" | ".join(cells) for cells in kept_rows)
    except Exception as err:
        print(f"Error converting table to text: {err}")
        return ""

    return rendered


def load_docx(file_path: str) -> List[Document]:
    """Read a DOCX file into a list of ``Document`` objects.

    The document body is walked element by element. Every paragraph whose
    normalized text is non-empty yields a document with ``"type": "text"``,
    and every table whose rendered text is non-empty yields one with
    ``"type": "table"``. Both carry ``file_path`` under ``"source"``.

    Failures are reported with ``print`` instead of being raised: a missing
    file, a DOCX that cannot be opened, or an error while preparing the
    traversal gives ``[]``; an error on a single paragraph or table skips
    only that element.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    try:
        docx_file = DocxDocument(file_path)
    except Exception as err:
        print(f"Failed to open DOCX ({file_path}): {err}")
        return []

    collected = []
    try:
        children = list(docx_file.element.body)
        # The paragraph/table wrappers are consumed in lockstep with the
        # matching body children; this relies on both sequences being in
        # the same order as the body elements.
        remaining_paragraphs = iter(docx_file.paragraphs)
        remaining_tables = iter(docx_file.tables)

        for child in children:
            if isinstance(child, CT_P):
                try:
                    text = normalize_text(next(remaining_paragraphs).text)
                    if not text:
                        continue
                    text_meta = {"source": file_path, "type": "text"}
                    collected.append(
                        Document(page_content=text, metadata=text_meta)
                    )
                except StopIteration:
                    # More paragraph elements than wrappers: nothing to add.
                    pass
                except Exception as err:
                    print(f"Error reading paragraph: {err}")
            elif isinstance(child, CT_Tbl):
                try:
                    rendered = table_to_text(next(remaining_tables))
                    if not rendered:
                        continue
                    table_meta = {"source": file_path, "type": "table"}
                    collected.append(
                        Document(page_content=rendered, metadata=table_meta)
                    )
                except StopIteration:
                    # More table elements than wrappers: nothing to add.
                    pass
                except Exception as err:
                    print(f"Error reading table: {err}")
            # Any other body element is ignored.

    except Exception as err:
        print(f"[WARN] Error processing DOCX ({file_path}): {err}")
        return []

    return collected