import os
import re
from typing import List
from langchain_core.documents import Document
from ingestion.loaders.normalization import normalize_text


# One table segment = a run of consecutive lines that start and end with a
# pipe. Anchoring at line start (MULTILINE "^") keeps a stray "a | b |" in the
# middle of a sentence from being cut out as a table; the "\Z" alternative
# lets the last row match when the file has no trailing newline.
_MD_TABLE_RE = re.compile(r"((?:^[ \t]*\|.*\|[ \t]*(?:\n|\Z))+)", re.MULTILINE)
# Fenced code blocks legitimately span several lines, hence DOTALL.
_MD_FENCED_CODE_RE = re.compile(r"```.*?```", re.DOTALL)
# Inline code spans are confined to one line so that a single unmatched
# backtick cannot swallow everything up to the next backtick in the file.
_MD_INLINE_CODE_RE = re.compile(r"`[^`\n]*`")
# Bold markers; only the markers are removed, the emphasized text stays.
_MD_EMPHASIS_RE = re.compile(r"\*\*|__")
# Leading heading markers only; a "#" elsewhere ("C#", "issue #12") is
# content and must survive. Group 1 preserves the line's indentation.
_MD_HEADING_RE = re.compile(r"^([ \t]*)#+", re.MULTILINE)


def _strip_md_markup(segment: str) -> str:
    """Return *segment* without code spans, bold markers and heading hashes."""
    segment = _MD_FENCED_CODE_RE.sub("", segment)
    segment = _MD_INLINE_CODE_RE.sub("", segment)
    segment = _MD_EMPHASIS_RE.sub("", segment)
    return _MD_HEADING_RE.sub(r"\1", segment)


def load_md(file_path: str) -> List[Document]:
    """Load a Markdown file as a list of text and table documents.

    The file is read as UTF-8, falling back to Latin-1 when it is not valid
    UTF-8. Its content is split into alternating segments: runs of
    pipe-delimited lines become ``"table"`` segments and everything between
    them becomes ``"text"`` segments. Each segment has fenced/inline code,
    ``**``/``__`` markers and leading ``#`` heading markers removed, is passed
    through ``normalize_text``, and is emitted as one ``Document`` unless the
    result is empty.

    Args:
        file_path: Path of the Markdown file to load.

    Returns:
        One ``Document`` per non-empty segment, in file order, with metadata
        ``{"source": file_path, "type": "table" | "text"}``. An empty list is
        returned when the file does not exist or cannot be read. A segment
        that fails to process is reported and skipped; the remaining
        segments are still returned.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this retry can only
        # fail for I/O reasons, never for decoding reasons.
        try:
            with open(file_path, "r", encoding="latin-1") as f:
                text = f.read()
        except Exception as e:
            print(f"Failed to read Markdown file ({file_path}): {e}")
            return []
    except Exception as e:
        print(f"Could not open Markdown file ({file_path}): {e}")
        return []

    docs: List[Document] = []
    # re.split with a single capturing group yields
    # [text, table, text, table, ..., text]: odd indices are the tables.
    for index, part in enumerate(_MD_TABLE_RE.split(text)):
        if not part.strip():
            continue

        content_type = "table" if index % 2 else "text"
        try:
            cleaned = normalize_text(_strip_md_markup(part))
            if cleaned:
                docs.append(
                    Document(
                        page_content=cleaned,
                        metadata={"source": file_path, "type": content_type},
                    )
                )
        except Exception as e:
            # Skip only the offending segment so one bad section does not
            # throw away the documents already collected.
            print(f"Error parsing Markdown file ({file_path}): {e}")

    return docs