| """Helpers for the optional markitdown document-extraction dependency.
|
|
|
| markitdown (MIT, Microsoft) converts Office/EPUB documents to Markdown, which is
|
| more token-efficient and model-legible than a raw text dump. It is **optional**:
|
| install with `pip install -r requirements-optional.txt`. When absent, callers
|
| degrade gracefully (chat shows a hint; the RAG indexer skips the file) — the MIT
|
| core never hard-depends on it. Mirrors the optional-dependency pattern in
|
| `src/pdf_runtime.py`.
|
| """
|
|
|
| import logging
|
| import os
|
|
|
| logger = logging.getLogger(__name__)
|
|
|
| MARKITDOWN_MISSING = (
|
| "Office/EPUB document extraction requires markitdown. Install optional "
|
| "dependencies with `pip install -r requirements-optional.txt`."
|
| )
|
|
|
|
|
|
|
|
|
| MARKITDOWN_EXTS = frozenset({".docx", ".pptx", ".xlsx", ".xls", ".epub"})
|
|
|
|
|
| def is_markitdown_format(path: str) -> bool:
|
| """True if the file extension is one we route through markitdown."""
|
| if not isinstance(path, str):
|
| return False
|
| return os.path.splitext(path)[1].lower() in MARKITDOWN_EXTS
|
|
|
|
|
| def load_markitdown():
|
| """Return the MarkItDown class, or raise a user-facing setup hint."""
|
| try:
|
| from markitdown import MarkItDown
|
| except ImportError as exc:
|
| raise RuntimeError(MARKITDOWN_MISSING) from exc
|
| return MarkItDown
|
|
|
|
|
| def convert_to_markdown(path: str) -> str | None:
|
| """Convert a document to Markdown text via markitdown.
|
|
|
| Returns the extracted Markdown, or ``None`` if markitdown is unavailable or
|
| the conversion fails — callers degrade gracefully rather than erroring.
|
| """
|
| try:
|
| markitdown_cls = load_markitdown()
|
| except RuntimeError:
|
| logger.warning("markitdown not installed; cannot extract %s", path)
|
| return None
|
| try:
|
| result = markitdown_cls().convert(path)
|
| text = getattr(result, "text_content", None)
|
| if text is None:
|
| text = getattr(result, "markdown", None)
|
| return text
|
| except Exception as e:
|
| logger.warning("markitdown failed to convert %s: %s", path, e)
|
| return None
|
|
|