File size: 2,764 Bytes
a4b70d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 |
from typing import BinaryIO, Any
import asyncio
from markitdown._base_converter import DocumentConverter, DocumentConverterResult
from markitdown._stream_info import StreamInfo
from markitdown.converters._llm_caption import llm_caption
from markitdown.converters._exiftool import exiftool_metadata
from ._base_converter import AsyncDocumentConverterResult
ACCEPTED_MIME_TYPE_PREFIXES = [
"image/jpeg",
"image/png",
]
ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]
class ImageConverter(DocumentConverter):
"""
Converts images to markdown via extraction of metadata (if `exiftool` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension in ACCEPTED_FILE_EXTENSIONS:
return True
for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
if mimetype.startswith(prefix):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
) -> DocumentConverterResult:
md_content = ""
# Add metadata
metadata = exiftool_metadata(
file_stream, exiftool_path=kwargs.get("exiftool_path")
)
if metadata:
for f in [
"ImageSize",
"Title",
"Caption",
"Description",
"Keywords",
"Artist",
"Author",
"DateTimeOriginal",
"CreateDate",
"GPSPosition",
]:
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPT
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
llm_description = llm_caption(
file_stream,
stream_info,
client=llm_client,
model=llm_model,
prompt=kwargs.get("llm_prompt"),
)
if asyncio.iscoroutine(llm_description):
return AsyncDocumentConverterResult(
llm_description,
)
if llm_description is not None:
md_content += "\n# Description:\n" + llm_description.strip() + "\n"
return DocumentConverterResult(
markdown=md_content,
) |