|
|
from typing import BinaryIO, Union, Awaitable
import base64
import mimetypes
import asyncio

from markitdown._stream_info import StreamInfo
|
|
|
|
|
|
|
|
def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
) -> Union[None, str, Awaitable[str]]:
    """Request a caption for the image in *file_stream* from a chat client.

    The remaining bytes of the stream are base64-encoded into a ``data:`` URI
    and sent, together with a text prompt, as a single user message to
    ``client.chat.completions.create``.

    Args:
        file_stream: Binary stream positioned at the start of the image data.
            Its position is restored before this function returns.
        stream_info: Supplies ``mimetype`` and, as a fallback, ``extension``
            for determining the content type of the data URI.
        client: Object exposing ``chat.completions.create(model=...,
            messages=...)``. The call may return either a response object or
            a coroutine that resolves to one.
        model: Model identifier forwarded unchanged to the client.
        prompt: Caption instruction. ``None`` or a blank string selects a
            built-in default prompt.

    Returns:
        ``None`` if the stream could not be read or encoded; otherwise the
        ``content`` of the first choice's message. When the client call
        returns a coroutine, a coroutine resolving to that content is
        returned instead and must be awaited by the caller.
    """
    if prompt is None or prompt.strip() == "":
        prompt = "Write a detailed caption for this image."

    # Content type: explicit mimetype first, then a guess from the extension,
    # then a generic binary type so the data URI is always well-formed.
    content_type = stream_info.mimetype
    if not content_type:
        content_type, _ = mimetypes.guess_type(
            "_dummy" + (stream_info.extension or "")
        )
    if not content_type:
        content_type = "application/octet-stream"

    # Read from the current position and always rewind afterwards so that the
    # caller can keep using the stream.
    cur_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception:
        # Deliberately best-effort: an unreadable stream means "no caption".
        return None
    finally:
        file_stream.seek(cur_pos)

    data_uri = f"data:{content_type};base64,{base64_image}"

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": data_uri,
                    },
                },
            ],
        }
    ]

    response = client.chat.completions.create(model=model, messages=messages)

    if asyncio.iscoroutine(response):
        # Async client: hand back a coroutine so this function itself stays
        # synchronous; the caller awaits it to obtain the caption text.
        async def read_content(pending):
            completed = await pending
            return completed.choices[0].message.content

        return read_content(response)

    return response.choices[0].message.content