|
|
from __future__ import annotations |
|
|
|
|
|
import os |
|
|
import asyncio |
|
|
from typing import Any |
|
|
|
|
|
try: |
|
|
from ...integration.markitdown import MarkItDown as MaItDo, StreamInfo |
|
|
has_markitdown = True |
|
|
except ImportError: |
|
|
has_markitdown = False |
|
|
|
|
|
from ...typing import AsyncResult, Messages, MediaListType |
|
|
from ...tools.files import get_tempfile |
|
|
from ..base_provider import AsyncGeneratorProvider, ProviderModelMixin |
|
|
|
|
|
class MarkItDown(AsyncGeneratorProvider, ProviderModelMixin):
    """Provider that turns media files into text via the markitdown integration.

    Each ``(file, filename)`` entry in ``media`` is converted independently and
    the resulting text is yielded, one chunk per file.
    """

    # The provider is only usable when the optional markitdown import succeeded.
    working = has_markitdown

    @staticmethod
    async def _resolve_text(result: Any) -> Any:
        """Return ``result.text_content``, awaiting it first if it is a coroutine."""
        content = result.text_content
        if asyncio.iscoroutine(content):
            return await content
        return content

    @classmethod
    async def create_async_generator(
        cls,
        model: str,
        messages: Messages,
        media: MediaListType = None,
        llm_client: Any = None,
        **kwargs
    ) -> AsyncResult:
        """Convert every media entry to text and yield the non-empty results.

        Args:
            model: Forwarded to the converter as ``llm_model``.
            messages: Not used by this provider.
            media: Iterable of ``(file, filename)`` pairs to convert.
            llm_client: Forwarded to the converter as ``llm_client``.
            **kwargs: Ignored.

        Yields:
            The converted text of each file, skipping files that produce no text.

        Raises:
            ValueError: If ``media`` is ``None``.
            ImportError: If the markitdown integration could not be imported.
        """
        if media is None:
            raise ValueError("MarkItDown requires media to be provided.")
        if not has_markitdown:
            raise ImportError("MarkItDown is not installed. Please install it with `pip install markitdown`.")

        md = MaItDo()
        for file, filename in media:
            text = None
            try:
                result = md.convert(
                    file,
                    stream_info=StreamInfo(filename=filename) if filename else None,
                    llm_client=llm_client,
                    llm_model=model,
                )
                text = await cls._resolve_text(result)
            except TypeError:
                # The direct conversion rejected its arguments; retry from a
                # temporary copy of the file, this time without stream_info.
                # NOTE(review): presumably covers converter versions or input
                # types that do not accept the first call form - confirm.
                copyfile = get_tempfile(file, filename)
                try:
                    result = md.convert(
                        copyfile,
                        llm_client=llm_client,
                        llm_model=model,
                    )
                    text = await cls._resolve_text(result)
                finally:
                    # Always delete the temporary copy, even if conversion fails.
                    os.remove(copyfile)

            # A conversion without text content has nothing to yield; calling
            # ``.split`` on None would raise AttributeError.
            if text is None:
                continue

            # Keep only what follows the last audio-transcript heading, if any.
            text = text.split("### Audio Transcript:\n")[-1]
            if text:
                yield text