File size: 2,160 Bytes
a4b70d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from __future__ import annotations

import os
import asyncio
from typing import Any

try:
    from ...integration.markitdown import MarkItDown as MaItDo, StreamInfo
    has_markitdown = True
except ImportError:
    has_markitdown = False

from ...typing import AsyncResult, Messages, MediaListType
from ...tools.files import get_tempfile
from ..base_provider import AsyncGeneratorProvider, ProviderModelMixin

class MarkItDown(AsyncGeneratorProvider, ProviderModelMixin):
    """Provider that turns media items into text with the MarkItDown converter.

    The provider is only flagged as working when the optional MarkItDown
    integration could be imported at module load time.
    """

    working = has_markitdown

    @staticmethod
    async def _resolve_text(result: Any) -> Any:
        """Return ``result.text_content``, awaiting it first if it is a coroutine."""
        content = result.text_content
        if asyncio.iscoroutine(content):
            return await content
        return content

    @classmethod
    async def create_async_generator(
        cls,
        model: str,
        messages: Messages,
        media: MediaListType = None,
        llm_client: Any = None,
        **kwargs
    ) -> AsyncResult:
        """Convert every media item and yield the resulting text.

        Args:
            model: Forwarded to the converter as ``llm_model``.
            messages: Accepted for interface compatibility; not read here.
            media: Iterable of ``(file, filename)`` pairs to convert.
            llm_client: Forwarded to the converter as ``llm_client``.
            **kwargs: Accepted for interface compatibility; not read here.

        Yields:
            The non-empty converted text of each media item. When the text
            contains the ``### Audio Transcript:`` marker, only the part after
            the last marker is yielded.

        Raises:
            ValueError: If ``media`` is ``None``.
            ImportError: If the MarkItDown integration is not available.
        """
        if media is None:
            raise ValueError("MarkItDown requires media to be provided.")
        if not has_markitdown:
            raise ImportError("MarkItDown is not installed. Please install it with `pip install markitdown`.")
        md = MaItDo()
        for file, filename in media:
            try:
                text = await cls._resolve_text(
                    md.convert(
                        file,
                        stream_info=StreamInfo(filename=filename) if filename else None,
                        llm_client=llm_client,
                        llm_model=model
                    )
                )
            except TypeError:
                # Fallback: retry the conversion from a temporary copy.
                # NOTE(review): presumably get_tempfile writes the input to
                # disk and returns its path -- confirm against tools.files.
                copyfile = get_tempfile(file, filename)
                try:
                    text = await cls._resolve_text(
                        md.convert(
                            copyfile,
                            llm_client=llm_client,
                            llm_model=model
                        )
                    )
                finally:
                    # Always delete the temporary copy, even if conversion fails.
                    os.remove(copyfile)
            if not text:
                # Nothing was produced (None or empty string): skip this item
                # rather than failing on ``None.split``.
                continue
            # Keep only what follows the last audio-transcript marker, if any.
            text = text.split("### Audio Transcript:\n")[-1]
            if text:
                yield text