File size: 1,954 Bytes
cf450f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import logging
import os

from .providers import BaseConverter
from .providers.docling import DoclingConverter
from .providers.kreuzberg_converter import KreuzbergConverter
from .providers.markdown_converter import MarkdownConverter

logger = logging.getLogger(__name__)


class ConvertersFactory:
    """Choose which document converters to try for a given content type.

    The choice is driven by two environment variables, read on every call:

    * ``KG_PDF_CONVERTER`` -- ``docling`` (the default), ``kreuzberg`` or
      ``auto``. ``auto`` currently selects the same order as ``docling``.
      Any other value is reported with a warning and treated as
      ``docling``.
    * ``KG_PDF_FALLBACK`` -- when set to ``1``, ``true``, ``yes`` or ``on``
      (case-insensitive), the non-preferred converter is appended after
      the preferred one. Defaults to disabled.
    """

    @staticmethod
    def get_converters(
        content_type: str, embedding_model: str
    ) -> list[BaseConverter]:
        """Return the converters to try, in priority order.

        Args:
            content_type: Content type of the document; passed to each
                converter's ``supports_content_type`` check and to the
                ``KreuzbergConverter`` constructor.
            embedding_model: Passed to the ``DoclingConverter`` constructor.

        Returns:
            A non-empty list of converter instances. If
            ``MarkdownConverter`` supports ``content_type`` it is the only
            entry; otherwise the list holds the preferred converter and,
            when fallback is enabled, the other one -- each only if it
            supports ``content_type``.

        Raises:
            ValueError: If no selected converter supports ``content_type``.
        """
        # Strip before comparing so values with stray whitespace (common in
        # .env files) are not silently misread.
        preferred = os.getenv("KG_PDF_CONVERTER", "docling").strip().lower()
        fallback_flag = os.getenv("KG_PDF_FALLBACK", "false").strip().lower()
        fallback_enabled = fallback_flag in {"1", "true", "yes", "on"}

        # Markdown short-circuits the configurable chain entirely.
        if MarkdownConverter.supports_content_type(content_type):
            return [MarkdownConverter()]

        if preferred not in {"docling", "kreuzberg", "auto"}:
            # Keep the historical default, but make the misconfiguration
            # visible instead of silently ignoring it.
            logger.warning(
                "Unrecognized KG_PDF_CONVERTER value %r; using 'docling'",
                preferred,
            )

        # (class, factory) pairs; factories defer construction until the
        # class has confirmed it supports the content type.
        docling = (
            DoclingConverter,
            lambda: DoclingConverter(embedding_model),
        )
        kreuzberg = (
            KreuzbergConverter,
            lambda: KreuzbergConverter(content_type),
        )

        # Only "kreuzberg" changes the order; "docling", "auto" and unknown
        # values all put Docling first.
        if preferred == "kreuzberg":
            candidates = [kreuzberg, docling]
        else:
            candidates = [docling, kreuzberg]

        if not fallback_enabled:
            # Without fallback only the preferred converter is considered.
            candidates = candidates[:1]

        converters: list[BaseConverter] = [
            build()
            for converter_cls, build in candidates
            if converter_cls.supports_content_type(content_type)
        ]

        if not converters:
            raise ValueError(f"Unsupported content type: {content_type}")

        return converters