File size: 2,032 Bytes
049b491
 
 
 
 
308e34c
049b491
 
 
 
 
308e34c
049b491
308e34c
 
8ae2e57
 
 
049b491
 
308e34c
049b491
 
 
308e34c
 
 
 
049b491
 
 
 
 
8ae2e57
 
308e34c
8ae2e57
 
308e34c
049b491
8ae2e57
 
 
 
049b491
8ae2e57
308e34c
 
 
049b491
8ae2e57
049b491
 
 
 
308e34c
049b491
8ae2e57
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# configuration_deepseekocr.py
# ------------------------------------------------------------
# Configuration class for the Deepseek-OCR model
# ------------------------------------------------------------
from transformers.utils import logging
from .configuration_deepseek_v2 import DeepseekV2Config

# Module-level logger, obtained through the transformers logging utility
# and keyed by this module's name.
logger = logging.get_logger(__name__)

# Archive map for this config; defined empty here and not referenced
# anywhere else in the visible portion of this file.
DEEPSEEK_OCR_PRETRAINED_CONFIG_ARCHIVE_MAP = {}

class DeepseekOCRConfig(DeepseekV2Config):
    """
    Config for Deepseek-OCR.

    Inherits all language-model fields from DeepseekV2Config
    (hidden_size, hidden_act, attention_bias, etc.) and adds
    OCR / vision specific metadata.

    Args:
        candidate_resolutions: Stored on the config as given. When it is
            ``None`` or empty, ``[[1024, 1024]]`` is stored instead.
        global_view_pos: Stored unchanged (default ``"head"``).
        tile_tag: Stored unchanged (default ``"2D"``).
        projector_config: Stored unchanged for the modeling code.
        vision_config: Stored unchanged for the modeling code.
        language_config: Optional nested dict of language-model
            parameters. When it is a dict, its entries are forwarded to
            ``DeepseekV2Config.__init__`` together with ``kwargs``;
            a key present in both takes its value from ``kwargs``.
            Keys that identify the nested language model itself
            (``model_type``, ``architectures``, ``auto_map``) are NOT
            forwarded. The original object is always kept, unmodified,
            as ``self.language_config``.
        **kwargs: Forwarded to ``DeepseekV2Config.__init__``.
    """

    model_type = "deepseekocr"

    def __init__(
        self,
        # OCR / vision specific
        candidate_resolutions=None,
        global_view_pos="head",
        tile_tag="2D",
        projector_config=None,
        vision_config=None,
        language_config=None,
        **kwargs,
    ):
        # If a nested language_config dict is provided in config.json,
        # merge it into kwargs so DeepseekV2Config sees all LM params.
        if isinstance(language_config, dict):
            # These keys describe the nested language model, not this
            # OCR config. The transformers base config stores unknown
            # keyword arguments as instance attributes, so forwarding
            # them would shadow this class's own ``model_type`` and
            # record the language model's architectures / auto_map on
            # the OCR config whenever the caller did not pass them at
            # top level.
            nested_identity_keys = ("model_type", "architectures", "auto_map")
            merged = {
                key: value
                for key, value in language_config.items()
                if key not in nested_identity_keys
            }
            merged.update(kwargs)  # top-level overrides nested
            kwargs = merged

        # Let DeepseekV2Config handle all core model parameters.
        # NOTE: we do NOT pass torch_dtype explicitly here, it will be
        # picked from kwargs if present, so no "multiple values" error.
        super().__init__(**kwargs)

        # Store OCR-specific attributes
        self.candidate_resolutions = candidate_resolutions or [[1024, 1024]]
        self.global_view_pos = global_view_pos
        self.tile_tag = tile_tag

        # Keep sub-configs around for the modeling code. language_config
        # is stored exactly as received (identity keys included), so the
        # nested description is not lost.
        self.projector_config = projector_config
        self.vision_config = vision_config
        self.language_config = language_config

        logger.info("✅ DeepseekOCRConfig initialized (inherits DeepseekV2Config).")