PaddleOCR-VL-MLX / configuration_paddleocr_vl.py
gamhtoi's picture
Upload PaddleOCR-VL-MLX - MLX optimized for Apple Silicon
d48a40f verified
# PaddleOCR-VL Configuration for MLX
from transformers import PretrainedConfig
class PaddleOCRVLConfig(PretrainedConfig):
    """
    Configuration class for PaddleOCR-VL MLX model.

    This is the configuration class to store the configuration of a
    PaddleOCRVLForConditionalGeneration. It is used to instantiate a
    PaddleOCR-VL model according to the specified arguments.

    Every named argument is stored on the instance under an attribute of the
    same name. Any extra keyword arguments are forwarded unchanged to
    ``PretrainedConfig.__init__``.

    Args:
        vision_config: Vision configuration. ``None`` (or any other falsy
            value) is stored as an empty dict.
        hidden_size, intermediate_size, num_hidden_layers,
        num_attention_heads, num_key_value_heads, head_dim, hidden_act,
        max_position_embeddings, rms_norm_eps, rope_theta,
        rope_is_neox_style, use_3d_rope: Language model settings.
        vocab_size, image_token_id, vision_start_token_id,
        vision_end_token_id, video_token_id: Vocabulary size and special
            token ids.
        tie_word_embeddings: Forwarded to the base class, which stores it
            (see the note in ``__init__``).
        use_cache, use_flash_attention, attention_probs_dropout_prob,
        hidden_dropout_prob: Other settings, stored as given.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "paddleocr_vl"

    def __init__(
        self,
        # Vision config
        vision_config=None,
        # Language model config
        hidden_size=1024,
        intermediate_size=3072,
        num_hidden_layers=18,
        num_attention_heads=16,
        num_key_value_heads=2,
        head_dim=128,
        hidden_act="silu",
        max_position_embeddings=131072,
        rms_norm_eps=1e-5,
        rope_theta=500000,
        rope_is_neox_style=True,
        use_3d_rope=True,
        # Special tokens
        vocab_size=103424,
        image_token_id=100295,
        vision_start_token_id=101305,
        vision_end_token_id=101306,
        video_token_id=101307,
        # Other
        tie_word_embeddings=False,
        use_cache=False,
        use_flash_attention=False,
        attention_probs_dropout_prob=0.0,
        hidden_dropout_prob=0.0,
        **kwargs
    ):
        # Vision config: fall back to a fresh dict so callers never share one.
        self.vision_config = vision_config or {}

        # Language model config
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.rope_is_neox_style = rope_is_neox_style
        self.use_3d_rope = use_3d_rope

        # Special tokens
        self.vocab_size = vocab_size
        self.image_token_id = image_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        self.video_token_id = video_token_id

        # Other
        self.use_cache = use_cache
        self.use_flash_attention = use_flash_attention
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.hidden_dropout_prob = hidden_dropout_prob

        # tie_word_embeddings must go THROUGH the base initializer rather
        # than being assigned before it: PretrainedConfig.__init__ sets this
        # attribute itself (defaulting to True when it is not passed), which
        # would silently overwrite the value chosen here.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)