#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import os
from typing import Union
from transformers import PretrainedConfig, Qwen3Config, WhisperConfig
from transformers.utils import logging
from .configuration_minicpmtts import MiniCPMTTSConfig
from .modeling_navit_siglip import SiglipVisionConfig

logger = logging.get_logger(__name__)

class MiniCPMVSliceConfig(PretrainedConfig):
    """Configuration for MiniCPM-V style image slicing: ViT patch size, maximum
    number of slices per image, and the target scale resolution."""

    model_type = "minicpmv"

    def __init__(
self,
patch_size=14,
max_slice_nums=9,
scale_resolution=448,
**kwargs,
):
super().__init__(**kwargs)
self.patch_size = patch_size
self.max_slice_nums = max_slice_nums
self.scale_resolution = scale_resolution

    @classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
cls._set_token_in_kwargs(kwargs)
config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
if config_dict.get("model_type") == "minicpmv":
config_dict = config_dict["slice_config"]
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
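
# Illustrative sketch, not part of the original checkpoint code: building a slice
# config directly from keyword arguments. The argument values below are example
# choices, not values required by the model. Note that from_pretrained above can
# also pull the nested "slice_config" block out of a full "minicpmv" config dict.
def _example_slice_config():
    slice_cfg = MiniCPMVSliceConfig(patch_size=14, max_slice_nums=4, scale_resolution=448)
    # to_dict() comes from PretrainedConfig and yields a JSON-serializable mapping
    # containing patch_size, max_slice_nums and scale_resolution.
    return slice_cfg.to_dict()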

class MiniCPMODuplexConfig(PretrainedConfig):
    """Configuration class for MiniCPMODuplex."""

    model_type = "minicpmo_duplex"

    def __init__(
self,
# duplex init params
generate_audio: bool = True,
ls_mode: str = "explicit",
# llm generation config
max_new_speak_tokens_per_chunk: int = 20,
text_repetition_penalty: float = 1.05,
temperature: float = 0.7,
top_k: int = 20,
top_p: float = 0.8,
text_repetition_window_size: int = 512,
listen_prob_scale: float = 1.0,
# tts generation config
tts_temperature: float = 0.8,
tts_repetition_penalty: float = 1.05,
# stream config
chunk_ms: int = 1000,
first_chunk_ms: int = 1035,
cnn_redundancy_ms: int = 20,
sample_rate: int = 16000,
# attn implementation
attn_implementation: str = "flash_attention_2",
# sliding window config
sliding_window_mode: str = "off", # "off" / "basic" / "context"
basic_window_high_tokens: int = 8000,
basic_window_low_tokens: int = 4000,
context_previous_max_tokens: int = 500,
context_max_units: int = 24,
**kwargs,
):
super().__init__(**kwargs)
self.generate_audio = generate_audio
self.ls_mode = ls_mode
self.max_new_speak_tokens_per_chunk = max_new_speak_tokens_per_chunk
self.text_repetition_penalty = text_repetition_penalty
self.temperature = temperature
self.top_k = top_k
self.top_p = top_p
self.text_repetition_window_size = text_repetition_window_size
self.listen_prob_scale = listen_prob_scale
self.tts_temperature = tts_temperature
self.tts_repetition_penalty = tts_repetition_penalty
self.chunk_ms = chunk_ms
self.first_chunk_ms = first_chunk_ms
self.cnn_redundancy_ms = cnn_redundancy_ms
self.sample_rate = sample_rate
self.attn_implementation = attn_implementation
# sliding window
self.sliding_window_mode = sliding_window_mode
self.basic_window_high_tokens = basic_window_high_tokens
self.basic_window_low_tokens = basic_window_low_tokens
self.context_previous_max_tokens = context_previous_max_tokens
self.context_max_units = context_max_units

    @classmethod
def from_pretrained(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> "MiniCPMODuplexConfig":
config_file = os.path.join(pretrained_model_name_or_path, "duplex_config.json")
if os.path.exists(config_file):
with open(config_file, "r", encoding="utf-8") as f:
config_dict = json.load(f)
# Override with any kwargs provided
config_dict.update(kwargs)
return cls(**config_dict)
else:
# Return default config if duplex_config.json doesn't exist
logger.info(
f"duplex_config.json not found at {pretrained_model_name_or_path}, using default MiniCPMODuplexConfig"
)
return cls(**kwargs)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
os.makedirs(save_directory, exist_ok=True)
config_file = os.path.join(save_directory, "duplex_config.json")
with open(config_file, "w", encoding="utf-8") as f:
json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
logger.info(f"Duplex configuration saved to {config_file}")

class MiniCPMOConfig(Qwen3Config):
    """Configuration for MiniCPM-o: extends Qwen3Config with vision (SigLIP),
    audio (Whisper) and TTS sub-configurations plus image slicing options."""

    model_type = "minicpmo"
    keys_to_ignore_at_inference = ["past_key_values"]

    default_vision_config = {
"hidden_size": 1152,
"image_size": 980,
"intermediate_size": 4304,
"model_type": "siglip",
"num_attention_heads": 16,
"num_hidden_layers": 27,
"patch_size": 14,
}

    def __init__(
self,
use_cache=True,
query_num=64,
image_size=448,
drop_vision_last_layer=True,
batch_vision_input=True,
slice_config=None,
vision_config=None,
audio_config=None,
tts_config=None,
use_image_id=True,
vision_batch_size=16,
audio_pool_step=5,
audio_chunk_length=1.0,
stream_input=False,
listen_speak_type="asr",
init_vision=True,
init_audio=True,
init_tts=True,
**kwargs,
):
self.use_cache = use_cache
self.query_num = query_num
self.image_size = image_size
self.drop_vision_last_layer = drop_vision_last_layer
self.batch_vision_input = batch_vision_input
self.use_image_id = use_image_id
self.vision_batch_size = vision_batch_size
self.audio_pool_step = audio_pool_step
self.audio_chunk_length = audio_chunk_length
self.stream_input = stream_input
self.listen_speak_type = listen_speak_type
self.init_vision = init_vision
self.init_audio = init_audio
self.init_tts = init_tts
if slice_config is None:
self.slice_config = MiniCPMVSliceConfig(max_slice_nums=1)
else:
self.slice_config = MiniCPMVSliceConfig(**slice_config)
self.slice_mode = True
        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added
if vision_config is None:
self.vision_config = SiglipVisionConfig(**self.default_vision_config)
logger.info("vision_config is None, using default vision config")
elif isinstance(vision_config, dict):
self.vision_config = SiglipVisionConfig(**vision_config)
elif isinstance(vision_config, SiglipVisionConfig):
self.vision_config = vision_config
if audio_config is None:
self.audio_config = WhisperConfig()
elif isinstance(audio_config, dict):
self.audio_config = WhisperConfig(**audio_config)
elif isinstance(audio_config, WhisperConfig):
self.audio_config = audio_config
if tts_config is None:
self.tts_config = MiniCPMTTSConfig()
elif isinstance(tts_config, dict):
self.tts_config = MiniCPMTTSConfig(**tts_config)
elif isinstance(tts_config, MiniCPMTTSConfig):
self.tts_config = tts_config
self.patch_size = self.vision_config.patch_size
super().__init__(**kwargs)
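
# Illustrative sketch, not part of the original checkpoint code: the vision, audio,
# TTS, and slice sub-configs can each be given as plain dicts or as config objects;
# leaving them as None falls back to the defaults above. The audio values shown are
# example numbers, not the ones shipped with the checkpoint.
def _example_minicpmo_config():
    cfg = MiniCPMOConfig(
        slice_config={"max_slice_nums": 4},
        vision_config=None,  # falls back to default_vision_config (SigLIP, patch_size=14)
        audio_config={"d_model": 1280, "encoder_layers": 32},  # example Whisper encoder sizes
        query_num=64,
    )
    # patch_size is mirrored from the vision sub-config onto the top-level config.
    return cfg.patch_size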