import torch
from typing import List, Optional
from transformers import PretrainedConfig


class PathummaAudioConfig(PretrainedConfig):
    """Configuration for PathummaAudioModel.

    Stores the checkpoint paths of the underlying LLM, Whisper, and BEATs
    encoders, together with the LoRA and Q-Former hyperparameters used to
    assemble the audio-language model.
    """

    model_type: str = "pathumma_audio"

    def __init__(
        self,
        llm_path: str = "Qwen/Qwen2-7B-Instruct",
        whisper_path: str = "nectec/Pathumma-whisper-th-large-v3",
        beats_path: str = "",
        init_from_scratch: bool = True,
        # LoRA adapter hyperparameters
        lora: bool = True,
        lora_infer_mode: bool = True,
        lora_rank: int = 8,
        lora_alpha: int = 32,
        lora_dropout: float = 0.1,
        target_modules: Optional[List[str]] = None,
        # Q-Former / audio windowing hyperparameters
        qformer_query_token: int = 1,
        qformer_hidden_layers: int = 2,
        second_per_window: float = 0.333333,
        second_stride: float = 0.333333,
        torch_dtype: torch.dtype = torch.bfloat16,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Expose the custom classes to the Auto* loaders (trust_remote_code).
        self.architectures = kwargs.get("architectures", ["PathummaAudioModel"])
        self.auto_map = kwargs.get("auto_map", {
            "AutoConfig": "configuration_pathumma_audio.PathummaAudioConfig",
            "AutoModel": "modeling_pathumma_audio.PathummaAudioModel",
        })

        # Checkpoint paths for the component models.
        self.llm_path = llm_path
        self.whisper_path = whisper_path
        self.beats_path = beats_path
        self.init_from_scratch = init_from_scratch

        # LoRA adapter hyperparameters.
        self.lora = lora
        self.lora_infer_mode = lora_infer_mode
        self.lora_rank = lora_rank
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout
        # Avoid a mutable default argument; fall back to the usual
        # attention projections when no modules are given.
        self.target_modules = (
            target_modules if target_modules is not None else ["q_proj", "v_proj"]
        )

        # Q-Former / audio windowing hyperparameters.
        self.qformer_query_token = qformer_query_token
        self.qformer_hidden_layers = qformer_hidden_layers
        self.second_per_window = second_per_window
        self.second_stride = second_stride

        self.torch_dtype = torch_dtype
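

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the shipped config):
# it round-trips the config through save_pretrained / from_pretrained. The
# local directory name below is a hypothetical path chosen for this example.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    config = PathummaAudioConfig(lora_rank=16)  # any field can be overridden
    config.save_pretrained("./pathumma_audio_config")

    reloaded = PathummaAudioConfig.from_pretrained("./pathumma_audio_config")
    print(reloaded.model_type)  # -> "pathumma_audio"
    print(reloaded.lora_rank)   # -> 16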