Training in progress - step 500
- asr_modeling.py +18 -0
- config.json +6 -0
asr_modeling.py CHANGED

@@ -38,6 +38,8 @@ class ASRModel(PreTrainedModel, GenerationMixin):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         """Load model from pretrained, handling device placement correctly."""
+        from pathlib import Path
+
         from safetensors.torch import load_file
         from transformers.utils.hub import cached_file

@@ -72,6 +74,22 @@ class ASRModel(PreTrainedModel, GenerationMixin):
             state_dict = load_file(model_file)
             model.load_state_dict(state_dict, strict=False)

+            # Load LoRA adapter if present
+            adapter_config = cached_file(
+                pretrained_model_name_or_path,
+                "adapter_config.json",
+                _raise_exceptions_for_missing_entries=False,
+                **cache_kwargs,
+            )
+            if adapter_config is not None:
+                from peft import PeftModel
+
+                # Get adapter directory (parent of adapter_config.json)
+                adapter_path = Path(adapter_config).parent
+                model.language_model = PeftModel.from_pretrained(
+                    model.language_model, adapter_path, is_trainable=False
+                )
+
             return model
         finally:
             cls._is_loading_from_pretrained = False
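The net effect of this change: from_pretrained now probes the checkpoint for an adapter_config.json (without raising when it is absent) and, when one is found, wraps model.language_model in a PeftModel with the adapter weights attached but frozen. A minimal usage sketch, assuming a hypothetical checkpoint id ("your-user/your-asr-checkpoint" is a placeholder, not this repo's actual path):

from asr_modeling import ASRModel

# from_pretrained resolves the base safetensors weights first, then attaches
# the LoRA adapter because the checkpoint ships an adapter_config.json.
model = ASRModel.from_pretrained("your-user/your-asr-checkpoint")

# For pure inference the adapter can be folded into the base weights;
# merge_and_unload() is peft's standard LoRA merge call.
model.language_model = model.language_model.merge_and_unload()
model.eval()

Merging is optional; loading alone already yields a working model, since is_trainable=False keeps the adapter in inference mode.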
config.json CHANGED

@@ -161,6 +161,10 @@
   "label_smoothing": 0.0,
   "length_penalty": 1.0,
   "llm_dim": 2048,
+  "lora_alpha": 32,
+  "lora_dropout": 0.0,
+  "lora_r": 32,
+  "lora_target_modules": "all-linear",
   "max_new_tokens": 96,
   "model_dtype": "bfloat16",
   "model_type": "asr_model",
@@ -169,6 +173,7 @@
   "num_experts": 4,
   "num_experts_per_tok": 2,
   "pipeline_tag": "automatic-speech-recognition",
+  "pretrained_model_path": "mazesmazes/tiny-audio-glm",
   "projector_dropout": 0.0,
   "projector_hidden_dim": null,
   "projector_init_std": 0.02,
@@ -249,6 +254,7 @@
   "text_model_id": "Qwen/Qwen3-1.7B",
   "transformers_version": "5.0.0.dev0",
   "use_cache": false,
+  "use_lora": true,
   "use_specaugment": true,
   "user_prompt": "Please transcribe this English audio into text: <audio>",
   "vocab_size": 151670
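The four new lora_* keys mirror peft's LoraConfig fields, and with lora_alpha equal to lora_r the LoRA scaling factor alpha/r comes out to 1.0. A hedged sketch of how a training script might turn these values into an adapter follows; the mapping is an assumption, only the key names and values come from config.json, and "Qwen/Qwen3-1.7B" is the text_model_id listed above:

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base_lm = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B")

lora_config = LoraConfig(
    r=32,                         # "lora_r"
    lora_alpha=32,                # "lora_alpha"
    lora_dropout=0.0,             # "lora_dropout"
    target_modules="all-linear",  # "lora_target_modules": adapt every linear layer
    task_type="CAUSAL_LM",        # assumption: the LM is trained causally
)

# Only the injected low-rank matrices are trainable; saving this wrapped model
# writes the adapter_config.json that the new from_pretrained path looks for.
lm = get_peft_model(base_lm, lora_config)
lm.print_trainable_parameters()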