Update custom model files, README, and requirements
Browse files
- asr_modeling.py +0 -8
- asr_pipeline.py +0 -4
- requirements.txt +5 -9
asr_modeling.py
CHANGED
|
@@ -629,8 +629,6 @@ class ASRModel(PreTrainedModel):
|
|
| 629 |
if audio_inputs is None:
|
| 630 |
raise ValueError("input_values or input_features must be provided for generation")
|
| 631 |
|
| 632 |
-
# Debug: Check audio inputs
|
| 633 |
-
|
| 634 |
audio_embeds = self._encode_audio(audio_inputs)
|
| 635 |
batch_size = audio_embeds.shape[0]
|
| 636 |
device = audio_embeds.device
|
|
@@ -673,12 +671,6 @@ class ASRModel(PreTrainedModel):
|
|
| 673 |
|
| 674 |
num_audio_tokens = audio_embeds.shape[1]
|
| 675 |
expanded_prompt_ids = self._expand_audio_tokens(prompt_ids, num_audio_tokens)
|
| 676 |
-
|
| 677 |
-
# Debug: Show what prompt we built
|
| 678 |
-
import sys
|
| 679 |
-
prompt_text = self.tokenizer.decode(expanded_prompt_ids[0], skip_special_tokens=False)
|
| 680 |
-
print(f"DEBUG generate: Built prompt: {prompt_text[:200]}", file=sys.stderr)
|
| 681 |
-
|
| 682 |
inputs_embeds = self._prepare_audio_inputs_embeds(expanded_prompt_ids, audio_embeds)
|
| 683 |
total_seq_len = inputs_embeds.shape[1]
|
| 684 |
attention_mask = torch.ones(batch_size, total_seq_len, dtype=torch.long, device=device)
|
|
|
|
| 629 |
if audio_inputs is None:
|
| 630 |
raise ValueError("input_values or input_features must be provided for generation")
|
| 631 |
|
|
|
|
|
|
|
| 632 |
audio_embeds = self._encode_audio(audio_inputs)
|
| 633 |
batch_size = audio_embeds.shape[0]
|
| 634 |
device = audio_embeds.device
|
|
|
|
| 671 |
|
| 672 |
num_audio_tokens = audio_embeds.shape[1]
|
| 673 |
expanded_prompt_ids = self._expand_audio_tokens(prompt_ids, num_audio_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
inputs_embeds = self._prepare_audio_inputs_embeds(expanded_prompt_ids, audio_embeds)
|
| 675 |
total_seq_len = inputs_embeds.shape[1]
|
| 676 |
attention_mask = torch.ones(batch_size, total_seq_len, dtype=torch.long, device=device)
|
asr_pipeline.py
CHANGED
|
@@ -219,10 +219,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
|
|
| 219 |
generate_kwargs.setdefault("eos_token_id", im_end_id)
|
| 220 |
generate_kwargs.setdefault("max_new_tokens", self.model.config.max_new_tokens)
|
| 221 |
|
| 222 |
-
# Debug: Log what we're passing to generate
|
| 223 |
-
import sys
|
| 224 |
-
print(f"DEBUG _forward: task={task}, system_prompt={self.model.config.system_prompt}", file=sys.stderr)
|
| 225 |
-
|
| 226 |
# Pass the appropriate input type to generate
|
| 227 |
if is_whisper:
|
| 228 |
# Whisper model - use input_features
|
|
|
|
| 219 |
generate_kwargs.setdefault("eos_token_id", im_end_id)
|
| 220 |
generate_kwargs.setdefault("max_new_tokens", self.model.config.max_new_tokens)
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
# Pass the appropriate input type to generate
|
| 223 |
if is_whisper:
|
| 224 |
# Whisper model - use input_features
|
requirements.txt
CHANGED
|
@@ -1,9 +1,5 @@
|
|
| 1 |
-
#
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
torchcodec
|
| 7 |
-
peft
|
| 8 |
-
truecase
|
| 9 |
-
nltk
|
|
|
|
| 1 |
+
# Core dependencies for tiny-audio model inference
|
| 2 |
+
# This file is pushed to HuggingFace for model repository
|
| 3 |
+
|
| 4 |
+
# Transformers - main library for model loading and inference
|
| 5 |
+
transformers>=4.57.0
|
|
|
|
|
|
|
|
|
|
|
|