AlexWortega committed on
Commit
d95e2f8
·
verified ·
1 Parent(s): 8be0d09

Upload processing_borealis.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. processing_borealis.py +12 -2
processing_borealis.py CHANGED
@@ -51,6 +51,7 @@ class BorealisProcessor(ProcessorMixin):
51
  self,
52
  text: Optional[Union[str, List[str]]] = None,
53
  audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
 
54
  sampling_rate: Optional[int] = None,
55
  return_tensors: Optional[str] = "pt",
56
  **kwargs,
@@ -61,12 +62,17 @@ class BorealisProcessor(ProcessorMixin):
61
  Args:
62
  text: Text prompt(s)
63
  audio: Audio waveform(s) at 16kHz
 
64
  sampling_rate: Audio sampling rate (default: 16000)
65
  return_tensors: Return tensor type
66
 
67
  Returns:
68
  BatchFeature with input_ids and optionally input_features
69
  """
 
 
 
 
70
  if sampling_rate is None:
71
  sampling_rate = self.sampling_rate
72
 
@@ -74,7 +80,7 @@ class BorealisProcessor(ProcessorMixin):
74
 
75
  # Process audio if provided
76
  if audio is not None:
77
- if isinstance(audio, torch.Tensor):
78
  audio = [audio]
79
 
80
  # Convert to numpy for feature extractor
@@ -96,11 +102,15 @@ class BorealisProcessor(ProcessorMixin):
96
  if isinstance(text, str):
97
  text = [text]
98
 
 
 
 
 
99
  text_inputs = self.tokenizer(
100
  text,
101
  return_tensors=return_tensors,
102
  padding=True,
103
- **kwargs,
104
  )
105
  data["input_ids"] = text_inputs.input_ids
106
  if "attention_mask" in text_inputs:
 
51
  self,
52
  text: Optional[Union[str, List[str]]] = None,
53
  audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
54
+ audios: Optional[List] = None, # vLLM uses plural
55
  sampling_rate: Optional[int] = None,
56
  return_tensors: Optional[str] = "pt",
57
  **kwargs,
 
62
  Args:
63
  text: Text prompt(s)
64
  audio: Audio waveform(s) at 16kHz
65
+ audios: Audio waveform(s) at 16kHz (vLLM style)
66
  sampling_rate: Audio sampling rate (default: 16000)
67
  return_tensors: Return tensor type
68
 
69
  Returns:
70
  BatchFeature with input_ids and optionally input_features
71
  """
72
+ # vLLM uses 'audios' (plural)
73
+ if audios is not None and audio is None:
74
+ audio = audios
75
+
76
  if sampling_rate is None:
77
  sampling_rate = self.sampling_rate
78
 
 
80
 
81
  # Process audio if provided
82
  if audio is not None:
83
+ if not isinstance(audio, list):
84
  audio = [audio]
85
 
86
  # Convert to numpy for feature extractor
 
102
  if isinstance(text, str):
103
  text = [text]
104
 
105
+ # Filter out kwargs that tokenizer doesn't accept
106
+ tok_kwargs = {k: v for k, v in kwargs.items()
107
+ if k in ['padding', 'truncation', 'max_length', 'add_special_tokens']}
108
+
109
  text_inputs = self.tokenizer(
110
  text,
111
  return_tensors=return_tensors,
112
  padding=True,
113
+ **tok_kwargs,
114
  )
115
  data["input_ids"] = text_inputs.input_ids
116
  if "attention_mask" in text_inputs: