Upload processing_borealis.py with huggingface_hub
Browse files
- processing_borealis.py +12 -2
processing_borealis.py
CHANGED
|
@@ -51,6 +51,7 @@ class BorealisProcessor(ProcessorMixin):
|
|
| 51 |
self,
|
| 52 |
text: Optional[Union[str, List[str]]] = None,
|
| 53 |
audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
|
|
|
|
| 54 |
sampling_rate: Optional[int] = None,
|
| 55 |
return_tensors: Optional[str] = "pt",
|
| 56 |
**kwargs,
|
|
@@ -61,12 +62,17 @@ class BorealisProcessor(ProcessorMixin):
|
|
| 61 |
Args:
|
| 62 |
text: Text prompt(s)
|
| 63 |
audio: Audio waveform(s) at 16kHz
|
|
|
|
| 64 |
sampling_rate: Audio sampling rate (default: 16000)
|
| 65 |
return_tensors: Return tensor type
|
| 66 |
|
| 67 |
Returns:
|
| 68 |
BatchFeature with input_ids and optionally input_features
|
| 69 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
if sampling_rate is None:
|
| 71 |
sampling_rate = self.sampling_rate
|
| 72 |
|
|
@@ -74,7 +80,7 @@ class BorealisProcessor(ProcessorMixin):
|
|
| 74 |
|
| 75 |
# Process audio if provided
|
| 76 |
if audio is not None:
|
| 77 |
-
if isinstance(audio, torch.Tensor):
|
| 78 |
audio = [audio]
|
| 79 |
|
| 80 |
# Convert to numpy for feature extractor
|
|
@@ -96,11 +102,15 @@ class BorealisProcessor(ProcessorMixin):
|
|
| 96 |
if isinstance(text, str):
|
| 97 |
text = [text]
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
text_inputs = self.tokenizer(
|
| 100 |
text,
|
| 101 |
return_tensors=return_tensors,
|
| 102 |
padding=True,
|
| 103 |
-
**kwargs,
|
| 104 |
)
|
| 105 |
data["input_ids"] = text_inputs.input_ids
|
| 106 |
if "attention_mask" in text_inputs:
|
|
|
|
| 51 |
self,
|
| 52 |
text: Optional[Union[str, List[str]]] = None,
|
| 53 |
audio: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
|
| 54 |
+
audios: Optional[List] = None, # vLLM uses plural
|
| 55 |
sampling_rate: Optional[int] = None,
|
| 56 |
return_tensors: Optional[str] = "pt",
|
| 57 |
**kwargs,
|
|
|
|
| 62 |
Args:
|
| 63 |
text: Text prompt(s)
|
| 64 |
audio: Audio waveform(s) at 16kHz
|
| 65 |
+
audios: Audio waveform(s) at 16kHz (vLLM style)
|
| 66 |
sampling_rate: Audio sampling rate (default: 16000)
|
| 67 |
return_tensors: Return tensor type
|
| 68 |
|
| 69 |
Returns:
|
| 70 |
BatchFeature with input_ids and optionally input_features
|
| 71 |
"""
|
| 72 |
+
# vLLM uses 'audios' (plural)
|
| 73 |
+
if audios is not None and audio is None:
|
| 74 |
+
audio = audios
|
| 75 |
+
|
| 76 |
if sampling_rate is None:
|
| 77 |
sampling_rate = self.sampling_rate
|
| 78 |
|
|
|
|
| 80 |
|
| 81 |
# Process audio if provided
|
| 82 |
if audio is not None:
|
| 83 |
+
if not isinstance(audio, list):
|
| 84 |
audio = [audio]
|
| 85 |
|
| 86 |
# Convert to numpy for feature extractor
|
|
|
|
| 102 |
if isinstance(text, str):
|
| 103 |
text = [text]
|
| 104 |
|
| 105 |
+
# Filter out kwargs that tokenizer doesn't accept
|
| 106 |
+
tok_kwargs = {k: v for k, v in kwargs.items()
|
| 107 |
+
if k in ['padding', 'truncation', 'max_length', 'add_special_tokens']}
|
| 108 |
+
|
| 109 |
text_inputs = self.tokenizer(
|
| 110 |
text,
|
| 111 |
return_tensors=return_tensors,
|
| 112 |
padding=True,
|
| 113 |
+
**tok_kwargs,
|
| 114 |
)
|
| 115 |
data["input_ids"] = text_inputs.input_ids
|
| 116 |
if "attention_mask" in text_inputs:
|