Update custom model files, README, and requirements
handler.py  CHANGED  (+17 -15)
@@ -16,24 +16,25 @@ except ImportError:

 class EndpointHandler:
     def __init__(self, path: str = ""):
-        # Set environment variables for PyTorch/CUDA (must be before imports/operations)
         import os

-        # Download NLTK data for truecasing (needed by the pipeline)
         import nltk

         nltk.download("punkt_tab", quiet=True)

-        # Enable expandable segments to reduce fragmentation
         os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

-        # Enable TF32 for faster matmul on
+        # Enable TF32 for faster matmul on Ampere+ GPUs (A100, etc.)
+        # Also beneficial for T4 (Turing) which supports TensorFloat-32
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True

         # Set device and dtype
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-
+
+        # Use float16 for better T4 compatibility (bfloat16 not well supported on T4)
+        # T4 has excellent float16 performance with tensor cores
+        self.dtype = torch.float16 if self.device == "cuda" else torch.float32

         # Enable CUDA optimizations
         if torch.cuda.is_available():
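The dtype change in this hunk hard-codes float16 on CUDA because the target T4 lacks solid bfloat16 support. As a rough illustration of that reasoning (a sketch, not part of this commit), the same choice could be derived from the GPU's compute capability: bfloat16 has native tensor-core support only on Ampere (SM 8.0) and newer, while the T4 is Turing (SM 7.5).

```python
import torch

def pick_dtype() -> torch.dtype:
    """Hypothetical helper (illustration only): choose a dtype by GPU generation."""
    if not torch.cuda.is_available():
        return torch.float32
    major, _ = torch.cuda.get_device_capability()
    # Ampere (SM 8.0) and newer handle bfloat16 natively; Turing (T4, SM 7.5) does not,
    # so fall back to float16 there, matching the handler's hard-coded choice.
    return torch.bfloat16 if major >= 8 else torch.float16
```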
@@ -61,21 +62,21 @@ class EndpointHandler:
         )

         # Apply torch.compile if enabled (after model is loaded by pipeline)
-        #
-
-
+        # Use "default" mode for T4 - better compatibility than "reduce-overhead"
+        # "reduce-overhead" is better for A100+ but can be slower on older GPUs
+        if torch.cuda.is_available() and os.getenv("ENABLE_TORCH_COMPILE", "1") == "1":
+            compile_mode = os.getenv("TORCH_COMPILE_MODE", "default")
             self.model = torch.compile(self.model, mode=compile_mode)
-        # Update the pipeline with the compiled model
             self.pipe.model = self.model

-        # Warmup the model
+        # Warmup the model to trigger compilation and optimize kernels
         if torch.cuda.is_available():
             self._warmup()

+
     def _is_flash_attn_available(self):
         """Check if flash attention is available."""
         import importlib.util
-
         return importlib.util.find_spec("flash_attn") is not None

     def _warmup(self):
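The compile step is now gated on two environment variables read in __init__: ENABLE_TORCH_COMPILE (default "1") and TORCH_COMPILE_MODE (default "default"). Below is a minimal sketch of overriding them when deploying; the import path and the path argument value are assumptions, only the variable names and defaults come from the diff.

```python
import os

# Both variables are read inside EndpointHandler.__init__ (see the hunk above),
# so they must be set before the handler is constructed.
os.environ["ENABLE_TORCH_COMPILE"] = "1"      # "0" skips torch.compile entirely
os.environ["TORCH_COMPILE_MODE"] = "default"  # e.g. "reduce-overhead" on A100-class GPUs

from handler import EndpointHandler           # assumes handler.py is importable

handler = EndpointHandler(path=".")           # path="." is a placeholder for the model dir
```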
@@ -85,11 +86,12 @@ class EndpointHandler:
         sample_rate = self.pipe.model.config.audio_sample_rate
         dummy_audio = torch.randn(sample_rate, dtype=torch.float32)

-        #
+        # Run inference to trigger torch.compile and kernel optimization
         with torch.inference_mode():
             warmup_tokens = self.pipe.model.config.inference_warmup_tokens
             _ = self.pipe(
-                {"raw": dummy_audio, "sampling_rate": sample_rate},
+                {"raw": dummy_audio, "sampling_rate": sample_rate},
+                max_new_tokens=warmup_tokens,
             )

         # Force CUDA synchronization to ensure kernels are compiled
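Because _warmup() runs a real forward pass during __init__, the torch.compile cost is paid at startup instead of on the first request. A purely illustrative way to confirm that (the one-second 16 kHz input and the payload shape are assumptions) is to time two consecutive calls and check they are comparable:

```python
import time

import numpy as np
import torch

dummy = {"inputs": np.zeros(16000, dtype=np.float32)}  # assumed: one second of silence

for label in ("first request", "second request"):
    start = time.perf_counter()
    _ = handler(dummy)                 # handler from the previous snippet
    if torch.cuda.is_available():
        torch.cuda.synchronize()       # wait for queued CUDA work before reading the clock
    print(f"{label}: {time.perf_counter() - start:.2f}s")
```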
@@ -107,11 +109,11 @@ class EndpointHandler:
             raise ValueError("Missing 'inputs' in request data")

         params = data.get("parameters", {})
-        max_new_tokens = params.get("max_new_tokens",
+        max_new_tokens = params.get("max_new_tokens", 128)
         num_beams = params.get("num_beams", 1)
         do_sample = params.get("do_sample", False)
         length_penalty = params.get("length_penalty", 1.0)
-        repetition_penalty = params.get("repetition_penalty", 1.
+        repetition_penalty = params.get("repetition_penalty", 1.05)
         no_repeat_ngram_size = params.get("no_repeat_ngram_size", 0)
         early_stopping = params.get("early_stopping", True)
         default_diversity = self.pipe.model.config.inference_diversity_penalty
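The defaults in this version (max_new_tokens 128, repetition_penalty 1.05) apply whenever a request omits the corresponding field. Here is a hypothetical request exercising the full parameter set read in __call__; the audio value and its format are assumptions, only the key names and defaults come from the diff.

```python
import numpy as np

data = {
    "inputs": np.zeros(16000, dtype=np.float32),  # placeholder audio (assumed format)
    "parameters": {
        "max_new_tokens": 128,        # 128 is the default in this commit
        "num_beams": 1,
        "do_sample": False,
        "length_penalty": 1.0,
        "repetition_penalty": 1.05,   # 1.05 is the default in this commit
        "no_repeat_ngram_size": 0,
        "early_stopping": True,
    },
}

result = handler(data)                # handler from the earlier snippet
```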
|