mazesmazes committed
Commit d29757e · verified · 1 parent: 3b25064

Update custom model files, README, and requirements

Files changed (1): handler.py (+17 -15)
handler.py CHANGED
@@ -16,24 +16,25 @@ except ImportError:
 
 class EndpointHandler:
     def __init__(self, path: str = ""):
-        # Set environment variables for PyTorch/CUDA (must be before imports/operations)
         import os
 
-        # Download NLTK data for truecasing (needed by the pipeline)
         import nltk
 
         nltk.download("punkt_tab", quiet=True)
 
-        # Enable expandable segments to reduce fragmentation
         os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
 
-        # Enable TF32 for faster matmul on A40/A100
+        # Enable TF32 for faster matmul on Ampere+ GPUs (A100, etc.)
+        # Harmless no-op on pre-Ampere GPUs such as the T4 (Turing), which lacks TF32
         torch.backends.cuda.matmul.allow_tf32 = True
         torch.backends.cudnn.allow_tf32 = True
 
         # Set device and dtype
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
+
+        # Use float16 for better T4 compatibility (bfloat16 not well supported on T4)
+        # T4 has excellent float16 performance with tensor cores
+        self.dtype = torch.float16 if self.device == "cuda" else torch.float32
 
         # Enable CUDA optimizations
         if torch.cuda.is_available():
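Note on the dtype change: bfloat16 has no native support on Turing (compute capability 7.5), so on a T4 it falls back to slow emulation, while float16 runs on the tensor cores. A deployment that must serve both T4 and A100 hardware could probe the device instead of hard-coding float16. A minimal sketch of that idea (not part of this commit; `pick_dtype` is a hypothetical helper):

```python
import torch

def pick_dtype() -> torch.dtype:
    """Prefer bf16 on Ampere+ (compute capability >= 8.0), fp16 on older
    CUDA GPUs such as the T4, and fp32 on CPU."""
    if not torch.cuda.is_available():
        return torch.float32
    major, _ = torch.cuda.get_device_capability()
    return torch.bfloat16 if major >= 8 else torch.float16
```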
@@ -61,21 +62,21 @@ class EndpointHandler:
         )
 
         # Apply torch.compile if enabled (after model is loaded by pipeline)
-        # Enable by default for significant speedup (20-40%)
-        if torch.cuda.is_available():
-            compile_mode = os.getenv("TORCH_COMPILE_MODE", "reduce-overhead")
+        # Use "default" mode for T4 - better compatibility than "reduce-overhead"
+        # "reduce-overhead" is better for A100+ but can be slower on older GPUs
+        if torch.cuda.is_available() and os.getenv("ENABLE_TORCH_COMPILE", "1") == "1":
+            compile_mode = os.getenv("TORCH_COMPILE_MODE", "default")
             self.model = torch.compile(self.model, mode=compile_mode)
-            # Update the pipeline with the compiled model
             self.pipe.model = self.model
 
-        # Warmup the model
+        # Warmup the model to trigger compilation and optimize kernels
         if torch.cuda.is_available():
             self._warmup()
 
+
     def _is_flash_attn_available(self):
         """Check if flash attention is available."""
         import importlib.util
-
         return importlib.util.find_spec("flash_attn") is not None
 
     def _warmup(self):
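The compile block is now gated twice: `ENABLE_TORCH_COMPILE` (default on) and CUDA availability, with the mode overridable via `TORCH_COMPILE_MODE`, so operators on A100+ hardware can restore `reduce-overhead` without a code change. A still more defensive variant, sketched below under the assumption that eager execution is an acceptable fallback (`maybe_compile` is hypothetical, not in this commit), would also survive a compilation failure at startup:

```python
import os
import torch

def maybe_compile(model: torch.nn.Module) -> torch.nn.Module:
    """Mirror the handler's gating, but fall back to eager mode if
    torch.compile itself raises, so the endpoint still boots."""
    if not (torch.cuda.is_available() and os.getenv("ENABLE_TORCH_COMPILE", "1") == "1"):
        return model
    mode = os.getenv("TORCH_COMPILE_MODE", "default")
    try:
        return torch.compile(model, mode=mode)
    except Exception:
        return model  # keep serving uncompiled rather than failing startup
```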
@@ -85,11 +86,12 @@ class EndpointHandler:
         sample_rate = self.pipe.model.config.audio_sample_rate
         dummy_audio = torch.randn(sample_rate, dtype=torch.float32)
 
-        # The pipeline now handles GPU optimization internally
+        # Run inference to trigger torch.compile and kernel optimization
        with torch.inference_mode():
             warmup_tokens = self.pipe.model.config.inference_warmup_tokens
             _ = self.pipe(
-                {"raw": dummy_audio, "sampling_rate": sample_rate}, max_new_tokens=warmup_tokens
+                {"raw": dummy_audio, "sampling_rate": sample_rate},
+                max_new_tokens=warmup_tokens,
             )
 
         # Force CUDA synchronization to ensure kernels are compiled
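torch.compile defers actual compilation to the first forward pass, so this warmup call is what pays the one-time graph-capture cost instead of the first real request; the trailing CUDA synchronization matters because GPU work is asynchronous and the Python call can return before kernels finish building. One way to confirm the warmup took effect is to compare a cold call against a warm one (`timed_call` below is an illustrative probe, not part of the handler):

```python
import time
import torch

def timed_call(fn):
    """Wall-clock a callable, synchronizing around it so async CUDA work
    and lazy torch.compile graph capture are included in the measurement."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    out = fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return out, time.perf_counter() - start
```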
@@ -107,11 +109,11 @@
             raise ValueError("Missing 'inputs' in request data")
 
         params = data.get("parameters", {})
-        max_new_tokens = params.get("max_new_tokens", 200)
+        max_new_tokens = params.get("max_new_tokens", 128)
         num_beams = params.get("num_beams", 1)
         do_sample = params.get("do_sample", False)
         length_penalty = params.get("length_penalty", 1.0)
-        repetition_penalty = params.get("repetition_penalty", 1.0)
+        repetition_penalty = params.get("repetition_penalty", 1.05)
         no_repeat_ngram_size = params.get("no_repeat_ngram_size", 0)
         early_stopping = params.get("early_stopping", True)
         default_diversity = self.pipe.model.config.inference_diversity_penalty
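The tightened defaults (`max_new_tokens` 200 → 128, `repetition_penalty` 1.0 → 1.05) apply only when a caller omits the keys, so existing clients can pin the old behaviour explicitly. A sketch of a request body as `__call__` reads it (the `inputs` value is a placeholder; only the parameter keys are taken from the diff):

```python
payload = {
    "inputs": "<audio data>",        # required; a missing "inputs" raises ValueError
    "parameters": {                  # all optional; new defaults shown in the diff
        "max_new_tokens": 200,       # pin the pre-commit default explicitly
        "repetition_penalty": 1.0,
        "num_beams": 1,
        "do_sample": False,
    },
}
```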
 
16
 
17
  class EndpointHandler:
18
  def __init__(self, path: str = ""):
 
19
  import os
20
 
 
21
  import nltk
22
 
23
  nltk.download("punkt_tab", quiet=True)
24
 
 
25
  os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
26
 
27
+ # Enable TF32 for faster matmul on Ampere+ GPUs (A100, etc.)
28
+ # Also beneficial for T4 (Turing) which supports TensorFloat-32
29
  torch.backends.cuda.matmul.allow_tf32 = True
30
  torch.backends.cudnn.allow_tf32 = True
31
 
32
  # Set device and dtype
33
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
34
+
35
+ # Use float16 for better T4 compatibility (bfloat16 not well supported on T4)
36
+ # T4 has excellent float16 performance with tensor cores
37
+ self.dtype = torch.float16 if self.device == "cuda" else torch.float32
38
 
39
  # Enable CUDA optimizations
40
  if torch.cuda.is_available():
 
62
  )
63
 
64
  # Apply torch.compile if enabled (after model is loaded by pipeline)
65
+ # Use "default" mode for T4 - better compatibility than "reduce-overhead"
66
+ # "reduce-overhead" is better for A100+ but can be slower on older GPUs
67
+ if torch.cuda.is_available() and os.getenv("ENABLE_TORCH_COMPILE", "1") == "1":
68
+ compile_mode = os.getenv("TORCH_COMPILE_MODE", "default")
69
  self.model = torch.compile(self.model, mode=compile_mode)
 
70
  self.pipe.model = self.model
71
 
72
+ # Warmup the model to trigger compilation and optimize kernels
73
  if torch.cuda.is_available():
74
  self._warmup()
75
 
76
+
77
  def _is_flash_attn_available(self):
78
  """Check if flash attention is available."""
79
  import importlib.util
 
80
  return importlib.util.find_spec("flash_attn") is not None
81
 
82
  def _warmup(self):
 
86
  sample_rate = self.pipe.model.config.audio_sample_rate
87
  dummy_audio = torch.randn(sample_rate, dtype=torch.float32)
88
 
89
+ # Run inference to trigger torch.compile and kernel optimization
90
  with torch.inference_mode():
91
  warmup_tokens = self.pipe.model.config.inference_warmup_tokens
92
  _ = self.pipe(
93
+ {"raw": dummy_audio, "sampling_rate": sample_rate},
94
+ max_new_tokens=warmup_tokens,
95
  )
96
 
97
  # Force CUDA synchronization to ensure kernels are compiled
 
109
  raise ValueError("Missing 'inputs' in request data")
110
 
111
  params = data.get("parameters", {})
112
+ max_new_tokens = params.get("max_new_tokens", 128)
113
  num_beams = params.get("num_beams", 1)
114
  do_sample = params.get("do_sample", False)
115
  length_penalty = params.get("length_penalty", 1.0)
116
+ repetition_penalty = params.get("repetition_penalty", 1.05)
117
  no_repeat_ngram_size = params.get("no_repeat_ngram_size", 0)
118
  early_stopping = params.get("early_stopping", True)
119
  default_diversity = self.pipe.model.config.inference_diversity_penalty