Commit: Update custom model files, README, and requirements

Changed file: handler.py (+6 −1)
@@ -19,6 +19,11 @@ class EndpointHandler:
 # Set environment variables for PyTorch/CUDA (must be before imports/operations)
 import os
 
+# Download NLTK data for truecasing (needed by the pipeline)
+import nltk
+
+nltk.download("punkt_tab", quiet=True)
+
 # Enable expandable segments to reduce fragmentation
 os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
 
@@ -57,7 +62,7 @@ class EndpointHandler:
 
 # Apply torch.compile if enabled (after model is loaded by pipeline)
 # Enable by default for significant speedup (20-40%)
-if torch.cuda.is_available()
+if torch.cuda.is_available():
     compile_mode = os.getenv("TORCH_COMPILE_MODE", "reduce-overhead")
     self.model = torch.compile(self.model, mode=compile_mode)
     # Update the pipeline with the compiled model