Spaces:
Running
Running
ming
committed on
Commit
·
7fff563
1
Parent(s):
fd2a8c1
Implement Option 3: Use FP16 for 2-3x faster inference
Browse files
Speed optimization:
- Added v4_use_fp16_for_speed config option
- When enabled, uses FP16 instead of 4-bit quantization
- FP16 is 2-3x faster than 4-bit NF4 quantization
- Enabled by default in Dockerfile for maximum speed
Memory trade-off:
- FP16 uses ~2-3GB GPU memory (vs ~1GB for 4-bit)
- Still fits comfortably on T4 GPU (16GB total)
Expected results:
- Generation time: 24.9s → 8-12s (2-3x speedup)
- Same output quality
- Faster token generation (~20-30 tokens/sec vs ~6 tokens/sec)
This completes the speed optimization plan (Option 1 + 2 + 3)
- Dockerfile +1 -0
- app/core/config.py +5 -0
- app/services/structured_summarizer.py +17 -1
Dockerfile
CHANGED
|
@@ -12,6 +12,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
| 12 |
ENABLE_V4_WARMUP=true \
|
| 13 |
V4_MODEL_ID=Qwen/Qwen2.5-1.5B-Instruct \
|
| 14 |
V4_ENABLE_QUANTIZATION=true \
|
|
|
|
| 15 |
HF_HOME=/tmp/huggingface \
|
| 16 |
TRANSFORMERS_NO_TORCHAO=1
|
| 17 |
|
|
|
|
| 12 |
ENABLE_V4_WARMUP=true \
|
| 13 |
V4_MODEL_ID=Qwen/Qwen2.5-1.5B-Instruct \
|
| 14 |
V4_ENABLE_QUANTIZATION=true \
|
| 15 |
+
V4_USE_FP16_FOR_SPEED=true \
|
| 16 |
HF_HOME=/tmp/huggingface \
|
| 17 |
TRANSFORMERS_NO_TORCHAO=1
|
| 18 |
|
app/core/config.py
CHANGED
|
@@ -122,6 +122,11 @@ class Settings(BaseSettings):
|
|
| 122 |
env="V4_ENABLE_QUANTIZATION",
|
| 123 |
description="Enable INT8 quantization for V4 model (reduces memory from ~2GB to ~1GB). Quantization takes ~1-2 minutes on startup.",
|
| 124 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
@validator("log_level")
|
| 127 |
def validate_log_level(cls, v):
|
|
|
|
| 122 |
env="V4_ENABLE_QUANTIZATION",
|
| 123 |
description="Enable INT8 quantization for V4 model (reduces memory from ~2GB to ~1GB). Quantization takes ~1-2 minutes on startup.",
|
| 124 |
)
|
| 125 |
+
v4_use_fp16_for_speed: bool = Field(
|
| 126 |
+
default=False,
|
| 127 |
+
env="V4_USE_FP16_FOR_SPEED",
|
| 128 |
+
description="Use FP16 instead of 4-bit quantization for 2-3x faster inference (uses ~2-3GB GPU memory instead of ~1GB)",
|
| 129 |
+
)
|
| 130 |
|
| 131 |
@validator("log_level")
|
| 132 |
def validate_log_level(cls, v):
|
app/services/structured_summarizer.py
CHANGED
|
@@ -81,10 +81,14 @@ class StructuredSummarizer:
|
|
| 81 |
logger.info("CUDA is NOT available. V4 model will run on CPU.")
|
| 82 |
|
| 83 |
# ------------------------------------------------------------------
|
| 84 |
-
# Preferred path: 4-bit NF4 on GPU via bitsandbytes
|
|
|
|
| 85 |
# ------------------------------------------------------------------
|
|
|
|
|
|
|
| 86 |
if (
|
| 87 |
use_cuda
|
|
|
|
| 88 |
and getattr(settings, "v4_enable_quantization", True)
|
| 89 |
and HAS_BITSANDBYTES
|
| 90 |
):
|
|
@@ -104,6 +108,18 @@ class StructuredSummarizer:
|
|
| 104 |
trust_remote_code=True,
|
| 105 |
)
|
| 106 |
quantization_desc = "4-bit NF4 (bitsandbytes, GPU)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
else:
|
| 109 |
# ------------------------------------------------------------------
|
|
|
|
| 81 |
logger.info("CUDA is NOT available. V4 model will run on CPU.")
|
| 82 |
|
| 83 |
# ------------------------------------------------------------------
|
| 84 |
+
# Preferred path: 4-bit NF4 on GPU via bitsandbytes (memory efficient)
|
| 85 |
+
# OR FP16 for speed (2-3x faster, uses more memory)
|
| 86 |
# ------------------------------------------------------------------
|
| 87 |
+
use_fp16_for_speed = getattr(settings, "v4_use_fp16_for_speed", False)
|
| 88 |
+
|
| 89 |
if (
|
| 90 |
use_cuda
|
| 91 |
+
and not use_fp16_for_speed
|
| 92 |
and getattr(settings, "v4_enable_quantization", True)
|
| 93 |
and HAS_BITSANDBYTES
|
| 94 |
):
|
|
|
|
| 108 |
trust_remote_code=True,
|
| 109 |
)
|
| 110 |
quantization_desc = "4-bit NF4 (bitsandbytes, GPU)"
|
| 111 |
+
|
| 112 |
+
elif use_cuda and use_fp16_for_speed:
|
| 113 |
+
# Use FP16 for 2-3x faster inference (uses ~2-3GB GPU memory)
|
| 114 |
+
logger.info("Loading V4 model in FP16 for maximum speed (2-3x faster than 4-bit)...")
|
| 115 |
+
self.model = AutoModelForCausalLM.from_pretrained(
|
| 116 |
+
settings.v4_model_id,
|
| 117 |
+
torch_dtype=torch.float16,
|
| 118 |
+
device_map="auto",
|
| 119 |
+
cache_dir=settings.hf_cache_dir,
|
| 120 |
+
trust_remote_code=True,
|
| 121 |
+
)
|
| 122 |
+
quantization_desc = "FP16 (GPU, fast)"
|
| 123 |
|
| 124 |
else:
|
| 125 |
# ------------------------------------------------------------------
|