Convert to pure custom Docker deployment for Hugging Face
This commit refactors the repository to focus exclusively on custom Docker
deployment, removing the HF Inference Endpoint handler approach.
Changes:
- Remove handler.py and test_local.py (HF handler approach)
- Add Dockerfile with CUDA 12.9 support
- Add app.py with FastAPI server and VRAM-aware concurrency control
- Add requirements.txt with proper dependency management
- Fix .gitattributes to only track large model files in LFS
- Update README.md with comprehensive Docker deployment guide
- Add sam.code-workspace to .gitignore
The new setup provides:
- Optimized for 1920x1080 images on L4/A10G GPUs
- Automatic VRAM management for concurrent requests
- Health check endpoints with VRAM monitoring
- Scale-to-zero support on Hugging Face Endpoints
🤖 Generated with [Claude Code](https://claude.ai/code)
via [Happy](https://happy.engineering)
Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: Happy <yesreply@happy.engineering>
- .gitattributes +3 -4
- .gitignore +2 -0
- Dockerfile +40 -0
- README.md +162 -21
- app.py +246 -0
- handler.py +0 -81
- requirements.txt +17 -0
- test_local.py +0 -24
|
@@ -33,7 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
*.jpg filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
# Only track large model JSON files in LFS (tokenizer, vocab, etc.)
|
| 37 |
+
model/*.json filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
model/*.txt filter=lfs diff=lfs merge=lfs -text
|
|
|
|
@@ -27,6 +27,7 @@ huggingface/
|
|
| 27 |
transformers_cache/
|
| 28 |
|
| 29 |
# Data
|
|
|
|
| 30 |
*.tmp
|
| 31 |
*.temp
|
| 32 |
*.bak
|
|
@@ -37,3 +38,4 @@ transformers_cache/
|
|
| 37 |
|
| 38 |
# Don’t ignore model folder (HF needs it)
|
| 39 |
# /model/ is intentionally NOT ignored
|
|
|
|
|
|
| 27 |
transformers_cache/
|
| 28 |
|
| 29 |
# Data
|
| 30 |
+
.temp/
|
| 31 |
*.tmp
|
| 32 |
*.temp
|
| 33 |
*.bak
|
|
|
|
| 38 |
|
| 39 |
# Don’t ignore model folder (HF needs it)
|
| 40 |
# /model/ is intentionally NOT ignored
|
| 41 |
+
sam.code-workspace
|
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modern NVIDIA base image with CUDA 12.9 and Ubuntu 24.04 LTS
|
| 2 |
+
FROM nvidia/cuda:12.9.1-runtime-ubuntu24.04
|
| 3 |
+
|
| 4 |
+
# Avoid interactive errors
|
| 5 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 6 |
+
|
| 7 |
+
# System packages
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
python3 \
|
| 10 |
+
python3-pip \
|
| 11 |
+
git \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Make python3 the default python
|
| 15 |
+
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 1
|
| 16 |
+
|
| 17 |
+
# Working directory
|
| 18 |
+
WORKDIR /app
|
| 19 |
+
|
| 20 |
+
# Copy requirements first (to enable Docker cache)
|
| 21 |
+
COPY requirements.txt /app/requirements.txt
|
| 22 |
+
|
| 23 |
+
# Install PyTorch with CUDA support first (separate to use correct index URL)
|
| 24 |
+
RUN pip install --no-cache-dir torch==2.9.1 --index-url https://download.pytorch.org/whl/cu129 --break-system-packages
|
| 25 |
+
|
| 26 |
+
# Install remaining dependencies
|
| 27 |
+
RUN pip install --no-cache-dir -r requirements.txt --break-system-packages
|
| 28 |
+
|
| 29 |
+
# Copy application code
|
| 30 |
+
COPY app.py /app/app.py
|
| 31 |
+
COPY model /app/model
|
| 32 |
+
|
| 33 |
+
# Uvicorn exposed port
|
| 34 |
+
EXPOSE 7860
|
| 35 |
+
|
| 36 |
+
# Optimize Python for container
|
| 37 |
+
ENV PYTHONUNBUFFERED=1
|
| 38 |
+
ENV TRANSFORMERS_NO_ADVISORY_WARNINGS=1
|
| 39 |
+
|
| 40 |
+
CMD ["python", "app.py"]
|
|
@@ -1,39 +1,180 @@
|
|
| 1 |
---
|
| 2 |
-
title: "SAM3
|
| 3 |
pipeline_tag: "image-segmentation"
|
| 4 |
-
license: apache-2.0
|
| 5 |
tags:
|
| 6 |
-
- segmentation
|
| 7 |
- sam3
|
| 8 |
-
-
|
| 9 |
-
-
|
| 10 |
-
-
|
| 11 |
-
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
```json
|
| 19 |
{
|
| 20 |
-
"inputs": "<
|
| 21 |
-
"parameters": {
|
| 22 |
-
"classes": ["pothole", "marking"]
|
| 23 |
-
}
|
| 24 |
}
|
| 25 |
```
|
| 26 |
|
| 27 |
-
|
| 28 |
-
## Output
|
| 29 |
|
| 30 |
```json
|
| 31 |
[
|
| 32 |
-
{
|
| 33 |
-
"label": "pothole",
|
| 34 |
-
"mask": "<base64_png>",
|
| 35 |
-
"score": 1.0
|
| 36 |
-
}
|
| 37 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: "SAM3 Custom Docker Endpoint"
|
| 3 |
pipeline_tag: "image-segmentation"
|
|
|
|
| 4 |
tags:
|
|
|
|
| 5 |
- sam3
|
| 6 |
+
- custom-docker
|
| 7 |
+
- segmentation
|
| 8 |
+
- inference-endpoint
|
| 9 |
+
license: apache-2.0
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Segment Anything 3 – Custom Docker Deployment
|
| 13 |
+
|
| 14 |
+
This repository provides a **custom Docker image** for SAM3 text-prompted segmentation,
|
| 15 |
+
deployable on Hugging Face Inference Endpoints or any Docker-compatible platform.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- **SAM3** (Segment Anything Model 3) with text-prompted segmentation
|
| 20 |
+
- **FastAPI** server with HF-compatible API
|
| 21 |
+
- **GPU-accelerated** inference (CUDA 12.9)
|
| 22 |
+
- **VRAM-aware** concurrency control for large images
|
| 23 |
+
- **Scale-to-zero** support on Hugging Face Endpoints
|
| 24 |
+
- Optimized for **1920×1080** images on A10/L4 GPUs
|
| 25 |
+
|
| 26 |
+
## Quick Deploy on Hugging Face
|
| 27 |
|
| 28 |
+
### Option 1: Pre-built Docker Image (Fastest)
|
| 29 |
+
|
| 30 |
+
1. Build and push your Docker image:
|
| 31 |
+
```bash
|
| 32 |
+
docker build -t yourusername/sam3:latest .
|
| 33 |
+
docker push yourusername/sam3:latest
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
2. Create Inference Endpoint at https://huggingface.co/inference-endpoints
|
| 37 |
+
- Choose **Custom Docker Image**
|
| 38 |
+
- Image: `yourusername/sam3:latest`
|
| 39 |
+
- Hardware: **L4** or **A10G** (recommended)
|
| 40 |
+
- Min replicas: **0** (scale-to-zero)
|
| 41 |
+
- Max replicas: **5**
|
| 42 |
+
|
| 43 |
+
### Option 2: Build from Repository
|
| 44 |
+
|
| 45 |
+
1. Upload this repository to Hugging Face
|
| 46 |
+
2. Create endpoint pointing to your repo
|
| 47 |
+
3. HF will build the Docker image (takes ~5-10 min first time)
|
| 48 |
+
|
| 49 |
+
## API
|
| 50 |
+
|
| 51 |
+
### Input
|
| 52 |
|
| 53 |
```json
|
| 54 |
{
|
| 55 |
+
"inputs": "<base64_image>",
|
| 56 |
+
"parameters": { "classes": ["pothole", "marking"] }
|
|
|
|
|
|
|
| 57 |
}
|
| 58 |
```
|
| 59 |
|
| 60 |
+
### Output
|
|
|
|
| 61 |
|
| 62 |
```json
|
| 63 |
[
|
| 64 |
+
{ "label": "pothole", "mask": "...", "score": 1.0 }
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
]
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
---
|
| 69 |
+
|
| 70 |
+
## Local Development & Testing
|
| 71 |
+
|
| 72 |
+
### Build and Run Locally
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
# Build the Docker image
|
| 76 |
+
docker build -t sam3:latest .
|
| 77 |
+
|
| 78 |
+
# Run locally with GPU
|
| 79 |
+
docker run --gpus all -p 7860:7860 sam3:latest
|
| 80 |
+
|
| 81 |
+
# Run without GPU (CPU mode - slower)
|
| 82 |
+
docker run -p 7860:7860 sam3:latest
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
### Test the API
|
| 86 |
+
|
| 87 |
+
Using the included test script:
|
| 88 |
+
```bash
|
| 89 |
+
python test_remote.py
|
| 90 |
+
```
|
| 91 |
|
| 92 |
+
Or with curl:
|
| 93 |
+
```bash
|
| 94 |
+
curl -X POST http://localhost:7860 \
|
| 95 |
+
-H "Content-Type: application/json" \
|
| 96 |
+
-d '{
|
| 97 |
+
"inputs": "<base64_encoded_image>",
|
| 98 |
+
"parameters": {"classes": ["pothole", "marking"]}
|
| 99 |
+
}'
|
| 100 |
+
```
|
| 101 |
+
|
| 102 |
+
Check health:
|
| 103 |
+
```bash
|
| 104 |
+
curl http://localhost:7860/health
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
## Repository Structure
|
| 108 |
+
|
| 109 |
+
```
|
| 110 |
+
.
|
| 111 |
+
├── app.py # FastAPI server with VRAM management
|
| 112 |
+
├── Dockerfile # Custom Docker image definition
|
| 113 |
+
├── requirements.txt # Python dependencies
|
| 114 |
+
├── test_remote.py # Test script for remote endpoints
|
| 115 |
+
├── test.jpg # Sample test image
|
| 116 |
+
├── model/ # SAM3 model files (Git LFS)
|
| 117 |
+
│ ├── config.json
|
| 118 |
+
│ ├── model.safetensors (3.4GB)
|
| 119 |
+
│ ├── processor_config.json
|
| 120 |
+
│ ├── tokenizer.json
|
| 121 |
+
│ ├── vocab.json
|
| 122 |
+
│ └── ...
|
| 123 |
+
└── README.md # This file
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
## Production Deployment Tips
|
| 127 |
+
|
| 128 |
+
### Docker Registry Workflow
|
| 129 |
+
|
| 130 |
+
For fastest deployment, pre-build and push to Docker Hub:
|
| 131 |
+
|
| 132 |
+
```bash
|
| 133 |
+
docker build -t yourusername/sam3:latest .
|
| 134 |
+
docker login
|
| 135 |
+
docker push yourusername/sam3:latest
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
Then use `yourusername/sam3:latest` when creating your HF Endpoint.
|
| 139 |
+
|
| 140 |
+
### Performance Expectations
|
| 141 |
+
|
| 142 |
+
- **Image size:** 1920×1080
|
| 143 |
+
- **Inference time:** 5-10 seconds
|
| 144 |
+
- **VRAM usage:** 8-12GB per inference
|
| 145 |
+
- **Recommended GPU:** L4 (24GB) or A10G (24GB)
|
| 146 |
+
- **Max concurrent:** 1-2 requests (automatically managed)
|
| 147 |
+
|
| 148 |
+
---
|
| 149 |
+
|
| 150 |
+
## Troubleshooting
|
| 151 |
+
|
| 152 |
+
### Common Issues
|
| 153 |
+
|
| 154 |
+
- **GPU not detected**: Ensure `--gpus all` flag is used with Docker
|
| 155 |
+
- **Out of memory**: The app automatically manages VRAM. If issues persist, reduce image resolution
|
| 156 |
+
- **Model loading fails**: Verify Git LFS pulled all files (`git lfs pull`)
|
| 157 |
+
- **API timeout**: Increase timeout in endpoint config (recommend 300s for large images)
|
| 158 |
+
- **Slow inference**: First request is slower due to model warmup (~10s), subsequent requests are faster
|
| 159 |
+
|
| 160 |
+
### Health Check
|
| 161 |
+
|
| 162 |
+
The `/health` endpoint provides VRAM status:
|
| 163 |
+
```bash
|
| 164 |
+
curl http://your-endpoint/health
|
| 165 |
+
```
|
| 166 |
+
|
| 167 |
+
Returns:
|
| 168 |
+
```json
|
| 169 |
+
{
|
| 170 |
+
"status": "healthy",
|
| 171 |
+
"gpu_available": true,
|
| 172 |
+
"vram": {
|
| 173 |
+
"total_gb": 24.0,
|
| 174 |
+
"allocated_gb": 6.8,
|
| 175 |
+
"free_gb": 17.2,
|
| 176 |
+
"max_concurrent": 2,
|
| 177 |
+
"processing_now": 0
|
| 178 |
+
}
|
| 179 |
+
}
|
| 180 |
+
```
|
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
SAM3 FastAPI Server with Dynamic VRAM-based Concurrency Control
|
| 3 |
+
|
| 4 |
+
Optimized for:
|
| 5 |
+
- Large images (1920x1080)
|
| 6 |
+
- A10 GPU (24GB VRAM)
|
| 7 |
+
- Automatic concurrency adjustment based on available VRAM
|
| 8 |
+
"""
|
| 9 |
+
import base64
|
| 10 |
+
import io
|
| 11 |
+
import asyncio
|
| 12 |
+
import torch
|
| 13 |
+
from PIL import Image
|
| 14 |
+
from fastapi import FastAPI, HTTPException
|
| 15 |
+
from pydantic import BaseModel
|
| 16 |
+
from transformers import AutoProcessor, SamModel
|
| 17 |
+
from collections import deque
|
| 18 |
+
import logging
|
| 19 |
+
|
| 20 |
+
logging.basicConfig(level=logging.INFO)
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
# Load SAM3 model
|
| 24 |
+
processor = AutoProcessor.from_pretrained("./model")
|
| 25 |
+
model = SamModel.from_pretrained(
|
| 26 |
+
"./model",
|
| 27 |
+
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
model.eval()
|
| 31 |
+
if torch.cuda.is_available():
|
| 32 |
+
model.cuda()
|
| 33 |
+
logger.info(f"GPU detected: {torch.cuda.get_device_name()}")
|
| 34 |
+
logger.info(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
|
| 35 |
+
|
| 36 |
+
# VRAM-based concurrency control
class VRAMManager:
    """Dynamically manage inference concurrency based on available VRAM.

    A semaphore caps the number of simultaneous inferences. The cap is
    computed once at startup from total VRAM minus the resident model
    weights, assuming ~10GB per 1920x1080 inference and keeping a 2GB
    safety buffer.
    """

    def __init__(self):
        self.total_vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9 if torch.cuda.is_available() else 0
        self.model_vram_gb = torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else 0

        # Estimate VRAM per inference for 1920x1080 images with SAM3
        # Conservative estimate: 8-12GB per inference at this resolution
        self.estimated_inference_vram_gb = 10.0

        # Calculate max concurrent inferences (always at least 1, so CPU-only
        # hosts — where available_vram goes negative — still serve requests)
        available_vram = self.total_vram_gb - self.model_vram_gb - 2.0  # Keep 2GB buffer
        self.max_concurrent = max(1, int(available_vram / self.estimated_inference_vram_gb))

        self.semaphore = asyncio.Semaphore(self.max_concurrent)
        self.request_queue = deque()
        self.processing_count = 0

        logger.info("VRAM Manager initialized:")
        logger.info(f"  Total VRAM: {self.total_vram_gb:.2f} GB")
        logger.info(f"  Model VRAM: {self.model_vram_gb:.2f} GB")
        logger.info(f"  Estimated per inference: {self.estimated_inference_vram_gb:.2f} GB")
        logger.info(f"  Max concurrent inferences: {self.max_concurrent}")

    def get_vram_status(self):
        """Return current VRAM usage counters, or {} on CPU-only hosts."""
        if not torch.cuda.is_available():
            return {}

        return {
            "total_gb": self.total_vram_gb,
            "allocated_gb": torch.cuda.memory_allocated() / 1e9,
            "reserved_gb": torch.cuda.memory_reserved() / 1e9,
            "free_gb": (self.total_vram_gb - torch.cuda.memory_reserved() / 1e9),
            "max_concurrent": self.max_concurrent,
            "processing_now": self.processing_count,
            "queued": len(self.request_queue)
        }

    async def acquire(self, request_id):
        """Acquire a GPU slot, blocking until one is free.

        Raises HTTPException(503) when a slot is obtained but actual free
        VRAM is below the 5GB floor needed for one inference.
        """
        self.request_queue.append(request_id)
        position = len(self.request_queue)

        logger.info(f"Request {request_id}: Queued at position {position}")

        # Wait for semaphore slot
        await self.semaphore.acquire()

        # Remove from queue and increment processing count
        if request_id in self.request_queue:
            self.request_queue.remove(request_id)
        self.processing_count += 1

        # BUGFIX: only check VRAM when a GPU exists. get_vram_status()
        # returns {} on CPU-only hosts, so the previous unconditional check
        # read free_gb as 0 and rejected every CPU request with 503.
        if torch.cuda.is_available():
            vram_status = self.get_vram_status()
            if vram_status.get("free_gb", 0) < 5.0:  # Need at least 5GB free
                self.processing_count -= 1
                self.semaphore.release()
                raise HTTPException(
                    status_code=503,
                    detail=f"Insufficient VRAM: {vram_status.get('free_gb', 0):.2f}GB free, need 5GB+"
                )

            logger.info(f"Request {request_id}: Processing started (VRAM: {vram_status['free_gb']:.2f}GB free)")
        else:
            logger.info(f"Request {request_id}: Processing started (CPU mode)")

    def release(self, request_id):
        """Release the GPU slot held for request_id and free cached VRAM."""
        self.processing_count -= 1
        self.semaphore.release()

        # Clean up memory so the next request starts from a low-water mark
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        logger.info(f"Request {request_id}: Completed and released")
| 115 |
+
# Initialize VRAM manager (computes max concurrency once at startup)
vram_manager = VRAMManager()

app = FastAPI(title="SAM3 Inference API")


class Request(BaseModel):
    # Mirrors the HF Inference Endpoint payload format.
    inputs: str  # base64 image
    parameters: dict  # { "classes": [...] }
| 125 |
+
|
| 126 |
+
def run_inference(image_b64: str, classes: list, request_id: str):
    """
    Run SAM3 inference on a single image.

    Args:
        image_b64: base64-encoded image bytes (any PIL-readable format).
        classes: text prompts; one mask entry is produced per class.
        request_id: opaque id used for log correlation only.

    Returns:
        list of {"label", "mask", "score"} dicts, where "mask" is a
        base64-encoded single-channel PNG.

    For 1920x1080 images, this will take 5-10 seconds and use ~8-12GB VRAM
    """
    try:
        # Decode image
        image_bytes = base64.b64decode(image_b64)
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        logger.info(f"Request {request_id}: Image size {pil_image.size}")

        # Preprocess image + text prompts into model tensors
        inputs = processor(
            images=pil_image,
            text=classes,
            return_tensors="pt"
        )
        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Inference
        with torch.no_grad():
            outputs = model(**inputs)

        # NOTE(review): assumes pred_masks is [N, 1, H, W] with one mask per
        # class in prompt order — confirm against the SAM3 processor output.
        pred_masks = outputs.pred_masks.squeeze(1)  # [N, H, W]

        results = []
        for cls, mask_tensor in zip(classes, pred_masks):
            # Threshold at 0.5 into a 0/255 binary mask
            mask = mask_tensor.float().cpu()
            binary_mask = (mask > 0.5).numpy().astype("uint8") * 255

            # Convert to PNG (mode "L" = single-channel grayscale)
            pil_mask = Image.fromarray(binary_mask, mode="L")
            buf = io.BytesIO()
            pil_mask.save(buf, format="PNG")
            mask_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

            results.append({
                "label": cls,
                "mask": mask_b64,
                "score": 1.0  # no per-mask confidence is surfaced here
            })

        logger.info(f"Request {request_id}: Inference completed successfully")
        return results

    except Exception as e:
        logger.error(f"Request {request_id}: Inference failed - {str(e)}")
        raise
| 178 |
+
|
| 179 |
+
@app.post("/")
|
| 180 |
+
async def predict(req: Request):
|
| 181 |
+
"""
|
| 182 |
+
Predict segmentation masks for given classes
|
| 183 |
+
|
| 184 |
+
Expected performance for 1920x1080 images:
|
| 185 |
+
- Processing time: 5-10 seconds
|
| 186 |
+
- VRAM usage: 8-12GB per inference
|
| 187 |
+
- Concurrent capacity: 1-2 inferences on A10 24GB GPU
|
| 188 |
+
"""
|
| 189 |
+
request_id = str(id(req))
|
| 190 |
+
|
| 191 |
+
try:
|
| 192 |
+
# Acquire GPU slot (with VRAM check)
|
| 193 |
+
await vram_manager.acquire(request_id)
|
| 194 |
+
|
| 195 |
+
try:
|
| 196 |
+
# Run inference in thread pool (non-blocking)
|
| 197 |
+
results = await asyncio.to_thread(
|
| 198 |
+
run_inference,
|
| 199 |
+
req.inputs,
|
| 200 |
+
req.parameters.get("classes", []),
|
| 201 |
+
request_id
|
| 202 |
+
)
|
| 203 |
+
return results
|
| 204 |
+
|
| 205 |
+
finally:
|
| 206 |
+
# Always release GPU slot
|
| 207 |
+
vram_manager.release(request_id)
|
| 208 |
+
|
| 209 |
+
except HTTPException:
|
| 210 |
+
raise
|
| 211 |
+
except Exception as e:
|
| 212 |
+
logger.error(f"Request {request_id}: Unexpected error - {str(e)}")
|
| 213 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
@app.get("/health")
|
| 217 |
+
async def health():
|
| 218 |
+
"""Health check endpoint"""
|
| 219 |
+
vram_status = vram_manager.get_vram_status()
|
| 220 |
+
|
| 221 |
+
return {
|
| 222 |
+
"status": "healthy",
|
| 223 |
+
"gpu_available": torch.cuda.is_available(),
|
| 224 |
+
"vram": vram_status
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
@app.get("/metrics")
|
| 229 |
+
async def metrics():
|
| 230 |
+
"""Detailed metrics endpoint"""
|
| 231 |
+
return vram_manager.get_vram_status()
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
if __name__ == "__main__":
    import uvicorn

    # Configuration for large images (1920x1080) on A10 GPU
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        workers=1,  # Single worker for single GPU
        limit_concurrency=50,  # Queue up to 50 requests
        timeout_keep_alive=300,  # 5 min keepalive for long inferences
        log_level="info"
    )
|
|
@@ -1,81 +0,0 @@
|
|
| 1 |
-
import base64
import io
import torch
from PIL import Image
from transformers import AutoProcessor, SamModel


class EndpointHandler:
    """
    Hugging Face Inference Endpoint handler for text-prompt SAM3 segmentation.

    Input:
        {
            "inputs": "<base64_image>",
            "parameters": {
                "classes": ["pothole", "marking"]
            }
        }
    """

    def __init__(self, path="model"):
        # Load from local path to bypass HF model registry
        self.processor = AutoProcessor.from_pretrained(path)
        # Half precision on GPU only; CPU stays fp32
        self.model = SamModel.from_pretrained(
            path,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )

        self.model.eval()
        if torch.cuda.is_available():
            self.model = self.model.cuda()

    def __call__(self, data):
        # HF pipeline standard format
        image_b64 = data.get("inputs", None)
        params = data.get("parameters", {})
        classes = params.get("classes", None)

        if image_b64 is None or classes is None:
            return {
                "error": "Required fields: `inputs` (base64 image), `parameters.classes`"
            }

        # Decode image
        image_bytes = base64.b64decode(image_b64)
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Preprocess image + text prompts into model tensors
        inputs = self.processor(
            images=pil_image,
            text=classes,
            return_tensors="pt"
        )

        if torch.cuda.is_available():
            inputs = {k: v.cuda() for k, v in inputs.items()}

        # Inference
        with torch.no_grad():
            outputs = self.model(**inputs)

        pred_masks = outputs.pred_masks.squeeze(1)  # [num_classes, H, W]

        # Convert to HF segmentation pipeline output format
        results = []
        for i, class_name in enumerate(classes):
            # Threshold each class mask to a binary 0/255 image
            mask = pred_masks[i].float().cpu()
            binary_mask = (mask > 0.5).numpy().astype("uint8") * 255

            pil_mask = Image.fromarray(binary_mask, mode="L")
            buf = io.BytesIO()
            pil_mask.save(buf, format="PNG")
            mask_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

            results.append({
                "label": class_name,
                "mask": mask_b64,
                "score": 1.0  # no per-mask confidence is surfaced here
            })

        return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Web framework
fastapi==0.121.3
uvicorn==0.38.0

# PyTorch with CUDA 12.9 (for HF L4/A10G/A100 GPUs)
# Note: the Dockerfile installs torch separately with:
#   pip install torch==2.9.1 --index-url https://download.pytorch.org/whl/cu129
torch==2.9.1

# Transformers with SAM3 support
transformers==4.57.1

# Hugging Face Hub
huggingface_hub>=0.34.0,<1.0

# Core dependencies
numpy>=2.3.0
pillow>=12.0.0
|
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
# Local smoke test for the HF Inference Endpoint handler (handler.py).
# Requires ./model weights and a test.jpg next to this script.
import base64
import json
from handler import EndpointHandler

# 1. Load the handler (loads SAM3 from ./model)
handler = EndpointHandler("model")

# 2. Load an image and convert to base64
with open("test.jpg", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

# 3. Build a fake HF request
payload = {
    "inputs": img_b64,
    "parameters": {
        "classes": ["pothole", "marking"]
    }
}

# 4. Run
output = handler(payload)

# 5. Print results
print(json.dumps(output, indent=2)[:2000])  # limit print to avoid huge logs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|