Spaces:
Paused
Paused
feat(t4-gpu): add t4 gpu capability
Browse files
- Dockerfile +1 -1
- main.py +26 -4
- poetry.lock +0 -0
- pyproject.toml +1 -0
Dockerfile
CHANGED
|
@@ -7,7 +7,7 @@ ENV PATH="/home/user/.local/bin:$PATH"
|
|
| 7 |
WORKDIR /app
|
| 8 |
|
| 9 |
COPY --chown=user ./requirements.txt requirements.txt
|
| 10 |
-
RUN pip install --no-cache-dir
|
| 11 |
|
| 12 |
COPY --chown=user . /app
|
| 13 |
|
|
|
|
| 7 |
WORKDIR /app
|
| 8 |
|
| 9 |
COPY --chown=user ./requirements.txt requirements.txt
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://download.pytorch.org/whl/cu113
|
| 11 |
|
| 12 |
COPY --chown=user . /app
|
| 13 |
|
main.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
from typing import Optional
|
| 2 |
from fastapi import FastAPI
|
| 3 |
from pydantic import BaseModel
|
|
@@ -11,7 +13,7 @@ app = FastAPI()
|
|
| 11 |
# Initialize the LLM engine
|
| 12 |
# Replace 'your-model-path' with the actual path or name of your model
|
| 13 |
|
| 14 |
-
engine = LLM(
|
| 15 |
model='meta-llama/Llama-3.2-3B-Instruct',
|
| 16 |
revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
|
| 17 |
max_num_batched_tokens=512, # Reduced for T4
|
|
@@ -19,13 +21,33 @@ engine = LLM(
|
|
| 19 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 20 |
max_model_len=131072, # Llama-3.2-3B-Instruct context length
|
| 21 |
enforce_eager=True, # Disable CUDA graph
|
| 22 |
-
dtype='half', # Use half precision
|
| 23 |
)
|
| 24 |
|
| 25 |
|
| 26 |
@app.get("/")
|
| 27 |
def greet_json():
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
class GenerationRequest(BaseModel):
|
|
@@ -50,7 +72,7 @@ def generate_text(request: GenerationRequest) -> list[RequestOutput] | dict[str,
|
|
| 50 |
)
|
| 51 |
|
| 52 |
# Generate text
|
| 53 |
-
return engine.generate(
|
| 54 |
prompts=request.prompt,
|
| 55 |
sampling_params=sampling_params
|
| 56 |
)
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from typing import Any
|
| 3 |
from typing import Optional
|
| 4 |
from fastapi import FastAPI
|
| 5 |
from pydantic import BaseModel
|
|
|
|
| 13 |
# Initialize the LLM engine
|
| 14 |
# Replace 'your-model-path' with the actual path or name of your model
|
| 15 |
|
| 16 |
+
engine_llama_3_2: LLM = LLM(
|
| 17 |
model='meta-llama/Llama-3.2-3B-Instruct',
|
| 18 |
revision="0cb88a4f764b7a12671c53f0838cd831a0843b95",
|
| 19 |
max_num_batched_tokens=512, # Reduced for T4
|
|
|
|
| 21 |
gpu_memory_utilization=0.85, # Slightly increased, adjust if needed
|
| 22 |
max_model_len=131072, # Llama-3.2-3B-Instruct context length
|
| 23 |
enforce_eager=True, # Disable CUDA graph
|
| 24 |
+
dtype='half', # Use 'half' if you want half precision
|
| 25 |
)
|
| 26 |
|
| 27 |
|
| 28 |
@app.get("/")
|
| 29 |
def greet_json():
|
| 30 |
+
cuda_info: dict[str, Any] = {}
|
| 31 |
+
if torch.cuda.is_available():
|
| 32 |
+
cuda_current_device: int = torch.cuda.current_device()
|
| 33 |
+
cuda_info = {
|
| 34 |
+
"device_count": torch.cuda.device_count(),
|
| 35 |
+
"cuda_device": torch.cuda.get_device_name(cuda_current_device),
|
| 36 |
+
"cuda_capability": torch.cuda.get_device_capability(cuda_current_device),
|
| 37 |
+
"allocated": f"{round(torch.cuda.memory_allocated(cuda_current_device) / 1024 ** 3, 1)} GB",
|
| 38 |
+
"cached": f"{round(torch.cuda.memory_reserved(cuda_current_device) / 1024 ** 3, 1)} GB",
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
return {
|
| 42 |
+
"message": f"CUDA availability is {torch.cuda.is_available()}",
|
| 43 |
+
"cuda_info": cuda_info,
|
| 44 |
+
"model": [
|
| 45 |
+
{
|
| 46 |
+
"name": "meta-llama/Llama-3.2-3B-Instruct",
|
| 47 |
+
"revision": "0cb88a4f764b7a12671c53f0838cd831a0843b95",
|
| 48 |
+
}
|
| 49 |
+
]
|
| 50 |
+
}
|
| 51 |
|
| 52 |
|
| 53 |
class GenerationRequest(BaseModel):
|
|
|
|
| 72 |
)
|
| 73 |
|
| 74 |
# Generate text
|
| 75 |
+
return engine_llama_3_2.generate(
|
| 76 |
prompts=request.prompt,
|
| 77 |
sampling_params=sampling_params
|
| 78 |
)
|
poetry.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
CHANGED
|
@@ -12,6 +12,7 @@ vllm = "^0.6.4.post1"
|
|
| 12 |
fastapi = "^0.115.5"
|
| 13 |
pydantic = "^2.10.2"
|
| 14 |
uvicorn = "^0.32.1"
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
[build-system]
|
|
|
|
| 12 |
fastapi = "^0.115.5"
|
| 13 |
pydantic = "^2.10.2"
|
| 14 |
uvicorn = "^0.32.1"
|
| 15 |
+
torch = "^2.5.1"
|
| 16 |
|
| 17 |
|
| 18 |
[build-system]
|