Spaces:
Runtime error
Runtime error
Rúben Almeida
committed on
Commit
·
4d163d0
1
Parent(s):
1a1e448
Add GPU support
Browse files- .dockerignore +4 -0
- Dockerfile +1 -1
- main.py +10 -2
- requirements.txt +1 -1
.dockerignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
**.venv
|
| 2 |
+
**.pytest_cache
|
| 3 |
+
**__pycache__
|
| 4 |
+
**.env
|
Dockerfile
CHANGED
|
@@ -18,7 +18,7 @@ RUN pip install --upgrade pip
|
|
| 18 |
RUN pip install -U setuptools wheel
|
| 19 |
|
| 20 |
# Install torch cpu version
|
| 21 |
-
RUN pip install -U torch torchvision torchaudio
|
| 22 |
|
| 23 |
# Copy the requirements file into the container
|
| 24 |
COPY --chown=user requirements.txt .
|
|
|
|
| 18 |
RUN pip install -U setuptools wheel
|
| 19 |
|
| 20 |
# Install torch cpu version
|
| 21 |
+
RUN pip install -U torch torchvision torchaudio
|
| 22 |
|
| 23 |
# Copy the requirements file into the container
|
| 24 |
COPY --chown=user requirements.txt .
|
main.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import zipfile
|
| 2 |
from typing import Union
|
| 3 |
from awq import AutoAWQForCausalLM
|
|
@@ -6,13 +7,20 @@ from tempfile import NamedTemporaryFile
|
|
| 6 |
from contextlib import asynccontextmanager
|
| 7 |
from fastapi import FastAPI, HTTPException
|
| 8 |
from fastapi.responses import RedirectResponse, FileResponse
|
| 9 |
-
from dto import AWQConvertionRequest, GGUFConvertionRequest, GPTQConvertionRequest
|
| 10 |
|
| 11 |
### FastAPI Initialization
|
| 12 |
@asynccontextmanager
|
| 13 |
async def lifespan(app:FastAPI):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
yield
|
| 15 |
|
|
|
|
|
|
|
| 16 |
app = FastAPI(title="Huggingface Safetensor Model Converter to AWQ", version="0.1.0", lifespan=lifespan)
|
| 17 |
### -------
|
| 18 |
|
|
@@ -26,7 +34,7 @@ def redirect_to_docs():
|
|
| 26 |
def convert(request: AWQConvertionRequest)->Union[FileResponse, dict]:
|
| 27 |
|
| 28 |
try:
|
| 29 |
-
model = AutoAWQForCausalLM.from_pretrained(request.hf_model_name,
|
| 30 |
except TypeError as e:
|
| 31 |
raise HTTPException(status_code=400, detail=f"Is this model supported by AWQ Quantization? Check:https://github.com/mit-han-lab/llm-awq?tab=readme-ov-file {e}")
|
| 32 |
|
|
|
|
| 1 |
+
import torch
|
| 2 |
import zipfile
|
| 3 |
from typing import Union
|
| 4 |
from awq import AutoAWQForCausalLM
|
|
|
|
| 7 |
from contextlib import asynccontextmanager
|
| 8 |
from fastapi import FastAPI, HTTPException
|
| 9 |
from fastapi.responses import RedirectResponse, FileResponse
|
| 10 |
+
from .dto import AWQConvertionRequest, GGUFConvertionRequest, GPTQConvertionRequest
|
| 11 |
|
| 12 |
### FastAPI Initialization
|
| 13 |
@asynccontextmanager
|
| 14 |
async def lifespan(app:FastAPI):
|
| 15 |
+
torch.cuda.empty_cache()
|
| 16 |
+
|
| 17 |
+
print("Starting FastAPI server...")
|
| 18 |
+
print(f"Running on {"cuda" if torch.cuda.is_available() else "cpu"}")
|
| 19 |
+
|
| 20 |
yield
|
| 21 |
|
| 22 |
+
torch.cuda.empty_cache()
|
| 23 |
+
|
| 24 |
app = FastAPI(title="Huggingface Safetensor Model Converter to AWQ", version="0.1.0", lifespan=lifespan)
|
| 25 |
### -------
|
| 26 |
|
|
|
|
| 34 |
def convert(request: AWQConvertionRequest)->Union[FileResponse, dict]:
|
| 35 |
|
| 36 |
try:
|
| 37 |
+
model = AutoAWQForCausalLM.from_pretrained(request.hf_model_name, trust_remote_code=True)
|
| 38 |
except TypeError as e:
|
| 39 |
raise HTTPException(status_code=400, detail=f"Is this model supported by AWQ Quantization? Check:https://github.com/mit-han-lab/llm-awq?tab=readme-ov-file {e}")
|
| 40 |
|
requirements.txt
CHANGED
|
@@ -8,7 +8,7 @@ pydantic
|
|
| 8 |
fastapi[standard]
|
| 9 |
transformers
|
| 10 |
huggingface_hub[hf_xet]
|
| 11 |
-
autoawq
|
| 12 |
pytest
|
| 13 |
requests
|
| 14 |
environs
|
|
|
|
| 8 |
fastapi[standard]
|
| 9 |
transformers
|
| 10 |
huggingface_hub[hf_xet]
|
| 11 |
+
autoawq>=0.2.8
|
| 12 |
pytest
|
| 13 |
requests
|
| 14 |
environs
|