Spaces:
Running
Running
Soumik Bose committed on
Commit ·
257c70f
1
Parent(s): d5c9ae8
ok
Browse files- Dockerfile +7 -15
- main.py +58 -55
Dockerfile
CHANGED
|
@@ -1,46 +1,38 @@
|
|
| 1 |
-
# Use Python 3.11 Slim as requested
|
| 2 |
FROM python:3.11-slim
|
| 3 |
|
| 4 |
# Set environment variables
|
| 5 |
ENV PYTHONDONTWRITEBYTECODE=1 \
|
| 6 |
PYTHONUNBUFFERED=1 \
|
| 7 |
-
# Hugging Face Spaces specific port
|
| 8 |
PORT=7860 \
|
| 9 |
-
# Configure cache to be writable by non-root user
|
| 10 |
HF_HOME=/app/cache \
|
| 11 |
TRANSFORMERS_CACHE=/app/cache
|
| 12 |
|
| 13 |
WORKDIR /app
|
| 14 |
|
| 15 |
-
# 1. Install
|
| 16 |
RUN apt-get update && apt-get install -y \
|
| 17 |
build-essential \
|
| 18 |
curl \
|
| 19 |
&& rm -rf /var/lib/apt/lists/*
|
| 20 |
|
| 21 |
-
# 2. Create
|
| 22 |
RUN useradd -m -u 1000 user
|
| 23 |
|
| 24 |
-
# 3. Create
|
| 25 |
-
# We need a cache folder that the user can write to when downloading the model
|
| 26 |
RUN mkdir -p /app/cache && \
|
| 27 |
mkdir -p /app/models && \
|
| 28 |
chown -R user:user /app
|
| 29 |
|
| 30 |
-
# 4. Switch
|
| 31 |
USER user
|
| 32 |
|
| 33 |
-
# 5. Install
|
| 34 |
-
# We copy requirements first to leverage Docker layer caching
|
| 35 |
COPY --chown=user:user requirements.txt .
|
| 36 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 37 |
|
| 38 |
-
# 6. Copy
|
| 39 |
COPY --chown=user:user main.py .
|
| 40 |
|
| 41 |
-
# 7.
|
| 42 |
EXPOSE 7860
|
| 43 |
-
|
| 44 |
-
# 8. Run the application
|
| 45 |
-
# host 0.0.0.0 is required for Docker networking
|
| 46 |
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
|
| 1 |
# Slim Python 3.11 base keeps the image small while matching the app's runtime.
FROM python:3.11-slim

# PYTHONDONTWRITEBYTECODE: skip .pyc files; PYTHONUNBUFFERED: stream logs
# immediately (important for Spaces log viewing). PORT 7860 is the port
# HF Spaces expects. HF_HOME/TRANSFORMERS_CACHE point the model cache at a
# directory the non-root user below can write to.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860 \
    HF_HOME=/app/cache \
    TRANSFORMERS_CACHE=/app/cache

WORKDIR /app

# 1. Install compilers (Required for llama-cpp build)
# NOTE(review): source builds of llama-cpp-python may also need cmake/ninja;
# pip normally fetches them via build isolation — verify on a clean build.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# 2. Create non-root user (Security requirement for HF Spaces)
RUN useradd -m -u 1000 user

# 3. Create writable cache directories and hand /app to the user so the
# model download at startup does not hit a permission error.
RUN mkdir -p /app/cache && \
    mkdir -p /app/models && \
    chown -R user:user /app

# 4. Switch user — everything below (pip install, app runtime) runs unprivileged.
USER user

# 5. Install dependencies — requirements.txt is copied alone first so this
# layer is cached until the requirements actually change.
COPY --chown=user:user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# 6. Copy application code
COPY --chown=user:user main.py .

# 7. Launch — 0.0.0.0 is required so the server is reachable from outside
# the container.
EXPOSE 7860
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
main.py
CHANGED
|
@@ -1,75 +1,85 @@
|
|
| 1 |
import os
|
| 2 |
import logging
|
|
|
|
| 3 |
from contextlib import asynccontextmanager
|
| 4 |
-
from fastapi import FastAPI, HTTPException, Request
|
| 5 |
-
from pydantic import BaseModel, Field
|
| 6 |
from typing import List, Optional
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
from llama_cpp import Llama
|
| 8 |
from huggingface_hub import hf_hub_download
|
| 9 |
|
| 10 |
-
# --- 1.
|
| 11 |
logging.basicConfig(
|
| 12 |
level=logging.INFO,
|
| 13 |
-
format="%(asctime)s
|
| 14 |
)
|
| 15 |
-
logger = logging.getLogger("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
| 21 |
-
REPO_ID = "Qwen/Qwen2.5-1.5B-Instruct-GGUF"
|
| 22 |
-
FILENAME = "qwen2.5-1.5b-instruct-q4_k_m.gguf"
|
| 23 |
-
N_THREADS = int(os.getenv("CPU_THREADS", "2")) # Default to 2 for your hardware
|
| 24 |
|
| 25 |
-
# --- 3.
|
| 26 |
@asynccontextmanager
|
| 27 |
async def lifespan(app: FastAPI):
|
| 28 |
-
global
|
| 29 |
-
logger.info("
|
| 30 |
|
| 31 |
try:
|
| 32 |
-
#
|
| 33 |
-
logger.info(f"Downloading
|
| 34 |
model_path = hf_hub_download(
|
| 35 |
repo_id=REPO_ID,
|
| 36 |
filename=FILENAME,
|
| 37 |
-
|
| 38 |
)
|
| 39 |
-
logger.info(f"
|
| 40 |
|
| 41 |
-
# Load
|
| 42 |
-
logger.info("
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
model_path=model_path,
|
| 45 |
-
n_ctx=
|
| 46 |
-
n_threads=N_THREADS,
|
| 47 |
-
n_batch=512,
|
| 48 |
-
verbose=False
|
| 49 |
)
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
|
| 52 |
except Exception as e:
|
| 53 |
-
logger.
|
| 54 |
raise e
|
| 55 |
-
|
| 56 |
-
yield
|
| 57 |
|
| 58 |
-
#
|
| 59 |
-
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
-
# --- 4. FastAPI App
|
| 63 |
-
app = FastAPI(title="
|
| 64 |
|
| 65 |
# --- 5. Data Models ---
|
| 66 |
class Message(BaseModel):
|
| 67 |
role: str
|
| 68 |
content: str
|
| 69 |
|
| 70 |
-
class
|
| 71 |
messages: List[Message]
|
| 72 |
-
temperature: Optional[float] = 0.
|
| 73 |
max_tokens: Optional[int] = 512
|
| 74 |
stream: Optional[bool] = False
|
| 75 |
|
|
@@ -77,36 +87,29 @@ class ChatCompletionRequest(BaseModel):
|
|
| 77 |
|
| 78 |
@app.get("/")
|
| 79 |
async def root():
|
| 80 |
-
"""
|
| 81 |
-
logger.info("Health check on root / accessed")
|
| 82 |
-
return {
|
| 83 |
-
"message": "Qwen 2.5 (1.5B) CPU Inference API is Running",
|
| 84 |
-
"docs_url": "/docs"
|
| 85 |
-
}
|
| 86 |
|
| 87 |
@app.get("/ping")
|
| 88 |
async def ping():
|
| 89 |
-
|
| 90 |
-
|
|
|
|
| 91 |
|
| 92 |
@app.post("/v1/chat/completions")
|
| 93 |
-
async def
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
logger.error("Request received but model not loaded")
|
| 97 |
-
raise HTTPException(status_code=503, detail="Model is not ready yet")
|
| 98 |
|
| 99 |
-
logger.info(f"
|
| 100 |
|
| 101 |
try:
|
| 102 |
-
# llama-cpp-python handles the chat
|
| 103 |
-
|
| 104 |
messages=[m.model_dump() for m in request.messages],
|
| 105 |
temperature=request.temperature,
|
| 106 |
max_tokens=request.max_tokens,
|
| 107 |
stream=request.stream
|
| 108 |
)
|
| 109 |
-
return response
|
| 110 |
except Exception as e:
|
| 111 |
-
logger.error(f"
|
| 112 |
-
raise HTTPException(status_code=500, detail=
|
|
|
|
| 1 |
import os
|
| 2 |
import logging
|
| 3 |
+
import time
|
| 4 |
from contextlib import asynccontextmanager
|
|
|
|
|
|
|
| 5 |
from typing import List, Optional
|
| 6 |
+
|
| 7 |
+
from fastapi import FastAPI, HTTPException
|
| 8 |
+
from fastapi.responses import JSONResponse
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
from llama_cpp import Llama
|
| 11 |
from huggingface_hub import hf_hub_download
|
| 12 |
|
| 13 |
+
# --- 1. Logging Setup ---
# Single root-logger configuration: timestamp, level, logger name, message.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Named application logger used throughout this module.
logger = logging.getLogger("SmolLM-API")
|
| 19 |
+
|
| 20 |
+
# --- 2. Model Configuration ---
# SWITCHED TO SMOLLM2 1.7B (Instruct Version)
# Hub repo and the specific GGUF file to pull; q4_k_m is a 4-bit quantization.
REPO_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
FILENAME = "smollm2-1.7b-instruct-q4_k_m.gguf"

# CPU Threads (Matches your hardware) — overridable via the CPU_THREADS env var.
N_THREADS = int(os.getenv("CPU_THREADS", "2"))

# Global model handle. None until the lifespan hook finishes loading; the
# endpoints use this to decide between serving and returning 503.
llm_model: Optional[Llama] = None
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
+
# --- 3. Lifecycle Manager ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download and load the GGUF model at startup; release it at shutdown.

    Everything before ``yield`` runs once when the app starts, everything
    after runs once at shutdown. On any startup failure the exception is
    logged and re-raised so the app refuses to start half-initialized.
    """
    global llm_model
    logger.info("--- STARTING SMOLLM2 API ---")

    try:
        # Step A: Download the model file. HF_HOME points at a writable
        # cache dir (see Dockerfile), so restarts reuse the download.
        logger.info(f"Downloading {FILENAME} from Hugging Face...")
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=os.getenv("HF_HOME", "/app/cache")
        )
        logger.info(f"Download complete: {model_path}")

        # Step B: Load into RAM, timing it for the startup log.
        logger.info(f"Initializing Engine (Threads: {N_THREADS})...")
        start_time = time.time()

        llm_model = Llama(
            model_path=model_path,
            n_ctx=2048,  # 2048 is standard for SmolLM
            n_threads=N_THREADS,
            n_batch=512,
            verbose=False
        )

        duration = time.time() - start_time
        logger.info(f"SmolLM2 Loaded in {duration:.2f} seconds.")

    except Exception as e:
        logger.critical(f"Startup Failed: {e}")
        # Bare `raise` re-raises the active exception with its original
        # traceback intact (`raise e` is redundant here).
        raise

    yield

    # Cleanup. Reset the global instead of `del`-ing it: `del llm_model`
    # would unbind the module-level name entirely, so any request handler
    # still running during shutdown would crash with NameError instead of
    # taking the normal "model not ready" 503 path.
    if llm_model:
        llm_model = None
        logger.info("Model unloaded.")
|
| 71 |
|
| 72 |
+
# --- 4. FastAPI App ---
|
| 73 |
+
app = FastAPI(title="SmolLM2 API", version="2.0", lifespan=lifespan)
|
| 74 |
|
| 75 |
# --- 5. Data Models ---
|
| 76 |
class Message(BaseModel):
|
| 77 |
role: str
|
| 78 |
content: str
|
| 79 |
|
| 80 |
+
class ChatRequest(BaseModel):
    # Request body for POST /v1/chat/completions.
    messages: List[Message]             # full conversation history, in order
    temperature: Optional[float] = 0.6  # sampling temperature
    max_tokens: Optional[int] = 512     # completion length cap
    # NOTE(review): stream=True hands back a generator that the endpoint
    # returns directly — confirm streaming is actually supported end-to-end.
    stream: Optional[bool] = False
|
| 85 |
|
|
|
|
| 87 |
|
| 88 |
@app.get("/")
|
| 89 |
async def root():
|
| 90 |
+
return {"status": "Running", "model": "SmolLM2-1.7B-Instruct"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
@app.get("/ping")
|
| 93 |
async def ping():
|
| 94 |
+
if llm_model:
|
| 95 |
+
return {"status": "pong", "ready": True}
|
| 96 |
+
return JSONResponse(status_code=503, content={"status": "loading"})
|
| 97 |
|
| 98 |
@app.post("/v1/chat/completions")
|
| 99 |
+
async def chat(request: ChatRequest):
|
| 100 |
+
if not llm_model:
|
| 101 |
+
raise HTTPException(status_code=503, detail="Model loading...")
|
|
|
|
|
|
|
| 102 |
|
| 103 |
+
logger.info(f"Processing request: {len(request.messages)} msgs")
|
| 104 |
|
| 105 |
try:
|
| 106 |
+
# llama-cpp-python handles the chat template automatically
|
| 107 |
+
return llm_model.create_chat_completion(
|
| 108 |
messages=[m.model_dump() for m in request.messages],
|
| 109 |
temperature=request.temperature,
|
| 110 |
max_tokens=request.max_tokens,
|
| 111 |
stream=request.stream
|
| 112 |
)
|
|
|
|
| 113 |
except Exception as e:
|
| 114 |
+
logger.error(f"Error: {e}")
|
| 115 |
+
raise HTTPException(status_code=500, detail=str(e))
|