Soumik Bose committed on
Commit
257c70f
·
1 Parent(s): d5c9ae8
Files changed (2) hide show
  1. Dockerfile +7 -15
  2. main.py +58 -55
Dockerfile CHANGED
@@ -1,46 +1,38 @@
1
- # Use Python 3.11 Slim as requested
2
  FROM python:3.11-slim
3
 
4
  # Set environment variables
5
  ENV PYTHONDONTWRITEBYTECODE=1 \
6
  PYTHONUNBUFFERED=1 \
7
- # Hugging Face Spaces specific port
8
  PORT=7860 \
9
- # Configure cache to be writable by non-root user
10
  HF_HOME=/app/cache \
11
  TRANSFORMERS_CACHE=/app/cache
12
 
13
  WORKDIR /app
14
 
15
- # 1. Install system tools (needed for compiling llama-cpp if wheels miss)
16
  RUN apt-get update && apt-get install -y \
17
  build-essential \
18
  curl \
19
  && rm -rf /var/lib/apt/lists/*
20
 
21
- # 2. Create a non-root user "user" with ID 1000 (Required for HF Spaces)
22
  RUN useradd -m -u 1000 user
23
 
24
- # 3. Create necessary directories with correct permissions
25
- # We need a cache folder that the user can write to when downloading the model
26
  RUN mkdir -p /app/cache && \
27
  mkdir -p /app/models && \
28
  chown -R user:user /app
29
 
30
- # 4. Switch to the non-root user
31
  USER user
32
 
33
- # 5. Install Python dependencies
34
- # We copy requirements first to leverage Docker layer caching
35
  COPY --chown=user:user requirements.txt .
36
  RUN pip install --no-cache-dir -r requirements.txt
37
 
38
- # 6. Copy the application code
39
  COPY --chown=user:user main.py .
40
 
41
- # 7. Expose the port
42
  EXPOSE 7860
43
-
44
- # 8. Run the application
45
- # host 0.0.0.0 is required for Docker networking
46
  CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
# Base image: slim Python 3.11 keeps the final image small.
FROM python:3.11-slim

# Runtime environment:
#  - no .pyc files, unbuffered stdout/stderr (clean container logs)
#  - PORT 7860 is the port Hugging Face Spaces expects
#  - HF_HOME / TRANSFORMERS_CACHE point model downloads at a user-writable dir
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860 \
    HF_HOME=/app/cache \
    TRANSFORMERS_CACHE=/app/cache

WORKDIR /app

# 1. Install compilers (Required for llama-cpp build)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# 2. Create non-root user (Security requirement for HF Spaces)
RUN useradd -m -u 1000 user

# 3. Create writable cache directories
RUN mkdir -p /app/cache && \
    mkdir -p /app/models && \
    chown -R user:user /app

# 4. Switch user
USER user

# 5. Install dependencies
COPY --chown=user:user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# 6. Copy application code
COPY --chown=user:user main.py .

# 7. Launch
EXPOSE 7860

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
main.py CHANGED
@@ -1,75 +1,85 @@
1
  import os
2
  import logging
 
3
  from contextlib import asynccontextmanager
4
- from fastapi import FastAPI, HTTPException, Request
5
- from pydantic import BaseModel, Field
6
  from typing import List, Optional
 
 
 
 
7
  from llama_cpp import Llama
8
  from huggingface_hub import hf_hub_download
9
 
10
- # --- 1. Logger Setup (Production Standard) ---
11
  logging.basicConfig(
12
  level=logging.INFO,
13
- format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
14
  )
15
- logger = logging.getLogger("qwen-api")
 
 
 
 
 
16
 
17
- # --- 2. Global State ---
18
- model_instance: Optional[Llama] = None
19
 
20
- # Settings
21
- REPO_ID = "Qwen/Qwen2.5-1.5B-Instruct-GGUF"
22
- FILENAME = "qwen2.5-1.5b-instruct-q4_k_m.gguf"
23
- N_THREADS = int(os.getenv("CPU_THREADS", "2")) # Default to 2 for your hardware
24
 
25
- # --- 3. Lifespan (Startup/Shutdown Logic) ---
26
  @asynccontextmanager
27
  async def lifespan(app: FastAPI):
28
- global model_instance
29
- logger.info("STARTUP: Initializing Application...")
30
 
31
  try:
32
- # Download model using huggingface_hub (More robust than curl)
33
- logger.info(f"Downloading model {REPO_ID} -> {FILENAME}...")
34
  model_path = hf_hub_download(
35
  repo_id=REPO_ID,
36
  filename=FILENAME,
37
- local_dir="./models"
38
  )
39
- logger.info(f"Model downloaded to: {model_path}")
40
 
41
- # Load Model
42
- logger.info("Loading Llama model into memory...")
43
- model_instance = Llama(
 
 
44
  model_path=model_path,
45
- n_ctx=4096,
46
- n_threads=N_THREADS,
47
- n_batch=512,
48
- verbose=False
49
  )
50
- logger.info("STARTUP: Model loaded successfully!")
 
 
51
 
52
  except Exception as e:
53
- logger.error(f"CRITICAL: Failed to load model: {e}")
54
  raise e
55
-
56
- yield
57
 
58
- # Shutdown logic (if needed)
59
- logger.info("SHUTDOWN: Cleaning up resources...")
60
- model_instance = None
 
61
 
62
- # --- 4. FastAPI App Definition ---
63
- app = FastAPI(title="Qwen Production API", version="1.0.0", lifespan=lifespan)
64
 
65
  # --- 5. Data Models ---
66
  class Message(BaseModel):
67
  role: str
68
  content: str
69
 
70
- class ChatCompletionRequest(BaseModel):
71
  messages: List[Message]
72
- temperature: Optional[float] = 0.7
73
  max_tokens: Optional[int] = 512
74
  stream: Optional[bool] = False
75
 
@@ -77,36 +87,29 @@ class ChatCompletionRequest(BaseModel):
77
 
78
  @app.get("/")
79
  async def root():
80
- """Root endpoint to verify api is reachable"""
81
- logger.info("Health check on root / accessed")
82
- return {
83
- "message": "Qwen 2.5 (1.5B) CPU Inference API is Running",
84
- "docs_url": "/docs"
85
- }
86
 
87
  @app.get("/ping")
88
  async def ping():
89
- """Simple health check for monitoring tools"""
90
- return "pong"
 
91
 
92
  @app.post("/v1/chat/completions")
93
- async def chat_completions(request: ChatCompletionRequest):
94
- """OpenAI-compatible chat completion endpoint"""
95
- if not model_instance:
96
- logger.error("Request received but model not loaded")
97
- raise HTTPException(status_code=503, detail="Model is not ready yet")
98
 
99
- logger.info(f"Generating completion. Temp: {request.temperature}, MaxTokens: {request.max_tokens}")
100
 
101
  try:
102
- # llama-cpp-python handles the chat formatting automatically
103
- response = model_instance.create_chat_completion(
104
  messages=[m.model_dump() for m in request.messages],
105
  temperature=request.temperature,
106
  max_tokens=request.max_tokens,
107
  stream=request.stream
108
  )
109
- return response
110
  except Exception as e:
111
- logger.error(f"Inference Error: {str(e)}")
112
- raise HTTPException(status_code=500, detail="Internal inference error")
 
1
  import os
2
  import logging
3
+ import time
4
  from contextlib import asynccontextmanager
 
 
5
  from typing import List, Optional
6
+
7
+ from fastapi import FastAPI, HTTPException
8
+ from fastapi.responses import JSONResponse
9
+ from pydantic import BaseModel
10
  from llama_cpp import Llama
11
  from huggingface_hub import hf_hub_download
12
 
13
# --- 1. Logging Setup ---
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s"
)
logger = logging.getLogger("SmolLM-API")

# --- 2. Model Configuration ---
# GGUF build of SmolLM2 1.7B (Instruct variant), 4-bit K-quant file.
REPO_ID = "HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF"
FILENAME = "smollm2-1.7b-instruct-q4_k_m.gguf"

# CPU threads used for inference; overridable via the CPU_THREADS env var.
N_THREADS = int(os.getenv("CPU_THREADS", "2"))

# Global handle to the loaded model; populated by the lifespan hook at startup.
llm_model: Optional[Llama] = None
29
 
30
# --- 3. Lifecycle Manager ---
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download and load the GGUF model at startup; release it at shutdown.

    Runs once around the FastAPI application's lifetime: everything before
    ``yield`` is startup, everything after is shutdown.

    Raises:
        Exception: re-raised when the download or model load fails, so the
            server refuses to start with a broken model.
    """
    global llm_model
    logger.info("--- STARTING SMOLLM2 API ---")

    try:
        # Step A: Download (cached under HF_HOME, so container restarts are fast).
        logger.info(f"Downloading {FILENAME} from Hugging Face...")
        model_path = hf_hub_download(
            repo_id=REPO_ID,
            filename=FILENAME,
            cache_dir=os.getenv("HF_HOME", "/app/cache")
        )
        logger.info(f"Download complete: {model_path}")

        # Step B: Load into RAM
        logger.info(f"Initializing Engine (Threads: {N_THREADS})...")
        start_time = time.time()

        llm_model = Llama(
            model_path=model_path,
            n_ctx=2048,  # 2048 is standard for SmolLM
            n_threads=N_THREADS,
            n_batch=512,
            verbose=False
        )

        duration = time.time() - start_time
        logger.info(f"SmolLM2 Loaded in {duration:.2f} seconds.")

    except Exception as e:
        logger.critical(f"Startup Failed: {e}")
        # Bare `raise` keeps the original traceback; `raise e` would re-anchor it here.
        raise

    yield

    # Cleanup: rebind the global to None instead of `del`-ing it, so any
    # late access to llm_model sees None rather than raising NameError.
    if llm_model:
        llm_model = None
        logger.info("Model unloaded.")
71
 
72
# --- 4. FastAPI App ---
# Application instance; `lifespan` wires model load/unload into startup/shutdown.
app = FastAPI(title="SmolLM2 API", version="2.0", lifespan=lifespan)
74
 
75
# --- 5. Data Models ---
class Message(BaseModel):
    """One chat turn: an OpenAI-style role/content pair."""
    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body accepted by /v1/chat/completions."""
    messages: List[Message]
    temperature: Optional[float] = 0.6
    max_tokens: Optional[int] = 512
    stream: Optional[bool] = False
85
 
 
87
 
88
@app.get("/")
async def root():
    """Liveness endpoint: reports the service is up and which model it serves."""
    return {"status": "Running", "model": "SmolLM2-1.7B-Instruct"}
 
 
 
 
 
91
 
92
@app.get("/ping")
async def ping():
    """Readiness probe: 200 once the model is loaded, 503 while it is loading."""
    if not llm_model:
        # Model not in memory yet -> tell monitors we are not ready.
        return JSONResponse(status_code=503, content={"status": "loading"})
    return {"status": "pong", "ready": True}
97
 
98
@app.post("/v1/chat/completions")
async def chat(request: ChatRequest):
    """OpenAI-compatible chat completion endpoint backed by the local GGUF model."""
    if not llm_model:
        # Startup (download + load) has not finished yet.
        raise HTTPException(status_code=503, detail="Model loading...")

    logger.info(f"Processing request: {len(request.messages)} msgs")

    try:
        # llama-cpp-python handles the chat template automatically
        # NOTE(review): with stream=True this call returns an iterator rather
        # than a dict — confirm clients handle that (no StreamingResponse here).
        result = llm_model.create_chat_completion(
            messages=[m.model_dump() for m in request.messages],
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            stream=request.stream
        )
        return result
    except Exception as e:
        logger.error(f"Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))