scriptsledge committed on
Commit
9b12d46
·
verified ·
1 Parent(s): 71c6963

perf: switch to transformers library and native pytorch model for optimized inference

Browse files
Files changed (3) hide show
  1. Dockerfile +16 -38
  2. model_service.py +27 -33
  3. requirements.txt +3 -2
Dockerfile CHANGED
@@ -1,55 +1,33 @@
1
- # Stage 1: Builder
2
- FROM python:3.10-slim-bookworm AS builder
3
- WORKDIR /app
4
-
5
- # Install build tools
6
- RUN apt-get update && apt-get install -y \
7
- build-essential \
8
- cmake \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- # Install uv
12
- RUN pip install uv
13
-
14
- # Configure uv
15
- ENV UV_COMPILE_BYTECODE=1
16
- ENV UV_LINK_MODE=copy
17
-
18
- # Copy requirements
19
- COPY requirements.txt .
20
-
21
- # Create venv and install dependencies
22
- # We allow building from source for llama-cpp-python to ensure libc compatibility
23
- RUN uv venv /app/.venv && \
24
- uv pip install \
25
- --no-cache \
26
- -r requirements.txt \
27
- --python /app/.venv
28
-
29
- # Stage 2: Final Runtime Image
30
  FROM python:3.10-slim-bookworm
 
31
  WORKDIR /app
32
 
33
- # Install runtime dependencies (OpenMP support for llama.cpp)
 
34
  RUN apt-get update && apt-get install -y \
35
  libgomp1 \
 
36
  && rm -rf /var/lib/apt/lists/*
37
 
38
- # Copy the virtual environment from the builder stage
39
- COPY --from=builder /app/.venv /app/.venv
40
 
41
- # Set environment variables
42
- ENV PATH="/app/.venv/bin:$PATH"
 
43
 
44
- # Copy the application code
45
  COPY . .
46
 
47
- # Create a non-root user
48
  RUN useradd -m -u 1000 user
49
  USER user
50
  ENV HOME=/home/user
 
51
 
 
 
52
  EXPOSE 7860
53
- ENV MODEL_CTX_SIZE=8192
54
 
55
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim-bookworm

WORKDIR /app

# Runtime system packages:
#   libgomp1 -- OpenMP runtime, used by torch for CPU parallelism
#   git      -- NOTE(review): presumably kept for VCS-based installs/downloads; confirm still needed
RUN apt-get update && apt-get install -y \
    libgomp1 \
    git \
    && rm -rf /var/lib/apt/lists/*

# uv resolves and installs dependencies much faster than plain pip.
RUN pip install uv

# Install Python dependencies into the system interpreter (no venv).
COPY requirements.txt .
RUN uv pip install --no-cache-dir --system -r requirements.txt

# Application source.
COPY . .

# Hugging Face Spaces requires running as a non-root user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user
ENV PATH="/home/user/.local/bin:$PATH"

# Spaces routes traffic to port 7860; unbuffered stdout keeps container logs live.
EXPOSE 7860
ENV PYTHONUNBUFFERED=1

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
model_service.py CHANGED
@@ -1,41 +1,32 @@
1
  import os
2
- from llama_cpp import Llama
3
- from huggingface_hub import hf_hub_download
4
 
5
  # --- Configuration ---
6
- # Using the ultra-lightweight Qwen 2.5 Coder 0.5B
7
- # This is the fastest possible option for CPU/Edge devices.
8
- REPO_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF"
9
- FILENAME = "qwen2.5-coder-0.5b-instruct-q4_k_m.gguf"
10
 
11
- print(f"Initializing Clarity AI Engine (llama.cpp)...")
12
- print(f"Target Model: {REPO_ID} [{FILENAME}]")
13
 
14
- llm = None
15
 
16
  try:
17
- print("Downloading/Loading model...")
18
- model_path = hf_hub_download(
19
- repo_id=REPO_ID,
20
- filename=FILENAME,
21
- # This caches the model in ~/.cache/huggingface/hub
22
- )
23
-
24
- # Initialize Llama
25
- # Use environment variable to toggle context size (8192 for HF Spaces, 4096 for local)
26
- ctx_size = int(os.getenv("MODEL_CTX_SIZE", "4096"))
27
- llm = Llama(
28
- model_path=model_path,
29
- n_ctx=ctx_size,
30
- n_batch=512,
31
- n_threads=os.cpu_count(),
32
- verbose=False
33
  )
34
  print("Success: Clarity AI Model loaded.")
35
 
36
  except Exception as e:
37
  print(f"CRITICAL ERROR: Failed to load model. {e}")
38
- llm = None
39
 
40
  def detect_language(code: str) -> dict:
41
  """
@@ -120,7 +111,7 @@ def correct_code_with_ai(code: str) -> dict:
120
  """
121
  detected_lang = detect_language(code)
122
 
123
- if not llm:
124
  return {
125
  "code": "# Model failed to load. Check server logs.",
126
  "language": detected_lang
@@ -164,15 +155,18 @@ def correct_code_with_ai(code: str) -> dict:
164
  ]
165
 
166
  try:
167
- # llama-cpp-python chat completion
168
- response = llm.create_chat_completion(
169
- messages=messages,
170
- max_tokens=1024, # Optimized for 1.5B speed
171
  temperature=0.1, # Lower temperature for stricter adherence
 
172
  )
173
 
174
  # Extract content
175
- response_content = response["choices"][0]["message"]["content"]
 
 
176
 
177
  # Clean up (double check for markdown or chatty intros)
178
  cleaned_response = response_content.strip()
@@ -202,4 +196,4 @@ def correct_code_with_ai(code: str) -> dict:
202
  return {
203
  "code": f"# An error occurred during processing: {str(e)}",
204
  "language": detected_lang
205
- }
 
import os
from transformers import pipeline
import torch

# --- Configuration ---
# Standard Qwen 2.5 Coder 0.5B Instruct checkpoint (native PyTorch weights).
REPO_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

print("Initializing Clarity AI Engine (Transformers)...")
print(f"Target Model: {REPO_ID}")

# Module-level pipeline handle. Left as None when loading fails so that
# callers can detect the failure and degrade gracefully instead of the
# module raising at import time.
pipe = None

try:
    print("Loading model...")
    # device_map="auto" places the model on GPU when one is available,
    # otherwise on CPU; torch_dtype="auto" picks the matching precision
    # (typically fp16 on GPU, fp32 on CPU).
    pipe = pipeline(
        "text-generation",
        model=REPO_ID,
        torch_dtype="auto",
        device_map="auto",
    )
    print("Success: Clarity AI Model loaded.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load model. {e}")
    pipe = None
31
  def detect_language(code: str) -> dict:
32
  """
 
111
  """
112
  detected_lang = detect_language(code)
113
 
114
+ if not pipe:
115
  return {
116
  "code": "# Model failed to load. Check server logs.",
117
  "language": detected_lang
 
155
  ]
156
 
157
  try:
158
+ # Transformers pipeline inference
159
+ outputs = pipe(
160
+ messages,
161
+ max_new_tokens=1024, # Optimized for 1.5B speed
162
  temperature=0.1, # Lower temperature for stricter adherence
163
+ do_sample=True, # Required for temperature usage
164
  )
165
 
166
  # Extract content
167
+ # Pipeline with list of messages returns a list containing one dict, which contains 'generated_text'.
168
+ # 'generated_text' is the list of messages (history + new response).
169
+ response_content = outputs[0]["generated_text"][-1]["content"]
170
 
171
  # Clean up (double check for markdown or chatty intros)
172
  cleaned_response = response_content.strip()
 
196
  return {
197
  "code": f"# An error occurred during processing: {str(e)}",
198
  "language": detected_lang
199
+ }
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  fastapi
2
  uvicorn
3
- llama-cpp-python==0.3.2
4
- huggingface-hub
 
 
# Web framework and ASGI server
fastapi
uvicorn
# Inference stack: native PyTorch model served via Transformers
transformers
torch
accelerate