scriptsledge committed on
Commit
9b12d46
·
verified ·
1 Parent(s): 71c6963

perf: switch to transformers library and native pytorch model for optimized inference

Browse files
Files changed (3) hide show
  1. Dockerfile +16 -38
  2. model_service.py +27 -33
  3. requirements.txt +3 -2
Dockerfile CHANGED
@@ -1,55 +1,33 @@
1
- # Stage 1: Builder
2
- FROM python:3.10-slim-bookworm AS builder
3
- WORKDIR /app
4
-
5
- # Install build tools
6
- RUN apt-get update && apt-get install -y \
7
- build-essential \
8
- cmake \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- # Install uv
12
- RUN pip install uv
13
-
14
- # Configure uv
15
- ENV UV_COMPILE_BYTECODE=1
16
- ENV UV_LINK_MODE=copy
17
-
18
- # Copy requirements
19
- COPY requirements.txt .
20
-
21
- # Create venv and install dependencies
22
- # We allow building from source for llama-cpp-python to ensure libc compatibility
23
- RUN uv venv /app/.venv && \
24
- uv pip install \
25
- --no-cache \
26
- -r requirements.txt \
27
- --python /app/.venv
28
-
29
- # Stage 2: Final Runtime Image
30
  FROM python:3.10-slim-bookworm
 
31
  WORKDIR /app
32
 
33
- # Install runtime dependencies (OpenMP support for llama.cpp)
 
34
  RUN apt-get update && apt-get install -y \
35
  libgomp1 \
 
36
  && rm -rf /var/lib/apt/lists/*
37
 
38
- # Copy the virtual environment from the builder stage
39
- COPY --from=builder /app/.venv /app/.venv
40
 
41
- # Set environment variables
42
- ENV PATH="/app/.venv/bin:$PATH"
 
43
 
44
- # Copy the application code
45
  COPY . .
46
 
47
- # Create a non-root user
48
  RUN useradd -m -u 1000 user
49
  USER user
50
  ENV HOME=/home/user
 
51
 
 
 
52
  EXPOSE 7860
53
- ENV MODEL_CTX_SIZE=8192
54
 
55
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.10-slim-bookworm

WORKDIR /app

# Runtime system packages:
#   libgomp1 -- OpenMP runtime, used by torch for CPU parallelism
#   git      -- NOTE(review): presumably kept for VCS-based installs/downloads; confirm still needed
RUN apt-get update && apt-get install -y \
    libgomp1 \
    git \
    && rm -rf /var/lib/apt/lists/*

# uv resolves and installs dependencies much faster than plain pip.
RUN pip install uv

# Install Python dependencies into the system interpreter (no venv).
COPY requirements.txt .
RUN uv pip install --no-cache-dir --system -r requirements.txt

# Application source.
COPY . .

# Hugging Face Spaces requires running as a non-root user with UID 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user
ENV PATH="/home/user/.local/bin:$PATH"

# Spaces routes traffic to port 7860; unbuffered stdout keeps container logs live.
EXPOSE 7860
ENV PYTHONUNBUFFERED=1

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
model_service.py CHANGED
@@ -1,41 +1,32 @@
1
  import os
2
- from llama_cpp import Llama
3
- from huggingface_hub import hf_hub_download
4
 
5
  # --- Configuration ---
6
- # Using the ultra-lightweight Qwen 2.5 Coder 0.5B
7
- # This is the fastest possible option for CPU/Edge devices.
8
- REPO_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF"
9
- FILENAME = "qwen2.5-coder-0.5b-instruct-q4_k_m.gguf"
10
 
11
- print(f"Initializing Clarity AI Engine (llama.cpp)...")
12
- print(f"Target Model: {REPO_ID} [{FILENAME}]")
13
 
14
- llm = None
15
 
16
  try:
17
- print("Downloading/Loading model...")
18
- model_path = hf_hub_download(
19
- repo_id=REPO_ID,
20
- filename=FILENAME,
21
- # This caches the model in ~/.cache/huggingface/hub
22
- )
23
-
24
- # Initialize Llama
25
- # Use environment variable to toggle context size (8192 for HF Spaces, 4096 for local)
26
- ctx_size = int(os.getenv("MODEL_CTX_SIZE", "4096"))
27
- llm = Llama(
28
- model_path=model_path,
29
- n_ctx=ctx_size,
30
- n_batch=512,
31
- n_threads=os.cpu_count(),
32
- verbose=False
33
  )
34
  print("Success: Clarity AI Model loaded.")
35
 
36
  except Exception as e:
37
  print(f"CRITICAL ERROR: Failed to load model. {e}")
38
- llm = None
39
 
40
  def detect_language(code: str) -> dict:
41
  """
@@ -120,7 +111,7 @@ def correct_code_with_ai(code: str) -> dict:
120
  """
121
  detected_lang = detect_language(code)
122
 
123
- if not llm:
124
  return {
125
  "code": "# Model failed to load. Check server logs.",
126
  "language": detected_lang
@@ -164,15 +155,18 @@ def correct_code_with_ai(code: str) -> dict:
164
  ]
165
 
166
  try:
167
- # llama-cpp-python chat completion
168
- response = llm.create_chat_completion(
169
- messages=messages,
170
- max_tokens=1024, # Optimized for 1.5B speed
171
  temperature=0.1, # Lower temperature for stricter adherence
 
172
  )
173
 
174
  # Extract content
175
- response_content = response["choices"][0]["message"]["content"]
 
 
176
 
177
  # Clean up (double check for markdown or chatty intros)
178
  cleaned_response = response_content.strip()
@@ -202,4 +196,4 @@ def correct_code_with_ai(code: str) -> dict:
202
  return {
203
  "code": f"# An error occurred during processing: {str(e)}",
204
  "language": detected_lang
205
- }
 
import os
from transformers import pipeline
import torch

# --- Configuration ---
# Standard Qwen 2.5 Coder 0.5B Instruct checkpoint (native PyTorch weights).
REPO_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

print("Initializing Clarity AI Engine (Transformers)...")
print(f"Target Model: {REPO_ID}")

# Module-level pipeline handle. Left as None when loading fails so that
# callers can detect the failure and degrade gracefully instead of the
# module raising at import time.
pipe = None

try:
    print("Loading model...")
    # device_map="auto" places the model on GPU when one is available,
    # otherwise on CPU; torch_dtype="auto" picks the matching precision
    # (typically fp16 on GPU, fp32 on CPU).
    pipe = pipeline(
        "text-generation",
        model=REPO_ID,
        torch_dtype="auto",
        device_map="auto",
    )
    print("Success: Clarity AI Model loaded.")
except Exception as e:
    print(f"CRITICAL ERROR: Failed to load model. {e}")
    pipe = None
31
  def detect_language(code: str) -> dict:
32
  """
 
111
  """
112
  detected_lang = detect_language(code)
113
 
114
+ if not pipe:
115
  return {
116
  "code": "# Model failed to load. Check server logs.",
117
  "language": detected_lang
 
155
  ]
156
 
157
  try:
158
+ # Transformers pipeline inference
159
+ outputs = pipe(
160
+ messages,
161
+ max_new_tokens=1024, # Optimized for 1.5B speed
162
  temperature=0.1, # Lower temperature for stricter adherence
163
+ do_sample=True, # Required for temperature usage
164
  )
165
 
166
  # Extract content
167
+ # Pipeline with list of messages returns a list containing one dict, which contains 'generated_text'.
168
+ # 'generated_text' is the list of messages (history + new response).
169
+ response_content = outputs[0]["generated_text"][-1]["content"]
170
 
171
  # Clean up (double check for markdown or chatty intros)
172
  cleaned_response = response_content.strip()
 
196
  return {
197
  "code": f"# An error occurred during processing: {str(e)}",
198
  "language": detected_lang
199
+ }
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  fastapi
2
  uvicorn
3
- llama-cpp-python==0.3.2
4
- huggingface-hub
 
 
# Web framework and ASGI server
fastapi
uvicorn
# Inference stack: native PyTorch model served via Transformers
transformers
torch
accelerate