Spaces:
Sleeping
Sleeping
Upload 8 files
Browse files- Dockerfile +6 -8
- api.py +53 -52
- requirements.txt +1 -3
Dockerfile
CHANGED
|
@@ -4,7 +4,7 @@ FROM python:3.11-slim
|
|
| 4 |
# Set up environment variables
|
| 5 |
ENV PYTHONUNBUFFERED=1 \
|
| 6 |
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
| 7 |
-
|
| 8 |
|
| 9 |
# Set up working directory
|
| 10 |
WORKDIR /app
|
|
@@ -12,15 +12,13 @@ WORKDIR /app
|
|
| 12 |
# Copy requirements
|
| 13 |
COPY requirements.txt .
|
| 14 |
|
| 15 |
-
# Install dependencies (CPU
|
| 16 |
-
RUN pip install --no-cache-dir
|
| 17 |
-
pip install --no-cache-dir -r requirements.txt
|
| 18 |
|
| 19 |
-
# Pre-download and cache the
|
| 20 |
# This ensures the Hugging Face Space starts instantly and never times out on startup
|
| 21 |
-
RUN python -c "from
|
| 22 |
-
|
| 23 |
-
AutoModelForCausalLM.from_pretrained('huihui-ai/Qwen2.5-Coder-1.5B-Instruct-abliterated', cache_dir='/app/model_cache')"
|
| 24 |
|
| 25 |
# Copy the rest of the application files
|
| 26 |
COPY . .
|
|
|
|
| 4 |
# Set up environment variables
|
| 5 |
ENV PYTHONUNBUFFERED=1 \
|
| 6 |
HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
|
| 7 |
+
HF_HUB_CACHE=/app/model_cache
|
| 8 |
|
| 9 |
# Set up working directory
|
| 10 |
WORKDIR /app
|
|
|
|
| 12 |
# Copy requirements
|
| 13 |
COPY requirements.txt .
|
| 14 |
|
| 15 |
+
# Install dependencies (use the precompiled CPU wheel for llama-cpp-python to keep builds fast and successful)
|
| 16 |
+
RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
|
|
|
| 17 |
|
| 18 |
+
# Pre-download and cache the GGUF model weights during build time
|
| 19 |
# This ensures the Hugging Face Space starts instantly and never times out on startup
|
| 20 |
+
RUN python -c "from huggingface_hub import hf_hub_download; \
|
| 21 |
+
hf_hub_download(repo_id='mradermacher/Qwen2.5-Coder-1.5B-Instruct-abliterated-GGUF', filename='Qwen2.5-Coder-1.5B-Instruct-abliterated.Q5_K_M.gguf')"
|
|
|
|
| 22 |
|
| 23 |
# Copy the rest of the application files
|
| 24 |
COPY . .
|
api.py
CHANGED
|
@@ -4,50 +4,54 @@ from contextlib import asynccontextmanager
|
|
| 4 |
from fastapi import FastAPI, Request
|
| 5 |
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
from fastapi.responses import StreamingResponse, FileResponse
|
| 7 |
-
import
|
| 8 |
-
from
|
| 9 |
|
| 10 |
-
# Global model
|
| 11 |
-
|
| 12 |
-
tokenizer = None
|
| 13 |
model_loading = False
|
| 14 |
model_loaded = False
|
| 15 |
-
device = "
|
| 16 |
-
|
|
|
|
| 17 |
|
| 18 |
def load_model():
|
| 19 |
-
global
|
| 20 |
if model_loaded or model_loading:
|
| 21 |
return
|
| 22 |
model_loading = True
|
| 23 |
-
print(f"[CodeCraft AI]
|
| 24 |
try:
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
|
|
|
| 31 |
model_loaded = True
|
| 32 |
model_loading = False
|
| 33 |
-
print(
|
| 34 |
except Exception as e:
|
| 35 |
model_loading = False
|
| 36 |
-
print(f"[CodeCraft AI] Error loading model: {e}")
|
| 37 |
|
| 38 |
@asynccontextmanager
|
| 39 |
async def lifespan(app: FastAPI):
|
| 40 |
-
# Load model on startup in a separate thread so
|
| 41 |
threading.Thread(target=load_model).start()
|
| 42 |
yield
|
| 43 |
-
# Cleanup
|
| 44 |
-
global
|
| 45 |
-
if
|
| 46 |
-
del
|
| 47 |
-
if tokenizer is not None:
|
| 48 |
-
del tokenizer
|
| 49 |
-
if torch.cuda.is_available():
|
| 50 |
-
torch.cuda.empty_cache()
|
| 51 |
|
| 52 |
app = FastAPI(lifespan=lifespan)
|
| 53 |
|
|
@@ -72,7 +76,7 @@ async def get_status():
|
|
| 72 |
|
| 73 |
@app.post("/api/chat")
|
| 74 |
async def chat(request: Request):
|
| 75 |
-
global
|
| 76 |
|
| 77 |
if not model_loaded:
|
| 78 |
return StreamingResponse(
|
|
@@ -82,8 +86,8 @@ async def chat(request: Request):
|
|
| 82 |
|
| 83 |
data = await request.json()
|
| 84 |
messages = data.get("messages", [])
|
| 85 |
-
temperature = float(data.get("temperature", 0.
|
| 86 |
-
max_tokens = int(data.get("max_tokens",
|
| 87 |
|
| 88 |
# Inject system instructions to optimize for programming and Luau/Python
|
| 89 |
has_system = any(msg.get("role") == "system" for msg in messages)
|
|
@@ -98,38 +102,35 @@ async def chat(request: Request):
|
|
| 98 |
)
|
| 99 |
messages.insert(0, {"role": "system", "content": system_prompt})
|
| 100 |
|
| 101 |
-
# Generate tokens using transformers TextIteratorStreamer
|
| 102 |
try:
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
)
|
| 119 |
|
| 120 |
-
# Start generation in a background thread
|
| 121 |
-
thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
|
| 122 |
-
thread.start()
|
| 123 |
-
|
| 124 |
def token_generator():
|
| 125 |
-
for
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
| 128 |
return StreamingResponse(token_generator(), media_type="text/event-stream")
|
| 129 |
except Exception as e:
|
| 130 |
-
print(f"[CodeCraft AI] Error during inference: {e}")
|
| 131 |
return StreamingResponse(
|
| 132 |
-
(chunk for chunk in [f"An error occurred in the local
|
| 133 |
media_type="text/plain"
|
| 134 |
)
|
| 135 |
|
|
|
|
| 4 |
from fastapi import FastAPI, Request
|
| 5 |
from fastapi.middleware.cors import CORSMiddleware
|
| 6 |
from fastapi.responses import StreamingResponse, FileResponse
|
| 7 |
+
from huggingface_hub import hf_hub_download
|
| 8 |
+
from llama_cpp import Llama
|
| 9 |
|
| 10 |
+
# Global model handles
|
| 11 |
+
llm = None
|
|
|
|
| 12 |
model_loading = False
|
| 13 |
model_loaded = False
|
| 14 |
+
device = "cpu" # Default representation for GGUF execution state
|
| 15 |
+
repo_id = "mradermacher/Qwen2.5-Coder-1.5B-Instruct-abliterated-GGUF"
|
| 16 |
+
filename = "Qwen2.5-Coder-1.5B-Instruct-abliterated.Q5_K_M.gguf"
|
| 17 |
|
| 18 |
def load_model():
    """Fetch the GGUF weights and build the module-level llama.cpp engine.

    Returns immediately when the model is already loaded or a load is in
    progress. Otherwise it resolves the weights file with
    ``hf_hub_download`` and constructs a ``Llama`` instance into the global
    ``llm``. ``model_loading`` is true for the duration of the attempt and
    ``model_loaded`` becomes true only on success.

    Any exception raised during download or engine construction is printed
    and swallowed, so callers never see a failure; both flags are left
    false in that case.
    """
    global llm, model_loaded, model_loading

    already_handled = model_loaded or model_loading
    if already_handled:
        return

    model_loading = True
    print(f"[CodeCraft AI] Downloading and caching GGUF model '{filename}'...")
    try:
        # Download GGUF file (cached automatically)
        weights_path = hf_hub_download(repo_id=repo_id, filename=filename)

        print("[CodeCraft AI] Loading model via llama-cpp-python...")
        # Engine settings: 4 threads as a default for small virtual CPUs,
        # and a 2048-token context window for code prompts.
        engine_options = {
            "model_path": weights_path,
            "n_ctx": 2048,
            "n_threads": 4,
            "verbose": False,
        }
        llm = Llama(**engine_options)

        model_loaded = True
        model_loading = False
        print("[CodeCraft AI] Success! GGUF model loaded successfully.")
    except Exception as e:
        model_loading = False
        print(f"[CodeCraft AI] Error loading GGUF model: {e}")
|
| 45 |
|
| 46 |
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: start model loading, release it on shutdown.

    On startup, ``load_model`` is launched in a separate thread so the server
    can begin accepting requests before the model is ready. Control is then
    yielded to the application. On shutdown, the reference held in the
    module-level ``llm`` is dropped.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global llm

    # Load model on startup in a separate thread so server starts instantly
    threading.Thread(target=load_model).start()
    yield
    # Cleanup: rebind to None instead of ``del llm``. ``del`` on a global
    # removes the name from the module, so any later reference to ``llm``
    # would raise NameError instead of observing "no model".
    llm = None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
app = FastAPI(lifespan=lifespan)
|
| 57 |
|
|
|
|
| 76 |
|
| 77 |
@app.post("/api/chat")
|
| 78 |
async def chat(request: Request):
|
| 79 |
+
global llm, model_loaded
|
| 80 |
|
| 81 |
if not model_loaded:
|
| 82 |
return StreamingResponse(
|
|
|
|
| 86 |
|
| 87 |
data = await request.json()
|
| 88 |
messages = data.get("messages", [])
|
| 89 |
+
temperature = float(data.get("temperature", 0.5))
|
| 90 |
+
max_tokens = int(data.get("max_tokens", 1024))
|
| 91 |
|
| 92 |
# Inject system instructions to optimize for programming and Luau/Python
|
| 93 |
has_system = any(msg.get("role") == "system" for msg in messages)
|
|
|
|
| 102 |
)
|
| 103 |
messages.insert(0, {"role": "system", "content": system_prompt})
|
| 104 |
|
|
|
|
| 105 |
try:
|
| 106 |
+
# Build prompt using Qwen template
|
| 107 |
+
prompt = ""
|
| 108 |
+
for msg in messages:
|
| 109 |
+
role = msg.get("role")
|
| 110 |
+
content = msg.get("content")
|
| 111 |
+
prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
|
| 112 |
+
prompt += "<|im_start|>assistant\n"
|
| 113 |
|
| 114 |
+
# Stream generation
|
| 115 |
+
response_stream = llm(
|
| 116 |
+
prompt,
|
| 117 |
+
max_tokens=max_tokens,
|
| 118 |
+
temperature=temperature,
|
| 119 |
+
top_p=0.9,
|
| 120 |
+
stream=True
|
| 121 |
)
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
def token_generator():
|
| 124 |
+
for chunk in response_stream:
|
| 125 |
+
text = chunk["choices"][0]["text"]
|
| 126 |
+
if text:
|
| 127 |
+
yield text
|
| 128 |
+
|
| 129 |
return StreamingResponse(token_generator(), media_type="text/event-stream")
|
| 130 |
except Exception as e:
|
| 131 |
+
print(f"[CodeCraft AI] Error during GGUF inference: {e}")
|
| 132 |
return StreamingResponse(
|
| 133 |
+
(chunk for chunk in [f"An error occurred in the local GGUF engine: {str(e)}"]),
|
| 134 |
media_type="text/plain"
|
| 135 |
)
|
| 136 |
|
requirements.txt
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
fastapi
|
| 2 |
uvicorn
|
| 3 |
-
transformers
|
| 4 |
-
torch
|
| 5 |
huggingface_hub
|
| 6 |
-
|
|
|
|
| 1 |
fastapi
|
| 2 |
uvicorn
|
|
|
|
|
|
|
| 3 |
huggingface_hub
|
| 4 |
+
llama-cpp-python
|