Spaces:

Nx5hh23
/

codecraft-ai

Sleeping

App Files Community

Nx5hh23 committed 2 days ago

Commit

635d0b8

verified ·

1 Parent(s): 7407b44

Upload 8 files

Browse files

Files changed (3) hide show

Dockerfile +6 -8
api.py +53 -52
requirements.txt +1 -3

Dockerfile CHANGED Viewed

@@ -4,7 +4,7 @@ FROM python:3.11-slim
 # Set up environment variables
 ENV PYTHONUNBUFFERED=1 \
     HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
-    TRANSFORMERS_CACHE=/app/model_cache
 # Set up working directory
 WORKDIR /app
@@ -12,15 +12,13 @@ WORKDIR /app
 # Copy requirements
 COPY requirements.txt .
-# Install dependencies (CPU-optimized PyTorch to fit standard free CPU containers)
-RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
-    pip install --no-cache-dir -r requirements.txt
-# Pre-download and cache the uncensored model weights during build time
 # This ensures the Hugging Face Space starts instantly and never times out on startup
-RUN python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; \
-    AutoTokenizer.from_pretrained('huihui-ai/Qwen2.5-Coder-1.5B-Instruct-abliterated', cache_dir='/app/model_cache'); \
-    AutoModelForCausalLM.from_pretrained('huihui-ai/Qwen2.5-Coder-1.5B-Instruct-abliterated', cache_dir='/app/model_cache')"
 # Copy the rest of the application files
 COPY . .

 # Set up environment variables
 ENV PYTHONUNBUFFERED=1 \
     HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
+    HF_HUB_CACHE=/app/model_cache
 # Set up working directory
 WORKDIR /app
 # Copy requirements
 COPY requirements.txt .
+# Install dependencies (use the precompiled CPU wheel for llama-cpp-python to keep builds fast and successful)
+RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+# Pre-download and cache the GGUF model weights during build time
 # This ensures the Hugging Face Space starts instantly and never times out on startup
+RUN python -c "from huggingface_hub import hf_hub_download; \
+    hf_hub_download(repo_id='mradermacher/Qwen2.5-Coder-1.5B-Instruct-abliterated-GGUF', filename='Qwen2.5-Coder-1.5B-Instruct-abliterated.Q5_K_M.gguf')"
 # Copy the rest of the application files
 COPY . .

api.py CHANGED Viewed

@@ -4,50 +4,54 @@ from contextlib import asynccontextmanager
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, FileResponse
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-# Global model and tokenizer handles
-model = None
-tokenizer = None
 model_loading = False
 model_loaded = False
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model_name = "huihui-ai/Qwen2.5-Coder-1.5B-Instruct-abliterated"
 def load_model():
-    global model, tokenizer, model_loaded, model_loading
     if model_loaded or model_loading:
         return
     model_loading = True
-    print(f"[CodeCraft AI] Loading model '{model_name}' on device '{device}'...")
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-            device_map="auto"
         )
         model_loaded = True
         model_loading = False
-        print(f"[CodeCraft AI] Success! Model loaded on {model.device}.")
     except Exception as e:
         model_loading = False
-        print(f"[CodeCraft AI] Error loading model: {e}")
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Load model on startup in a separate thread so the server starts instantly
     threading.Thread(target=load_model).start()
     yield
-    # Cleanup on shutdown
-    global model, tokenizer
-    if model is not None:
-        del model
-    if tokenizer is not None:
-        del tokenizer
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
 app = FastAPI(lifespan=lifespan)
@@ -72,7 +76,7 @@ async def get_status():
 @app.post("/api/chat")
 async def chat(request: Request):
-    global model, tokenizer, model_loaded
     if not model_loaded:
         return StreamingResponse(
@@ -82,8 +86,8 @@ async def chat(request: Request):
     data = await request.json()
     messages = data.get("messages", [])
-    temperature = float(data.get("temperature", 0.7))
-    max_tokens = int(data.get("max_tokens", 2048))
     # Inject system instructions to optimize for programming and Luau/Python
     has_system = any(msg.get("role") == "system" for msg in messages)
@@ -98,38 +102,35 @@ async def chat(request: Request):
         )
         messages.insert(0, {"role": "system", "content": system_prompt})
-    # Generate tokens using transformers TextIteratorStreamer
     try:
-        text = tokenizer.apply_chat_template(
-            messages,
-            tokenize=False,
-            add_generation_prompt=True
-        )
-        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
-        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(
-            model_inputs,
-            streamer=streamer,
-            max_new_tokens=max_tokens,
-            do_sample=temperature > 0,
-            temperature=temperature if temperature > 0 else None,
-            top_p=0.9 if temperature > 0 else None,
         )
-        # Start generation in a background thread
-        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
         def token_generator():
-            for new_text in streamer:
-                yield new_text
         return StreamingResponse(token_generator(), media_type="text/event-stream")
     except Exception as e:
-        print(f"[CodeCraft AI] Error during inference: {e}")
         return StreamingResponse(
-            (chunk for chunk in [f"An error occurred in the local inference engine: {str(e)}"]),
             media_type="text/plain"
         )

 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import StreamingResponse, FileResponse
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+# Global model handles
+llm = None
 model_loading = False
 model_loaded = False
+device = "cpu" # Default representation for GGUF execution state
+repo_id = "mradermacher/Qwen2.5-Coder-1.5B-Instruct-abliterated-GGUF"
+filename = "Qwen2.5-Coder-1.5B-Instruct-abliterated.Q5_K_M.gguf"
 def load_model():
+    global llm, model_loaded, model_loading
     if model_loaded or model_loading:
         return
     model_loading = True
+    print(f"[CodeCraft AI] Downloading and caching GGUF model '{filename}'...")
     try:
+        # Download GGUF file (cached automatically)
+        model_path = hf_hub_download(repo_id=repo_id, filename=filename)
+        print(f"[CodeCraft AI] Loading model via llama-cpp-python...")
+        # Load GGUF engine optimized for CPU/Threads
+        # n_threads set to 4 (good default for virtual spaces/CPUs)
+        # n_ctx set to 2048 for solid code context length
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=2048,
+            n_threads=4,
+            verbose=False
         )
         model_loaded = True
         model_loading = False
+        print("[CodeCraft AI] Success! GGUF model loaded successfully.")
     except Exception as e:
         model_loading = False
+        print(f"[CodeCraft AI] Error loading GGUF model: {e}")
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    # Load model on startup in a separate thread so server starts instantly
     threading.Thread(target=load_model).start()
     yield
+    # Cleanup
+    global llm
+    if llm is not None:
+        del llm
 app = FastAPI(lifespan=lifespan)
 @app.post("/api/chat")
 async def chat(request: Request):
+    global llm, model_loaded
     if not model_loaded:
         return StreamingResponse(
     data = await request.json()
     messages = data.get("messages", [])
+    temperature = float(data.get("temperature", 0.5))
+    max_tokens = int(data.get("max_tokens", 1024))
     # Inject system instructions to optimize for programming and Luau/Python
     has_system = any(msg.get("role") == "system" for msg in messages)
         )
         messages.insert(0, {"role": "system", "content": system_prompt})
     try:
+        # Build prompt using Qwen template
+        prompt = ""
+        for msg in messages:
+            role = msg.get("role")
+            content = msg.get("content")
+            prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+        prompt += "<|im_start|>assistant\n"
+        # Stream generation
+        response_stream = llm(
+            prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=0.9,
+            stream=True
         )
         def token_generator():
+            for chunk in response_stream:
+                text = chunk["choices"][0]["text"]
+                if text:
+                    yield text
         return StreamingResponse(token_generator(), media_type="text/event-stream")
     except Exception as e:
+        print(f"[CodeCraft AI] Error during GGUF inference: {e}")
         return StreamingResponse(
+            (chunk for chunk in [f"An error occurred in the local GGUF engine: {str(e)}"]),
             media_type="text/plain"
         )

requirements.txt CHANGED Viewed

@@ -1,6 +1,4 @@
 fastapi
 uvicorn
-transformers
-torch
 huggingface_hub
-accelerate

 fastapi
 uvicorn
 huggingface_hub
+llama-cpp-python