Loomis Green committed on
Commit
cf85c62
·
1 Parent(s): d838982

Switch to Transformers + Qwen2.5-Coder-1.5B for instant build

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -12
  2. app.py +37 -29
  3. requirements.txt +4 -2
Dockerfile CHANGED
@@ -2,30 +2,21 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install build tools for llama-cpp-python and other dependencies
6
- RUN apt-get update && apt-get install -y \
7
- build-essential \
8
- cmake \
9
- git \
10
- wget \
11
- && rm -rf /var/lib/apt/lists/*
12
 
13
  # Copy requirements and install
14
  COPY requirements.txt .
15
- # Upgrade pip to ensure we can handle wheels correctly
16
- RUN pip install --upgrade pip
17
- # Install dependencies (this will build llama-cpp-python if needed)
18
  RUN pip install --no-cache-dir -r requirements.txt
19
 
20
  # Copy application code
21
  COPY . .
22
 
23
  # Create a writable directory for the Hugging Face cache
24
- # HF Spaces run with a specific user ID (usually 1000), so we ensure permissions
25
  ENV HF_HOME=/app/cache
26
  RUN mkdir -p /app/cache && chmod -R 777 /app/cache
27
 
28
- # Expose the port (standard for HF Spaces)
29
  EXPOSE 7860
30
 
31
  # Run the application
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install git (sometimes needed for transformers to download specific configs)
6
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
7
 
8
  # Copy requirements and install
9
  COPY requirements.txt .
 
 
 
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
  # Copy application code
13
  COPY . .
14
 
15
  # Create a writable directory for the Hugging Face cache
 
16
  ENV HF_HOME=/app/cache
17
  RUN mkdir -p /app/cache && chmod -R 777 /app/cache
18
 
19
+ # Expose the port
20
  EXPOSE 7860
21
 
22
  # Run the application
app.py CHANGED
@@ -2,27 +2,20 @@ from fastapi import FastAPI
2
  from fastapi.staticfiles import StaticFiles
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import FileResponse
5
- from huggingface_hub import hf_hub_download
6
- from llama_cpp import Llama
7
  import os
8
 
9
  # Define Model details
10
- REPO_ID = "roleplaiapp/Qwen2.5-Coder-14B-Instruct-Uncensored-Q4_K_S-GGUF"
11
- FILENAME = "Qwen2.5-Coder-14B-Instruct-Uncensored.Q4_K_S.gguf"
12
 
13
- print(f"Downloading {FILENAME} from {REPO_ID}...")
14
- model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
15
- print(f"Model downloaded to: {model_path}")
16
-
17
- print("Loading Llama model...")
18
- # Initialize Llama model
19
- # n_ctx=4096: Context window (RAM usage scales with this)
20
- # n_threads=2: Hugging Face Spaces free tier usually has 2 vCPUs
21
- llm = Llama(
22
- model_path=model_path,
23
- n_ctx=4096,
24
- n_threads=2,
25
- verbose=True
26
  )
27
  print("Model Loaded Successfully!")
28
 
@@ -36,7 +29,7 @@ DEFAULT_SYSTEM_PROMPT = {
36
  "You are chatting with a user named Loomis (unless they tell you otherwise). "
37
  "Your name is Loomyloo. The user's name is Loomis. "
38
  "Never confuse your name with the user's name. "
39
- "You are running on the powerful Qwen2.5-Coder-14B-Instruct-Uncensored model. "
40
  "Keep your answers concise, friendly, and helpful."
41
  )
42
  }
@@ -73,22 +66,37 @@ def ask(prompt: str):
73
 
74
  print(f"Current History Length: {len(conversation_history)}")
75
 
76
- # 3. Generate Response using llama-cpp-python chat completion
77
- response = llm.create_chat_completion(
78
- messages=conversation_history,
79
- max_tokens=512,
 
 
 
 
 
 
 
 
 
 
80
  temperature=0.7,
81
- top_p=0.9
 
82
  )
83
 
84
- # Extract text from response
85
- generated_text = response['choices'][0]['message']['content']
 
 
 
 
86
 
87
- # 4. Add Assistant Response to History
88
- conversation_history.append({"role": "assistant", "content": generated_text})
89
 
90
- # 5. Return Result (keeping format consistent with previous API)
91
- return {"generated_text": generated_text}
92
 
93
  # Serve Static Files
94
  app.mount("/static", StaticFiles(directory="static"), name="static")
 
2
  from fastapi.staticfiles import StaticFiles
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import FileResponse
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+ import torch
7
  import os
8
 
9
  # Define Model details
10
+ # We use the 1.5B model because it runs fast on CPU and installs instantly (no compilation needed).
11
+ MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
12
 
13
+ print(f"Loading {MODEL_ID}...")
14
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
15
+ model = AutoModelForCausalLM.from_pretrained(
16
+ MODEL_ID,
17
+ torch_dtype=torch.float32, # Use float32 for CPU compatibility
18
+ device_map="auto"
 
 
 
 
 
 
 
19
  )
20
  print("Model Loaded Successfully!")
21
 
 
29
  "You are chatting with a user named Loomis (unless they tell you otherwise). "
30
  "Your name is Loomyloo. The user's name is Loomis. "
31
  "Never confuse your name with the user's name. "
32
+ "You are running on the fast Qwen2.5-Coder-1.5B-Instruct model. "
33
  "Keep your answers concise, friendly, and helpful."
34
  )
35
  }
 
66
 
67
  print(f"Current History Length: {len(conversation_history)}")
68
 
69
+ # 3. Format inputs using the tokenizer's chat template
70
+ text = tokenizer.apply_chat_template(
71
+ conversation_history,
72
+ tokenize=False,
73
+ add_generation_prompt=True
74
+ )
75
+
76
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
77
+
78
+ # 4. Generate Response
79
+ # max_new_tokens: limit response length
80
+ generated_ids = model.generate(
81
+ **model_inputs,
82
+ max_new_tokens=512,
83
  temperature=0.7,
84
+ top_p=0.9,
85
+ do_sample=True
86
  )
87
 
88
+ # 5. Decode Response
89
+ # We strip the prompt from the output to get only the new text
90
+ generated_ids = [
91
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
92
+ ]
93
+ response_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
94
 
95
+ # 6. Add Assistant Response to History
96
+ conversation_history.append({"role": "assistant", "content": response_text})
97
 
98
+ # 7. Return Result
99
+ return {"generated_text": response_text}
100
 
101
  # Serve Static Files
102
  app.mount("/static", StaticFiles(directory="static"), name="static")
requirements.txt CHANGED
@@ -1,6 +1,8 @@
1
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
2
- llama-cpp-python
3
  fastapi[standard]
4
  uvicorn
5
  aiofiles
6
  huggingface_hub
 
 
 
 
 
 
 
1
  fastapi[standard]
2
  uvicorn
3
  aiofiles
4
  huggingface_hub
5
+ torch
6
+ transformers
7
+ accelerate
8
+ sentencepiece