Spaces:
Sleeping
Sleeping
Loomis Green committed on
Commit ·
cf85c62
1
Parent(s): d838982
Switch to Transformers + Qwen2.5-Coder-1.5B for instant build
Browse files
- Dockerfile +3 -12
- app.py +37 -29
- requirements.txt +4 -2
Dockerfile
CHANGED
|
@@ -2,30 +2,21 @@ FROM python:3.10-slim
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
-
# Install
|
| 6 |
-
RUN apt-get update && apt-get install -y
|
| 7 |
-
build-essential \
|
| 8 |
-
cmake \
|
| 9 |
-
git \
|
| 10 |
-
wget \
|
| 11 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
|
| 13 |
# Copy requirements and install
|
| 14 |
COPY requirements.txt .
|
| 15 |
-
# Upgrade pip to ensure we can handle wheels correctly
|
| 16 |
-
RUN pip install --upgrade pip
|
| 17 |
-
# Install dependencies (this will build llama-cpp-python if needed)
|
| 18 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 19 |
|
| 20 |
# Copy application code
|
| 21 |
COPY . .
|
| 22 |
|
| 23 |
# Create a writable directory for the Hugging Face cache
|
| 24 |
-
# HF Spaces run with a specific user ID (usually 1000), so we ensure permissions
|
| 25 |
ENV HF_HOME=/app/cache
|
| 26 |
RUN mkdir -p /app/cache && chmod -R 777 /app/cache
|
| 27 |
|
| 28 |
-
# Expose the port
|
| 29 |
EXPOSE 7860
|
| 30 |
|
| 31 |
# Run the application
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
|
| 5 |
+
# Install git (sometimes needed for transformers to download specific configs)
|
| 6 |
+
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
# Copy requirements and install
|
| 9 |
COPY requirements.txt .
|
|
|
|
|
|
|
|
|
|
| 10 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
|
| 12 |
# Copy application code
|
| 13 |
COPY . .
|
| 14 |
|
| 15 |
# Create a writable directory for the Hugging Face cache
|
|
|
|
| 16 |
ENV HF_HOME=/app/cache
|
| 17 |
RUN mkdir -p /app/cache && chmod -R 777 /app/cache
|
| 18 |
|
| 19 |
+
# Expose the port
|
| 20 |
EXPOSE 7860
|
| 21 |
|
| 22 |
# Run the application
|
app.py
CHANGED
|
@@ -2,27 +2,20 @@ from fastapi import FastAPI
|
|
| 2 |
from fastapi.staticfiles import StaticFiles
|
| 3 |
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
from fastapi.responses import FileResponse
|
| 5 |
-
from
|
| 6 |
-
|
| 7 |
import os
|
| 8 |
|
| 9 |
# Define Model details
|
| 10 |
-
|
| 11 |
-
|
| 12 |
|
| 13 |
-
print(f"
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
# n_ctx=4096: Context window (RAM usage scales with this)
|
| 20 |
-
# n_threads=2: Hugging Face Spaces free tier usually has 2 vCPUs
|
| 21 |
-
llm = Llama(
|
| 22 |
-
model_path=model_path,
|
| 23 |
-
n_ctx=4096,
|
| 24 |
-
n_threads=2,
|
| 25 |
-
verbose=True
|
| 26 |
)
|
| 27 |
print("Model Loaded Successfully!")
|
| 28 |
|
|
@@ -36,7 +29,7 @@ DEFAULT_SYSTEM_PROMPT = {
|
|
| 36 |
"You are chatting with a user named Loomis (unless they tell you otherwise). "
|
| 37 |
"Your name is Loomyloo. The user's name is Loomis. "
|
| 38 |
"Never confuse your name with the user's name. "
|
| 39 |
-
"You are running on the
|
| 40 |
"Keep your answers concise, friendly, and helpful."
|
| 41 |
)
|
| 42 |
}
|
|
@@ -73,22 +66,37 @@ def ask(prompt: str):
|
|
| 73 |
|
| 74 |
print(f"Current History Length: {len(conversation_history)}")
|
| 75 |
|
| 76 |
-
# 3.
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
temperature=0.7,
|
| 81 |
-
top_p=0.9
|
|
|
|
| 82 |
)
|
| 83 |
|
| 84 |
-
#
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
#
|
| 88 |
-
conversation_history.append({"role": "assistant", "content":
|
| 89 |
|
| 90 |
-
#
|
| 91 |
-
return {"generated_text":
|
| 92 |
|
| 93 |
# Serve Static Files
|
| 94 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
|
|
|
| 2 |
from fastapi.staticfiles import StaticFiles
|
| 3 |
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
from fastapi.responses import FileResponse
|
| 5 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 6 |
+
import torch
|
| 7 |
import os
|
| 8 |
|
| 9 |
# Define Model details
|
| 10 |
+
# We use the 1.5B model because it runs fast on CPU and installs instantly (no compilation needed).
|
| 11 |
+
MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
|
| 12 |
|
| 13 |
+
print(f"Loading {MODEL_ID}...")
|
| 14 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
|
| 15 |
+
model = AutoModelForCausalLM.from_pretrained(
|
| 16 |
+
MODEL_ID,
|
| 17 |
+
torch_dtype=torch.float32, # Use float32 for CPU compatibility
|
| 18 |
+
device_map="auto"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
)
|
| 20 |
print("Model Loaded Successfully!")
|
| 21 |
|
|
|
|
| 29 |
"You are chatting with a user named Loomis (unless they tell you otherwise). "
|
| 30 |
"Your name is Loomyloo. The user's name is Loomis. "
|
| 31 |
"Never confuse your name with the user's name. "
|
| 32 |
+
"You are running on the fast Qwen2.5-Coder-1.5B-Instruct model. "
|
| 33 |
"Keep your answers concise, friendly, and helpful."
|
| 34 |
)
|
| 35 |
}
|
|
|
|
| 66 |
|
| 67 |
print(f"Current History Length: {len(conversation_history)}")
|
| 68 |
|
| 69 |
+
# 3. Format inputs using the tokenizer's chat template
|
| 70 |
+
text = tokenizer.apply_chat_template(
|
| 71 |
+
conversation_history,
|
| 72 |
+
tokenize=False,
|
| 73 |
+
add_generation_prompt=True
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
|
| 77 |
+
|
| 78 |
+
# 4. Generate Response
|
| 79 |
+
# max_new_tokens: limit response length
|
| 80 |
+
generated_ids = model.generate(
|
| 81 |
+
**model_inputs,
|
| 82 |
+
max_new_tokens=512,
|
| 83 |
temperature=0.7,
|
| 84 |
+
top_p=0.9,
|
| 85 |
+
do_sample=True
|
| 86 |
)
|
| 87 |
|
| 88 |
+
# 5. Decode Response
|
| 89 |
+
# We strip the prompt from the output to get only the new text
|
| 90 |
+
generated_ids = [
|
| 91 |
+
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
| 92 |
+
]
|
| 93 |
+
response_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 94 |
|
| 95 |
+
# 6. Add Assistant Response to History
|
| 96 |
+
conversation_history.append({"role": "assistant", "content": response_text})
|
| 97 |
|
| 98 |
+
# 7. Return Result
|
| 99 |
+
return {"generated_text": response_text}
|
| 100 |
|
| 101 |
# Serve Static Files
|
| 102 |
app.mount("/static", StaticFiles(directory="static"), name="static")
|
requirements.txt
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
-
--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
|
| 2 |
-
llama-cpp-python
|
| 3 |
fastapi[standard]
|
| 4 |
uvicorn
|
| 5 |
aiofiles
|
| 6 |
huggingface_hub
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
fastapi[standard]
|
| 2 |
uvicorn
|
| 3 |
aiofiles
|
| 4 |
huggingface_hub
|
| 5 |
+
torch
|
| 6 |
+
transformers
|
| 7 |
+
accelerate
|
| 8 |
+
sentencepiece
|