Loomis Green committed on
Commit
cf85c62
·
1 Parent(s): d838982

Switch to Transformers + Qwen2.5-Coder-1.5B for instant build

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -12
  2. app.py +37 -29
  3. requirements.txt +4 -2
Dockerfile CHANGED
@@ -2,30 +2,21 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install build tools for llama-cpp-python and other dependencies
6
- RUN apt-get update && apt-get install -y \
7
- build-essential \
8
- cmake \
9
- git \
10
- wget \
11
- && rm -rf /var/lib/apt/lists/*
12
 
13
  # Copy requirements and install
14
  COPY requirements.txt .
15
- # Upgrade pip to ensure we can handle wheels correctly
16
- RUN pip install --upgrade pip
17
- # Install dependencies (this will build llama-cpp-python if needed)
18
  RUN pip install --no-cache-dir -r requirements.txt
19
 
20
  # Copy application code
21
  COPY . .
22
 
23
  # Create a writable directory for the Hugging Face cache
24
- # HF Spaces run with a specific user ID (usually 1000), so we ensure permissions
25
  ENV HF_HOME=/app/cache
26
  RUN mkdir -p /app/cache && chmod -R 777 /app/cache
27
 
28
- # Expose the port (standard for HF Spaces)
29
  EXPOSE 7860
30
 
31
  # Run the application
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install git (sometimes needed for transformers to download specific configs)
6
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
 
 
 
 
7
 
8
  # Copy requirements and install
9
  COPY requirements.txt .
 
 
 
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
  # Copy application code
13
  COPY . .
14
 
15
  # Create a writable directory for the Hugging Face cache
 
16
  ENV HF_HOME=/app/cache
17
  RUN mkdir -p /app/cache && chmod -R 777 /app/cache
18
 
19
+ # Expose the port
20
  EXPOSE 7860
21
 
22
  # Run the application
app.py CHANGED
@@ -2,27 +2,20 @@ from fastapi import FastAPI
2
  from fastapi.staticfiles import StaticFiles
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import FileResponse
5
- from huggingface_hub import hf_hub_download
6
- from llama_cpp import Llama
7
  import os
8
 
9
  # Define Model details
10
- REPO_ID = "roleplaiapp/Qwen2.5-Coder-14B-Instruct-Uncensored-Q4_K_S-GGUF"
11
- FILENAME = "Qwen2.5-Coder-14B-Instruct-Uncensored.Q4_K_S.gguf"
12
 
13
- print(f"Downloading {FILENAME} from {REPO_ID}...")
14
- model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
15
- print(f"Model downloaded to: {model_path}")
16
-
17
- print("Loading Llama model...")
18
- # Initialize Llama model
19
- # n_ctx=4096: Context window (RAM usage scales with this)
20
- # n_threads=2: Hugging Face Spaces free tier usually has 2 vCPUs
21
- llm = Llama(
22
- model_path=model_path,
23
- n_ctx=4096,
24
- n_threads=2,
25
- verbose=True
26
  )
27
  print("Model Loaded Successfully!")
28
 
@@ -36,7 +29,7 @@ DEFAULT_SYSTEM_PROMPT = {
36
  "You are chatting with a user named Loomis (unless they tell you otherwise). "
37
  "Your name is Loomyloo. The user's name is Loomis. "
38
  "Never confuse your name with the user's name. "
39
- "You are running on the powerful Qwen2.5-Coder-14B-Instruct-Uncensored model. "
40
  "Keep your answers concise, friendly, and helpful."
41
  )
42
  }
@@ -73,22 +66,37 @@ def ask(prompt: str):
73
 
74
  print(f"Current History Length: {len(conversation_history)}")
75
 
76
- # 3. Generate Response using llama-cpp-python chat completion
77
- response = llm.create_chat_completion(
78
- messages=conversation_history,
79
- max_tokens=512,
 
 
 
 
 
 
 
 
 
 
80
  temperature=0.7,
81
- top_p=0.9
 
82
  )
83
 
84
- # Extract text from response
85
- generated_text = response['choices'][0]['message']['content']
 
 
 
 
86
 
87
- # 4. Add Assistant Response to History
88
- conversation_history.append({"role": "assistant", "content": generated_text})
89
 
90
- # 5. Return Result (keeping format consistent with previous API)
91
- return {"generated_text": generated_text}
92
 
93
  # Serve Static Files
94
  app.mount("/static", StaticFiles(directory="static"), name="static")
 
2
  from fastapi.staticfiles import StaticFiles
3
  from fastapi.middleware.cors import CORSMiddleware
4
  from fastapi.responses import FileResponse
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
+ import torch
7
  import os
8
 
9
  # Define Model details
10
+ # We use the 1.5B model because it runs fast on CPU and installs instantly (no compilation needed).
11
+ MODEL_ID = "Qwen/Qwen2.5-Coder-1.5B-Instruct"
12
 
13
+ print(f"Loading {MODEL_ID}...")
14
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
15
+ model = AutoModelForCausalLM.from_pretrained(
16
+ MODEL_ID,
17
+ torch_dtype=torch.float32, # Use float32 for CPU compatibility
18
+ device_map="auto"
 
 
 
 
 
 
 
19
  )
20
  print("Model Loaded Successfully!")
21
 
 
29
  "You are chatting with a user named Loomis (unless they tell you otherwise). "
30
  "Your name is Loomyloo. The user's name is Loomis. "
31
  "Never confuse your name with the user's name. "
32
+ "You are running on the fast Qwen2.5-Coder-1.5B-Instruct model. "
33
  "Keep your answers concise, friendly, and helpful."
34
  )
35
  }
 
66
 
67
  print(f"Current History Length: {len(conversation_history)}")
68
 
69
+ # 3. Format inputs using the tokenizer's chat template
70
+ text = tokenizer.apply_chat_template(
71
+ conversation_history,
72
+ tokenize=False,
73
+ add_generation_prompt=True
74
+ )
75
+
76
+ model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
77
+
78
+ # 4. Generate Response
79
+ # max_new_tokens: limit response length
80
+ generated_ids = model.generate(
81
+ **model_inputs,
82
+ max_new_tokens=512,
83
  temperature=0.7,
84
+ top_p=0.9,
85
+ do_sample=True
86
  )
87
 
88
+ # 5. Decode Response
89
+ # We strip the prompt from the output to get only the new text
90
+ generated_ids = [
91
+ output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
92
+ ]
93
+ response_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
94
 
95
+ # 6. Add Assistant Response to History
96
+ conversation_history.append({"role": "assistant", "content": response_text})
97
 
98
+ # 7. Return Result
99
+ return {"generated_text": response_text}
100
 
101
  # Serve Static Files
102
  app.mount("/static", StaticFiles(directory="static"), name="static")
requirements.txt CHANGED
@@ -1,6 +1,8 @@
1
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
2
- llama-cpp-python
3
  fastapi[standard]
4
  uvicorn
5
  aiofiles
6
  huggingface_hub
 
 
 
 
 
 
 
1
  fastapi[standard]
2
  uvicorn
3
  aiofiles
4
  huggingface_hub
5
+ torch
6
+ transformers
7
+ accelerate
8
+ sentencepiece