Nx5hh23 committed on
Commit
635d0b8
·
verified ·
1 Parent(s): 7407b44

Upload 8 files

Browse files
Files changed (3) hide show
  1. Dockerfile +6 -8
  2. api.py +53 -52
  3. requirements.txt +1 -3
Dockerfile CHANGED
@@ -4,7 +4,7 @@ FROM python:3.11-slim
4
  # Set up environment variables
5
  ENV PYTHONUNBUFFERED=1 \
6
  HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
7
- TRANSFORMERS_CACHE=/app/model_cache
8
 
9
  # Set up working directory
10
  WORKDIR /app
@@ -12,15 +12,13 @@ WORKDIR /app
12
  # Copy requirements
13
  COPY requirements.txt .
14
 
15
- # Install dependencies (CPU-optimized PyTorch to fit standard free CPU containers)
16
- RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
17
- pip install --no-cache-dir -r requirements.txt
18
 
19
- # Pre-download and cache the uncensored model weights during build time
20
  # This ensures the Hugging Face Space starts instantly and never times out on startup
21
- RUN python -c "from transformers import AutoTokenizer, AutoModelForCausalLM; \
22
- AutoTokenizer.from_pretrained('huihui-ai/Qwen2.5-Coder-1.5B-Instruct-abliterated', cache_dir='/app/model_cache'); \
23
- AutoModelForCausalLM.from_pretrained('huihui-ai/Qwen2.5-Coder-1.5B-Instruct-abliterated', cache_dir='/app/model_cache')"
24
 
25
  # Copy the rest of the application files
26
  COPY . .
 
4
  # Set up environment variables
5
  ENV PYTHONUNBUFFERED=1 \
6
  HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
7
+ HF_HUB_CACHE=/app/model_cache
8
 
9
  # Set up working directory
10
  WORKDIR /app
 
12
  # Copy requirements
13
  COPY requirements.txt .
14
 
15
+ # Install dependencies (use the precompiled CPU wheel for llama-cpp-python to keep builds fast and successful)
16
+ RUN pip install --no-cache-dir -r requirements.txt --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 
17
 
18
+ # Pre-download and cache the GGUF model weights during build time
19
  # This ensures the Hugging Face Space starts instantly and never times out on startup
20
+ RUN python -c "from huggingface_hub import hf_hub_download; \
21
+ hf_hub_download(repo_id='mradermacher/Qwen2.5-Coder-1.5B-Instruct-abliterated-GGUF', filename='Qwen2.5-Coder-1.5B-Instruct-abliterated.Q5_K_M.gguf')"
 
22
 
23
  # Copy the rest of the application files
24
  COPY . .
api.py CHANGED
@@ -4,50 +4,54 @@ from contextlib import asynccontextmanager
4
  from fastapi import FastAPI, Request
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from fastapi.responses import StreamingResponse, FileResponse
7
- import torch
8
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
9
 
10
- # Global model and tokenizer handles
11
- model = None
12
- tokenizer = None
13
  model_loading = False
14
  model_loaded = False
15
- device = "cuda" if torch.cuda.is_available() else "cpu"
16
- model_name = "huihui-ai/Qwen2.5-Coder-1.5B-Instruct-abliterated"
 
17
 
18
  def load_model():
19
- global model, tokenizer, model_loaded, model_loading
20
  if model_loaded or model_loading:
21
  return
22
  model_loading = True
23
- print(f"[CodeCraft AI] Loading model '{model_name}' on device '{device}'...")
24
  try:
25
- tokenizer = AutoTokenizer.from_pretrained(model_name)
26
- model = AutoModelForCausalLM.from_pretrained(
27
- model_name,
28
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
29
- device_map="auto"
 
 
 
 
 
 
 
30
  )
 
31
  model_loaded = True
32
  model_loading = False
33
- print(f"[CodeCraft AI] Success! Model loaded on {model.device}.")
34
  except Exception as e:
35
  model_loading = False
36
- print(f"[CodeCraft AI] Error loading model: {e}")
37
 
38
  @asynccontextmanager
39
  async def lifespan(app: FastAPI):
40
- # Load model on startup in a separate thread so the server starts instantly
41
  threading.Thread(target=load_model).start()
42
  yield
43
- # Cleanup on shutdown
44
- global model, tokenizer
45
- if model is not None:
46
- del model
47
- if tokenizer is not None:
48
- del tokenizer
49
- if torch.cuda.is_available():
50
- torch.cuda.empty_cache()
51
 
52
  app = FastAPI(lifespan=lifespan)
53
 
@@ -72,7 +76,7 @@ async def get_status():
72
 
73
  @app.post("/api/chat")
74
  async def chat(request: Request):
75
- global model, tokenizer, model_loaded
76
 
77
  if not model_loaded:
78
  return StreamingResponse(
@@ -82,8 +86,8 @@ async def chat(request: Request):
82
 
83
  data = await request.json()
84
  messages = data.get("messages", [])
85
- temperature = float(data.get("temperature", 0.7))
86
- max_tokens = int(data.get("max_tokens", 2048))
87
 
88
  # Inject system instructions to optimize for programming and Luau/Python
89
  has_system = any(msg.get("role") == "system" for msg in messages)
@@ -98,38 +102,35 @@ async def chat(request: Request):
98
  )
99
  messages.insert(0, {"role": "system", "content": system_prompt})
100
 
101
- # Generate tokens using transformers TextIteratorStreamer
102
  try:
103
- text = tokenizer.apply_chat_template(
104
- messages,
105
- tokenize=False,
106
- add_generation_prompt=True
107
- )
108
- model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
109
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
110
 
111
- generation_kwargs = dict(
112
- model_inputs,
113
- streamer=streamer,
114
- max_new_tokens=max_tokens,
115
- do_sample=temperature > 0,
116
- temperature=temperature if temperature > 0 else None,
117
- top_p=0.9 if temperature > 0 else None,
118
  )
119
 
120
- # Start generation in a background thread
121
- thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
122
- thread.start()
123
-
124
  def token_generator():
125
- for new_text in streamer:
126
- yield new_text
127
-
 
 
128
  return StreamingResponse(token_generator(), media_type="text/event-stream")
129
  except Exception as e:
130
- print(f"[CodeCraft AI] Error during inference: {e}")
131
  return StreamingResponse(
132
- (chunk for chunk in [f"An error occurred in the local inference engine: {str(e)}"]),
133
  media_type="text/plain"
134
  )
135
 
 
4
  from fastapi import FastAPI, Request
5
  from fastapi.middleware.cors import CORSMiddleware
6
  from fastapi.responses import StreamingResponse, FileResponse
7
+ from huggingface_hub import hf_hub_download
8
+ from llama_cpp import Llama
9
 
10
+ # Global model handles
11
+ llm = None
 
12
  model_loading = False
13
  model_loaded = False
14
+ device = "cpu" # Device label; the image installs the CPU wheel of llama-cpp-python
15
+ repo_id = "mradermacher/Qwen2.5-Coder-1.5B-Instruct-abliterated-GGUF"
16
+ filename = "Qwen2.5-Coder-1.5B-Instruct-abliterated.Q5_K_M.gguf"
17
 
18
  def load_model():
19
+ global llm, model_loaded, model_loading
20
  if model_loaded or model_loading:
21
  return
22
  model_loading = True
23
+ print(f"[CodeCraft AI] Downloading and caching GGUF model '{filename}'...")
24
  try:
25
+ # Download GGUF file (cached automatically)
26
+ model_path = hf_hub_download(repo_id=repo_id, filename=filename)
27
+
28
+ print(f"[CodeCraft AI] Loading model via llama-cpp-python...")
29
+ # Load GGUF engine optimized for CPU/Threads
30
+ # n_threads set to 4 (good default for virtual spaces/CPUs)
31
+ # n_ctx set to 2048 for solid code context length
32
+ llm = Llama(
33
+ model_path=model_path,
34
+ n_ctx=2048,
35
+ n_threads=4,
36
+ verbose=False
37
  )
38
+
39
  model_loaded = True
40
  model_loading = False
41
+ print("[CodeCraft AI] Success! GGUF model loaded successfully.")
42
  except Exception as e:
43
  model_loading = False
44
+ print(f"[CodeCraft AI] Error loading GGUF model: {e}")
45
 
46
  @asynccontextmanager
47
  async def lifespan(app: FastAPI):
48
+ # Load model on startup in a separate thread so the server starts instantly
49
  threading.Thread(target=load_model).start()
50
  yield
51
+ # Cleanup
52
+ global llm
53
+ if llm is not None:
54
+ del llm
 
 
 
 
55
 
56
  app = FastAPI(lifespan=lifespan)
57
 
 
76
 
77
  @app.post("/api/chat")
78
  async def chat(request: Request):
79
+ global llm, model_loaded
80
 
81
  if not model_loaded:
82
  return StreamingResponse(
 
86
 
87
  data = await request.json()
88
  messages = data.get("messages", [])
89
+ temperature = float(data.get("temperature", 0.5))
90
+ max_tokens = int(data.get("max_tokens", 1024))
91
 
92
  # Inject system instructions to optimize for programming and Luau/Python
93
  has_system = any(msg.get("role") == "system" for msg in messages)
 
102
  )
103
  messages.insert(0, {"role": "system", "content": system_prompt})
104
 
 
105
  try:
106
+ # Build prompt using Qwen template
107
+ prompt = ""
108
+ for msg in messages:
109
+ role = msg.get("role")
110
+ content = msg.get("content")
111
+ prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
112
+ prompt += "<|im_start|>assistant\n"
113
 
114
+ # Stream generation
115
+ response_stream = llm(
116
+ prompt,
117
+ max_tokens=max_tokens,
118
+ temperature=temperature,
119
+ top_p=0.9,
120
+ stream=True
121
  )
122
 
 
 
 
 
123
  def token_generator():
124
+ for chunk in response_stream:
125
+ text = chunk["choices"][0]["text"]
126
+ if text:
127
+ yield text
128
+
129
  return StreamingResponse(token_generator(), media_type="text/event-stream")
130
  except Exception as e:
131
+ print(f"[CodeCraft AI] Error during GGUF inference: {e}")
132
  return StreamingResponse(
133
+ (chunk for chunk in [f"An error occurred in the local GGUF engine: {str(e)}"]),
134
  media_type="text/plain"
135
  )
136
 
requirements.txt CHANGED
@@ -1,6 +1,4 @@
1
  fastapi
2
  uvicorn
3
- transformers
4
- torch
5
  huggingface_hub
6
- accelerate
 
1
  fastapi
2
  uvicorn
 
 
3
  huggingface_hub
4
+ llama-cpp-python