Ngixdev committed on
Commit
31b5080
·
verified ·
1 Parent(s): 634a67a

Switch to Docker SDK with CUDA for llama-cpp

Browse files
Files changed (4) hide show
  1. Dockerfile +28 -0
  2. README.md +7 -6
  3. app.py +15 -19
  4. requirements.txt +0 -2
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+ ENV CMAKE_ARGS="-DGGML_CUDA=on"
5
+ ENV FORCE_CMAKE=1
6
+
7
+ RUN apt-get update && apt-get install -y \
8
+ python3 \
9
+ python3-pip \
10
+ git \
11
+ cmake \
12
+ build-essential \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ WORKDIR /app
16
+
17
+ RUN pip3 install --no-cache-dir --upgrade pip
18
+
19
+ RUN pip3 install --no-cache-dir llama-cpp-python
20
+
21
+ COPY requirements.txt .
22
+ RUN pip3 install --no-cache-dir -r requirements.txt
23
+
24
+ COPY app.py .
25
+
26
+ EXPOSE 7860
27
+
28
+ CMD ["python3", "app.py"]
README.md CHANGED
@@ -3,8 +3,7 @@ title: Qwen API
3
  emoji: 🤖
4
  colorFrom: blue
5
  colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
@@ -12,7 +11,8 @@ tags:
12
  - qwen
13
  - uncensored
14
  - llama-cpp
15
- - zerogpu
 
16
  ---
17
 
18
  # Qwen3.5-9B Uncensored API Interface
@@ -25,6 +25,7 @@ API interface for [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://h
25
  - Fully uncensored (0/465 refusals)
26
  - Multimodal capable (text, image, video)
27
  - Supports 201 languages
 
28
 
29
  ## API Usage
30
 
@@ -40,7 +41,7 @@ result = client.predict(
40
  system_prompt="You are a helpful assistant",
41
  temperature=0.7,
42
  top_p=0.8,
43
- max_tokens=2048,
44
  api_name="/api_generate"
45
  )
46
  print(result)
@@ -57,7 +58,7 @@ curl -X POST https://ngixdev-qwen-api.hf.space/api/api_generate \
57
  "You are a helpful assistant",
58
  0.7,
59
  0.8,
60
- 2048
61
  ]
62
  }'
63
  ```
@@ -70,4 +71,4 @@ curl -X POST https://ngixdev-qwen-api.hf.space/api/api_generate \
70
  | system_prompt | string | "" | System instruction |
71
  | temperature | float | 0.7 | Sampling temperature (0.0-2.0) |
72
  | top_p | float | 0.8 | Nucleus sampling (0.0-1.0) |
73
- | max_tokens | int | 2048 | Maximum tokens to generate |
 
3
  emoji: 🤖
4
  colorFrom: blue
5
  colorTo: purple
6
+ sdk: docker
 
7
  app_file: app.py
8
  pinned: false
9
  license: apache-2.0
 
11
  - qwen
12
  - uncensored
13
  - llama-cpp
14
+ - gguf
15
+ suggested_hardware: a10g-small
16
  ---
17
 
18
  # Qwen3.5-9B Uncensored API Interface
 
25
  - Fully uncensored (0/465 refusals)
26
  - Multimodal capable (text, image, video)
27
  - Supports 201 languages
28
+ - Q4_K_M quantization via llama.cpp
29
 
30
  ## API Usage
31
 
 
41
  system_prompt="You are a helpful assistant",
42
  temperature=0.7,
43
  top_p=0.8,
44
+ max_tokens=1024,
45
  api_name="/api_generate"
46
  )
47
  print(result)
 
58
  "You are a helpful assistant",
59
  0.7,
60
  0.8,
61
+ 1024
62
  ]
63
  }'
64
  ```
 
71
  | system_prompt | string | "" | System instruction |
72
  | temperature | float | 0.7 | Sampling temperature (0.0-2.0) |
73
  | top_p | float | 0.8 | Nucleus sampling (0.0-1.0) |
74
+ | max_tokens | int | 1024 | Maximum tokens to generate |
app.py CHANGED
@@ -1,24 +1,23 @@
 
1
  import gradio as gr
2
- import spaces
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
5
 
6
  MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
7
  MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
8
 
9
- llm = None
 
 
10
 
11
- def load_model():
12
- global llm
13
- if llm is None:
14
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
15
- llm = Llama(
16
- model_path=model_path,
17
- n_ctx=8192,
18
- n_gpu_layers=-1,
19
- verbose=False,
20
- )
21
- return llm
22
 
23
 
24
  def format_messages(message: str, history: list, system_prompt: str = "") -> str:
@@ -37,7 +36,6 @@ def format_messages(message: str, history: list, system_prompt: str = "") -> str
37
  return formatted
38
 
39
 
40
- @spaces.GPU
41
  def generate_response(
42
  message: str,
43
  history: list,
@@ -47,10 +45,9 @@ def generate_response(
47
  top_k: int = 20,
48
  max_tokens: int = 2048,
49
  ) -> str:
50
- model = load_model()
51
  prompt = format_messages(message, history, system_prompt)
52
 
53
- output = model(
54
  prompt,
55
  max_tokens=max_tokens,
56
  temperature=temperature,
@@ -62,7 +59,6 @@ def generate_response(
62
  return output["choices"][0]["text"].strip()
63
 
64
 
65
- @spaces.GPU
66
  def api_generate(
67
  prompt: str,
68
  system_prompt: str = "",
@@ -109,7 +105,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
109
  - Fully uncensored (0/465 refusals)
110
  - Multimodal capable (text, image, video)
111
  - Supports 201 languages
112
- - Running on ZeroGPU with Q4_K_M quantization
113
 
114
  Use the chat interface below or access via API.
115
  """
@@ -281,4 +277,4 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
281
  api_name="api_generate",
282
  )
283
 
284
- demo.launch()
 
1
+ import os
2
  import gradio as gr
 
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
5
 
6
  MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
7
  MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
8
 
9
+ print("Downloading model...")
10
+ model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
11
+ print(f"Model downloaded to: {model_path}")
12
 
13
+ print("Loading model...")
14
+ llm = Llama(
15
+ model_path=model_path,
16
+ n_ctx=8192,
17
+ n_gpu_layers=-1,
18
+ verbose=False,
19
+ )
20
+ print("Model loaded!")
 
 
 
21
 
22
 
23
  def format_messages(message: str, history: list, system_prompt: str = "") -> str:
 
36
  return formatted
37
 
38
 
 
39
  def generate_response(
40
  message: str,
41
  history: list,
 
45
  top_k: int = 20,
46
  max_tokens: int = 2048,
47
  ) -> str:
 
48
  prompt = format_messages(message, history, system_prompt)
49
 
50
+ output = llm(
51
  prompt,
52
  max_tokens=max_tokens,
53
  temperature=temperature,
 
59
  return output["choices"][0]["text"].strip()
60
 
61
 
 
62
  def api_generate(
63
  prompt: str,
64
  system_prompt: str = "",
 
105
  - Fully uncensored (0/465 refusals)
106
  - Multimodal capable (text, image, video)
107
  - Supports 201 languages
108
+ - Running with Q4_K_M quantization via llama.cpp
109
 
110
  Use the chat interface below or access via API.
111
  """
 
277
  api_name="api_generate",
278
  )
279
 
280
+ demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,4 +1,2 @@
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
- spaces
4
- llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
 
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0