Ngixdev committed on
Commit
31b5080
·
verified ·
1 Parent(s): 634a67a

Switch to Docker SDK with CUDA for llama-cpp

Browse files
Files changed (4) hide show
  1. Dockerfile +28 -0
  2. README.md +7 -6
  3. app.py +15 -19
  4. requirements.txt +0 -2
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
2
+
3
+ ENV DEBIAN_FRONTEND=noninteractive
4
+ ENV CMAKE_ARGS="-DGGML_CUDA=on"
5
+ ENV FORCE_CMAKE=1
6
+
7
+ RUN apt-get update && apt-get install -y \
8
+ python3 \
9
+ python3-pip \
10
+ git \
11
+ cmake \
12
+ build-essential \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ WORKDIR /app
16
+
17
+ RUN pip3 install --no-cache-dir --upgrade pip
18
+
19
+ RUN pip3 install --no-cache-dir llama-cpp-python
20
+
21
+ COPY requirements.txt .
22
+ RUN pip3 install --no-cache-dir -r requirements.txt
23
+
24
+ COPY app.py .
25
+
26
+ EXPOSE 7860
27
+
28
+ CMD ["python3", "app.py"]
README.md CHANGED
@@ -3,8 +3,7 @@ title: Qwen API
3
  emoji: 🤖
4
  colorFrom: blue
5
  colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.29.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
@@ -12,7 +11,8 @@ tags:
12
  - qwen
13
  - uncensored
14
  - llama-cpp
15
- - zerogpu
 
16
  ---
17
 
18
  # Qwen3.5-9B Uncensored API Interface
@@ -25,6 +25,7 @@ API interface for [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://h
25
  - Fully uncensored (0/465 refusals)
26
  - Multimodal capable (text, image, video)
27
  - Supports 201 languages
 
28
 
29
  ## API Usage
30
 
@@ -40,7 +41,7 @@ result = client.predict(
40
  system_prompt="You are a helpful assistant",
41
  temperature=0.7,
42
  top_p=0.8,
43
- max_tokens=2048,
44
  api_name="/api_generate"
45
  )
46
  print(result)
@@ -57,7 +58,7 @@ curl -X POST https://ngixdev-qwen-api.hf.space/api/api_generate \
57
  "You are a helpful assistant",
58
  0.7,
59
  0.8,
60
- 2048
61
  ]
62
  }'
63
  ```
@@ -70,4 +71,4 @@ curl -X POST https://ngixdev-qwen-api.hf.space/api/api_generate \
70
  | system_prompt | string | "" | System instruction |
71
  | temperature | float | 0.7 | Sampling temperature (0.0-2.0) |
72
  | top_p | float | 0.8 | Nucleus sampling (0.0-1.0) |
73
- | max_tokens | int | 2048 | Maximum tokens to generate |
 
3
  emoji: 🤖
4
  colorFrom: blue
5
  colorTo: purple
6
+ sdk: docker
 
7
  app_file: app.py
8
  pinned: false
9
  license: apache-2.0
 
11
  - qwen
12
  - uncensored
13
  - llama-cpp
14
+ - gguf
15
+ suggested_hardware: a10g-small
16
  ---
17
 
18
  # Qwen3.5-9B Uncensored API Interface
 
25
  - Fully uncensored (0/465 refusals)
26
  - Multimodal capable (text, image, video)
27
  - Supports 201 languages
28
+ - Q4_K_M quantization via llama.cpp
29
 
30
  ## API Usage
31
 
 
41
  system_prompt="You are a helpful assistant",
42
  temperature=0.7,
43
  top_p=0.8,
44
+ max_tokens=1024,
45
  api_name="/api_generate"
46
  )
47
  print(result)
 
58
  "You are a helpful assistant",
59
  0.7,
60
  0.8,
61
+ 1024
62
  ]
63
  }'
64
  ```
 
71
  | system_prompt | string | "" | System instruction |
72
  | temperature | float | 0.7 | Sampling temperature (0.0-2.0) |
73
  | top_p | float | 0.8 | Nucleus sampling (0.0-1.0) |
74
+ | max_tokens | int | 1024 | Maximum tokens to generate |
app.py CHANGED
@@ -1,24 +1,23 @@
 
1
  import gradio as gr
2
- import spaces
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
5
 
6
  MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
7
  MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
8
 
9
- llm = None
 
 
10
 
11
- def load_model():
12
- global llm
13
- if llm is None:
14
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
15
- llm = Llama(
16
- model_path=model_path,
17
- n_ctx=8192,
18
- n_gpu_layers=-1,
19
- verbose=False,
20
- )
21
- return llm
22
 
23
 
24
  def format_messages(message: str, history: list, system_prompt: str = "") -> str:
@@ -37,7 +36,6 @@ def format_messages(message: str, history: list, system_prompt: str = "") -> str
37
  return formatted
38
 
39
 
40
- @spaces.GPU
41
  def generate_response(
42
  message: str,
43
  history: list,
@@ -47,10 +45,9 @@ def generate_response(
47
  top_k: int = 20,
48
  max_tokens: int = 2048,
49
  ) -> str:
50
- model = load_model()
51
  prompt = format_messages(message, history, system_prompt)
52
 
53
- output = model(
54
  prompt,
55
  max_tokens=max_tokens,
56
  temperature=temperature,
@@ -62,7 +59,6 @@ def generate_response(
62
  return output["choices"][0]["text"].strip()
63
 
64
 
65
- @spaces.GPU
66
  def api_generate(
67
  prompt: str,
68
  system_prompt: str = "",
@@ -109,7 +105,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
109
  - Fully uncensored (0/465 refusals)
110
  - Multimodal capable (text, image, video)
111
  - Supports 201 languages
112
- - Running on ZeroGPU with Q4_K_M quantization
113
 
114
  Use the chat interface below or access via API.
115
  """
@@ -281,4 +277,4 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
281
  api_name="api_generate",
282
  )
283
 
284
- demo.launch()
 
1
+ import os
2
  import gradio as gr
 
3
  from huggingface_hub import hf_hub_download
4
  from llama_cpp import Llama
5
 
6
  MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
7
  MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
8
 
9
+ print("Downloading model...")
10
+ model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
11
+ print(f"Model downloaded to: {model_path}")
12
 
13
+ print("Loading model...")
14
+ llm = Llama(
15
+ model_path=model_path,
16
+ n_ctx=8192,
17
+ n_gpu_layers=-1,
18
+ verbose=False,
19
+ )
20
+ print("Model loaded!")
 
 
 
21
 
22
 
23
  def format_messages(message: str, history: list, system_prompt: str = "") -> str:
 
36
  return formatted
37
 
38
 
 
39
  def generate_response(
40
  message: str,
41
  history: list,
 
45
  top_k: int = 20,
46
  max_tokens: int = 2048,
47
  ) -> str:
 
48
  prompt = format_messages(message, history, system_prompt)
49
 
50
+ output = llm(
51
  prompt,
52
  max_tokens=max_tokens,
53
  temperature=temperature,
 
59
  return output["choices"][0]["text"].strip()
60
 
61
 
 
62
  def api_generate(
63
  prompt: str,
64
  system_prompt: str = "",
 
105
  - Fully uncensored (0/465 refusals)
106
  - Multimodal capable (text, image, video)
107
  - Supports 201 languages
108
+ - Running with Q4_K_M quantization via llama.cpp
109
 
110
  Use the chat interface below or access via API.
111
  """
 
277
  api_name="api_generate",
278
  )
279
 
280
+ demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,4 +1,2 @@
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
- spaces
4
- llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
 
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0