Ngixdev committed
Commit 2dad00a · verified · 1 Parent(s): d7860c8

Switch to Gradio + ZeroGPU with llama-cpp-python

Files changed (4):
  1. Dockerfile +0 -22
  2. README.md +16 -83
  3. app.py +199 -0
  4. requirements.txt +4 -0
Dockerfile DELETED
@@ -1,22 +0,0 @@
-FROM ghcr.io/ggml-org/llama.cpp:full
-
-WORKDIR /app
-
-RUN apt update && apt install -y python3-pip
-RUN pip install -U huggingface_hub
-
-RUN python3 -c 'from huggingface_hub import hf_hub_download; \
-    repo="HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"; \
-    hf_hub_download(repo_id=repo, filename="Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf", local_dir="/app"); \
-    hf_hub_download(repo_id=repo, filename="mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf", local_dir="/app")'
-
-CMD ["--server", \
-     "-m", "/app/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf", \
-     "--mmproj", "/app/mmproj-Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-BF16.gguf", \
-     "--host", "0.0.0.0", \
-     "--port", "7860", \
-     "-t", "2", \
-     "--cache-type-k", "q8_0", \
-     "--cache-type-v", "iq4_nl", \
-     "-c", "32768", \
-     "-n", "8192"]
README.md CHANGED
@@ -3,108 +3,41 @@ title: Qwen API
 emoji: 🤖
 colorFrom: blue
 colorTo: purple
-sdk: docker
+sdk: gradio
+sdk_version: 5.29.0
+app_file: app.py
 pinned: false
 license: apache-2.0
 tags:
 - qwen
 - uncensored
 - llama-cpp
-- gguf
-- openai-compatible
-suggested_hardware: a10g-small
+- zerogpu
 ---
 
 # Qwen3.5-9B Uncensored API
 
-OpenAI-compatible API for [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive).
+API interface for [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive).
 
 ## Features
 
-- 9B parameters with 262K context window
-- Fully uncensored (0/465 refusals)
-- Multimodal capable (text, image, video)
-- Supports 201 languages
+- 9B parameters, fully uncensored (0/465 refusals)
 - Q4_K_M quantization via llama.cpp
-- OpenAI-compatible API
+- Running on ZeroGPU
 
 ## API Usage
 
-### Python (OpenAI SDK)
-
 ```python
-from openai import OpenAI
-
-client = OpenAI(
-    base_url="https://ngixdev-qwen-api.hf.space/v1",
-    api_key="not-needed"
-)
+from gradio_client import Client
 
-response = client.chat.completions.create(
-    model="qwen",
-    messages=[
-        {"role": "system", "content": "You are a helpful assistant."},
-        {"role": "user", "content": "Hello, who are you?"}
-    ],
+client = Client("Ngixdev/qwen-api")
+result = client.predict(
+    prompt="Hello!",
+    system_prompt="You are helpful.",
     temperature=0.7,
-    max_tokens=1024
+    top_p=0.8,
+    max_tokens=1024,
+    api_name="/api_generate"
 )
-
-print(response.choices[0].message.content)
+print(result)
 ```
-
-### cURL
-
-```bash
-curl https://ngixdev-qwen-api.hf.space/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "qwen",
-    "messages": [
-      {"role": "system", "content": "You are a helpful assistant."},
-      {"role": "user", "content": "Hello!"}
-    ],
-    "temperature": 0.7,
-    "max_tokens": 1024
-  }'
-```
-
-### Streaming
-
-```python
-from openai import OpenAI
-
-client = OpenAI(
-    base_url="https://ngixdev-qwen-api.hf.space/v1",
-    api_key="not-needed"
-)
-
-stream = client.chat.completions.create(
-    model="qwen",
-    messages=[{"role": "user", "content": "Tell me a story"}],
-    stream=True
-)
-
-for chunk in stream:
-    if chunk.choices[0].delta.content:
-        print(chunk.choices[0].delta.content, end="")
-```
-
-## Endpoints
-
-| Endpoint | Description |
-|----------|-------------|
-| `/v1/chat/completions` | Chat completions (OpenAI-compatible) |
-| `/v1/completions` | Text completions |
-| `/v1/models` | List available models |
-| `/health` | Health check |
-
-## Parameters
-
-| Parameter | Type | Default | Description |
-|-----------|------|---------|-------------|
-| messages | array | required | Chat messages |
-| temperature | float | 0.7 | Sampling temperature (0.0-2.0) |
-| top_p | float | 0.8 | Nucleus sampling (0.0-1.0) |
-| max_tokens | int | 1024 | Maximum tokens to generate |
-| stream | bool | false | Enable streaming response |
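Unlike the old OpenAI-compatible endpoint, the new `/api_generate` endpoint reports failures in-band through a `status` field rather than raising an HTTP error, so callers should check it before reading `response`. A minimal sketch of that check (the helper name `unwrap_result` is hypothetical; the envelope shape matches the dict that `api_generate` in app.py returns):

```python
def unwrap_result(result: dict) -> str:
    """Return generated text from an api_generate envelope, or raise on error."""
    if result.get("status") != "success":
        # api_generate puts the exception text under the "error" key
        raise RuntimeError(result.get("error", "unknown error"))
    return result["response"]

# Sample envelopes in the shape api_generate produces:
ok = {"response": "Hi there!", "status": "success"}
bad = {"response": None, "status": "error", "error": "out of memory"}

print(unwrap_result(ok))  # → Hi there!
```

Wrapping `client.predict(...)` in a helper like this keeps the error path explicit instead of silently passing `None` downstream.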
app.py ADDED
@@ -0,0 +1,199 @@
+import os
+import gradio as gr
+import spaces
+from huggingface_hub import hf_hub_download
+
+MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
+MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
+
+model_path = None
+llm = None
+
+def download_model():
+    global model_path
+    if model_path is None:
+        print("Downloading model...")
+        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+        print(f"Model downloaded: {model_path}")
+    return model_path
+
+def get_llm():
+    global llm
+    if llm is None:
+        from llama_cpp import Llama
+        path = download_model()
+        print("Loading model into GPU...")
+        llm = Llama(
+            model_path=path,
+            n_ctx=8192,
+            n_gpu_layers=-1,
+            verbose=False,
+        )
+        print("Model loaded!")
+    return llm
+
+
+def format_messages(message: str, history: list, system_prompt: str = "") -> str:
+    formatted = ""
+    if system_prompt.strip():
+        formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
+    for user_msg, assistant_msg in history:
+        if user_msg:
+            formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
+        if assistant_msg:
+            formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
+    formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+    return formatted
+
+
+@spaces.GPU(duration=120)
+def generate_response(
+    message: str,
+    history: list,
+    system_prompt: str = "",
+    temperature: float = 0.7,
+    top_p: float = 0.8,
+    top_k: int = 20,
+    max_tokens: int = 1024,
+) -> str:
+    model = get_llm()
+    prompt = format_messages(message, history, system_prompt)
+
+    output = model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        stop=["<|im_end|>", "<|im_start|>"],
+    )
+    return output["choices"][0]["text"].strip()
+
+
+@spaces.GPU(duration=120)
+def api_generate(
+    prompt: str,
+    system_prompt: str = "",
+    temperature: float = 0.7,
+    top_p: float = 0.8,
+    max_tokens: int = 1024,
+) -> dict:
+    """
+    API endpoint for text generation.
+
+    Args:
+        prompt: The user prompt/question
+        system_prompt: Optional system instruction
+        temperature: Sampling temperature (0.0-2.0)
+        top_p: Nucleus sampling parameter (0.0-1.0)
+        max_tokens: Maximum tokens to generate
+
+    Returns:
+        Dictionary with 'response' key containing generated text
+    """
+    try:
+        response = generate_response(
+            message=prompt,
+            history=[],
+            system_prompt=system_prompt,
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+        )
+        return {"response": response, "status": "success"}
+    except Exception as e:
+        return {"response": None, "status": "error", "error": str(e)}
+
+
+with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+# 🤖 Qwen3.5-9B Uncensored API
+
+Powered by [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive)
+
+- 9B parameters, fully uncensored (0/465 refusals)
+- Q4_K_M quantization via llama.cpp on ZeroGPU
+"""
+    )
+
+    with gr.Tab("💬 Chat"):
+        chatbot = gr.Chatbot(height=450, label="Conversation")
+
+        with gr.Row():
+            msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4, lines=2)
+            submit_btn = gr.Button("Send", variant="primary", scale=1)
+
+        with gr.Accordion("⚙️ Settings", open=False):
+            system_prompt = gr.Textbox(label="System Prompt", placeholder="Optional", lines=2)
+            with gr.Row():
+                temperature = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
+                top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
+            with gr.Row():
+                top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
+                max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
+
+        clear_btn = gr.Button("🗑️ Clear")
+
+        def user_submit(message, history):
+            return "", history + [[message, None]]
+
+        def bot_response(history, system_prompt, temperature, top_p, top_k, max_tokens):
+            if not history:
+                return history
+            message = history[-1][0]
+            history_without_last = history[:-1]
+            response = generate_response(message, history_without_last, system_prompt, temperature, top_p, top_k, max_tokens)
+            history[-1][1] = response
+            return history
+
+        msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
+            bot_response, [chatbot, system_prompt, temperature, top_p, top_k, max_tokens], chatbot
+        )
+        submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
+            bot_response, [chatbot, system_prompt, temperature, top_p, top_k, max_tokens], chatbot
+        )
+        clear_btn.click(lambda: [], None, chatbot)
+
+    with gr.Tab("🔌 API"):
+        gr.Markdown(
+            """
+## API Usage
+
+```python
+from gradio_client import Client
+
+client = Client("Ngixdev/qwen-api")
+result = client.predict(
+    prompt="Hello!",
+    system_prompt="You are helpful.",
+    temperature=0.7,
+    top_p=0.8,
+    max_tokens=1024,
+    api_name="/api_generate"
+)
+print(result)
+```
+"""
+        )
+
+        with gr.Row():
+            with gr.Column():
+                api_prompt = gr.Textbox(label="Prompt", lines=3)
+                api_system = gr.Textbox(label="System Prompt", lines=2)
+                with gr.Row():
+                    api_temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
+                    api_top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
+                api_max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
+                api_submit = gr.Button("Generate", variant="primary")
+            with gr.Column():
+                api_output = gr.JSON(label="Response")
+
+        api_submit.click(
+            api_generate,
+            [api_prompt, api_system, api_temp, api_top_p, api_max_tokens],
+            api_output,
+            api_name="api_generate",
+        )
+
+demo.launch()
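app.py builds its ChatML prompt by hand rather than using llama-cpp-python's built-in chat handlers, so the framing is easy to verify offline without a model. The sketch below copies `format_messages` as committed and exercises it with a one-turn history (the sample strings are illustrative):

```python
def format_messages(message: str, history: list, system_prompt: str = "") -> str:
    # Copy of the ChatML prompt builder from app.py
    formatted = ""
    if system_prompt.strip():
        formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    for user_msg, assistant_msg in history:
        if user_msg:
            formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
        if assistant_msg:
            formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
    # End with an open assistant turn for the model to complete
    formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    return formatted

# One prior exchange plus a new user message
prompt = format_messages("How far is the Moon?", [["Hi", "Hello!"]], "Be brief.")
print(prompt)
```

Note that the prompt always ends with an unclosed `<|im_start|>assistant\n` turn, which is why `generate_response` passes `stop=["<|im_end|>", "<|im_start|>"]` to cut generation at the turn boundary.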
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio>=4.0.0
+huggingface_hub>=0.20.0
+spaces
+llama-cpp-python>=0.3.0