Ngixdev committed on
Commit
15dcc64
·
verified ·
1 Parent(s): 85cfd66

Switch to transformers with Qwen2.5-7B-Instruct

Browse files
Files changed (3) hide show
  1. README.md +3 -10
  2. app.py +46 -55
  3. requirements.txt +3 -2
README.md CHANGED
@@ -10,20 +10,13 @@ pinned: false
10
  license: apache-2.0
11
  tags:
12
  - qwen
13
- - uncensored
14
- - llama-cpp
15
  - zerogpu
16
  ---
17
 
18
- # Qwen3.5-9B Uncensored API
19
 
20
- API interface for [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive).
21
-
22
- ## Features
23
-
24
- - 9B parameters, fully uncensored (0/465 refusals)
25
- - Q4_K_M quantization via llama.cpp
26
- - Running on ZeroGPU
27
 
28
  ## API Usage
29
 
 
10
  license: apache-2.0
11
  tags:
12
  - qwen
13
+ - transformers
 
14
  - zerogpu
15
  ---
16
 
17
+ # Qwen2.5-7B-Instruct API
18
 
19
+ API interface for [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on ZeroGPU.
 
 
 
 
 
 
20
 
21
  ## API Usage
22
 
app.py CHANGED
@@ -1,49 +1,28 @@
1
  import os
 
2
  import gradio as gr
3
  import spaces
4
- from huggingface_hub import hf_hub_download
5
 
6
- MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
7
- MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
8
 
9
- model_path = None
10
- llm = None
11
 
12
- def download_model():
13
- global model_path
14
- if model_path is None:
15
- print("Downloading model...")
16
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
17
- print(f"Model downloaded: {model_path}")
18
- return model_path
19
-
20
- def get_llm():
21
- global llm
22
- if llm is None:
23
- from llama_cpp import Llama
24
- path = download_model()
25
- print("Loading model into GPU...")
26
- llm = Llama(
27
- model_path=path,
28
- n_ctx=8192,
29
- n_gpu_layers=-1,
30
- verbose=False,
31
  )
32
  print("Model loaded!")
33
- return llm
34
-
35
-
36
- def format_messages(message: str, history: list, system_prompt: str = "") -> str:
37
- formatted = ""
38
- if system_prompt.strip():
39
- formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
40
- for user_msg, assistant_msg in history:
41
- if user_msg:
42
- formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
43
- if assistant_msg:
44
- formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
45
- formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
46
- return formatted
47
 
48
 
49
  @spaces.GPU(duration=120)
@@ -56,18 +35,33 @@ def generate_response(
56
  top_k: int = 20,
57
  max_tokens: int = 1024,
58
  ) -> str:
59
- model = get_llm()
60
- prompt = format_messages(message, history, system_prompt)
61
 
62
- output = model(
63
- prompt,
64
- max_tokens=max_tokens,
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  temperature=temperature,
66
  top_p=top_p,
67
  top_k=top_k,
68
- stop=["<|im_end|>", "<|im_start|>"],
 
69
  )
70
- return output["choices"][0]["text"].strip()
 
 
71
 
72
 
73
  @spaces.GPU(duration=120)
@@ -105,26 +99,23 @@ def api_generate(
105
  return {"response": None, "status": "error", "error": str(e)}
106
 
107
 
108
- with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as demo:
109
  gr.Markdown(
110
  """
111
- # 🤖 Qwen3.5-9B Uncensored API
112
-
113
- Powered by [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive)
114
 
115
- - 9B parameters, fully uncensored (0/465 refusals)
116
- - Q4_K_M quantization via llama.cpp on ZeroGPU
117
  """
118
  )
119
 
120
- with gr.Tab("💬 Chat"):
121
  chatbot = gr.Chatbot(height=450, label="Conversation")
122
 
123
  with gr.Row():
124
  msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4, lines=2)
125
  submit_btn = gr.Button("Send", variant="primary", scale=1)
126
 
127
- with gr.Accordion("⚙️ Settings", open=False):
128
  system_prompt = gr.Textbox(label="System Prompt", placeholder="Optional", lines=2)
129
  with gr.Row():
130
  temperature = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
@@ -133,7 +124,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
133
  top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
134
  max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
135
 
136
- clear_btn = gr.Button("🗑️ Clear")
137
 
138
  def user_submit(message, history):
139
  return "", history + [[message, None]]
@@ -155,7 +146,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
155
  )
156
  clear_btn.click(lambda: [], None, chatbot)
157
 
158
- with gr.Tab("🔌 API"):
159
  gr.Markdown(
160
  """
161
  ## API Usage
 
1
  import os
2
+ import torch
3
  import gradio as gr
4
  import spaces
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
 
7
+ MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
 
8
 
9
+ tokenizer = None
10
+ model = None
11
 
12
+ def load_model():
13
+ global tokenizer, model
14
+ if model is None:
15
+ print("Loading tokenizer...")
16
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
17
+ print("Loading model...")
18
+ model = AutoModelForCausalLM.from_pretrained(
19
+ MODEL_ID,
20
+ torch_dtype=torch.bfloat16,
21
+ device_map="auto",
22
+ trust_remote_code=True,
 
 
 
 
 
 
 
 
23
  )
24
  print("Model loaded!")
25
+ return tokenizer, model
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  @spaces.GPU(duration=120)
 
35
  top_k: int = 20,
36
  max_tokens: int = 1024,
37
  ) -> str:
38
+ tok, mdl = load_model()
 
39
 
40
+ messages = []
41
+ if system_prompt.strip():
42
+ messages.append({"role": "system", "content": system_prompt})
43
+ for user_msg, assistant_msg in history:
44
+ if user_msg:
45
+ messages.append({"role": "user", "content": user_msg})
46
+ if assistant_msg:
47
+ messages.append({"role": "assistant", "content": assistant_msg})
48
+ messages.append({"role": "user", "content": message})
49
+
50
+ text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
51
+ inputs = tok([text], return_tensors="pt").to(mdl.device)
52
+
53
+ outputs = mdl.generate(
54
+ **inputs,
55
+ max_new_tokens=max_tokens,
56
  temperature=temperature,
57
  top_p=top_p,
58
  top_k=top_k,
59
+ do_sample=True,
60
+ pad_token_id=tok.eos_token_id,
61
  )
62
+
63
+ generated = outputs[0][inputs['input_ids'].shape[-1]:]
64
+ return tok.decode(generated, skip_special_tokens=True)
65
 
66
 
67
  @spaces.GPU(duration=120)
 
99
  return {"response": None, "status": "error", "error": str(e)}
100
 
101
 
102
+ with gr.Blocks(title="Qwen API", theme=gr.themes.Soft()) as demo:
103
  gr.Markdown(
104
  """
105
+ # Qwen2.5-7B-Instruct API
 
 
106
 
107
+ Powered by [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on ZeroGPU
 
108
  """
109
  )
110
 
111
+ with gr.Tab("Chat"):
112
  chatbot = gr.Chatbot(height=450, label="Conversation")
113
 
114
  with gr.Row():
115
  msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4, lines=2)
116
  submit_btn = gr.Button("Send", variant="primary", scale=1)
117
 
118
+ with gr.Accordion("Settings", open=False):
119
  system_prompt = gr.Textbox(label="System Prompt", placeholder="Optional", lines=2)
120
  with gr.Row():
121
  temperature = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
 
124
  top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
125
  max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
126
 
127
+ clear_btn = gr.Button("Clear")
128
 
129
  def user_submit(message, history):
130
  return "", history + [[message, None]]
 
146
  )
147
  clear_btn.click(lambda: [], None, chatbot)
148
 
149
+ with gr.Tab("API"):
150
  gr.Markdown(
151
  """
152
  ## API Usage
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
  spaces
4
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
5
- llama-cpp-python
 
 
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
  spaces
4
+ torch
5
+ transformers
6
+ accelerate