Ngixdev committed on
Commit
15dcc64
·
verified ·
1 Parent(s): 85cfd66

Switch to transformers with Qwen2.5-7B-Instruct

Browse files
Files changed (3) hide show
  1. README.md +3 -10
  2. app.py +46 -55
  3. requirements.txt +3 -2
README.md CHANGED
@@ -10,20 +10,13 @@ pinned: false
10
  license: apache-2.0
11
  tags:
12
  - qwen
13
- - uncensored
14
- - llama-cpp
15
  - zerogpu
16
  ---
17
 
18
- # Qwen3.5-9B Uncensored API
19
 
20
- API interface for [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive).
21
-
22
- ## Features
23
-
24
- - 9B parameters, fully uncensored (0/465 refusals)
25
- - Q4_K_M quantization via llama.cpp
26
- - Running on ZeroGPU
27
 
28
  ## API Usage
29
 
 
10
  license: apache-2.0
11
  tags:
12
  - qwen
13
+ - transformers
 
14
  - zerogpu
15
  ---
16
 
17
+ # Qwen2.5-7B-Instruct API
18
 
19
+ API interface for [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on ZeroGPU.
 
 
 
 
 
 
20
 
21
  ## API Usage
22
 
app.py CHANGED
@@ -1,49 +1,28 @@
1
  import os
 
2
  import gradio as gr
3
  import spaces
4
- from huggingface_hub import hf_hub_download
5
 
6
- MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
7
- MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
8
 
9
- model_path = None
10
- llm = None
11
 
12
- def download_model():
13
- global model_path
14
- if model_path is None:
15
- print("Downloading model...")
16
- model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
17
- print(f"Model downloaded: {model_path}")
18
- return model_path
19
-
20
- def get_llm():
21
- global llm
22
- if llm is None:
23
- from llama_cpp import Llama
24
- path = download_model()
25
- print("Loading model into GPU...")
26
- llm = Llama(
27
- model_path=path,
28
- n_ctx=8192,
29
- n_gpu_layers=-1,
30
- verbose=False,
31
  )
32
  print("Model loaded!")
33
- return llm
34
-
35
-
36
- def format_messages(message: str, history: list, system_prompt: str = "") -> str:
37
- formatted = ""
38
- if system_prompt.strip():
39
- formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
40
- for user_msg, assistant_msg in history:
41
- if user_msg:
42
- formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
43
- if assistant_msg:
44
- formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
45
- formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
46
- return formatted
47
 
48
 
49
  @spaces.GPU(duration=120)
@@ -56,18 +35,33 @@ def generate_response(
56
  top_k: int = 20,
57
  max_tokens: int = 1024,
58
  ) -> str:
59
- model = get_llm()
60
- prompt = format_messages(message, history, system_prompt)
61
 
62
- output = model(
63
- prompt,
64
- max_tokens=max_tokens,
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  temperature=temperature,
66
  top_p=top_p,
67
  top_k=top_k,
68
- stop=["<|im_end|>", "<|im_start|>"],
 
69
  )
70
- return output["choices"][0]["text"].strip()
 
 
71
 
72
 
73
  @spaces.GPU(duration=120)
@@ -105,26 +99,23 @@ def api_generate(
105
  return {"response": None, "status": "error", "error": str(e)}
106
 
107
 
108
- with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as demo:
109
  gr.Markdown(
110
  """
111
- # 🤖 Qwen3.5-9B Uncensored API
112
-
113
- Powered by [HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive](https://huggingface.co/HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive)
114
 
115
- - 9B parameters, fully uncensored (0/465 refusals)
116
- - Q4_K_M quantization via llama.cpp on ZeroGPU
117
  """
118
  )
119
 
120
- with gr.Tab("💬 Chat"):
121
  chatbot = gr.Chatbot(height=450, label="Conversation")
122
 
123
  with gr.Row():
124
  msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4, lines=2)
125
  submit_btn = gr.Button("Send", variant="primary", scale=1)
126
 
127
- with gr.Accordion("⚙️ Settings", open=False):
128
  system_prompt = gr.Textbox(label="System Prompt", placeholder="Optional", lines=2)
129
  with gr.Row():
130
  temperature = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
@@ -133,7 +124,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
133
  top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
134
  max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
135
 
136
- clear_btn = gr.Button("🗑️ Clear")
137
 
138
  def user_submit(message, history):
139
  return "", history + [[message, None]]
@@ -155,7 +146,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
155
  )
156
  clear_btn.click(lambda: [], None, chatbot)
157
 
158
- with gr.Tab("🔌 API"):
159
  gr.Markdown(
160
  """
161
  ## API Usage
 
1
  import os
2
+ import torch
3
  import gradio as gr
4
  import spaces
5
+ from transformers import AutoModelForCausalLM, AutoTokenizer
6
 
7
+ MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
 
8
 
9
+ tokenizer = None
10
+ model = None
11
 
12
+ def load_model():
13
+ global tokenizer, model
14
+ if model is None:
15
+ print("Loading tokenizer...")
16
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
17
+ print("Loading model...")
18
+ model = AutoModelForCausalLM.from_pretrained(
19
+ MODEL_ID,
20
+ torch_dtype=torch.bfloat16,
21
+ device_map="auto",
22
+ trust_remote_code=True,
 
 
 
 
 
 
 
 
23
  )
24
  print("Model loaded!")
25
+ return tokenizer, model
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  @spaces.GPU(duration=120)
 
35
  top_k: int = 20,
36
  max_tokens: int = 1024,
37
  ) -> str:
38
+ tok, mdl = load_model()
 
39
 
40
+ messages = []
41
+ if system_prompt.strip():
42
+ messages.append({"role": "system", "content": system_prompt})
43
+ for user_msg, assistant_msg in history:
44
+ if user_msg:
45
+ messages.append({"role": "user", "content": user_msg})
46
+ if assistant_msg:
47
+ messages.append({"role": "assistant", "content": assistant_msg})
48
+ messages.append({"role": "user", "content": message})
49
+
50
+ text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
51
+ inputs = tok([text], return_tensors="pt").to(mdl.device)
52
+
53
+ outputs = mdl.generate(
54
+ **inputs,
55
+ max_new_tokens=max_tokens,
56
  temperature=temperature,
57
  top_p=top_p,
58
  top_k=top_k,
59
+ do_sample=True,
60
+ pad_token_id=tok.eos_token_id,
61
  )
62
+
63
+ generated = outputs[0][inputs['input_ids'].shape[-1]:]
64
+ return tok.decode(generated, skip_special_tokens=True)
65
 
66
 
67
  @spaces.GPU(duration=120)
 
99
  return {"response": None, "status": "error", "error": str(e)}
100
 
101
 
102
+ with gr.Blocks(title="Qwen API", theme=gr.themes.Soft()) as demo:
103
  gr.Markdown(
104
  """
105
+ # Qwen2.5-7B-Instruct API
 
 
106
 
107
+ Powered by [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on ZeroGPU
 
108
  """
109
  )
110
 
111
+ with gr.Tab("Chat"):
112
  chatbot = gr.Chatbot(height=450, label="Conversation")
113
 
114
  with gr.Row():
115
  msg = gr.Textbox(label="Message", placeholder="Type here...", scale=4, lines=2)
116
  submit_btn = gr.Button("Send", variant="primary", scale=1)
117
 
118
+ with gr.Accordion("Settings", open=False):
119
  system_prompt = gr.Textbox(label="System Prompt", placeholder="Optional", lines=2)
120
  with gr.Row():
121
  temperature = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
 
124
  top_k = gr.Slider(1, 100, 20, step=1, label="Top K")
125
  max_tokens = gr.Slider(64, 2048, 1024, step=64, label="Max Tokens")
126
 
127
+ clear_btn = gr.Button("Clear")
128
 
129
  def user_submit(message, history):
130
  return "", history + [[message, None]]
 
146
  )
147
  clear_btn.click(lambda: [], None, chatbot)
148
 
149
+ with gr.Tab("API"):
150
  gr.Markdown(
151
  """
152
  ## API Usage
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
  spaces
4
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124
5
- llama-cpp-python
 
 
1
  gradio>=4.0.0
2
  huggingface_hub>=0.20.0
3
  spaces
4
+ torch
5
+ transformers
6
+ accelerate