shuarya2011 committed on
Commit
d3dccb3
·
verified ·
1 Parent(s): 3518954

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -28
app.py CHANGED
@@ -5,7 +5,6 @@ from threading import Thread
5
 
6
  model_id = "DavidAU/Qwen3-0.6B-heretic-abliterated-uncensored"
7
 
8
- # Load model and tokenizer
9
  tokenizer = AutoTokenizer.from_pretrained(model_id)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  model_id,
@@ -15,25 +14,16 @@ model = AutoModelForCausalLM.from_pretrained(
15
  )
16
 
17
  def stream_chat(message, history):
18
- # Strict system prompt to kill the internal monologue
19
- system_message = "You are a direct assistant. Answer immediately. DO NOT explain your thinking. DO NOT show internal reasoning."
20
 
21
  conversation = f"System: {system_message}\n"
22
 
23
- # Robust history handling: Checks if history is list of lists or list of dicts
24
  for msg in history:
25
- if isinstance(msg, dict):
26
- role = "User" if msg.get('role') == 'user' else "Assistant"
27
- content = msg.get('content', '')
28
- else:
29
- # Fallback for older Gradio versions (list of lists)
30
- role, content = "User", msg[0]
31
- conversation += f"{role}: {content}\n"
32
- role, content = "Assistant", msg[1]
33
-
34
- conversation += f"{role}: {content}\n"
35
 
36
- conversation += f"User: {message}\nAssistant: "
37
 
38
  inputs = tokenizer([conversation], return_tensors="pt").to("cpu")
39
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
@@ -41,11 +31,9 @@ def stream_chat(message, history):
41
  generation_kwargs = dict(
42
  inputs,
43
  streamer=streamer,
44
- max_new_tokens=512,
45
- do_sample=True,
46
- temperature=0.3, # Low temp ensures the model doesn't drift into 'thinking'
47
- top_p=0.9,
48
- repetition_penalty=1.2,
49
  pad_token_id=tokenizer.eos_token_id
50
  )
51
 
@@ -53,21 +41,30 @@ def stream_chat(message, history):
53
  thread.start()
54
 
55
  partial_text = ""
 
 
 
56
  for new_text in streamer:
57
- # Hard stop if model tries to start its 'Okay, let me think' routine
58
- if any(stop in new_text for stop in ["User:", "Okay,", "I think", "First"]):
59
- break
60
  partial_text += new_text
61
- yield partial_text
 
 
 
 
 
 
 
 
 
 
62
 
63
- # Removed 'type="messages"' to fix the TypeError
64
  with gr.Blocks() as demo:
65
  gr.ChatInterface(
66
  fn=stream_chat,
67
- title="QWEN3-0.6B DIRECT MAX",
68
- description="Reasoning suppressed. Fast, direct, uncensored responses."
 
69
  )
70
 
71
  if __name__ == "__main__":
72
- # Passing theme here as per Gradio 6.0 logs
73
  demo.launch(server_name="0.0.0.0", server_port=7860, theme="soft")
 
5
 
6
  model_id = "DavidAU/Qwen3-0.6B-heretic-abliterated-uncensored"
7
 
 
8
  tokenizer = AutoTokenizer.from_pretrained(model_id)
9
  model = AutoModelForCausalLM.from_pretrained(
10
  model_id,
 
14
  )
15
 
16
  def stream_chat(message, history):
17
+ # This system prompt is designed to override the 'quiz' persona
18
+ system_message = "You are a direct chat interface. Provide ONLY the spoken response. NEVER explain your logic. NEVER use 'The user says'. NEVER show multiple choice answers. Respond in English only."
19
 
20
  conversation = f"System: {system_message}\n"
21
 
 
22
  for msg in history:
23
+ role = "User" if msg['role'] == 'user' else "Assistant"
24
+ conversation += f"{role}: {msg['content']}\n"
 
 
 
 
 
 
 
 
25
 
26
+ conversation += f"User: {message}\nAssistant:"
27
 
28
  inputs = tokenizer([conversation], return_tensors="pt").to("cpu")
29
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
 
31
  generation_kwargs = dict(
32
  inputs,
33
  streamer=streamer,
34
+ max_new_tokens=128, # Shortened to force directness
35
+ do_sample=False, # Greedy search: disables the 'creativity' that leads to thinking
36
+ repetition_penalty=1.5, # Very high to prevent looping phrases
 
 
37
  pad_token_id=tokenizer.eos_token_id
38
  )
39
 
 
41
  thread.start()
42
 
43
  partial_text = ""
44
+ # These are 'Hard Stop' keywords that trigger the moment the model starts thinking
45
+ stop_triggers = ["The user says", "Answer:", "Option", "A)", "B)", "Okay,", "In this case"]
46
+
47
  for new_text in streamer:
 
 
 
48
  partial_text += new_text
49
+
50
+ # Check if the model is drifting into reasoning
51
+ if any(trigger in partial_text for trigger in stop_triggers):
52
+ # Clean up the output to remove the trigger word itself
53
+ for trigger in stop_triggers:
54
+ if trigger in partial_text:
55
+ partial_text = partial_text.split(trigger)[0]
56
+ yield partial_text.strip()
57
+ break
58
+
59
+ yield partial_text.strip()
60
 
 
61
  with gr.Blocks() as demo:
62
  gr.ChatInterface(
63
  fn=stream_chat,
64
+ type="messages",
65
+ title="QWEN3-0.6B ZERO-THINKING",
66
+ description="Greedy search and hard-coded stop triggers to ensure direct output."
67
  )
68
 
69
  if __name__ == "__main__":
 
70
  demo.launch(server_name="0.0.0.0", server_port=7860, theme="soft")