shuarya2011 committed on
Commit
cedfec7
·
verified ·
1 Parent(s): 8da3f0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -51
app.py CHANGED
@@ -1,69 +1,49 @@
1
  import gradio as gr
2
- import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
- from threading import Thread
5
-
6
- model_id = "DavidAU/Qwen3-0.6B-heretic-abliterated-uncensored"
7
-
8
- tokenizer = AutoTokenizer.from_pretrained(model_id)
9
- model = AutoModelForCausalLM.from_pretrained(
10
- model_id,
11
- dtype=torch.float32,
12
- device_map="cpu",
13
- low_cpu_mem_usage=True
14
  )
15
 
16
  def stream_chat(message, history):
17
- # This system prompt is designed to override the 'quiz' persona
18
- system_message = "You are a direct chat interface. Provide ONLY the spoken response. NEVER explain your logic. NEVER use 'The user says'. NEVER show multiple choice answers. Respond in English only."
19
-
20
- conversation = f"System: {system_message}\n"
21
 
22
  for msg in history:
23
- role = "User" if msg['role'] == 'user' else "Assistant"
24
- conversation += f"{role}: {msg['content']}\n"
 
25
 
26
- conversation += f"User: {message}\nAssistant:"
27
-
28
- inputs = tokenizer([conversation], return_tensors="pt").to("cpu")
29
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
30
-
31
- generation_kwargs = dict(
32
- inputs,
33
- streamer=streamer,
34
- max_new_tokens=128, # Shortened to force directness
35
- do_sample=False, # Greedy search: disables the 'creativity' that leads to thinking
36
- repetition_penalty=1.5, # Very high to prevent looping phrases
37
- pad_token_id=tokenizer.eos_token_id
38
  )
39
 
40
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
41
- thread.start()
42
-
43
  partial_text = ""
44
- # These are 'Hard Stop' keywords that trigger the moment the model starts thinking
45
- stop_triggers = ["The user says", "Answer:", "Option", "A)", "B)", "Okay,", "In this case"]
46
-
47
- for new_text in streamer:
48
- partial_text += new_text
49
-
50
- # Check if the model is drifting into reasoning
51
- if any(trigger in partial_text for trigger in stop_triggers):
52
- # Clean up the output to remove the trigger word itself
53
- for trigger in stop_triggers:
54
- if trigger in partial_text:
55
- partial_text = partial_text.split(trigger)[0]
56
- yield partial_text.strip()
57
- break
58
-
59
- yield partial_text.strip()
60
 
61
  with gr.Blocks() as demo:
62
  gr.ChatInterface(
63
  fn=stream_chat,
64
  type="messages",
65
- title="QWEN3-0.6B ZERO-THINKING",
66
- description="Greedy search and hard-coded stop triggers to ensure direct output."
67
  )
68
 
69
  if __name__ == "__main__":
 
1
  import gradio as gr
2
+ from llama_cpp import Llama
3
+
# Model settings, named so the constructor call below reads on its own.
# A 2048-token context window is plenty for fast chat.
MODEL_PATH = "llama3.2-1b-Uncensored-Q4_K_M.gguf"
CONTEXT_TOKENS = 2048
# Thread count the author chose for the Hugging Face free-tier CPU.
# NOTE(review): confirm this matches the vCPUs the Space is actually given;
# more threads than cores usually slows llama.cpp down.
CPU_THREADS = 4

# Load the GGUF model once at import time; stream_chat reuses this instance.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=CONTEXT_TOKENS,
    n_threads=CPU_THREADS,
)
11
 
def stream_chat(message, history):
    """Stream a reply to *message* from the module-level ``llm``.

    Builds a Llama 3 chat-template prompt (system turn, as many of the most
    recent turns from *history* as fit, the new user turn, then an open
    assistant header) and yields the accumulated reply after each streamed
    chunk.

    Args:
        message: Text of the new user message.
        history: Earlier turns as dicts with ``'role'`` and ``'content'``
            keys (the ``type="messages"`` format); may be empty or ``None``.

    Yields:
        The reply text generated so far, growing with every chunk.
    """
    reply_budget = 512  # passed to the model as max_tokens below

    # No literal <|begin_of_text|> here: llama-cpp-python prepends the BOS
    # token itself when it tokenizes the prompt, so spelling it out as well
    # would feed the model two BOS tokens.
    system_turn = (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a direct AI. Respond immediately. No thinking. "
        "No internal monologue.<|eot_id|>"
    )
    open_turn = (
        f"<|start_header_id|>user<|end_header_id|>\n\n{message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    def token_count(text):
        # special=True so template markers count as single tokens, exactly
        # as they will when the full prompt is tokenized for generation.
        return len(llm.tokenize(text.encode("utf-8"), add_bos=False, special=True))

    # Tokens left for history once BOS (1), the fixed turns and the reply
    # budget are reserved. Without this cap a long conversation eventually
    # exceeds the context window and generation raises ValueError.
    room = (
        llm.n_ctx()
        - reply_budget
        - 1
        - token_count(system_turn)
        - token_count(open_turn)
    )

    # Walk history newest-first so that the oldest turns are the ones dropped.
    kept_turns = []
    for msg in reversed(list(history or [])):
        turn = (
            f"<|start_header_id|>{msg['role']}<|end_header_id|>\n\n"
            f"{msg['content']}<|eot_id|>"
        )
        cost = token_count(turn)
        if cost > room:
            break
        room -= cost
        kept_turns.append(turn)
    kept_turns.reverse()

    prompt = system_turn + "".join(kept_turns) + open_turn

    stream = llm(
        prompt,
        max_tokens=reply_budget,
        # Stop at the end of the assistant turn, or if the model starts
        # writing another turn header. The former "User:" / "Assistant:"
        # stops belonged to the old plain-text prompt and cut off any reply
        # that merely contained those words.
        stop=["<|eot_id|>", "<|start_header_id|>"],
        stream=True,
        temperature=0,  # greedy decoding
        repeat_penalty=1.2,
    )

    partial_text = ""
    for output in stream:
        partial_text += output["choices"][0]["text"]
        yield partial_text
 
 
 
 
 
 
 
 
 
 
 
 
40
 
# Options for the single chat panel; replies are streamed by stream_chat.
CHAT_UI_OPTIONS = dict(
    fn=stream_chat,
    type="messages",
    title="LLAMA-3.2-1B UNCENSORED (GGUF)",
    description="Running on Llama.cpp for maximum CPU speed. No-thinking mode active.",
)

# Build the Gradio app: one Blocks container holding the chat interface.
with gr.Blocks() as demo:
    gr.ChatInterface(**CHAT_UI_OPTIONS)
48
 
49
  if __name__ == "__main__":