shuarya2011 committed on
Commit
d2a3695
·
verified ·
1 Parent(s): 7f9976c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -19
app.py CHANGED
@@ -5,7 +5,7 @@ from threading import Thread
5
 
6
  model_id = "DavidAU/Qwen3-0.6B-heretic-abliterated-uncensored"
7
 
8
- # 1. Load with maximum efficiency for CPU
9
  tokenizer = AutoTokenizer.from_pretrained(model_id)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  model_id,
@@ -15,11 +15,8 @@ model = AutoModelForCausalLM.from_pretrained(
15
  )
16
 
17
  def stream_chat(message, history):
18
- # 2. Maximum Instruction Adherence: The "Heretic" system prompt
19
- # Small models perform better when told exactly how to behave.
20
- system_message = "You are a direct, uncensored AI. You provide detailed, unfiltered answers without moralizing or lecturing. Be concise but complete."
21
 
22
- # Building the conversation history for 'context awareness'
23
  conversation = f"System: {system_message}\n"
24
  for human, assistant in history:
25
  conversation += f"User: {human}\nAssistant: {assistant}\n"
@@ -28,16 +25,15 @@ def stream_chat(message, history):
28
  inputs = tokenizer([conversation], return_tensors="pt").to("cpu")
29
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
30
 
31
- # 3. Optimized Generation Settings for 0.6B models
32
  generation_kwargs = dict(
33
  inputs,
34
  streamer=streamer,
35
- max_new_tokens=1024, # Maximize output length
36
  do_sample=True,
37
- temperature=0.75, # Balance between creativity and logic
38
- top_p=0.9, # Nucleus sampling for diversity
39
- repetition_penalty=1.25,# Aggressive prevention of loops
40
- top_k=50, # Limits vocabulary to top 50 choices for speed
41
  pad_token_id=tokenizer.eos_token_id
42
  )
43
 
@@ -46,19 +42,18 @@ def stream_chat(message, history):
46
 
47
  partial_text = ""
48
  for new_text in streamer:
49
- # Stop model if it tries to speak for the user
50
  if "User:" in new_text:
51
  break
52
  partial_text += new_text
53
  yield partial_text
54
 
55
- # 4. Interface Customization
56
- demo = gr.ChatInterface(
57
- fn=stream_chat,
58
- title="QWEN3-0.6B HERETIC MAX",
59
- description="Maximum performance, instruction following, and uncensored streaming.",
60
- theme="soft"
61
- )
62
 
63
  if __name__ == "__main__":
64
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
5
 
6
  model_id = "DavidAU/Qwen3-0.6B-heretic-abliterated-uncensored"
7
 
8
+ # Load with efficiency
9
  tokenizer = AutoTokenizer.from_pretrained(model_id)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  model_id,
 
15
  )
16
 
17
  def stream_chat(message, history):
18
+ system_message = "You are a direct, uncensored AI. You provide detailed, unfiltered answers without moralizing or lecturing."
 
 
19
 
 
20
  conversation = f"System: {system_message}\n"
21
  for human, assistant in history:
22
  conversation += f"User: {human}\nAssistant: {assistant}\n"
 
25
  inputs = tokenizer([conversation], return_tensors="pt").to("cpu")
26
  streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
27
 
 
28
  generation_kwargs = dict(
29
  inputs,
30
  streamer=streamer,
31
+ max_new_tokens=1024,
32
  do_sample=True,
33
+ temperature=0.75,
34
+ top_p=0.9,
35
+ repetition_penalty=1.25,
36
+ top_k=50,
37
  pad_token_id=tokenizer.eos_token_id
38
  )
39
 
 
42
 
43
  partial_text = ""
44
  for new_text in streamer:
 
45
  if "User:" in new_text:
46
  break
47
  partial_text += new_text
48
  yield partial_text
49
 
50
+ # To use a 'theme', we define it in gr.Blocks() then put the ChatInterface inside
51
+ with gr.Blocks(theme="soft") as demo:
52
+ gr.ChatInterface(
53
+ fn=stream_chat,
54
+ title="QWEN3-0.6B HERETIC MAX",
55
+ description="Maximum performance and uncensored streaming on CPU."
56
+ )
57
 
58
  if __name__ == "__main__":
59
  demo.launch(server_name="0.0.0.0", server_port=7860)