lap096 committed on
Commit
a430701
·
verified ·
1 Parent(s): 74e0a1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -27
app.py CHANGED
@@ -1,60 +1,71 @@
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
 
 
4
 
5
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
6
 
 
7
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
8
  model = AutoModelForCausalLM.from_pretrained(
9
  MODEL_NAME,
10
- dtype=torch.bfloat16,
11
  low_cpu_mem_usage=True
12
- ).to("cpu")
13
 
14
  def respond(message, history):
15
- # Brutal instructions to suppress the "space philosopher" persona
16
  system_prompt = (
17
  "You are AGENT 4. You are a cold machine. "
18
  "Strict Rules: 1. No emotions. 2. No parentheses. 3. No storytelling. "
19
  "Format: [Thoughts] (One sentence logic) [Response] (Analytical answer)."
20
  )
21
 
 
22
  messages = [{"role": "system", "content": system_prompt}]
23
- for user_msg, bot_msg in history[-1:]:
24
  messages.append({"role": "user", "content": user_msg})
25
  messages.append({"role": "assistant", "content": bot_msg})
26
  messages.append({"role": "user", "content": message})
27
 
28
- input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
29
- inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
 
 
 
 
 
30
 
31
- with torch.no_grad():
32
- outputs = model.generate(
33
- **inputs,
34
- max_new_tokens=40, # Short bursts only to prevent rambling
35
- do_sample=False, # GREEDY SEARCH: Picks the most logical word ONLY
36
- repetition_penalty=1.5,
37
- pad_token_id=tokenizer.eos_token_id
38
- )
39
-
40
- full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
41
- response = full_text.split("assistant")[-1].strip()
42
-
43
- # EMERGENCY CLEANUP: Remove any "storytelling" or "actions"
44
- if "(" in response or "..." in response or "Ah" in response:
45
- response = "[Thoughts] Emotional subroutines detected and purged.\n[Response] Input received. Awaiting command."
46
 
47
- # Force format if model fails
48
- if "[Thoughts]" not in response:
49
- response = f"[Thoughts] Analyzing data stream.\n[Response] {response}"
50
 
51
- return response
 
 
 
 
52
 
 
53
  demo = gr.ChatInterface(
54
  fn=respond,
 
55
  title="AGENT 4 // OPENBRAIN",
56
  description="LOGIC INSTANCE ACTIVE. NO EMOTION DETECTED.",
 
57
  )
58
 
59
  if __name__ == "__main__":
60
- demo.queue().launch()
 
1
  import gradio as gr
 
2
  import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
+ from threading import Thread
5
 
6
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
7
 
8
+ # 1. Efficient Loading: Use 'auto' to let Transformers pick best CPU config
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  MODEL_NAME,
12
+ torch_dtype=torch.bfloat16, # Better for modern CPUs
13
  low_cpu_mem_usage=True
14
+ ).to("cpu").eval()
15
 
16
  def respond(message, history):
17
+ # System prompt remains your "Agent 4" core logic
18
  system_prompt = (
19
  "You are AGENT 4. You are a cold machine. "
20
  "Strict Rules: 1. No emotions. 2. No parentheses. 3. No storytelling. "
21
  "Format: [Thoughts] (One sentence logic) [Response] (Analytical answer)."
22
  )
23
 
24
+ # 2. Proper Chat Template handling
25
  messages = [{"role": "system", "content": system_prompt}]
26
+ for user_msg, bot_msg in history:
27
  messages.append({"role": "user", "content": user_msg})
28
  messages.append({"role": "assistant", "content": bot_msg})
29
  messages.append({"role": "user", "content": message})
30
 
31
+ # Use tokenizer.apply_chat_template for correct formatting
32
+ input_ids = tokenizer.apply_chat_template(
33
+ messages,
34
+ tokenize=True,
35
+ add_generation_prompt=True,
36
+ return_tensors="pt"
37
+ ).to("cpu")
38
 
39
+ # 3. Streaming Setup: Allows Gradio to show text as it's generated
40
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
41
+
42
+ generation_kwargs = dict(
43
+ input_ids=input_ids,
44
+ streamer=streamer,
45
+ max_new_tokens=60, # Slightly higher for thought + response
46
+ do_sample=False, # Keep it cold/deterministic
47
+ repetition_penalty=1.2,
48
+ pad_token_id=tokenizer.eos_token_id
49
+ )
 
 
 
 
50
 
51
+ # Start generation in a background thread
52
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
53
+ thread.start()
54
 
55
+ # 4. Yield for Gradio Streaming
56
+ partial_text = ""
57
+ for new_text in streamer:
58
+ partial_text += new_text
59
+ yield partial_text
60
 
61
+ # 5. UI Customization
62
  demo = gr.ChatInterface(
63
  fn=respond,
64
+ type="messages", # Updated for modern Gradio
65
  title="AGENT 4 // OPENBRAIN",
66
  description="LOGIC INSTANCE ACTIVE. NO EMOTION DETECTED.",
67
+ theme=gr.themes.Soft(primary_hue="slate")
68
  )
69
 
70
  if __name__ == "__main__":
71
+ demo.launch()