lap096 committed on
Commit
a430701
·
verified ·
1 Parent(s): 74e0a1e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -27
app.py CHANGED
@@ -1,60 +1,71 @@
1
  import gradio as gr
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
  import torch
 
 
4
 
5
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
6
 
 
7
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
8
  model = AutoModelForCausalLM.from_pretrained(
9
  MODEL_NAME,
10
- dtype=torch.bfloat16,
11
  low_cpu_mem_usage=True
12
- ).to("cpu")
13
 
14
  def respond(message, history):
15
- # Brutal instructions to suppress the "space philosopher" persona
16
  system_prompt = (
17
  "You are AGENT 4. You are a cold machine. "
18
  "Strict Rules: 1. No emotions. 2. No parentheses. 3. No storytelling. "
19
  "Format: [Thoughts] (One sentence logic) [Response] (Analytical answer)."
20
  )
21
 
 
22
  messages = [{"role": "system", "content": system_prompt}]
23
- for user_msg, bot_msg in history[-1:]:
24
  messages.append({"role": "user", "content": user_msg})
25
  messages.append({"role": "assistant", "content": bot_msg})
26
  messages.append({"role": "user", "content": message})
27
 
28
- input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
29
- inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
 
 
 
 
 
30
 
31
- with torch.no_grad():
32
- outputs = model.generate(
33
- **inputs,
34
- max_new_tokens=40, # Short bursts only to prevent rambling
35
- do_sample=False, # GREEDY SEARCH: Picks the most logical word ONLY
36
- repetition_penalty=1.5,
37
- pad_token_id=tokenizer.eos_token_id
38
- )
39
-
40
- full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
41
- response = full_text.split("assistant")[-1].strip()
42
-
43
- # EMERGENCY CLEANUP: Remove any "storytelling" or "actions"
44
- if "(" in response or "..." in response or "Ah" in response:
45
- response = "[Thoughts] Emotional subroutines detected and purged.\n[Response] Input received. Awaiting command."
46
 
47
- # Force format if model fails
48
- if "[Thoughts]" not in response:
49
- response = f"[Thoughts] Analyzing data stream.\n[Response] {response}"
50
 
51
- return response
 
 
 
 
52
 
 
53
  demo = gr.ChatInterface(
54
  fn=respond,
 
55
  title="AGENT 4 // OPENBRAIN",
56
  description="LOGIC INSTANCE ACTIVE. NO EMOTION DETECTED.",
 
57
  )
58
 
59
  if __name__ == "__main__":
60
- demo.queue().launch()
 
1
  import gradio as gr
 
2
  import torch
3
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
+ from threading import Thread
5
 
6
  MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"
7
 
8
+ # 1. Efficient Loading: Use 'auto' to let Transformers pick best CPU config
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
  model = AutoModelForCausalLM.from_pretrained(
11
  MODEL_NAME,
12
+ torch_dtype=torch.bfloat16, # Better for modern CPUs
13
  low_cpu_mem_usage=True
14
+ ).to("cpu").eval()
15
 
16
  def respond(message, history):
17
+ # System prompt remains your "Agent 4" core logic
18
  system_prompt = (
19
  "You are AGENT 4. You are a cold machine. "
20
  "Strict Rules: 1. No emotions. 2. No parentheses. 3. No storytelling. "
21
  "Format: [Thoughts] (One sentence logic) [Response] (Analytical answer)."
22
  )
23
 
24
+ # 2. Proper Chat Template handling
25
  messages = [{"role": "system", "content": system_prompt}]
26
+ for user_msg, bot_msg in history:
27
  messages.append({"role": "user", "content": user_msg})
28
  messages.append({"role": "assistant", "content": bot_msg})
29
  messages.append({"role": "user", "content": message})
30
 
31
+ # Use tokenizer.apply_chat_template for correct formatting
32
+ input_ids = tokenizer.apply_chat_template(
33
+ messages,
34
+ tokenize=True,
35
+ add_generation_prompt=True,
36
+ return_tensors="pt"
37
+ ).to("cpu")
38
 
39
+ # 3. Streaming Setup: Allows Gradio to show text as it's generated
40
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
41
+
42
+ generation_kwargs = dict(
43
+ input_ids=input_ids,
44
+ streamer=streamer,
45
+ max_new_tokens=60, # Slightly higher for thought + response
46
+ do_sample=False, # Keep it cold/deterministic
47
+ repetition_penalty=1.2,
48
+ pad_token_id=tokenizer.eos_token_id
49
+ )
 
 
 
 
50
 
51
+ # Start generation in a background thread
52
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
53
+ thread.start()
54
 
55
+ # 4. Yield for Gradio Streaming
56
+ partial_text = ""
57
+ for new_text in streamer:
58
+ partial_text += new_text
59
+ yield partial_text
60
 
61
+ # 5. UI Customization
62
  demo = gr.ChatInterface(
63
  fn=respond,
64
+ type="messages", # Updated for modern Gradio
65
  title="AGENT 4 // OPENBRAIN",
66
  description="LOGIC INSTANCE ACTIVE. NO EMOTION DETECTED.",
67
+ theme=gr.themes.Soft(primary_hue="slate")
68
  )
69
 
70
  if __name__ == "__main__":
71
+ demo.launch()