Update models/loader.py

models/loader.py  CHANGED  (+38 -16)
@@ -1,12 +1,9 @@
 # models/loader.py
 import torch
-import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from backend.agents import ROLE_PROMPTS
-
-
-torch.set_num_threads(num_threads)
-# The following configs are no longer used for CPU, but kept for future GPU use.
+
+# This configuration is not used for CPU, but is kept for future GPU use.
 QUANTIZATION_CONFIG = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -27,24 +24,48 @@ MODEL_REGISTRY = {
 }
 _MODEL_CACHE = {}
 
-def get_model_and_tokenizer(model_name
+def get_model_and_tokenizer(model_name):
+    """
+    Loads a model and its tokenizer from the Hugging Face Hub.
+    Implements caching to avoid reloading the model for each call.
+    """
     if model_name not in _MODEL_CACHE:
         print(f"Loading model: {model_name}...")
+
+        # Load the tokenizer first
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        # FIX: Explicitly set the pad token if it's the same as the eos token.
+        # This prevents the model from getting stuck in a generation loop.
+        if tokenizer.pad_token is None:
+            if tokenizer.eos_token is not None:
+                tokenizer.pad_token = tokenizer.eos_token
+            else:
+                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+                print("Added a new [PAD] token to the tokenizer.")
+
+        # Load the model with no device map or quantization for CPU inference
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map=None,
+            quantization_config=None,
+            trust_remote_code=True,
+        )
+
+        # Explicitly move the model to the CPU after loading
+        model.to("cpu")
+
         _MODEL_CACHE[model_name] = {
-            "model": AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map=None,
-                quantization_config=None,
-                trust_remote_code=True,
-            ),
-            "tokenizer": AutoTokenizer.from_pretrained(model_name)
+            "model": model,
+            "tokenizer": tokenizer
         }
-        # Explicitly move the model to the CPU after loading
-        _MODEL_CACHE[model_name]["model"].to("cpu")
 
     return _MODEL_CACHE[model_name]["model"], _MODEL_CACHE[model_name]["tokenizer"]
 
 def generate_with_model(agent_role, prompt):
+    """
+    Generates a response using the specified agent's model.
+    """
     model_name = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
     model, tokenizer = get_model_and_tokenizer(model_name)
 
@@ -59,7 +80,8 @@ def generate_with_model(agent_role, prompt):
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
-        repetition_penalty=1.1
+        repetition_penalty=1.1,
+        pad_token_id=tokenizer.pad_token_id
     )
 
     decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
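Why the pad-token handling matters: many causal-LM tokenizers ship without a pad token, and transformers then falls back to the EOS token at generation time, emitting a warning on every call. The second hunk makes that choice explicit at load time. A minimal standalone sketch of the same fallback, assuming only transformers and using the registry's Qwen/Qwen3-0.6B default as a stand-in checkpoint:

# Minimal sketch of the pad-token fallback; "Qwen/Qwen3-0.6B" is just the
# registry default from the diff, any causal-LM tokenizer would do.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)

if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token  # reuse EOS for padding
    else:
        # A brand-new token grows the vocabulary; the model would then need
        # model.resize_token_embeddings(len(tokenizer)) before generation.
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})

print(tokenizer.pad_token, tokenizer.pad_token_id)

Passing pad_token_id=tokenizer.pad_token_id into model.generate (third hunk) makes the same choice explicit on the generation side and silences the recurring "Setting pad_token_id to eos_token_id" warning.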
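A usage sketch, for reference. The role key below is hypothetical, since the MODEL_REGISTRY entries fall outside the hunks shown; per the code above, unknown roles fall back to Qwen/Qwen3-0.6B, and repeated lookups hit _MODEL_CACHE instead of reloading weights:

# Hypothetical usage; "researcher" stands in for whatever role keys
# MODEL_REGISTRY actually defines (its entries are elided from the hunks).
from models.loader import generate_with_model, get_model_and_tokenizer

# Assumes generate_with_model returns the decoded text; its return
# statement is outside the shown hunks.
reply = generate_with_model("researcher", "Summarize the findings in two sentences.")
print(reply)

# Caching: a second request for the same checkpoint reuses the loaded objects.
m1, t1 = get_model_and_tokenizer("Qwen/Qwen3-0.6B")
m2, t2 = get_model_and_tokenizer("Qwen/Qwen3-0.6B")
assert m1 is m2 and t1 is t2  # no reload on the second call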