Keeby-smilyai committed
Commit 814e2eb · verified · 1 Parent(s): c954cfb

Update models/loader.py

Files changed (1)
  1. models/loader.py +38 -16
models/loader.py CHANGED
@@ -1,12 +1,9 @@
 # models/loader.py
 import torch
-import os
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from backend.agents import ROLE_PROMPTS
-num_threads = os.cpu_count() or 2
-print(num_threads)
-torch.set_num_threads(num_threads)
-# The following configs are no longer used for CPU, but kept for future GPU use.
+
+# This configuration is not used for CPU, but is kept for future GPU use.
 QUANTIZATION_CONFIG = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -27,24 +24,48 @@ MODEL_REGISTRY = {
 }
 _MODEL_CACHE = {}
 
-def get_model_and_tokenizer(model_name="Qwen/Qwen3-0.6B"):
+def get_model_and_tokenizer(model_name):
+    """
+    Loads a model and its tokenizer from the Hugging Face Hub.
+    Implements caching to avoid reloading the model for each call.
+    """
     if model_name not in _MODEL_CACHE:
         print(f"Loading model: {model_name}...")
+
+        # Load the tokenizer first
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+        # FIX: Explicitly set a pad token when the tokenizer does not define one.
+        # This prevents the model from getting stuck in a generation loop.
+        if tokenizer.pad_token is None:
+            if tokenizer.eos_token is not None:
+                tokenizer.pad_token = tokenizer.eos_token
+            else:
+                tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+                print("Added a new [PAD] token to the tokenizer.")
+
+        # Load the model with no device map or quantization for CPU inference
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map=None,
+            quantization_config=None,
+            trust_remote_code=True,
+        )
+
+        # Explicitly move the model to the CPU after loading
+        model.to("cpu")
+
         _MODEL_CACHE[model_name] = {
-            "model": AutoModelForCausalLM.from_pretrained(
-                model_name,
-                device_map=None,
-                quantization_config=None,
-                trust_remote_code=True,
-            ),
-            "tokenizer": AutoTokenizer.from_pretrained(model_name)
+            "model": model,
+            "tokenizer": tokenizer
         }
-    # Explicitly move the model to the CPU after loading
-    _MODEL_CACHE[model_name]["model"].to("cpu")
 
     return _MODEL_CACHE[model_name]["model"], _MODEL_CACHE[model_name]["tokenizer"]
 
 def generate_with_model(agent_role, prompt):
+    """
+    Generates a response using the specified agent's model.
+    """
     model_name = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
     model, tokenizer = get_model_and_tokenizer(model_name)
 
@@ -59,7 +80,8 @@ def generate_with_model(agent_role, prompt):
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
-        repetition_penalty=1.1
+        repetition_penalty=1.1,
+        pad_token_id=tokenizer.pad_token_id
     )
 
     decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
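Note on the [PAD] fallback in the hunk above: tokenizer.add_special_tokens() grows the vocabulary, so if that branch ever fires, the model's embedding matrix must be resized to match or generation can index out of range. The Qwen tokenizers this loader targets define an eos token, so the branch is likely never taken here, but a guard along these lines would make it safe. This is a sketch, not part of the commit; it assumes the model and tokenizer locals from get_model_and_tokenizer():

# Sketch (not in the commit): resize embeddings if add_special_tokens()
# introduced a brand-new [PAD] token beyond the model's vocabulary.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))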
 
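For context, a minimal sketch of how the two helpers compose after this change. The role string "planner" is hypothetical (actual keys live in MODEL_REGISTRY and ROLE_PROMPTS), and the import assumes the repo root is on sys.path; unknown roles fall back to Qwen/Qwen3-0.6B:

from models.loader import generate_with_model, get_model_and_tokenizer

# First call for a checkpoint downloads it and caches it on CPU.
# "planner" is a hypothetical role name, not necessarily in MODEL_REGISTRY.
reply = generate_with_model("planner", "Summarize the loader changes.")
print(reply)

# Later calls hit _MODEL_CACHE instead of reloading from the Hub,
# and the tokenizer is guaranteed to have a usable pad token.
model, tokenizer = get_model_and_tokenizer("Qwen/Qwen3-0.6B")
assert tokenizer.pad_token_id is not None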