Keeby-smilyai committed on
Commit
fa87e26
·
verified ·
1 Parent(s): dcd1081

Update models/loader.py

Browse files
Files changed (1) hide show
  1. models/loader.py +17 -11
models/loader.py CHANGED
@@ -29,11 +29,13 @@ MODEL_REGISTRY = {
29
  }
30
  _MODEL_CACHE = {}
31
 
32
- # Default generation config (can be overridden per call if needed)
33
  GENERATION_CONFIG = GenerationConfig(
34
  max_new_tokens=512,
 
35
  temperature=0.7,
36
  top_p=0.9,
 
37
  repetition_penalty=1.1,
38
  )
39
 
@@ -48,14 +50,9 @@ def get_model_and_tokenizer(model_name):
48
 
49
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
50
 
51
- # Ensure a valid pad token exists
52
- if tokenizer.pad_token is None:
53
- if tokenizer.eos_token:
54
- tokenizer.pad_token = tokenizer.eos_token
55
- elif tokenizer.unk_token:
56
- tokenizer.pad_token = tokenizer.unk_token
57
- else:
58
- tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
59
 
60
  # Load model with GPU/CPU awareness
61
  use_gpu = torch.cuda.is_available()
@@ -66,7 +63,7 @@ def get_model_and_tokenizer(model_name):
66
  trust_remote_code=True,
67
  )
68
 
69
- # If new tokens were added, resize embeddings
70
  model.resize_token_embeddings(len(tokenizer))
71
 
72
  # Explicitly move to CPU if no GPU
@@ -87,11 +84,20 @@ def generate_with_model(agent_role, prompt, generation_config: GenerationConfig
87
 
88
  full_prompt = f"You are a helpful assistant. {ROLE_PROMPTS.get(agent_role, '')}\n\nUser prompt: {prompt}"
89
 
90
- input_ids = tokenizer.encode(full_prompt, return_tensors="pt").to(model.device)
 
 
 
 
 
 
 
 
91
 
92
  with torch.no_grad():
93
  output = model.generate(
94
  input_ids,
 
95
  generation_config=generation_config,
96
  pad_token_id=tokenizer.pad_token_id,
97
  )
 
29
  }
30
  _MODEL_CACHE = {}
31
 
32
+ # Explicit generation config (avoids model-specific overrides)
33
  GENERATION_CONFIG = GenerationConfig(
34
  max_new_tokens=512,
35
+ do_sample=True,
36
  temperature=0.7,
37
  top_p=0.9,
38
+ top_k=50,
39
  repetition_penalty=1.1,
40
  )
41
 
 
50
 
51
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
52
 
53
+ # Ensure a dedicated pad token exists (not EOS)
54
+ if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
55
+ tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
 
 
 
 
 
56
 
57
  # Load model with GPU/CPU awareness
58
  use_gpu = torch.cuda.is_available()
 
63
  trust_remote_code=True,
64
  )
65
 
66
+ # Resize embeddings if new tokens were added
67
  model.resize_token_embeddings(len(tokenizer))
68
 
69
  # Explicitly move to CPU if no GPU
 
84
 
85
  full_prompt = f"You are a helpful assistant. {ROLE_PROMPTS.get(agent_role, '')}\n\nUser prompt: {prompt}"
86
 
87
+ # Use tokenizer(...) to get both input_ids and attention_mask
88
+ inputs = tokenizer(
89
+ full_prompt,
90
+ return_tensors="pt",
91
+ padding=True,
92
+ truncation=True,
93
+ )
94
+ input_ids = inputs["input_ids"].to(model.device)
95
+ attention_mask = inputs["attention_mask"].to(model.device)
96
 
97
  with torch.no_grad():
98
  output = model.generate(
99
  input_ids,
100
+ attention_mask=attention_mask, # ✅ ensures padding is ignored
101
  generation_config=generation_config,
102
  pad_token_id=tokenizer.pad_token_id,
103
  )