# NOTE: the three lines below were Hugging Face file-viewer residue (author
# line, commit message, commit hash) accidentally pasted into the source;
# preserved here as a comment so the file remains valid Python.
# Keeby-smilyai — Update models/loader.py — commit fa87e26 (verified)
# models/loader.py
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
GenerationConfig,
)
from backend.agents import ROLE_PROMPTS
# Optional quantization config (used only if GPU is available).
# NF4 4-bit weight quantization with bfloat16 compute (bitsandbytes);
# passed to from_pretrained() only when CUDA is present.
QUANTIZATION_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Maps each agent role to the Hub model id backing that role.
# All roles currently share the same small Qwen model; the mapping exists so
# individual roles can be pointed at different checkpoints later.
MODEL_REGISTRY = {
    "ceo": "Qwen/Qwen3-0.6B",
    "planner": "Qwen/Qwen3-0.6B",
    "manager": "Qwen/Qwen3-0.6B",
    "debugger": "Qwen/Qwen3-0.6B",
    "business_analyst": "Qwen/Qwen3-0.6B",
    "ux_ui_designer": "Qwen/Qwen3-0.6B",
    "worker_backend_coder": "Qwen/Qwen3-0.6B",
    "worker_front_end_coder": "Qwen/Qwen3-0.6B",
    "worker_tester": "Qwen/Qwen3-0.6B",
    "code_analyst": "Qwen/Qwen3-0.6B",
}

# Process-wide cache: model_name -> {"model": ..., "tokenizer": ...}.
# Populated lazily by get_model_and_tokenizer(); not thread-safe.
_MODEL_CACHE = {}

# Explicit generation config (avoids model-specific overrides).
# Sampling is enabled, so outputs are non-deterministic by design.
GENERATION_CONFIG = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
)
def get_model_and_tokenizer(model_name):
    """
    Load a causal-LM model and its tokenizer from the Hugging Face Hub.

    Results are cached in ``_MODEL_CACHE`` keyed by ``model_name`` so each
    model is downloaded and instantiated at most once per process.

    Args:
        model_name: Hub repo id, e.g. ``"Qwen/Qwen3-0.6B"``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    if model_name not in _MODEL_CACHE:
        print(f"Loading model: {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        # Ensure a dedicated pad token exists (not EOS): when PAD == EOS the
        # attention mask cannot distinguish padding from a real end token.
        # add_special_tokens returns the number of tokens actually added.
        num_added = 0
        if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
            num_added = tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

        # GPU/CPU awareness: 4-bit quantization is only applicable on CUDA.
        # With device_map=None, from_pretrained materializes the model on CPU
        # already, so no explicit .to("cpu") is needed afterwards.
        use_gpu = torch.cuda.is_available()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if use_gpu else None,
            quantization_config=QUANTIZATION_CONFIG if use_gpu else None,
            trust_remote_code=True,
        )

        # Resize embeddings only when tokens were actually added: an
        # unconditional resize needlessly reallocates the embedding matrix
        # and can fail on quantized models.
        if num_added > 0:
            model.resize_token_embeddings(len(tokenizer))

        # Inference-only loader: disable dropout and other train-mode layers.
        model.eval()

        _MODEL_CACHE[model_name] = {"model": model, "tokenizer": tokenizer}
    return _MODEL_CACHE[model_name]["model"], _MODEL_CACHE[model_name]["tokenizer"]
def generate_with_model(agent_role, prompt, generation_config=None):
    """
    Generate a response using the model registered for ``agent_role``.

    Args:
        agent_role: Key into ``MODEL_REGISTRY`` / ``ROLE_PROMPTS``; unknown
            roles fall back to the default Qwen model and an empty role prompt.
        prompt: The user's request text.
        generation_config: Optional ``GenerationConfig`` override. Defaults to
            the module-level ``GENERATION_CONFIG``.

    Returns:
        The decoded model response with the prompt echo stripped.
    """
    # Avoid a shared mutable default: a GenerationConfig instance is mutable,
    # so putting GENERATION_CONFIG directly in the signature would let one
    # caller's in-place mutation leak into every subsequent call.
    if generation_config is None:
        generation_config = GENERATION_CONFIG

    model_name = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
    model, tokenizer = get_model_and_tokenizer(model_name)

    full_prompt = f"You are a helpful assistant. {ROLE_PROMPTS.get(agent_role, '')}\n\nUser prompt: {prompt}"

    # tokenizer(...) yields both input_ids and attention_mask; the mask is
    # required so generate() ignores any padding positions.
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # ensures padding is ignored
            generation_config=generation_config,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Slice off the prompt tokens so the returned text contains only the newly
    # generated continuation (avoids prompt-echo issues).
    generated_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()