# models/loader.py
"""Model/tokenizer loading and generation helpers for agent roles.

Loads causal LMs from the Hugging Face Hub (optionally 4-bit quantized
when a GPU is available), caches them per model name, and exposes a
simple role-keyed generation helper.
"""
import logging

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)

from backend.agents import ROLE_PROMPTS

logger = logging.getLogger(__name__)

# Optional quantization config (used only if a GPU is available).
QUANTIZATION_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Maps agent role -> Hub model id; unknown roles fall back to the default
# model inside generate_with_model().
MODEL_REGISTRY = {
    "ceo": "Qwen/Qwen3-0.6B",
    "planner": "Qwen/Qwen3-0.6B",
    "manager": "Qwen/Qwen3-0.6B",
    "debugger": "Qwen/Qwen3-0.6B",
    "business_analyst": "Qwen/Qwen3-0.6B",
    "ux_ui_designer": "Qwen/Qwen3-0.6B",
    "worker_backend_coder": "Qwen/Qwen3-0.6B",
    "worker_front_end_coder": "Qwen/Qwen3-0.6B",
    "worker_tester": "Qwen/Qwen3-0.6B",
    "code_analyst": "Qwen/Qwen3-0.6B",
}

# Cache of loaded models/tokenizers keyed by model name, so each model
# is loaded from the Hub at most once per process.
_MODEL_CACHE = {}

# Explicit generation config (avoids model-specific overrides).
GENERATION_CONFIG = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
)


def get_model_and_tokenizer(model_name):
    """Load a model and its tokenizer from the Hugging Face Hub.

    Results are cached in ``_MODEL_CACHE`` so repeated calls for the same
    ``model_name`` do not reload the model.

    Args:
        model_name: Hub model identifier (e.g. ``"Qwen/Qwen3-0.6B"``).

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    if model_name in _MODEL_CACHE:
        cached = _MODEL_CACHE[model_name]
        return cached["model"], cached["tokenizer"]

    logger.info("Loading model: %s...", model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Ensure a dedicated pad token exists (distinct from EOS), so that
    # padding positions can be masked without also masking end-of-sequence.
    num_added = 0
    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
        num_added = tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

    # Load model with GPU/CPU awareness; quantize only when a GPU exists
    # (bitsandbytes 4-bit requires CUDA).
    use_gpu = torch.cuda.is_available()
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto" if use_gpu else None,
        quantization_config=QUANTIZATION_CONFIG if use_gpu else None,
        trust_remote_code=True,
    )

    # Resize embeddings only when tokens were actually added — resizing a
    # quantized model unnecessarily is wasteful and can be fragile.
    if num_added:
        model.resize_token_embeddings(len(tokenizer))

    # Explicitly move to CPU if no GPU.
    if not use_gpu:
        model.to("cpu")

    # Inference-only usage: make eval mode explicit (disables dropout etc.).
    model.eval()

    _MODEL_CACHE[model_name] = {"model": model, "tokenizer": tokenizer}
    return model, tokenizer


def generate_with_model(agent_role, prompt, generation_config=None):
    """Generate a response using the model registered for ``agent_role``.

    Args:
        agent_role: Key into ``MODEL_REGISTRY``; unknown roles fall back to
            ``"Qwen/Qwen3-0.6B"``.
        prompt: User prompt text appended after the role's system prompt.
        generation_config: Optional ``GenerationConfig``; defaults to the
            module-level ``GENERATION_CONFIG``. (Resolved via a ``None``
            sentinel so the shared config object is never a mutable
            default argument.)

    Returns:
        The decoded completion with the prompt tokens stripped.
    """
    if generation_config is None:
        generation_config = GENERATION_CONFIG

    model_name = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
    model, tokenizer = get_model_and_tokenizer(model_name)

    full_prompt = (
        f"You are a helpful assistant. {ROLE_PROMPTS.get(agent_role, '')}"
        f"\n\nUser prompt: {prompt}"
    )

    # Use tokenizer(...) to get both input_ids and attention_mask.
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # ensures padding is ignored
            generation_config=generation_config,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Slice off the prompt tokens to avoid prompt-echo issues.
    generated_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()