# NOTE: the three lines below were Hugging Face file-viewer residue (author
# line, commit message, commit hash) accidentally pasted into the source;
# preserved here as a comment so the file remains valid Python.
# Keeby-smilyai — Update models/loader.py — commit fa87e26 (verified)
# models/loader.py
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
GenerationConfig,
)
from backend.agents import ROLE_PROMPTS
# Optional quantization config (used only if GPU is available).
# NF4 4-bit weight quantization with bfloat16 compute (bitsandbytes);
# passed to from_pretrained() only when CUDA is present.
QUANTIZATION_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Maps each agent role to the Hub model id backing that role.
# All roles currently share the same small Qwen model; the mapping exists so
# individual roles can be pointed at different checkpoints later.
MODEL_REGISTRY = {
    "ceo": "Qwen/Qwen3-0.6B",
    "planner": "Qwen/Qwen3-0.6B",
    "manager": "Qwen/Qwen3-0.6B",
    "debugger": "Qwen/Qwen3-0.6B",
    "business_analyst": "Qwen/Qwen3-0.6B",
    "ux_ui_designer": "Qwen/Qwen3-0.6B",
    "worker_backend_coder": "Qwen/Qwen3-0.6B",
    "worker_front_end_coder": "Qwen/Qwen3-0.6B",
    "worker_tester": "Qwen/Qwen3-0.6B",
    "code_analyst": "Qwen/Qwen3-0.6B",
}

# Process-wide cache: model_name -> {"model": ..., "tokenizer": ...}.
# Populated lazily by get_model_and_tokenizer(); not thread-safe.
_MODEL_CACHE = {}

# Explicit generation config (avoids model-specific overrides).
# Sampling is enabled, so outputs are non-deterministic by design.
GENERATION_CONFIG = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
)
def get_model_and_tokenizer(model_name):
    """
    Load a causal-LM model and its tokenizer from the Hugging Face Hub.

    Results are cached in ``_MODEL_CACHE`` keyed by ``model_name`` so each
    model is downloaded and instantiated at most once per process.

    Args:
        model_name: Hub repo id, e.g. ``"Qwen/Qwen3-0.6B"``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    if model_name not in _MODEL_CACHE:
        print(f"Loading model: {model_name}...")
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        # Ensure a dedicated pad token exists (not EOS): when PAD == EOS the
        # attention mask cannot distinguish padding from a real end token.
        # add_special_tokens returns the number of tokens actually added.
        num_added = 0
        if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
            num_added = tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

        # GPU/CPU awareness: 4-bit quantization is only applicable on CUDA.
        # With device_map=None, from_pretrained materializes the model on CPU
        # already, so no explicit .to("cpu") is needed afterwards.
        use_gpu = torch.cuda.is_available()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if use_gpu else None,
            quantization_config=QUANTIZATION_CONFIG if use_gpu else None,
            trust_remote_code=True,
        )

        # Resize embeddings only when tokens were actually added: an
        # unconditional resize needlessly reallocates the embedding matrix
        # and can fail on quantized models.
        if num_added > 0:
            model.resize_token_embeddings(len(tokenizer))

        # Inference-only loader: disable dropout and other train-mode layers.
        model.eval()

        _MODEL_CACHE[model_name] = {"model": model, "tokenizer": tokenizer}
    return _MODEL_CACHE[model_name]["model"], _MODEL_CACHE[model_name]["tokenizer"]
def generate_with_model(agent_role, prompt, generation_config=None):
    """
    Generate a response using the model registered for ``agent_role``.

    Args:
        agent_role: Key into ``MODEL_REGISTRY`` / ``ROLE_PROMPTS``; unknown
            roles fall back to the default Qwen model and an empty role prompt.
        prompt: The user's request text.
        generation_config: Optional ``GenerationConfig`` override. Defaults to
            the module-level ``GENERATION_CONFIG``.

    Returns:
        The decoded model response with the prompt echo stripped.
    """
    # Avoid a shared mutable default: a GenerationConfig instance is mutable,
    # so putting GENERATION_CONFIG directly in the signature would let one
    # caller's in-place mutation leak into every subsequent call.
    if generation_config is None:
        generation_config = GENERATION_CONFIG

    model_name = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
    model, tokenizer = get_model_and_tokenizer(model_name)

    full_prompt = f"You are a helpful assistant. {ROLE_PROMPTS.get(agent_role, '')}\n\nUser prompt: {prompt}"

    # tokenizer(...) yields both input_ids and attention_mask; the mask is
    # required so generate() ignores any padding positions.
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # ensures padding is ignored
            generation_config=generation_config,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Slice off the prompt tokens so the returned text contains only the newly
    # generated continuation (avoids prompt-echo issues).
    generated_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()