|
|
|
|
|
import torch |
|
|
from transformers import ( |
|
|
AutoModelForCausalLM, |
|
|
AutoTokenizer, |
|
|
BitsAndBytesConfig, |
|
|
GenerationConfig, |
|
|
) |
|
|
from backend.agents import ROLE_PROMPTS |
|
|
|
|
|
|
|
|
# Shared 4-bit quantization settings, applied only when a CUDA GPU is
# available (see get_model_and_tokenizer). NF4 + bfloat16 compute is the
# standard QLoRA-style configuration for memory-efficient inference.
QUANTIZATION_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,  # store weights in 4 bits (~4x memory reduction)
    bnb_4bit_quant_type="nf4",  # NormalFloat4 quantization scheme
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for matmuls at runtime
)
|
|
|
|
|
MODEL_REGISTRY = { |
|
|
"ceo": "Qwen/Qwen3-0.6B", |
|
|
"planner": "Qwen/Qwen3-0.6B", |
|
|
"manager": "Qwen/Qwen3-0.6B", |
|
|
"debugger": "Qwen/Qwen3-0.6B", |
|
|
"business_analyst": "Qwen/Qwen3-0.6B", |
|
|
"ux_ui_designer": "Qwen/Qwen3-0.6B", |
|
|
"worker_backend_coder": "Qwen/Qwen3-0.6B", |
|
|
"worker_front_end_coder": "Qwen/Qwen3-0.6B", |
|
|
"worker_tester": "Qwen/Qwen3-0.6B", |
|
|
"code_analyst": "Qwen/Qwen3-0.6B", |
|
|
} |
|
|
_MODEL_CACHE = {} |
|
|
|
|
|
|
|
|
# Default sampling configuration used by generate_with_model when the caller
# does not supply one. Moderate temperature plus nucleus/top-k filtering and
# a mild repetition penalty.
GENERATION_CONFIG = GenerationConfig(
    max_new_tokens=512,  # cap on generated (not prompt) tokens
    do_sample=True,  # sample instead of greedy decoding
    temperature=0.7,
    top_p=0.9,  # nucleus sampling threshold
    top_k=50,
    repetition_penalty=1.1,  # gently discourage loops
)
|
|
|
|
|
|
|
|
def get_model_and_tokenizer(model_name):
    """
    Load a causal LM and its tokenizer from the Hugging Face Hub.

    Results are memoized in the module-level ``_MODEL_CACHE`` so repeated
    calls with the same ``model_name`` reuse the already-loaded objects.

    Args:
        model_name: Hub repository id, e.g. "Qwen/Qwen3-0.6B".

    Returns:
        Tuple of ``(model, tokenizer)`` for the requested model.
    """
    if model_name not in _MODEL_CACHE:
        print(f"Loading model: {model_name}...")

        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

        # Ensure a dedicated pad token distinct from EOS so padding can be
        # masked correctly during generation. add_special_tokens returns the
        # number of tokens actually added to the vocabulary.
        num_added = 0
        if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
            num_added = tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

        use_gpu = torch.cuda.is_available()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto" if use_gpu else None,  # shard across GPUs when present
            quantization_config=QUANTIZATION_CONFIG if use_gpu else None,  # 4-bit only on GPU
            trust_remote_code=True,
        )
        # Without device_map the model is already materialized on CPU, so no
        # explicit .to("cpu") is needed.

        # BUGFIX: only resize embeddings when tokens were actually added.
        # An unconditional resize_token_embeddings(len(tokenizer)) SHRINKS
        # the embedding matrix for checkpoints whose embedding size exceeds
        # the tokenizer's vocab (e.g. padded vocabularies), corrupting the
        # model, and it can fail outright on 4-bit quantized weights.
        if num_added > 0:
            model.resize_token_embeddings(len(tokenizer))

        _MODEL_CACHE[model_name] = {"model": model, "tokenizer": tokenizer}

    return _MODEL_CACHE[model_name]["model"], _MODEL_CACHE[model_name]["tokenizer"]
|
|
|
|
|
|
|
|
def generate_with_model(agent_role, prompt, generation_config: GenerationConfig = GENERATION_CONFIG):
    """
    Generate a text response for the given agent role.

    Looks up the role's model in ``MODEL_REGISTRY`` (falling back to the
    default Qwen checkpoint), prefixes the user prompt with the role's
    system prompt from ``ROLE_PROMPTS``, and returns only the newly
    generated continuation, decoded and stripped.

    Args:
        agent_role: Key into MODEL_REGISTRY / ROLE_PROMPTS.
        prompt: The user's request text.
        generation_config: Sampling parameters; defaults to GENERATION_CONFIG.

    Returns:
        The decoded model output (prompt excluded) as a stripped string.
    """
    checkpoint = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
    model, tokenizer = get_model_and_tokenizer(checkpoint)

    full_prompt = f"You are a helpful assistant. {ROLE_PROMPTS.get(agent_role, '')}\n\nUser prompt: {prompt}"

    encoded = tokenizer(full_prompt, return_tensors="pt", padding=True, truncation=True)
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Inference only — disable autograd bookkeeping.
    with torch.no_grad():
        generated = model.generate(
            input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            pad_token_id=tokenizer.pad_token_id,
        )

    # generate() returns prompt + continuation; slice off the prompt tokens
    # so only the new text is decoded.
    prompt_length = input_ids.shape[-1]
    continuation = generated[0][prompt_length:]
    return tokenizer.decode(continuation, skip_special_tokens=True).strip()
|
|
|