Update model.py

ce118fe verified 7 months ago

5.72 kB

	import os
	from typing import Optional
	from transformers import AutoModelForCausalLM, Qwen3ForCausalLM, AutoTokenizer, AutoConfig
	from huggingface_hub import hf_hub_download
	import torch
	import torch.nn as nn
	from warnings import warn


	# Define a custom model that wraps a causal LM and adds a regression head
	class CausalLMForRegression(nn.Module):
	    """Causal LM backbone with a scalar regression head on top.

	    The wrapped causal LM is run with ``output_hidden_states=True``; its last
	    hidden-state layer is mean-pooled over the sequence dimension (ignoring
	    positions masked out by ``attention_mask``) and projected to one scalar
	    per sequence by ``regression_head``.

	    Attributes:
	        model: The underlying causal LM.
	        regression_head: ``nn.Linear(hidden_size, 1)`` applied to the pooled
	            hidden state.
	        base_model: Name of the base checkpoint, used as a fallback source
	            for the tokenizer in ``get_tokenizer``.
	    """

	    config_class = Qwen3ForCausalLM.config_class
	    base_model_prefix = "model"

	    def __init__(self, base_model_name):
	        """Build a fresh scorer on top of ``base_model_name``.

	        Args:
	            base_model_name: Identifier or path handed to
	                ``AutoModelForCausalLM.from_pretrained``.
	        """
	        super().__init__()
	        # Load the causal LM with hidden states enabled
	        self.model = AutoModelForCausalLM.from_pretrained(
	            base_model_name,
	            output_hidden_states=True,
	        )
	        self.base_model = base_model_name
	        # Maps the pooled hidden state to a single scalar
	        self.regression_head = nn.Linear(self.model.config.hidden_size, 1)

	        print(f"Initializing difficulty scorer from scratch using {base_model_name} as a base!")
	        self._keys_to_ignore_on_save = []

	    def forward(
	        self,
	        input_ids: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        labels: Optional[torch.Tensor] = None,
	    ):
	        """Score each input sequence and optionally compute a Huber loss.

	        Args:
	            input_ids: Token ids. A 3-D tensor is flattened to 2-D by merging
	                all leading dimensions.
	            attention_mask: Optional mask; positions with value 0 are
	                excluded from the mean pooling. Flattened like ``input_ids``
	                when 3-D.
	            labels: Optional regression targets, one per sequence. They are
	                flattened so they line up with the flattened batch.

	        Returns:
	            A dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
	            ``"logits"`` (one scalar per sequence).
	        """
	        # Flatten extra dimensions if present
	        if input_ids.dim() == 3:
	            # e.g. from (accum_steps, batch_size, seq_length) to (accum_steps * batch_size, seq_length)
	            input_ids = input_ids.view(-1, input_ids.size(-1))
	        if attention_mask is not None and attention_mask.dim() == 3:
	            attention_mask = attention_mask.view(-1, attention_mask.size(-1))

	        outputs = self.model(input_ids, attention_mask=attention_mask)
	        # Last layer; expected shape (batch, seq_length, hidden_size)
	        hidden_states = outputs.hidden_states[-1]

	        # Mean-pooling over non-padding tokens
	        if attention_mask is not None:
	            mask = attention_mask.unsqueeze(-1).expand_as(hidden_states).to(hidden_states.dtype)
	            hidden_sum = torch.sum(hidden_states * mask, dim=1)
	            # A fully masked row has length 0; clamp so it pools to zeros
	            # instead of dividing by zero and producing NaN.
	            lengths = mask.sum(dim=1).clamp(min=1)
	            pooled = hidden_sum / lengths
	        else:
	            pooled = hidden_states.mean(dim=1)

	        # The head may have a different dtype than the backbone (e.g. a
	        # half-precision backbone with a freshly created float32 head).
	        # This is a no-op when the dtypes already agree.
	        pooled = pooled.to(self.regression_head.weight.dtype)
	        logits = self.regression_head(pooled).squeeze(-1)

	        loss = None
	        if labels is not None:
	            # Flatten the targets the same way the inputs were flattened so a
	            # (batch, 1) or (accum_steps, batch) tensor cannot silently
	            # broadcast against the (batch,) predictions, and compute the
	            # loss in at least float32 so integer or mixed-precision labels
	            # do not break it.
	            loss_dtype = torch.promote_types(logits.dtype, torch.float32)
	            targets = labels.reshape(-1).to(device=logits.device, dtype=loss_dtype)
	            loss_fn = nn.HuberLoss()
	            loss = loss_fn(logits.to(loss_dtype), targets)

	        return {"loss": loss, "logits": logits}

	    def get_input_embeddings(self):
	        """Return the input embedding module of the underlying causal LM."""
	        return self.model.get_input_embeddings()

	    def save_pretrained(self, output_dir, safe_serialization=False):
	        """Save the backbone and the regression head into ``output_dir``.

	        The backbone is written with its own ``save_pretrained``; the head's
	        state dict is written to ``regression_head.bin`` via ``torch.save``.

	        Args:
	            output_dir: Target directory; created if missing.
	            safe_serialization: Accepted for signature compatibility only.
	                NOTE: it is currently ignored; the backbone is always saved
	                with ``safe_serialization=False``.
	        """
	        os.makedirs(output_dir, exist_ok=True)

	        # Sanity check: report empty tensors before writing them out.
	        # ``numel()`` is used rather than ``shape[0]`` so that 0-dim (scalar)
	        # tensors do not raise an IndexError.
	        for key, value in self.model.state_dict().items():
	            if value.numel() == 0:
	                print(f"Warning: Tensor {key} has shape {value.shape}, which may be problematic.")

	        # Save model with proper weight tie handling
	        self.model.save_pretrained(output_dir, safe_serialization=False)
	        torch.save(
	            self.regression_head.state_dict(),
	            os.path.join(output_dir, "regression_head.bin"),
	        )

	    def get_tokenizer(self):
	        """Load a tokenizer for this model.

	        Tries the backbone's ``name_or_path`` first and falls back to
	        ``self.base_model`` if that fails.
	        """
	        try:
	            tokenizer = AutoTokenizer.from_pretrained(self.model.name_or_path)
	            print(f"Loaded tokenizer from {self.model.name_or_path}")
	        except Exception:
	            # Best-effort fallback; ``Exception`` (not a bare ``except``) so
	            # KeyboardInterrupt / SystemExit still propagate.
	            tokenizer = AutoTokenizer.from_pretrained(self.base_model)
	            print(f"Loaded tokenizer from {self.base_model}")
	        return tokenizer

	    @classmethod
	    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
	        """Load a saved scorer (backbone plus ``regression_head.bin``).

	        Args:
	            pretrained_model_name_or_path: Local directory or Hub repo id.
	            *model_args: Forwarded to ``Qwen3ForCausalLM.from_pretrained``.
	            **kwargs: Forwarded to ``AutoConfig.from_pretrained`` (when no
	                ``config`` is given) and to
	                ``Qwen3ForCausalLM.from_pretrained``. ``config`` is used as
	                the model config; ``trust_remote_code`` is dropped and
	                forced to ``False`` for the backbone.

	        Returns:
	            An instance whose head is loaded from ``regression_head.bin`` if
	            that file is available, and randomly initialised otherwise.
	        """
	        warn("The `from_pretrained` method is currently only implemented for models with Qwen3-base.")
	        cfg = kwargs.pop("config", None)
	        if cfg is None:
	            cfg = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
	        cfg.output_hidden_states = True

	        # The backbone is always loaded with trust_remote_code=False below,
	        # so discard any caller-supplied value to avoid a duplicate keyword.
	        kwargs.pop("trust_remote_code", None)

	        backbone = Qwen3ForCausalLM.from_pretrained(
	            pretrained_model_name_or_path,
	            *model_args,
	            config=cfg,
	            trust_remote_code=False,
	            **kwargs,
	        )

	        if os.path.isdir(pretrained_model_name_or_path):
	            head_path = os.path.join(pretrained_model_name_or_path, "regression_head.bin")
	        else:
	            # Local import: only needed on the Hub path.
	            from huggingface_hub.utils import EntryNotFoundError

	            try:
	                head_path = hf_hub_download(
	                    repo_id=pretrained_model_name_or_path,
	                    filename="regression_head.bin",
	                    repo_type="model",
	                )
	            except EntryNotFoundError:
	                # Repo exists but ships no head: fall through to the same
	                # random-initialisation path used for local directories.
	                head_path = None

	        # Build the instance without running __init__, which would load the
	        # backbone a second time.
	        inst = cls.__new__(cls)
	        nn.Module.__init__(inst)
	        inst.model = backbone
	        inst.regression_head = nn.Linear(cfg.hidden_size, 1)
	        inst._keys_to_ignore_on_save = []
	        # Hard-coded fallback tokenizer source used by get_tokenizer().
	        inst.base_model = "Qwen/Qwen3-8B"

	        if head_path is not None and os.path.exists(head_path):
	            # weights_only=True: the file may come from the Hub, so refuse to
	            # unpickle anything other than plain tensors/containers.
	            head_state = torch.load(head_path, map_location="cpu", weights_only=True)
	            inst.regression_head.load_state_dict(head_state)
	        else:
	            print("'regression_head.bin' not found – initialising randomly.")

	        return inst

	    @torch.no_grad()
	    def generate(self, *args, **kwargs):
	        """
	        Wrapper that forwards all arguments to the underlying causal‑LM so that GenerationMixin‑based helpers
	        (sampling, beam search, prepare_inputs_for_generation, etc.) keep working.
	        """
	        return self.model.generate(*args, **kwargs)

	    def prepare_inputs_for_generation(self, *args, **kwargs):
	        """
	        Same here: to be able to load the model with AutoModelForCausalLM, we have to forward this method
	        """
	        return self.model.prepare_inputs_for_generation(*args, **kwargs)