Crystalcareai/Quiet-Star-Custom

@@ -20,7 +20,7 @@
 """ PyTorch Quiet model."""
 import inspect
 import math
-# import pdb
 import warnings
 from collections import defaultdict
 from typing import List, Optional, Tuple, Union
@@ -32,8 +32,8 @@ from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.generation.utils import GenerationMixin
 from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
-from transformers import TextStreamer
-from transformers import AutoTokenizer
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
@@ -48,7 +48,7 @@ from transformers.utils import (
 	replace_return_docstrings,
 )
 from .configuration_quiet import QuietConfig
-# from .generate import generate
 import time
 from typing import Optional, List
@@ -354,26 +354,28 @@ class QuietAttention(nn.Module):
 				f" {attn_weights.size()}"
 			)
 		if self._attn_implementation == "flash_attention_2":
-			# 2d mask is passed through the layers
 			attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-		elif self._attn_implementation == "sdpa" and not output_attentions and (attention_mask is None or attention_mask.dim() == 2) and False:
-			# output_attentions=True can not be supported when using SDPA, and we fall back on
-			# the manual implementation that requires a 4D causal mask in all cases.
-			attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-				attention_mask,
-				(batch_size, seq_length),
-				inputs_embeds,
-				past_key_values_length,
-			)
-		elif attention_mask is None or attention_mask.dim() == 2:
-			# 4d mask is passed through the layers
-			attention_mask = _prepare_4d_causal_attention_mask(
-				attention_mask,
-				(batch_size, seq_length),
-				inputs_embeds,
-				past_key_values_length,
-				sliding_window=self.config.sliding_window,
-			)
 		if attention_mask is not None:
 			if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
@@ -772,7 +774,7 @@ class QuietSdpaAttention(QuietAttention):
 				raise ValueError(
 					f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
 				)
 		# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
 		# Reference: https://github.com/pytorch/pytorch/issues/112577.
 		if query_states.device.type == "cuda" and attention_mask is not None:
@@ -784,7 +786,7 @@ class QuietSdpaAttention(QuietAttention):
 			query_states,
 			key_states,
 			value_states,
-			attn_mask=attention_mask.to(torch.bool).to(query_states.device) if attention_mask is not None else None,
 			dropout_p=self.attention_dropout if self.training else 0.0,
 			# The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
 			is_causal=self.is_causal and attention_mask is None and q_len > 1,
@@ -1069,7 +1071,7 @@ class QuietModel(QuietPreTrainedModel):
 		if self._attn_implementation == "flash_attention_2":
 			# 2d mask is passed through the layers
 			attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-		elif self._attn_implementation == "sdpa" and not output_attentions and (attention_mask is None or (attention_mask.dim() == 2 and False)):
 			# output_attentions=True can not be supported when using SDPA, and we fall back on
 			# the manual implementation that requires a 4D causal mask in all cases.
 			attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
@@ -1078,16 +1080,15 @@ class QuietModel(QuietPreTrainedModel):
 				inputs_embeds,
 				past_key_values_length,
 			)
-		else:
 			# 4d mask is passed through the layers
-			if attention_mask is None or attention_mask.dim() == 2:
-				attention_mask = _prepare_4d_causal_attention_mask(
-					attention_mask,
-					(batch_size, seq_length),
-					inputs_embeds,
-					past_key_values_length,
-					sliding_window=self.config.sliding_window,
-				)
 		hidden_states = inputs_embeds
@@ -1309,7 +1310,6 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 		elif isinstance(module, nn.Embedding):
 			nn.init.xavier_uniform_(module.weight)
 	@torch.no_grad()
 	def infer(
 		self,
@@ -1342,6 +1342,9 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 		continuation_length = self.n_ahead - 2
 		new_key_values = past_key_values
 		start_time = time.time()
 		for continuation_idx in range(continuation_length):
 			outputs = self.model(
@@ -1367,7 +1370,7 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 			next_token_id = torch.argmax(next_token_logits, dim=-1)
 			# Append the generated token to the input sequence
-			input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
 			seq_len += 1
 			# Update the attention mask
@@ -1399,8 +1402,8 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 		# two new tokens: last continuation token and end thought token
 		outputs_after = self.model(
-			input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor(end_thought_token_id).unsqueeze(-1).unsqueeze(-1).to(input_ids.device)], dim=-1),
-			attention_mask=attention_mask,
 			position_ids=position_ids,
 			past_key_values=new_key_values,
 			inputs_embeds=inputs_embeds,
@@ -1421,218 +1424,10 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 		logits = self.lm_head(mixed_hidden_states)
 		return logits
-	# from transformers.generation.utils import (
-	# 	GenerationMixin,
-	# 	validate_stopping_criteria,
-	# 	StoppingCriteriaList,
-	# )
-	# logger = logging.get_logger(__name__)
-	# def custom_generate(
-	# 	self,
-	# 	input_ids,
-	# 	attention_mask=None,
-	# 	max_length=None,
-	# 	min_length=None,
-	# 	do_sample=None,
-	# 	early_stopping=None,
-	# 	num_beams=None,
-	# 	temperature=None,
-	# 	top_k=None,
-	# 	top_p=None,
-	# 	repetition_penalty=None,
-	# 	bad_words_ids=None,
-	# 	bos_token_id=None,
-	# 	pad_token_id=None,
-	# 	eos_token_id=None,
-	# 	streamer=None,
-	# 	length_penalty=None,
-	# 	no_repeat_ngram_size=None,
-	# 	num_return_sequences=None,
-	# 	decoder_start_token_id=None,
-	# 	use_cache=None,
-	# 	num_beam_groups=None,
-	# 	diversity_penalty=None,
-	# 	prefix_allowed_tokens_fn=None,
-	# 	output_attentions=None,
-	# 	output_hidden_states=None,
-	# 	output_scores=None,
-	# 	return_dict_in_generate=None,
-	# 	forced_bos_token_id=None,
-	# 	forced_eos_token_id=None,
-	# 	remove_invalid_values=None,
-	# 	synced_gpus=None,
-	# 	**kwargs,
-	# ):
-	# 	with torch.no_grad():
-	# 		finished_generating = torch.zeros(len(input_ids), dtype=torch.bool, device=input_ids.device)
-	# 		while not finished_generating.all() and input_ids.shape[1] < max_length:
-	# 			# Sample the next token
-	# 			new_ids = self(
-	# 				input_ids[~finished_generating],
-	# 				attention_mask=attention_mask[~finished_generating] if attention_mask is not None else None,
-	# 				**kwargs
-	# 			)['logits']
-	# 			# Mask out the start and end thought tokens so we don't accidentally sample them
-	# 			new_ids[:, :, self.tokenizer.vocab_size:] = -float("inf")
-	# 			for list_idx, answer_idx in enumerate((~finished_generating).nonzero(as_tuple=True)[0]):
-	# 				# Find the index of the last token that is not padding
-	# 				base_answer_ids = input_ids[answer_idx]
-	# 				new_answer_ids = new_ids[list_idx]
-	# 				last_token_idx = (base_answer_ids != self.tokenizer.pad_token_id).nonzero(as_tuple=True)[0].max()
-	# 				new_ids_sampled = torch.multinomial(
-	# 					torch.nn.functional.softmax(new_answer_ids[last_token_idx] / temperature, dim=-1), 1)
-	# 				# Assign the new id to the last token
-	# 				if last_token_idx + 1 >= len(base_answer_ids):
-	# 					# Add padding everywhere
-	# 					new_padding = torch.full((len(input_ids), 1), self.tokenizer.pad_token_id, dtype=torch.long,
-	# 											device=input_ids.device)
-	# 					input_ids = torch.cat([input_ids, new_padding], dim=-1)
-	# 					if attention_mask is not None:
-	# 						attention_mask = torch.cat([attention_mask, torch.zeros_like(new_padding)], dim=-1)
-	# 				if attention_mask is not None:
-	# 					attention_mask[answer_idx, last_token_idx + 1] = 1
-	# 				input_ids[answer_idx, last_token_idx + 1] = new_ids_sampled
-	# 				if new_ids_sampled == self.tokenizer.eos_token_id or new_ids_sampled == self.tokenizer.bos_token_id or new_ids_sampled == self.tokenizer.pad_token_id:
-	# 					finished_generating[answer_idx] = 1
-	# 				# Check if the end token is generated
-	# 				if new_ids_sampled == self.tokenizer.convert_tokens_to_ids("</s>"):
-	# 					finished_generating[answer_idx] = 1
-	# 			if streamer is not None:
-	# 				streamer.put(new_ids_sampled)
-	# 		generated_token_ids = input_ids.tolist()
-	# 		return generated_token_ids
-	# def use_generate(
-	# 	self,
-	# 	input_ids,
-	# 	attention_mask=None,
-	# 	max_length=None,
-	# 	min_length=None,
-	# 	do_sample=None,
-	# 	early_stopping=None,
-	# 	num_beams=None,
-	# 	temperature=None,
-	# 	streamer=None,
-	# 	top_k=None,
-	# 	top_p=None,
-	# 	repetition_penalty=None,
-	# 	bad_words_ids=None,
-	# 	bos_token_id=None,
-	# 	pad_token_id=None,
-	# 	eos_token_id=None,
-	# 	length_penalty=None,
-	# 	no_repeat_ngram_size=None,
-	# 	num_return_sequences=None,
-	# 	decoder_start_token_id=None,
-	# 	use_cache=None,
-	# 	num_beam_groups=None,
-	# 	diversity_penalty=None,
-	# 	prefix_allowed_tokens_fn=None,
-	# 	output_attentions=None,
-	# 	output_hidden_states=None,
-	# 	output_scores=None,
-	# 	return_dict_in_generate=None,
-	# 	forced_bos_token_id=None,
-	# 	forced_eos_token_id=None,
-	# 	remove_invalid_values=None,
-	# 	synced_gpus=None,
-	# 	n_ahead=8,
-	# 	n_ahead_talk=4,
-	# 	merged_talk_heads=True,
-	# 	merged_lm_and_talk_heads=False,
-	# 	merged_lm_and_think_heads=True,
-	# 	use_concat_talk_head=True,
-	# 	use_shallow_think=True,
-	# 	use_shallow_talk=False,
-	# 	use_complex_think_head=False,
-	# 	use_complex_talk_head=True,
-	# 	use_weighted_talk_head=True,
-	# 	trust_remote_code=True,
-	# 	torch_dtype=torch.bfloat16,
-	# 	**model_kwargs,
-	# ):
-	# 	# Set model attributes
-	# 	self.max_thoughts = n_ahead + n_ahead_talk + 1
-	# 	self.merged_talk_heads = merged_talk_heads
-	# 	self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
-	# 	self.merged_lm_and_think_heads = merged_lm_and_think_heads
-	# 	self.use_concat_talk_head = use_concat_talk_head
-	# 	self.use_shallow_think = use_shallow_think
-	# 	self.use_shallow_talk = use_shallow_talk
-	# 	self.use_complex_think_head = use_complex_think_head
-	# 	self.use_complex_talk_head = use_complex_talk_head
-	# 	self.use_weighted_talk_head = use_weighted_talk_head
-	# 	# Set model properties
-	# 	self.use_end_thought_token = True
-	# 	self.use_start_thought_token = True
-	# 	self.wandb_enabled = True
-	# 	self.n_ahead = n_ahead
-	# 	self.n_passes = 1
-	# 	self.eval_mode = True
-	# 	self.first_run = False
-	# 	self.kill_after = 100
-	# 	self.rm_initialized = True
-	# 	self.original_mode = False
-	# 	# Generate using the custom generate function
-	# 	generated_token_ids = custom_generate(
-	# 		self,
-	# 		input_ids=input_ids,
-	# 		attention_mask=attention_mask,
-	# 		max_length=max_length,
-	# 		min_length=min_length,
-	# 		do_sample=do_sample,
-	# 		early_stopping=early_stopping,
-	# 		num_beams=num_beams,
-	# 		temperature=temperature,
-	# 		top_k=top_k,
-	# 		top_p=top_p,
-	# 		repetition_penalty=repetition_penalty,
-	# 		bad_words_ids=bad_words_ids,
-	# 		bos_token_id=bos_token_id,
-	# 		pad_token_id=pad_token_id,
-	# 		eos_token_id=eos_token_id,
-	# 		length_penalty=length_penalty,
-	# 		no_repeat_ngram_size=no_repeat_ngram_size,
-	# 		num_return_sequences=num_return_sequences,
-	# 		decoder_start_token_id=decoder_start_token_id,
-	# 		use_cache=use_cache,
-	# 		num_beam_groups=num_beam_groups,
-	# 		diversity_penalty=diversity_penalty,
-	# 		prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
-	# 		output_attentions=output_attentions,
-	# 		output_hidden_states=output_hidden_states,
-	# 		output_scores=output_scores,
-	# 		return_dict_in_generate=return_dict_in_generate,
-	# 		forced_bos_token_id=forced_bos_token_id,
-	# 		forced_eos_token_id=forced_eos_token_id,
-	# 		remove_invalid_values=remove_invalid_values,
-	# 		synced_gpus=synced_gpus,
-	# 		streamer=streamer,
-	# 		**model_kwargs,
-	# 	)
-	# 	return generated_token_ids
-	# def generate(self, input_ids, attention_mask=None, max_length=None, temperature=1.0, **kwargs):
-	# 	from .generate import generate
-	# 	return generate(self, input_ids=input_ids, attention_mask=attention_mask, max_length=max_length, temperature=temperature, **kwargs)
 	@add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
 	@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
 	def forward(
@@ -1648,7 +1443,6 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 		output_attentions: Optional[bool] = None,
 		output_hidden_states: Optional[bool] = None,
 		return_dict: Optional[bool] = None,
-		streamer: Optional[TextStreamer] = None,
 	) -> Union[Tuple, CausalLMOutputWithPast]:
 		r"""
 		Args:
@@ -1822,17 +1616,15 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 		sample_probs_history = []
 		action_loglikelihoods_list = []
-		# complexity_scores = self.compute_complexity_scores(input_ids, attention_mask)
-		temperature = self.temperature #* complexity_scores.unsqueeze(-1)
 		if self.use_end_thought_token or self.use_start_thought_token:
 			if not self.use_reparam_for_thought_embeddings:
-				start_embedding = self.start_embedding[0].unsqueeze(0) * self.embedding_scale
-				end_embedding = self.end_embedding[0].unsqueeze(0) * self.embedding_scale
 			else:
-				start_embedding = self.start_embedding * self.embedding_scale
-				end_embedding = self.end_embedding * self.embedding_scale
 			base_embeddings = self.model.embed_tokens.weight
 			if self.train_only_thinking_embedding:
 				base_embeddings = base_embeddings.detach()
@@ -2328,6 +2120,7 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 		del start_embedding
 		del end_embedding
 		torch.cuda.empty_cache()
 		return CausalLMOutputWithPast(
 			loss=loss if loss is not None else None,
@@ -2336,6 +2129,8 @@ class QuietForCausalLM(QuietPreTrainedModel, GenerationMixin):
 			hidden_states=outputs.hidden_states,
 			attentions=outputs.attentions,
 		)
 	def prepare_inputs_for_generation(
 		self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs

 """ PyTorch Quiet model."""
 import inspect
 import math
+import pdb
 import warnings
 from collections import defaultdict
 from typing import List, Optional, Tuple, Union
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 from transformers.generation.utils import GenerationMixin
 from transformers.generation.stopping_criteria import StoppingCriteriaList, validate_stopping_criteria
+from transformers import TextStreamer, AutoTokenizer
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 	replace_return_docstrings,
 )
 from .configuration_quiet import QuietConfig
 import time
 from typing import Optional, List
 				f" {attn_weights.size()}"
 			)
 		if self._attn_implementation == "flash_attention_2":
+			# Prepare attention mask for flash-attn
 			attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+		elif self._attn_implementation == "sdpa":
+			# Prepare attention mask for SDPA
+			if attention_mask is None or attention_mask.dim() == 2:
+				attention_mask = _prepare_4d_causal_attention_mask(
+					attention_mask,
+					(batch_size, seq_length),
+					inputs_embeds,
+					past_key_values_length,
+					sliding_window=self.config.sliding_window,
+				)
+		else:
+			# Prepare attention mask for other implementations
+			if attention_mask is None or attention_mask.dim() == 2:
+				attention_mask = _prepare_4d_causal_attention_mask(
+					attention_mask,
+					(batch_size, seq_length),
+					inputs_embeds,
+					past_key_values_length,
+					sliding_window=self.config.sliding_window,
+				)
 		if attention_mask is not None:
 			if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
 				raise ValueError(
 					f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
 				)
+			attention_mask = attention_mask.to(query_states.dtype)
 		# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
 		# Reference: https://github.com/pytorch/pytorch/issues/112577.
 		if query_states.device.type == "cuda" and attention_mask is not None:
 			query_states,
 			key_states,
 			value_states,
+			attn_mask=attention_mask.to(query_states.device) if attention_mask is not None else None,
 			dropout_p=self.attention_dropout if self.training else 0.0,
 			# The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
 			is_causal=self.is_causal and attention_mask is None and q_len > 1,
 		if self._attn_implementation == "flash_attention_2":
 			# 2d mask is passed through the layers
 			attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+		elif self._attn_implementation == "sdpa" and not output_attentions and attention_mask.dim() == 2 and False:
 			# output_attentions=True can not be supported when using SDPA, and we fall back on
 			# the manual implementation that requires a 4D causal mask in all cases.
 			attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
 				inputs_embeds,
 				past_key_values_length,
 			)
+		elif attention_mask is None or attention_mask.dim() == 2:
 			# 4d mask is passed through the layers
+			attention_mask = _prepare_4d_causal_attention_mask(
+				attention_mask,
+				(batch_size, seq_length),
+				inputs_embeds,
+				past_key_values_length,
+				sliding_window=self.config.sliding_window,
+			)
 		hidden_states = inputs_embeds
 		elif isinstance(module, nn.Embedding):
 			nn.init.xavier_uniform_(module.weight)
 	@torch.no_grad()
 	def infer(
 		self,
 		continuation_length = self.n_ahead - 2
 		new_key_values = past_key_values
+		# Initialize next_token_id with a default value
+		next_token_id = torch.zeros(batch_size, dtype=torch.long).to(input_ids.device)
 		start_time = time.time()
 		for continuation_idx in range(continuation_length):
 			outputs = self.model(
 			next_token_id = torch.argmax(next_token_logits, dim=-1)
 			# Append the generated token to the input sequence
+			# input_ids = torch.cat([input_ids, next_token_id.unsqueeze(-1).to(input_ids.device)], dim=-1)
 			seq_len += 1
 			# Update the attention mask
 		# two new tokens: last continuation token and end thought token
 		outputs_after = self.model(
+			input_ids=torch.cat([next_token_id.unsqueeze(-1).to(input_ids.device), torch.tensor([[end_thought_token_id]] * batch_size).to(input_ids.device)], dim=-1),
+			attention_mask=torch.cat([attention_mask[:, -1:], torch.ones((batch_size, 1)).to(attention_mask.device)], dim=-1),
 			position_ids=position_ids,
 			past_key_values=new_key_values,
 			inputs_embeds=inputs_embeds,
 		logits = self.lm_head(mixed_hidden_states)
 		return logits
+	def generate(self, input_ids, attention_mask=None, max_length=None, temperature=1.0, **kwargs):  # Thin wrapper: delegates generation to the package-local `generate` function. NOTE(review): appears to shadow the `generate` inherited from GenerationMixin - confirm this is intended.
+		from .generate import generate  # Function-scope import: `.generate` is resolved when this method is called, not when this module is imported.
+		return generate(self, input_ids=input_ids, attention_mask=attention_mask, max_length=max_length, temperature=temperature, **kwargs)  # The model itself is passed first; every other argument is forwarded unchanged.
 	@add_start_docstrings_to_model_forward(QUIET_INPUTS_DOCSTRING)
 	@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
 	def forward(
 		output_attentions: Optional[bool] = None,
 		output_hidden_states: Optional[bool] = None,
 		return_dict: Optional[bool] = None,
 	) -> Union[Tuple, CausalLMOutputWithPast]:
 		r"""
 		Args:
 		sample_probs_history = []
 		action_loglikelihoods_list = []
+		temperature = self.temperature
 		if self.use_end_thought_token or self.use_start_thought_token:
 			if not self.use_reparam_for_thought_embeddings:
+				start_embedding = self.start_embedding[0].unsqueeze(0) * self.embedding_scale * temperature
+				end_embedding = self.end_embedding[0].unsqueeze(0) * self.embedding_scale * temperature
 			else:
+				start_embedding = self.start_embedding * self.embedding_scale * temperature
+				end_embedding = self.end_embedding * self.embedding_scale * temperature
 			base_embeddings = self.model.embed_tokens.weight
 			if self.train_only_thinking_embedding:
 				base_embeddings = base_embeddings.detach()
 		del start_embedding
 		del end_embedding
 		torch.cuda.empty_cache()
 		return CausalLMOutputWithPast(
 			loss=loss if loss is not None else None,
 			hidden_states=outputs.hidden_states,
 			attentions=outputs.attentions,
 		)
 	def prepare_inputs_for_generation(
 		self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs