Upload folder using huggingface_hub

a0d95b0 verified about 2 months ago

25.7 kB

	# mypy: ignore-errors


	# A single sequence of representative cross-attention tokens is prepended at the beginning only;
	# each subsequent layer reuses the output of the previous layer.

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch import Tensor
	from typing import Callable, Optional, Union
	import functools
	from dataclasses import asdict

	from transformers.models.llama.modeling_llama import (
	LlamaMLP,
	LlamaAttention,
	LlamaDecoderLayer,
	LlamaModel,
	LlamaForCausalLM
	)

	from transformers import AutoConfig, PretrainedConfig
	from transformers.modeling_outputs import (
	BaseModelOutputWithPast,
	CausalLMOutputWithPast,
	)
	from transformers.models.llama.modeling_llama import LlamaConfig as HFLlamaConfig
	from transformers.processing_utils import Unpack
	from transformers.masking_utils import create_causal_mask
	from transformers.cache_utils import Cache, DynamicCache
	from transformers.utils.deprecation import deprecate_kwarg
	from transformers.utils.generic import check_model_inputs
	from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging

	from .Xslora import LoraXSLinear, HyperNetXSexp
	from .configIBA import MainConfig, HyperXSConfig, TrainingConfig, from_dict



	class IbaXs_LlamaAttention(LlamaAttention):
	    """Llama attention block whose q/k/v/o projections are ``LoraXSLinear`` layers.

	    After the parent constructor has run, the four projection attributes are
	    reassigned to ``LoraXSLinear`` modules configured from ``config.main_cfg``.
	    """

	    def __init__(self, config: HFLlamaConfig, layer_idx: int):
	        """Build the attention block.

	        Args:
	            config: Hugging Face Llama config; must carry a ``main_cfg``
	                attribute that ``from_dict(MainConfig, ...)`` can convert.
	            layer_idx: Index of the enclosing decoder layer (forwarded to the parent).
	        """
	        super().__init__(config, layer_idx)

	        # Rebuild the experiment settings as a dataclass object.
	        settings = from_dict(MainConfig, config.main_cfg)

	        # All four projections share the adapter rank, training config and bias flag.
	        make_proj = functools.partial(
	            LoraXSLinear,
	            train_cfg=settings.training,
	            rank=settings.hyperxs.lora_attn_dim,
	            bias=config.attention_bias,
	        )

	        model_dim = config.hidden_size
	        query_dim = config.num_attention_heads * self.head_dim
	        kv_dim = config.num_key_value_heads * self.head_dim

	        self.q_proj = make_proj(model_dim, query_dim)
	        self.k_proj = make_proj(model_dim, kv_dim)
	        self.v_proj = make_proj(model_dim, kv_dim)
	        self.o_proj = make_proj(query_dim, model_dim)


	class IbaXs_LlamaMLP(LlamaMLP):
	    """Llama feed-forward block whose gate/up/down projections are ``LoraXSLinear`` layers."""

	    def __init__(self, config: HFLlamaConfig):
	        """Build the MLP.

	        Args:
	            config: Hugging Face Llama config; must carry a ``main_cfg``
	                attribute that ``from_dict(MainConfig, ...)`` can convert.
	        """
	        super().__init__(config)

	        # Rebuild the experiment settings as a dataclass object.
	        settings = from_dict(MainConfig, config.main_cfg)

	        # The three projections differ only in their input/output widths.
	        make_proj = functools.partial(
	            LoraXSLinear,
	            train_cfg=settings.training,
	            rank=settings.hyperxs.lora_attn_dim,
	            bias=config.mlp_bias,
	        )

	        narrow, wide = self.hidden_size, self.intermediate_size
	        self.gate_proj = make_proj(narrow, wide)
	        self.up_proj = make_proj(narrow, wide)
	        self.down_proj = make_proj(wide, narrow)


	# block layer
	class IbaXs_LlamaDecoderLayer(LlamaDecoderLayer):
	    """Llama decoder layer with LoRA-XS projections and a hook into a hypernetwork.

	    The attention and MLP sub-blocks are replaced by their ``IbaXs_`` variants.
	    When ``forward`` runs with ``flag_hyper=True`` the first
	    ``n_cross_attn_tokens`` positions of the post-attention hidden states are
	    passed to ``hypernetxs`` and the result is cached on the layer; the caller
	    retrieves it with ``get_cache_loraxs`` and can install it into another
	    layer with ``set_loraxs_adapters``.
	    """

	    # Projection names, in the order their matrices are read from dim 1 of the adapter tensor.
	    _ADAPTED_PROJECTIONS = (
	        'q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj',
	    )

	    def __init__(self, config: HFLlamaConfig,
	                 layer_idx: int,
	                 hypernetxs: HyperNetXSexp = None,
	                 ):
	        """Build the layer.

	        Args:
	            config: Hugging Face Llama config carrying ``main_cfg``.
	            layer_idx: Index of this layer in the stack.
	            hypernetxs: Hypernetwork called in ``forward`` when ``flag_hyper``
	                is true. It is stored as an attribute; it must not be ``None``
	                if ``forward`` is ever called with ``flag_hyper=True``.
	        """
	        super().__init__(config, layer_idx)

	        self.hypernetxs = hypernetxs
	        self.hfconfig = config
	        # Rebuild the experiment settings as a dataclass object.
	        main_cfg = from_dict(MainConfig, config.main_cfg)
	        self.hyperxs_cfg = main_cfg.hyperxs
	        self.n_cross_attn_tokens = main_cfg.hyperxs.n_cross_attn_tokens

	        # Swap the stock sub-blocks for the LoRA-XS variants.
	        self.self_attn = IbaXs_LlamaAttention(config=config, layer_idx=layer_idx)
	        self.mlp = IbaXs_LlamaMLP(config)

	        # Registered as a buffer so that it follows the module across .to(device) calls.
	        self.register_buffer('layer_idx_hyperxs', torch.tensor(layer_idx, dtype=torch.long))

	        # Most recent hypernetwork output; filled in by forward().
	        self.__loraxsTensor = None
	        self.layer_idx = layer_idx

	    def get_cache_loraxs(self):
	        """Return the adapter tensor cached by the last hyper-enabled ``forward`` (or ``None``).

	        The cache is not cleared by this call.
	        """
	        return self.__loraxsTensor

	    def reset_parameters(self):
	        """No-op placeholder; this layer has no parameters of its own to initialise."""

	    def set_loraxs_adapters(self, loraXsTensor: Tensor):
	        """Install one adapter matrix per adapted projection of this layer.

	        Projection names are visited in ``_ADAPTED_PROJECTIONS`` order; every
	        sub-module whose qualified name ends with the projection name receives
	        ``loraXsTensor[:, idx]`` through ``set_R``, where ``idx`` counts the
	        modules updated so far.

	        Args:
	            loraXsTensor: Adapter tensor indexed as ``[:, module, :, :]``
	                (``(batch, modules, rank, rank)`` per the original author's note).

	        Raises:
	            ModuleNotFoundError: If ``loraXsTensor`` is ``None``.
	            NotImplementedError: If a sub-module with a matching name is not
	                a ``LoraXSLinear``.
	        """
	        if loraXsTensor is None:
	            # NOTE(review): exception type kept for backward compatibility;
	            # ValueError would describe the problem better.
	            raise ModuleNotFoundError(
	                "loraXsTensor is None: there are no LoRA-XS adapters to install"
	            )

	        # Walk the module tree once instead of once per projection name.
	        named_submodules = list(self.named_modules())

	        idx = 0
	        for key in self._ADAPTED_PROJECTIONS:
	            for name, module in named_submodules:
	                if not name.endswith(key):
	                    continue
	                if not isinstance(module, LoraXSLinear):
	                    raise NotImplementedError(
	                        f"module '{name}' matches '{key}' but is "
	                        f"{type(module).__name__}, not LoraXSLinear"
	                    )
	                module.set_R(loraXsTensor[:, idx, :, :].contiguous())
	                idx += 1

	    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        use_cache: Optional[bool] = False,
	        cache_position: Optional[torch.LongTensor] = None,
	        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
	        ### additional arg
	        flag_hyper: Optional[bool] = True,
	        **kwargs,  #: Unpack[TransformersKwargs],
	    ) -> torch.Tensor:
	        """Run self-attention and the MLP, optionally generating adapters in between.

	        Adapted from ``modeling_llama.py``. The only addition is the
	        ``flag_hyper`` branch between the attention and MLP residual blocks.

	        Args:
	            hidden_states: Input activations; indexed as ``[:, token, :]``.
	            attention_mask, position_ids, past_key_values, use_cache,
	            cache_position, position_embeddings: Forwarded unchanged to
	                ``self.self_attn``.
	            flag_hyper: When true, the first ``n_cross_attn_tokens`` token
	                positions of the post-attention hidden states are fed to
	                ``self.hypernetxs`` and the result is cached on the layer.
	            **kwargs: Forwarded to ``self.self_attn``.

	        Returns:
	            The hidden states after both residual blocks. The leading token
	            positions are *not* removed here.
	        """
	        # --- Self-attention residual block ---
	        residual = hidden_states
	        hidden_states = self.input_layernorm(hidden_states)
	        hidden_states, _ = self.self_attn(
	            hidden_states=hidden_states,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            use_cache=use_cache,
	            cache_position=cache_position,
	            position_embeddings=position_embeddings,
	            **kwargs,
	        )
	        hidden_states = residual + hidden_states

	        # --- Adapter generation from the representative tokens ---
	        if flag_hyper:
	            cross_attention = hidden_states[:, 0:self.n_cross_attn_tokens, :]
	            # The representative tokens stay in hidden_states and are passed on
	            # to the MLP; only a view of them goes to the hypernetwork.
	            # Result per the original author's note: (batch, n_modules, r, r).
	            self.__loraxsTensor = self.hypernetxs(cross_attention, self.layer_idx)

	        # --- Feed-forward residual block ---
	        residual = hidden_states
	        hidden_states = self.post_attention_layernorm(hidden_states)
	        hidden_states = self.mlp(hidden_states)
	        hidden_states = residual + hidden_states

	        return hidden_states


	# Backbone model
	class IbaXs_LlamaModel(LlamaModel):
	    """Llama backbone built from LoRA-XS decoder layers that share one hypernetwork.

	    On a prefill call (no cache, or an empty cache) a learned block of
	    ``n_cross_attn_tokens`` prefix embeddings is prepended to the input and a
	    custom attention bias is built. After each layer except the last, the
	    adapters cached by that layer are installed into the *next* layer before
	    it runs. The prefix positions are removed from the final hidden states.
	    On calls with a non-empty cache no prefix is added, no adapters are
	    regenerated and no attention bias is passed to the layers.
	    """

	    def __init__(self, config: HFLlamaConfig):
	        """Build the backbone.

	        Args:
	            config: Hugging Face Llama config carrying ``main_cfg``.
	        """
	        super().__init__(config)
	        # Rebuild the experiment settings as a dataclass object.
	        main_cfg = from_dict(MainConfig, config.main_cfg)
	        self.hyperxs_cfg = main_cfg.hyperxs
	        # One hypernetwork instance is handed to every decoder layer.
	        self.hypernetxs = HyperNetXSexp(main_cfg.hyperxs, config)
	        self.layers = nn.ModuleList(
	            [
	                IbaXs_LlamaDecoderLayer(config, layer_idx, self.hypernetxs)
	                for layer_idx in range(config.num_hidden_layers)
	            ]
	        )

	        self.flag_hyper = True
	        # Learned prefix embeddings, stored as (n_cross_attn_tokens, hidden_size).
	        self.hypernetxs_cross_attn_tokens = nn.Parameter(
	            torch.zeros(main_cfg.hyperxs.n_cross_attn_tokens, config.hidden_size)
	        )
	        self.main_cfg = main_cfg

	    def reset_parameters(self):
	        """Re-initialise the prefix embeddings from N(0, 0.01**2)."""
	        INIT_STD = 0.01
	        nn.init.normal_(self.hypernetxs_cross_attn_tokens, mean=0.0, std=INIT_STD)

	    def _create_prefix_or_mask(
	        self,
	        batch_idx: torch.Tensor,
	        head_idx: torch.Tensor,
	        q_idx: torch.Tensor,
	        kv_idx: torch.Tensor,
	    ) -> torch.Tensor:
	        """Mask function that UNLOCKS positions: prefix queries may see every key below the safe boundary.

	        Written in the ``(batch_idx, head_idx, q_idx, kv_idx)`` form of an
	        ``or_mask_function``; ``forward`` does not currently call it. Reads
	        ``self.safe_boundaries``, which ``forward`` sets on every call.

	        Returns:
	            Boolean tensor, true where the query is a prefix position and the
	            key index is below ``self.safe_boundaries``.
	        """
	        prefix_len = self.hypernetxs_cross_attn_tokens.shape[0]  # K (int)

	        is_query_prefix = q_idx < prefix_len
	        # safe_boundaries is (batch, 1, 1, 1) and broadcasts against kv_idx.
	        is_key_safe = kv_idx < self.safe_boundaries
	        return is_query_prefix & is_key_safe

	    def _create_prefix_and_mask(
	        self,
	        batch_idx: torch.Tensor,
	        head_idx: torch.Tensor,
	        q_idx: torch.Tensor,
	        kv_idx: torch.Tensor,
	    ) -> torch.Tensor:
	        """Mask function that LOCKS positions: text queries may not see prefix keys.

	        Written in the ``(batch_idx, head_idx, q_idx, kv_idx)`` form of an
	        ``and_mask_function``; ``forward`` does not currently call it.

	        Returns:
	            Boolean tensor, false exactly where the query index is at or
	            beyond the prefix and the key index is inside the prefix.
	        """
	        prefix_len = self.hypernetxs_cross_attn_tokens.shape[0]  # K (int)

	        # Forbidden zone: query is text (q_idx >= K) AND key is prefix (kv_idx < K).
	        is_forbidden = (q_idx >= prefix_len) & (kv_idx < prefix_len)

	        # `~` is the element-wise NOT for boolean tensors.
	        return ~is_forbidden

	    @staticmethod
	    def _build_prefill_mask(seq_len, prefix_len, safe_boundaries, attention_mask, dtype, device):
	        """Build the additive attention bias used on prefill calls.

	        Args:
	            seq_len: Number of positions including the prefix.
	            prefix_len: Number of prefix positions at the start.
	            safe_boundaries: ``(batch, 1, 1, 1)`` long tensor; prefix queries
	                may attend to keys with index below it.
	            attention_mask: Optional ``(batch, seq_len)`` padding mask (1 = keep), or ``None``.
	            dtype: Floating dtype of the returned bias.
	            device: Device of the returned bias.

	        Returns:
	            Contiguous tensor holding 0 at allowed positions and
	            ``torch.finfo(dtype).min`` at blocked ones.
	        """
	        min_dtype = torch.finfo(dtype).min

	        q_idx = torch.arange(seq_len, device=device).view(1, 1, seq_len, 1)
	        k_idx = torch.arange(seq_len, device=device).view(1, 1, 1, seq_len)

	        # Start from the ordinary causal pattern.
	        allowed = q_idx >= k_idx

	        # Rule 1 (unlock): prefix queries see every key below the safe boundary.
	        prefix_unlock = (q_idx < prefix_len) & (k_idx < safe_boundaries)
	        allowed = allowed | prefix_unlock

	        # Rule 2 (lock): text queries never see prefix keys.
	        text_forbidden_prefix = (q_idx >= prefix_len) & (k_idx < prefix_len)
	        allowed = allowed & (~text_forbidden_prefix)

	        # Boolean pattern -> additive float bias.
	        bias = torch.full_like(allowed, min_dtype, dtype=dtype)
	        bias = bias.masked_fill(allowed, 0.0)

	        if attention_mask is not None:
	            padding_bias = (1.0 - attention_mask.to(dtype)) * min_dtype
	            bias = bias + padding_bias[:, None, None, :]
	            # A position that is both non-causal and padded sums to
	            # min + min, which overflows to -inf; clamp it back to a finite value.
	            bias = bias.clamp(min=min_dtype)

	        return bias.contiguous()

	    # @check_model_inputs
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        labels: Optional[torch.LongTensor] = None,
	        **kwargs,  #: Unpack[TransformersKwargs],
	    ) -> BaseModelOutputWithPast:
	        """Run the backbone.

	        Args:
	            input_ids: Token ids; exactly one of ``input_ids`` / ``inputs_embeds`` must be given.
	            attention_mask: Optional ``(batch, seq)`` padding mask (1 = keep).
	                Only used on prefill calls.
	            position_ids: Ignored; positions are recomputed from the cache length.
	            past_key_values: Optional cache. A missing or empty cache selects the prefill path.
	            inputs_embeds: Pre-computed input embeddings.
	            cache_position: Ignored; recomputed from the cache length.
	            use_cache: When truthy and no cache is given, a ``DynamicCache`` is created.
	            labels: Optional ``(batch, seq)`` labels. On prefill, the first
	                position whose label is not -100 bounds what the prefix
	                tokens may attend to.
	            **kwargs: Forwarded to every decoder layer.

	        Returns:
	            ``BaseModelOutputWithPast`` with ``last_hidden_state`` (prefix
	            positions removed on prefill) and ``past_key_values``.

	        Raises:
	            ValueError: If both or neither of ``input_ids`` and ``inputs_embeds`` are given.
	        """
	        if (input_ids is None) ^ (inputs_embeds is not None):
	            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

	        if inputs_embeds is None:
	            inputs_embeds = self.embed_tokens(input_ids)

	        if use_cache and past_key_values is None:
	            past_key_values = DynamicCache(config=self.config)

	        # Prefill = no cache at all, or a cache that holds nothing yet.
	        is_prefill = (past_key_values is None) or \
	            (hasattr(past_key_values, 'get_seq_length') and past_key_values.get_seq_length() == 0)
	        n_prefix_tokens = self.main_cfg.hyperxs.n_cross_attn_tokens
	        prefix_len = n_prefix_tokens if n_prefix_tokens is not None and is_prefill else 0

	        # Positions always count from the cache length and, on prefill, cover prefix + text.
	        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
	        cache_position = torch.arange(
	            past_seen_tokens,
	            past_seen_tokens + inputs_embeds.shape[1] + prefix_len,
	            device=inputs_embeds.device,
	        )
	        position_ids = cache_position.unsqueeze(0).expand(inputs_embeds.shape[0], -1)

	        # --- Boundary up to which the prefix tokens may look ---
	        batch_size, seq_len_input = inputs_embeds.shape[:2]
	        # Default: the whole sequence.
	        safe_boundaries = torch.full(
	            (batch_size, 1, 1, 1),
	            seq_len_input + prefix_len,
	            device=inputs_embeds.device,
	            dtype=torch.long,
	        )
	        if labels is not None and is_prefill:
	            # First index per row where the label is not -100.
	            is_real_label = (labels != -100)
	            # argmax returns 0 for a row that is entirely -100 ...
	            prompt_lens = is_real_label.int().argmax(dim=1)
	            has_label = is_real_label.any(dim=1)
	            # ... so such rows are given the full sequence length instead.
	            prompt_lens = torch.where(
	                has_label,
	                prompt_lens,
	                torch.tensor(seq_len_input, device=inputs_embeds.device),
	            )
	            # (batch, 1, 1, 1) so that it broadcasts against key indices.
	            safe_boundaries = (prefix_len + prompt_lens).view(batch_size, 1, 1, 1)
	        self.safe_boundaries = safe_boundaries

	        hidden_states = inputs_embeds
	        causal_mask = None

	        if is_prefill:
	            self.flag_hyper = True

	            if self.hypernetxs_cross_attn_tokens is not None:
	                # Prepend the same learned prefix to every row of the batch.
	                prefix_embeds = self.hypernetxs_cross_attn_tokens.expand(int(batch_size), -1, -1)
	                hidden_states = torch.concat((prefix_embeds, hidden_states), dim=1)
	                # Prefix positions are never padding.
	                if attention_mask is not None:
	                    prefix_attention_mask = torch.ones(
	                        (batch_size, prefix_len),
	                        dtype=attention_mask.dtype,
	                        device=attention_mask.device,
	                    )
	                    attention_mask = torch.cat([prefix_attention_mask, attention_mask], dim=1)

	            # The bias is built by hand; an earlier attempt to pass the two
	            # mask functions above to create_causal_mask was noted by the
	            # original author as not working.
	            causal_mask = self._build_prefill_mask(
	                hidden_states.shape[1],
	                prefix_len,
	                safe_boundaries,
	                attention_mask,
	                inputs_embeds.dtype,
	                inputs_embeds.device,
	            )
	        else:
	            # Decoding: no prefix, no adapter regeneration, and deliberately
	            # no mask (the original author left it as None for flash attention).
	            # NOTE(review): with no mask the new tokens can presumably attend
	            # to the cached prefix keys (which text tokens cannot see during
	            # prefill) and to padded positions in a batch — confirm this is intended.
	            self.flag_hyper = False

	        position_embeddings = self.rotary_emb(hidden_states, position_ids)

	        last_layer_idx = self.config.num_hidden_layers - 1
	        for idx, decoder_layer in enumerate(self.layers[: self.config.num_hidden_layers]):
	            hidden_states = decoder_layer(
	                hidden_states,
	                attention_mask=causal_mask,
	                position_ids=position_ids,
	                past_key_values=past_key_values,
	                cache_position=cache_position,
	                position_embeddings=position_embeddings,
	                flag_hyper=self.flag_hyper,
	                **kwargs,
	            )
	            # Adapters produced by this layer configure the next one.
	            if idx < last_layer_idx and self.flag_hyper:
	                self.layers[idx + 1].set_loraxs_adapters(decoder_layer.get_cache_loraxs())

	        # Drop the representative prefix positions before the final norm.
	        if self.flag_hyper:
	            hidden_states = hidden_states[:, self.main_cfg.hyperxs.n_cross_attn_tokens:, :]

	        hidden_states = self.norm(hidden_states)
	        return BaseModelOutputWithPast(
	            last_hidden_state=hidden_states,
	            past_key_values=past_key_values,
	        )


	class IbaXs_LlamaForCausalLM(LlamaForCausalLM):
	    """Causal-LM head on top of ``IbaXs_LlamaModel``."""

	    def __init__(self, config: HFLlamaConfig,
	                 ):
	        super().__init__(config)
	        # Replace the backbone created by the parent constructor.
	        self.model = IbaXs_LlamaModel(config)

	    def reset_BA_xslora(self):
	        """Call ``decompose_weight_svd(rank)`` on every ``LoraXSLinear`` in the model."""
	        lora_layers = (m for m in self.modules() if isinstance(m, LoraXSLinear))
	        for layer in lora_layers:
	            layer.decompose_weight_svd(layer.rank)

	    # No docstring on forward: @auto_docstring builds it from the signature.
	    @can_return_tuple
	    @auto_docstring
	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        logits_to_keep: Union[int, torch.Tensor] = 0,
	        **kwargs: Unpack[TransformersKwargs],
	    ) -> CausalLMOutputWithPast:
	        # Labels are handed to the backbone as well as to the loss below.
	        backbone_out: BaseModelOutputWithPast = self.model(
	            input_ids=input_ids,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            use_cache=use_cache,
	            cache_position=cache_position,
	            labels=labels,
	            **kwargs,
	        )

	        # Project only the requested positions onto the vocabulary:
	        # an int keeps that many trailing positions, a tensor is used as an index.
	        if isinstance(logits_to_keep, int):
	            keep = slice(-logits_to_keep, None)
	        else:
	            keep = logits_to_keep
	        logits = self.lm_head(backbone_out.last_hidden_state[:, keep, :])

	        if labels is None:
	            loss = None
	        else:
	            loss = self.loss_function(
	                logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs
	            )

	        return CausalLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=backbone_out.past_key_values,
	            hidden_states=backbone_out.hidden_states,
	            attentions=backbone_out.attentions,
	        )

	def test_set_loraxs_adapters():
	    """Smoke test: install a synthetic adapter tensor into one decoder layer.

	    Builds a single ``IbaXs_LlamaDecoderLayer`` from the base model's config
	    and calls ``set_loraxs_adapters`` with a tensor whose slice for module
	    ``i`` is filled with the value ``i + 1``. Loads the base model's config
	    through ``AutoConfig.from_pretrained``.
	    """
	    main_cfg = MainConfig()
	    hf_model_cfg = AutoConfig.from_pretrained(
	        main_cfg.model.base_model_name
	    )
	    # The layer constructors read the experiment settings from
	    # `config.main_cfg` (as a dict); the layer itself takes no `main_cfg` argument.
	    hf_model_cfg.main_cfg = asdict(main_cfg)
	    layer = IbaXs_LlamaDecoderLayer(hf_model_cfg, layer_idx=1)
	    rank = main_cfg.hyperxs.lora_attn_dim

	    # The tests in this file disagree on the batch-size field name; accept either.
	    train_cfg = main_cfg.training
	    batch_size = getattr(train_cfg, "batch_train", None)
	    if batch_size is None:
	        batch_size = train_cfg.per_device_train_batch_size

	    modules = 7  # q, k, v, o, gate, up, down
	    values = torch.arange(1, modules + 1)
	    values_reshaped = values.view(modules, 1, 1)
	    # Broadcast to (batch, modules, rank, rank) without copying.
	    loraTensor = values_reshaped.expand(batch_size, modules, rank, rank)

	    layer.set_loraxs_adapters(loraTensor)

	def test_llm():
	    """Smoke test: one forward pass on a shrunken model, then sampled generation.

	    The model is constructed from a config (no pretrained weights are loaded
	    here) with reduced width and depth. Loads the base model's config and the
	    ``huggyllama/llama-7b`` tokenizer through ``from_pretrained``.
	    """
	    main_cfg = MainConfig()
	    config = AutoConfig.from_pretrained(
	        main_cfg.model.base_model_name
	    )
	    # Shrink the architecture so the model is cheap to build and run.
	    config.hidden_size = 128
	    config.intermediate_size = 256
	    config.num_hidden_layers = 6
	    config.head_dim = config.hidden_size // config.num_attention_heads

	    # The model classes read the experiment settings from `config.main_cfg` as a dict.
	    config.main_cfg = asdict(main_cfg)

	    model_bb = IbaXs_LlamaForCausalLM(config=config)
	    model_bb.reset_BA_xslora()
	    batch_size = main_cfg.training.per_device_train_batch_size
	    dummy_ids = torch.ones(batch_size, 11, dtype=torch.long)
	    total_params = sum(p.numel() for p in model_bb.parameters())
	    print('input llm', dummy_ids.shape, total_params)
	    # Plain forward pass, keeping only the last position's logits.
	    output = model_bb(dummy_ids, logits_to_keep=1)
	    print('output llm', output.logits.shape)

	    ### generate
	    # Use an accelerator when one is present, otherwise stay on CPU, and put
	    # the model on the same device as the tokenized inputs.
	    if torch.backends.mps.is_available():
	        device = 'mps'
	    elif torch.cuda.is_available():
	        device = 'cuda'
	    else:
	        device = 'cpu'
	    model_bb.to(device)

	    from transformers import LlamaTokenizer
	    tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=True)
	    model_bb.eval()
	    prompts = [
	        "The capital of France is",
	        "Here is a simple Python function to add two numbers:"
	    ]
	    for i, prompt in enumerate(prompts):
	        print(f"\n--- Prompt {i+1} ---")
	        print(f"Input: {prompt}")

	        # Tokenize the prompt into tensors on the model's device.
	        inputs = tokenizer(prompt, return_tensors="pt").to(device)

	        # Sample a continuation without tracking gradients.
	        with torch.no_grad():
	            outputs = model_bb.generate(
	                **inputs,
	                max_new_tokens=50,  # generate up to 50 new tokens
	                do_sample=True,
	                temperature=0.7,
	                top_k=50
	            )

	        # The returned sequence starts with the prompt; keep only what follows it.
	        output_tokens = outputs[0][inputs["input_ids"].shape[1]:]
	        generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)

	        print(f"Output: {generated_text}")



	def test_backbone():
	    """Smoke test: one forward pass through a shrunken ``IbaXs_LlamaModel``.

	    Loads the base model's config through ``AutoConfig.from_pretrained``.
	    """
	    main_cfg = MainConfig()
	    config = AutoConfig.from_pretrained(
	        main_cfg.model.base_model_name
	    )
	    # Shrink the architecture so the model is cheap to build and run.
	    config.hidden_size = 128
	    config.intermediate_size = 256
	    config.num_hidden_layers = 6
	    config.head_dim = config.hidden_size // config.num_attention_heads

	    # The model classes read the experiment settings from `config.main_cfg` as a dict.
	    config.main_cfg = asdict(main_cfg)

	    mode_bb = IbaXs_LlamaModel(config=config)

	    # The tests in this file disagree on the batch-size field name; accept either.
	    train_cfg = main_cfg.training
	    batch_size = getattr(train_cfg, "batch_train", None)
	    if batch_size is None:
	        batch_size = train_cfg.per_device_train_batch_size

	    dummy_ids = torch.ones(batch_size, 11, dtype=torch.long)
	    total_params = sum(p.numel() for p in mode_bb.parameters())
	    print('input bb', dummy_ids.shape, total_params)
	    output = mode_bb(dummy_ids)
	    print('output bb', output.last_hidden_state.shape)

	if __name__ == "__main__":
	    # Script entry point: print a banner, then run the end-to-end LLM smoke test.
	    # (test_backbone() is the backbone-only alternative; it is not run here.)
	    banner = "Hello world from XS_llama.py"
	    print(banner)
	    test_llm()