# Tabular-LM-v0.2-multitask_v2 / TabularModel.py
from transformers import (
    AutoProcessor,
    ProcessorMixin,
    Qwen2TokenizerFast,
    BaseImageProcessor,
    Qwen2_5_VLForConditionalGeneration,
)
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
    Qwen2_5_VLCausalLMOutputWithPast,
)
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.processing_utils import Unpack
from transformers.feature_extraction_sequence_utils import BatchFeature
from typing import List, Optional, TypedDict
try:
    # TabPFN is only needed by the optional TabularProcessor below.
    from tabpfn_extensions import TabPFNRegressor
except ImportError:
    TabPFNRegressor = None
import numpy as np
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
class TabularProcessorKwargs(TypedDict):
    """Keyword arguments for tabular processing."""
class TabularPreprocessor(BaseImageProcessor):
    def __call__(self, X: list | np.ndarray | torch.Tensor) -> BatchFeature:
        """Stack one or more tables into a single float32 `tabular_values` batch; all tables must share the same shape."""
if not isinstance(X, list):
X = [X]
res = []
for X_sample in X:
if isinstance(X_sample, torch.Tensor):
X_sample = X_sample.cpu().numpy()
res.append(X_sample)
res = np.array(res)
return BatchFeature(data={"tabular_values": torch.from_numpy(res).to(torch.float32)})
AutoProcessor.register("TabularPreprocessor", TabularPreprocessor)
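# Usage sketch (illustrative, not executed on import): the preprocessor stacks
# raw tables into a single float32 batch keyed by "tabular_values":
#   pre = TabularPreprocessor()
#   batch = pre(np.random.randn(4, 6))
#   batch["tabular_values"].shape  # torch.Size([1, 4, 6]), a batch of one table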
class TabularProcessor(nn.Module):
    def __init__(self, **kwargs: Unpack[TabularProcessorKwargs]):
        super().__init__()
        if TabPFNRegressor is None:
            raise ImportError("TabularProcessor requires the `tabpfn_extensions` package.")
        self.tabpfn = TabPFNRegressor(
            n_estimators=1,
            model_path="./tabpfn-v2-regressor.ckpt",
            device="cuda:1",
        )
    def __call__(self, X: np.ndarray | torch.Tensor) -> torch.Tensor:
        if len(X.shape) == 2:
            X = [X]
        res = []
        for X_sample in X:
            if isinstance(X_sample, torch.Tensor):
                X_sample = X_sample.cpu().numpy()
            # Drop the leading singleton dimension of each sample.
            X_sample = X_sample[0]
            # Fit on random targets: the labels are irrelevant here, we only
            # want TabPFN's row embeddings for this table.
            self.tabpfn.fit(X_sample, np.random.random(X_sample.shape[0]))
            embs = self.tabpfn.get_embeddings(X_sample)
            embs_t = torch.from_numpy(embs).to(self.tabpfn.device)
            # Mean-pool the row embeddings into a single 192-dim table vector.
            embs_t = embs_t.mean(dim=0)
            res.append(embs_t)
        res = torch.stack(res)
        return res.view(-1, 192)
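# Usage sketch (illustrative; requires `tabpfn_extensions` plus the local
# checkpoint referenced in __init__, and assumes `get_embeddings` yields one
# 192-dim vector per row). Note the `X_sample[0]` indexing above: each sample
# is expected to carry an extra leading singleton dimension.
#   proc = TabularProcessor()
#   embs = proc(np.random.randn(2, 1, 32, 6))  # two tables of 32 rows x 6 cols
#   embs.shape                                 # torch.Size([2, 192]), one vector per table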
class TabularBlock(nn.Module):
    """Residual two-layer MLP block: ``x + linear2(GELU(linear1(x)))``; shape-preserving."""
def __init__(self, input_dim: int, hidden_dim: int = 192):
super().__init__()
self.linear1 = nn.Linear(input_dim, hidden_dim)
self.activation = nn.GELU()
self.linear2 = nn.Linear(hidden_dim, input_dim)
def forward(self, x: torch.Tensor) -> torch.Tensor:
residual = x
x = self.linear1(x)
x = self.activation(x)
x = self.linear2(x)
return x + residual
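# Shape check (illustrative): the residual connection keeps the block
# shape-preserving regardless of hidden_dim:
#   TabularBlock(192, 64)(torch.zeros(2, 192)).shape  # torch.Size([2, 192])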
class TabularLearnableProcessor(nn.Module):
def __init__(self, num_features: int = 1):
super().__init__()
# Each cell is processed individually as a scalar
self.input_proj = nn.Linear(num_features, 192)
        # Seven residual MLP blocks, each preceded by a GELU.
        layers = []
        for _ in range(7):
            layers.extend([nn.GELU(), TabularBlock(192, 64)])
        self.nodes = nn.Sequential(*layers)
def forward(self, X: np.ndarray | torch.Tensor) -> torch.Tensor:
if isinstance(X, np.ndarray):
X = torch.from_numpy(X)
param_dtype = self.input_proj.weight.dtype
X = X.to(param_dtype)
        # Flatten the table - each cell becomes a separate token:
        # X shape: (batch_size, rows, cols) -> (batch_size * rows * cols, 1)
        X_flat = X.reshape(-1, 1)
        # RMS normalization per cell for stability (currently disabled):
        # X_normalized = X_flat * torch.rsqrt(X_flat.pow(2) + 1e-5)
        projected = self.input_proj(X_flat)
        # NOTE: the residual stack in `self.nodes` is currently bypassed; only
        # the linear projection is returned.
        # res = self.nodes(projected)
        return projected
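# Shape walk-through (illustrative): every scalar cell of a (B, R, C) batch
# becomes one 192-dim token embedding:
#   proc = TabularLearnableProcessor()
#   proc(torch.randn(2, 4, 6)).shape  # torch.Size([48, 192]), i.e. 2*4*6 cell tokens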
class Qwen_2_5_TabularProcessor(ProcessorMixin):
r"""
Constructs a Qwen2.5-VL processor which wraps a Qwen2.5-VL image processor and a Qwen2 tokenizer into a single processor.
[`Qwen2_5_VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
[`~Qwen2_5_VLProcessor.__call__`] and [`~Qwen2_5_VLProcessor.decode`] for more information.
Args:
image_processor ([`Qwen2VLImageProcessor`], *optional*):
The image processor is a required input.
tokenizer ([`Qwen2TokenizerFast`], *optional*):
The tokenizer is a required input.
chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
in a chat into a tokenizable string.
"""
attributes = ["tokenizer"]
valid_kwargs = ["chat_template"]
tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
def __init__(
self,
tabular_processor: TabularPreprocessor | None = None,
tokenizer=None,
chat_template=None,
**kwargs,
):
self.tabular_token = (
"<|tabular_pad|>"
if not hasattr(tokenizer, "tabular_token")
else tokenizer.tabular_token
)
self.tabular_processor = tabular_processor
super().__init__(tokenizer, chat_template=chat_template)
def __call__(
self,
tabular_values: np.ndarray | torch.Tensor | None = None,
text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
**kwargs: Unpack[TabularProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchFeature`]: A [`BatchFeature`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
- **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
- **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
- **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
"""
# print("Tabular values: ", tabular_values)
if tabular_values is not None:
tabular_inputs = self.tabular_processor(tabular_values)
else:
print("Warning! No tabular values provided!")
tabular_inputs = {}
if not isinstance(text, list):
text = [text]
if tabular_values is not None:
index = 0
for i in range(len(text)):
while self.tabular_token in text[i]:
# Each cell becomes a token: num_tokens = rows * cols
table_shape = tabular_inputs["tabular_values"][index].shape
rows, cols = table_shape[0], table_shape[1]
# Build pattern: for each row, add col tokens + row separator
row_pattern = "<|placeholder|>" * cols + "<|tabular_row|>"
replacement = row_pattern * rows
text[i] = text[i].replace(
self.tabular_token,
replacement,
1,
)
index += 1
text[i] = text[i].replace("<|placeholder|>", self.tabular_token)
text_inputs = self.tokenizer(text, **kwargs)
return BatchFeature(data={**text_inputs, **tabular_inputs})
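    # Expansion example (illustrative): with a single 2x3 table, one occurrence
    # of "<|tabular_pad|>" in the prompt is rewritten into
    #   ("<|tabular_pad|>" * 3 + "<|tabular_row|>") * 2
    # i.e. rows * cols = 6 pad tokens, exactly one per cell, matching the number
    # of per-cell embeddings produced by TabularLearnableProcessor in the model.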
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
def post_process_image_text_to_text(
self,
generated_outputs,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
**kwargs,
):
"""
Post-process the output of the model to decode the text.
Args:
generated_outputs (`torch.Tensor` or `np.ndarray`):
The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
or `(sequence_length,)`.
skip_special_tokens (`bool`, *optional*, defaults to `True`):
Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.
Returns:
`List[str]`: The decoded text.
"""
return self.tokenizer.batch_decode(
generated_outputs,
skip_special_tokens=skip_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs,
)
@property
def model_input_names(self):
tokenizer_input_names = self.tokenizer.model_input_names
tabular_processor_input_names = self.tabular_processor.model_input_names if hasattr(self.tabular_processor, 'model_input_names') else []
names_from_processor = list(
dict.fromkeys(tokenizer_input_names + tabular_processor_input_names)
)
return names_from_processor + ["tabular_values"]
class Qwen2_5_TabularModel(Qwen2_5_VLForConditionalGeneration):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.tabular_processor = TabularLearnableProcessor(num_features=1)
self.tabular_projection = nn.Sequential(
nn.Linear(192, self.config.hidden_size),
nn.ReLU(),
TabularBlock(self.config.hidden_size, self.config.hidden_size),
nn.ReLU(),
TabularBlock(self.config.hidden_size, self.config.hidden_size),
nn.ReLU(),
TabularBlock(self.config.hidden_size, self.config.hidden_size),
)
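        # TabularLearnableProcessor emits one 192-dim embedding per table cell;
        # tabular_projection lifts those into the LM's hidden_size so they can
        # replace the <|tabular_pad|> token embeddings in forward() below.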
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
pixel_values: Optional[torch.Tensor] = None,
pixel_values_videos: Optional[torch.FloatTensor] = None,
tabular_values: Optional[torch.Tensor] = None,
image_grid_thw: Optional[torch.LongTensor] = None,
video_grid_thw: Optional[torch.LongTensor] = None,
rope_deltas: Optional[torch.LongTensor] = None,
cache_position: Optional[torch.LongTensor] = None,
second_per_grid_ts: Optional[torch.Tensor] = None,
):
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
>>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
>>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
>>> messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What is shown in this image?"},
],
},
]
>>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image])
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
```"""
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
return_dict = (
return_dict if return_dict is not None else self.config.use_return_dict
)
if inputs_embeds is None:
inputs_embeds = self.language_model.embed_tokens(input_ids)
if pixel_values is not None:
pixel_values = pixel_values.type(self.visual.dtype)
image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
n_image_features = image_embeds.shape[0]
if n_image_tokens != n_image_features:
raise ValueError(
f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
)
mask = input_ids == self.config.image_token_id
mask_unsqueezed = mask.unsqueeze(-1)
mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
image_mask = mask_expanded.to(inputs_embeds.device)
image_embeds = image_embeds.to(
inputs_embeds.device, inputs_embeds.dtype
)
inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
if pixel_values_videos is not None:
pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
n_video_features = video_embeds.shape[0]
if n_video_tokens != n_video_features:
raise ValueError(
f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
)
mask = input_ids == self.config.video_token_id
mask_unsqueezed = mask.unsqueeze(-1)
mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
video_mask = mask_expanded.to(inputs_embeds.device)
video_embeds = video_embeds.to(
inputs_embeds.device, inputs_embeds.dtype
)
inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
if tabular_values is not None:
proc_feats = self.tabular_processor(tabular_values.to(self.device, torch.float32))
proc_feats = proc_feats.to(inputs_embeds.dtype).to(self.device)
tabular_embeds = self.tabular_projection(proc_feats)
tabular_token_id = getattr(self.config, "tabular_token_id", None)
if tabular_token_id is None:
raise ValueError("Tabular token id (config.tabular_token_id) is not set.")
            mask = input_ids == int(tabular_token_id)
            n_tabular_tokens = mask.sum().item()
            if n_tabular_tokens != tabular_embeds.shape[0]:
                raise ValueError(
                    f"Tabular features and tabular tokens do not match: tokens: {n_tabular_tokens}, features {tabular_embeds.shape[0]}"
                )
mask_unsqueezed = mask.unsqueeze(-1)
mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
tabular_mask = mask_expanded.to(inputs_embeds.device)
tabular_embeds = tabular_embeds.to(
inputs_embeds.device, inputs_embeds.dtype
)
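            # masked_scatter fills the True positions of the mask, in order, with
            # consecutive rows of tabular_embeds. Minimal sketch of the mechanism:
            #   emb = torch.zeros(1, 4, 3)
            #   mask = torch.tensor([[False, True, True, False]])
            #   emb.masked_scatter(mask.unsqueeze(-1).expand_as(emb), torch.ones(2, 3))
            #   # sequence positions 1 and 2 now hold the injected rows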
inputs_embeds = inputs_embeds.masked_scatter(
tabular_mask, tabular_embeds
)
if attention_mask is not None:
attention_mask = attention_mask.to(inputs_embeds.device)
# if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
if position_ids is None and (
attention_mask is None or attention_mask.ndim == 2
):
# calculate RoPE index once per generation in the pre-fill stage only
if (
(cache_position is not None and cache_position[0] == 0)
or self.rope_deltas is None
or (past_key_values is None or past_key_values.get_seq_length() == 0)
):
position_ids, rope_deltas = self.model.get_rope_index(
input_ids,
image_grid_thw,
video_grid_thw,
second_per_grid_ts,
attention_mask,
)
self.rope_deltas = rope_deltas
# then use the prev pre-calculated rope-deltas to get the correct position ids
else:
batch_size, seq_length, _ = inputs_embeds.shape
delta = (
(cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
if cache_position is not None
else 0
)
position_ids = torch.arange(seq_length, device=inputs_embeds.device)
position_ids = position_ids.view(1, -1).expand(batch_size, -1)
if cache_position is not None: # otherwise `deltas` is an int `0`
delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
position_ids = position_ids.add(delta)
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
outputs = self.model(
input_ids=None,
position_ids=position_ids,
attention_mask=attention_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states)
loss = None
if labels is not None:
# Upcast to float if we need to compute the loss to avoid potential precision issues
logits = logits.float()
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return Qwen2_5_VLCausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
rope_deltas=self.rope_deltas,
)
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
inputs_embeds=None,
cache_position=None,
position_ids=None,
use_cache=True,
pixel_values=None,
pixel_values_videos=None,
image_grid_thw=None,
video_grid_thw=None,
second_per_grid_ts=None,
**kwargs,
):
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
model_inputs = super().prepare_inputs_for_generation(
input_ids,
past_key_values=past_key_values,
attention_mask=attention_mask,
inputs_embeds=inputs_embeds,
cache_position=cache_position,
position_ids=position_ids,
pixel_values=pixel_values,
pixel_values_videos=pixel_values_videos,
image_grid_thw=image_grid_thw,
video_grid_thw=video_grid_thw,
second_per_grid_ts=second_per_grid_ts,
use_cache=use_cache,
**kwargs,
)
        # Qwen2-5-VL position_ids are prepared with rope_deltas in forward
model_inputs["position_ids"] = None
if cache_position[0] != 0:
model_inputs["pixel_values"] = None
model_inputs["pixel_values_videos"] = None
model_inputs["tabular_values"] = None
return model_inputs
if __name__ == "__main__":
template = """"{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set tabular_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'tabular' or 'tabular' in content %}{% set tabular_count.value = tabular_count.value + 1 %}{% if add_vision_id %}Table {{ tabular_count.value }}: {% endif %}<|vision_start|><|tabular_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"""
MODE = "reconstruction_variable"
model_name_trained = f"./models/Tabular-LM-v0.1-{MODE}"
# model_name_trained = "Qwen/Qwen2.5-VL-3B-Instruct"
# model_name_trained = "./models/checkpoints/checkpoint-1000"
tabular_processor = TabularPreprocessor()
qwen_tabular_processor = Qwen_2_5_TabularProcessor(
tabular_processor=tabular_processor,
tokenizer=Qwen2TokenizerFast.from_pretrained(model_name_trained),
)
qwen_tabular_processor.tabular_token = "<|tabular_pad|>"
qwen_tabular_processor.tokenizer.add_tokens([qwen_tabular_processor.tabular_token, "<|tabular_row|>"])
qwen_tabular_processor.tokenizer.chat_template = template
tabular_data = np.random.randn(4,6).round(2)
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "This is a table."},
{"index": 0, "type": "tabular"},
{"type": "text", "text": "Give me its content in csv format."},
# {"type": "text", "text": "Give me a statistical summary."},
# {"type": "text", "text": "Give me the correlation matrix in csv format"},
# {"type": "text", "text": "Give me the content of the table"},
],
}
]
preprocessed = qwen_tabular_processor.tokenizer.apply_chat_template(
messages, tokenize=False
)
processed = qwen_tabular_processor(
[tabular_data], text=preprocessed, return_tensors="pt"
)
model = Qwen2_5_TabularModel.from_pretrained(model_name_trained).to("cuda:1")
model.config.tabular_token_id = (
qwen_tabular_processor.tokenizer.convert_tokens_to_ids("<|tabular_pad|>")
)
model.config.tabular_row_token_id = (
qwen_tabular_processor.tokenizer.convert_tokens_to_ids("<|tabular_row|>")
)
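    # forward() reads config.tabular_token_id when scattering the table embeddings,
    # so it must be set (as above) before calling generate with tabular_values.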
processed = {key: value.to("cuda:1") for key, value in processed.items()}
res = model.generate(**processed, max_new_tokens=512, do_sample=False)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(processed["input_ids"], res, strict=True)]
output_text = qwen_tabular_processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
print("="*80)
print("Original table:")
print(tabular_data)
print("\nModel output:")
print(output_text[0])
print("="*80)
if MODE in ["reconstruction", "reconstruction_variable"]:
# Try to evaluate reconstruction quality
from utils import text_to_array
generated_array = text_to_array(output_text[0])
# Round original to match expected precision
tabular_data_rounded = tabular_data.round(1)
print("\nReconstruction evaluation:")
print(f"Original shape: {tabular_data_rounded.shape}")
print(f"Generated shape: {generated_array.shape}")
if generated_array.shape == tabular_data_rounded.shape:
mse = np.mean((generated_array - tabular_data_rounded) ** 2)
mae = np.mean(np.abs(generated_array - tabular_data_rounded))
print(f"MSE: {mse:.4f}")
print(f"MAE: {mae:.4f}")
else:
print(f"Shape mismatch - cannot compute metrics")
if MODE == "summary":
summary_parts = []
        # Basic statistics
summary_parts.append(f"Mean: {tabular_data.mean():.2f}")
summary_parts.append(f"Median: {np.median(tabular_data):.2f}")
summary_parts.append(f"Std: {tabular_data.std():.2f}")
summary_parts.append(f"Min: {tabular_data.min():.2f}")
summary_parts.append(f"Max: {tabular_data.max():.2f}")
        # Row means
row_means = tabular_data.mean(axis=1)
row_means_str = ", ".join([f"{m:.2f}" for m in row_means])
summary_parts.append(f"Row means: [{row_means_str}]")
        # Column means
col_means = tabular_data.mean(axis=0)
col_means_str = ", ".join([f"{m:.2f}" for m in col_means])
summary_parts.append(f"Column means: [{col_means_str}]")
        # Correlation matrix (only if there is more than one column)
if tabular_data.shape[1] > 1:
try:
corrcoef = np.corrcoef(tabular_data.T)
corr_str = "Correlation matrix:\n"
for i in range(corrcoef.shape[0]):
corr_row = ", ".join([f"{corrcoef[i, j]:.2f}" for j in range(corrcoef.shape[1])])
corr_str += f" [{corr_row}]\n"
summary_parts.append(corr_str.strip())
            except Exception:
pass
summary_text = "\n".join(summary_parts)
print("True summary:")
print(summary_text)