| | from datasets import Dataset |
| | from consts import REASONING_START, REASONING_END, SOLUTION_START, SOLUTION_END |
| |
|
| |
|
| |
|
def is_numeric_answer(example):
    """Return True when ``example["answer"]`` can be converted to a float.

    This is used as a row predicate for ``Dataset.filter``, so it must
    return a real boolean. Returning a non-empty error string on failure
    would be truthy and the row would never be rejected.

    Args:
        example: Mapping holding an ``"answer"`` entry.

    Returns:
        ``True`` if ``float(example["answer"])`` succeeds, ``False`` if the
        key is missing or the value cannot be converted.
    """
    try:
        float(example["answer"])
    except (KeyError, TypeError, ValueError, OverflowError):
        # Missing key, None / non-scalar value, unparsable text, or an int
        # too large for a float: none of these is a usable numeric answer.
        return False
    return True
| |
|
def resize_images(example, size=(512, 512)):
    """Resize ``example["decoded_image"]`` and store the result back.

    Args:
        example: Mapping whose ``"decoded_image"`` value exposes a
            ``resize(size)`` method (presumably a PIL image; confirm
            against the dataset being loaded).
        size: Target size handed unchanged to ``resize``. Defaults to
            ``(512, 512)``, the value that used to be hard-coded, so
            callers such as ``dataset.map(resize_images)`` are unaffected.

    Returns:
        The same ``example`` object, with ``"decoded_image"`` replaced by
        the resized image.
    """
    example["decoded_image"] = example["decoded_image"].resize(size)
    return example
| |
|
| |
|
def convert_to_rgb(example):
    """Ensure ``example["decoded_image"]`` is in ``"RGB"`` mode.

    An image whose ``mode`` is already ``"RGB"`` is kept as the same object;
    any other mode is replaced by the result of ``convert("RGB")``.

    Args:
        example: Mapping whose ``"decoded_image"`` value has a ``mode``
            attribute and a ``convert`` method.

    Returns:
        The same ``example`` object, with ``"decoded_image"`` written back.
    """
    picture = example["decoded_image"]
    example["decoded_image"] = (
        picture if picture.mode == "RGB" else picture.convert("RGB")
    )
    return example
| |
|
| |
|
def make_conversation(example):
    """Build a single-turn, image-plus-text prompt record from one example.

    The text part is the example's question followed by an instruction to
    wrap the reasoning and the final answer in the module's
    REASONING_* / SOLUTION_* markers.

    Args:
        example: Mapping with ``"question"``, ``"decoded_image"`` and
            ``"answer"`` entries.

    Returns:
        A dict with three keys: ``"prompt"`` (a one-element list holding the
        user message), ``"image"`` (the example's ``"decoded_image"``) and
        ``"answer"`` (passed through unchanged).
    """
    instruction = (
        f"{example['question']}, provide your reasoning between {REASONING_START} and {REASONING_END} "
        f"and then your final answer between {SOLUTION_START} and (put a float here) {SOLUTION_END}"
    )

    # The image entry carries no payload here; the picture itself is
    # returned separately under the "image" key.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": instruction},
        ],
    }

    return {
        "prompt": [user_turn],
        "image": example["decoded_image"],
        "answer": example["answer"],
    }
| |
|
| |
|
| |
|
def dataset_setup(dataset: Dataset, tokenizer) -> "tuple[Dataset, Dataset]":
    """Filter, normalize and prompt-format a dataset.

    Steps, in order:
      1. keep only rows accepted by ``is_numeric_answer``;
      2. apply ``resize_images`` and ``convert_to_rgb`` to every row;
      3. add the conversation prompt via ``make_conversation``;
      4. render each prompt to a string with the tokenizer's chat template.

    Args:
        dataset: Source dataset; rows must provide the fields the helper
            functions read (``"answer"``, ``"decoded_image"``,
            ``"question"``).
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        A ``(train_dataset, dataset)`` tuple: the prompt-formatted dataset,
        and the filtered/resized/RGB dataset as it was before prompts were
        added.
    """
    dataset = dataset.filter(is_numeric_answer)
    dataset = dataset.map(resize_images)
    dataset = dataset.map(convert_to_rgb)
    train_dataset = dataset.map(make_conversation)

    # make_conversation writes an "image" column; drop it and expose the
    # preprocessed "decoded_image" column under the name "image" instead.
    train_dataset = train_dataset.remove_columns("image")
    train_dataset = train_dataset.rename_column("decoded_image", "image")

    def _render_prompt(example):
        """Replace the structured prompt with its chat-template string."""
        # NOTE(review): add_generation_prompt=False leaves the rendered text
        # without the assistant-turn header; confirm this is intended for
        # the downstream trainer.
        return {
            "prompt": tokenizer.apply_chat_template(
                example["prompt"],
                tokenize=False,
                add_generation_prompt=False,
            )
        }

    train_dataset = train_dataset.map(_render_prompt)
    return train_dataset, dataset
| |
|
| |
|