|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
import pandas as pd |
|
|
import random |
|
|
from typing import List, Dict |
|
|
import numpy as np |
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
from tqdm import tqdm |
|
|
import datasets |
|
|
|
|
|
import io |
|
|
from datasets import load_dataset, load_from_disk, concatenate_datasets |
|
|
from PIL import Image |
|
|
from tqdm import tqdm |
|
|
from functools import partial |
|
|
from pillow_avif import AvifImagePlugin |
|
|
from datasets import Dataset |
|
|
import json |
|
|
import yaml |
|
|
import os |
|
|
import re |
|
|
import time |
|
|
import random |
|
|
import base64 |
|
|
from openai import AzureOpenAI |
|
|
import concurrent.futures |
|
|
from typing import List, Dict |
|
|
import argparse |
|
|
import time |
|
|
|
|
|
|
|
|
def extract_problem_solution(gpt4o_response): |
|
|
|
|
|
parts = gpt4o_response.split("<think>") |
|
|
|
|
|
|
|
|
problem = parts[0].strip() |
|
|
|
|
|
problem = re.sub(r"^Question:\s*", "", problem) |
|
|
|
|
|
problem = re.sub(r"\s*Answer:\s*$", "", problem).strip() |
|
|
|
|
|
|
|
|
think_parts = [p.split("</think>")[0].strip() for p in parts[1:] if "</think>" in p] |
|
|
solution = f"<think>{' '.join(think_parts)}</think>" |
|
|
|
|
|
|
|
|
if "<answer>" in gpt4o_response: |
|
|
final_answer = ( |
|
|
gpt4o_response.split("<answer>")[-1].split("</answer>")[0].strip() |
|
|
) |
|
|
final_answer = re.sub(r"^Answer:\s*", "", final_answer) |
|
|
solution += f"\n\n<answer>{final_answer}</answer>" |
|
|
|
|
|
return problem, solution |
|
|
|
|
|
|
|
|
def load_image_from_path(image_path): |
|
|
try: |
|
|
img = Image.open(image_path) |
|
|
return img |
|
|
except Exception as e: |
|
|
print(f"Error loading image {image_path}: {str(e)}") |
|
|
return None |
|
|
|
|
|
|
|
|
def process_raw_data(raw_data): |
|
|
|
|
|
if isinstance(raw_data, str): |
|
|
data = json.loads(raw_data) |
|
|
else: |
|
|
data = raw_data |
|
|
|
|
|
|
|
|
try: |
|
|
problem, solution = extract_problem_solution(data["gpt4o_response"]) |
|
|
image = load_image_from_path(data["image_path"]) |
|
|
|
|
|
return { |
|
|
"image": image, |
|
|
"problem": problem, |
|
|
"solution": solution, |
|
|
"original_question": data["question"], |
|
|
"original_answer": data["answer"], |
|
|
} |
|
|
except Exception as e: |
|
|
print(f"Error processing data {data}: {str(e)}") |
|
|
return { |
|
|
"image": None, |
|
|
"problem": None, |
|
|
"solution": None, |
|
|
"original_question": None, |
|
|
"original_answer": None, |
|
|
} |
|
|
|
|
|
|
|
|
raw_data_list = [ |
|
|
"/path/to/reasoning_data_with_response_90k_verified", |
|
|
] |
|
|
|
|
|
raw_data = concatenate_datasets([load_from_disk(path) for path in raw_data_list]) |
|
|
|
|
|
processed_data = raw_data.map(process_raw_data, num_proc=128).shuffle(seed=42) |
|
|
|
|
|
hf_dict = { |
|
|
"image": [], |
|
|
"problem": [], |
|
|
"solution": [], |
|
|
"original_question": [], |
|
|
"original_answer": [], |
|
|
} |
|
|
|
|
|
for item in tqdm(processed_data): |
|
|
hf_dict["image"].append(item["image"]) |
|
|
hf_dict["problem"].append(item["problem"]) |
|
|
hf_dict["solution"].append(item["solution"]) |
|
|
hf_dict["original_question"].append(item["original_question"]) |
|
|
hf_dict["original_answer"].append(item["original_answer"]) |
|
|
|
|
|
|
|
|
features = datasets.Features( |
|
|
{ |
|
|
"image": datasets.Image(), |
|
|
"problem": datasets.Value("string"), |
|
|
"solution": datasets.Value("string"), |
|
|
"original_question": datasets.Value("string"), |
|
|
"original_answer": datasets.Value("string"), |
|
|
} |
|
|
) |
|
|
|
|
|
|
|
|
def has_empty_tags(text): |
|
|
|
|
|
pattern = r"<[^>]+></[^>]+>" |
|
|
return bool(re.search(pattern, text)) |
|
|
|
|
|
|
|
|
def has_answer_pattern(text): |
|
|
if "Answer:" in text: |
|
|
return True |
|
|
return False |
|
|
|
|
|
|
|
|
def has_valid_image_size(example): |
|
|
|
|
|
|
|
|
try: |
|
|
image = example["image"] |
|
|
if isinstance(image, dict) and "height" in image and "width" in image: |
|
|
return image["height"] >= 28 and image["width"] >= 28 |
|
|
|
|
|
return image.height >= 28 and image.width >= 28 |
|
|
except: |
|
|
return False |
|
|
|
|
|
|
|
|
ds = datasets.Dataset.from_dict(hf_dict, features=features) |
|
|
ds = ds.filter( |
|
|
lambda x: not has_empty_tags(x["solution"]) |
|
|
and not has_answer_pattern(x["problem"]) |
|
|
and has_valid_image_size(x) |
|
|
and x["image"] is not None, |
|
|
num_proc=128, |
|
|
) |
|
|
|
|
|
ds.push_to_hub("path/to/your/dataset") |
|
|
|