|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Convert jsonl with `image` and `conversations` into |
|
|
a HuggingFace Dataset that LFM2-VL expects. |
|
|
Each sample must contain: |
|
|
- image : str (absolute path or relative to repo root) |
|
|
- messages: List[Dict] # openai-style |
|
|
""" |
|
|
import json, datasets |
|
|
from pathlib import Path |
|
|
from typing import List, Dict |
|
|
import multiprocessing as mp |
|
|
from PIL import Image |
|
|
SYSTEM_MSG = "You are a helpful vision-language assistant." |
|
|
|
|
|
|
|
|
""" |
|
|
Convert jsonl with `image` and `conversations` into |
|
|
a HuggingFace Dataset that works with the medical sample format. |
|
|
""" |
|
|
import json, datasets |
|
|
from pathlib import Path |
|
|
from typing import List, Dict |
|
|
import multiprocessing as mp |
|
|
from PIL import Image |
|
|
def format_vlm_sample(sample):
    """Build the two-turn chat message list for one VLM sample.

    Expects ``sample`` to carry ``image``, ``question`` and ``gt_answer``
    keys. Returns an OpenAI-style message list: a user turn holding the
    image part followed by the text question, then an assistant turn
    holding the ground-truth answer.
    """
    user_parts = [
        {"type": "image", "image": sample["image"]},
        {"type": "text", "text": sample["question"]},
    ]
    answer_part = {"type": "text", "text": sample["gt_answer"]}

    user_turn = {"role": "user", "content": user_parts}
    assistant_turn = {"role": "assistant", "content": [answer_part]}
    return [user_turn, assistant_turn]
|
|
def jsonl_to_dataset_hf_parallel(jsonl_file: str, image_root: str = "", num_workers: int = None):
    """Convert a conversations-style JSONL file into a HF Dataset, in parallel.

    Each input line must be a JSON object with an ``image`` path and a
    ``conversations`` list of human/gpt turns. Rows whose image file is
    missing on disk, or that lack a question/answer pair, are dropped.

    Args:
        jsonl_file: Path to the JSONL file to read.
        image_root: Directory prepended to each record's relative image path.
        num_workers: Processes used by ``datasets.Dataset.map``; defaults to 8.

    Returns:
        datasets.Dataset with columns ``image`` (absolute path str),
        ``question`` and ``gt_answer``.
    """
    if num_workers is None:
        num_workers = 8

    # First pass (serial, cheap): keep only syntactically valid records that
    # have the two required keys. The raw line is carried through so the
    # expensive parsing/validation can run inside the parallel map.
    valid_lines = []
    with open(jsonl_file, encoding="utf-8") as f:
        for line_num, line in enumerate(f):
            line = line.strip()
            if not line:
                continue
            try:
                rec = json.loads(line)
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the parse error detail.
            except json.JSONDecodeError as err:
                print(f"Warning: Line {line_num}: Invalid JSON ({err})")
                continue
            if "image" in rec and "conversations" in rec:
                valid_lines.append({"line": line, "image_root": image_root, "line_num": line_num})

    print(f"Found {len(valid_lines)} valid lines to process")

    # Guard: `from_list([])` yields a dataset with no columns, so the
    # subsequent map(remove_columns=...) would fail. Return an empty
    # dataset with the expected schema instead.
    if not valid_lines:
        return datasets.Dataset.from_dict({"image": [], "question": [], "gt_answer": []})

    raw_dataset = datasets.Dataset.from_list(valid_lines)

    def process_example_safe(example):
        """Map function that always returns a full row; bad rows are flagged
        with valid=False and filtered out afterwards (datasets.map must not
        receive None)."""
        rec = json.loads(example["line"])
        image_path = Path(example["image_root"]) / rec["image"]

        def _row(question, gt_answer, valid):
            # Single place that defines the output schema.
            return {
                "image": str(image_path.absolute()),
                "question": question,
                "gt_answer": gt_answer,
                "valid": valid,
            }

        if not image_path.exists():
            return _row("dummy", "dummy", False)

        # Take the last human turn before the first gpt/assistant turn as the
        # question, and that first model turn as the ground-truth answer.
        question = ""
        gt_answer = ""
        for turn in rec["conversations"]:
            if turn["from"] == "human":
                question = turn["value"].replace("<image>", "").strip()
            elif turn["from"] in ("gpt", "assistant"):
                gt_answer = turn["value"].strip()
                break

        if not question or not gt_answer:
            return _row("dummy", "dummy", False)
        return _row(question, gt_answer, True)

    processed_dataset = raw_dataset.map(
        process_example_safe,
        num_proc=num_workers,
        remove_columns=["line", "image_root", "line_num"],
        desc="Processing medical QA records",
    )

    # Drop rows flagged invalid, then drop the helper column itself.
    valid_dataset = processed_dataset.filter(lambda x: x["valid"])
    valid_dataset = valid_dataset.remove_columns(["valid"])

    print(f"Valid samples after processing: {len(valid_dataset)}")
    print(f"✅ Final dataset size: {len(valid_dataset)} medical QA samples")
    return valid_dataset
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
ds = jsonl_to_dataset_hf_parallel("data/train.jsonl") |
|
|
if len(ds) > 0: |
|
|
print("Sample:", ds[0].keys()) |
|
|
print("Question:", ds[0]["question"]) |
|
|
print("Answer:", ds[0]["gt_answer"]) |
|
|
|