rl4phyx-backup / root_scripts /check_data.py
YUNTA88's picture
Upload root_scripts/check_data.py with huggingface_hub
62cbd63 verified
import json, os
data_path = "/workspace/rl4phyx/RL4Phyx/SFT/sft_train/sft_train_formatted.jsonl"
with open(data_path) as f:
records = [json.loads(line) for line in f]
print(f"Total samples: {len(records)}")
# Format check
format_ok = 0
format_bad = 0
missing_img = 0
no_cot = 0
short_answer = 0
answer_lens = []
has_think = 0
has_step = 0
for i, r in enumerate(records):
# Check structure
if "messages" not in r or len(r["messages"]) < 2:
format_bad += 1
continue
user = r["messages"][0]
asst = r["messages"][1]
if user.get("role") != "user" or asst.get("role") != "assistant":
format_bad += 1
continue
# Check image
img_found = False
for c in user.get("content", []):
if c.get("type") == "image":
img_path = c["image"].replace("file://", "")
if os.path.isfile(img_path):
img_found = True
else:
missing_img += 1
# Check answer
ans = asst["content"][0]["text"]
answer_lens.append(len(ans))
if len(ans) < 50:
short_answer += 1
# Check CoT indicators
ans_lower = ans.lower()
if "<think>" in ans_lower or "\\boxed" in ans or "step" in ans_lower:
has_step += 1
if "<think>" in ans_lower:
has_think += 1
format_ok += 1
print(f"\nFormat OK: {format_ok}/{len(records)}")
print(f"Format bad: {format_bad}")
print(f"Missing images: {missing_img}")
print(f"Short answers (<50 chars): {short_answer}")
print(f"Has step/boxed/think: {has_step}/{len(records)}")
print(f"Has <think> tag: {has_think}/{len(records)}")
import statistics
print(f"\nAnswer length stats:")
print(f" Min: {min(answer_lens)}")
print(f" Max: {max(answer_lens)}")
print(f" Mean: {int(statistics.mean(answer_lens))}")
print(f" Median: {int(statistics.median(answer_lens))}")
# Show 3 random samples
import random
random.seed(42)
samples = random.sample(range(len(records)), 3)
for idx in samples:
r = records[idx]
ans = r["messages"][1]["content"][0]["text"]
q = [c["text"] for c in r["messages"][0]["content"] if c["type"]=="text"][0]
print(f"\n{'='*50}")
print(f"Sample {idx}")
print(f"Q: {q[:120]}...")
print(f"A ({len(ans)} chars): {ans[:300]}...")