RR_V1_UNTRAINED / datasets /verify_data.py
algorythmtechnologies's picture
Upload folder using huggingface_hub
947d4a1 verified
"""Quick verification script for generated dataset"""
import json
from collections import Counter
# Default dataset location, relative to the current working directory.
DATASET_PATH = "datasets/synthetic_nozzles.json"

# How many nozzle examples to spot-check, and how much of each line to show.
SPOT_CHECK_LIMIT = 5
MAX_LINE_WIDTH = 80


def count_types(examples):
    """Count examples per type.

    The type of an example is the part of its ``"id"`` before the first
    underscore (e.g. ``"nozzle_001"`` -> ``"nozzle"``).

    Args:
        examples: Iterable of dicts, each with a string ``"id"`` key.

    Returns:
        A ``Counter`` mapping type name to number of examples.

    Raises:
        KeyError: If an example has no ``"id"`` key.
    """
    return Counter(ex["id"].split("_")[0] for ex in examples)


def nozzle_spot_lines(examples, limit=SPOT_CHECK_LIMIT):
    """Collect the report lines for the nozzle spot check.

    Only the first ``limit`` examples whose id starts with ``"nozzle"`` are
    inspected. From each one's ``"reasoning"`` text, a line is reported when
    it contains ``"D* ="`` together with ``"mm"``, or ``"MoS ="`` without
    ``"mm"``.

    Args:
        examples: Iterable of example dicts.
        limit: Maximum number of nozzle examples to inspect.

    Returns:
        A list of formatted strings, each line truncated to
        ``MAX_LINE_WIDTH`` characters and prefixed with the example id.
    """
    nozzles = [ex for ex in examples if ex["id"].startswith("nozzle")]
    report = []
    for ex in nozzles[:limit]:
        # A missing "reasoning" field is treated as empty text, matching how
        # the MoS summary reads the same field.
        for raw_line in ex.get("reasoning", "").split("\n"):
            line = raw_line.strip()
            has_mm = "mm" in line
            if ("D* =" in line and has_mm) or ("MoS =" in line and not has_mm):
                report.append(f" [{ex['id']}] {line[:MAX_LINE_WIDTH]}")
    return report


def mos_summary(examples):
    """Classify examples by the MoS statements found in their reasoning text.

    An example counts as negative if its reasoning contains ``"MoS = -"``
    anywhere; otherwise as positive if it contains ``"MoS ="`` (this includes
    a value of zero); otherwise as having no MoS statement.

    NOTE(review): this does not check whether an example with a negative MoS
    is also flagged as a failure -- the field that would carry that flag is
    not visible here.

    Args:
        examples: Iterable of example dicts.

    Returns:
        A ``(negative, positive, missing)`` tuple of counts that sums to the
        number of examples.
    """
    negative = positive = missing = 0
    for ex in examples:
        reasoning = ex.get("reasoning", "")
        if "MoS = -" in reasoning:
            negative += 1
        elif "MoS =" in reasoning:
            positive += 1
        else:
            # Previously these were lumped into the "positive" count.
            missing += 1
    return negative, positive, missing


def main(path=DATASET_PATH):
    """Load the dataset at ``path`` and print the verification report.

    Args:
        path: Path to the JSON dataset (a list of example dicts).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    print(f"Total examples: {len(data)}")

    # Count types (most frequent first; ties keep first-seen order).
    print("\nComposition:")
    for type_name, count in count_types(data).most_common():
        print(f" {type_name}: {count}")

    # Check nozzle dimensions
    print("\nNozzle Spot Checks:")
    for report_line in nozzle_spot_lines(data):
        print(report_line)

    # Summarise the sign of the reported MoS values.
    print("\nMoS Check:")
    negative, positive, missing = mos_summary(data)
    print(f" Examples with negative MoS: {negative}")
    print(f" Examples with positive MoS: {positive}")
    print(f" Examples without MoS: {missing}")


if __name__ == "__main__":
    main()