cadforge / scripts /verify_all_tasks.py
eventhorizon28's picture
Upload folder using huggingface_hub
7c72eb2 verified
#!/usr/bin/env python3
import json
import sys
import time
import traceback
from pathlib import Path
# Repository root, derived from this script's location (<root>/scripts/<this file>).
# resolve() makes the path absolute first: with a relative __file__ (e.g. the
# script started from inside scripts/ on Python < 3.9) ".parent.parent" would
# otherwise collapse to "." and point at the wrong directory.
_REPO_ROOT = Path(__file__).resolve().parent.parent

# Directory that main() scans for task_* folders.
TASKS_ROOT = _REPO_ROOT / "server" / "tasks"

# Make the repository root importable so the `server.*` imports in this file resolve.
sys.path.insert(0, str(_REPO_ROOT))
def verify_reference_code(task_dir: Path) -> dict:
    """Execute a task's reference code and regenerate its ground truth.

    Reads ``task.json`` and ``reference_code.py`` from *task_dir*, runs the
    code with ``cq``/``cadquery``/``math`` pre-bound, inspects the object the
    code binds to ``result``, and finally calls
    ``server.preprocessor.preprocess_from_code`` for the task.

    Args:
        task_dir: Directory holding ``task.json`` and ``reference_code.py``.

    Returns:
        A dict that always contains ``task_id``, ``code_executes``,
        ``shape_valid``, ``volume_gt_zero``, ``ground_truth_generated`` and
        ``error`` (``None`` on success). ``volume``, ``bbox`` and
        ``face_count`` are added once the shape has been measured, and
        ``gt_volume``, ``gt_euler`` and ``gt_dominant_face`` once
        preprocessing succeeds.

        Problems with a single task (unreadable or malformed task files,
        failing reference code, failing preprocessing) are reported through
        ``error`` instead of being raised, so one broken task cannot abort a
        batch run. When ``task.json`` cannot be used, ``task_id`` falls back
        to the directory name.
    """
    result_dict = {
        "task_id": task_dir.name,
        "code_executes": False,
        "shape_valid": False,
        "volume_gt_zero": False,
        "ground_truth_generated": False,
        "error": None,
    }

    # Load the task inputs. ValueError covers json.JSONDecodeError and
    # UnicodeDecodeError; KeyError/TypeError cover a task.json without a
    # usable "id" entry.
    try:
        with open(task_dir / "task.json", encoding="utf-8") as f:
            task_data = json.load(f)
        result_dict["task_id"] = task_data["id"]
        code = (task_dir / "reference_code.py").read_text(encoding="utf-8")
    except (OSError, ValueError, KeyError, TypeError) as e:
        result_dict["error"] = f"Could not load task files: {type(e).__name__}: {e}"
        return result_dict

    try:
        # Imported lazily so that a missing CAD dependency is reported as a
        # per-task error rather than failing at module import.
        import cadquery as cq
        import math

        local_ns = {"cq": cq, "cadquery": cq, "math": math}
        # NOTE: exec() runs the reference code in-process with full privileges;
        # only use this on trusted, repository-owned task files.
        exec(code, local_ns)
        if "result" not in local_ns:
            result_dict["error"] = "No 'result' variable defined"
            return result_dict
        result_dict["code_executes"] = True

        # `result` may be a wrapper exposing .val() (unwrapped here) or
        # already the underlying shape object.
        result_obj = local_ns["result"]
        if hasattr(result_obj, "val"):
            shape = result_obj.val()
        else:
            shape = result_obj

        result_dict["shape_valid"] = shape.isValid()

        vol = shape.Volume()
        result_dict["volume"] = round(vol, 2)
        result_dict["volume_gt_zero"] = vol > 0

        bb = shape.BoundingBox()
        result_dict["bbox"] = [round(bb.xlen, 2), round(bb.ylen, 2), round(bb.zlen, 2)]

        result_dict["face_count"] = len(shape.Faces())
    except Exception as e:
        # Boundary handler: whatever the reference code raises becomes a
        # recorded failure for this task.
        result_dict["error"] = f"{type(e).__name__}: {e}"
        return result_dict

    try:
        from server.preprocessor import preprocess_from_code

        gt = preprocess_from_code(code, str(task_dir), task_id=task_data["id"])
        result_dict["ground_truth_generated"] = True
        result_dict["gt_volume"] = gt.get("volume_mm3")
        result_dict["gt_euler"] = gt.get("euler_characteristic")
        result_dict["gt_dominant_face"] = gt.get("dominant_face_type")
    except Exception as e:
        result_dict["error"] = f"Preprocessing failed: {type(e).__name__}: {e}"

    return result_dict
def run_reward_verification(task_dir: Path) -> dict:
result_dict = {
"reward_computed": False,
"reward_value": 0.0,
"error": None,
}
gt_json = task_dir / "ground_truth.json"
if not gt_json.exists():
result_dict["error"] = "No ground_truth.json"
return result_dict
ref_code = (task_dir / "reference_code.py").read_text()
try:
from server.executor import execute_cadquery_code
exec_result = execute_cadquery_code(ref_code, timeout=15.0)
if not exec_result["success"]:
result_dict["error"] = f"Execution failed: {exec_result['error']}"
return result_dict
props = exec_result["properties"]
import numpy as np
from server.preprocessor import sample_surface_points, voxelize, normalize_shape
import cadquery as cq
import math
local_ns = {"cq": cq, "cadquery": cq, "math": math}
exec(ref_code, local_ns)
result_obj = local_ns["result"]
shape = result_obj.val() if hasattr(result_obj, "val") else result_obj
normalized_shape, _ = normalize_shape(shape)
agent_points = sample_surface_points(normalized_shape, 2048)
agent_voxels = voxelize(normalized_shape, 64)
gt_points = np.load(str(task_dir / "surface_points.npy"))
gt_voxels = np.load(str(task_dir / "voxels_64.npy"))
from server.reward import compute_iou, best_of_6_iou, compute_mean_chamfer, compute_median_chamfer
iou = compute_iou(agent_voxels, gt_voxels)
iou_best = best_of_6_iou(agent_voxels, gt_voxels)
mean_cd = compute_mean_chamfer(agent_points, gt_points)
median_cd = compute_median_chamfer(agent_points, gt_points)
with open(gt_json) as f:
gt_data = json.load(f)
bbox_mm = gt_data.get("bbox_mm", [1, 1, 1])
bbox_diag = (sum(d**2 for d in bbox_mm)) ** 0.5
threshold = bbox_diag * 0.1
mean_cd_r = max(0, 1 - mean_cd / threshold) if threshold > 0 else 0
median_cd_r = max(0, 1 - median_cd / threshold) if threshold > 0 else 0
rgeom = 0.60 * iou_best + 0.20 * mean_cd_r + 0.20 * median_cd_r
frame_gap = iou_best - iou
frame_score = 0.1 if frame_gap > 0.15 else 1.0
norm_bb = normalized_shape.BoundingBox()
agent_bbox = [round(norm_bb.xlen, 4), round(norm_bb.ylen, 4), round(norm_bb.zlen, 4)]
gt_bbox = gt_data["bbox_mm"]
sorted_a = sorted(agent_bbox, reverse=True)
sorted_t = sorted(gt_bbox, reverse=True)
def match(a, b, tol=0.05):
for ai, bi in zip(a, b):
if bi == 0:
continue
if abs(ai - bi) / max(abs(bi), 1e-6) > tol:
return False
return True
s_match = match(sorted_a, sorted_t)
u_match = match(agent_bbox, gt_bbox)
param_score = 0.1 if (s_match and not u_match) else 1.0
face_score = 1.0 if props["dominant_face_type"] == gt_data["dominant_face_type"] else 0.0
reval = 0.40 * frame_score + 0.40 * param_score + 0.20 * face_score
total = 1.0 * (0.70 * rgeom + 0.30 * reval)
result_dict["reward_computed"] = True
result_dict["reward_value"] = round(total, 4)
result_dict["detail"] = {
"iou": round(iou, 4),
"iou_best": round(iou_best, 4),
"mean_cd": round(mean_cd, 4),
"mean_cd_reward": round(mean_cd_r, 4),
"median_cd": round(median_cd, 4),
"median_cd_reward": round(median_cd_r, 4),
"rgeom": round(rgeom, 4),
"frame_score": frame_score,
"param_score": param_score,
"face_score": face_score,
"reval": round(reval, 4),
}
except Exception as e:
result_dict["error"] = f"{type(e).__name__}: {e}\n{traceback.format_exc()}"
return result_dict
def main():
    """Run both verification phases over every task and report the outcome.

    Phase 1 calls ``verify_reference_code`` for each ``task_*`` directory
    under ``TASKS_ROOT``; a task passes only when the code executes, the shape
    is valid, its volume is positive and ground truth was generated. Phase 2
    calls ``run_reward_verification`` and counts tasks whose reward exceeds
    0.85.

    All per-task results are written to ``verification_results.json`` in the
    parent directory of ``TASKS_ROOT``. The process exits with status 1 when
    any task fails either phase.
    """
    print("=" * 80)
    print("CadForge Task Verification Pipeline")
    print("=" * 80)

    # Only directories are tasks; a stray file named task_* is ignored.
    task_dirs = sorted(p for p in TASKS_ROOT.glob("task_*") if p.is_dir())
    print(f"\nFound {len(task_dirs)} tasks\n")

    # The same predicate drives the printed status and the pass count, so the
    # summary and exit code cannot disagree with the per-task lines.
    phase1_checks = (
        "code_executes",
        "shape_valid",
        "volume_gt_zero",
        "ground_truth_generated",
    )
    phase1_results = []
    phase1_pass = 0
    print("PHASE 1: Verify reference codes execute and generate ground truth")
    print("-" * 60)
    for task_dir in task_dirs:
        t0 = time.perf_counter()
        result = verify_reference_code(task_dir)
        elapsed = time.perf_counter() - t0

        passed = all(result[key] for key in phase1_checks)
        phase1_pass += passed
        status = "PASS" if passed else "FAIL"
        print(f"  {result['task_id']:35s} {status:5s} ({elapsed:.1f}s) "
              f"vol={result.get('volume', 'N/A')} bbox={result.get('bbox', 'N/A')}")
        if result.get("error"):
            print(f"    ERROR: {result['error']}")
        phase1_results.append(result)

    print(f"\nPhase 1: {phase1_pass}/{len(phase1_results)} tasks passed\n")

    print("PHASE 2: Verify reward scores for reference code (should be ~1.0)")
    print("-" * 60)
    phase2_results = []
    for task_dir in task_dirs:
        t0 = time.perf_counter()
        result = run_reward_verification(task_dir)
        elapsed = time.perf_counter() - t0

        task_id = task_dir.name
        reward = result.get("reward_value", 0)
        status = "PASS" if reward > 0.85 else "WARN" if reward > 0.5 else "FAIL"
        print(f"  {task_id:35s} {status:5s} reward={reward:.4f} ({elapsed:.1f}s)")
        if result.get("error"):
            # Errors carry a full traceback; keep the console line short.
            print(f"    ERROR: {result['error'][:200]}")
        if result.get("detail"):
            d = result["detail"]
            print(f"    IoU={d['iou_best']:.3f} MeanCD_r={d['mean_cd_reward']:.3f} "
                  f"MedianCD_r={d['median_cd_reward']:.3f} Rgeom={d['rgeom']:.3f} Reval={d['reval']:.3f}")
        phase2_results.append(result)

    phase2_pass = sum(1 for r in phase2_results if r.get("reward_value", 0) > 0.85)
    print(f"\nPhase 2: {phase2_pass}/{len(phase2_results)} tasks score > 0.85")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print(f"  Phase 1 (code + ground truth): {phase1_pass}/{len(phase1_results)}")
    print(f"  Phase 2 (reward > 0.85):       {phase2_pass}/{len(phase2_results)}")
    print("=" * 80)

    # default=str stringifies any value json cannot encode natively instead
    # of failing the dump.
    results_path = TASKS_ROOT.parent / "verification_results.json"
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump({
            "phase1": phase1_results,
            "phase2": phase2_results,
        }, f, indent=2, default=str)
    print("\nDetailed results saved to verification_results.json")

    if phase1_pass < len(phase1_results) or phase2_pass < len(phase2_results):
        sys.exit(1)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()