waltgrace's picture
Initial release: data-label-factory v0.1.0
4cda727 verified
"""
experiments.py — dated experiment folder convention.

Every pipeline run goes into experiments/<YYYY-MM-DD_HHMMSS>_<name>/
with a README + config.json so we can compare runs over time.

Layout:

    experiments/
    ├── 2026-04-07_193000_first-yt-batch/
    │   ├── README.md          ← what this run was, parameters, observations
    │   ├── config.json        ← exact CLI args
    │   ├── gather/            ← gather_v2 outputs (images go to drone-dataset-v2/)
    │   │   ├── manifest.json
    │   │   └── stats.json
    │   ├── filter_qwen/       ← run_qwen_filter outputs
    │   │   ├── keep_list.json
    │   │   └── stats.json
    │   ├── label_falcon/      ← pod_label outputs (from RunPod)
    │   │   ├── coco.json
    │   │   └── stats.json
    │   ├── verify_qwen/       ← verify_vlm outputs (from RunPod)
    │   │   ├── verified.json
    │   │   └── stats.json
    │   └── reviews/           ← human verdicts from the web UI
    │       └── reviews.json
    └── latest -> 2026-04-07_193000_first-yt-batch/   ← symlink to most recent

The drone-dataset-v2/ images themselves are SHARED across experiments —
each experiment writes labels/filters/verifications referencing those images,
not copies of them.
"""
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
def make_experiment_dir(name: str = "", base: str = "experiments") -> str:
    """Create a fresh, timestamped experiment directory and return its path.

    The folder is named ``<YYYY-MM-DD_HHMMSS>_<name>`` (or just the timestamp
    when *name* is empty or only whitespace). It is created under *base*
    together with the standard per-stage subdirectories.

    Args:
        name: Optional human-readable suffix. Surrounding whitespace is
            stripped, spaces become ``-`` and both forward slashes and
            backslashes become ``_``, so the suffix can never introduce an
            extra directory level.
        base: Directory that holds all experiments; created if missing.

    Returns:
        The absolute path of the experiment directory.
    """
    ts = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    safe_name = name.strip() if name else ""
    # Neutralise both separator styles: on Windows a backslash would
    # otherwise split the suffix into nested directories.
    for old, new in ((" ", "-"), ("/", "_"), ("\\", "_")):
        safe_name = safe_name.replace(old, new)
    folder = f"{ts}_{safe_name}" if safe_name else ts
    full = os.path.abspath(os.path.join(base, folder))
    # exist_ok=True: two calls in the same second with the same name resolve
    # to the same folder instead of raising.
    os.makedirs(full, exist_ok=True)
    # Create the standard subdirs, one per pipeline stage.
    for sub in ("gather", "filter_qwen", "label_falcon", "verify_qwen", "reviews"):
        os.makedirs(os.path.join(full, sub), exist_ok=True)
    return full
def write_readme(experiment_dir: str, name: str, description: str, params: dict) -> None:
    """Write ``README.md`` into *experiment_dir* describing the experiment.

    Args:
        experiment_dir: Existing directory that receives the README.
        name: Title of the experiment; when empty, the directory's base name
            is used instead.
        description: Free-form text for the "Description" section; a
            placeholder is written when empty.
        params: JSON-serialisable parameters, rendered as an indented JSON
            code block.

    Raises:
        TypeError: If *params* contains values ``json.dumps`` cannot encode.
        OSError: If the file cannot be written.
    """
    readme_path = os.path.join(experiment_dir, "README.md")
    lines = [
        f"# Experiment: {name or os.path.basename(experiment_dir)}",
        "",
        f"**Started:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"**Path:** `{experiment_dir}`",
        "",
        "## Description",
        "",
        description or "(no description)",
        "",
        "## Parameters",
        "",
        "```json",
        json.dumps(params, indent=2),
        "```",
        "",
        "## Pipeline stages",
        "",
        "1. **gather/** — image gathering manifest",
        "2. **filter_qwen/** — image-level Qwen YES/NO filter results",
        "3. **label_falcon/** — Falcon Perception bbox grounding (COCO format)",
        "4. **verify_qwen/** — per-bbox Qwen verification",
        "5. **reviews/** — human verdicts from the web UI",
        "",
    ]
    # The text contains non-ASCII characters (em dashes, possibly more in the
    # user-supplied description), so pin the encoding instead of relying on
    # the platform default.
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
def write_config(experiment_dir: str, config: dict):
    """Persist *config* as ``config.json`` inside *experiment_dir*.

    The JSON is pretty-printed with a two-space indent and replaces any
    ``config.json`` already present in that directory.
    """
    config_path = os.path.join(experiment_dir, "config.json")
    with open(config_path, "w") as handle:
        json.dump(config, handle, indent=2)
def update_latest_symlink(experiment_dir: str, base: str = "experiments"):
    """Point ``<base>/latest`` at *experiment_dir* using a relative symlink.

    The link target is only the final path component of *experiment_dir*, so
    the link stays valid if the whole *base* directory is moved.

    This is best-effort: if ``latest`` exists as something other than a
    symlink it is left alone, and a failure to create the link (for example
    on a filesystem without symlink support) is ignored.

    Args:
        experiment_dir: Path of the experiment directory to link to. A
            trailing path separator is tolerated.
        base: Directory that contains the experiments and the ``latest`` link.
    """
    # normpath drops a trailing separator, which would otherwise make
    # basename() return "" and leave no usable target.
    target = ""
    if experiment_dir:
        target = os.path.basename(os.path.normpath(experiment_dir))
    if target in ("", ".", ".."):
        # Nothing sensible to link to. Bail out BEFORE removing the existing
        # link so a bad argument cannot destroy a valid "latest".
        return

    link = os.path.join(os.path.abspath(base), "latest")
    if os.path.islink(link):
        os.unlink(link)
    elif os.path.exists(link):
        # Don't clobber a real file or directory named "latest".
        return
    try:
        # target_is_directory matters only on Windows, where a relative target
        # cannot be auto-detected as a directory; it is ignored elsewhere.
        os.symlink(target, link, target_is_directory=True)
    except OSError:
        pass  # symlinks can fail on some filesystems
def list_experiments(base: str = "experiments") -> list:
    """List the experiment directories found under *base*.

    Entries are sorted by directory name in reverse order; because names
    start with a ``YYYY-MM-DD_HHMMSS`` timestamp this yields newest first.
    The ``latest`` entry and anything that is not a directory are skipped.

    Args:
        base: Directory that contains the experiment folders.

    Returns:
        A list of dicts with the keys ``name`` (folder name), ``path``
        (*base* joined with the folder name), ``config`` (parsed
        ``config.json``, or ``{}`` when it is missing or unreadable) and
        ``has_readme`` (whether ``README.md`` exists). The list is empty
        when *base* is not an existing directory.
    """
    if not os.path.isdir(base):
        return []
    out = []
    for entry in sorted(os.listdir(base), reverse=True):
        if entry == "latest":
            continue
        full = os.path.join(base, entry)
        if not os.path.isdir(full):
            continue
        cfg = {}
        try:
            with open(os.path.join(full, "config.json"), encoding="utf-8") as f:
                cfg = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or malformed config: listing is best-effort,
            # so report the experiment with an empty config.
            pass
        out.append({
            "name": entry,
            "path": full,
            "config": cfg,
            "has_readme": os.path.exists(os.path.join(full, "README.md")),
        })
    return out
if __name__ == "__main__":
    # Command-line entry point: "new" creates an experiment folder and prints
    # its path, "list" prints the existing ones, anything else shows the help.
    import argparse

    parser = argparse.ArgumentParser()
    subcommands = parser.add_subparsers(dest="cmd")

    new_cmd = subcommands.add_parser("new", help="Create a new dated experiment folder")
    new_cmd.add_argument("--name", default="", help="Optional human-readable suffix")
    new_cmd.add_argument("--description", default="")

    subcommands.add_parser("list", help="List existing experiments")

    cli = parser.parse_args()
    command = cli.cmd

    if command == "list":
        for experiment in list_experiments():
            print(f" {experiment['name']}")
    elif command == "new":
        created = make_experiment_dir(cli.name)
        # The CLI has no parameters to record, so the README gets an empty dict.
        write_readme(created, cli.name, cli.description, {})
        update_latest_symlink(created)
        print(created)
    else:
        parser.print_help()