waltgrace committed on
Commit
82cfd5b
·
verified ·
1 Parent(s): 68cddd0

feat: full pipeline end-to-end, OpenRouter labeling, verify stage

Browse files
data_label_factory/cli.py CHANGED
@@ -645,34 +645,368 @@ def cmd_label(args):
645
  print(f" {len(coco['images'])} images, {len(coco['annotations'])} bboxes")
646
 
647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  def cmd_pipeline(args):
649
- """Full pipeline: gather → filter for the project."""
 
 
 
 
 
 
 
650
  proj = load_project(args.project)
 
 
 
 
651
  print("=" * 70)
652
  print(f"PIPELINE — {proj.project_name} ({proj.target_object})")
 
 
 
653
  print("=" * 70)
654
 
655
  exp = make_experiment_dir(f"pipeline-{proj.project_name}")
656
  write_readme(exp, f"pipeline-{proj.project_name}",
657
  description=f"Full pipeline for {proj.target_object}",
658
  params=vars(args))
659
- write_config(exp, {"project": proj.raw, **vars(args)})
 
 
 
660
  update_latest_symlink(exp)
661
  print(f"Experiment: {exp}\n")
662
 
663
- # 1. Gather
664
- print(">>> GATHER")
665
- args.experiment = os.path.basename(exp).split("_", 2)[-1]
666
- cmd_gather(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
667
 
668
- # 2. Filter
669
- print("\n>>> FILTER")
670
- args.experiment = os.path.basename(exp)
671
- cmd_filter(args)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
 
673
- # Label + verify TBD via pod or qwen — skipping in this MVP
674
- print("\n>>> LABEL + VERIFY: skipped in MVP — use drone_factory pod path or extend")
675
- print(f"\nPIPELINE DONE — {exp}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
676
 
677
 
678
  def cmd_list(args):
@@ -818,14 +1152,26 @@ def main():
818
  sl.add_argument("--experiment", default=None)
819
  sl.add_argument("--limit", type=int, default=0)
820
 
821
- spi = sub.add_parser("pipeline", help="Full chain: gather → filter (label/verify TBD)")
822
  spi.add_argument("--project", required=True)
823
  spi.add_argument("--max-per-query", type=int, default=20)
824
  spi.add_argument("--workers", type=int, default=50)
825
  spi.add_argument("--experiment", default=None)
826
  spi.add_argument("--limit", type=int, default=0)
 
 
 
 
 
 
827
  add_backend_flag(spi)
828
 
 
 
 
 
 
 
829
  sl2 = sub.add_parser("label-v2", help="Label via provider registry (falcon, wilddet3d, chandra)")
830
  sl2.add_argument("--project", required=True)
831
  sl2.add_argument("--backend", default=None,
@@ -885,6 +1231,7 @@ def main():
885
  "filter": cmd_filter,
886
  "label": cmd_label,
887
  "label-v2": cmd_label_v2,
 
888
  "pipeline": cmd_pipeline,
889
  "list": cmd_list,
890
  "providers": cmd_providers,
 
645
  print(f" {len(coco['images'])} images, {len(coco['annotations'])} bboxes")
646
 
647
 
648
def cmd_verify_v2(args):
    """Verify bboxes from a COCO file using a VLM provider (per-bbox YES/NO).

    Looks for ``*.coco.json`` files under the experiment directory, asks the
    provider for a verdict on every annotation (optionally capped by
    ``args.limit``) and writes ``verify_<backend>/verified.json`` inside the
    experiment directory.

    Args:
        args: Parsed CLI namespace. Reads ``project``, ``backend``,
            ``experiment`` (falls back to ``"latest"``) and ``limit``
            (``0`` means no limit).
    """
    from .providers import create_provider

    proj = load_project(args.project)
    backend = args.backend or proj.backend_for("verify") or "openrouter"

    # Find COCO file. Sorted so the choice is reproducible: os.walk order
    # depends on the filesystem.
    exp_dir = resolve_experiment(args.experiment or "latest")
    coco_files = sorted(
        os.path.join(dirpath, fn)
        for dirpath, _, filenames in os.walk(exp_dir)
        for fn in filenames
        if fn.endswith(".coco.json")
    )
    if not coco_files:
        print(f" No COCO files in {exp_dir}")
        return
    coco_path = coco_files[0]
    if len(coco_files) > 1:
        print(f" {len(coco_files)} COCO files found, using the first one")
    print(f"Verifying bboxes in {coco_path} via {backend}")

    with open(coco_path, encoding="utf-8") as f:
        coco = json.load(f)

    img_root = proj.local_image_dir()
    images_by_id = {img["id"]: img for img in coco.get("images", [])}
    categories = {cat["id"]: cat["name"] for cat in coco.get("categories", [])}
    annotations = coco.get("annotations", [])

    if args.limit > 0:
        annotations = annotations[:args.limit]

    try:
        provider = create_provider(backend)
    except Exception as e:
        print(f" {e}")
        return

    total = len(annotations)
    print(f" {total} bboxes to verify")
    results = []
    counts = {"YES": 0, "NO": 0, "UNSURE": 0, "ERROR": 0}
    t0 = time.time()

    for i, ann in enumerate(annotations, 1):
        img = images_by_id.get(ann["image_id"], {})
        file_name = img.get("file_name", "")
        img_path = os.path.join(img_root, file_name)
        cat_name = categories.get(ann.get("category_id"), "object")
        bbox = ann["bbox"]

        # isfile (not exists): joining an empty file name yields the image
        # directory itself, which "exists" but is not a usable image.
        if not os.path.isfile(img_path):
            verdict = "ERROR"
            record = {"ann_id": ann["id"], "verdict": "ERROR", "detail": "image not found"}
        else:
            try:
                vr = provider.verify_bbox(img_path, bbox, cat_name)
                verdict, raw_answer, elapsed = vr.verdict, vr.raw_answer, vr.elapsed
            except Exception as e:
                # Best-effort: one failing bbox must not abort the whole run.
                verdict, raw_answer, elapsed = "ERROR", str(e), 0
            record = {
                "ann_id": ann["id"], "image": file_name,
                "category": cat_name, "bbox": bbox,
                "verdict": verdict, "raw_answer": (raw_answer or "")[:120],
                "elapsed": round(elapsed, 2),
            }

        counts[verdict] = counts.get(verdict, 0) + 1
        results.append(record)

        if i % 10 == 0 or i == total:
            elapsed_total = time.time() - t0
            rate = i / max(elapsed_total, 1)
            eta = (total - i) / max(rate, 0.001) / 60
            print(f" [{i:4d}/{total}] YES={counts.get('YES',0)} NO={counts.get('NO',0)} "
                  f"ERR={counts.get('ERROR',0)} ETA {eta:.1f} min")

    # Save
    out_dir = os.path.join(exp_dir, f"verify_{backend}")
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, "verified.json")
    with open(out_path, "w") as f:
        json.dump({"backend": backend, "project": proj.project_name,
                   "counts": counts, "results": results}, f, indent=2)
    print(f"\nSaved {out_path}")
    approve_rate = counts.get("YES", 0) / max(1, len(results))
    print(f" Approval rate: {approve_rate:.0%} ({counts.get('YES',0)}/{len(results)})")
733
+
734
+
735
def _pipeline_collect_images(img_root, limit=0):
    """Return ``(bucket, rel_path, full_path)`` tuples for images under *img_root*.

    Only files at least one directory below *img_root* are kept; the bucket
    is the first two path components of the relative path. Results are
    sorted by relative path so that *limit* selects a reproducible subset.
    """
    found = []
    # os.walk yields nothing for a missing directory, so no existence check.
    for root, _, names in os.walk(img_root):
        for n in names:
            if not n.lower().endswith((".jpg", ".jpeg", ".png", ".webp")):
                continue
            full = os.path.join(root, n)
            rel = os.path.relpath(full, img_root)
            # Normalise separators so bucketing also works where os.sep != "/".
            parts = rel.replace(os.sep, "/").split("/")
            if len(parts) < 2:
                continue
            found.append(("/".join(parts[:2]), rel, full))
    found.sort(key=lambda item: item[1])
    return found[:limit] if limit > 0 else found


def _pipeline_eta_minutes(done, total, started):
    """Estimate the remaining minutes from progress since *started*."""
    rate = done / max(time.time() - started, 1)
    return (total - done) / max(rate, 0.001) / 60


def cmd_pipeline(args):
    """Full pipeline: gather → filter → label → verify → score.

    Runs the complete data labeling factory end-to-end.
    Uses the v2 provider registry for all stages.

    Each stage writes its output under a fresh experiment directory:
    ``filter_<backend>/keep_list.json``, ``label_<backend>/<project>.coco.json``
    and ``verify_<backend>/verified.json``. A failure on a single image or
    bbox is recorded and the run continues; a label provider that cannot be
    created stops the pipeline.

    Args:
        args: Parsed CLI namespace. Reads ``project``, ``limit`` and the
            optional ``skip_gather``, ``label_backend`` and ``verify_backend``
            attributes; ``experiment`` is overwritten before gathering.
    """
    from .providers import create_provider
    from .metrics import score_coco

    proj = load_project(args.project)
    filter_backend = resolve_backend(args, proj, "filter")
    label_backend = getattr(args, "label_backend", None) or proj.backend_for("label") or "falcon"
    verify_backend = getattr(args, "verify_backend", None) or proj.backend_for("verify") or filter_backend

    print("=" * 70)
    print(f"PIPELINE — {proj.project_name} ({proj.target_object})")
    print(f" filter: {filter_backend}")
    print(f" label: {label_backend}")
    print(f" verify: {verify_backend}")
    print("=" * 70)

    exp = make_experiment_dir(f"pipeline-{proj.project_name}")
    write_readme(exp, f"pipeline-{proj.project_name}",
                 description=f"Full pipeline for {proj.target_object}",
                 params=vars(args))
    write_config(exp, {"project": proj.raw, **vars(args),
                       "filter_backend": filter_backend,
                       "label_backend": label_backend,
                       "verify_backend": verify_backend})
    update_latest_symlink(exp)
    print(f"Experiment: {exp}\n")

    skip_gather = getattr(args, "skip_gather", False)
    img_root = proj.local_image_dir()

    # ── 1. GATHER ──
    if not skip_gather:
        print("=" * 50)
        print(">>> [1/4] GATHER")
        print("=" * 50)
        args.experiment = os.path.basename(exp).split("_", 2)[-1]
        cmd_gather(args)
    else:
        print(">>> [1/4] GATHER — skipped (--skip-gather)")

    # Collect images
    images = _pipeline_collect_images(img_root, args.limit)
    print(f"\n {len(images)} images found in {img_root}")

    if not images:
        print(" No images — pipeline stopped. Run gather first.")
        return

    # ── 2. FILTER ──
    print("\n" + "=" * 50)
    print(f">>> [2/4] FILTER via {filter_backend}")
    print("=" * 50)

    try:
        filter_prov = create_provider(filter_backend)
    except Exception as e:
        print(f" Filter provider error: {e}")
        print(" Falling back to all-YES (no filter)")
        filter_prov = None

    prompt = proj.prompt("filter")
    filter_results = []
    counts = {"YES": 0, "NO": 0, "UNKNOWN": 0, "ERROR": 0}
    t0 = time.time()

    for i, (bucket, rel, full) in enumerate(images, 1):
        if filter_prov:
            try:
                fr = filter_prov.filter_image(full, prompt)
                verdict, raw, elapsed_img = fr.verdict, fr.raw_answer, fr.elapsed
            except Exception as e:
                # One bad image must not abort the run; record it as ERROR.
                verdict, raw, elapsed_img = "ERROR", str(e), 0
        else:
            verdict, raw, elapsed_img = "YES", "no filter", 0

        counts[verdict] = counts.get(verdict, 0) + 1
        filter_results.append({
            "image_path": rel, "bucket": bucket, "verdict": verdict,
            "raw_answer": (raw or "")[:120], "elapsed_seconds": round(elapsed_img, 3),
        })
        if i % 10 == 0 or i == len(images):
            eta = _pipeline_eta_minutes(i, len(images), t0)
            print(f" [{i:4d}/{len(images)}] YES={counts['YES']} NO={counts['NO']} ETA {eta:.0f} min")

    out_dir = os.path.join(exp, f"filter_{filter_backend}")
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, "keep_list.json"), "w") as f:
        json.dump({"backend": filter_backend, "project": proj.project_name,
                   "counts": counts, "results": filter_results}, f, indent=2)
    print(f" YES rate: {counts['YES']}/{len(images)} ({counts['YES']/max(1,len(images)):.0%})")

    # Keep only YES images for labeling
    yes_images = [entry for entry, fr in zip(images, filter_results)
                  if fr["verdict"] == "YES"]
    print(f" {len(yes_images)} images pass filter → label stage")

    if not yes_images:
        print(" No images passed filter — pipeline stopped.")
        print(f"\nPIPELINE DONE — {exp}")
        return

    # ── 3. LABEL ──
    print("\n" + "=" * 50)
    print(f">>> [3/4] LABEL via {label_backend}")
    print("=" * 50)

    try:
        label_prov = create_provider(label_backend)
    except Exception as e:
        print(f" Label provider error: {e}")
        print(f"\nPIPELINE STOPPED at label stage — {exp}")
        return

    from PIL import Image

    queries = proj.falcon_queries
    # Fallback category for detections without one; guarded so an empty
    # query list cannot raise IndexError.
    default_category = queries[0] if queries else "object"
    coco = {
        "info": {
            "description": f"data_label_factory pipeline for {proj.project_name}",
            "date_created": datetime.now().isoformat(timespec="seconds"),
            "target_object": proj.target_object,
            "filter_backend": filter_backend,
            "label_backend": label_backend,
        },
        "images": [],
        "annotations": [],
        "categories": [
            {"id": idx + 1, "name": q, "supercategory": "object"}
            for idx, q in enumerate(queries)
        ],
    }
    cat_id = {q: idx + 1 for idx, q in enumerate(queries)}
    next_img_id, next_ann_id = 1, 1
    n_total_dets = 0
    t0 = time.time()

    for i, (bucket, rel, full) in enumerate(yes_images, 1):
        try:
            # Context manager closes the file handle once the size is read.
            with Image.open(full) as im:
                iw, ih = im.size
        except Exception as e:
            print(f" skipping unreadable image {rel}: {e}")
            continue

        img_id = next_img_id
        next_img_id += 1
        coco["images"].append({
            "id": img_id, "file_name": rel, "width": iw, "height": ih, "bucket": bucket
        })

        try:
            detections = label_prov.label_image(full, queries, image_wh=(iw, ih)).annotations
        except Exception as e:
            # Keep the image entry but carry on without detections for it.
            print(f" label failed for {rel}: {e}")
            detections = []

        for ann in detections:
            cat_name = ann.get("category", default_category)
            cid = cat_id.get(cat_name)
            if cid is None:
                # The provider returned a category outside the project queries.
                cid = len(coco["categories"]) + 1
                coco["categories"].append({"id": cid, "name": cat_name, "supercategory": "object"})
                cat_id[cat_name] = cid

            coco["annotations"].append({
                "id": next_ann_id, "image_id": img_id,
                "category_id": cid,
                "bbox": ann["bbox"],
                "area": round(ann["bbox"][2] * ann["bbox"][3], 2),
                "iscrowd": 0,
                "score": ann.get("score", 1.0),
            })
            next_ann_id += 1
            n_total_dets += 1

        if i % 5 == 0 or i == len(yes_images):
            eta = _pipeline_eta_minutes(i, len(yes_images), t0)
            print(f" [{i:4d}/{len(yes_images)}] dets={n_total_dets} ETA {eta:.0f} min")

    out_dir = os.path.join(exp, f"label_{label_backend}")
    os.makedirs(out_dir, exist_ok=True)
    coco_path = os.path.join(out_dir, f"{proj.project_name}.coco.json")
    with open(coco_path, "w") as f:
        json.dump(coco, f, indent=2)
    print(f" {len(coco['images'])} images, {n_total_dets} bboxes → {coco_path}")

    # ── 4. VERIFY ──
    print("\n" + "=" * 50)
    print(f">>> [4/4] VERIFY via {verify_backend}")
    print("=" * 50)

    try:
        verify_prov = create_provider(verify_backend)
    except Exception as e:
        print(f" Verify provider error: {e} — skipping verify")
        verify_prov = None

    verify_results = []
    v_counts = {"YES": 0, "NO": 0, "UNSURE": 0, "ERROR": 0}

    if verify_prov and n_total_dets > 0:
        verify_limit = args.limit if args.limit > 0 else len(coco["annotations"])
        anns_to_verify = coco["annotations"][:verify_limit]
        # Build the lookup tables once instead of once per annotation.
        images_by_id = {im["id"]: im for im in coco["images"]}
        names_by_cat = {c["id"]: c["name"] for c in coco["categories"]}
        t0 = time.time()

        for i, ann in enumerate(anns_to_verify, 1):
            img = images_by_id.get(ann["image_id"], {})
            img_path = os.path.join(img_root, img.get("file_name", ""))
            cat_name = names_by_cat.get(ann["category_id"], "object")

            if not os.path.isfile(img_path):
                verify_results.append({"ann_id": ann["id"], "verdict": "ERROR"})
                v_counts["ERROR"] += 1
                continue

            try:
                verdict = verify_prov.verify_bbox(img_path, ann["bbox"], cat_name).verdict
            except Exception:
                verdict = "ERROR"

            v_counts[verdict] = v_counts.get(verdict, 0) + 1
            verify_results.append({
                "ann_id": ann["id"], "category": cat_name,
                "verdict": verdict,
            })

            if i % 10 == 0 or i == len(anns_to_verify):
                eta = _pipeline_eta_minutes(i, len(anns_to_verify), t0)
                print(f" [{i:4d}/{len(anns_to_verify)}] YES={v_counts['YES']} NO={v_counts['NO']} ETA {eta:.1f} min")

        out_dir = os.path.join(exp, f"verify_{verify_backend}")
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(out_dir, "verified.json"), "w") as f:
            json.dump({"backend": verify_backend, "counts": v_counts,
                       "results": verify_results}, f, indent=2)
        approve = v_counts.get("YES", 0) / max(1, len(verify_results))
        print(f" Approval: {v_counts.get('YES',0)}/{len(verify_results)} ({approve:.0%})")
    else:
        print(" Skipped (no provider or no detections)")

    # ── SCORE ──
    print("\n" + "=" * 50)
    print(">>> QUALITY SCORE")
    print("=" * 50)
    score = score_coco(coco)
    print(f" Images: {score.total_images}")
    print(f" Annotations: {score.total_annotations}")
    print(f" Pass rate: {score.pass_rate:.0%}")
    print(f" Mean score: {score.mean_score:.3f}")
    for rule, rate in sorted(score.rule_breakdown.items()):
        flag = "ok" if rate >= 0.95 else "WARN"
        print(f" {rule:20s} {rate:6.1%} {flag}")

    # Report "skipped" rather than a misleading "0/?" when verify did not run.
    if verify_results:
        verify_summary = f"{v_counts.get('YES', 0)}/{len(verify_results)} approved"
    else:
        verify_summary = "skipped"

    print(f"\n{'=' * 70}")
    print(f"PIPELINE DONE — {exp}")
    print(f" COCO: {coco_path}")
    print(f" {len(coco['images'])} images, {n_total_dets} bboxes, "
          f"filter={counts['YES']}/{len(images)} YES, "
          f"verify={verify_summary}")
    print(f"{'=' * 70}")
1010
 
1011
 
1012
  def cmd_list(args):
 
1152
  sl.add_argument("--experiment", default=None)
1153
  sl.add_argument("--limit", type=int, default=0)
1154
 
1155
+ spi = sub.add_parser("pipeline", help="Full chain: gather → filter → label → verify → score")
1156
  spi.add_argument("--project", required=True)
1157
  spi.add_argument("--max-per-query", type=int, default=20)
1158
  spi.add_argument("--workers", type=int, default=50)
1159
  spi.add_argument("--experiment", default=None)
1160
  spi.add_argument("--limit", type=int, default=0)
1161
+ spi.add_argument("--skip-gather", action="store_true",
1162
+ help="Skip image gathering (use existing images)")
1163
+ spi.add_argument("--label-backend", default=None,
1164
+ help="Backend for bbox labeling (falcon, openrouter, etc.)")
1165
+ spi.add_argument("--verify-backend", default=None,
1166
+ help="Backend for per-bbox verification")
1167
  add_backend_flag(spi)
1168
 
1169
+ sv = sub.add_parser("verify", help="Verify bboxes in a COCO file via VLM")
1170
+ sv.add_argument("--project", required=True)
1171
+ sv.add_argument("--experiment", default=None)
1172
+ sv.add_argument("--limit", type=int, default=0)
1173
+ add_backend_flag(sv)
1174
+
1175
  sl2 = sub.add_parser("label-v2", help="Label via provider registry (falcon, wilddet3d, chandra)")
1176
  sl2.add_argument("--project", required=True)
1177
  sl2.add_argument("--backend", default=None,
 
1231
  "filter": cmd_filter,
1232
  "label": cmd_label,
1233
  "label-v2": cmd_label_v2,
1234
+ "verify": cmd_verify_v2,
1235
  "pipeline": cmd_pipeline,
1236
  "list": cmd_list,
1237
  "providers": cmd_providers,
data_label_factory/providers/openrouter.py CHANGED
@@ -38,6 +38,8 @@ import time
38
  import urllib.request
39
  from typing import Any
40
 
 
 
41
  from . import Provider, FilterResult, VerifyResult, LabelResult, register_provider
42
 
43
 
@@ -242,3 +244,62 @@ class OpenRouterProvider(Provider):
242
  elapsed=elapsed,
243
  confidence=conf,
244
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  import urllib.request
39
  from typing import Any
40
 
41
+ import re as _re
42
+
43
  from . import Provider, FilterResult, VerifyResult, LabelResult, register_provider
44
 
45
 
 
244
  elapsed=elapsed,
245
  confidence=conf,
246
  )
247
+
248
def label_image(self, image_path: str, queries: list[str],
                image_wh: tuple[int, int] | None = None) -> LabelResult:
    """Bbox detection via Gemma 4 vision grounding.

    Prompts the model once per query to return bounding boxes as
    ``[ymin, xmin, ymax, xmax]`` normalized to 0-1000, then converts every
    parsed box to pixel-space ``[x, y, width, height]``.

    Args:
        image_path: Path of the image to send to the model.
        queries: Object descriptions to detect; one model call per query.
        image_wh: ``(width, height)`` in pixels. Read from the image file
            when omitted.

    Returns:
        A ``LabelResult`` holding all parsed annotations, the summed call
        time, and metadata with the model name. When a query call fails,
        the metadata also carries a ``failed_queries`` list.
    """
    if image_wh is None:
        from PIL import Image
        # Context manager so the file handle is released immediately.
        with Image.open(image_path) as im:
            image_wh = im.size

    iw, ih = image_wh
    # Hoisted: the pattern is the same for every line of every answer.
    bbox_pattern = _re.compile(r'\[(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\]')
    all_annotations = []
    failed_queries = []
    total_elapsed = 0.0

    for query in queries:
        prompt = (
            f"Detect all instances of \"{query}\" in this image. "
            f"For each instance, return a bounding box as [ymin, xmin, ymax, xmax] "
            f"with coordinates normalized from 0 to 1000. "
            f"Format each detection on its own line as: "
            f"[ymin, xmin, ymax, xmax] label\n"
            f"If none found, say NONE."
        )

        try:
            answer, elapsed, _ = self._call(image_path, prompt, max_tokens=512, timeout=30)
            answer = _strip_thinking(answer)
            total_elapsed += elapsed
        except Exception as e:
            # Best-effort per query, but keep a trace instead of dropping it.
            failed_queries.append({"query": query, "error": str(e)})
            continue

        # Parse bbox lines: [y1, x1, y2, x2] label
        for line in answer.split("\n"):
            match = bbox_pattern.search(line)
            if not match:
                continue
            # Clamp to the documented 0-1000 range so a sloppy model answer
            # cannot produce a box that extends past the image.
            ymin, xmin, ymax, xmax = (min(int(g), 1000) for g in match.groups())
            y1 = ymin / 1000.0 * ih
            x1 = xmin / 1000.0 * iw
            y2 = ymax / 1000.0 * ih
            x2 = xmax / 1000.0 * iw
            w = x2 - x1
            h = y2 - y1
            # Degenerate or inverted boxes are dropped.
            if w > 0 and h > 0:
                all_annotations.append({
                    "bbox": [round(x1, 2), round(y1, 2), round(w, 2), round(h, 2)],
                    "category": query,
                    "score": 0.8,
                    "source": "openrouter",
                })

    metadata = {"model": self._model()}
    if failed_queries:
        metadata["failed_queries"] = failed_queries

    return LabelResult(
        annotations=all_annotations,
        elapsed=total_elapsed,
        metadata=metadata,
    )
data_label_factory/serve.py CHANGED
@@ -197,6 +197,57 @@ async def label_image(
197
  os.unlink(tmp_path)
198
 
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  # ─── Verify ────────────────────────────────────────────────
201
 
202
  @app.post("/api/verify")
 
197
  os.unlink(tmp_path)
198
 
199
 
200
+ # ─── Ask (free-form VLM question) ─────────────────────────
201
+
202
@app.post("/api/ask")
async def ask_image(
    image: UploadFile = File(...),
    question: str = Form(default="What do you see in this image?"),
    backend: str = Form(default="gemma"),
):
    """Ask a free-form question about an image via any VLM backend.

    The upload is written to a temporary file under ``UPLOAD_DIR``, passed
    to the selected provider, and removed again whatever the outcome.

    Args:
        image: Uploaded image file.
        question: Free-form question forwarded to the provider.
        backend: Provider name passed to ``create_provider``.

    Returns:
        A dict with ``answer``, ``elapsed`` (rounded to 2 decimals),
        ``backend`` and ``question``.

    Raises:
        HTTPException: Status 500 carrying the error text when anything fails.
    """
    from .providers import create_provider

    # The filename may be absent on some uploads; fall back to ".jpg".
    suffix = Path(image.filename or "").suffix or ".jpg"
    tmp_path = None

    try:
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False, dir=str(UPLOAD_DIR)) as f:
            # Record the path first so cleanup still runs if the read fails.
            tmp_path = f.name
            f.write(await image.read())

        provider = create_provider(backend)
        # Use _call for richer answers (more tokens than filter's 32)
        if hasattr(provider, '_call'):
            call_result = provider._call(tmp_path, question, max_tokens=256)
            # Some providers return (text, elapsed), others (text, elapsed, usage)
            answer, elapsed = call_result[0], call_result[1]
        else:
            result = provider.filter_image(tmp_path, question)
            answer = result.raw_answer
            elapsed = result.elapsed

        answer = answer or ""

        # Strip thinking tokens
        # NOTE(review): this checks for a provider attribute but then uses the
        # module-level helper from .providers.gemma; confirm which providers
        # actually expose `_strip_thinking`.
        if hasattr(provider, '_strip_thinking'):
            from .providers.gemma import _strip_thinking
            answer = _strip_thinking(answer)
        elif 'thought' in answer.lower()[:20]:
            import re
            answer = re.sub(r'^(?:thought\s*\n?\s*)+', '', answer, flags=re.IGNORECASE).strip()

        return {
            "answer": answer,
            "elapsed": round(elapsed, 2),
            "backend": backend,
            "question": question,
        }
    except Exception as e:
        raise HTTPException(500, str(e)) from e
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
249
+
250
+
251
  # ─── Verify ────────────────────────────────────────────────
252
 
253
  @app.post("/api/verify")
web/app/api/dlf/route.ts CHANGED
@@ -16,11 +16,16 @@ export async function GET(req: NextRequest) {
16
  export async function POST(req: NextRequest) {
17
  const path = req.nextUrl.searchParams.get("path") || "/api/filter";
18
  try {
19
- const formData = await req.formData();
 
 
 
20
  const res = await fetch(`${DLF_API}${path}`, {
21
  method: "POST",
22
- body: formData,
 
23
  });
 
24
  const data = await res.json();
25
  return NextResponse.json(data);
26
  } catch (e: any) {
 
16
  export async function POST(req: NextRequest) {
17
  const path = req.nextUrl.searchParams.get("path") || "/api/filter";
18
  try {
19
+ // Forward the raw request body + content-type header to preserve multipart boundaries
20
+ const contentType = req.headers.get("content-type") || "";
21
+ const body = await req.arrayBuffer();
22
+
23
  const res = await fetch(`${DLF_API}${path}`, {
24
  method: "POST",
25
+ headers: { "Content-Type": contentType },
26
+ body: body,
27
  });
28
+
29
  const data = await res.json();
30
  return NextResponse.json(data);
31
  } catch (e: any) {
web/app/label/page.tsx CHANGED
@@ -62,6 +62,8 @@ export default function LabelPage() {
62
  const [loading, setLoading] = useState(false);
63
  const [loadingMsg, setLoadingMsg] = useState("");
64
  const [apiStatus, setApiStatus] = useState<"checking" | "up" | "down">("checking");
 
 
65
  const canvasRef = useRef<HTMLCanvasElement>(null);
66
  const fileInputRef = useRef<HTMLInputElement>(null);
67
 
@@ -172,6 +174,51 @@ export default function LabelPage() {
172
  setLoadingMsg("");
173
  };
174
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  // Draw bboxes on canvas
176
  const drawAnnotations = (idx: number, result: LabelResult) => {
177
  const canvas = canvasRef.current;
@@ -298,13 +345,13 @@ export default function LabelPage() {
298
  {/* Description */}
299
  <div>
300
  <label className="block text-sm font-medium text-zinc-400 mb-1">
301
- What are you labeling?
302
  </label>
303
  <input
304
  type="text"
305
  value={description}
306
  onChange={(e) => setDescription(e.target.value)}
307
- placeholder="e.g. stop signs, fire hydrants, trading cards..."
308
  className="w-full px-4 py-3 rounded-lg bg-zinc-900 border border-zinc-700 text-zinc-100 placeholder:text-zinc-600 focus:border-blue-500 focus:outline-none"
309
  />
310
  </div>
@@ -394,8 +441,26 @@ export default function LabelPage() {
394
  >
395
  {loading ? loadingMsg : "Filter All"}
396
  </button>
 
 
 
 
 
 
 
397
  </div>
398
 
 
 
 
 
 
 
 
 
 
 
 
399
  {/* Filter summary */}
400
  {filterResults.length > 0 && (
401
  <div className="bg-zinc-900 rounded-lg p-4 border border-zinc-800">
 
62
  const [loading, setLoading] = useState(false);
63
  const [loadingMsg, setLoadingMsg] = useState("");
64
  const [apiStatus, setApiStatus] = useState<"checking" | "up" | "down">("checking");
65
+ const [askAnswer, setAskAnswer] = useState<string | null>(null);
66
+ const [askElapsed, setAskElapsed] = useState(0);
67
  const canvasRef = useRef<HTMLCanvasElement>(null);
68
  const fileInputRef = useRef<HTMLInputElement>(null);
69
 
 
174
  setLoadingMsg("");
175
  };
176
 
177
+ // Ask AI — runs BOTH question answering AND bbox detection in parallel
178
const askAI = async () => {
  if (selectedImage === null || !files[selectedImage] || !description) return;

  const file = files[selectedImage];

  setLoading(true);
  setLoadingMsg("Asking AI + detecting objects...");
  setAskAnswer(null);
  // Reset so a failed request does not show the previous request's timing.
  setAskElapsed(0);

  try {
    // Free-form question goes to /api/ask with the description verbatim.
    const askForm = new FormData();
    askForm.append("image", file);
    askForm.append("question", description);
    askForm.append("backend", filterBackend);

    // Detection query: the description with question phrasing removed.
    const labelForm = new FormData();
    labelForm.append("image", file);
    labelForm.append(
      "queries",
      description.replace(/\?/g, "").replace(/how many /gi, "").trim(),
    );
    labelForm.append("backend", labelBackend);

    const post = (path: string, body: FormData) =>
      fetch(`${DLF_API}?path=${path}`, { method: "POST", body }).then((r) => r.json());

    // allSettled: one failing request must not discard the other's result.
    const [askRes, labelRes] = await Promise.allSettled([
      post("/api/ask", askForm),
      post("/api/label", labelForm),
    ]);

    // Process ask result
    if (askRes.status === "fulfilled") {
      const data = askRes.value;
      setAskAnswer(data?.answer || data?.error || "No response");
      setAskElapsed(data?.elapsed || 0);
    } else {
      setAskAnswer(`Error: ${askRes.reason}`);
    }

    // Process label result — draw bboxes
    if (labelRes.status === "fulfilled" && labelRes.value?.annotations) {
      const data = labelRes.value as LabelResult;
      setLabelResults((prev) => new Map(prev).set(file.name, data));
      drawAnnotations(selectedImage, data);
    }
  } finally {
    // Always release the loading state, even if processing above throws.
    setLoading(false);
    setLoadingMsg("");
  }
};
221
+
222
  // Draw bboxes on canvas
223
  const drawAnnotations = (idx: number, result: LabelResult) => {
224
  const canvas = canvasRef.current;
 
345
  {/* Description */}
346
  <div>
347
  <label className="block text-sm font-medium text-zinc-400 mb-1">
348
+ Target object or question
349
  </label>
350
  <input
351
  type="text"
352
  value={description}
353
  onChange={(e) => setDescription(e.target.value)}
354
+ placeholder="e.g. stop signs, fire hydrants, or ask: how many birds?"
355
  className="w-full px-4 py-3 rounded-lg bg-zinc-900 border border-zinc-700 text-zinc-100 placeholder:text-zinc-600 focus:border-blue-500 focus:outline-none"
356
  />
357
  </div>
 
441
  >
442
  {loading ? loadingMsg : "Filter All"}
443
  </button>
444
+ <button
445
+ onClick={askAI}
446
+ disabled={loading || !files.length || !description || selectedImage === null}
447
+ className="flex-1 px-4 py-3 bg-purple-600 hover:bg-purple-500 disabled:bg-zinc-800 disabled:text-zinc-600 rounded-lg font-medium transition-colors"
448
+ >
449
+ Ask AI
450
+ </button>
451
  </div>
452
 
453
+ {/* Ask AI answer */}
454
+ {askAnswer && (
455
+ <div className="bg-zinc-900 rounded-lg p-4 border border-purple-500/30">
456
+ <div className="flex justify-between text-sm mb-2">
457
+ <span className="text-purple-400 font-medium">AI Answer</span>
458
+ <span className="text-zinc-500">{askElapsed}s</span>
459
+ </div>
460
+ <p className="text-zinc-200 text-sm whitespace-pre-wrap">{askAnswer}</p>
461
+ </div>
462
+ )}
463
+
464
  {/* Filter summary */}
465
  {filterResults.length > 0 && (
466
  <div className="bg-zinc-900 rounded-lg p-4 border border-zinc-800">