Capture router-rejected uploads into private review queue (no data lost)
Browse files- game/datastore.py +44 -25
game/datastore.py
CHANGED
|
@@ -19,7 +19,8 @@ import datetime
|
|
| 19 |
import json
|
| 20 |
import os
|
| 21 |
|
| 22 |
-
DATASET_REPO = "HomesteaderLabs/forager-sightings"
|
|
|
|
| 23 |
LICENSE = "CC-BY-4.0"
|
| 24 |
|
| 25 |
_TOKEN = os.environ.get("HF_TOKEN")
|
|
@@ -120,21 +121,18 @@ def _hamming(a: str, b: str) -> int:
|
|
| 120 |
return bin(int(a, 16) ^ int(b, 16)).count("1")
|
| 121 |
|
| 122 |
|
| 123 |
-
def
|
| 124 |
-
"""
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
Returns: "stored" | "duplicate" | "disabled".
|
| 128 |
-
"""
|
| 129 |
if not _TOKEN:
|
| 130 |
return "disabled"
|
| 131 |
from io import BytesIO
|
| 132 |
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
|
| 133 |
|
| 134 |
-
# read existing metadata (+ its phashes) once
|
| 135 |
existing, rows = "", []
|
| 136 |
try:
|
| 137 |
-
with open(hf_hub_download(
|
| 138 |
token=_TOKEN, force_download=True)) as f:
|
| 139 |
txt = f.read()
|
| 140 |
existing = txt.rstrip("\n")
|
|
@@ -152,24 +150,45 @@ def append_sighting(image, user_label: str, machine: dict, contributor: str) ->
|
|
| 152 |
fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
|
| 153 |
buf = BytesIO()
|
| 154 |
image.convert("RGB").save(buf, format="JPEG", quality=90)
|
| 155 |
-
row = {
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
"machine_prediction": machine.get("species", "unknown"),
|
| 158 |
"machine_confidence": round(float(machine.get("confidence", 0.0)), 4),
|
| 159 |
"machine_abstained": bool(machine.get("abstained", True)),
|
| 160 |
"machine_safety": machine.get("safety", "UNKNOWN"),
|
| 161 |
"routed_domain": machine.get("domain", "unknown"),
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
| 19 |
import json
|
| 20 |
import os
|
| 21 |
|
| 22 |
+
DATASET_REPO = "HomesteaderLabs/forager-sightings" # public: in-domain finds
|
| 23 |
+
REVIEW_REPO = "HomesteaderLabs/forager-sightings-review" # private: router-rejected quarantine
|
| 24 |
LICENSE = "CC-BY-4.0"
|
| 25 |
|
| 26 |
_TOKEN = os.environ.get("HF_TOKEN")
|
|
|
|
| 121 |
return bin(int(a, 16) ^ int(b, 16)).count("1")
|
| 122 |
|
| 123 |
|
| 124 |
+
def _store(repo: str, image, base_row: dict, contributor: str, msg: str) -> str:
|
| 125 |
+
"""Dedup against `repo`'s metadata.jsonl, then commit image + metadata row.
|
| 126 |
+
Returns "stored" | "duplicate" | "disabled". Used by both the public sightings
|
| 127 |
+
write and the private review-queue write."""
|
|
|
|
|
|
|
| 128 |
if not _TOKEN:
|
| 129 |
return "disabled"
|
| 130 |
from io import BytesIO
|
| 131 |
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
|
| 132 |
|
|
|
|
| 133 |
existing, rows = "", []
|
| 134 |
try:
|
| 135 |
+
with open(hf_hub_download(repo, "metadata.jsonl", repo_type="dataset",
|
| 136 |
token=_TOKEN, force_download=True)) as f:
|
| 137 |
txt = f.read()
|
| 138 |
existing = txt.rstrip("\n")
|
|
|
|
| 150 |
fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
|
| 151 |
buf = BytesIO()
|
| 152 |
image.convert("RGB").save(buf, format="JPEG", quality=90)
|
| 153 |
+
row = {"file_name": fname, **base_row, "contributor": contributor,
|
| 154 |
+
"consent": True, "license": LICENSE, "timestamp": ts, "phash": ph}
|
| 155 |
+
new_meta = (existing + "\n" if existing else "") + json.dumps(row) + "\n"
|
| 156 |
+
try:
|
| 157 |
+
HfApi(token=_TOKEN).create_commit(
|
| 158 |
+
repo_id=repo, repo_type="dataset", commit_message=msg,
|
| 159 |
+
operations=[
|
| 160 |
+
CommitOperationAdd(path_in_repo=fname, path_or_fileobj=buf.getvalue()),
|
| 161 |
+
CommitOperationAdd(path_in_repo="metadata.jsonl",
|
| 162 |
+
path_or_fileobj=new_meta.encode("utf-8")),
|
| 163 |
+
],
|
| 164 |
+
)
|
| 165 |
+
except Exception:
|
| 166 |
+
# e.g. the Space token lacks write scope on this repo — degrade gracefully
|
| 167 |
+
# so the UI shows a friendly message instead of crashing the handler.
|
| 168 |
+
return "disabled"
|
| 169 |
+
return "stored"
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
def append_sighting(image, user_label: str, machine: dict, contributor: str) -> str:
    """Record an in-domain find in the public sightings dataset.

    Builds the metadata row from the contributor's label and the classifier
    output, then delegates the dedup + commit to ``_store`` targeting
    ``DATASET_REPO``.

    Args:
        image: Image to upload; passed through to ``_store`` unchanged.
        user_label: Label supplied by the contributor.
        machine: Classifier output. The optional keys ``species``,
            ``confidence``, ``abstained``, ``safety`` and ``domain`` are read;
            missing keys fall back to conservative defaults.
        contributor: Contributor identifier, stored in the row and used in the
            commit message.

    Returns:
        The status string returned by ``_store``:
        "stored", "duplicate" or "disabled".
    """
    species = machine.get("species", "unknown")
    # A present-but-null or non-numeric confidence must not crash the upload
    # handler; treat it the same as a missing value.
    try:
        confidence = round(float(machine.get("confidence", 0.0)), 4)
    except (TypeError, ValueError):
        confidence = 0.0
    base_row = {
        "user_label": user_label,
        "machine_prediction": species,
        "machine_confidence": confidence,
        "machine_abstained": bool(machine.get("abstained", True)),
        "machine_safety": machine.get("safety", "UNKNOWN"),
        "routed_domain": machine.get("domain", "unknown"),
    }
    return _store(
        DATASET_REPO,
        image,
        base_row,
        contributor,
        f"sighting: {species} by {contributor}",
    )
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def append_unrouted(image, user_label: str, router: dict, contributor: str) -> str:
    """Queue a router-rejected (out-of-domain) find in the private review repo.

    Builds a row tagged ``"status": "unrouted"`` from the contributor's label
    and the router output, then delegates the dedup + commit to ``_store``
    targeting ``REVIEW_REPO`` so the upload can be triaged later instead of
    being dropped.

    Args:
        image: Image to upload; passed through to ``_store`` unchanged.
        user_label: Label supplied by the contributor; also used in the
            commit message.
        router: Router output. The optional keys ``domain``,
            ``domain_confidence`` and ``reason`` are read; missing keys fall
            back to defaults.
        contributor: Contributor identifier, stored in the row and used in the
            commit message.

    Returns:
        The status string returned by ``_store``:
        "stored", "duplicate" or "disabled".
    """
    # A present-but-null or non-numeric confidence must not crash the upload
    # handler; treat it the same as a missing value.
    try:
        confidence = round(float(router.get("domain_confidence", 0.0)), 4)
    except (TypeError, ValueError):
        confidence = 0.0
    base_row = {
        "status": "unrouted",
        "user_label": user_label,
        "router_domain": router.get("domain", "unknown"),
        "router_confidence": confidence,
        "reason": router.get("reason", ""),
    }
    return _store(
        REVIEW_REPO,
        image,
        base_row,
        contributor,
        f"review: {user_label} by {contributor}",
    )
|