HomesteaderLabs committed on
Commit
7483ac2
·
verified ·
1 Parent(s): 926a9f1

Capture router-rejected uploads into private review queue (no data lost)

Browse files
Files changed (1) hide show
  1. game/datastore.py +44 -25
game/datastore.py CHANGED
@@ -19,7 +19,8 @@ import datetime
19
  import json
20
  import os
21
 
22
- DATASET_REPO = "HomesteaderLabs/forager-sightings"
 
23
  LICENSE = "CC-BY-4.0"
24
 
25
  _TOKEN = os.environ.get("HF_TOKEN")
@@ -120,21 +121,18 @@ def _hamming(a: str, b: str) -> int:
120
  return bin(int(a, 16) ^ int(b, 16)).count("1")
121
 
122
 
123
- def append_sighting(image, user_label: str, machine: dict, contributor: str) -> str:
124
- """
125
- Commit one contributed photo + metadata row to the dataset, with near-dup
126
- rejection. `machine` carries the model's call.
127
- Returns: "stored" | "duplicate" | "disabled".
128
- """
129
  if not _TOKEN:
130
  return "disabled"
131
  from io import BytesIO
132
  from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
133
 
134
- # read existing metadata (+ its phashes) once
135
  existing, rows = "", []
136
  try:
137
- with open(hf_hub_download(DATASET_REPO, "metadata.jsonl", repo_type="dataset",
138
  token=_TOKEN, force_download=True)) as f:
139
  txt = f.read()
140
  existing = txt.rstrip("\n")
@@ -152,24 +150,45 @@ def append_sighting(image, user_label: str, machine: dict, contributor: str) ->
152
  fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
153
  buf = BytesIO()
154
  image.convert("RGB").save(buf, format="JPEG", quality=90)
155
- row = {
156
- "file_name": fname, "user_label": user_label,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  "machine_prediction": machine.get("species", "unknown"),
158
  "machine_confidence": round(float(machine.get("confidence", 0.0)), 4),
159
  "machine_abstained": bool(machine.get("abstained", True)),
160
  "machine_safety": machine.get("safety", "UNKNOWN"),
161
  "routed_domain": machine.get("domain", "unknown"),
162
- "contributor": contributor, "consent": True, "license": LICENSE,
163
- "timestamp": ts, "phash": ph,
164
- }
165
- new_meta = (existing + "\n" if existing else "") + json.dumps(row) + "\n"
166
- HfApi(token=_TOKEN).create_commit(
167
- repo_id=DATASET_REPO, repo_type="dataset",
168
- commit_message=f"sighting: {row['machine_prediction']} by {contributor}",
169
- operations=[
170
- CommitOperationAdd(path_in_repo=fname, path_or_fileobj=buf.getvalue()),
171
- CommitOperationAdd(path_in_repo="metadata.jsonl",
172
- path_or_fileobj=new_meta.encode("utf-8")),
173
- ],
174
- )
175
- return "stored"
 
19
  import json
20
  import os
21
 
22
+ DATASET_REPO = "HomesteaderLabs/forager-sightings" # public: in-domain finds
23
+ REVIEW_REPO = "HomesteaderLabs/forager-sightings-review" # private: router-rejected quarantine
24
  LICENSE = "CC-BY-4.0"
25
 
26
  _TOKEN = os.environ.get("HF_TOKEN")
 
121
  return bin(int(a, 16) ^ int(b, 16)).count("1")
122
 
123
 
124
+ def _store(repo: str, image, base_row: dict, contributor: str, msg: str) -> str:
125
+ """Dedup against `repo`'s metadata.jsonl, then commit image + metadata row.
126
+ Returns "stored" | "duplicate" | "disabled". Used by both the public sightings
127
+ write and the private review-queue write."""
 
 
128
  if not _TOKEN:
129
  return "disabled"
130
  from io import BytesIO
131
  from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
132
 
 
133
  existing, rows = "", []
134
  try:
135
+ with open(hf_hub_download(repo, "metadata.jsonl", repo_type="dataset",
136
  token=_TOKEN, force_download=True)) as f:
137
  txt = f.read()
138
  existing = txt.rstrip("\n")
 
150
  fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
151
  buf = BytesIO()
152
  image.convert("RGB").save(buf, format="JPEG", quality=90)
153
+ row = {"file_name": fname, **base_row, "contributor": contributor,
154
+ "consent": True, "license": LICENSE, "timestamp": ts, "phash": ph}
155
+ new_meta = (existing + "\n" if existing else "") + json.dumps(row) + "\n"
156
+ try:
157
+ HfApi(token=_TOKEN).create_commit(
158
+ repo_id=repo, repo_type="dataset", commit_message=msg,
159
+ operations=[
160
+ CommitOperationAdd(path_in_repo=fname, path_or_fileobj=buf.getvalue()),
161
+ CommitOperationAdd(path_in_repo="metadata.jsonl",
162
+ path_or_fileobj=new_meta.encode("utf-8")),
163
+ ],
164
+ )
165
+ except Exception:
166
+ # e.g. the Space token lacks write scope on this repo — degrade gracefully
167
+ # so the UI shows a friendly message instead of crashing the handler.
168
+ return "disabled"
169
+ return "stored"
170
+
171
+
172
def append_sighting(image, user_label: str, machine: dict, contributor: str) -> str:
    """Record an in-domain find in the public sightings dataset.

    Assembles the sighting-specific metadata fields from the user's label and
    the model's call in `machine` (missing keys fall back to the defaults
    below), then delegates the write to `_store` against `DATASET_REPO`.

    Returns the status string produced by `_store`:
    "stored", "duplicate" or "disabled".
    """
    species = machine.get("species", "unknown")
    confidence = float(machine.get("confidence", 0.0))
    abstained = bool(machine.get("abstained", True))

    sighting_row = {
        "user_label": user_label,
        "machine_prediction": species,
        "machine_confidence": round(confidence, 4),
        "machine_abstained": abstained,
        "machine_safety": machine.get("safety", "UNKNOWN"),
        "routed_domain": machine.get("domain", "unknown"),
    }
    commit_msg = f"sighting: {species} by {contributor}"
    return _store(DATASET_REPO, image, sighting_row, contributor, commit_msg)
182
+
183
+
184
def append_unrouted(image, user_label: str, router: dict, contributor: str) -> str:
    """Queue a router-rejected (out-of-domain) find in the private review repo.

    These uploads are kept for later triage so that real forageables the
    router fumbles are captured rather than dropped. The row is tagged with
    status "unrouted" and carries the router's verdict from `router`
    (missing keys fall back to the defaults below); the write itself is
    delegated to `_store` against `REVIEW_REPO`.

    Returns the status string produced by `_store`:
    "stored", "duplicate" or "disabled".
    """
    domain_confidence = float(router.get("domain_confidence", 0.0))

    review_row = {
        "status": "unrouted",
        "user_label": user_label,
        "router_domain": router.get("domain", "unknown"),
        "router_confidence": round(domain_confidence, 4),
        "reason": router.get("reason", ""),
    }
    commit_msg = f"review: {user_label} by {contributor}"
    return _store(REVIEW_REPO, image, review_row, contributor, commit_msg)