HomesteaderLabs committed on
Commit
a10695a
·
verified ·
1 Parent(s): b4bd4a7

Phase 3: near-duplicate (dHash) rejection on contributed photos

Browse files
Files changed (1) hide show
  1. game/datastore.py +51 -20
game/datastore.py CHANGED
@@ -96,17 +96,57 @@ def load_contributors() -> list[dict]:
96
  return []
97
 
98
 
99
- def append_sighting(image, user_label: str, machine: dict, contributor: str) -> bool:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  """
101
- Phase 2: commit one contributed photo + metadata row to the dataset.
102
- `machine` carries the model's call (prediction/confidence/abstained/safety/domain).
103
- Returns True on persisted write. Stub raises if no token so callers gate on
104
- persistence_enabled() first.
105
  """
106
  if not _TOKEN:
107
- return False
108
  from io import BytesIO
109
- from huggingface_hub import CommitOperationAdd, HfApi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  ts = _now()
112
  fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
@@ -119,20 +159,11 @@ def append_sighting(image, user_label: str, machine: dict, contributor: str) ->
119
  "machine_abstained": bool(machine.get("abstained", True)),
120
  "machine_safety": machine.get("safety", "UNKNOWN"),
121
  "routed_domain": machine.get("domain", "unknown"),
122
- "contributor": contributor, "consent": True, "license": LICENSE, "timestamp": ts,
 
123
  }
124
- api = HfApi(token=_TOKEN)
125
- # append the metadata line to the existing jsonl
126
- existing = ""
127
- try:
128
- from huggingface_hub import hf_hub_download
129
- with open(hf_hub_download(DATASET_REPO, "metadata.jsonl", repo_type="dataset",
130
- token=_TOKEN, force_download=True)) as f:
131
- existing = f.read().rstrip("\n")
132
- except Exception:
133
- pass
134
  new_meta = (existing + "\n" if existing else "") + json.dumps(row) + "\n"
135
- api.create_commit(
136
  repo_id=DATASET_REPO, repo_type="dataset",
137
  commit_message=f"sighting: {row['machine_prediction']} by {contributor}",
138
  operations=[
@@ -141,4 +172,4 @@ def append_sighting(image, user_label: str, machine: dict, contributor: str) ->
141
  path_or_fileobj=new_meta.encode("utf-8")),
142
  ],
143
  )
144
- return True
 
96
  return []
97
 
98
 
99
+ # Near-duplicate detection via perceptual hash (dHash). Catches the same photo
100
+ # re-saved/resized/re-compressed, not just byte-identical files — which is the
101
+ # realistic abuse/pollution case (re-uploading a popular web image, gaming the
102
+ # contributor board, accidental double-submits). PIL + numpy only, no new dep.
103
+ DUP_HAMMING_THRESHOLD = 5 # <=5 of 64 bits differ => treat as the same image
104
+
105
+
106
def compute_phash(image) -> str:
    """Return the 64-bit difference hash (dHash) of *image* as 16 hex characters.

    The image is converted to grayscale and shrunk to 9 columns by 8 rows.
    Each bit records whether a pixel is brighter than its left-hand neighbour
    in the same row, giving 8 rows x 8 comparisons = 64 bits. Bits are emitted
    row by row, most significant bit first.

    `image` must provide PIL-style `convert` and `resize` methods.
    """
    import numpy as np
    from PIL import Image

    thumb = image.convert("L").resize((9, 8), Image.BILINEAR)
    # int16 keeps the array signed; only the ordering of neighbours is used.
    pixels = np.asarray(thumb, dtype=np.int16)
    brighter = pixels[:, 1:] > pixels[:, :-1]  # shape (8, 8)
    # packbits is MSB-first by default, so the first comparison lands in the
    # top bit, exactly like a left-to-right shift-and-or accumulation.
    return np.packbits(brighter.flatten()).tobytes().hex()
117
+
118
+
119
def _hamming(a: str, b: str) -> int:
    """Count the bit positions at which two hex-encoded hashes differ."""
    differing = int(a, 16) ^ int(b, 16)
    # Every "1" in the binary rendering of the XOR is one mismatched bit.
    return format(differing, "b").count("1")
121
+
122
+
123
+ def append_sighting(image, user_label: str, machine: dict, contributor: str) -> str:
124
  """
125
+ Commit one contributed photo + metadata row to the dataset, with near-dup
126
+ rejection. `machine` carries the model's call.
127
+ Returns: "stored" | "duplicate" | "disabled".
 
128
  """
129
  if not _TOKEN:
130
+ return "disabled"
131
  from io import BytesIO
132
+ from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
133
+
134
+ # read existing metadata (+ its phashes) once
135
+ existing, rows = "", []
136
+ try:
137
+ with open(hf_hub_download(DATASET_REPO, "metadata.jsonl", repo_type="dataset",
138
+ token=_TOKEN, force_download=True)) as f:
139
+ txt = f.read()
140
+ existing = txt.rstrip("\n")
141
+ rows = [json.loads(line) for line in txt.splitlines() if line.strip()]
142
+ except Exception:
143
+ pass
144
+
145
+ ph = compute_phash(image)
146
+ for r in rows:
147
+ h = r.get("phash")
148
+ if h and _hamming(h, ph) <= DUP_HAMMING_THRESHOLD:
149
+ return "duplicate"
150
 
151
  ts = _now()
152
  fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
 
159
  "machine_abstained": bool(machine.get("abstained", True)),
160
  "machine_safety": machine.get("safety", "UNKNOWN"),
161
  "routed_domain": machine.get("domain", "unknown"),
162
+ "contributor": contributor, "consent": True, "license": LICENSE,
163
+ "timestamp": ts, "phash": ph,
164
  }
 
 
 
 
 
 
 
 
 
 
165
  new_meta = (existing + "\n" if existing else "") + json.dumps(row) + "\n"
166
+ HfApi(token=_TOKEN).create_commit(
167
  repo_id=DATASET_REPO, repo_type="dataset",
168
  commit_message=f"sighting: {row['machine_prediction']} by {contributor}",
169
  operations=[
 
172
  path_or_fileobj=new_meta.encode("utf-8")),
173
  ],
174
  )
175
+ return "stored"