Phase 3: near-duplicate (dHash) rejection on contributed photos
Browse files- game/datastore.py +51 -20
game/datastore.py
CHANGED
|
@@ -96,17 +96,57 @@ def load_contributors() -> list[dict]:
|
|
| 96 |
return []
|
| 97 |
|
| 98 |
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
"""
|
| 101 |
-
|
| 102 |
-
`machine` carries the model's call
|
| 103 |
-
Returns
|
| 104 |
-
persistence_enabled() first.
|
| 105 |
"""
|
| 106 |
if not _TOKEN:
|
| 107 |
-
return
|
| 108 |
from io import BytesIO
|
| 109 |
-
from huggingface_hub import CommitOperationAdd, HfApi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
ts = _now()
|
| 112 |
fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
|
|
@@ -119,20 +159,11 @@ def append_sighting(image, user_label: str, machine: dict, contributor: str) ->
|
|
| 119 |
"machine_abstained": bool(machine.get("abstained", True)),
|
| 120 |
"machine_safety": machine.get("safety", "UNKNOWN"),
|
| 121 |
"routed_domain": machine.get("domain", "unknown"),
|
| 122 |
-
"contributor": contributor, "consent": True, "license": LICENSE,
|
|
|
|
| 123 |
}
|
| 124 |
-
api = HfApi(token=_TOKEN)
|
| 125 |
-
# append the metadata line to the existing jsonl
|
| 126 |
-
existing = ""
|
| 127 |
-
try:
|
| 128 |
-
from huggingface_hub import hf_hub_download
|
| 129 |
-
with open(hf_hub_download(DATASET_REPO, "metadata.jsonl", repo_type="dataset",
|
| 130 |
-
token=_TOKEN, force_download=True)) as f:
|
| 131 |
-
existing = f.read().rstrip("\n")
|
| 132 |
-
except Exception:
|
| 133 |
-
pass
|
| 134 |
new_meta = (existing + "\n" if existing else "") + json.dumps(row) + "\n"
|
| 135 |
-
|
| 136 |
repo_id=DATASET_REPO, repo_type="dataset",
|
| 137 |
commit_message=f"sighting: {row['machine_prediction']} by {contributor}",
|
| 138 |
operations=[
|
|
@@ -141,4 +172,4 @@ def append_sighting(image, user_label: str, machine: dict, contributor: str) ->
|
|
| 141 |
path_or_fileobj=new_meta.encode("utf-8")),
|
| 142 |
],
|
| 143 |
)
|
| 144 |
-
return
|
|
|
|
| 96 |
return []
|
| 97 |
|
| 98 |
|
| 99 |
+
# Near-duplicate detection via perceptual hash (dHash). Catches the same photo
|
| 100 |
+
# re-saved/resized/re-compressed, not just byte-identical files — which is the
|
| 101 |
+
# realistic abuse/pollution case (re-uploading a popular web image, gaming the
|
| 102 |
+
# contributor board, accidental double-submits). PIL + numpy only, no new dep.
|
| 103 |
+
DUP_HAMMING_THRESHOLD = 5 # <=5 of 64 bits differ => treat as the same image
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def compute_phash(image) -> str:
    """Return the 64-bit difference hash (dHash) of `image` as 16 hex chars.

    `image` must provide PIL-style `convert` and `resize` methods. It is
    converted to grayscale ("L") and resized to 9 wide by 8 high; each bit
    records whether a pixel is strictly brighter than its left-hand neighbour
    in the same row. The 64 bits are packed most-significant-bit first, in
    row-major order, and rendered as zero-padded lowercase hex.
    """
    import numpy as np
    from PIL import Image

    thumb = image.convert("L").resize((9, 8), Image.BILINEAR)
    gray = np.asarray(thumb, dtype=np.int16)
    # Compare each pixel with the one to its left: 8 rows x 8 comparisons.
    brighter = gray[:, 1:] > gray[:, :-1]
    # packbits is MSB-first by default, so the first comparison ends up as
    # the highest bit of the resulting integer.
    packed = np.packbits(brighter.flatten())
    return f"{int.from_bytes(packed.tobytes(), 'big'):016x}"
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _hamming(a: str, b: str) -> int:
|
| 120 |
+
return bin(int(a, 16) ^ int(b, 16)).count("1")
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def append_sighting(image, user_label: str, machine: dict, contributor: str) -> str:
|
| 124 |
"""
|
| 125 |
+
Commit one contributed photo + metadata row to the dataset, with near-dup
|
| 126 |
+
rejection. `machine` carries the model's call.
|
| 127 |
+
Returns: "stored" | "duplicate" | "disabled".
|
|
|
|
| 128 |
"""
|
| 129 |
if not _TOKEN:
|
| 130 |
+
return "disabled"
|
| 131 |
from io import BytesIO
|
| 132 |
+
from huggingface_hub import CommitOperationAdd, HfApi, hf_hub_download
|
| 133 |
+
|
| 134 |
+
# read existing metadata (+ its phashes) once
|
| 135 |
+
existing, rows = "", []
|
| 136 |
+
try:
|
| 137 |
+
with open(hf_hub_download(DATASET_REPO, "metadata.jsonl", repo_type="dataset",
|
| 138 |
+
token=_TOKEN, force_download=True)) as f:
|
| 139 |
+
txt = f.read()
|
| 140 |
+
existing = txt.rstrip("\n")
|
| 141 |
+
rows = [json.loads(line) for line in txt.splitlines() if line.strip()]
|
| 142 |
+
except Exception:
|
| 143 |
+
pass
|
| 144 |
+
|
| 145 |
+
ph = compute_phash(image)
|
| 146 |
+
for r in rows:
|
| 147 |
+
h = r.get("phash")
|
| 148 |
+
if h and _hamming(h, ph) <= DUP_HAMMING_THRESHOLD:
|
| 149 |
+
return "duplicate"
|
| 150 |
|
| 151 |
ts = _now()
|
| 152 |
fname = f"images/{contributor}_{ts.replace(':', '').replace('-', '')}.jpg"
|
|
|
|
| 159 |
"machine_abstained": bool(machine.get("abstained", True)),
|
| 160 |
"machine_safety": machine.get("safety", "UNKNOWN"),
|
| 161 |
"routed_domain": machine.get("domain", "unknown"),
|
| 162 |
+
"contributor": contributor, "consent": True, "license": LICENSE,
|
| 163 |
+
"timestamp": ts, "phash": ph,
|
| 164 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
new_meta = (existing + "\n" if existing else "") + json.dumps(row) + "\n"
|
| 166 |
+
HfApi(token=_TOKEN).create_commit(
|
| 167 |
repo_id=DATASET_REPO, repo_type="dataset",
|
| 168 |
commit_message=f"sighting: {row['machine_prediction']} by {contributor}",
|
| 169 |
operations=[
|
|
|
|
| 172 |
path_or_fileobj=new_meta.encode("utf-8")),
|
| 173 |
],
|
| 174 |
)
|
| 175 |
+
return "stored"
|