Spaces:
Running
Running
deploy app, storage, readme
Browse files- README.md +21 -6
- storage.py +130 -0
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
emoji: 🐾
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: purple
|
|
@@ -11,11 +11,26 @@ pinned: false
|
|
| 11 |
|
| 12 |
# e621 Image Rater
|
| 13 |
|
| 14 |
-
Pairwise
|
| 15 |
|
| 16 |
## How it works
|
| 17 |
|
| 18 |
-
-
|
| 19 |
-
-
|
| 20 |
-
-
|
| 21 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: e6 Visual Ratings
|
| 3 |
emoji: 🐾
|
| 4 |
colorFrom: indigo
|
| 5 |
colorTo: purple
|
|
|
|
| 11 |
|
| 12 |
# e621 Image Rater
|
| 13 |
|
| 14 |
+
Pairwise image quality voting for e621 images. Pairs are sampled from prebuilt group buckets.
|
| 15 |
|
| 16 |
## How it works
|
| 17 |
|
| 18 |
+
- The app loads `pool.parquet` at startup (from local `data/pool.parquet` in debug mode, or from the dataset repo in HF mode).
|
| 19 |
+
- Each round samples two items from the same `group`, then shows their image URLs and links to e621 post pages.
|
| 20 |
+
- Votes are buffered in memory, then flushed in batches to append-only parquet shards under `ratings_log/`.
|
| 21 |
+
- Local/debug mode defaults: flush every 3 votes or every 15 seconds.
|
| 22 |
+
- HF mode defaults: flush every 50 votes or every 300 seconds.
|
| 23 |
+
|
| 24 |
+
## Vote Log Schema
|
| 25 |
+
|
| 26 |
+
Each vote row stores:
|
| 27 |
+
|
| 28 |
+
- `vote_id`
|
| 29 |
+
- `timestamp` (ISO-8601 UTC)
|
| 30 |
+
- `md5a`
|
| 31 |
+
- `md5b`
|
| 32 |
+
- `winner_md5` (the MD5 of the winning image, or `None` for a tie)
|
| 33 |
+
- `url_a` and `url_b` (e621 post URLs)
|
| 34 |
+
- `dataset`
|
| 35 |
+
- `group`
|
| 36 |
+
- `session_id`
|
storage.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import atexit
|
| 3 |
+
import threading
|
| 4 |
+
import time
|
| 5 |
+
import uuid
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from tempfile import NamedTemporaryFile
|
| 9 |
+
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from huggingface_hub import HfApi
|
| 12 |
+
|
| 13 |
+
# Hugging Face Hub target used when vote shards are uploaded.
VOTES_REPO_ID = "taigasan/e6-visual-ratings"
VOTES_REPO_TYPE = "dataset"

# Directory (inside the repo or the local data dir) that holds the shards.
VOTES_LOG_SUBDIR = "ratings_log"

# Columns written to every vote shard, in this order. Kept as a list because
# it is used directly as a DataFrame column selector.
VOTE_COLUMNS = [
    "vote_id", "timestamp",
    "md5a", "md5b", "winner_md5",
    "url_a", "url_b",
    "dataset", "group", "session_id",
]
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class VoteStorage:
|
| 31 |
+
def __init__(self, mode: str, local_dir: str = "ratings_data"):
|
| 32 |
+
assert mode in ("hf", "local"), f"Unsupported storage mode: {mode}"
|
| 33 |
+
self.mode = mode
|
| 34 |
+
self.local_dir = local_dir
|
| 35 |
+
is_debug_mode = self.mode == "local"
|
| 36 |
+
self._flush_every = 3 if is_debug_mode else 50
|
| 37 |
+
self._flush_interval_sec = 15.0 if is_debug_mode else 300.0
|
| 38 |
+
self._votes_lock = threading.Lock()
|
| 39 |
+
self._votes_buffer: list[dict] = []
|
| 40 |
+
self._stop_event = threading.Event()
|
| 41 |
+
self._flush_thread = threading.Thread(target=self._flush_loop, daemon=True)
|
| 42 |
+
self._flush_thread.start()
|
| 43 |
+
atexit.register(self.close)
|
| 44 |
+
|
| 45 |
+
def _local_votes_path(self) -> Path:
|
| 46 |
+
root = Path(self.local_dir)
|
| 47 |
+
root.mkdir(parents=True, exist_ok=True)
|
| 48 |
+
log_dir = root / VOTES_LOG_SUBDIR
|
| 49 |
+
log_dir.mkdir(parents=True, exist_ok=True)
|
| 50 |
+
return log_dir
|
| 51 |
+
|
| 52 |
+
def _hf_token(self) -> str | None:
|
| 53 |
+
return os.getenv("RATINGS_APP_TOKEN")
|
| 54 |
+
|
| 55 |
+
def _empty_votes_df(self) -> pd.DataFrame:
|
| 56 |
+
return pd.DataFrame(columns=VOTE_COLUMNS)
|
| 57 |
+
|
| 58 |
+
def _upload_votes_batch(self, df: pd.DataFrame, commit_message: str):
|
| 59 |
+
assert set(VOTE_COLUMNS).issubset(df.columns), "Missing vote columns in upload batch"
|
| 60 |
+
ts = int(time.time())
|
| 61 |
+
shard = f"votes_{ts}_{uuid.uuid4().hex}.parquet"
|
| 62 |
+
if self.mode == "local":
|
| 63 |
+
_ = commit_message
|
| 64 |
+
local_path = self._local_votes_path() / shard
|
| 65 |
+
df[VOTE_COLUMNS].to_parquet(local_path, index=False)
|
| 66 |
+
return
|
| 67 |
+
api = HfApi(token=self._hf_token())
|
| 68 |
+
with NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
|
| 69 |
+
tmp_path = tmp.name
|
| 70 |
+
try:
|
| 71 |
+
df[VOTE_COLUMNS].to_parquet(tmp_path, index=False)
|
| 72 |
+
api.upload_file(
|
| 73 |
+
path_or_fileobj=tmp_path,
|
| 74 |
+
path_in_repo=f"{VOTES_LOG_SUBDIR}/{shard}",
|
| 75 |
+
repo_id=VOTES_REPO_ID,
|
| 76 |
+
repo_type=VOTES_REPO_TYPE,
|
| 77 |
+
commit_message=commit_message,
|
| 78 |
+
)
|
| 79 |
+
finally:
|
| 80 |
+
if os.path.exists(tmp_path):
|
| 81 |
+
os.remove(tmp_path)
|
| 82 |
+
|
| 83 |
+
def _flush_votes(self, force: bool = False):
|
| 84 |
+
with self._votes_lock:
|
| 85 |
+
if not self._votes_buffer:
|
| 86 |
+
return
|
| 87 |
+
if not force and len(self._votes_buffer) < self._flush_every:
|
| 88 |
+
return
|
| 89 |
+
batch = list(self._votes_buffer)
|
| 90 |
+
self._votes_buffer.clear()
|
| 91 |
+
incoming = pd.DataFrame(batch)
|
| 92 |
+
for col in VOTE_COLUMNS:
|
| 93 |
+
if col not in incoming.columns:
|
| 94 |
+
incoming[col] = None
|
| 95 |
+
self._upload_votes_batch(incoming[VOTE_COLUMNS], commit_message=f"append {len(batch)} vote rows")
|
| 96 |
+
|
| 97 |
+
def _flush_loop(self):
|
| 98 |
+
while not self._stop_event.wait(self._flush_interval_sec):
|
| 99 |
+
self._flush_votes(force=True)
|
| 100 |
+
|
| 101 |
+
def close(self):
|
| 102 |
+
if self._stop_event.is_set():
|
| 103 |
+
return
|
| 104 |
+
self._stop_event.set()
|
| 105 |
+
self._flush_thread.join(timeout=1.0)
|
| 106 |
+
self._flush_votes(force=True)
|
| 107 |
+
|
| 108 |
+
def append_vote_row(self, state: dict, winner: str | None):
|
| 109 |
+
id_a = int(state["id_a"])
|
| 110 |
+
id_b = int(state["id_b"])
|
| 111 |
+
winner_md5 = None
|
| 112 |
+
if winner == "A":
|
| 113 |
+
winner_md5 = state["key_a"]
|
| 114 |
+
elif winner == "B":
|
| 115 |
+
winner_md5 = state["key_b"]
|
| 116 |
+
vote_row = {
|
| 117 |
+
"vote_id": uuid.uuid4().hex,
|
| 118 |
+
"timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
|
| 119 |
+
"md5a": state["key_a"],
|
| 120 |
+
"md5b": state["key_b"],
|
| 121 |
+
"winner_md5": winner_md5,
|
| 122 |
+
"url_a": f"https://e621.net/posts/{id_a}",
|
| 123 |
+
"url_b": f"https://e621.net/posts/{id_b}",
|
| 124 |
+
"dataset": state["dataset"],
|
| 125 |
+
"group": state["group"],
|
| 126 |
+
"session_id": state["session_id"],
|
| 127 |
+
}
|
| 128 |
+
with self._votes_lock:
|
| 129 |
+
self._votes_buffer.append(vote_row)
|
| 130 |
+
self._flush_votes()
|