Spaces:

RedRocket
/

e6-visual-ratings

Running

App Files Files Community

taigasan committed on Apr 10

Commit

124cd9f

verified ·

1 Parent(s): 982d7de

deploy app, storage, readme

Browse files

Files changed (2) hide show

README.md +21 -6
storage.py +130 -0

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: e621 Image Rater
 emoji: 🐾
 colorFrom: indigo
 colorTo: purple
@@ -11,11 +11,26 @@ pinned: false
 # e621 Image Rater
-Pairwise ELO rating for e621 images. Each pair shares at least two common tags.
 ## How it works
-- Parquet is loaded at startup with **column projection** (`id`, `md5`, `file_ext`, `tag_string`, `score`, `rating`) — downloads ~20% of the 1.1 GB file instead of all of it.
-- An inverted tag index is built over tags appearing in ≥50 posts.
-- Each round picks a random post, samples two of its common tags, and finds a candidate post that also has both — guaranteeing ≥2 shared tags.
-- ELO ratings are stored in `elo_ratings.json` on disk (resets on Space restart unless you mount persistent storage).

 ---
+title: e6 Visual Ratings
 emoji: 🐾
 colorFrom: indigo
 colorTo: purple
 # e621 Image Rater
+Pairwise image quality voting for e621 images. Pairs are sampled from prebuilt group buckets.
 ## How it works
+- The app loads `pool.parquet` at startup (from local `data/pool.parquet` in debug mode, or from the dataset repo in HF mode).
+- Each round samples two items from the same `group`, then shows their image URLs and links to e621 post pages.
+- Votes are buffered in memory, then flushed in batches to append-only parquet shards under `ratings_log/`.
+- Local/debug mode defaults: flush every 3 votes or every 15 seconds.
+- HF mode defaults: flush every 50 votes or every 300 seconds.
+## Vote Log Schema
+Each vote row stores:
+- `vote_id`
+- `timestamp` (ISO-8601 UTC)
+- `md5a`
+- `md5b`
+- `winner_md5` (md5 of the chosen image, or `None` for a tie)
+- `url_a` and `url_b` (e621 post URLs)
+- `dataset`
+- `group`
+- `session_id`

storage.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import atexit
+import logging
+import os
+import threading
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+
+import pandas as pd
+from huggingface_hub import HfApi
+VOTES_REPO_ID = "taigasan/e6-visual-ratings"  # passed as `repo_id` when a shard is uploaded in "hf" mode
+VOTES_REPO_TYPE = "dataset"  # passed as `repo_type` for the same upload
+VOTES_LOG_SUBDIR = "ratings_log"  # shard folder: under `local_dir` locally, path prefix inside the repo on upload
+VOTE_COLUMNS = [  # columns (and their order) of every written shard; used directly as `df[VOTE_COLUMNS]`
+    "vote_id",  # uuid4 hex generated per vote
+    "timestamp",  # UTC, ISO-8601, second precision
+    "md5a",  # state["key_a"]
+    "md5b",  # state["key_b"]
+    "winner_md5",  # key of the chosen side; None when the winner is neither "A" nor "B"
+    "url_a",  # https://e621.net/posts/<id_a>
+    "url_b",  # https://e621.net/posts/<id_b>
+    "dataset",  # copied from state["dataset"]
+    "group",  # copied from state["group"]
+    "session_id",  # copied from state["session_id"]
+]
+class VoteStorage:
+    def __init__(self, mode: str, local_dir: str = "ratings_data"):
+        assert mode in ("hf", "local"), f"Unsupported storage mode: {mode}"
+        self.mode = mode
+        self.local_dir = local_dir
+        is_debug_mode = self.mode == "local"
+        self._flush_every = 3 if is_debug_mode else 50
+        self._flush_interval_sec = 15.0 if is_debug_mode else 300.0
+        self._votes_lock = threading.Lock()
+        self._votes_buffer: list[dict] = []
+        self._stop_event = threading.Event()
+        self._flush_thread = threading.Thread(target=self._flush_loop, daemon=True)
+        self._flush_thread.start()
+        atexit.register(self.close)
+    def _local_votes_path(self) -> Path:
+        root = Path(self.local_dir)
+        root.mkdir(parents=True, exist_ok=True)
+        log_dir = root / VOTES_LOG_SUBDIR
+        log_dir.mkdir(parents=True, exist_ok=True)
+        return log_dir
+    def _hf_token(self) -> str | None:
+        return os.getenv("RATINGS_APP_TOKEN")
+    def _empty_votes_df(self) -> pd.DataFrame:
+        return pd.DataFrame(columns=VOTE_COLUMNS)
+    def _upload_votes_batch(self, df: pd.DataFrame, commit_message: str):
+        assert set(VOTE_COLUMNS).issubset(df.columns), "Missing vote columns in upload batch"
+        ts = int(time.time())
+        shard = f"votes_{ts}_{uuid.uuid4().hex}.parquet"
+        if self.mode == "local":
+            _ = commit_message
+            local_path = self._local_votes_path() / shard
+            df[VOTE_COLUMNS].to_parquet(local_path, index=False)
+            return
+        api = HfApi(token=self._hf_token())
+        with NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
+            tmp_path = tmp.name
+        try:
+            df[VOTE_COLUMNS].to_parquet(tmp_path, index=False)
+            api.upload_file(
+                path_or_fileobj=tmp_path,
+                path_in_repo=f"{VOTES_LOG_SUBDIR}/{shard}",
+                repo_id=VOTES_REPO_ID,
+                repo_type=VOTES_REPO_TYPE,
+                commit_message=commit_message,
+            )
+        finally:
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+    def _flush_votes(self, force: bool = False):
+        with self._votes_lock:
+            if not self._votes_buffer:
+                return
+            if not force and len(self._votes_buffer) < self._flush_every:
+                return
+            batch = list(self._votes_buffer)
+            self._votes_buffer.clear()
+        incoming = pd.DataFrame(batch)
+        for col in VOTE_COLUMNS:
+            if col not in incoming.columns:
+                incoming[col] = None
+        self._upload_votes_batch(incoming[VOTE_COLUMNS], commit_message=f"append {len(batch)} vote rows")
+    def _flush_loop(self):
+        while not self._stop_event.wait(self._flush_interval_sec):
+            self._flush_votes(force=True)
+    def close(self):
+        if self._stop_event.is_set():
+            return
+        self._stop_event.set()
+        self._flush_thread.join(timeout=1.0)
+        self._flush_votes(force=True)
+    def append_vote_row(self, state: dict, winner: str | None):
+        id_a = int(state["id_a"])
+        id_b = int(state["id_b"])
+        winner_md5 = None
+        if winner == "A":
+            winner_md5 = state["key_a"]
+        elif winner == "B":
+            winner_md5 = state["key_b"]
+        vote_row = {
+            "vote_id": uuid.uuid4().hex,
+            "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
+            "md5a": state["key_a"],
+            "md5b": state["key_b"],
+            "winner_md5": winner_md5,
+            "url_a": f"https://e621.net/posts/{id_a}",
+            "url_b": f"https://e621.net/posts/{id_b}",
+            "dataset": state["dataset"],
+            "group": state["group"],
+            "session_id": state["session_id"],
+        }
+        with self._votes_lock:
+            self._votes_buffer.append(vote_row)
+        self._flush_votes()