taigasan committed on
Commit
124cd9f
·
verified ·
1 Parent(s): 982d7de

deploy app, storage, readme

Browse files
Files changed (2) hide show
  1. README.md +21 -6
  2. storage.py +130 -0
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: e621 Image Rater
3
  emoji: 🐾
4
  colorFrom: indigo
5
  colorTo: purple
@@ -11,11 +11,26 @@ pinned: false
11
 
12
  # e621 Image Rater
13
 
14
- Pairwise ELO rating for e621 images. Each pair shares at least two common tags.
15
 
16
  ## How it works
17
 
18
- - Parquet is loaded at startup with **column projection** (`id`, `md5`, `file_ext`, `tag_string`, `score`, `rating`) — downloads ~20% of the 1.1 GB file instead of all of it.
19
- - An inverted tag index is built over tags appearing in ≥50 posts.
20
- - Each round picks a random post, samples two of its common tags, and finds a candidate post that also has both — guaranteeing ≥2 shared tags.
21
- - ELO ratings are stored in `elo_ratings.json` on disk (resets on Space restart unless you mount persistent storage).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: e6 Visual Ratings
3
  emoji: 🐾
4
  colorFrom: indigo
5
  colorTo: purple
 
11
 
12
  # e621 Image Rater
13
 
14
+ Pairwise image quality voting for e621 images. Pairs are sampled from prebuilt group buckets.
15
 
16
  ## How it works
17
 
18
+ - The app loads `pool.parquet` at startup (from local `data/pool.parquet` in debug mode, or from the dataset repo in HF mode).
19
+ - Each round samples two items from the same `group`, then shows their image URLs and links to e621 post pages.
20
+ - Votes are buffered in memory, then flushed in batches to append-only parquet shards under `ratings_log/`.
21
+ - Local/debug mode defaults: flush every 3 votes or every 15 seconds.
22
+ - HF mode defaults: flush every 50 votes or every 300 seconds.
23
+
24
+ ## Vote Log Schema
25
+
26
+ Each vote row stores:
27
+
28
+ - `vote_id`
29
+ - `timestamp` (ISO-8601 UTC)
30
+ - `md5a`
31
+ - `md5b`
32
+ - `winner_md5` (MD5 of the chosen image, or `None` for a tie)
33
+ - `url_a` and `url_b` (e621 post URLs)
34
+ - `dataset`
35
+ - `group`
36
+ - `session_id`
storage.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import atexit
3
+ import threading
4
+ import time
5
+ import uuid
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+ from tempfile import NamedTemporaryFile
9
+
10
+ import pandas as pd
11
+ from huggingface_hub import HfApi
12
+
13
# Hub repository that VoteStorage uploads vote shards to in "hf" mode.
VOTES_REPO_ID = "taigasan/e6-visual-ratings"
# Repo type passed to the Hub upload call alongside VOTES_REPO_ID.
VOTES_REPO_TYPE = "dataset"
# Sub-directory holding the parquet shards, both inside the Hub repo and
# under the local data directory.
VOTES_LOG_SUBDIR = "ratings_log"
# Columns (and their order) written to every vote shard. Kept as a list
# because it is used directly as a DataFrame column selector.
VOTE_COLUMNS: list[str] = [
    "vote_id",
    "timestamp",
    "md5a",
    "md5b",
    "winner_md5",
    "url_a",
    "url_b",
    "dataset",
    "group",
    "session_id",
]
28
+
29
+
30
+ class VoteStorage:
31
+ def __init__(self, mode: str, local_dir: str = "ratings_data"):
32
+ assert mode in ("hf", "local"), f"Unsupported storage mode: {mode}"
33
+ self.mode = mode
34
+ self.local_dir = local_dir
35
+ is_debug_mode = self.mode == "local"
36
+ self._flush_every = 3 if is_debug_mode else 50
37
+ self._flush_interval_sec = 15.0 if is_debug_mode else 300.0
38
+ self._votes_lock = threading.Lock()
39
+ self._votes_buffer: list[dict] = []
40
+ self._stop_event = threading.Event()
41
+ self._flush_thread = threading.Thread(target=self._flush_loop, daemon=True)
42
+ self._flush_thread.start()
43
+ atexit.register(self.close)
44
+
45
+ def _local_votes_path(self) -> Path:
46
+ root = Path(self.local_dir)
47
+ root.mkdir(parents=True, exist_ok=True)
48
+ log_dir = root / VOTES_LOG_SUBDIR
49
+ log_dir.mkdir(parents=True, exist_ok=True)
50
+ return log_dir
51
+
52
+ def _hf_token(self) -> str | None:
53
+ return os.getenv("RATINGS_APP_TOKEN")
54
+
55
+ def _empty_votes_df(self) -> pd.DataFrame:
56
+ return pd.DataFrame(columns=VOTE_COLUMNS)
57
+
58
+ def _upload_votes_batch(self, df: pd.DataFrame, commit_message: str):
59
+ assert set(VOTE_COLUMNS).issubset(df.columns), "Missing vote columns in upload batch"
60
+ ts = int(time.time())
61
+ shard = f"votes_{ts}_{uuid.uuid4().hex}.parquet"
62
+ if self.mode == "local":
63
+ _ = commit_message
64
+ local_path = self._local_votes_path() / shard
65
+ df[VOTE_COLUMNS].to_parquet(local_path, index=False)
66
+ return
67
+ api = HfApi(token=self._hf_token())
68
+ with NamedTemporaryFile(suffix=".parquet", delete=False) as tmp:
69
+ tmp_path = tmp.name
70
+ try:
71
+ df[VOTE_COLUMNS].to_parquet(tmp_path, index=False)
72
+ api.upload_file(
73
+ path_or_fileobj=tmp_path,
74
+ path_in_repo=f"{VOTES_LOG_SUBDIR}/{shard}",
75
+ repo_id=VOTES_REPO_ID,
76
+ repo_type=VOTES_REPO_TYPE,
77
+ commit_message=commit_message,
78
+ )
79
+ finally:
80
+ if os.path.exists(tmp_path):
81
+ os.remove(tmp_path)
82
+
83
+ def _flush_votes(self, force: bool = False):
84
+ with self._votes_lock:
85
+ if not self._votes_buffer:
86
+ return
87
+ if not force and len(self._votes_buffer) < self._flush_every:
88
+ return
89
+ batch = list(self._votes_buffer)
90
+ self._votes_buffer.clear()
91
+ incoming = pd.DataFrame(batch)
92
+ for col in VOTE_COLUMNS:
93
+ if col not in incoming.columns:
94
+ incoming[col] = None
95
+ self._upload_votes_batch(incoming[VOTE_COLUMNS], commit_message=f"append {len(batch)} vote rows")
96
+
97
+ def _flush_loop(self):
98
+ while not self._stop_event.wait(self._flush_interval_sec):
99
+ self._flush_votes(force=True)
100
+
101
+ def close(self):
102
+ if self._stop_event.is_set():
103
+ return
104
+ self._stop_event.set()
105
+ self._flush_thread.join(timeout=1.0)
106
+ self._flush_votes(force=True)
107
+
108
+ def append_vote_row(self, state: dict, winner: str | None):
109
+ id_a = int(state["id_a"])
110
+ id_b = int(state["id_b"])
111
+ winner_md5 = None
112
+ if winner == "A":
113
+ winner_md5 = state["key_a"]
114
+ elif winner == "B":
115
+ winner_md5 = state["key_b"]
116
+ vote_row = {
117
+ "vote_id": uuid.uuid4().hex,
118
+ "timestamp": datetime.now(timezone.utc).isoformat(timespec="seconds"),
119
+ "md5a": state["key_a"],
120
+ "md5b": state["key_b"],
121
+ "winner_md5": winner_md5,
122
+ "url_a": f"https://e621.net/posts/{id_a}",
123
+ "url_b": f"https://e621.net/posts/{id_b}",
124
+ "dataset": state["dataset"],
125
+ "group": state["group"],
126
+ "session_id": state["session_id"],
127
+ }
128
+ with self._votes_lock:
129
+ self._votes_buffer.append(vote_row)
130
+ self._flush_votes()