taigasan committed on
Commit
4256aa6
·
verified ·
1 Parent(s): bf09265

deploy app, storage, readme

Browse files
Files changed (2) hide show
  1. app.py +32 -10
  2. explorer.py +188 -0
app.py CHANGED
@@ -11,8 +11,7 @@ import pandas as pd
11
  from huggingface_hub import hf_hub_download
12
 
13
  from storage import VoteStorage
14
- from results_tab import add_results_tab, build_results_data, load_more_results, on_gallery_select
15
- from stats_view import format_post_row
16
 
17
  LOCAL_DATA_DIR = 'data'
18
  DEBUG_MODE = os.getenv("DEBUG", "0").lower() in ("1", "true", "yes", "on")
@@ -39,10 +38,11 @@ _pool_group_dfs = {g: gdf for g, gdf in _pool_df.groupby("group")}
39
  _stats_lock = threading.Lock()
40
  _stats_last_loaded_at = 0.0
41
  _stats_by_key: dict[str, tuple[int, int]] = {}
 
42
 
43
 
44
  def _reload_stats_if_due(force: bool = False):
45
- global _stats_last_loaded_at, _stats_by_key
46
  now = time.time()
47
  if not force and (now - _stats_last_loaded_at) < STATS_RELOAD_S:
48
  return
@@ -56,11 +56,15 @@ def _reload_stats_if_due(force: bool = False):
56
  repo_type="dataset",
57
  token=RATINGS_APP_TOKEN,
58
  )
59
- _stats_df = pd.read_parquet(stats_path, columns=["item_key", "wins", "losses"]) # [n, 3]
60
  _stats_by_key = {
61
  str(r.item_key): (int(r.wins), int(r.losses))
62
  for r in _stats_df.itertuples(index=False)
63
  }
 
 
 
 
64
  _stats_last_loaded_at = now
65
 
66
 
@@ -105,6 +109,13 @@ def _commit_oldest_pending(state: dict):
105
  oldest = pending.pop(0)
106
  threading.Thread(target=VOTE_STORAGE.append_vote_row, args=(oldest.copy(), oldest.get("winner")), daemon=True).start()
107
 
 
 
 
 
 
 
 
108
  def _render_current(state: dict, submit_status: str = "") -> tuple:
109
  _reload_stats_if_due()
110
  wins_a, losses_a = _stats_by_key.get(str(state["key_a"]), (0, 0))
@@ -113,8 +124,8 @@ def _render_current(state: dict, submit_status: str = "") -> tuple:
113
  title_b = "Image B"
114
  img_a_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\"><strong>{html.escape(title_a)}</strong></div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_a'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
115
  img_b_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\"><strong>{html.escape(title_b)}</strong></div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_b'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
116
- link_a = format_post_row(state["id_a"], wins_a, losses_a, label="Image A")
117
- link_b = format_post_row(state["id_b"], wins_b, losses_b, label="Image B")
118
  can_go_back = bool(state.get("can_go_back"))
119
  back_md = "[Undo Rating (Ctrl+z)](#back)" if can_go_back else "<span class='subtle-back-link-disabled'>Undo Rating (Ctrl+z)</span>"
120
  details = f"<span class='subtle-note'>Group: {state['group']}</span>"
@@ -146,13 +157,14 @@ def _load_results(rating_pref: str):
146
  rating_pref = _normalize_rating_pref(rating_pref)
147
  _reload_stats_if_due()
148
  groups = _select_groups(DATASETS[DEFAULT_DATASET], rating_pref)
149
- summary, gallery_items, groups_state, page_meta, next_offset, btn_update = build_results_data(
150
  _pool_df,
151
  groups,
152
  rating_pref,
153
  _stats_by_key,
 
154
  )
155
- return summary, gallery_items, btn_update, "Click an image to reveal its ID and link.", groups_state, page_meta, next_offset
156
 
157
  # -- Gradio callbacks -------------------------------------------------------
158
 
@@ -469,6 +481,8 @@ with gr.Blocks(
469
 
470
  (
471
  results_summary_md,
 
 
472
  results_gallery,
473
  results_load_more_btn,
474
  selected_image_md,
@@ -480,6 +494,8 @@ with gr.Blocks(
480
  outputs = [img_a, img_b, link_a, link_b, back_link, details_md, submit_status_md, state]
481
  results_outputs = [
482
  results_summary_md,
 
 
483
  results_gallery,
484
  results_load_more_btn,
485
  selected_image_md,
@@ -499,13 +515,19 @@ with gr.Blocks(
499
  demo.load(fn=_initial_load, inputs=[state, rating_pref_store, submit_key_store], outputs=[rating_dd, submit_key_tb, *outputs], queue=False, show_progress="hidden")
500
  demo.load(fn=_load_results, inputs=[rating_pref_store], outputs=results_outputs, queue=False, show_progress="hidden")
501
  results_load_more_btn.click(
502
- fn=lambda g, o: load_more_results(_pool_df, g, _stats_by_key, o),
503
  inputs=[results_groups_state, results_page_offset_state],
504
  outputs=[results_gallery, results_page_meta_state, results_page_offset_state, results_load_more_btn],
505
  queue=False,
506
  show_progress="hidden",
507
  )
508
- results_gallery.select(fn=on_gallery_select, inputs=[results_page_meta_state], outputs=[selected_image_md], queue=False, show_progress="hidden")
 
 
 
 
 
 
509
 
510
  if __name__ == "__main__":
511
  demo.launch()
 
11
  from huggingface_hub import hf_hub_download
12
 
13
  from storage import VoteStorage
14
+ from explorer import add_results_tab, build_results_data, load_more_results, on_gallery_select
 
15
 
16
  LOCAL_DATA_DIR = 'data'
17
  DEBUG_MODE = os.getenv("DEBUG", "0").lower() in ("1", "true", "yes", "on")
 
38
  _stats_lock = threading.Lock()
39
  _stats_last_loaded_at = 0.0
40
  _stats_by_key: dict[str, tuple[int, int]] = {}
41
+ _classifier_score_by_key: dict[str, float] = {}
42
 
43
 
44
  def _reload_stats_if_due(force: bool = False):
45
+ global _stats_last_loaded_at, _stats_by_key, _classifier_score_by_key
46
  now = time.time()
47
  if not force and (now - _stats_last_loaded_at) < STATS_RELOAD_S:
48
  return
 
56
  repo_type="dataset",
57
  token=RATINGS_APP_TOKEN,
58
  )
59
+ _stats_df = pd.read_parquet(stats_path, columns=["item_key", "wins", "losses", "classifier_score"]) # [n, 4]
60
  _stats_by_key = {
61
  str(r.item_key): (int(r.wins), int(r.losses))
62
  for r in _stats_df.itertuples(index=False)
63
  }
64
+ _classifier_score_by_key = {
65
+ str(r.item_key): float(r.classifier_score)
66
+ for r in _stats_df.itertuples(index=False)
67
+ }
68
  _stats_last_loaded_at = now
69
 
70
 
 
109
  oldest = pending.pop(0)
110
  threading.Thread(target=VOTE_STORAGE.append_vote_row, args=(oldest.copy(), oldest.get("winner")), daemon=True).start()
111
 
112
+
113
+ def _format_rating_post_row(post_id: int, wins: int, losses: int, label: str | None = None) -> str:
114
+ total_votes = wins + losses
115
+ url = f"https://e621.net/posts/{post_id}"
116
+ row = f"{url} | Times rated: {total_votes}"
117
+ return f"{label}: {row}" if label else row
118
+
119
  def _render_current(state: dict, submit_status: str = "") -> tuple:
120
  _reload_stats_if_due()
121
  wins_a, losses_a = _stats_by_key.get(str(state["key_a"]), (0, 0))
 
124
  title_b = "Image B"
125
  img_a_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\"><strong>{html.escape(title_a)}</strong></div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_a'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
126
  img_b_html = f"<div class=\"rating-card\"><div class=\"rating-card-title\"><strong>{html.escape(title_b)}</strong></div><div class=\"rating-image-frame\"><img src=\"{html.escape(state['url_b'])}\" class=\"rating-image\" loading=\"eager\" referrerpolicy=\"no-referrer\"></div></div>"
127
+ link_a = _format_rating_post_row(state["id_a"], wins_a, losses_a, label="Image A")
128
+ link_b = _format_rating_post_row(state["id_b"], wins_b, losses_b, label="Image B")
129
  can_go_back = bool(state.get("can_go_back"))
130
  back_md = "[Undo Rating (Ctrl+z)](#back)" if can_go_back else "<span class='subtle-back-link-disabled'>Undo Rating (Ctrl+z)</span>"
131
  details = f"<span class='subtle-note'>Group: {state['group']}</span>"
 
157
  rating_pref = _normalize_rating_pref(rating_pref)
158
  _reload_stats_if_due()
159
  groups = _select_groups(DATASETS[DEFAULT_DATASET], rating_pref)
160
+ summary, score_distribution_plot, distribution_data, gallery_items, groups_state, page_meta, next_offset, btn_update = build_results_data(
161
  _pool_df,
162
  groups,
163
  rating_pref,
164
  _stats_by_key,
165
+ _classifier_score_by_key,
166
  )
167
+ return summary, score_distribution_plot, distribution_data, gallery_items, btn_update, "Click an image to reveal its ID and link.", groups_state, page_meta, next_offset
168
 
169
  # -- Gradio callbacks -------------------------------------------------------
170
 
 
481
 
482
  (
483
  results_summary_md,
484
+ results_score_distribution_plot,
485
+ results_distribution_state,
486
  results_gallery,
487
  results_load_more_btn,
488
  selected_image_md,
 
494
  outputs = [img_a, img_b, link_a, link_b, back_link, details_md, submit_status_md, state]
495
  results_outputs = [
496
  results_summary_md,
497
+ results_score_distribution_plot,
498
+ results_distribution_state,
499
  results_gallery,
500
  results_load_more_btn,
501
  selected_image_md,
 
515
  demo.load(fn=_initial_load, inputs=[state, rating_pref_store, submit_key_store], outputs=[rating_dd, submit_key_tb, *outputs], queue=False, show_progress="hidden")
516
  demo.load(fn=_load_results, inputs=[rating_pref_store], outputs=results_outputs, queue=False, show_progress="hidden")
517
  results_load_more_btn.click(
518
+ fn=lambda g, o: load_more_results(_pool_df, g, _stats_by_key, _classifier_score_by_key, o),
519
  inputs=[results_groups_state, results_page_offset_state],
520
  outputs=[results_gallery, results_page_meta_state, results_page_offset_state, results_load_more_btn],
521
  queue=False,
522
  show_progress="hidden",
523
  )
524
+ results_gallery.select(
525
+ fn=on_gallery_select,
526
+ inputs=[results_page_meta_state, results_distribution_state],
527
+ outputs=[selected_image_md, results_score_distribution_plot],
528
+ queue=False,
529
+ show_progress="hidden",
530
+ )
531
 
532
  if __name__ == "__main__":
533
  demo.launch()
explorer.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ import pandas as pd
5
+ from matplotlib.figure import Figure
6
+
7
+ PAGE_SIZE = 30
8
+
9
+
10
+ def _row_image_url(row) -> str | None:
11
+ sample_url = row.get("sample_url")
12
+ if isinstance(sample_url, str) and sample_url:
13
+ return sample_url
14
+ image_url = row.get("image_url")
15
+ if isinstance(image_url, str) and image_url:
16
+ return image_url
17
+ return None
18
+
19
+
20
+ def _gallery_items(meta: list[dict[str, str | int]]) -> list[tuple[str, str]]:
21
+ return [(str(item["url"]), f"Score: {float(item['classifier_score']):.4f}") for item in meta]
22
+
23
+
24
+ def _distribution_data(
25
+ pool_df: pd.DataFrame,
26
+ groups: list[str],
27
+ classifier_score_by_key: dict[str, float],
28
+ ) -> dict[str, list[float] | int]:
29
+ subset = pool_df[pool_df["group"].isin(set(groups))]["md5"].astype(str)
30
+ scores = [classifier_score_by_key.get(md5) for md5 in subset]
31
+ valid_scores = [float(score) for score in scores if score is not None]
32
+ if not valid_scores:
33
+ return {"bin_edges": [], "counts": [], "total": 0}
34
+ counts, bin_edges = np.histogram(valid_scores, bins=40)
35
+ return {
36
+ "bin_edges": bin_edges.astype(float).tolist(),
37
+ "counts": counts.astype(float).tolist(),
38
+ "total": int(len(valid_scores)),
39
+ }
40
+
41
+
42
def _classifier_score_distribution_plot(
    distribution_data: dict[str, list[float] | int],
    selected_score: float | None = None,
) -> Figure:
    """Render the classifier-score histogram as a dark-themed figure.

    Args:
        distribution_data: Dict carrying ``bin_edges`` and ``counts``. The
            bars are drawn only when there is at least one count and exactly
            one more edge than counts.
        selected_score: Optional score to mark with a vertical line; only
            drawn when the histogram itself is drawn.

    Returns:
        The figure. When the data is unusable it shows a placeholder message
        instead of bars.
    """
    # Build the figure directly rather than through pyplot: pyplot keeps a
    # global reference to every figure it creates until it is closed, which
    # leaks memory in a long-running server that never calls plt.close().
    fig = Figure(figsize=(6, 2.2))
    ax = fig.subplots()
    fig.patch.set_facecolor("#0f1117")
    ax.set_facecolor("#151922")
    bin_edges = np.asarray(distribution_data.get("bin_edges", []), dtype=float)
    counts = np.asarray(distribution_data.get("counts", []), dtype=float)
    if counts.size > 0 and bin_edges.size == counts.size + 1:
        widths = np.diff(bin_edges)
        ax.bar(bin_edges[:-1], counts, width=widths, align="edge", color="#3b82f6", alpha=0.9, edgecolor="#93c5fd", linewidth=0.35)
        ax.set_ylabel("Count", color="#e5e7eb")
        if selected_score is not None:
            ax.axvline(float(selected_score), color="#f97316", linewidth=2.0)
    else:
        ax.text(0.5, 0.5, "No classifier scores available.", ha="center", va="center", transform=ax.transAxes, color="#e5e7eb")
        ax.set_yticks([])
    ax.set_title("Classifier Score Distribution", color="#f3f4f6")
    ax.set_xlabel("Classifier score", color="#e5e7eb")
    ax.tick_params(colors="#d1d5db")
    for spine in ax.spines.values():
        spine.set_color("#4b5563")
    ax.grid(axis="y", color="#374151", alpha=0.4)
    fig.tight_layout()
    return fig
68
+
69
+
70
+ def _approx_percentile(score: float, distribution_data: dict[str, list[float] | int]) -> float | None:
71
+ bin_edges = np.asarray(distribution_data.get("bin_edges", []), dtype=float)
72
+ counts = np.asarray(distribution_data.get("counts", []), dtype=float)
73
+ total = float(distribution_data.get("total", 0))
74
+ if total <= 0 or counts.size == 0 or bin_edges.size != counts.size + 1:
75
+ return None
76
+ if score <= float(bin_edges[0]):
77
+ return 0.0
78
+ if score >= float(bin_edges[-1]):
79
+ return 100.0
80
+ idx = int(np.searchsorted(bin_edges, score, side="right") - 1)
81
+ idx = max(0, min(idx, counts.size - 1))
82
+ below = float(counts[:idx].sum())
83
+ left = float(bin_edges[idx])
84
+ right = float(bin_edges[idx + 1])
85
+ width = right - left
86
+ frac = 0.0 if width <= 0 else (float(score) - left) / width
87
+ frac = max(0.0, min(1.0, frac))
88
+ return max(0.0, min(100.0, 100.0 * (below + frac * float(counts[idx])) / total))
89
+
90
+
91
def _build_page_meta(
    pool_df: pd.DataFrame,
    groups: list[str],
    stats_by_key: dict[str, tuple[int, int]],
    classifier_score_by_key: dict[str, float],
    offset: int,
) -> tuple[list[dict[str, str | int | float]], int, bool, int]:
    """Collect metadata for one gallery page.

    Args:
        pool_df: Frame with ``group``, ``id``, ``md5``, ``sample_url`` and
            ``image_url`` columns.
        groups: Group names to include.
        stats_by_key: ``(wins, losses)`` keyed by md5; missing keys count as
            ``(0, 0)``.
        classifier_score_by_key: Score keyed by md5; missing keys count as
            ``0.0``.
        offset: Position of the first row of the page within the filtered
            rows. Negative values are treated as 0.

    Returns:
        ``(page_meta, next_offset, has_more, total)`` where ``total`` is the
        number of filtered rows and ``next_offset`` is where the following
        page starts.
    """
    # A negative start would make iloc slice from the end of the frame.
    start = max(0, int(offset))
    subset = pool_df[pool_df["group"].isin(set(groups))][["id", "md5", "sample_url", "image_url"]]
    has_sample = subset["sample_url"].notna() & (subset["sample_url"] != "")
    has_image = subset["image_url"].notna() & (subset["image_url"] != "")
    filtered = subset[has_sample | has_image]
    page_df = filtered.iloc[start:start + PAGE_SIZE]
    page_meta: list[dict[str, str | int | float]] = []
    for row in page_df.to_dict("records"):
        url = _row_image_url(row)
        if url is None:
            # The column mask admits non-string values that the URL helper
            # rejects; skip such rows explicitly instead of relying on an
            # assert, which is removed under ``python -O``.
            continue
        md5 = str(row["md5"])
        wins, losses = stats_by_key.get(md5, (0, 0))
        page_meta.append(
            {
                "id": int(row["id"]),
                "md5": md5,
                "url": url,
                "wins": wins,
                "losses": losses,
                "classifier_score": float(classifier_score_by_key.get(md5, 0.0)),
            }
        )
    # Advance by rows consumed, not rows kept, so skipped rows are not revisited.
    next_offset = start + len(page_df)
    has_more = next_offset < len(filtered)
    return page_meta, next_offset, has_more, len(filtered)
115
+
116
+
117
def build_results_data(
    pool_df: pd.DataFrame,
    groups: list[str],
    category_label: str,
    stats_by_key: dict[str, tuple[int, int]],
    classifier_score_by_key: dict[str, float],
) -> tuple[str, Figure, dict[str, list[float] | int], list[tuple[str, str]], list[str], list[dict[str, str | int]], int, dict]:
    """Assemble everything the results tab needs for its first page.

    Returns, in order: summary text, score-distribution figure, the
    distribution data behind it, gallery items, the groups passed in, the
    first page's metadata, the offset of the next page, and a Gradio update
    toggling the visibility of the "load more" button.
    """
    first_page, following_offset, more_available, image_count = _build_page_meta(
        pool_df,
        groups,
        stats_by_key,
        classifier_score_by_key,
        offset=0,
    )
    summary = f"Showing {image_count} images for category: {category_label}."
    histogram = _distribution_data(pool_df, groups, classifier_score_by_key)
    figure = _classifier_score_distribution_plot(histogram)
    gallery = _gallery_items(first_page)
    load_more_update = gr.update(visible=more_available)
    return summary, figure, histogram, gallery, groups, first_page, following_offset, load_more_update
129
+
130
+
131
def load_more_results(
    pool_df: pd.DataFrame,
    groups: list[str],
    stats_by_key: dict[str, tuple[int, int]],
    classifier_score_by_key: dict[str, float],
    offset: int,
):
    """Build the gallery page that starts at *offset*.

    Returns, in order: gallery items for the page, the page's metadata, the
    offset of the following page, and a Gradio update toggling the visibility
    of the "load more" button.
    """
    start = int(offset)
    page, following_offset, more_available, _ = _build_page_meta(
        pool_df,
        groups,
        stats_by_key,
        classifier_score_by_key,
        offset=start,
    )
    gallery = _gallery_items(page)
    return gallery, page, following_offset, gr.update(visible=more_available)
140
+
141
+
142
def on_gallery_select(
    evt: gr.SelectData,
    meta: list[dict[str, str | int]],
    distribution_data: dict[str, list[float] | int],
) -> tuple[str, Figure]:
    """Describe the clicked gallery image and mark its score on the histogram.

    Args:
        evt: Selection event; ``evt.index`` is either an int or a tuple whose
            first element is used.
        meta: Metadata of the page currently shown, indexed like the gallery.
        distribution_data: Histogram data used for the percentile and figure.

    Returns:
        ``(info_text, figure)``. When the index is not an int within range,
        the text is ``"No image selected."`` and the figure has no marker.
    """
    raw_index = evt.index
    position = raw_index[0] if isinstance(raw_index, tuple) else raw_index
    in_range = isinstance(position, int) and 0 <= position < len(meta)
    if not in_range:
        return "No image selected.", _classifier_score_distribution_plot(distribution_data)

    entry = meta[position]
    post_id = int(entry["id"])
    md5 = str(entry["md5"])
    times_rated = int(entry["wins"]) + int(entry["losses"])
    classifier_score = float(entry["classifier_score"])

    percentile = _approx_percentile(classifier_score, distribution_data)
    if percentile is None:
        percentile_text = "n/a"
    else:
        percentile_text = f"{percentile:.1f}%"

    info = (
        f"MD5: {md5} | ID: {post_id} | Times rated: {times_rated} | "
        f"Score: {classifier_score:.4f} | Approx percentile: {percentile_text}\n"
        f"https://e621.net/posts/{post_id}"
    )
    figure = _classifier_score_distribution_plot(distribution_data, selected_score=classifier_score)
    return info, figure
159
+
160
+
161
def add_results_tab(pool_df: pd.DataFrame):
    """Create the "Explorer" tab and return its components.

    Components are created in display order: summary, gallery, "load more"
    button, selection text, distribution plot; the state holders follow.
    *pool_df* is accepted but not used here.

    Returns:
        A 9-tuple: summary markdown, distribution plot, distribution state,
        gallery, "load more" button, selection markdown, groups state,
        page-meta state, page-offset state.
    """
    empty_distribution = {"bin_edges": [], "counts": [], "total": 0}
    with gr.Tab("Explorer"):
        summary_md = gr.Markdown()
        gallery = gr.Gallery(
            label="Category Mosaic",
            columns=[6],
            object_fit="contain",
            preview=True,
            height="auto",
        )
        load_more_btn = gr.Button("Load more (ArrowDown)", elem_id="btn-results-load-more")
        selection_md = gr.Markdown("Click an image to reveal its ID and link.")
        distribution_plot = gr.Plot(label="Classifier score distribution")
        distribution_state = gr.State(empty_distribution)
        groups_state = gr.State([])
        page_meta_state = gr.State([])
        page_offset_state = gr.State(0)
    components = (
        summary_md,
        distribution_plot,
        distribution_state,
        gallery,
        load_more_btn,
        selection_md,
        groups_state,
        page_meta_state,
        page_offset_state,
    )
    return components