rhli commited on
Commit
dad152b
·
verified ·
1 Parent(s): a722b84

[genarena deploy] Upload genarena package

Browse files
genarena/deploy/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """GenArena deploy module for HuggingFace Spaces deployment."""
2
+
3
+ from genarena.deploy.app import main
4
+
5
+ __all__ = ["main"]
genarena/prompts/__init__.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Prompt module loader and validator."""
2
+
3
+ import importlib
4
+ import importlib.util
5
+ import os
6
+ from types import ModuleType
7
+ from typing import Optional
8
+
9
+
10
+ # Required attributes for a valid prompt module
11
+ REQUIRED_ATTRIBUTES = ["PROMPT_TEXT", "ALLOW_TIE", "build_prompt", "parse_response"]
12
+
13
+
14
def load_prompt(name: str) -> ModuleType:
    """
    Load a prompt module by name.

    Resolution order: first the ``genarena.prompts`` package, then a
    filesystem path if ``name`` looks like one (ends with ``.py`` or
    contains a path separator). The loaded module is validated against
    the required prompt contract before being returned.

    Args:
        name: Prompt module name (e.g., 'mmrb2') or path to a .py file

    Returns:
        Loaded module

    Raises:
        ImportError: If module cannot be found
        ValueError: If module is invalid
    """
    module: Optional[ModuleType] = None

    # Preferred source: a module shipped inside the genarena.prompts package.
    try:
        module = importlib.import_module(f"genarena.prompts.{name}")
    except ImportError:
        module = None

    # Fallback: treat the name as a filesystem path to a .py file.
    looks_like_path = name.endswith('.py') or os.path.sep in name
    if module is None and looks_like_path and os.path.isfile(name):
        spec = importlib.util.spec_from_file_location("custom_prompt", name)
        if spec and spec.loader:
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)

    if module is None:
        raise ImportError(
            f"Could not load prompt module '{name}'. "
            f"Make sure it exists in genarena/prompts/ or provide a valid file path."
        )

    # Reject modules that do not expose the full prompt contract.
    if not validate_prompt(module):
        missing = get_missing_attributes(module)
        raise ValueError(
            f"Invalid prompt module '{name}'. "
            f"Missing required attributes: {missing}"
        )

    return module
62
+
63
+
64
def validate_prompt(module: ModuleType) -> bool:
    """
    Check whether a module satisfies the prompt contract.

    Required attributes:
    - PROMPT_TEXT: str - The evaluation prompt text
    - ALLOW_TIE: bool - Whether single-round ties are allowed
    - build_prompt: callable - Function to build VLM messages
    - parse_response: callable - Function to parse VLM response

    Args:
        module: Module to validate

    Returns:
        True if valid, False otherwise
    """
    must_be_callable = ("build_prompt", "parse_response")

    for required in REQUIRED_ATTRIBUTES:
        if not hasattr(module, required):
            return False
        # The two hook attributes must actually be callable, not just present.
        if required in must_be_callable and not callable(getattr(module, required)):
            return False

    return True
90
+
91
+
92
def get_missing_attributes(module: ModuleType) -> list[str]:
    """
    Report which required prompt attributes a module lacks.

    Entries that exist but are not callable (for the hook attributes)
    are reported with a ``(not callable)`` suffix.

    Args:
        module: Module to check

    Returns:
        List of missing attribute names
    """
    problems: list[str] = []
    for required in REQUIRED_ATTRIBUTES:
        if not hasattr(module, required):
            problems.append(required)
            continue
        # Present but unusable: the hook attributes must be callable.
        if required in ("build_prompt", "parse_response") and not callable(getattr(module, required)):
            problems.append(f"{required} (not callable)")
    return problems
110
+
111
+
112
def list_available_prompts() -> list[str]:
    """
    List all available prompt modules in the prompts directory.

    Scans the directory containing this module for ``*.py`` files,
    skipping private/dunder modules (leading underscore).

    Returns:
        Sorted list of prompt module names (without the .py extension)
    """
    prompts_dir = os.path.dirname(__file__)
    names = [
        entry[:-3]  # strip the ".py" suffix
        for entry in os.listdir(prompts_dir)
        if entry.endswith('.py') and not entry.startswith('_')
    ]
    return sorted(names)
genarena/prompts/mmrb2.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """MMRB2 prompt implementation for image editing evaluation.
2
+
3
+ This module implements the MMRB2 evaluation prompt for pairwise comparison
4
+ of image editing results. It uses a 1-6 scoring scale and does not allow
5
+ ties in single rounds.
6
+
7
+ Reference: MMRB2 evaluation framework
8
+ """
9
+
10
+ import base64
11
+ import io
12
+ import re
13
+ from typing import Any, Union
14
+
15
+ import json_repair
16
+ from PIL import Image as PILImage
17
+
18
+
19
+ # Whether single-round ties are allowed (mmrb2 requires a winner)
20
+ ALLOW_TIE = False
21
+
22
+
23
+ # The full evaluation prompt text from get_image_edit_prompt()
24
+ PROMPT_TEXT = """You are an expert in image editing quality analysis and AI evaluation. Your role is to act as an objective judge for comparing two AI-generated image editing responses to the same prompt. You will evaluate which response is better based on a comprehensive rubric specifically designed for image editing tasks.
25
+
26
+ **Important Guidelines:**
27
+ - Be completely impartial and avoid any position biases
28
+ - Ensure that the order in which the responses were presented does not influence your decision
29
+ - Do not allow the length of the responses to influence your evaluation
30
+ - Do not favor certain model names or types
31
+ - Be as objective as possible in your assessment
32
+ - Focus on image editing specific factors: faithfulness to editing instructions, preservation of input image elements, and overall editing quality
33
+
34
+ **Understanding the Content Structure:**
35
+ - **[ORIGINAL PROMPT TO MODEL:]**: This is the image editing instruction given to both AI models
36
+ - **[INPUT IMAGE FROM PROMPT:]**: This is the source image provided to both models for editing
37
+ - **[RESPONSE A:]**: The first model's edited image response
38
+ - **[RESPONSE B:]**: The second model's edited image response
39
+
40
+ Your evaluation must be based on a fine-grained rubric that covers the following criteria. For each criterion, you must provide detailed step-by-step reasoning comparing both responses. You will use a 1-6 scoring scale.
41
+
42
+ **Evaluation Criteria:**
43
+ 1. **text_faithfulness:** Which response better adheres to the text editing instruction? Consider how well each response follows the specific editing instructions (e.g., adding objects, changing colors, modifying scenes).
44
+
45
+ 2. **image_faithfulness:** Which response better respects and incorporates the key elements of the input image? Consider how well each response preserves important aspects of the original image (composition, lighting, style, background elements) while making the requested changes.
46
+
47
+ 3. **overall_image_quality:** Which response has better general technical and aesthetic quality, with fewer visual artifacts, distortions, or inconsistencies introduced during the editing process?
48
+
49
+ 4. **text_rendering:** If either response contains rendered text, which one has better text quality (spelling, legibility, integration with the image)? If no text is rendered, state "Not Applicable."
50
+
51
+ **Scoring Rubric:**
52
+ - Score 6 (A is significantly better): Response A is significantly superior across most criteria
53
+ - Score 5 (A is marginally better): Response A is noticeably better across several criteria
54
+ - Score 4 (Unsure or A is negligibly better): Response A is slightly better or roughly equivalent
55
+ - Score 3 (Unsure or B is negligibly better): Response B is slightly better or roughly equivalent
56
+ - Score 2 (B is marginally better): Response B is noticeably better across several criteria
57
+ - Score 1 (B is significantly better): Response B is significantly superior across most criteria
58
+
59
+ **Confidence Assessment:**
60
+ After your evaluation, assess your confidence in this judgment on a scale of 0.0 to 1.0:
61
+
62
+ **CRITICAL**: Be EXTREMELY conservative with confidence scores. Most comparisons should be in the 0.2-0.5 range.
63
+
64
+ - **Very High Confidence (0.8-1.0)**: ONLY for absolutely obvious cases where one response is dramatically better across ALL criteria with zero ambiguity. Use this extremely rarely (less than 10% of cases).
65
+ - **High Confidence (0.6-0.7)**: Clear differences but some uncertainty remains. Use sparingly (less than 20% of cases).
66
+ - **Medium Confidence (0.4-0.5)**: Noticeable differences but significant uncertainty. This should be your DEFAULT range.
67
+ - **Low Confidence (0.2-0.3)**: Very close comparison, difficult to distinguish. Responses are roughly equivalent or have conflicting strengths.
68
+ - **Very Low Confidence (0.0-0.1)**: Essentially indistinguishable responses or major conflicting strengths.
69
+
70
+ **IMPORTANT GUIDELINES**:
71
+ - DEFAULT to 0.3-0.5 range for most comparisons
72
+ - Only use 0.6+ when you are absolutely certain
73
+ - Consider: Could reasonable people disagree on this comparison?
74
+ - Consider: Are there any strengths in the "worse" response?
75
+ - Consider: How obvious would this be to a human evaluator?
76
+ - Remember: Quality assessment is inherently subjective
77
+
78
+ After your reasoning, you will provide a final numerical score, indicate which response is better, and assess your confidence. You must always output your response in the following structured JSON format:
79
+
80
+ {
81
+ "reasoning": {
82
+ "text_faithfulness": "YOUR REASONING HERE",
83
+ "image_faithfulness": "YOUR REASONING HERE",
84
+ "overall_image_quality": "YOUR REASONING HERE",
85
+ "text_rendering": "YOUR REASONING HERE",
86
+ "comparison_summary": "YOUR OVERALL COMPARISON SUMMARY HERE"
87
+ },
88
+ "score": <int 1-6>,
89
+ "better_response": "A" or "B",
90
+ "confidence": <float 0.0-1.0>,
91
+ "confidence_rationale": "YOUR CONFIDENCE ASSESSMENT REASONING HERE"
92
+ }"""
93
+
94
+
95
def _encode_image_to_base64(image_source: Union[str, bytes, PILImage.Image, io.BytesIO, dict[str, Any]]) -> str:
    """
    Encode an image to a base64 string.

    Accepts a file path, raw bytes, a PIL image, a BytesIO buffer, or a
    HuggingFace-datasets-style dict ({"bytes": ..., "path": ...}).

    Args:
        image_source: Either a file path (str), raw bytes, PIL.Image object, or BytesIO

    Returns:
        Base64 encoded string

    Raises:
        TypeError: If image_source type is not supported
        ValueError: If image_source cannot be converted to bytes
    """
    data: bytes

    if isinstance(image_source, str):
        # File path on disk.
        with open(image_source, "rb") as fh:
            data = fh.read()
    elif isinstance(image_source, io.BytesIO):
        # In-memory buffer; rewind before reading.
        image_source.seek(0)
        data = image_source.read()
    elif isinstance(image_source, PILImage.Image):
        # PIL image (e.g. from HuggingFace datasets): serialize as PNG.
        sink = io.BytesIO()
        image_source.save(sink, format="PNG")
        data = sink.getvalue()
    elif isinstance(image_source, dict):
        # HuggingFace datasets Image() feature dict.
        if "bytes" in image_source:
            payload = image_source["bytes"]
            if isinstance(payload, bytes):
                data = payload
            elif isinstance(payload, io.BytesIO):
                payload.seek(0)
                data = payload.read()
            else:
                # Nested/unexpected payload type: recurse to resolve it.
                return _encode_image_to_base64(payload)
        elif "path" in image_source and image_source["path"]:
            with open(image_source["path"], "rb") as fh:
                data = fh.read()
        else:
            raise ValueError(f"Cannot extract image from dict: {image_source.keys()}")
    elif isinstance(image_source, bytes):
        # Raw bytes; deliberately checked after the more specific types.
        data = image_source
    else:
        raise TypeError(
            f"Unsupported image type: {type(image_source).__name__}. "
            f"Expected str (path), bytes, PIL.Image, io.BytesIO, or dict. "
            f"Got: {repr(image_source)[:200]}"
        )

    # Defensive check: every branch above must have produced raw bytes.
    if not isinstance(data, bytes):
        raise ValueError(
            f"Failed to convert image to bytes. "
            f"Got {type(data).__name__} instead. "
            f"Original input was {type(image_source).__name__}"
        )

    return base64.b64encode(data).decode("utf-8")
161
+
162
+
163
def _get_image_media_type(image_source: Union[str, bytes, PILImage.Image]) -> str:
    """
    Determine the media type of an image.

    File paths are classified by extension, raw bytes by their magic
    numbers; unknown inputs default to PNG.

    Args:
        image_source: Either a file path (str), raw bytes, or PIL.Image object

    Returns:
        Media type string (e.g., 'image/png')
    """
    if isinstance(image_source, str):
        # Map the filename extension to a media type; default to PNG.
        suffix = image_source.lower().split('.')[-1]
        by_extension = {
            'png': 'image/png',
            'jpg': 'image/jpeg',
            'jpeg': 'image/jpeg',
            'webp': 'image/webp',
            'gif': 'image/gif',
        }
        return by_extension.get(suffix, 'image/png')

    if isinstance(image_source, PILImage.Image):
        # PIL images are re-encoded as PNG elsewhere in this module.
        return 'image/png'

    # Raw bytes: sniff well-known magic numbers, defaulting to PNG.
    if image_source[:8] == b'\x89PNG\r\n\x1a\n':
        return 'image/png'
    if image_source[:2] == b'\xff\xd8':
        return 'image/jpeg'
    if image_source[:4] == b'RIFF' and image_source[8:12] == b'WEBP':
        return 'image/webp'
    return 'image/png'
196
+
197
+
198
def _create_image_content(image_source: Union[str, bytes]) -> dict[str, Any]:
    """
    Build an OpenAI API image content block from an image source.

    Args:
        image_source: Either a file path (str) or raw bytes

    Returns:
        Image content dict for OpenAI API (data-URL form)
    """
    encoded = _encode_image_to_base64(image_source)
    mime = _get_image_media_type(image_source)
    # OpenAI vision API accepts inline images as base64 data URLs.
    return {
        "type": "image_url",
        "image_url": {
            "url": f"data:{mime};base64,{encoded}"
        }
    }
217
+
218
+
219
def build_prompt(
    instruction: str,
    input_images: list[Union[str, bytes]],
    output_image_a: Union[str, bytes],
    output_image_b: Union[str, bytes]
) -> list[dict[str, Any]]:
    """
    Build the VLM prompt messages for pairwise evaluation.

    Constructs messages in the format:
        [EVALUATION PROMPT TEXT]
        [ORIGINAL PROMPT TO MODEL:]
        {instruction and input_images}
        [RESPONSE A:]
        {output_image_a}
        [RESPONSE B:]
        {output_image_b}

    Args:
        instruction: The editing instruction given to models
        input_images: List of input images (file paths or bytes)
        output_image_a: Output from model A (file path or bytes)
        output_image_b: Output from model B (file path or bytes)

    Returns:
        List of message dicts for OpenAI Chat Completion API
    """
    def text_block(text: str) -> dict[str, Any]:
        # Wrap plain text in an OpenAI content block.
        return {"type": "text", "text": text}

    # Evaluation prompt, then the original task (instruction text).
    content: list[dict[str, Any]] = [
        text_block(PROMPT_TEXT),
        text_block("[ORIGINAL PROMPT TO MODEL:]"),
        text_block(instruction),
    ]

    # Optional source images the models edited.
    if input_images:
        content.append(text_block("[INPUT IMAGE FROM PROMPT:]"))
        for img in input_images:
            content.append(_create_image_content(img))

    # The two candidate outputs, in fixed A/B order.
    content.append(text_block("[RESPONSE A:]"))
    content.append(_create_image_content(output_image_a))
    content.append(text_block("[RESPONSE B:]"))
    content.append(_create_image_content(output_image_b))

    # Single user-role message in OpenAI API format.
    return [{"role": "user", "content": content}]
297
+
298
+
299
def parse_response(response: str) -> dict[str, Any]:
    """
    Parse the VLM judge response.

    Extracts structured information from the VLM's JSON response,
    tolerating markdown code fences and minor JSON errors (via
    json_repair). Score is clamped to 1-6 and confidence to 0.0-1.0.

    Args:
        response: Raw response text from VLM

    Returns:
        Dict containing:
        - winner: "A" or "B" (from better_response field)
        - score: int 1-6
        - confidence: float 0.0-1.0
        - reasoning: dict with evaluation criteria
        - raw_response: the original parsed JSON

    Raises:
        ValueError: If response cannot be parsed
    """
    # Strip optional markdown code fences around the JSON payload.
    cleaned = response.strip()
    cleaned = re.sub(r"^```(?:json)?\s*\n?", "", cleaned)
    cleaned = re.sub(r"\n?```\s*$", "", cleaned)

    # json_repair tolerates trailing commas, unquoted keys, etc.
    try:
        parsed = json_repair.loads(cleaned)
    except Exception as e:
        raise ValueError(f"Failed to parse JSON response: {e}\nResponse was:\n{response}")

    raw_winner = parsed.get("better_response", "")
    if not isinstance(raw_winner, str):
        raise ValueError(f"better_response must be a string, got: {type(raw_winner)}")

    # Normalize winner to a bare "A" or "B".
    winner = raw_winner.upper().strip()
    if winner not in ("A", "B"):
        # Fall back to substring search ("A" takes precedence).
        if "A" in winner:
            winner = "A"
        elif "B" in winner:
            winner = "B"
        else:
            raise ValueError(f"Invalid better_response value: {raw_winner}")

    # Score: coerce strings, clamp to the 1-6 rubric range (default 4).
    score = parsed.get("score", 4)
    if isinstance(score, str):
        score = int(score)
    score = max(1, min(6, score))

    # Confidence: coerce strings, clamp to [0.0, 1.0] (default 0.5).
    confidence = parsed.get("confidence", 0.5)
    if isinstance(confidence, str):
        confidence = float(confidence)
    confidence = max(0.0, min(1.0, confidence))

    return {
        "winner": winner,
        "score": score,
        "confidence": confidence,
        "reasoning": parsed.get("reasoning", {}),
        "raw_response": parsed
    }
genarena/sync/__init__.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Sync module for GenArena.
3
+
4
+ This module provides Git version control and Huggingface synchronization
5
+ capabilities for arena data.
6
+ """
7
+
8
+ from genarena.sync.git_ops import (
9
+ is_git_initialized,
10
+ git_init,
11
+ ensure_gitignore,
12
+ git_add_all,
13
+ git_commit,
14
+ has_uncommitted_changes,
15
+ git_remote_add,
16
+ git_remote_get_url,
17
+ git_push,
18
+ git_sync,
19
+ )
20
+
21
+ from genarena.sync.auto_commit import (
22
+ auto_commit_and_push,
23
+ with_auto_commit,
24
+ )
25
+
26
+ from genarena.sync.hf_ops import (
27
+ get_hf_token,
28
+ require_hf_token,
29
+ validate_dataset_repo,
30
+ list_repo_files,
31
+ get_repo_file_info,
32
+ upload_file,
33
+ upload_files_batch,
34
+ download_file,
35
+ check_file_exists,
36
+ upload_arena_data,
37
+ pull_arena_data,
38
+ list_repo_contents,
39
+ )
40
+
41
+ from genarena.sync.packer import (
42
+ pack_model_dir,
43
+ pack_exp_dir,
44
+ unpack_zip,
45
+ collect_upload_tasks,
46
+ collect_download_tasks,
47
+ TempPackingContext,
48
+ TaskType,
49
+ PackTask,
50
+ UnpackTask,
51
+ )
52
+
53
+ from genarena.sync.init_ops import (
54
+ DEFAULT_BENCHMARK_REPO,
55
+ DEFAULT_ARENA_REPO,
56
+ discover_repo_subsets,
57
+ download_benchmark_data,
58
+ init_arena,
59
+ )
60
+
61
+ __all__ = [
62
+ # Git operations
63
+ "is_git_initialized",
64
+ "git_init",
65
+ "ensure_gitignore",
66
+ "git_add_all",
67
+ "git_commit",
68
+ "has_uncommitted_changes",
69
+ "git_remote_add",
70
+ "git_remote_get_url",
71
+ "git_push",
72
+ "git_sync",
73
+ # Auto commit
74
+ "auto_commit_and_push",
75
+ "with_auto_commit",
76
+ # Huggingface operations
77
+ "get_hf_token",
78
+ "require_hf_token",
79
+ "validate_dataset_repo",
80
+ "list_repo_files",
81
+ "get_repo_file_info",
82
+ "upload_file",
83
+ "upload_files_batch",
84
+ "download_file",
85
+ "check_file_exists",
86
+ "upload_arena_data",
87
+ "pull_arena_data",
88
+ "list_repo_contents",
89
+ # Packer utilities
90
+ "pack_model_dir",
91
+ "pack_exp_dir",
92
+ "unpack_zip",
93
+ "collect_upload_tasks",
94
+ "collect_download_tasks",
95
+ "TempPackingContext",
96
+ "TaskType",
97
+ "PackTask",
98
+ "UnpackTask",
99
+ # Init operations
100
+ "DEFAULT_BENCHMARK_REPO",
101
+ "DEFAULT_ARENA_REPO",
102
+ "discover_repo_subsets",
103
+ "download_benchmark_data",
104
+ "init_arena",
105
+ ]
genarena/sync/auto_commit.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Auto commit module for GenArena.
3
+
4
+ This module provides automatic commit and push functionality
5
+ that is triggered after command execution.
6
+ """
7
+
8
+ import logging
9
+ from typing import Callable, TypeVar
10
+
11
+ from genarena.sync.git_ops import (
12
+ is_git_initialized,
13
+ has_uncommitted_changes,
14
+ git_commit,
15
+ git_push,
16
+ git_remote_get_url,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Type variable for generic decorator
22
+ T = TypeVar("T")
23
+
24
+
25
def auto_commit_and_push(arena_dir: str, command_name: str) -> None:
    """
    Automatically commit and push changes after a command execution.

    Designed to run after commands that modify arena_dir content
    (e.g., run, merge, delete). A missing Git repo is skipped silently;
    any commit/push failure is only logged as a warning so the main
    command flow is never interrupted.

    Args:
        arena_dir: Path to the arena directory
        command_name: Name of the command that triggered this auto-commit
    """
    # Nothing to do for arenas that were never git-initialized.
    if not is_git_initialized(arena_dir):
        return

    if not has_uncommitted_changes(arena_dir):
        logger.debug(f"No changes to commit after {command_name}")
        return

    # Commit; bail out (with a warning) on any failure.
    try:
        committed, commit_msg = git_commit(arena_dir, command_name=command_name)
    except Exception as e:
        logger.warning(f"Auto-commit failed with exception: {e}")
        return
    if not committed:
        logger.warning(f"Auto-commit failed: {commit_msg}")
        return
    if "Nothing to commit" not in commit_msg:
        logger.info(f"Auto-committed changes: {commit_msg}")

    # Push only when a remote is configured.
    if not git_remote_get_url(arena_dir):
        logger.debug("No remote configured, skipping auto-push")
        return

    try:
        pushed, push_msg = git_push(arena_dir)
    except Exception as e:
        logger.warning(f"Auto-push failed with exception: {e}")
        return
    if pushed:
        logger.info(f"Auto-pushed changes: {push_msg}")
    else:
        logger.warning(f"Auto-push failed: {push_msg}")
75
+
76
+
77
def with_auto_commit(command_name: str):
    """
    Decorator that adds auto-commit functionality to command functions.

    The decorated function must expose 'arena_dir' either as a keyword
    argument or as an attribute of its first positional argument
    (typically an argparse.Namespace). Auto-commit only runs when the
    command returns 0 (success).

    Args:
        command_name: Name of the command for commit message

    Returns:
        Decorator function
    """
    import functools  # local import: keeps the module's import surface unchanged

    def decorator(func: Callable[..., int]) -> Callable[..., int]:
        # functools.wraps preserves __name__/__doc__/etc. of the wrapped
        # command so logging, help text, and introspection stay correct.
        @functools.wraps(func)
        def wrapper(*args, **kwargs) -> int:
            # Execute the original command first.
            result = func(*args, **kwargs)

            # Only auto-commit if the command succeeded (return code 0).
            if result == 0:
                # Locate arena_dir: explicit kwarg takes precedence over an
                # argparse.Namespace passed as the first positional argument.
                arena_dir = None
                if "arena_dir" in kwargs:
                    arena_dir = kwargs["arena_dir"]
                elif args and hasattr(args[0], "arena_dir"):
                    arena_dir = args[0].arena_dir

                if arena_dir:
                    auto_commit_and_push(arena_dir, command_name)

            return result

        return wrapper

    return decorator
genarena/sync/deploy_ops.py ADDED
@@ -0,0 +1,539 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Deploy operations for GenArena.
3
+
4
+ Handles uploading arena data to HuggingFace for Spaces deployment.
5
+ Unlike `hf upload`, this uploads images directly (not as ZIP) for CDN access.
6
+ Parquet benchmark data is downloaded from rhli/genarena during Docker build.
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ from multiprocessing import Pool
12
+ from typing import Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ # Default multiprocessing settings
18
+ DEFAULT_NUM_WORKERS = 16
19
+ DEFAULT_WORKER_TIMEOUT = 300 # seconds
20
+
21
+
22
def upload_for_deploy(
    arena_dir: str,
    arena_repo: str,
    space_repo: str,
    subsets: Optional[list[str]] = None,
    overwrite: bool = False,
    show_progress: bool = True,
    max_retries: int = 3,
    num_workers: int = DEFAULT_NUM_WORKERS,
    worker_timeout: int = DEFAULT_WORKER_TIMEOUT,
) -> tuple[bool, str]:
    """
    Upload all data needed for HuggingFace Spaces deployment.

    Two-step process:
    1. Arena data (pk_logs, models, state.json) to arena_repo (Dataset).
       Images are uploaded directly (not as ZIP) for CDN access,
       following symlinks to reach the actual image files.
    2. Deploy files (Dockerfile, app.py, README.md) to space_repo.

    Note: Parquet benchmark data is NOT uploaded. It is downloaded from
    rhli/genarena during Docker build in the Space.

    Args:
        arena_dir: Local arena directory
        arena_repo: HF Dataset repo for arena data
        space_repo: HF Space repo for deployment
        subsets: Subsets to upload (None = all)
        overwrite: Overwrite existing files
        show_progress: Show progress bar
        max_retries: Max retries per file
        num_workers: Number of parallel workers for upload (default: 16)
        worker_timeout: Timeout in seconds for each worker (default: 300)

    Returns:
        Tuple of (success, message)
    """
    from genarena.sync.hf_ops import (
        require_hf_token,
        validate_dataset_repo,
    )

    # Resolve the HF token up front; every step below needs it.
    try:
        token = require_hf_token()
    except ValueError as e:
        return False, str(e)

    summaries: list[str] = []

    # Step 1: arena data -> Dataset repo (individual images for CDN access).
    logger.info(f"Uploading arena data to {arena_repo}...")
    repo_ok, detail = validate_dataset_repo(arena_repo, token)
    if not repo_ok:
        return False, f"Arena repo validation failed: {detail}"

    arena_ok, detail = upload_arena_data_for_cdn(
        arena_dir=arena_dir,
        repo_id=arena_repo,
        subsets=subsets,
        overwrite=overwrite,
        show_progress=show_progress,
        token=token,
        num_workers=num_workers,
        worker_timeout=worker_timeout,
    )
    if not arena_ok:
        return False, f"Arena upload failed: {detail}"
    summaries.append(f"Arena data: {detail}")

    # Step 2: Dockerfile/app.py/README.md -> Space repo.
    logger.info(f"Uploading deploy files to {space_repo}...")
    deploy_ok, detail = upload_deploy_files(
        space_repo=space_repo,
        overwrite=overwrite,
        token=token,
    )
    if not deploy_ok:
        return False, f"Deploy files upload failed: {detail}"
    summaries.append(f"Deploy files: {detail}")

    return True, "\n".join(summaries)
104
+
105
+
106
def collect_files_follow_symlinks(
    base_dir: str,
    path_prefix: str = "",
) -> list[tuple[str, str]]:
    """
    Collect all files under base_dir, following symlinks.

    Hidden files/directories, ``__pycache__`` and ``raw_outputs``
    directories are skipped. Remote paths always use forward slashes.

    Args:
        base_dir: Directory to scan
        path_prefix: Prefix for remote paths

    Returns:
        List of (local_path, remote_path) tuples; empty if base_dir
        does not exist or is not a directory.
    """
    files: list[tuple[str, str]] = []

    if not os.path.isdir(base_dir):
        return files

    # followlinks=True so symlinked model/image directories are traversed.
    for root, dirs, filenames in os.walk(base_dir, followlinks=True):
        # Prune hidden and special directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if not d.startswith(".") and d != "__pycache__" and d != "raw_outputs"]

        rel_root = os.path.relpath(root, base_dir)
        if rel_root == ".":
            rel_root = ""

        for filename in filenames:
            if filename.startswith("."):
                continue

            local_path = os.path.join(root, filename)

            # Build remote path as prefix/rel_root/filename, skipping empty
            # components. Fix: the remote path must end with the actual
            # filename (previously a literal placeholder was emitted).
            components = [part for part in (path_prefix, rel_root, filename) if part]
            remote_path = "/".join(components)

            # Normalize path separators (rel_root uses backslashes on Windows).
            remote_path = remote_path.replace("\\", "/")

            files.append((local_path, remote_path))

    return files
152
+
153
+
154
def _upload_batch_worker(args: tuple) -> tuple[int, int]:
    """
    Worker function for uploading a single batch.

    Stages one CommitOperationAdd per file, then commits the whole
    batch in one HF commit, retrying on failure.

    Args:
        args: Tuple of (batch_index, batch, repo_id, token, total_batches, max_retries)

    Returns:
        Tuple of (uploaded_count, failed_count)
    """
    from huggingface_hub import HfApi, CommitOperationAdd

    batch_index, batch, repo_id, token, total_batches, max_retries = args

    client = HfApi(token=token)

    # Stage one operation per file; count files that cannot be read.
    staged = []
    unreadable = 0
    for local_path, remote_path in batch:
        try:
            staged.append(
                CommitOperationAdd(
                    path_in_repo=remote_path,
                    path_or_fileobj=local_path,
                )
            )
        except Exception as e:
            logger.warning(f"Failed to read {local_path}: {e}")
            unreadable += 1

    if not staged:
        return 0, unreadable

    # Commit the batch as a single HF commit, with retries.
    for attempt in range(max_retries):
        try:
            client.create_commit(
                repo_id=repo_id,
                repo_type="dataset",
                operations=staged,
                commit_message=f"[genarena deploy] Upload batch {batch_index + 1}/{total_batches}",
            )
            return len(staged), unreadable
        except Exception as e:
            if attempt < max_retries - 1:
                logger.warning(f"Batch {batch_index + 1} failed (attempt {attempt + 1}), retrying: {e}")
            else:
                logger.error(f"Batch {batch_index + 1} failed after {max_retries} attempts: {e}")
                return 0, len(staged) + unreadable

    # Defensive fallback (only reachable if max_retries < 1).
    return 0, len(staged) + unreadable
205
+
206
+
207
+ def upload_arena_data_for_cdn(
208
+ arena_dir: str,
209
+ repo_id: str,
210
+ subsets: Optional[list[str]] = None,
211
+ overwrite: bool = False,
212
+ show_progress: bool = True,
213
+ token: Optional[str] = None,
214
+ num_workers: int = DEFAULT_NUM_WORKERS,
215
+ worker_timeout: int = DEFAULT_WORKER_TIMEOUT,
216
+ ) -> tuple[bool, str]:
217
+ """
218
+ Upload arena data with images as individual files (not ZIP) for CDN access.
219
+
220
+ This function follows symlinks to upload actual image files.
221
+ Models directory often contains symlinks to external image directories.
222
+
223
+ Directory structure uploaded:
224
+ {subset}/models/{exp_name}/{model}/{index}.png (individual images)
225
+ {subset}/pk_logs/{exp_name}/*.jsonl (battle logs)
226
+ {subset}/arena/state.json (ELO state)
227
+
228
+ Args:
229
+ arena_dir: Path to the arena directory
230
+ repo_id: HuggingFace repository ID
231
+ subsets: List of subsets to upload (None = all)
232
+ overwrite: If True, overwrite existing files
233
+ show_progress: If True, show progress bar
234
+ token: HuggingFace token
235
+ num_workers: Number of parallel workers for upload (default: 16)
236
+ worker_timeout: Timeout in seconds for each worker (default: 300)
237
+
238
+ Returns:
239
+ Tuple of (success, message)
240
+ """
241
+ from huggingface_hub import HfApi
242
+
243
+ if token is None:
244
+ from genarena.sync.hf_ops import require_hf_token
245
+ token = require_hf_token()
246
+
247
+ api = HfApi(token=token)
248
+
249
+ # Validate arena directory
250
+ if not os.path.isdir(arena_dir):
251
+ return False, f"Arena directory not found: {arena_dir}"
252
+
253
+ # Discover subsets
254
+ available_subsets = [
255
+ d for d in os.listdir(arena_dir)
256
+ if os.path.isdir(os.path.join(arena_dir, d)) and not d.startswith(".")
257
+ ]
258
+
259
+ if subsets:
260
+ target_subsets = [s for s in subsets if s in available_subsets]
261
+ else:
262
+ target_subsets = available_subsets
263
+
264
+ if not target_subsets:
265
+ return False, "No subsets found to upload"
266
+
267
+ logger.info(f"Target subsets: {target_subsets}")
268
+
269
+ # Collect all files to upload (following symlinks)
270
+ all_files: list[tuple[str, str]] = []
271
+
272
+ for subset in target_subsets:
273
+ subset_dir = os.path.join(arena_dir, subset)
274
+ logger.info(f"Scanning subset: {subset}")
275
+
276
+ # Collect files from models/, pk_logs/, arena/
277
+ for subdir in ["models", "pk_logs", "arena"]:
278
+ subdir_path = os.path.join(subset_dir, subdir)
279
+ if os.path.isdir(subdir_path):
280
+ files = collect_files_follow_symlinks(subdir_path, f"{subset}/{subdir}")
281
+ all_files.extend(files)
282
+ logger.info(f" {subdir}: {len(files)} files")
283
+
284
+ if not all_files:
285
+ return False, "No files found to upload"
286
+
287
+ logger.info(f"Total files to upload: {len(all_files)}")
288
+
289
+ # Filter by extension (only upload relevant files)
290
+ valid_extensions = {".png", ".jpg", ".jpeg", ".webp", ".json", ".jsonl"}
291
+ all_files = [
292
+ (local, remote) for local, remote in all_files
293
+ if os.path.splitext(local)[1].lower() in valid_extensions
294
+ ]
295
+ logger.info(f"Files after extension filtering: {len(all_files)}")
296
+
297
+ # Filter out files in subdirectories under models/<exp>/<model>/
298
+ # Expected structure: {subset}/models/{exp}/{model}/{file}
299
+ # Files deeper than this (e.g., {subset}/models/{exp}/{model}/subfolder/{file}) should be skipped
300
+ def is_valid_model_path(remote: str) -> bool:
301
+ parts = remote.split("/")
302
+ # Non-models paths are always valid
303
+ if len(parts) < 2 or parts[1] != "models":
304
+ return True
305
+ # For models paths, expect exactly: subset/models/exp/model/file (5 parts)
306
+ return len(parts) == 5
307
+
308
+ before_depth_filter = len(all_files)
309
+ all_files = [(local, remote) for local, remote in all_files if is_valid_model_path(remote)]
310
+ depth_filtered = before_depth_filter - len(all_files)
311
+ if depth_filtered > 0:
312
+ logger.info(f"Skipped {depth_filtered} files in model subdirectories")
313
+ logger.info(f"Files after filtering: {len(all_files)}")
314
+
315
+ # Get existing files in repo (for skip check)
316
+ existing_files: set[str] = set()
317
+ if not overwrite:
318
+ try:
319
+ existing_files = set(api.list_repo_files(repo_id=repo_id, repo_type="dataset"))
320
+ logger.info(f"Existing files in repo: {len(existing_files)}")
321
+ except Exception:
322
+ pass
323
+
324
+ # Filter out existing files
325
+ if not overwrite:
326
+ original_count = len(all_files)
327
+ all_files = [
328
+ (local, remote) for local, remote in all_files
329
+ if remote not in existing_files
330
+ ]
331
+ skipped = original_count - len(all_files)
332
+ logger.info(f"Skipping {skipped} existing files, {len(all_files)} to upload")
333
+ else:
334
+ skipped = 0
335
+
336
+ if not all_files:
337
+ return True, f"All files already exist. Skipped {skipped} files."
338
+
339
+ # Upload in batches using create_commit with multiprocessing
340
+ batch_size = 500 # HuggingFace recommends smaller batches for large files
341
+ max_retries = 3
342
+
343
+ # Create batches
344
+ batches = []
345
+ for i in range(0, len(all_files), batch_size):
346
+ batch = all_files[i:i + batch_size]
347
+ batches.append(batch)
348
+
349
+ total_batches = len(batches)
350
+ logger.info(f"Uploading {total_batches} batches with {num_workers} workers (timeout: {worker_timeout}s per worker)")
351
+
352
+ # Prepare worker arguments
353
+ worker_args = [
354
+ (i, batch, repo_id, token, total_batches, max_retries)
355
+ for i, batch in enumerate(batches)
356
+ ]
357
+
358
+ total_uploaded = 0
359
+ total_failed = 0
360
+
361
+ # Use multiprocessing pool
362
+ with Pool(processes=num_workers) as pool:
363
+ if show_progress:
364
+ try:
365
+ from tqdm import tqdm
366
+ results = list(tqdm(
367
+ pool.imap_unordered(_upload_batch_worker, worker_args),
368
+ total=total_batches,
369
+ desc="Uploading batches",
370
+ unit="batch",
371
+ ))
372
+ except ImportError:
373
+ results = []
374
+ for args in worker_args:
375
+ try:
376
+ result = pool.apply_async(_upload_batch_worker, (args,))
377
+ uploaded, failed = result.get(timeout=worker_timeout)
378
+ results.append((uploaded, failed))
379
+ except Exception as e:
380
+ logger.error(f"Worker timeout or error: {e}")
381
+ results.append((0, len(args[1])))
382
+ else:
383
+ results = []
384
+ for args in worker_args:
385
+ try:
386
+ result = pool.apply_async(_upload_batch_worker, (args,))
387
+ uploaded, failed = result.get(timeout=worker_timeout)
388
+ results.append((uploaded, failed))
389
+ except Exception as e:
390
+ logger.error(f"Worker timeout or error: {e}")
391
+ results.append((0, len(args[1])))
392
+
393
+ # Aggregate results
394
+ for uploaded, failed in results:
395
+ total_uploaded += uploaded
396
+ total_failed += failed
397
+
398
+ return True, f"Uploaded {total_uploaded}, skipped {skipped}, failed {total_failed} files"
399
+
400
+
401
+ def upload_deploy_files(
402
+ space_repo: str,
403
+ overwrite: bool = False,
404
+ token: Optional[str] = None,
405
+ ) -> tuple[bool, str]:
406
+ """
407
+ Upload deploy files (Dockerfile, app.py, README.md) to Space repo.
408
+
409
+ Args:
410
+ space_repo: HF Space repo ID
411
+ overwrite: Overwrite existing files
412
+ token: HF token
413
+
414
+ Returns:
415
+ Tuple of (success, message)
416
+ """
417
+ from huggingface_hub import HfApi
418
+
419
+ from genarena.sync.hf_ops import upload_file
420
+
421
+ if token is None:
422
+ from genarena.sync.hf_ops import require_hf_token
423
+
424
+ token = require_hf_token()
425
+
426
+ api = HfApi(token=token)
427
+
428
+ # Get deploy directory
429
+ deploy_dir = os.path.dirname(os.path.abspath(__file__))
430
+ deploy_dir = os.path.join(os.path.dirname(deploy_dir), "deploy")
431
+
432
+ if not os.path.isdir(deploy_dir):
433
+ return False, f"Deploy directory not found: {deploy_dir}"
434
+
435
+ # Files to upload
436
+ deploy_files = [
437
+ ("Dockerfile", "Dockerfile"),
438
+ ("app.py", "genarena/deploy/app.py"),
439
+ ("README.md", "README.md"),
440
+ ]
441
+
442
+ # Get existing files
443
+ existing_files: set[str] = set()
444
+ if not overwrite:
445
+ try:
446
+ existing_files = set(
447
+ api.list_repo_files(repo_id=space_repo, repo_type="space")
448
+ )
449
+ except Exception:
450
+ pass
451
+
452
+ uploaded = 0
453
+ skipped = 0
454
+ failed = 0
455
+
456
+ for local_name, remote_path in deploy_files:
457
+ local_path = os.path.join(deploy_dir, local_name)
458
+ if not os.path.isfile(local_path):
459
+ logger.warning(f"Deploy file not found: {local_path}")
460
+ continue
461
+
462
+ if not overwrite and remote_path in existing_files:
463
+ skipped += 1
464
+ continue
465
+
466
+ success, msg = upload_file(
467
+ repo_id=space_repo,
468
+ local_path=local_path,
469
+ remote_path=remote_path,
470
+ token=token,
471
+ commit_message=f"Upload {remote_path}",
472
+ repo_type="space",
473
+ )
474
+ if success:
475
+ uploaded += 1
476
+ else:
477
+ failed += 1
478
+ logger.warning(f"Failed to upload {remote_path}: {msg}")
479
+
480
+ # Also upload the genarena package files needed for the Space
481
+ # We need to upload the entire genarena package
482
+ success, msg = upload_genarena_package(space_repo, token, overwrite)
483
+ if not success:
484
+ return False, f"Failed to upload genarena package: {msg}"
485
+
486
+ return True, f"Uploaded {uploaded}, skipped {skipped}, failed {failed} deploy files. {msg}"
487
+
488
+
489
+ def upload_genarena_package(
490
+ space_repo: str,
491
+ token: str,
492
+ overwrite: bool = False,
493
+ ) -> tuple[bool, str]:
494
+ """
495
+ Upload the genarena package to the Space repo.
496
+
497
+ Args:
498
+ space_repo: HF Space repo ID
499
+ token: HF token
500
+ overwrite: Overwrite existing files
501
+
502
+ Returns:
503
+ Tuple of (success, message)
504
+ """
505
+ from huggingface_hub import HfApi
506
+
507
+ api = HfApi(token=token)
508
+
509
+ # Get genarena package directory
510
+ genarena_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
511
+ project_root = os.path.dirname(genarena_dir)
512
+
513
+ try:
514
+ # Upload pyproject.toml
515
+ pyproject_path = os.path.join(project_root, "pyproject.toml")
516
+ if os.path.isfile(pyproject_path):
517
+ api.upload_file(
518
+ repo_id=space_repo,
519
+ path_or_fileobj=pyproject_path,
520
+ path_in_repo="pyproject.toml",
521
+ repo_type="space",
522
+ commit_message="Upload pyproject.toml",
523
+ )
524
+
525
+ # Upload genarena package using upload_folder
526
+ api.upload_folder(
527
+ repo_id=space_repo,
528
+ folder_path=genarena_dir,
529
+ path_in_repo="genarena",
530
+ repo_type="space",
531
+ commit_message="[genarena deploy] Upload genarena package",
532
+ allow_patterns=["**/*.py", "**/*.html", "**/*.css", "**/*.js"],
533
+ ignore_patterns=["**/__pycache__/**", "**/.pytest_cache/**"],
534
+ )
535
+
536
+ return True, "Package uploaded successfully"
537
+ except Exception as e:
538
+ logger.error(f"Failed to upload package: {e}")
539
+ return False, str(e)
genarena/sync/git_ops.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Git operations module for GenArena.
3
+
4
+ This module provides Git version control functionality for arena data,
5
+ including initialization, commit, remote configuration, and push operations.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import subprocess
11
+ from datetime import datetime
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Patterns to exclude from Git tracking (models directories contain large image files)
18
+ GITIGNORE_PATTERNS = [
19
+ "# GenArena: Exclude model output images (large files)",
20
+ "*/models/",
21
+ "",
22
+ "# Python cache",
23
+ "__pycache__/",
24
+ "*.pyc",
25
+ "",
26
+ "# OS files",
27
+ ".DS_Store",
28
+ "Thumbs.db",
29
+ ]
30
+
31
+
32
+ def _run_git_command(
33
+ arena_dir: str,
34
+ args: list,
35
+ check: bool = True,
36
+ capture_output: bool = True,
37
+ ) -> subprocess.CompletedProcess:
38
+ """
39
+ Run a git command in the arena directory.
40
+
41
+ Args:
42
+ arena_dir: Path to the arena directory
43
+ args: Git command arguments (without 'git' prefix)
44
+ check: If True, raise exception on non-zero exit code
45
+ capture_output: If True, capture stdout and stderr
46
+
47
+ Returns:
48
+ CompletedProcess instance
49
+
50
+ Raises:
51
+ subprocess.CalledProcessError: If check=True and command fails
52
+ """
53
+ cmd = ["git"] + args
54
+ return subprocess.run(
55
+ cmd,
56
+ cwd=arena_dir,
57
+ check=check,
58
+ capture_output=capture_output,
59
+ text=True,
60
+ )
61
+
62
+
63
+ def is_git_initialized(arena_dir: str) -> bool:
64
+ """
65
+ Check if the arena directory is a Git repository.
66
+
67
+ Args:
68
+ arena_dir: Path to the arena directory
69
+
70
+ Returns:
71
+ True if Git is initialized, False otherwise
72
+ """
73
+ git_dir = os.path.join(arena_dir, ".git")
74
+ return os.path.isdir(git_dir)
75
+
76
+
77
+ def git_init(arena_dir: str) -> tuple[bool, str]:
78
+ """
79
+ Initialize a Git repository in the arena directory.
80
+
81
+ Args:
82
+ arena_dir: Path to the arena directory
83
+
84
+ Returns:
85
+ Tuple of (success, message)
86
+ """
87
+ if is_git_initialized(arena_dir):
88
+ return True, "Git repository already initialized"
89
+
90
+ # Ensure directory exists
91
+ os.makedirs(arena_dir, exist_ok=True)
92
+
93
+ try:
94
+ result = _run_git_command(arena_dir, ["init"])
95
+ logger.info(f"Initialized Git repository in {arena_dir}")
96
+
97
+ # Ensure .gitignore is set up
98
+ ensure_gitignore(arena_dir)
99
+
100
+ return True, "Git repository initialized successfully"
101
+ except subprocess.CalledProcessError as e:
102
+ error_msg = f"Failed to initialize Git repository: {e.stderr}"
103
+ logger.error(error_msg)
104
+ return False, error_msg
105
+
106
+
107
+ def ensure_gitignore(arena_dir: str) -> tuple[bool, str]:
108
+ """
109
+ Create or update .gitignore file to exclude models directories.
110
+
111
+ Args:
112
+ arena_dir: Path to the arena directory
113
+
114
+ Returns:
115
+ Tuple of (success, message)
116
+ """
117
+ gitignore_path = os.path.join(arena_dir, ".gitignore")
118
+
119
+ existing_content = ""
120
+ if os.path.isfile(gitignore_path):
121
+ with open(gitignore_path, "r", encoding="utf-8") as f:
122
+ existing_content = f.read()
123
+
124
+ # Check if the key pattern already exists
125
+ key_pattern = "*/models/"
126
+ if key_pattern in existing_content:
127
+ return True, ".gitignore already contains required patterns"
128
+
129
+ # Append patterns to existing content
130
+ new_content = existing_content
131
+ if new_content and not new_content.endswith("\n"):
132
+ new_content += "\n"
133
+
134
+ if new_content:
135
+ new_content += "\n"
136
+
137
+ new_content += "\n".join(GITIGNORE_PATTERNS)
138
+
139
+ with open(gitignore_path, "w", encoding="utf-8") as f:
140
+ f.write(new_content)
141
+
142
+ logger.info(f"Updated .gitignore in {arena_dir}")
143
+ return True, ".gitignore updated successfully"
144
+
145
+
146
+ def git_add_all(arena_dir: str) -> tuple[bool, str]:
147
+ """
148
+ Stage all changes in the arena directory (respecting .gitignore).
149
+
150
+ Args:
151
+ arena_dir: Path to the arena directory
152
+
153
+ Returns:
154
+ Tuple of (success, message)
155
+ """
156
+ if not is_git_initialized(arena_dir):
157
+ return False, "Git repository not initialized"
158
+
159
+ try:
160
+ _run_git_command(arena_dir, ["add", "-A"])
161
+ return True, "All changes staged"
162
+ except subprocess.CalledProcessError as e:
163
+ error_msg = f"Failed to stage changes: {e.stderr}"
164
+ logger.error(error_msg)
165
+ return False, error_msg
166
+
167
+
168
+ def has_uncommitted_changes(arena_dir: str) -> bool:
169
+ """
170
+ Check if there are uncommitted changes in the repository.
171
+
172
+ Args:
173
+ arena_dir: Path to the arena directory
174
+
175
+ Returns:
176
+ True if there are uncommitted changes, False otherwise
177
+ """
178
+ if not is_git_initialized(arena_dir):
179
+ return False
180
+
181
+ try:
182
+ # Check for staged changes
183
+ result = _run_git_command(arena_dir, ["diff", "--cached", "--quiet"], check=False)
184
+ if result.returncode != 0:
185
+ return True
186
+
187
+ # Check for unstaged changes
188
+ result = _run_git_command(arena_dir, ["diff", "--quiet"], check=False)
189
+ if result.returncode != 0:
190
+ return True
191
+
192
+ # Check for untracked files (that aren't ignored)
193
+ result = _run_git_command(
194
+ arena_dir,
195
+ ["ls-files", "--others", "--exclude-standard"],
196
+ check=False
197
+ )
198
+ if result.stdout.strip():
199
+ return True
200
+
201
+ return False
202
+ except Exception as e:
203
+ logger.warning(f"Error checking for uncommitted changes: {e}")
204
+ return False
205
+
206
+
207
+ def git_commit(
208
+ arena_dir: str,
209
+ message: Optional[str] = None,
210
+ command_name: Optional[str] = None,
211
+ ) -> tuple[bool, str]:
212
+ """
213
+ Commit staged changes.
214
+
215
+ Args:
216
+ arena_dir: Path to the arena directory
217
+ message: Custom commit message (optional)
218
+ command_name: Name of the command that triggered this commit (for auto-commit)
219
+
220
+ Returns:
221
+ Tuple of (success, message)
222
+ """
223
+ if not is_git_initialized(arena_dir):
224
+ return False, "Git repository not initialized"
225
+
226
+ # Stage all changes first
227
+ success, msg = git_add_all(arena_dir)
228
+ if not success:
229
+ return False, msg
230
+
231
+ # Check if there's anything to commit
232
+ result = _run_git_command(arena_dir, ["diff", "--cached", "--quiet"], check=False)
233
+ if result.returncode == 0:
234
+ return True, "Nothing to commit, working tree clean"
235
+
236
+ # Generate commit message
237
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
238
+ if message:
239
+ commit_msg = message
240
+ elif command_name:
241
+ commit_msg = f"[genarena] Auto commit after {command_name} at {timestamp}"
242
+ else:
243
+ commit_msg = f"[genarena] Auto commit at {timestamp}"
244
+
245
+ try:
246
+ _run_git_command(arena_dir, ["commit", "-m", commit_msg])
247
+ logger.info(f"Committed changes: {commit_msg}")
248
+ return True, f"Committed: {commit_msg}"
249
+ except subprocess.CalledProcessError as e:
250
+ error_msg = f"Failed to commit: {e.stderr}"
251
+ logger.error(error_msg)
252
+ return False, error_msg
253
+
254
+
255
+ def git_remote_get_url(arena_dir: str, remote_name: str = "origin") -> Optional[str]:
256
+ """
257
+ Get the URL of a remote repository.
258
+
259
+ Args:
260
+ arena_dir: Path to the arena directory
261
+ remote_name: Name of the remote (default: origin)
262
+
263
+ Returns:
264
+ Remote URL or None if not configured
265
+ """
266
+ if not is_git_initialized(arena_dir):
267
+ return None
268
+
269
+ try:
270
+ result = _run_git_command(
271
+ arena_dir,
272
+ ["remote", "get-url", remote_name],
273
+ check=False
274
+ )
275
+ if result.returncode == 0:
276
+ return result.stdout.strip()
277
+ return None
278
+ except Exception:
279
+ return None
280
+
281
+
282
+ def git_remote_add(
283
+ arena_dir: str,
284
+ url: str,
285
+ remote_name: str = "origin",
286
+ force: bool = False,
287
+ ) -> tuple[bool, str]:
288
+ """
289
+ Configure a remote repository.
290
+
291
+ Args:
292
+ arena_dir: Path to the arena directory
293
+ url: Remote repository URL
294
+ remote_name: Name of the remote (default: origin)
295
+ force: If True, overwrite existing remote URL
296
+
297
+ Returns:
298
+ Tuple of (success, message)
299
+ """
300
+ if not is_git_initialized(arena_dir):
301
+ return False, "Git repository not initialized"
302
+
303
+ existing_url = git_remote_get_url(arena_dir, remote_name)
304
+
305
+ if existing_url:
306
+ if existing_url == url:
307
+ return True, f"Remote '{remote_name}' already configured with this URL"
308
+
309
+ if not force:
310
+ return False, (
311
+ f"Remote '{remote_name}' already exists with URL: {existing_url}. "
312
+ f"Use --force to overwrite."
313
+ )
314
+
315
+ # Remove existing remote
316
+ try:
317
+ _run_git_command(arena_dir, ["remote", "remove", remote_name])
318
+ except subprocess.CalledProcessError as e:
319
+ return False, f"Failed to remove existing remote: {e.stderr}"
320
+
321
+ # Add remote
322
+ try:
323
+ _run_git_command(arena_dir, ["remote", "add", remote_name, url])
324
+ logger.info(f"Added remote '{remote_name}': {url}")
325
+ return True, f"Remote '{remote_name}' configured: {url}"
326
+ except subprocess.CalledProcessError as e:
327
+ error_msg = f"Failed to add remote: {e.stderr}"
328
+ logger.error(error_msg)
329
+ return False, error_msg
330
+
331
+
332
+ def git_push(
333
+ arena_dir: str,
334
+ remote_name: str = "origin",
335
+ branch: Optional[str] = None,
336
+ set_upstream: bool = True,
337
+ ) -> tuple[bool, str]:
338
+ """
339
+ Push commits to the remote repository.
340
+
341
+ Args:
342
+ arena_dir: Path to the arena directory
343
+ remote_name: Name of the remote (default: origin)
344
+ branch: Branch name (default: current branch)
345
+ set_upstream: If True, set upstream tracking
346
+
347
+ Returns:
348
+ Tuple of (success, message)
349
+ """
350
+ if not is_git_initialized(arena_dir):
351
+ return False, "Git repository not initialized"
352
+
353
+ # Check if remote is configured
354
+ remote_url = git_remote_get_url(arena_dir, remote_name)
355
+ if not remote_url:
356
+ return False, f"Remote '{remote_name}' not configured. Use 'genarena git remote --url <url>' first."
357
+
358
+ # Get current branch if not specified
359
+ if not branch:
360
+ try:
361
+ result = _run_git_command(arena_dir, ["branch", "--show-current"])
362
+ branch = result.stdout.strip()
363
+ if not branch:
364
+ # Might be on a detached HEAD, try to get default branch
365
+ branch = "main"
366
+ except subprocess.CalledProcessError:
367
+ branch = "main"
368
+
369
+ # Push
370
+ try:
371
+ push_args = ["push"]
372
+ if set_upstream:
373
+ push_args.extend(["-u", remote_name, branch])
374
+ else:
375
+ push_args.extend([remote_name, branch])
376
+
377
+ _run_git_command(arena_dir, push_args)
378
+ logger.info(f"Pushed to {remote_name}/{branch}")
379
+ return True, f"Pushed to {remote_name}/{branch}"
380
+ except subprocess.CalledProcessError as e:
381
+ error_msg = f"Failed to push: {e.stderr}"
382
+ logger.error(error_msg)
383
+ return False, error_msg
384
+
385
+
386
+ def git_sync(arena_dir: str) -> tuple[bool, str]:
387
+ """
388
+ Commit all changes and push to remote (one-click sync).
389
+
390
+ Args:
391
+ arena_dir: Path to the arena directory
392
+
393
+ Returns:
394
+ Tuple of (success, message)
395
+ """
396
+ if not is_git_initialized(arena_dir):
397
+ return False, "Git repository not initialized"
398
+
399
+ messages = []
400
+
401
+ # Commit changes
402
+ success, msg = git_commit(arena_dir)
403
+ messages.append(msg)
404
+
405
+ if not success and "Nothing to commit" not in msg:
406
+ return False, msg
407
+
408
+ # Push to remote
409
+ success, msg = git_push(arena_dir)
410
+ messages.append(msg)
411
+
412
+ if not success:
413
+ # If push fails due to no remote, still return partial success
414
+ if "not configured" in msg:
415
+ return True, f"{messages[0]} (push skipped: {msg})"
416
+ return False, msg
417
+
418
+ return True, " | ".join(messages)
genarena/sync/hf_ops.py ADDED
@@ -0,0 +1,887 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Huggingface operations module for GenArena.
3
+
4
+ This module provides functionality for uploading and downloading
5
+ arena data to/from Huggingface Dataset repositories.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import time
11
+ import functools
12
+ from typing import Any, Callable, Optional, TypeVar
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Type variable for retry decorator
17
+ T = TypeVar("T")
18
+
19
+ # Default retry configuration
20
+ DEFAULT_MAX_RETRIES = 3
21
+ DEFAULT_RETRY_DELAY = 2.0
22
+ DEFAULT_RETRY_BACKOFF = 2.0 # Exponential backoff multiplier
23
+
24
+
25
+ def retry_on_failure(
26
+ max_retries: int = DEFAULT_MAX_RETRIES,
27
+ delay: float = DEFAULT_RETRY_DELAY,
28
+ backoff: float = DEFAULT_RETRY_BACKOFF,
29
+ exceptions: tuple = (Exception,),
30
+ ) -> Callable:
31
+ """
32
+ Decorator that retries a function on failure with exponential backoff.
33
+
34
+ Args:
35
+ max_retries: Maximum number of retry attempts
36
+ delay: Initial delay between retries in seconds
37
+ backoff: Multiplier for delay after each retry
38
+ exceptions: Tuple of exception types to catch and retry
39
+
40
+ Returns:
41
+ Decorated function
42
+ """
43
+ def decorator(func: Callable[..., T]) -> Callable[..., T]:
44
+ @functools.wraps(func)
45
+ def wrapper(*args: Any, **kwargs: Any) -> T:
46
+ current_delay = delay
47
+ last_exception = None
48
+
49
+ for attempt in range(max_retries + 1):
50
+ try:
51
+ return func(*args, **kwargs)
52
+ except exceptions as e:
53
+ last_exception = e
54
+ if attempt < max_retries:
55
+ logger.warning(
56
+ f"{func.__name__} failed (attempt {attempt + 1}/{max_retries + 1}): {e}. "
57
+ f"Retrying in {current_delay:.1f}s..."
58
+ )
59
+ time.sleep(current_delay)
60
+ current_delay *= backoff
61
+ else:
62
+ logger.error(
63
+ f"{func.__name__} failed after {max_retries + 1} attempts: {e}"
64
+ )
65
+
66
+ # Re-raise the last exception
67
+ raise last_exception # type: ignore
68
+
69
+ return wrapper
70
+ return decorator
71
+
72
+ # Environment variable for HF token
73
+ HF_TOKEN_ENV = "HF_TOKEN"
74
+
75
+
76
+ def get_hf_token() -> Optional[str]:
77
+ """
78
+ Get the Huggingface token from environment variable.
79
+
80
+ Returns:
81
+ Token string or None if not set
82
+ """
83
+ return os.environ.get(HF_TOKEN_ENV)
84
+
85
+
86
+ def require_hf_token() -> str:
87
+ """
88
+ Get the Huggingface token, raising an error if not set.
89
+
90
+ Returns:
91
+ Token string
92
+
93
+ Raises:
94
+ ValueError: If HF_TOKEN environment variable is not set
95
+ """
96
+ token = get_hf_token()
97
+ if not token:
98
+ raise ValueError(
99
+ f"Environment variable {HF_TOKEN_ENV} is not set. "
100
+ f"Please set it with your Huggingface token: "
101
+ f"export {HF_TOKEN_ENV}='your_token_here'"
102
+ )
103
+ return token
104
+
105
+
106
+ def validate_dataset_repo(repo_id: str, token: Optional[str] = None) -> tuple[bool, str]:
107
+ """
108
+ Validate that a repository exists and is a Dataset type.
109
+
110
+ Args:
111
+ repo_id: Repository ID (e.g., "username/repo-name")
112
+ token: Huggingface token (optional for public repos)
113
+
114
+ Returns:
115
+ Tuple of (is_valid, message)
116
+ """
117
+ try:
118
+ from huggingface_hub import HfApi
119
+ from huggingface_hub.utils import RepositoryNotFoundError
120
+
121
+ api = HfApi(token=token)
122
+
123
+ try:
124
+ repo_info = api.repo_info(repo_id=repo_id, repo_type="dataset")
125
+ return True, f"Valid Dataset repository: {repo_id}"
126
+ except RepositoryNotFoundError:
127
+ # Try to check if it exists as a different type
128
+ try:
129
+ # Check if it's a model repo
130
+ api.repo_info(repo_id=repo_id, repo_type="model")
131
+ return False, (
132
+ f"Repository '{repo_id}' exists but is a Model repository, not a Dataset. "
133
+ f"Please create a Dataset repository on Huggingface."
134
+ )
135
+ except RepositoryNotFoundError:
136
+ pass
137
+
138
+ try:
139
+ # Check if it's a space repo
140
+ api.repo_info(repo_id=repo_id, repo_type="space")
141
+ return False, (
142
+ f"Repository '{repo_id}' exists but is a Space repository, not a Dataset. "
143
+ f"Please create a Dataset repository on Huggingface."
144
+ )
145
+ except RepositoryNotFoundError:
146
+ pass
147
+
148
+ return False, (
149
+ f"Repository '{repo_id}' does not exist. "
150
+ f"Please create a Dataset repository on Huggingface first: "
151
+ f"https://huggingface.co/new-dataset"
152
+ )
153
+
154
+ except ImportError:
155
+ return False, (
156
+ "huggingface_hub package is not installed. "
157
+ "Please install it with: pip install huggingface_hub"
158
+ )
159
+ except Exception as e:
160
+ return False, f"Error validating repository: {e}"
161
+
162
+
163
def list_repo_files(
    repo_id: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> tuple[bool, list[str], str]:
    """
    List all files in a Huggingface Dataset repository.

    Args:
        repo_id: Repository ID
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        Tuple of (success, file_list, message)
    """
    try:
        from huggingface_hub import HfApi

        client = HfApi(token=token)
        names = client.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
        return True, list(names), f"Found {len(names)} files in {repo_id}"
    except Exception as e:
        # Any failure (missing package, auth, network, unknown repo) is
        # reported as a non-fatal (False, [], message) result.
        return False, [], f"Error listing repository files: {e}"
194
+
195
+
196
def get_repo_file_info(
    repo_id: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> tuple[bool, list[dict], str]:
    """
    Get detailed file information from a Huggingface Dataset repository.

    Args:
        repo_id: Repository ID
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        Tuple of (success, file_info_list, message) where each entry in
        file_info_list is a dict with "path", "size" and "blob_id" keys.
    """
    try:
        from huggingface_hub import HfApi

        client = HfApi(token=token)
        # files_metadata=True asks the Hub to populate per-file size/blob data.
        info = client.repo_info(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
            files_metadata=True,
        )

        details = [
            {
                "path": sibling.rfilename,
                "size": sibling.size,
                "blob_id": sibling.blob_id,
            }
            for sibling in (info.siblings or [])
        ]

        return True, details, f"Found {len(details)} files in {repo_id}"

    except Exception as e:
        return False, [], f"Error getting repository info: {e}"
237
+
238
+
239
def upload_file(
    repo_id: str,
    local_path: str,
    remote_path: str,
    token: str,
    commit_message: Optional[str] = None,
    max_retries: int = DEFAULT_MAX_RETRIES,
    repo_type: str = "dataset",
) -> tuple[bool, str]:
    """
    Upload a single file to a Huggingface repository with retry support.

    Args:
        repo_id: Repository ID
        local_path: Local file path
        remote_path: Path in the repository
        token: Huggingface token
        commit_message: Optional commit message
        max_retries: Maximum number of retry attempts on failure
        repo_type: Repository type ("dataset", "model", or "space")

    Returns:
        Tuple of (success, message)
    """
    from huggingface_hub import HfApi

    client = HfApi(token=token)
    message = commit_message if commit_message else f"Upload {remote_path}"

    # Wrap the actual upload call so the module-level retry decorator can
    # re-run it with exponential backoff on transient failures.
    @retry_on_failure(
        max_retries=max_retries,
        delay=DEFAULT_RETRY_DELAY,
        backoff=DEFAULT_RETRY_BACKOFF,
    )
    def _push() -> None:
        client.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote_path,
            repo_id=repo_id,
            repo_type=repo_type,
            commit_message=message,
        )

    try:
        _push()
    except Exception as e:
        return False, f"Error uploading file: {e}"
    return True, f"Uploaded {remote_path}"
289
+
290
+
291
def upload_files_batch(
    repo_id: str,
    file_mappings: list[tuple[str, str]],
    token: str,
    commit_message: Optional[str] = None,
) -> tuple[bool, str]:
    """
    Upload multiple files in a single commit.

    Args:
        repo_id: Repository ID
        file_mappings: List of (local_path, remote_path) tuples
        token: Huggingface token
        commit_message: Optional commit message

    Returns:
        Tuple of (success, message)
    """
    try:
        from huggingface_hub import HfApi, CommitOperationAdd

        client = HfApi(token=token)
        message = commit_message if commit_message else f"Upload {len(file_mappings)} files"

        # One CommitOperationAdd per file; a single create_commit call makes
        # the whole batch land atomically in one commit.
        operations = []
        for local_path, remote_path in file_mappings:
            operations.append(
                CommitOperationAdd(
                    path_in_repo=remote_path,
                    path_or_fileobj=local_path,
                )
            )

        client.create_commit(
            repo_id=repo_id,
            repo_type="dataset",
            operations=operations,
            commit_message=message,
        )

        return True, f"Uploaded {len(file_mappings)} files"

    except Exception as e:
        return False, f"Error uploading files: {e}"
336
+
337
+
338
def download_file(
    repo_id: str,
    remote_path: str,
    local_path: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> tuple[bool, str]:
    """
    Download a single file from a Huggingface Dataset repository.

    Args:
        repo_id: Repository ID
        remote_path: Path in the repository
        local_path: Local file path to save to
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        Tuple of (success, message)
    """
    try:
        from huggingface_hub import hf_hub_download

        # Ensure the local directory exists. os.path.dirname() returns ""
        # when local_path is a bare filename, and os.makedirs("") raises
        # FileNotFoundError — guard against that case (bug fix; the old code
        # also called makedirs twice on the same directory).
        local_dir = os.path.dirname(local_path)
        if local_dir:
            os.makedirs(local_dir, exist_ok=True)

        # hf_hub_download writes remote_path relative to local_dir; fall back
        # to "." for bare filenames so the hub library gets a valid directory.
        # NOTE(review): local_dir_use_symlinks is deprecated in recent
        # huggingface_hub releases (ignored with a warning) — confirm the
        # pinned version before removing it.
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=remote_path,
            repo_type="dataset",
            revision=revision,
            token=token,
            local_dir=local_dir or ".",
            local_dir_use_symlinks=False,
        )

        # hf_hub_download may nest the file under remote_path's own
        # subdirectories; copy to the exact location the caller asked for.
        if downloaded_path != local_path:
            import shutil
            shutil.copy2(downloaded_path, local_path)

        return True, f"Downloaded {remote_path}"

    except Exception as e:
        return False, f"Error downloading file: {e}"
385
+
386
+
387
def check_file_exists(
    repo_id: str,
    remote_path: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> bool:
    """
    Check if a file exists in the repository.

    Args:
        repo_id: Repository ID
        remote_path: Path in the repository
        token: Huggingface token (optional for public repos)
        revision: Branch/revision name

    Returns:
        True if file exists
    """
    try:
        from huggingface_hub import HfApi

        listing = HfApi(token=token).list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
        return remote_path in listing
    except Exception:
        # Treat any lookup failure (missing package, network, unknown repo)
        # as "file not present".
        return False
419
+
420
+
421
def format_file_size(size_bytes: Optional[int]) -> str:
    """
    Format file size in human-readable format.

    Args:
        size_bytes: Size in bytes

    Returns:
        Human-readable size string, e.g. "1.5 KB"; "Unknown" for None.
    """
    if size_bytes is None:
        return "Unknown"

    # Walk up the unit ladder, dividing by 1024 until the magnitude fits.
    value = float(size_bytes)
    for unit in ("B", "KB", "MB", "GB", "TB"):
        if abs(value) < 1024.0:
            return f"{value:.1f} {unit}"
        value /= 1024.0

    # Anything >= 1024 TB is reported in petabytes.
    return f"{value:.1f} PB"
440
+
441
+
442
+ # =============================================================================
443
+ # High-level operations
444
+ # =============================================================================
445
+
446
def upload_arena_data(
    arena_dir: str,
    repo_id: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
    overwrite: bool = False,
    show_progress: bool = True,
    max_retries: int = DEFAULT_MAX_RETRIES,
) -> tuple[bool, str]:
    """
    Upload arena data to a Huggingface Dataset repository.

    This function:
    1. Validates the repository exists and is a Dataset type
    2. Collects files to upload based on filters
    3. Packs directories into ZIP files
    4. Uploads files with progress indication and retry on failure

    Supports resume upload: by default (overwrite=False), already uploaded files
    are automatically skipped, enabling resumable uploads after connection failures.

    Args:
        arena_dir: Path to the arena directory
        repo_id: Huggingface repository ID
        subsets: List of subsets to upload (None = all)
        models: List of models to upload (None = all)
        experiments: List of experiments (exp_name) to upload (None = all)
        overwrite: If True, overwrite existing files; if False, skip existing (resume mode)
        show_progress: If True, show progress bar
        max_retries: Maximum number of retry attempts per file on failure

    Returns:
        Tuple of (success, message). success is True when nothing failed OR
        at least one file uploaded (partial success counts as success).
    """
    # Deferred import to avoid a circular dependency between sync modules.
    from genarena.sync.packer import (
        collect_upload_tasks,
        pack_model_dir,
        pack_exp_dir,
        TempPackingContext,
        TaskType,
    )

    # Get token (uploads always require authentication)
    try:
        token = require_hf_token()
    except ValueError as e:
        return False, str(e)

    # Validate repository
    valid, msg = validate_dataset_repo(repo_id, token)
    if not valid:
        return False, msg

    logger.info(f"Uploading to repository: {repo_id}")

    # Collect upload tasks
    tasks = collect_upload_tasks(arena_dir, subsets, models, experiments)
    if not tasks:
        return False, "No files to upload. Check arena_dir and filters."

    logger.info(f"Found {len(tasks)} items to scan")

    # Get existing files in repo (for overwrite check). Best-effort: when the
    # listing fails, existing_files stays empty and everything is re-uploaded.
    existing_files = set()
    if not overwrite:
        logger.info("Checking existing files in remote repository...")
        success, files, _ = list_repo_files(repo_id, token)
        if success:
            existing_files = set(files)
            logger.info(f"Found {len(existing_files)} files in remote repository")

    # Pre-scan: categorize tasks into to_upload and to_skip
    to_upload = []
    to_skip = []
    for task in tasks:
        if not overwrite and task.remote_path in existing_files:
            to_skip.append(task)
        else:
            to_upload.append(task)

    # Display scan summary
    logger.info(f"Scan summary: {len(to_upload)} to upload, {len(to_skip)} already exist (will skip)")

    if to_skip:
        logger.info("Already uploaded (will skip):")
        for task in to_skip[:10]:
            logger.info(f"  ✓ {task.remote_path}")
        if len(to_skip) > 10:
            logger.info(f"  ... and {len(to_skip) - 10} more")

    if to_upload:
        logger.info("To be uploaded:")
        for task in to_upload[:10]:
            logger.info(f"  → {task.remote_path}")
        if len(to_upload) > 10:
            logger.info(f"  ... and {len(to_upload) - 10} more")

    if not to_upload:
        return True, f"All {len(to_skip)} files already exist in repository. Nothing to upload."

    # Process tasks (only those that need uploading)
    uploaded = 0
    skipped = len(to_skip)  # Pre-count skipped
    failed = 0
    errors = []

    # Setup progress bar (optional; tqdm wraps the list, iteration unchanged)
    if show_progress:
        try:
            from tqdm import tqdm
            to_upload = tqdm(to_upload, desc="Uploading", unit="file")
        except ImportError:
            pass

    # TempPackingContext owns the temporary ZIP files and cleans them up on exit.
    with TempPackingContext() as ctx:
        for task in to_upload:
            try:
                if task.task_type == TaskType.MODEL_ZIP:
                    # Pack model directory into a temp ZIP before uploading
                    zip_path = ctx.get_temp_zip_path(task.remote_path)
                    success, msg = pack_model_dir(task.local_path, zip_path)
                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    # Upload ZIP with retry
                    success, msg = upload_file(
                        repo_id, zip_path, task.remote_path, token,
                        commit_message=f"[genarena] Upload model: {task.subset}/{task.name}",
                        max_retries=max_retries,
                    )

                elif task.task_type == TaskType.EXP_ZIP:
                    # Pack experiment directory into a temp ZIP
                    zip_path = ctx.get_temp_zip_path(task.remote_path)
                    success, msg = pack_exp_dir(task.local_path, zip_path)
                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    # Upload ZIP with retry
                    success, msg = upload_file(
                        repo_id, zip_path, task.remote_path, token,
                        commit_message=f"[genarena] Upload experiment: {task.subset}/{task.name}",
                        max_retries=max_retries,
                    )

                elif task.task_type == TaskType.SMALL_FILE:
                    # Upload small file directly with retry (no packing)
                    success, msg = upload_file(
                        repo_id, task.local_path, task.remote_path, token,
                        commit_message=f"[genarena] Upload {task.name}",
                        max_retries=max_retries,
                    )

                else:
                    success = False
                    msg = f"Unknown task type: {task.task_type}"

                if success:
                    uploaded += 1
                    logger.debug(f"Uploaded: {task.remote_path}")
                else:
                    errors.append(f"{task.name}: {msg}")
                    failed += 1

            except Exception as e:
                # Per-task isolation: one bad task never aborts the batch.
                errors.append(f"{task.name}: {e}")
                failed += 1

    # Summary (only the first 5 errors are shown in detail)
    summary = f"Uploaded: {uploaded}, Skipped: {skipped}, Failed: {failed}"
    if errors:
        summary += f"\nErrors:\n" + "\n".join(f"  - {e}" for e in errors[:5])
        if len(errors) > 5:
            summary += f"\n  ... and {len(errors) - 5} more errors"

    repo_url = f"https://huggingface.co/datasets/{repo_id}"
    summary += f"\n\nRepository URL: {repo_url}"

    # Partial success (some uploads landed) still reports True.
    success = failed == 0 or uploaded > 0
    return success, summary
631
+
632
+
633
def pull_arena_data(
    arena_dir: str,
    repo_id: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
    revision: str = "main",
    overwrite: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str]:
    """
    Pull arena data from a Huggingface Dataset repository.

    This function:
    1. Validates the repository exists and is a Dataset type
    2. Lists files in the repository
    3. Filters based on subsets/models
    4. Downloads and unpacks ZIP files

    Args:
        arena_dir: Path to the local arena directory
        repo_id: Huggingface repository ID
        subsets: List of subsets to download (None = all)
        models: List of models to download (None = all)
        experiments: List of experiments (exp_name) to download (None = all)
        revision: Branch/revision to download from
        overwrite: If True, overwrite existing files
        show_progress: If True, show progress bar

    Returns:
        Tuple of (success, message). success is True when nothing failed OR
        at least one file downloaded (partial success counts as success).
    """
    import tempfile
    import shutil
    # Deferred import to avoid a circular dependency between sync modules.
    from genarena.sync.packer import (
        collect_download_tasks,
        unpack_zip,
        TaskType,
    )

    # Get token (optional for public repos)
    token = get_hf_token()

    # Validate repository
    valid, msg = validate_dataset_repo(repo_id, token)
    if not valid:
        return False, msg

    logger.info(f"Pulling from repository: {repo_id} (revision: {revision})")

    # List files in repository
    success, repo_files, msg = list_repo_files(repo_id, token, revision)
    if not success:
        return False, msg

    if not repo_files:
        return False, "Repository is empty"

    # Collect download tasks
    tasks = collect_download_tasks(repo_files, arena_dir, subsets, models, experiments)
    if not tasks:
        return False, "No matching files to download. Check filters."

    logger.info(f"Found {len(tasks)} items to download")

    # Process tasks
    downloaded = 0
    skipped = 0
    failed = 0
    errors = []

    # Setup progress bar (optional; tqdm wraps the list, iteration unchanged)
    if show_progress:
        try:
            from tqdm import tqdm
            tasks = tqdm(tasks, desc="Downloading", unit="file")
        except ImportError:
            pass

    # Create temp directory for downloads; ZIPs land here before unpacking.
    temp_dir = tempfile.mkdtemp(prefix="genarena_pull_")

    try:
        for task in tasks:
            try:
                if task.task_type in (TaskType.MODEL_ZIP, TaskType.EXP_ZIP):
                    # Download ZIP to temp location, then unpack into place.
                    # NOTE(review): ZIP tasks are not skipped when the target
                    # already exists — overwrite is handled inside unpack_zip.
                    temp_zip = os.path.join(temp_dir, os.path.basename(task.remote_path))
                    success, msg = download_file(
                        repo_id, task.remote_path, temp_zip, token, revision
                    )

                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    # Unpack ZIP
                    success, msg = unpack_zip(temp_zip, task.local_path, overwrite)
                    if not success:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1
                        continue

                    downloaded += 1
                    logger.debug(f"Downloaded and unpacked: {task.remote_path}")

                elif task.task_type == TaskType.SMALL_FILE:
                    # Check if file exists and skip if not overwriting
                    if os.path.exists(task.local_path) and not overwrite:
                        logger.debug(f"Skipping existing: {task.local_path}")
                        skipped += 1
                        continue

                    # Download file directly
                    success, msg = download_file(
                        repo_id, task.remote_path, task.local_path, token, revision
                    )

                    if success:
                        downloaded += 1
                        logger.debug(f"Downloaded: {task.remote_path}")
                    else:
                        errors.append(f"{task.name}: {msg}")
                        failed += 1

            except Exception as e:
                # Per-task isolation: one bad task never aborts the batch.
                errors.append(f"{task.name}: {e}")
                failed += 1

    finally:
        # Cleanup temp directory regardless of success/failure.
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Summary (only the first 5 errors are shown in detail)
    summary = f"Downloaded: {downloaded}, Skipped: {skipped}, Failed: {failed}"
    if errors:
        summary += f"\nErrors:\n" + "\n".join(f"  - {e}" for e in errors[:5])
        if len(errors) > 5:
            summary += f"\n  ... and {len(errors) - 5} more errors"

    # Partial success (some downloads landed) still reports True.
    success = failed == 0 or downloaded > 0
    return success, summary
776
+
777
+
778
def list_repo_contents(
    repo_id: str,
    revision: str = "main",
) -> tuple[bool, str]:
    """
    List contents of a Huggingface Dataset repository.

    Displays files organized by subset with size information. The top-level
    directory of each path is treated as the subset name; within a subset,
    files are grouped into Models ("/models/" in path), Experiments
    ("/pk_logs/" in path), and Other.

    Args:
        repo_id: Huggingface repository ID
        revision: Branch/revision name

    Returns:
        Tuple of (success, formatted_output)
    """
    # Get token (optional for public repos)
    token = get_hf_token()

    # Validate repository
    valid, msg = validate_dataset_repo(repo_id, token)
    if not valid:
        return False, msg

    # Get file info (path/size/blob_id dicts)
    success, file_infos, msg = get_repo_file_info(repo_id, token, revision)
    if not success:
        return False, msg

    if not file_infos:
        return True, f"Repository '{repo_id}' is empty"

    # Organize by subset: first path segment is the subset, bare top-level
    # files go to other_files.
    subsets: dict[str, list[dict]] = {}
    other_files: list[dict] = []

    for info in file_infos:
        path = info["path"]
        parts = path.split("/")

        if len(parts) >= 2:
            subset = parts[0]
            if subset not in subsets:
                subsets[subset] = []
            subsets[subset].append(info)
        else:
            other_files.append(info)

    # Format output
    lines = [
        f"Repository: {repo_id}",
        f"Revision: {revision}",
        f"Total files: {len(file_infos)}",
        "",
    ]

    # "or 0" guards against size being None in the metadata.
    total_size = sum(f.get("size", 0) or 0 for f in file_infos)
    lines.append(f"Total size: {format_file_size(total_size)}")
    lines.append("")

    for subset in sorted(subsets.keys()):
        files = subsets[subset]
        subset_size = sum(f.get("size", 0) or 0 for f in files)

        lines.append(f"=== {subset} ({len(files)} files, {format_file_size(subset_size)}) ===")

        # Organize by type within the subset
        models = []
        experiments = []
        other = []

        for f in files:
            path = f["path"]
            if "/models/" in path:
                models.append(f)
            elif "/pk_logs/" in path:
                experiments.append(f)
            else:
                other.append(f)

        if models:
            lines.append("  Models:")
            for f in sorted(models, key=lambda x: x["path"]):
                size = format_file_size(f.get("size"))
                name = os.path.basename(f["path"])
                lines.append(f"    - {name} ({size})")

        if experiments:
            lines.append("  Experiments:")
            for f in sorted(experiments, key=lambda x: x["path"]):
                size = format_file_size(f.get("size"))
                name = os.path.basename(f["path"])
                lines.append(f"    - {name} ({size})")

        if other:
            lines.append("  Other:")
            for f in sorted(other, key=lambda x: x["path"]):
                size = format_file_size(f.get("size"))
                # Strip the subset prefix; keep the rest of the relative path.
                name = f["path"].split("/", 1)[1] if "/" in f["path"] else f["path"]
                lines.append(f"    - {name} ({size})")

        lines.append("")

    if other_files:
        lines.append("=== Other files ===")
        for f in sorted(other_files, key=lambda x: x["path"]):
            size = format_file_size(f.get("size"))
            lines.append(f"  - {f['path']} ({size})")

    return True, "\n".join(lines)
genarena/sync/init_ops.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Initialization operations for GenArena.
3
+
4
+ This module provides functionality for one-click initialization of arena
5
+ directories, including downloading benchmark data and official arena data
6
+ from HuggingFace repositories.
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ from typing import Optional
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Default repository configurations
16
+ DEFAULT_BENCHMARK_REPO = "rhli/genarena"
17
+ DEFAULT_ARENA_REPO = "rhli/genarena-battlefield"
18
+
19
+
20
+ def _format_size(size_bytes: int) -> str:
21
+ """Format file size in human-readable format."""
22
+ if size_bytes < 1024:
23
+ return f"{size_bytes} B"
24
+ elif size_bytes < 1024 * 1024:
25
+ return f"{size_bytes / 1024:.1f} KB"
26
+ elif size_bytes < 1024 * 1024 * 1024:
27
+ return f"{size_bytes / 1024 / 1024:.1f} MB"
28
+ else:
29
+ return f"{size_bytes / 1024 / 1024 / 1024:.2f} GB"
30
+
31
+
32
def discover_repo_subsets(
    repo_id: str,
    token: Optional[str] = None,
    revision: str = "main",
) -> list[str]:
    """
    Discover available subsets in a HuggingFace repository.

    A subset is any top-level directory that contains parquet files,
    excluding hidden directories and the reserved names "data"/"raw".

    Args:
        repo_id: HuggingFace repository ID
        token: HuggingFace token (optional for public repos)
        revision: Repository revision/branch

    Returns:
        Sorted list of subset names found in the repository; empty on error.
    """
    from huggingface_hub import HfApi

    client = HfApi(token=token)

    try:
        files = client.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        )
    except Exception as e:
        # Best-effort: a listing failure just means "no subsets found".
        logger.warning(f"Failed to list repo files: {e}")
        return []

    found: set[str] = set()
    for path in files:
        # Only parquet files like <subset>/...parquet mark a subset.
        if not path.endswith(".parquet"):
            continue
        segments = path.split("/")
        if len(segments) < 2:
            continue
        top = segments[0]
        # Skip hidden directories and common non-subset directories
        if top.startswith(".") or top in ("data", "raw"):
            continue
        found.add(top)

    return sorted(found)
78
+
79
+
80
def download_benchmark_data(
    data_dir: str,
    repo_id: str = DEFAULT_BENCHMARK_REPO,
    subsets: Optional[list[str]] = None,
    revision: str = "main",
    overwrite: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str, dict]:
    """
    Download benchmark Parquet data from HuggingFace.

    Expected repository structure:
        <subset>/data-00000-of-00001.parquet
        <subset>/data-00001-of-00001.parquet
        ...

    Downloads to:
        data_dir/<subset>/data-*.parquet

    Args:
        data_dir: Local directory to save data
        repo_id: HuggingFace repository ID
        subsets: List of subsets to download (None = all available)
        revision: Repository revision/branch
        overwrite: If True, overwrite existing files
        show_progress: If True, show progress information

    Returns:
        Tuple of (success, message, stats_dict). stats_dict tracks
        downloaded/skipped/failed file counts, total bytes, and per-subset
        breakdowns. success is True when nothing failed OR at least one
        file downloaded (partial success counts as success).
    """
    from huggingface_hub import HfApi, hf_hub_download

    # Deferred import to avoid a circular dependency between sync modules.
    from genarena.sync.hf_ops import get_hf_token

    token = get_hf_token()
    api = HfApi(token=token)

    # Mutated in place throughout and returned even on early failure.
    stats = {
        "downloaded_files": 0,
        "skipped_files": 0,
        "failed_files": 0,
        "total_bytes": 0,
        "subsets": {},
    }

    # Discover available subsets if not specified
    if subsets is None:
        logger.info(f"Discovering subsets in {repo_id}...")
        subsets = discover_repo_subsets(repo_id, token, revision)
        if not subsets:
            return False, f"No subsets found in repository {repo_id}", stats
        logger.info(f"Found subsets: {', '.join(subsets)}")

    # List all files in the repo
    try:
        all_files = list(api.list_repo_files(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
        ))
    except Exception as e:
        return False, f"Failed to list repository files: {e}", stats

    # Filter files for requested subsets
    files_to_download: list[tuple[str, str]] = []  # (remote_path, local_path)

    for subset in subsets:
        subset_files = [
            f for f in all_files
            if f.startswith(f"{subset}/") and f.endswith(".parquet")
        ]

        if not subset_files:
            logger.warning(f"No parquet files found for subset '{subset}'")
            continue

        stats["subsets"][subset] = {
            "files": len(subset_files),
            "bytes": 0,
            "downloaded": 0,
            "skipped": 0,
        }

        for remote_path in subset_files:
            # Construct local path: data_dir/<subset>/filename.parquet
            local_path = os.path.join(data_dir, remote_path)
            files_to_download.append((remote_path, local_path))

    if not files_to_download:
        return False, "No parquet files found for the specified subsets", stats

    # Create data directory
    os.makedirs(data_dir, exist_ok=True)

    # Download files
    errors: list[str] = []

    # Progress bar is optional; tqdm just wraps the list.
    if show_progress:
        try:
            from tqdm import tqdm
            files_iter = tqdm(files_to_download, desc="Downloading", unit="file")
        except ImportError:
            files_iter = files_to_download
    else:
        files_iter = files_to_download

    for remote_path, local_path in files_iter:
        # First path segment is the subset (by construction above).
        subset = remote_path.split("/")[0]

        # Check if file exists (resume mode: skip already-downloaded files)
        if os.path.exists(local_path) and not overwrite:
            logger.debug(f"Skipping existing file: {local_path}")
            stats["skipped_files"] += 1
            if subset in stats["subsets"]:
                stats["subsets"][subset]["skipped"] += 1
            continue

        # Create directory
        os.makedirs(os.path.dirname(local_path), exist_ok=True)

        try:
            # Download file. NOTE(review): local_dir_use_symlinks is
            # deprecated in recent huggingface_hub releases — confirm the
            # pinned version before removing it.
            downloaded_path = hf_hub_download(
                repo_id=repo_id,
                filename=remote_path,
                repo_type="dataset",
                revision=revision,
                token=token,
                local_dir=data_dir,
                local_dir_use_symlinks=False,
            )

            # Get file size for the stats/summary
            file_size = os.path.getsize(downloaded_path)
            stats["downloaded_files"] += 1
            stats["total_bytes"] += file_size

            if subset in stats["subsets"]:
                stats["subsets"][subset]["downloaded"] += 1
                stats["subsets"][subset]["bytes"] += file_size

            logger.debug(f"Downloaded: {remote_path} ({_format_size(file_size)})")

        except Exception as e:
            # Per-file isolation: one failure never aborts the batch.
            logger.error(f"Failed to download {remote_path}: {e}")
            errors.append(f"{remote_path}: {e}")
            stats["failed_files"] += 1

    # Build summary message (only the first 5 errors are shown in detail)
    lines = [
        f"Benchmark data download complete:",
        f"  Downloaded: {stats['downloaded_files']} files ({_format_size(stats['total_bytes'])})",
        f"  Skipped: {stats['skipped_files']} files (already exist)",
        f"  Failed: {stats['failed_files']} files",
    ]

    if stats["subsets"]:
        lines.append("  Subsets:")
        for subset, info in stats["subsets"].items():
            lines.append(
                f"    - {subset}: {info['downloaded']} downloaded, "
                f"{info['skipped']} skipped ({_format_size(info['bytes'])})"
            )

    if errors:
        lines.append("  Errors:")
        for err in errors[:5]:
            lines.append(f"    - {err}")
        if len(errors) > 5:
            lines.append(f"    ... and {len(errors) - 5} more errors")

    # Partial success (some downloads landed) still reports True.
    success = stats["failed_files"] == 0 or stats["downloaded_files"] > 0
    return success, "\n".join(lines), stats
253
+
254
+
255
def init_arena(
    arena_dir: str = "./arena",
    data_dir: str = "./data",
    subsets: Optional[list[str]] = None,
    benchmark_repo: str = DEFAULT_BENCHMARK_REPO,
    arena_repo: str = DEFAULT_ARENA_REPO,
    revision: str = "main",
    overwrite: bool = False,
    init_git: bool = False,
    data_only: bool = False,
    arena_only: bool = False,
    show_progress: bool = True,
) -> tuple[bool, str]:
    """
    One-click arena initialization.

    This function:
    1. Downloads benchmark Parquet data from HuggingFace (unless arena_only)
    2. Downloads arena data (model outputs + logs) from HuggingFace (unless data_only)
    3. Initializes Git repository in arena_dir (if init_git)

    Progress is printed to stdout step by step; a textual summary is returned.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to benchmark data directory
        subsets: List of subsets to download (None = all available)
        benchmark_repo: HuggingFace repo for benchmark data
        arena_repo: HuggingFace repo for arena data
        revision: HuggingFace revision/branch
        overwrite: If True, overwrite existing files
        init_git: If True, initialize Git repository in arena_dir
        data_only: If True, only download benchmark data
        arena_only: If True, only download arena data
        show_progress: If True, show progress information

    Returns:
        Tuple of (success, summary_message)
    """
    # Deferred imports to avoid circular dependencies between sync modules.
    from genarena.sync.hf_ops import pull_arena_data, get_hf_token
    from genarena.sync.git_ops import git_init, is_git_initialized

    lines: list[str] = []
    all_success = True
    benchmark_stats: dict = {}
    # NOTE(review): arena_stats is never assigned after this — currently unused.
    arena_stats: dict = {}

    # Resolve absolute paths
    arena_dir = os.path.abspath(arena_dir)
    data_dir = os.path.abspath(data_dir)

    # Step 1: Download benchmark data
    if not arena_only:
        step_num = 1
        # NOTE(review): when data_only=True and init_git=True this prints
        # "[Step 1/2]" although step 3 is skipped below (init_git requires
        # not data_only) — the displayed total can overcount by one. Confirm
        # intended CLI behavior before changing.
        total_steps = 2 if not data_only else 1
        if init_git:
            total_steps += 1

        print(f"[Step {step_num}/{total_steps}] Downloading benchmark data from {benchmark_repo}...")
        print(f"  Target directory: {data_dir}")
        if subsets:
            print(f"  Subsets: {', '.join(subsets)}")
        print()

        success, msg, benchmark_stats = download_benchmark_data(
            data_dir=data_dir,
            repo_id=benchmark_repo,
            subsets=subsets,
            revision=revision,
            overwrite=overwrite,
            show_progress=show_progress,
        )

        # chr(10) is "\n": re-indent the multi-line message by two spaces.
        print(f"  {msg.replace(chr(10), chr(10) + '  ')}")
        print()

        if not success:
            all_success = False
            lines.append(f"Benchmark data download failed")
        else:
            lines.append(
                f"Benchmark data: {benchmark_stats.get('downloaded_files', 0)} files "
                f"({_format_size(benchmark_stats.get('total_bytes', 0))})"
            )

    # Step 2: Download arena data
    if not data_only:
        step_num = 1 if arena_only else 2
        total_steps = 1 if arena_only else 2
        if init_git:
            total_steps += 1

        print(f"[Step {step_num}/{total_steps}] Downloading arena data from {arena_repo}...")
        print(f"  Target directory: {arena_dir}")
        if subsets:
            print(f"  Subsets: {', '.join(subsets)}")
        print()

        # Create arena directory
        os.makedirs(arena_dir, exist_ok=True)

        success, msg = pull_arena_data(
            arena_dir=arena_dir,
            repo_id=arena_repo,
            subsets=subsets,
            revision=revision,
            overwrite=overwrite,
            show_progress=show_progress,
        )

        print(f"  {msg.replace(chr(10), chr(10) + '  ')}")
        print()

        if not success:
            all_success = False
            lines.append(f"Arena data download failed: {msg}")
        else:
            lines.append(f"Arena data: downloaded to {arena_dir}")

    # Step 3: Initialize Git
    # Relies on total_steps computed by step 2's branch (step 3 only runs
    # when "not data_only", so step 2's branch always executed before this).
    if init_git and not data_only:
        step_num = total_steps
        print(f"[Step {step_num}/{total_steps}] Initializing Git repository...")

        if is_git_initialized(arena_dir):
            print(f"  Git repository already initialized at {arena_dir}")
            lines.append("Git: already initialized")
        else:
            success, msg = git_init(arena_dir)
            print(f"  {msg}")
            if success:
                lines.append("Git: initialized")
            else:
                lines.append(f"Git: initialization failed - {msg}")
        print()

    # Build final summary
    summary_lines = [
        "=== Summary ===",
    ]

    if not arena_only:
        summary_lines.append(f"Data directory: {data_dir}")
    if not data_only:
        summary_lines.append(f"Arena directory: {arena_dir}")

    if subsets:
        summary_lines.append(f"Subsets: {', '.join(subsets)}")
    elif benchmark_stats.get("subsets"):
        # Fall back to the subsets actually discovered during download.
        summary_lines.append(f"Subsets: {', '.join(benchmark_stats['subsets'].keys())}")

    for line in lines:
        summary_lines.append(f"  {line}")

    # Add next steps (copy-pastable CLI commands)
    summary_lines.append("")
    summary_lines.append("Next steps:")

    if not data_only:
        summary_lines.append(f"  # View current status")
        summary_lines.append(f"  genarena status --arena_dir {arena_dir} --data_dir {data_dir}")
        summary_lines.append("")
        summary_lines.append(f"  # Run evaluation battles")
        example_subset = subsets[0] if subsets else "basic"
        summary_lines.append(
            f"  genarena run --arena_dir {arena_dir} --data_dir {data_dir} --subset {example_subset}"
        )
        summary_lines.append("")
        summary_lines.append(f"  # View leaderboard")
        summary_lines.append(f"  genarena leaderboard --arena_dir {arena_dir} --subset {example_subset}")
    else:
        summary_lines.append(f"  # Initialize arena directory")
        summary_lines.append(f"  genarena init --arena_dir <path> --arena-only")

    return all_success, "\n".join(summary_lines)
genarena/sync/packer.py ADDED
@@ -0,0 +1,584 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ZIP packing utilities for GenArena.
3
+
4
+ This module provides functionality for packing and unpacking arena data
5
+ for Huggingface upload/download operations.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import shutil
11
+ import tempfile
12
+ import zipfile
13
+ from dataclasses import dataclass
14
+ from enum import Enum
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # Supported image file extensions for model directories
21
+ IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".svg"}
22
+
23
+
24
class TaskType(Enum):
    """Type of upload/download task.

    Used by PackTask/UnpackTask to tell the sync layer how to handle the
    local path: pack/unpack a directory as a ZIP, or copy a file as-is.
    """

    MODEL_ZIP = "model_zip"  # ZIP file for experiment-scoped model images
    EXP_ZIP = "exp_zip"  # ZIP file for experiment logs
    SMALL_FILE = "small_file"  # Small file (state.json, README.md)
29
+
30
+
31
@dataclass
class PackTask:
    """Represents a file packing/upload task.

    Produced by collect_upload_tasks(); consumed by the uploader which packs
    local_path (for ZIP task types) and pushes the result to remote_path.
    """

    task_type: TaskType  # How to treat local_path (see TaskType)
    local_path: str  # Local path (directory for ZIP, file for small files)
    remote_path: str  # Remote path in the HF repo
    subset: str  # Subset name
    name: str  # Model name or experiment name or file name
39
+
40
+
41
@dataclass
class UnpackTask:
    """Represents a file unpacking/download task.

    Produced by collect_download_tasks(); consumed by the downloader which
    fetches remote_path and extracts/copies it into local_path.
    """

    task_type: TaskType  # How to treat the downloaded artifact (see TaskType)
    remote_path: str  # Remote path in the HF repo
    local_path: str  # Local target path (extraction dir for ZIPs, file path otherwise)
    subset: str  # Subset name
    name: str  # Model name or experiment name or file name
49
+
50
+
51
def pack_directory(
    source_dir: str,
    output_zip: str,
    file_extensions: Optional[set] = None,
    max_depth: Optional[int] = None,
) -> tuple[bool, str]:
    """
    Pack a directory into a ZIP file.

    The directory name is preserved as the root folder inside the ZIP.
    Symbolic links are followed and the actual file contents are packed.

    Args:
        source_dir: Path to the directory to pack
        output_zip: Path to the output ZIP file
        file_extensions: Optional set of file extensions to include (e.g., {".png", ".jpg"}).
            If None, all files are included. Extensions should be lowercase with dot.
        max_depth: Optional maximum directory depth to traverse. None means unlimited.
            0 = only files directly in source_dir
            1 = files in source_dir and its immediate subdirectories
            etc.

    Returns:
        Tuple of (success, message)
    """
    if not os.path.isdir(source_dir):
        return False, f"Source directory does not exist: {source_dir}"

    # Resolve symlink if source_dir itself is a symlink
    resolved_source = os.path.realpath(source_dir)
    if not os.path.isdir(resolved_source):
        return False, f"Source directory symlink target does not exist: {resolved_source}"

    # Get the directory name to use as root in ZIP (use original name, not resolved)
    dir_name = os.path.basename(source_dir.rstrip(os.sep))

    try:
        # Ensure the output directory exists. output_zip may be a bare
        # filename, in which case dirname is "" and os.makedirs("") would
        # raise FileNotFoundError -- so only create when there is a parent.
        out_parent = os.path.dirname(output_zip)
        if out_parent:
            os.makedirs(out_parent, exist_ok=True)

        file_count = 0
        with zipfile.ZipFile(output_zip, "w", zipfile.ZIP_DEFLATED) as zf:
            # followlinks=True to traverse symlinked directories
            for root, dirs, files in os.walk(resolved_source, followlinks=True):
                if max_depth is not None:
                    # Depth of `root` relative to the source directory
                    rel_root = os.path.relpath(root, resolved_source)
                    current_depth = 0 if rel_root == "." else len(rel_root.split(os.sep))

                    # Skip directories beyond max_depth (defensive; normally
                    # pruning below prevents walking past the limit)
                    if current_depth > max_depth:
                        dirs[:] = []  # Prevent further recursion
                        continue

                    # Stop recursion at max_depth
                    if current_depth == max_depth:
                        dirs[:] = []

                for file in files:
                    # Filter by extension if specified
                    if file_extensions is not None:
                        ext = os.path.splitext(file)[1].lower()
                        if ext not in file_extensions:
                            continue

                    file_path = os.path.join(root, file)

                    # Skip broken symlinks
                    if os.path.islink(file_path) and not os.path.exists(file_path):
                        logger.warning(f"Skipping broken symlink: {file_path}")
                        continue

                    # Archive name: original dir_name as root. The ZIP format
                    # uses forward slashes, so normalize os-specific separators.
                    rel_to_resolved = os.path.relpath(file_path, resolved_source)
                    archive_name = os.path.join(dir_name, rel_to_resolved).replace(os.sep, "/")
                    zf.write(file_path, archive_name)
                    file_count += 1

        if file_count == 0:
            # Remove empty ZIP file
            os.remove(output_zip)
            return False, f"No files to pack in {source_dir}"

        return True, f"Packed {source_dir} -> {output_zip} ({file_count} files)"
    except Exception as e:
        return False, f"Failed to pack directory: {e}"
140
+
141
+
142
def pack_model_dir(model_dir: str, output_zip: str) -> tuple[bool, str]:
    """
    Pack a single model directory of images into a ZIP file.

    Only image files (png, jpg, jpeg, gif, webp, bmp, tiff, svg) are
    included, and only those sitting directly in the model directory --
    nested subdirectories such as fail/ are deliberately left out.

    Args:
        model_dir: Path to the model directory (e.g., arena_dir/basic/models/exp_001/model_a/)
        output_zip: Path to the output ZIP file

    Returns:
        Tuple of (success, message)
    """
    return pack_directory(
        model_dir,
        output_zip,
        file_extensions=IMAGE_EXTENSIONS,
        max_depth=0,
    )
158
+
159
+
160
def pack_exp_dir(exp_dir: str, output_zip: str) -> tuple[bool, str]:
    """
    Pack an experiment directory of battle logs into a ZIP file.

    All files at any depth are included (no extension or depth filter).

    Args:
        exp_dir: Path to the experiment directory (e.g., arena_dir/basic/pk_logs/exp_001/)
        output_zip: Path to the output ZIP file

    Returns:
        Tuple of (success, message)
    """
    return pack_directory(exp_dir, output_zip)
172
+
173
+
174
def unpack_zip(
    zip_path: str,
    target_dir: str,
    overwrite: bool = False,
) -> tuple[bool, str]:
    """
    Extract a ZIP archive into a target directory.

    The target directory is created if needed. When overwrite is False,
    members whose destination already exists are left untouched.

    Args:
        zip_path: Path to the ZIP file
        target_dir: Target directory to extract to
        overwrite: If True, overwrite existing files

    Returns:
        Tuple of (success, message)
    """
    if not os.path.isfile(zip_path):
        return False, f"ZIP file does not exist: {zip_path}"

    try:
        os.makedirs(target_dir, exist_ok=True)

        with zipfile.ZipFile(zip_path, "r") as archive:
            for entry in archive.namelist():
                destination = os.path.join(target_dir, entry)

                # Preserve existing files unless overwriting was requested
                if not overwrite and os.path.exists(destination):
                    logger.debug(f"Skipping existing file: {destination}")
                    continue

                archive.extract(entry, target_dir)

        return True, f"Unpacked {zip_path} -> {target_dir}"
    except Exception as e:
        return False, f"Failed to unpack ZIP: {e}"
211
+
212
+
213
def discover_subsets(arena_dir: str) -> list[str]:
    """
    Discover all subset directories in the arena directory.

    A directory counts as a subset when it contains at least one of the
    marker subdirectories: models/, pk_logs/, arena/.

    Args:
        arena_dir: Path to the arena directory

    Returns:
        Sorted list of subset names (empty if arena_dir does not exist)
    """
    if not os.path.isdir(arena_dir):
        return []

    markers = ("models", "pk_logs", "arena")
    found = []
    for entry in os.listdir(arena_dir):
        candidate = os.path.join(arena_dir, entry)
        if not os.path.isdir(candidate):
            continue
        if any(os.path.isdir(os.path.join(candidate, marker)) for marker in markers):
            found.append(entry)

    return sorted(found)
244
+
245
+
246
def discover_models(arena_dir: str, subset: str) -> list[str]:
    """
    Discover all model names in a subset (v2 layout).

    Args:
        arena_dir: Path to the arena directory
        subset: Subset name

    Returns:
        List of model names (globally unique across experiments); empty when
        the models root is missing or cannot be scanned.
    """
    from genarena.models import GlobalModelOutputManager

    models_root = os.path.join(arena_dir, subset, "models")
    if not os.path.isdir(models_root):
        return []

    try:
        return GlobalModelOutputManager(models_root).models
    except Exception:
        # For packer utilities, be conservative: return empty on scan failure.
        return []
268
+
269
+
270
def discover_model_experiments(arena_dir: str, subset: str) -> list[str]:
    """
    Discover experiment directories under a subset's models (v2 layout).

    In v2, model outputs live under:
        models/<exp_name>/<model_name>/...
    Returns the exp_name directories that contain at least one model with
    images, or an empty list when the root is missing or unscannable.
    """
    from genarena.models import GlobalModelOutputManager

    models_root = os.path.join(arena_dir, subset, "models")
    if not os.path.isdir(models_root):
        return []

    try:
        return GlobalModelOutputManager(models_root).experiments
    except Exception:
        return []
288
+
289
+
290
def discover_experiments(arena_dir: str, subset: str) -> list[str]:
    """
    Discover all experiment directories in a subset's pk_logs.

    Hidden entries (dot-prefixed, including .pk_logs_rm for deleted/orphaned
    logs) are excluded.

    Args:
        arena_dir: Path to the arena directory
        subset: Subset name

    Returns:
        Sorted list of experiment names
    """
    pk_logs_dir = os.path.join(arena_dir, subset, "pk_logs")
    if not os.path.isdir(pk_logs_dir):
        return []

    return sorted(
        entry
        for entry in os.listdir(pk_logs_dir)
        if not entry.startswith(".")
        and os.path.isdir(os.path.join(pk_logs_dir, entry))
    )
319
+
320
+
321
def collect_upload_tasks(
    arena_dir: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
) -> list[PackTask]:
    """
    Collect all files/directories that need to be uploaded.

    Args:
        arena_dir: Path to the arena directory
        subsets: List of subsets to include (None = all)
        models: List of models to include (None = all)
        experiments: List of experiments to include (None = all)

    Returns:
        List of PackTask objects
    """
    tasks = []

    # Discover subsets if not specified
    all_subsets = discover_subsets(arena_dir)
    target_subsets = subsets if subsets else all_subsets

    for subset in target_subsets:
        if subset not in all_subsets:
            logger.warning(f"Subset '{subset}' not found in arena directory")
            continue

        subset_path = os.path.join(arena_dir, subset)
        models_root = os.path.join(subset_path, "models")

        # Collect model directories (v2 layout: models/<exp_name>/<model_name>/):
        # Each model is packed as a separate ZIP file.
        # - Default: upload all models
        # - If experiments filter is provided: only models under those exp_name
        # - If models filter is provided: only those specific models
        all_model_exps = discover_model_experiments(arena_dir, subset)

        target_model_exps: list[str]
        if experiments:
            target_model_exps = [e for e in experiments if e in all_model_exps]
        else:
            target_model_exps = all_model_exps

        # Collect individual model directories
        for exp in target_model_exps:
            exp_model_path = os.path.join(models_root, exp)
            if not os.path.isdir(exp_model_path):
                continue

            # List all model directories under this experiment
            for model_name in os.listdir(exp_model_path):
                model_path = os.path.join(exp_model_path, model_name)
                if not os.path.isdir(model_path):
                    continue

                # Apply models filter if specified
                if models and model_name not in models:
                    continue

                remote_path = f"{subset}/models/{exp}/{model_name}.zip"
                tasks.append(PackTask(
                    task_type=TaskType.MODEL_ZIP,
                    local_path=model_path,
                    remote_path=remote_path,
                    subset=subset,
                    name=f"{exp}/{model_name}",
                ))

        # Collect experiment directories.
        # Note: pk_logs are always uploaded regardless of model filter.
        pk_experiments = discover_experiments(arena_dir, subset)
        if experiments:
            pk_experiments = [e for e in pk_experiments if e in set(experiments)]
        for exp in pk_experiments:
            exp_path = os.path.join(subset_path, "pk_logs", exp)
            remote_path = f"{subset}/pk_logs/{exp}.zip"

            tasks.append(PackTask(
                task_type=TaskType.EXP_ZIP,
                local_path=exp_path,
                remote_path=remote_path,
                subset=subset,
                name=exp,
            ))

        # Collect small files: per-subset state.json
        state_path = os.path.join(subset_path, "arena", "state.json")
        if os.path.isfile(state_path):
            tasks.append(PackTask(
                task_type=TaskType.SMALL_FILE,
                local_path=state_path,
                remote_path=f"{subset}/arena/state.json",
                subset=subset,
                name="state.json",
            ))

        # Per-subset README.md
        readme_path = os.path.join(subset_path, "README.md")
        if os.path.isfile(readme_path):
            tasks.append(PackTask(
                task_type=TaskType.SMALL_FILE,
                local_path=readme_path,
                remote_path=f"{subset}/README.md",
                subset=subset,
                name="README.md",
            ))

    return tasks
432
+
433
+
434
def collect_download_tasks(
    repo_files: list[str],
    arena_dir: str,
    subsets: Optional[list[str]] = None,
    models: Optional[list[str]] = None,
    experiments: Optional[list[str]] = None,
) -> list[UnpackTask]:
    """
    Collect files to download based on repo contents and filters.

    Each remote path is classified by its shape (model ZIP in new or legacy
    layout, experiment-log ZIP, state.json, README.md); paths matching none
    of the known shapes are silently ignored.

    Args:
        repo_files: List of file paths in the HF repo
        arena_dir: Local arena directory path
        subsets: List of subsets to download (None = all)
        models: List of models to download (None = all)
        experiments: List of experiments to download (None = all)

    Returns:
        List of UnpackTask objects
    """
    tasks = []

    for remote_path in repo_files:
        # Parse the remote path to determine type; the first component is
        # always the subset name
        parts = remote_path.split("/")
        if len(parts) < 2:
            continue

        subset = parts[0]

        # Apply subset filter
        if subsets and subset not in subsets:
            continue

        # Determine task type and apply filters
        # New format: models/<exp_name>/<model_name>.zip
        if len(parts) >= 4 and parts[1] == "models" and parts[3].endswith(".zip"):
            exp_name = parts[2]
            model_name = parts[3][:-4]  # Remove .zip

            # Apply experiments filter
            if experiments and exp_name not in experiments:
                continue

            # Apply models filter
            if models and model_name not in models:
                continue

            # Extract into the experiment directory; the ZIP's root folder
            # supplies the model_name subdirectory
            local_path = os.path.join(arena_dir, subset, "models", exp_name)
            tasks.append(UnpackTask(
                task_type=TaskType.MODEL_ZIP,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name=f"{exp_name}/{model_name}",
            ))

        # Legacy format: models/<exp_name>.zip (for backward compatibility)
        elif len(parts) == 3 and parts[1] == "models" and parts[2].endswith(".zip"):
            exp_name = parts[2][:-4]  # Remove .zip

            # Apply experiments filter (legacy: models filter acts as exp filter)
            exp_filter = experiments if experiments is not None else models
            if exp_filter and exp_name not in exp_filter:
                continue

            local_path = os.path.join(arena_dir, subset, "models")
            tasks.append(UnpackTask(
                task_type=TaskType.MODEL_ZIP,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name=exp_name,
            ))

        elif len(parts) >= 3 and parts[1] == "pk_logs" and parts[2].endswith(".zip"):
            # Experiment ZIP file
            exp_name = parts[2][:-4]  # Remove .zip

            if experiments and exp_name not in experiments:
                continue

            local_path = os.path.join(arena_dir, subset, "pk_logs")
            tasks.append(UnpackTask(
                task_type=TaskType.EXP_ZIP,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name=exp_name,
            ))

        elif len(parts) >= 3 and parts[1] == "arena" and parts[2] == "state.json":
            # state.json -- downloaded as-is (no unpacking)
            local_path = os.path.join(arena_dir, subset, "arena", "state.json")
            tasks.append(UnpackTask(
                task_type=TaskType.SMALL_FILE,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name="state.json",
            ))

        elif len(parts) >= 2 and parts[1] == "README.md":
            # README.md -- downloaded as-is (no unpacking)
            local_path = os.path.join(arena_dir, subset, "README.md")
            tasks.append(UnpackTask(
                task_type=TaskType.SMALL_FILE,
                remote_path=remote_path,
                local_path=local_path,
                subset=subset,
                name="README.md",
            ))

    return tasks
548
+
549
+
550
class TempPackingContext:
    """
    Context manager for temporary packing operations.

    On entry a scratch directory for ZIP files is created; on exit the whole
    directory (and everything staged inside it) is removed.
    """

    def __init__(self):
        # Populated on __enter__, removed on __exit__
        self.temp_dir: Optional[str] = None

    def __enter__(self) -> "TempPackingContext":
        self.temp_dir = tempfile.mkdtemp(prefix="genarena_pack_")
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Best-effort cleanup; never raise from teardown
        if self.temp_dir and os.path.isdir(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def get_temp_zip_path(self, remote_path: str) -> str:
        """
        Get a temporary path for a ZIP file.

        The remote path structure is mirrored under the scratch directory so
        every staged ZIP gets a unique, recognizable location.

        Args:
            remote_path: The remote path (used to generate unique local path)

        Returns:
            Temporary file path

        Raises:
            RuntimeError: If called outside the context manager
        """
        if not self.temp_dir:
            raise RuntimeError("TempPackingContext not entered")

        staged = os.path.join(self.temp_dir, remote_path)
        os.makedirs(os.path.dirname(staged), exist_ok=True)
        return staged
genarena/sync/submit.py ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Submission functionality for GenArena.
3
+
4
+ This module provides the ability for users to submit their evaluation results
5
+ to the official leaderboard via GitHub PR.
6
+
7
+ Workflow:
8
+ 1. Validate local submission data
9
+ 2. Upload data to user's HuggingFace repository
10
+ 3. Create submission metadata JSON
11
+ 4. Fork official repo and create PR via GitHub CLI
12
+ """
13
+
14
+ import hashlib
15
+ import json
16
+ import logging
17
+ import os
18
+ import subprocess
19
+ import tempfile
20
+ from dataclasses import dataclass, field
21
+ from datetime import datetime, timezone
22
+ from typing import Any, Optional
23
+
24
+ from genarena import __version__
25
+ from genarena.experiments import is_valid_exp_name
26
+ from genarena.logs import load_battle_records
27
+ from genarena.sync.packer import (
28
+ TempPackingContext,
29
+ pack_exp_dir,
30
+ pack_directory,
31
+ IMAGE_EXTENSIONS,
32
+ )
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # Default official submissions repository
37
+ DEFAULT_OFFICIAL_REPO = "genarena/submissions"
38
+
39
+ # URL to fetch official models list
40
+ OFFICIAL_MODELS_URL = (
41
+ "https://raw.githubusercontent.com/genarena/submissions/main/official_models.json"
42
+ )
43
+
44
+
45
@dataclass
class ValidationResult:
    """Result of local submission validation (see validate_local_submission)."""

    valid: bool  # True when no blocking errors were found
    exp_name: str  # Experiment name that was validated
    subset: str  # Subset the experiment belongs to
    models: list[str] = field(default_factory=list)  # All models seen in battle records (sorted)
    new_models: list[str] = field(default_factory=list)  # Models not on the official leaderboard
    existing_models: list[str] = field(default_factory=list)  # Models already on the official leaderboard
    total_battles: int = 0  # Number of battle records loaded
    battles_per_pair: dict[str, int] = field(default_factory=dict)  # "<min>_vs_<max>" pair key -> battle count
    elo_ratings: dict[str, float] = field(default_factory=dict)  # Model -> bootstrap BT-ELO rating
    elo_ci: dict[str, tuple[float, float]] = field(default_factory=dict)  # Model -> (ci_lower, ci_upper)
    evaluation_config: dict[str, Any] = field(default_factory=dict)  # Contents of the experiment's config.json
    errors: list[str] = field(default_factory=list)  # Blocking validation errors
    warnings: list[str] = field(default_factory=list)  # Non-blocking notices
62
+
63
+
64
@dataclass
class UploadResult:
    """Result of HuggingFace upload (see upload_submission_data)."""

    hf_repo: str  # Target HF repository ID
    hf_revision: str  # Revision/branch the files were uploaded to
    models_zip_path: str  # Path of the models ZIP inside the repo
    models_zip_sha256: str  # SHA256 hex digest of the models ZIP
    models_zip_size: int  # Size in bytes of the models ZIP
    pk_logs_zip_path: str  # Path of the pk_logs ZIP inside the repo
    pk_logs_zip_sha256: str  # SHA256 hex digest of the pk_logs ZIP
    pk_logs_zip_size: int  # Size in bytes of the pk_logs ZIP
76
+
77
+
78
def fetch_official_models(subset: str, timeout: int = 10) -> set[str]:
    """
    Fetch official models list from GitHub.

    Any failure (network error, bad JSON, unexpected payload shape) is
    logged as a warning and yields an empty set rather than raising.

    Args:
        subset: Subset name to get models for
        timeout: Request timeout in seconds

    Returns:
        Set of official model names for the subset
    """
    import urllib.error
    import urllib.request

    try:
        with urllib.request.urlopen(OFFICIAL_MODELS_URL, timeout=timeout) as resp:
            payload = json.load(resp)
            subset_entry = payload.get("subsets", {}).get(subset, {})
            return set(subset_entry.get("models", []))
    except urllib.error.URLError as e:
        logger.warning(f"Failed to fetch official models list: {e}")
        return set()
    except json.JSONDecodeError as e:
        logger.warning(f"Failed to parse official models list: {e}")
        return set()
    except Exception as e:
        logger.warning(f"Unexpected error fetching official models: {e}")
        return set()
105
+
106
+
107
+ def _load_experiment_config(exp_dir: str) -> dict[str, Any]:
108
+ """Load experiment configuration from config.json."""
109
+ config_path = os.path.join(exp_dir, "config.json")
110
+ if not os.path.isfile(config_path):
111
+ return {}
112
+ try:
113
+ with open(config_path, "r", encoding="utf-8") as f:
114
+ return json.load(f)
115
+ except (json.JSONDecodeError, IOError):
116
+ return {}
117
+
118
+
119
def validate_local_submission(
    arena_dir: str,
    subset: str,
    exp_name: str,
    skip_official_check: bool = False,
) -> ValidationResult:
    """
    Validate local submission data.

    Checks:
    1. exp_name format (_yyyymmdd suffix)
    2. pk_logs directory exists and has battle records
    3. models directory exists and has model outputs
    4. All models in battles have corresponding outputs
    5. At least one model is new (not in official leaderboard)

    Args:
        arena_dir: Arena directory path
        subset: Subset name
        exp_name: Experiment name
        skip_official_check: Skip checking against official models (for testing)

    Returns:
        ValidationResult with validation status and details
    """
    errors: list[str] = []
    warnings: list[str] = []

    # Check exp_name format
    if not is_valid_exp_name(exp_name):
        errors.append(
            f"Invalid exp_name format: '{exp_name}' must end with _yyyymmdd"
        )

    # Check paths exist (v2 layout: pk_logs/<exp>/ and models/<exp>/)
    pk_logs_dir = os.path.join(arena_dir, subset, "pk_logs")
    exp_dir = os.path.join(pk_logs_dir, exp_name)
    models_root = os.path.join(arena_dir, subset, "models")
    exp_models_dir = os.path.join(models_root, exp_name)

    if not os.path.isdir(exp_dir):
        errors.append(f"pk_logs directory not found: {exp_dir}")

    if not os.path.isdir(exp_models_dir):
        errors.append(f"models directory not found: {exp_models_dir}")

    # Abort early on any error so far: the remaining checks need both
    # directories to be present
    if errors:
        return ValidationResult(
            valid=False,
            exp_name=exp_name,
            subset=subset,
            errors=errors,
            warnings=warnings,
        )

    # Load battle records; an experiment without battles cannot be submitted
    records = load_battle_records(pk_logs_dir, exp_name=exp_name)
    if not records:
        errors.append("No battle records found in pk_logs")
        return ValidationResult(
            valid=False,
            exp_name=exp_name,
            subset=subset,
            errors=errors,
            warnings=warnings,
        )

    # Extract models and battle statistics from the records
    models: set[str] = set()
    battles_per_pair: dict[str, int] = {}

    for r in records:
        model_a = r.get("model_a", "")
        model_b = r.get("model_b", "")
        if model_a and model_b:
            models.add(model_a)
            models.add(model_b)
            # Ensure consistent pair key (sorted) so A-vs-B and B-vs-A count together
            pair_key = f"{min(model_a, model_b)}_vs_{max(model_a, model_b)}"
            battles_per_pair[pair_key] = battles_per_pair.get(pair_key, 0) + 1

    models_list = sorted(models)

    # Check model outputs exist: every model seen in a battle must have an
    # output directory containing at least one image
    for model in models_list:
        model_dir = os.path.join(exp_models_dir, model)
        if not os.path.isdir(model_dir):
            errors.append(f"Model output directory not found: {model_dir}")
        else:
            # Check if there are any images (top level only)
            has_images = False
            for f in os.listdir(model_dir):
                ext = os.path.splitext(f)[1].lower()
                if ext in IMAGE_EXTENSIONS:
                    has_images = True
                    break
            if not has_images:
                errors.append(f"No image files found in model directory: {model_dir}")

    # Check against official models: a submission must introduce at least
    # one model that is not already on the leaderboard
    if not skip_official_check:
        official_models = fetch_official_models(subset)
        new_models = [m for m in models_list if m not in official_models]
        existing_models = [m for m in models_list if m in official_models]

        if not new_models:
            errors.append(
                "No new models found. All models already exist in official leaderboard. "
                "Submissions must include at least one new model."
            )
    else:
        new_models = models_list
        existing_models = []
        warnings.append("Skipped official models check (--skip-official-check)")

    # Calculate ELO (only if no critical errors so far)
    elo_ratings: dict[str, float] = {}
    elo_ci: dict[str, tuple[float, float]] = {}

    if not errors:
        try:
            from genarena.bt_elo import compute_bootstrap_bt_elo

            # Only complete records (both models and a winner) feed the fit
            battles = [
                (r["model_a"], r["model_b"], r["final_winner"])
                for r in records
                if r.get("model_a") and r.get("model_b") and r.get("final_winner")
            ]

            if battles:
                bt_result = compute_bootstrap_bt_elo(battles, num_bootstrap=100)
                elo_ratings = bt_result.ratings
                for model in models_list:
                    if model in bt_result.ci_lower and model in bt_result.ci_upper:
                        elo_ci[model] = (
                            bt_result.ci_lower[model],
                            bt_result.ci_upper[model],
                        )
        except Exception as e:
            # ELO is informational; failure to compute it is non-blocking
            warnings.append(f"Failed to calculate ELO: {e}")

    # Load evaluation config
    evaluation_config = _load_experiment_config(exp_dir)

    return ValidationResult(
        valid=len(errors) == 0,
        exp_name=exp_name,
        subset=subset,
        models=models_list,
        new_models=new_models,
        existing_models=existing_models,
        total_battles=len(records),
        battles_per_pair=battles_per_pair,
        elo_ratings=elo_ratings,
        elo_ci=elo_ci,
        evaluation_config=evaluation_config,
        errors=errors,
        warnings=warnings,
    )
278
+
279
+
280
def _sha256_of_file(path: str, chunk_size: int = 1024 * 1024) -> str:
    """Return the hex SHA256 digest of *path*, hashing in chunks to bound memory."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def upload_submission_data(
    arena_dir: str,
    subset: str,
    exp_name: str,
    hf_repo: str,
    hf_revision: str = "main",
    show_progress: bool = True,
) -> UploadResult:
    """
    Pack and upload submission data to HuggingFace.

    Args:
        arena_dir: Arena directory path
        subset: Subset name
        exp_name: Experiment name
        hf_repo: HuggingFace repository ID (e.g., "username/repo-name")
        hf_revision: Repository revision/branch (default: "main")
        show_progress: Show upload progress.
            NOTE(review): currently unused by this implementation; kept for
            interface compatibility.

    Returns:
        UploadResult with upload details

    Raises:
        RuntimeError: If packing fails

    Any exception raised by the huggingface_hub upload calls propagates to
    the caller.
    """
    from huggingface_hub import HfApi

    api = HfApi()

    # Paths (v2 layout: models/<exp_name>/..., pk_logs/<exp_name>/...)
    exp_models_dir = os.path.join(arena_dir, subset, "models", exp_name)
    exp_dir = os.path.join(arena_dir, subset, "pk_logs", exp_name)

    with TempPackingContext() as ctx:
        # Pack model outputs (image files only)
        models_zip_path = ctx.get_temp_zip_path(f"{subset}/models/{exp_name}.zip")
        success, msg = pack_directory(
            exp_models_dir, models_zip_path, file_extensions=IMAGE_EXTENSIONS
        )
        if not success:
            raise RuntimeError(f"Failed to pack models: {msg}")

        # Integrity metadata; hash in chunks since image ZIPs can be large
        models_sha256 = _sha256_of_file(models_zip_path)
        models_size = os.path.getsize(models_zip_path)

        # Pack pk_logs (battle logs, all files)
        logs_zip_path = ctx.get_temp_zip_path(f"{subset}/pk_logs/{exp_name}.zip")
        success, msg = pack_exp_dir(exp_dir, logs_zip_path)
        if not success:
            raise RuntimeError(f"Failed to pack pk_logs: {msg}")

        logs_sha256 = _sha256_of_file(logs_zip_path)
        logs_size = os.path.getsize(logs_zip_path)

        # Upload both ZIPs to the HF dataset repo
        hf_models_path = f"{subset}/models/{exp_name}.zip"
        hf_logs_path = f"{subset}/pk_logs/{exp_name}.zip"

        logger.info(f"Uploading models ZIP ({models_size / 1024 / 1024:.1f} MB)...")
        api.upload_file(
            path_or_fileobj=models_zip_path,
            path_in_repo=hf_models_path,
            repo_id=hf_repo,
            repo_type="dataset",
            revision=hf_revision,
        )

        logger.info(f"Uploading pk_logs ZIP ({logs_size / 1024 / 1024:.1f} MB)...")
        api.upload_file(
            path_or_fileobj=logs_zip_path,
            path_in_repo=hf_logs_path,
            repo_id=hf_repo,
            repo_type="dataset",
            revision=hf_revision,
        )

        return UploadResult(
            hf_repo=hf_repo,
            hf_revision=hf_revision,
            models_zip_path=hf_models_path,
            models_zip_sha256=models_sha256,
            models_zip_size=models_size,
            pk_logs_zip_path=hf_logs_path,
            pk_logs_zip_sha256=logs_sha256,
            pk_logs_zip_size=logs_size,
        )
370
+
371
+
372
def create_submission_metadata(
    validation: ValidationResult,
    upload: UploadResult,
    github_username: str,
    title: str = "",
    description: str = "",
    contact: str = "",
) -> dict[str, Any]:
    """
    Create submission metadata JSON.

    Args:
        validation: ValidationResult from validate_local_submission
        upload: UploadResult from upload_submission_data
        github_username: GitHub username of submitter
        title: Submission title
        description: Submission description
        contact: Optional contact email

    Returns:
        Submission metadata dictionary
    """
    # Generate a unique submission ID: UTC timestamp plus a short hash of
    # (timestamp, experiment, user) to avoid collisions.
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%S")
    hash_input = f"{timestamp}{validation.exp_name}{github_username}"
    short_hash = hashlib.sha256(hash_input.encode()).hexdigest()[:8]
    submission_id = f"sub_{timestamp}_{short_hash}"

    # Build submitter info; contact is optional.
    submitter: dict[str, str] = {"github_username": github_username}
    if contact:
        submitter["contact"] = contact

    # Build evaluation config summary (extract key fields).
    eval_config = validation.evaluation_config
    evaluation_config_summary = {
        "judge_model": eval_config.get("judge_model", "unknown"),
        "prompt_module": eval_config.get("prompt", "unknown"),
        "temperature": eval_config.get("temperature", 0.0),
        "position_debiasing": True,  # Always true in genarena
    }

    # Canonical (lexicographically sorted) model pairs. Split each
    # "<a>_vs_<b>" key once instead of four times as before; sorted() of
    # the first two parts equals [min, max].
    model_pairs = [
        sorted(pair_key.split("_vs_")[:2])
        for pair_key in validation.battles_per_pair
    ]

    return {
        "schema_version": "1.0",
        "submission_id": submission_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "submitter": submitter,
        "experiment": {
            "exp_name": validation.exp_name,
            "subset": validation.subset,
            "models": validation.models,
            "new_models": validation.new_models,
            "existing_models": validation.existing_models,
            "model_pairs": model_pairs,
            "total_battles": validation.total_battles,
            "battles_per_pair": validation.battles_per_pair,
        },
        "data_location": {
            "hf_repo_id": upload.hf_repo,
            "hf_revision": upload.hf_revision,
            "files": {
                "models_zip": {
                    "path": upload.models_zip_path,
                    "sha256": upload.models_zip_sha256,
                    "size_bytes": upload.models_zip_size,
                },
                "pk_logs_zip": {
                    "path": upload.pk_logs_zip_path,
                    "sha256": upload.pk_logs_zip_sha256,
                    "size_bytes": upload.pk_logs_zip_size,
                },
            },
        },
        "elo_preview": {
            "ratings": validation.elo_ratings,
            "ci_95": {m: list(ci) for m, ci in validation.elo_ci.items()},
        },
        "evaluation_config": evaluation_config_summary,
        "title": title or f"Submit {validation.exp_name}",
        "description": description,
        "verification": {
            "local_validation_passed": validation.valid,
            "genarena_version": __version__,
        },
    }
464
+
465
+
466
def _get_github_username() -> Optional[str]:
    """Return the authenticated GitHub username via the gh CLI, or None."""
    try:
        proc = subprocess.run(
            ["gh", "api", "user", "-q", ".login"],
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        # gh missing or unresponsive — caller treats None as "unavailable".
        return None
    return proc.stdout.strip() if proc.returncode == 0 else None
480
+
481
+
482
def _check_gh_cli() -> tuple[bool, str]:
    """Verify the GitHub CLI is installed and authenticated.

    Returns:
        Tuple of (ok, human-readable status message).
    """
    try:
        # Stage 1: is gh installed at all?
        version_proc = subprocess.run(
            ["gh", "--version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if version_proc.returncode != 0:
            return False, "GitHub CLI (gh) is not installed"

        # Stage 2: is it logged in?
        auth_proc = subprocess.run(
            ["gh", "auth", "status"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if auth_proc.returncode != 0:
            return False, "GitHub CLI is not authenticated. Run 'gh auth login' first."
    except FileNotFoundError:
        return False, "GitHub CLI (gh) is not installed. Install it from https://cli.github.com"
    except subprocess.TimeoutExpired:
        return False, "GitHub CLI timed out"

    return True, "GitHub CLI is ready"
510
+
511
+
512
+ def _generate_pr_body(submission: dict[str, Any]) -> str:
513
+ """Generate PR description body."""
514
+ exp = submission["experiment"]
515
+ elo = submission["elo_preview"]["ratings"]
516
+ eval_config = submission["evaluation_config"]
517
+
518
+ body = f"""## Submission Details
519
+
520
+ **Experiment:** `{exp['exp_name']}`
521
+ **Subset:** `{exp['subset']}`
522
+ **New Models:** {', '.join(f'`{m}`' for m in exp['new_models']) or 'None'}
523
+ **Total Battles:** {exp['total_battles']:,}
524
+ **Model Pairs:** {len(exp['model_pairs'])}
525
+
526
+ ### Evaluation Configuration
527
+
528
+ | Setting | Value |
529
+ |---------|-------|
530
+ | Judge Model | `{eval_config.get('judge_model', 'N/A')}` |
531
+ | Prompt Module | `{eval_config.get('prompt_module', 'N/A')}` |
532
+ | Temperature | {eval_config.get('temperature', 'N/A')} |
533
+ | Position Debiasing | {'Yes' if eval_config.get('position_debiasing') else 'No'} |
534
+
535
+ ### ELO Preview
536
+
537
+ | Model | ELO | 95% CI |
538
+ |-------|-----|--------|
539
+ """
540
+ ci_data = submission["elo_preview"].get("ci_95", {})
541
+ for model in sorted(elo.keys(), key=lambda m: -elo[m]):
542
+ ci = ci_data.get(model, [None, None])
543
+ ci_str = f"[{ci[0]:.1f}, {ci[1]:.1f}]" if ci[0] is not None else "N/A"
544
+ body += f"| {model} | {elo[model]:.1f} | {ci_str} |\n"
545
+
546
+ body += f"""
547
+ ### Data Location
548
+
549
+ - **HuggingFace Repo:** `{submission['data_location']['hf_repo_id']}`
550
+ - **Models ZIP:** `{submission['data_location']['files']['models_zip']['path']}`
551
+ - SHA256: `{submission['data_location']['files']['models_zip']['sha256'][:16]}...`
552
+ - Size: {submission['data_location']['files']['models_zip']['size_bytes'] / 1024 / 1024:.1f} MB
553
+ - **Logs ZIP:** `{submission['data_location']['files']['pk_logs_zip']['path']}`
554
+ - SHA256: `{submission['data_location']['files']['pk_logs_zip']['sha256'][:16]}...`
555
+ - Size: {submission['data_location']['files']['pk_logs_zip']['size_bytes'] / 1024:.1f} KB
556
+
557
+ ### Description
558
+
559
+ {submission.get('description') or submission.get('title', 'No description provided.')}
560
+
561
+ ---
562
+ *Submitted via genarena v{submission['verification']['genarena_version']}*
563
+ """
564
+ return body
565
+
566
+
567
def create_submission_pr(
    submission: dict[str, Any],
    official_repo: str = DEFAULT_OFFICIAL_REPO,
    title: Optional[str] = None,
) -> str:
    """
    Fork official repo and create PR with submission.

    Args:
        submission: Submission metadata dictionary
        official_repo: Official submissions repository (default: genarena/submissions)
        title: PR title (optional, auto-generated if not provided)

    Returns:
        PR URL

    Raises:
        RuntimeError: If PR creation fails
    """
    submission_id = submission["submission_id"]
    filename = f"{submission_id}.json"

    # Get GitHub username
    gh_username = _get_github_username()
    if not gh_username:
        raise RuntimeError("Failed to get GitHub username. Ensure gh CLI is authenticated.")

    # Fork the repo (idempotent - won't fail if already forked)
    logger.info(f"Forking {official_repo}...")
    subprocess.run(
        ["gh", "repo", "fork", official_repo, "--clone=false"],
        capture_output=True,
        text=True,
        timeout=60,
    )
    # Note: fork may "fail" if already forked, but that's OK

    # BUGFIX: derive the fork's repo name from official_repo instead of
    # hard-coding "submissions" — a non-default official_repo would
    # otherwise clone the wrong repository.
    repo_name = official_repo.rsplit("/", 1)[-1]
    fork_repo = f"{gh_username}/{repo_name}"

    # Clone forked repo to temp directory
    with tempfile.TemporaryDirectory() as tmpdir:
        logger.info(f"Cloning {fork_repo}...")
        result = subprocess.run(
            ["gh", "repo", "clone", fork_repo, tmpdir],
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to clone fork: {result.stderr}")

        # Sync the fork's default branch with upstream (best-effort).
        logger.info("Syncing with upstream...")
        subprocess.run(
            ["gh", "repo", "sync", fork_repo, "--source", official_repo],
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Pull latest changes into the local clone (best-effort).
        subprocess.run(
            ["git", "pull", "origin", "main"],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=60,
        )

        # Create a dedicated branch for this submission.
        branch_name = f"submit/{submission_id}"
        logger.info(f"Creating branch {branch_name}...")
        result = subprocess.run(
            ["git", "checkout", "-b", branch_name],
            cwd=tmpdir,
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to create branch: {result.stderr}")

        # Write submission file under submissions/pending/.
        submissions_dir = os.path.join(tmpdir, "submissions", "pending")
        os.makedirs(submissions_dir, exist_ok=True)
        submission_path = os.path.join(submissions_dir, filename)

        with open(submission_path, "w", encoding="utf-8") as f:
            json.dump(submission, f, indent=2, ensure_ascii=False)

        # Commit
        logger.info("Committing submission...")
        subprocess.run(["git", "add", "."], cwd=tmpdir, check=True)

        commit_msg = title or f"Submit {submission['experiment']['exp_name']}"
        result = subprocess.run(
            ["git", "commit", "-m", commit_msg],
            cwd=tmpdir,
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to commit: {result.stderr}")

        # Push
        logger.info("Pushing to fork...")
        result = subprocess.run(
            ["git", "push", "-u", "origin", branch_name],
            cwd=tmpdir,
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            raise RuntimeError(f"Failed to push: {result.stderr}")

        # Create PR against the official repo from the fork's branch.
        logger.info("Creating PR...")
        pr_title = title or f"[Submission] {submission['experiment']['exp_name']}"
        pr_body = _generate_pr_body(submission)

        result = subprocess.run(
            [
                "gh", "pr", "create",
                "--repo", official_repo,
                "--head", f"{gh_username}:{branch_name}",
                "--title", pr_title,
                "--body", pr_body,
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )

        if result.returncode != 0:
            raise RuntimeError(f"Failed to create PR: {result.stderr}")

        # gh prints the new PR's URL on stdout.
        pr_url = result.stdout.strip()
        return pr_url
705
+
706
+
707
def print_validation_summary(validation: ValidationResult) -> None:
    """Print a human-readable validation summary to the console."""
    print("\nValidation Results:")
    print("-" * 40)
    print("Status: PASSED" if validation.valid else "Status: FAILED")

    print(f"\nExperiment: {validation.exp_name}")
    print(f"Subset: {validation.subset}")
    print(f"Models: {len(validation.models)}")
    print(f"  New models: {', '.join(validation.new_models) or 'None'}")
    print(f"  Existing models: {', '.join(validation.existing_models) or 'None'}")
    print(f"Total battles: {validation.total_battles:,}")
    print(f"Model pairs: {len(validation.battles_per_pair)}")

    ratings = validation.elo_ratings
    if ratings:
        print("\nELO Preview:")
        # Highest-rated model first.
        for name in sorted(ratings, key=ratings.get, reverse=True):
            ci = validation.elo_ci.get(name)
            ci_text = f" [{ci[0]:.1f}, {ci[1]:.1f}]" if ci else ""
            suffix = " (new)" if name in validation.new_models else ""
            print(f"  {name}: {ratings[name]:.1f}{ci_text}{suffix}")

    cfg = validation.evaluation_config
    if cfg:
        print("\nEvaluation Config:")
        print(f"  Judge model: {cfg.get('judge_model', 'N/A')}")
        print(f"  Prompt: {cfg.get('prompt', 'N/A')}")
        print(f"  Temperature: {cfg.get('temperature', 'N/A')}")

    # Warnings first, then errors, each as an indented bullet list.
    for label, items in (("Warnings", validation.warnings), ("Errors", validation.errors)):
        if items:
            print(f"\n{label}:")
            for item in items:
                print(f"  - {item}")

    print()
756
+
757
+
758
def generate_official_models_json(
    arena_dir: str,
    output_path: Optional[str] = None,
) -> dict[str, Any]:
    """
    Generate official_models.json from arena state files.

    Scans every subset found under *arena_dir* and records the model
    list from each subset's state.json.

    Args:
        arena_dir: Path to the official arena directory
        output_path: Optional path to write the JSON file

    Returns:
        The official_models.json content as a dictionary
    """
    from genarena.state import load_state
    from genarena.sync.packer import discover_subsets

    payload: dict[str, Any] = {
        "last_updated": datetime.now(timezone.utc).isoformat(),
        "description": "List of models currently on the official GenArena leaderboard",
        "subsets": {},
    }

    for subset_name in discover_subsets(arena_dir):
        state_file = os.path.join(arena_dir, subset_name, "arena", "state.json")
        # Skip subsets without a state file or without any models.
        if not os.path.isfile(state_file):
            continue
        state = load_state(state_file)
        if not state.models:
            continue

        model_names = sorted(state.models)
        payload["subsets"][subset_name] = {
            "models": model_names,
            "model_count": len(model_names),
            "total_battles": state.total_battles,
        }

    if output_path:
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        logger.info(f"Wrote official_models.json to {output_path}")

    return payload
813
+
814
+
815
def print_official_models_summary(data: dict[str, Any]) -> None:
    """Print a per-subset summary of the official models listing."""
    print("\n=== Official Models ===\n")
    print(f"Last Updated: {data.get('last_updated', 'N/A')}")
    print()

    subset_map = data.get("subsets", {})
    if not subset_map:
        print("No subsets found.")
        return

    for subset_name, entry in sorted(subset_map.items()):
        model_names = entry.get("models", [])
        # Assemble the subset's report as one buffered write.
        lines = [f"Subset: {subset_name}", f"  Models ({len(model_names)}):"]
        lines.extend(f"    - {name}" for name in model_names)
        lines.append(f"  Total Battles: {entry.get('total_battles', 0):,}")
        lines.append("")
        print("\n".join(lines))
genarena/validation/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Validation module for GenArena submissions."""
2
+
3
+ from genarena.validation.schema import (
4
+ SUBMISSION_SCHEMA,
5
+ validate_submission_schema,
6
+ )
7
+ from genarena.validation.validator import (
8
+ validate_submission_file,
9
+ validate_submission_data,
10
+ ValidationReport,
11
+ )
12
+
13
+ __all__ = [
14
+ "SUBMISSION_SCHEMA",
15
+ "validate_submission_schema",
16
+ "validate_submission_file",
17
+ "validate_submission_data",
18
+ "ValidationReport",
19
+ ]
genarena/validation/schema.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ JSON Schema definition for GenArena submissions.
3
+
4
+ This schema defines the structure of submission metadata files
5
+ that are submitted via GitHub PR to the official leaderboard.
6
+ """
7
+
8
+ from typing import Any
9
+
10
# JSON Schema for submission metadata.
# Validated by validate_submission_schema(); structure mirrors the dict
# produced by create_submission_metadata().
SUBMISSION_SCHEMA: dict[str, Any] = {
    "$schema": "https://json-schema.org/draft/2020-12/schema",
    "title": "GenArena Submission",
    "description": "Metadata for a GenArena evaluation submission",
    "type": "object",
    # Top-level fields that every submission must carry.
    "required": [
        "schema_version",
        "submission_id",
        "created_at",
        "submitter",
        "experiment",
        "data_location",
        "elo_preview",
    ],
    "properties": {
        "schema_version": {
            "type": "string",
            "description": "Schema version (e.g., '1.0')",
            "pattern": "^\\d+\\.\\d+$",
        },
        "submission_id": {
            "type": "string",
            "description": "Unique submission identifier",
            # Matches IDs generated as sub_<yyyymmddThhmmss>_<8 hex chars>.
            "pattern": "^sub_\\d{8}T\\d{6}_[a-f0-9]{8}$",
        },
        "created_at": {
            "type": "string",
            "description": "ISO 8601 timestamp of submission creation",
            "format": "date-time",
        },
        # Who submitted; only the GitHub username is mandatory.
        "submitter": {
            "type": "object",
            "required": ["github_username"],
            "properties": {
                "github_username": {
                    "type": "string",
                    "description": "GitHub username of submitter",
                    "minLength": 1,
                },
                "contact": {
                    "type": "string",
                    "description": "Optional contact email",
                    "format": "email",
                },
            },
        },
        # What was evaluated: experiment identity, models, battle counts.
        "experiment": {
            "type": "object",
            "required": [
                "exp_name",
                "subset",
                "models",
                "new_models",
                "total_battles",
            ],
            "properties": {
                "exp_name": {
                    "type": "string",
                    "description": "Experiment name (must end with _yyyymmdd)",
                    "pattern": "^.+_\\d{8}$",
                },
                "subset": {
                    "type": "string",
                    "description": "Subset name (e.g., 'basic')",
                    "minLength": 1,
                },
                "models": {
                    "type": "array",
                    "description": "List of all model names in the experiment",
                    "items": {"type": "string"},
                    # A battle needs at least two participants.
                    "minItems": 2,
                },
                "new_models": {
                    "type": "array",
                    "description": "List of new model names (not in official leaderboard)",
                    "items": {"type": "string"},
                    # A submission must introduce at least one new model.
                    "minItems": 1,
                },
                "existing_models": {
                    "type": "array",
                    "description": "List of existing model names (already in official)",
                    "items": {"type": "string"},
                },
                "model_pairs": {
                    "type": "array",
                    "description": "List of model pairs evaluated",
                    "items": {
                        "type": "array",
                        "items": {"type": "string"},
                        "minItems": 2,
                        "maxItems": 2,
                    },
                },
                "total_battles": {
                    "type": "integer",
                    "description": "Total number of battles",
                    "minimum": 1,
                },
                "battles_per_pair": {
                    "type": "object",
                    "description": "Battle count per model pair",
                    "additionalProperties": {"type": "integer"},
                },
            },
        },
        # Where the packed data lives on HuggingFace.
        "data_location": {
            "type": "object",
            "required": ["hf_repo_id", "files"],
            "properties": {
                "hf_repo_id": {
                    "type": "string",
                    "description": "HuggingFace repository ID",
                    "pattern": "^[\\w.-]+/[\\w.-]+$",
                },
                "hf_revision": {
                    "type": "string",
                    "description": "HuggingFace revision/branch",
                    "default": "main",
                },
                "files": {
                    "type": "object",
                    "required": ["models_zip", "pk_logs_zip"],
                    "properties": {
                        "models_zip": {
                            "$ref": "#/$defs/file_info",
                        },
                        "pk_logs_zip": {
                            "$ref": "#/$defs/file_info",
                        },
                    },
                },
            },
        },
        # Locally computed ELO results, shown in the PR for review.
        "elo_preview": {
            "type": "object",
            "required": ["ratings"],
            "properties": {
                "ratings": {
                    "type": "object",
                    "description": "ELO ratings by model",
                    "additionalProperties": {"type": "number"},
                },
                "ci_95": {
                    "type": "object",
                    "description": "95% confidence intervals by model",
                    "additionalProperties": {
                        "type": "array",
                        "items": {"type": "number"},
                        # Exactly [lower, upper].
                        "minItems": 2,
                        "maxItems": 2,
                    },
                },
            },
        },
        "evaluation_config": {
            "type": "object",
            "description": "Evaluation configuration used",
            "properties": {
                "judge_model": {
                    "type": "string",
                    "description": "VLM judge model name",
                },
                "prompt_module": {
                    "type": "string",
                    "description": "Prompt module name",
                },
                "temperature": {
                    "type": "number",
                    "description": "VLM temperature",
                    "minimum": 0,
                },
                "position_debiasing": {
                    "type": "boolean",
                    "description": "Whether position debiasing was used",
                },
            },
        },
        "title": {
            "type": "string",
            "description": "Submission title",
        },
        "description": {
            "type": "string",
            "description": "Submission description",
        },
        "verification": {
            "type": "object",
            "properties": {
                "local_validation_passed": {
                    "type": "boolean",
                    "description": "Whether local validation passed",
                },
                "genarena_version": {
                    "type": "string",
                    "description": "genarena version used for submission",
                },
            },
        },
    },
    # Shared sub-schemas referenced via $ref above.
    "$defs": {
        "file_info": {
            "type": "object",
            "required": ["path", "sha256", "size_bytes"],
            "properties": {
                "path": {
                    "type": "string",
                    "description": "File path in HF repo",
                },
                "sha256": {
                    "type": "string",
                    "description": "SHA256 checksum",
                    "pattern": "^[a-f0-9]{64}$",
                },
                "size_bytes": {
                    "type": "integer",
                    "description": "File size in bytes",
                    "minimum": 1,
                },
            },
        },
    },
}
233
+
234
+
235
def validate_submission_schema(submission: dict[str, Any]) -> tuple[bool, list[str]]:
    """
    Validate submission against JSON schema.

    Collects *all* schema violations rather than stopping at the first
    one (jsonschema.validate raises on the first error only), so the
    submitter sees every problem in a single validation pass. Falls back
    to a structural required-field check when jsonschema is unavailable.

    Args:
        submission: Submission metadata dictionary

    Returns:
        Tuple of (is_valid, list of error messages)
    """
    try:
        import jsonschema
    except ImportError:
        # jsonschema is an optional dependency; degrade gracefully.
        return _basic_validation(submission)

    errors: list[str] = []
    try:
        validator = jsonschema.Draft202012Validator(SUBMISSION_SCHEMA)
        for err in validator.iter_errors(submission):
            errors.append(f"Schema validation error: {err.message}")
            if err.path:
                errors.append(f"  at path: {'.'.join(str(p) for p in err.path)}")
    except jsonschema.SchemaError as e:
        # The schema itself is malformed — report rather than crash.
        errors.append(f"Schema error: {e.message}")

    return not errors, errors
264
+
265
+
266
+ def _basic_validation(submission: dict[str, Any]) -> tuple[bool, list[str]]:
267
+ """Basic validation without jsonschema library."""
268
+ errors: list[str] = []
269
+
270
+ required_fields = [
271
+ "schema_version",
272
+ "submission_id",
273
+ "created_at",
274
+ "submitter",
275
+ "experiment",
276
+ "data_location",
277
+ "elo_preview",
278
+ ]
279
+
280
+ for field in required_fields:
281
+ if field not in submission:
282
+ errors.append(f"Missing required field: {field}")
283
+
284
+ if errors:
285
+ return False, errors
286
+
287
+ # Check submitter
288
+ if "github_username" not in submission.get("submitter", {}):
289
+ errors.append("Missing submitter.github_username")
290
+
291
+ # Check experiment
292
+ exp = submission.get("experiment", {})
293
+ exp_required = ["exp_name", "subset", "models", "new_models", "total_battles"]
294
+ for field in exp_required:
295
+ if field not in exp:
296
+ errors.append(f"Missing experiment.{field}")
297
+
298
+ # Check new_models is not empty
299
+ if not exp.get("new_models"):
300
+ errors.append("experiment.new_models must have at least one model")
301
+
302
+ # Check data_location
303
+ data_loc = submission.get("data_location", {})
304
+ if "hf_repo_id" not in data_loc:
305
+ errors.append("Missing data_location.hf_repo_id")
306
+ if "files" not in data_loc:
307
+ errors.append("Missing data_location.files")
308
+ else:
309
+ files = data_loc.get("files", {})
310
+ for zip_type in ["models_zip", "pk_logs_zip"]:
311
+ if zip_type not in files:
312
+ errors.append(f"Missing data_location.files.{zip_type}")
313
+ else:
314
+ file_info = files[zip_type]
315
+ for field in ["path", "sha256", "size_bytes"]:
316
+ if field not in file_info:
317
+ errors.append(f"Missing data_location.files.{zip_type}.{field}")
318
+
319
+ # Check elo_preview
320
+ if "ratings" not in submission.get("elo_preview", {}):
321
+ errors.append("Missing elo_preview.ratings")
322
+
323
+ return len(errors) == 0, errors
genarena/validation/validator.py ADDED
@@ -0,0 +1,325 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validator for GenArena submissions.
3
+
4
+ This module provides functions to validate submission files,
5
+ including downloading and verifying data from HuggingFace.
6
+ Used by the GitHub Actions bot for automated validation.
7
+ """
8
+
9
+ import hashlib
10
+ import json
11
+ import logging
12
+ import os
13
+ import tempfile
14
+ import zipfile
15
+ from dataclasses import dataclass, field
16
+ from typing import Any, Optional
17
+
18
+ from genarena.validation.schema import validate_submission_schema
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@dataclass
class ValidationCheck:
    """Single validation check result."""

    # Human-readable name of the check (e.g. "JSON parse").
    name: str
    # Whether the check passed.
    passed: bool
    # Error message when the check failed (or an informational note); None otherwise.
    error: Optional[str] = None
30
+
31
+
32
@dataclass
class ValidationReport:
    """Complete validation report for a submission."""

    status: str  # "success" or "failed"
    # Identity of the submission/experiment being validated.
    submission_id: str = ""
    exp_name: str = ""
    subset: str = ""
    # All model names in the experiment, and the subset claimed as new.
    models: list[str] = field(default_factory=list)
    new_models: list[str] = field(default_factory=list)
    total_battles: int = 0
    # Individual check results, in the order they were performed.
    checks: list[ValidationCheck] = field(default_factory=list)
    # ELO comparison data produced by validate_submission_data, keyed by model.
    elo_comparison: dict[str, dict[str, float]] = field(default_factory=dict)
    # Flat list of "<check name>: <error>" strings for failed checks.
    errors: list[str] = field(default_factory=list)

    def add_check(self, name: str, passed: bool, error: Optional[str] = None) -> None:
        """Add a validation check result.

        A single failed check permanently flips the report status to
        "failed" and, when an error message is given, records it in
        ``errors`` as "<name>: <error>".
        """
        self.checks.append(ValidationCheck(name=name, passed=passed, error=error))
        if not passed:
            self.status = "failed"
            if error:
                self.errors.append(f"{name}: {error}")

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "status": self.status,
            "submission_id": self.submission_id,
            "exp_name": self.exp_name,
            "subset": self.subset,
            "models": self.models,
            "new_models": self.new_models,
            "total_battles": self.total_battles,
            # Checks are flattened to plain dicts for JSON output.
            "checks": [
                {"name": c.name, "passed": c.passed, "error": c.error}
                for c in self.checks
            ],
            "elo_comparison": self.elo_comparison,
            "errors": self.errors,
        }
72
+
73
+
74
def validate_submission_file(
    submission_path: str,
    official_models_path: Optional[str] = None,
    download_data: bool = True,
) -> ValidationReport:
    """
    Validate a submission JSON file.

    Main entry point used by the GitHub Actions bot: parses the file,
    checks it against the schema, verifies claimed new models against
    the official list, and optionally verifies the uploaded data.

    Args:
        submission_path: Path to submission JSON file
        official_models_path: Path to official_models.json (optional)
        download_data: Whether to download and verify data from HF

    Returns:
        ValidationReport with all check results
    """
    report = ValidationReport(status="success")

    # Step 1: load and parse the JSON document.
    try:
        with open(submission_path, "r", encoding="utf-8") as fh:
            submission = json.load(fh)
    except json.JSONDecodeError as exc:
        report.add_check("JSON parse", False, str(exc))
        return report
    except IOError as exc:
        report.add_check("File read", False, str(exc))
        return report
    report.add_check("JSON parse", True)

    # Step 2: schema validation; abort on failure.
    schema_ok, schema_errors = validate_submission_schema(submission)
    if not schema_ok:
        for err in schema_errors:
            report.add_check("Schema validation", False, err)
        return report
    report.add_check("Schema validation", True)

    # Copy basic identifying info onto the report.
    experiment = submission.get("experiment", {})
    report.submission_id = submission.get("submission_id", "")
    report.exp_name = experiment.get("exp_name", "")
    report.subset = experiment.get("subset", "")
    report.models = experiment.get("models", [])
    report.new_models = experiment.get("new_models", [])
    report.total_battles = experiment.get("total_battles", 0)

    # Step 3: verify that claimed "new" models are absent from the
    # official leaderboard for this subset.
    if official_models_path and os.path.isfile(official_models_path):
        try:
            with open(official_models_path, "r", encoding="utf-8") as fh:
                official_data = json.load(fh)
            known_models = set(
                official_data.get("subsets", {})
                .get(report.subset, {})
                .get("models", [])
            )
            for model in report.new_models:
                if model in known_models:
                    report.add_check(
                        f"Model '{model}' is new",
                        False,
                        "Model already exists in official leaderboard",
                    )
                else:
                    report.add_check(f"Model '{model}' is new", True)
        except Exception as exc:
            report.add_check(
                "Check official models", False, f"Failed to load official models: {exc}"
            )
    else:
        report.add_check(
            "Check official models",
            True,
            "Skipped (no official_models.json provided)",
        )

    # Step 4: optionally download and verify the data from HuggingFace,
    # merging the sub-report's checks into this one.
    if not download_data:
        report.add_check("Data verification", True, "Skipped (download_data=False)")
        return report

    data_report = validate_submission_data(submission)
    for check in data_report.checks:
        report.checks.append(check)
        if not check.passed:
            report.status = "failed"
            if check.error:
                report.errors.append(f"{check.name}: {check.error}")
    report.elo_comparison = data_report.elo_comparison

    return report
172
+
173
+
174
def _sha256_file(path: str, chunk_size: int = 1 << 20) -> str:
    """Return the SHA256 hex digest of a file, reading it in chunks.

    Chunked reading keeps memory bounded for large archives (a whole-file
    read would hold the entire ZIP in memory at once).

    Args:
        path: Path to the file to hash.
        chunk_size: Bytes read per iteration (default 1 MiB).

    Returns:
        Lowercase hex digest string.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def _safe_extract(zf: zipfile.ZipFile, dest: str) -> None:
    """Extract all members of *zf* into *dest*, refusing zip-slip paths.

    Submission archives come from external users and are untrusted, so any
    member whose resolved path would escape *dest* is rejected before
    extraction begins.

    Args:
        zf: Open ZipFile to extract.
        dest: Destination directory.

    Raises:
        ValueError: If a member path escapes the destination directory.
    """
    dest_root = os.path.realpath(dest)
    for member in zf.namelist():
        target = os.path.realpath(os.path.join(dest_root, member))
        if os.path.commonpath([dest_root, target]) != dest_root:
            raise ValueError(f"Unsafe path in archive: {member}")
    zf.extractall(dest)


def validate_submission_data(submission: dict[str, Any]) -> ValidationReport:
    """
    Download and validate submission data from HuggingFace.

    Downloads the pk_logs ZIP, verifies checksum, extracts battles,
    and recalculates ELO for comparison.

    Args:
        submission: Submission metadata dictionary

    Returns:
        ValidationReport with data validation results
    """
    report = ValidationReport(status="success")

    data_loc = submission.get("data_location", {})
    hf_repo = data_loc.get("hf_repo_id", "")
    hf_revision = data_loc.get("hf_revision", "main")
    files = data_loc.get("files", {})
    pk_logs_info = files.get("pk_logs_zip", {})

    if not hf_repo or not pk_logs_info:
        report.add_check("Data location", False, "Missing HF repo or file info")
        return report

    try:
        from huggingface_hub import hf_hub_download
    except ImportError:
        report.add_check(
            "HuggingFace Hub",
            False,
            "huggingface_hub not installed",
        )
        return report

    with tempfile.TemporaryDirectory() as tmpdir:
        # Download pk_logs ZIP
        try:
            pk_logs_path = hf_hub_download(
                repo_id=hf_repo,
                filename=pk_logs_info["path"],
                repo_type="dataset",
                revision=hf_revision,
                local_dir=tmpdir,
            )
            report.add_check("Download pk_logs", True)
        except Exception as e:
            report.add_check("Download pk_logs", False, str(e))
            return report

        # Verify SHA256 (chunked read so large archives stay memory-bounded)
        expected_sha = pk_logs_info.get("sha256", "")
        try:
            actual_sha = _sha256_file(pk_logs_path)
            if actual_sha == expected_sha:
                report.add_check("SHA256 checksum", True)
            else:
                report.add_check(
                    "SHA256 checksum",
                    False,
                    f"Expected {expected_sha[:16]}..., got {actual_sha[:16]}...",
                )
                return report
        except Exception as e:
            report.add_check("SHA256 checksum", False, str(e))
            return report

        # Extract ZIP, guarding against zip-slip path traversal since the
        # archive originates from an untrusted submitter.
        extract_dir = os.path.join(tmpdir, "extracted")
        try:
            with zipfile.ZipFile(pk_logs_path, "r") as zf:
                _safe_extract(zf, extract_dir)
            report.add_check("Extract ZIP", True)
        except Exception as e:
            report.add_check("Extract ZIP", False, str(e))
            return report

        # Find battle log files.
        # The ZIP structure is: <exp_name>/*.jsonl
        battle_records = []
        try:
            for root, _dirs, filenames in os.walk(extract_dir):
                # raw_outputs directories hold judge transcripts, not battles
                if "raw_outputs" in root:
                    continue
                for filename in filenames:
                    if not filename.endswith(".jsonl"):
                        continue
                    filepath = os.path.join(root, filename)
                    with open(filepath, "r", encoding="utf-8") as f:
                        for line in f:
                            line = line.strip()
                            if not line:
                                continue
                            try:
                                battle_records.append(json.loads(line))
                            except json.JSONDecodeError:
                                # Tolerate individually malformed lines; the
                                # battle-count check below catches shortfalls.
                                continue
            report.add_check("Parse battle logs", True)
        except Exception as e:
            report.add_check("Parse battle logs", False, str(e))
            return report

        # Verify battle count
        expected_battles = submission.get("experiment", {}).get("total_battles", 0)
        if len(battle_records) == expected_battles:
            report.add_check("Battle count", True)
        else:
            report.add_check(
                "Battle count",
                False,
                f"Expected {expected_battles}, got {len(battle_records)}",
            )

        # Recalculate ELO and compare against the submitted preview
        try:
            from genarena.bt_elo import compute_bt_elo_ratings

            battles = [
                (r["model_a"], r["model_b"], r["final_winner"])
                for r in battle_records
                if r.get("model_a") and r.get("model_b") and r.get("final_winner")
            ]

            if battles:
                recalc_elo = compute_bt_elo_ratings(battles)
                submitted_elo = submission.get("elo_preview", {}).get("ratings", {})

                all_match = True
                for model, submitted_rating in submitted_elo.items():
                    recalc_rating = recalc_elo.get(model, 0)
                    report.elo_comparison[model] = {
                        "submitted": submitted_rating,
                        "recalculated": recalc_rating,
                    }

                    # Allow small floating point differences (±1.0)
                    diff = abs(submitted_rating - recalc_rating)
                    if diff > 1.0:
                        report.add_check(
                            f"ELO '{model}'",
                            False,
                            f"Diff: {diff:.1f} (submitted: {submitted_rating:.1f}, "
                            f"recalc: {recalc_rating:.1f})",
                        )
                        all_match = False

                if all_match:
                    report.add_check("ELO verification", True)
            else:
                # Previously no check at all was recorded when no usable
                # battle tuples were found; record the skip explicitly.
                report.add_check(
                    "ELO verification", True, "Skipped (no usable battle records)"
                )
        except Exception as e:
            report.add_check("ELO verification", False, str(e))

    return report
genarena/visualize/__init__.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GenArena Arena Visualization Module.
3
+
4
+ Provides a web-based interface for browsing and analyzing battle records.
5
+ """
6
+
7
+ from genarena.visualize.app import create_app
8
+ from genarena.visualize.data_loader import ArenaDataLoader
9
+
10
+ __all__ = [
11
+ "create_app",
12
+ "ArenaDataLoader",
13
+ ]
14
+
genarena/visualize/app.py ADDED
@@ -0,0 +1,934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Flask application for arena visualization."""
2
+
3
+ import io
4
+ import os
5
+
6
+ from flask import Flask, jsonify, render_template, request, send_file, abort, redirect
7
+
8
+ from genarena.visualize.data_loader import ArenaDataLoader
9
+
10
+
11
def create_app(arena_dir: str, data_dir: str) -> Flask:
    """
    Create and configure the Flask application.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to data directory

    Returns:
        Configured Flask app
    """
    # Get the directory containing this file for templates/static
    app_dir = os.path.dirname(os.path.abspath(__file__))

    app = Flask(
        __name__,
        template_folder=os.path.join(app_dir, "templates"),
        static_folder=os.path.join(app_dir, "static"),
    )

    # Store paths in config
    app.config["ARENA_DIR"] = arena_dir
    app.config["DATA_DIR"] = data_dir

    # Create data loader. All route closures below capture this single
    # loader instance; it lives for the lifetime of the app.
    data_loader = ArenaDataLoader(arena_dir, data_dir)

    # ========== Page Routes ==========

    @app.route("/")
    def index():
        """Main page."""
        return render_template("index.html")

    # ========== API Routes ==========

    @app.route("/api/subsets")
    def api_subsets():
        """Get list of available subsets."""
        subsets = data_loader.discover_subsets()
        return jsonify({"subsets": subsets})

    @app.route("/api/subsets/<subset>/info")
    def api_subset_info(subset: str):
        """Get information about a subset."""
        info = data_loader.get_subset_info(subset)
        if not info:
            return jsonify({"error": "Subset not found"}), 404

        return jsonify({
            "name": info.name,
            "models": info.models,
            "experiments": info.experiments,
            "total_battles": info.total_battles,
            "min_input_images": info.min_input_images,
            "max_input_images": info.max_input_images,
            "prompt_sources": info.prompt_sources,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles")
    def api_battles(subset: str, exp_name: str):
        """Get paginated battle records."""
        # Parse query parameters
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 20, type=int)
        result_filter = request.args.get("result", None, type=str)
        consistency = request.args.get("consistent", None, type=str)
        min_images = request.args.get("min_images", None, type=int)
        max_images = request.args.get("max_images", None, type=int)
        prompt_source = request.args.get("prompt_source", None, type=str)

        # Support multiple models (comma-separated or multiple params)
        models_param = request.args.get("models", None, type=str)
        models = None
        if models_param:
            models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Convert consistency filter: "true"/"false" string -> bool,
        # anything else (including absent) -> None (no filtering)
        consistency_filter = None
        if consistency == "true":
            consistency_filter = True
        elif consistency == "false":
            consistency_filter = False

        # Get battles
        records, total = data_loader.get_battles(
            subset=subset,
            exp_name=exp_name,
            page=page,
            page_size=page_size,
            models=models,
            result_filter=result_filter,
            consistency_filter=consistency_filter,
            min_images=min_images,
            max_images=max_images,
            prompt_source=prompt_source,
        )

        return jsonify({
            "battles": [r.to_dict() for r in records],
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles/<path:battle_id>")
    def api_battle_detail(subset: str, exp_name: str, battle_id: str):
        """Get detailed battle record."""
        # Parse battle_id: model_a_vs_model_b:sample_index
        # NOTE(review): a model name that itself contains "_vs_" would make
        # this split ambiguous (only the first occurrence is used) — confirm
        # model names never contain that token.
        try:
            parts = battle_id.rsplit(":", 1)
            sample_index = int(parts[1])
            model_part = parts[0]

            # Split model names
            if "_vs_" in model_part:
                models = model_part.split("_vs_")
                model_a, model_b = models[0], models[1]
            else:
                return jsonify({"error": "Invalid battle_id format"}), 400
        except (ValueError, IndexError):
            return jsonify({"error": "Invalid battle_id format"}), 400

        record = data_loader.get_battle_detail(
            subset, exp_name, model_a, model_b, sample_index
        )

        if not record:
            return jsonify({"error": "Battle not found"}), 404

        return jsonify(record.to_detail_dict())

    @app.route("/api/subsets/<subset>/stats")
    def api_stats(subset: str):
        """Get statistics for a subset."""
        exp_name = request.args.get("exp_name", None, type=str)
        stats = data_loader.get_stats(subset, exp_name)

        if not stats:
            return jsonify({"error": "Subset not found"}), 404

        return jsonify(stats)

    @app.route("/api/subsets/<subset>/leaderboard")
    def api_elo_leaderboard(subset: str):
        """Get ELO leaderboard for a subset."""
        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        leaderboard = data_loader.get_elo_leaderboard(subset, filter_models)
        return jsonify({"leaderboard": leaderboard})

    @app.route("/api/subsets/<subset>/models/<path:model>/stats")
    def api_model_stats(subset: str, model: str):
        """Get detailed statistics for a specific model including win rates against all opponents."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        stats = data_loader.get_model_vs_stats(subset, model, exp_name)

        if not stats:
            return jsonify({"error": "Model not found"}), 404

        return jsonify(stats)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/h2h")
    def api_head_to_head(subset: str, exp_name: str):
        """Get head-to-head statistics between two models."""
        model_a = request.args.get("model_a", None, type=str)
        model_b = request.args.get("model_b", None, type=str)

        if not model_a or not model_b:
            return jsonify({"error": "model_a and model_b are required"}), 400

        h2h = data_loader.get_head_to_head(subset, exp_name, model_a, model_b)
        return jsonify(h2h)

    @app.route("/api/subsets/<subset>/samples/<int:sample_index>/input_count")
    def api_input_image_count(subset: str, sample_index: int):
        """Get the number of input images for a sample."""
        count = data_loader.get_input_image_count(subset, sample_index)
        return jsonify({"count": count})

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/all_models")
    def api_sample_all_models(subset: str, exp_name: str, sample_index: int):
        """Get all model outputs for a specific sample, sorted by win rate."""
        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        # stats_scope: 'filtered' = only count battles between filtered models
        #              'all' = count all battles (but show only filtered models)
        stats_scope = request.args.get("stats_scope", "filtered", type=str)

        result = data_loader.get_sample_all_models(
            subset, exp_name, sample_index, filter_models, stats_scope
        )

        if not result:
            return jsonify({"error": "Sample not found"}), 404

        return jsonify(result)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/models/<path:model>/battles")
    def api_model_battles_for_sample(subset: str, exp_name: str, sample_index: int, model: str):
        """Get all battle records for a specific model on a specific sample."""
        # Parse optional opponent models filter (comma-separated)
        opponents_param = request.args.get("opponents", None, type=str)
        opponent_models = None
        if opponents_param:
            opponent_models = [m.strip() for m in opponents_param.split(",") if m.strip()]

        result = data_loader.get_model_battles_for_sample(
            subset=subset,
            exp_name=exp_name,
            sample_index=sample_index,
            model=model,
            opponent_models=opponent_models,
        )

        return jsonify(result)

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/prompts")
    def api_prompts(subset: str, exp_name: str):
        """Get paginated list of prompts/samples with all model outputs."""
        # Parse query parameters
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 10, type=int)
        min_images = request.args.get("min_images", None, type=int)
        max_images = request.args.get("max_images", None, type=int)
        prompt_source = request.args.get("prompt_source", None, type=str)

        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Get prompts
        prompts, total = data_loader.get_prompts(
            subset=subset,
            exp_name=exp_name,
            page=page,
            page_size=page_size,
            min_images=min_images,
            max_images=max_images,
            prompt_source=prompt_source,
            filter_models=filter_models,
        )

        return jsonify({
            "prompts": prompts,
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/search")
    def api_search(subset: str, exp_name: str):
        """Search battles by text query (full-text search across instruction, task_type, prompt_source, metadata)."""
        # Parse query parameters
        query = request.args.get("q", "", type=str)
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 20, type=int)
        consistency = request.args.get("consistent", None, type=str)

        # Support multiple models (comma-separated)
        models_param = request.args.get("models", None, type=str)
        models = None
        if models_param:
            models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Convert consistency filter
        consistency_filter = None
        if consistency == "true":
            consistency_filter = True
        elif consistency == "false":
            consistency_filter = False

        # Search battles
        records, total = data_loader.search_battles(
            subset=subset,
            exp_name=exp_name,
            query=query,
            page=page,
            page_size=page_size,
            models=models,
            consistency_filter=consistency_filter,
        )

        return jsonify({
            "battles": [r.to_dict() for r in records],
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
            "query": query,
        })

    @app.route("/api/subsets/<subset>/experiments/<exp_name>/search/prompts")
    def api_search_prompts(subset: str, exp_name: str):
        """Search prompts by text query."""
        # Parse query parameters
        query = request.args.get("q", "", type=str)
        page = request.args.get("page", 1, type=int)
        page_size = request.args.get("page_size", 10, type=int)

        # Support multiple models filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        # Search prompts
        prompts, total = data_loader.search_prompts(
            subset=subset,
            exp_name=exp_name,
            query=query,
            page=page,
            page_size=page_size,
            filter_models=filter_models,
        )

        return jsonify({
            "prompts": prompts,
            "total": total,
            "page": page,
            "page_size": page_size,
            "total_pages": (total + page_size - 1) // page_size,
            "query": query,
        })

    @app.route("/api/subsets/<subset>/matrix")
    def api_win_rate_matrix(subset: str):
        """Get win rate matrix for all model pairs."""
        exp_name = request.args.get("exp_name", "__all__", type=str)

        # Support model filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        result = data_loader.get_win_rate_matrix(subset, exp_name, filter_models)
        return jsonify(result)

    @app.route("/api/subsets/<subset>/leaderboard/by-source")
    def api_elo_by_source(subset: str):
        """Get ELO rankings grouped by prompt source."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        result = data_loader.get_elo_by_source(subset, exp_name)
        return jsonify(result)

    @app.route("/api/subsets/<subset>/elo-history")
    def api_elo_history(subset: str):
        """Get ELO history over time."""
        exp_name = request.args.get("exp_name", "__all__", type=str)
        granularity = request.args.get("granularity", "day", type=str)

        # Support model filter (comma-separated)
        models_param = request.args.get("models", None, type=str)
        filter_models = None
        if models_param:
            filter_models = [m.strip() for m in models_param.split(",") if m.strip()]

        result = data_loader.get_elo_history(subset, exp_name, granularity, filter_models)
        return jsonify(result)

    @app.route("/api/overview/leaderboards")
    def api_overview_leaderboards():
        """Get leaderboard data for all subsets (for Overview page)."""
        result = data_loader.get_all_subsets_leaderboards()
        return jsonify(result)

    @app.route("/api/cross-subset/info")
    def api_cross_subset_info():
        """Get information about models across multiple subsets."""
        subsets_param = request.args.get("subsets", "", type=str)
        if not subsets_param:
            return jsonify({"error": "subsets parameter is required"}), 400

        subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
        if len(subsets) < 1:
            return jsonify({"error": "At least 1 subset required"}), 400

        result = data_loader.get_cross_subset_info(subsets)
        return jsonify(result)

    @app.route("/api/cross-subset/elo")
    def api_cross_subset_elo():
        """Compute ELO rankings across multiple subsets."""
        subsets_param = request.args.get("subsets", "", type=str)
        if not subsets_param:
            return jsonify({"error": "subsets parameter is required"}), 400

        subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
        if len(subsets) < 1:
            return jsonify({"error": "At least 1 subset required"}), 400

        exp_name = request.args.get("exp_name", "__all__", type=str)
        model_scope = request.args.get("model_scope", "all", type=str)

        result = data_loader.get_cross_subset_elo(subsets, exp_name, model_scope)
        return jsonify(result)

    # ========== Image Routes ==========

    @app.route("/images/<subset>/<model>/<int:sample_index>")
    def serve_model_image(subset: str, model: str, sample_index: int):
        """Serve model output image."""
        image_path = data_loader.get_image_path(subset, model, sample_index)

        if not image_path or not os.path.isfile(image_path):
            abort(404)

        # Determine mime type from the file extension; unknown extensions
        # fall back to image/png.
        ext = os.path.splitext(image_path)[1].lower()
        mime_types = {
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".webp": "image/webp",
        }
        mimetype = mime_types.get(ext, "image/png")

        return send_file(
            image_path,
            mimetype=mimetype,
            max_age=3600,  # Cache for 1 hour
        )

    @app.route("/images/<subset>/input/<int:sample_index>")
    @app.route("/images/<subset>/input/<int:sample_index>/<int:img_idx>")
    def serve_input_image(subset: str, sample_index: int, img_idx: int = 0):
        """Serve input image from parquet dataset. Supports multiple images via img_idx."""
        image_bytes = data_loader.get_input_image_by_idx(subset, sample_index, img_idx)

        if not image_bytes:
            abort(404)

        # Bytes come from the parquet dataset; served as PNG
        # (presumably stored as PNG — confirm in data_loader).
        return send_file(
            io.BytesIO(image_bytes),
            mimetype="image/png",
            max_age=3600,
        )

    return app
463
+
464
+
465
def run_server(
    arena_dir: str,
    data_dir: str,
    host: str = "0.0.0.0",
    port: int = 8080,
    debug: bool = False,
) -> None:
    """
    Run the visualization server.

    Configures basic logging, prints a startup banner, builds the app
    (which preloads arena data and can take a while), then blocks in
    Flask's development server loop.

    Args:
        arena_dir: Path to arena directory
        data_dir: Path to data directory
        host: Host to bind to
        port: Port to listen on
        debug: Enable debug mode
    """
    import logging
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(message)s",
        datefmt="%H:%M:%S"
    )

    # Hoist the banner separator: the original recomputed '='*60 in
    # placeholder-less f-strings (lint F541) on every print call.
    sep = "=" * 60

    print(f"\n{sep}")
    print(" GenArena Arena Visualizer")
    print(sep)
    print(f" Arena Dir: {arena_dir}")
    print(f" Data Dir: {data_dir}")
    print(sep)
    print(" Preloading data (this may take a while)...")
    print(f"{sep}\n")

    # create_app preloads all arena data, hence the warning above.
    app = create_app(arena_dir, data_dir)

    print(f"\n{sep}")
    print(f" Server ready: http://{host}:{port}")
    print(f"{sep}\n")

    # threaded=True lets the dev server handle concurrent requests
    # (image serving + API calls from the same page load).
    app.run(host=host, port=port, debug=debug, threaded=True)
505
+
506
+
507
+ def create_hf_app(
508
+ arena_dir: str,
509
+ data_dir: str,
510
+ hf_repo: str,
511
+ image_files: list[str],
512
+ ) -> Flask:
513
+ """
514
+ Create Flask app for HuggingFace Spaces deployment.
515
+
516
+ This version uses HF CDN URLs for model output images instead of
517
+ serving them from local filesystem.
518
+
519
+ Args:
520
+ arena_dir: Path to arena directory (metadata only, no images)
521
+ data_dir: Path to data directory containing parquet files
522
+ hf_repo: HuggingFace repo ID for image CDN URLs
523
+ image_files: List of image file paths in the HF repo
524
+
525
+ Returns:
526
+ Configured Flask app for HF Spaces
527
+ """
528
+ from genarena.visualize.data_loader import HFArenaDataLoader
529
+
530
+ # Get the directory containing this file for templates/static
531
+ app_dir = os.path.dirname(os.path.abspath(__file__))
532
+
533
+ app = Flask(
534
+ __name__,
535
+ template_folder=os.path.join(app_dir, "templates"),
536
+ static_folder=os.path.join(app_dir, "static"),
537
+ )
538
+
539
+ # Store config
540
+ app.config["ARENA_DIR"] = arena_dir
541
+ app.config["DATA_DIR"] = data_dir
542
+ app.config["USE_HF_CDN"] = True
543
+ app.config["HF_REPO"] = hf_repo
544
+
545
+ # Create HF data loader
546
+ data_loader = HFArenaDataLoader(arena_dir, data_dir, hf_repo, image_files)
547
+
548
+ # ========== Page Routes ==========
549
+
550
+ @app.route("/")
551
+ def index():
552
+ """Main page."""
553
+ return render_template("index.html")
554
+
555
+ # ========== API Routes ==========
556
+ # Copy all API routes from create_app - they work the same way
557
+
558
+ @app.route("/api/subsets")
559
+ def api_subsets():
560
+ """Get list of available subsets."""
561
+ subsets = data_loader.discover_subsets()
562
+ return jsonify({"subsets": subsets})
563
+
564
+ @app.route("/api/subsets/<subset>/info")
565
+ def api_subset_info(subset: str):
566
+ """Get information about a subset."""
567
+ info = data_loader.get_subset_info(subset)
568
+ if not info:
569
+ return jsonify({"error": "Subset not found"}), 404
570
+
571
+ return jsonify({
572
+ "name": info.name,
573
+ "models": info.models,
574
+ "experiments": info.experiments,
575
+ "total_battles": info.total_battles,
576
+ "min_input_images": info.min_input_images,
577
+ "max_input_images": info.max_input_images,
578
+ "prompt_sources": info.prompt_sources,
579
+ })
580
+
581
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles")
582
+ def api_battles(subset: str, exp_name: str):
583
+ """Get paginated battle records."""
584
+ page = request.args.get("page", 1, type=int)
585
+ page_size = request.args.get("page_size", 20, type=int)
586
+ result_filter = request.args.get("result", None, type=str)
587
+ consistency = request.args.get("consistent", None, type=str)
588
+ min_images = request.args.get("min_images", None, type=int)
589
+ max_images = request.args.get("max_images", None, type=int)
590
+ prompt_source = request.args.get("prompt_source", None, type=str)
591
+
592
+ models_param = request.args.get("models", None, type=str)
593
+ models = None
594
+ if models_param:
595
+ models = [m.strip() for m in models_param.split(",") if m.strip()]
596
+
597
+ consistency_filter = None
598
+ if consistency == "true":
599
+ consistency_filter = True
600
+ elif consistency == "false":
601
+ consistency_filter = False
602
+
603
+ records, total = data_loader.get_battles(
604
+ subset=subset,
605
+ exp_name=exp_name,
606
+ page=page,
607
+ page_size=page_size,
608
+ models=models,
609
+ result_filter=result_filter,
610
+ consistency_filter=consistency_filter,
611
+ min_images=min_images,
612
+ max_images=max_images,
613
+ prompt_source=prompt_source,
614
+ )
615
+
616
+ return jsonify({
617
+ "battles": [r.to_dict() for r in records],
618
+ "total": total,
619
+ "page": page,
620
+ "page_size": page_size,
621
+ "total_pages": (total + page_size - 1) // page_size,
622
+ })
623
+
624
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/battles/<path:battle_id>")
625
+ def api_battle_detail(subset: str, exp_name: str, battle_id: str):
626
+ """Get detailed battle record."""
627
+ try:
628
+ parts = battle_id.rsplit(":", 1)
629
+ sample_index = int(parts[1])
630
+ model_part = parts[0]
631
+
632
+ if "_vs_" in model_part:
633
+ models = model_part.split("_vs_")
634
+ model_a, model_b = models[0], models[1]
635
+ else:
636
+ return jsonify({"error": "Invalid battle_id format"}), 400
637
+ except (ValueError, IndexError):
638
+ return jsonify({"error": "Invalid battle_id format"}), 400
639
+
640
+ record = data_loader.get_battle_detail(
641
+ subset, exp_name, model_a, model_b, sample_index
642
+ )
643
+
644
+ if not record:
645
+ return jsonify({"error": "Battle not found"}), 404
646
+
647
+ return jsonify(record.to_detail_dict())
648
+
649
+ @app.route("/api/subsets/<subset>/stats")
650
+ def api_stats(subset: str):
651
+ """Get statistics for a subset."""
652
+ exp_name = request.args.get("exp_name", None, type=str)
653
+ stats = data_loader.get_stats(subset, exp_name)
654
+
655
+ if not stats:
656
+ return jsonify({"error": "Subset not found"}), 404
657
+
658
+ return jsonify(stats)
659
+
660
+ @app.route("/api/subsets/<subset>/leaderboard")
661
+ def api_elo_leaderboard(subset: str):
662
+ """Get ELO leaderboard for a subset."""
663
+ models_param = request.args.get("models", None, type=str)
664
+ filter_models = None
665
+ if models_param:
666
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
667
+
668
+ leaderboard = data_loader.get_elo_leaderboard(subset, filter_models)
669
+ return jsonify({"leaderboard": leaderboard})
670
+
671
+ @app.route("/api/subsets/<subset>/models/<path:model>/stats")
672
+ def api_model_stats(subset: str, model: str):
673
+ """Get detailed statistics for a specific model."""
674
+ exp_name = request.args.get("exp_name", "__all__", type=str)
675
+ stats = data_loader.get_model_vs_stats(subset, model, exp_name)
676
+
677
+ if not stats:
678
+ return jsonify({"error": "Model not found"}), 404
679
+
680
+ return jsonify(stats)
681
+
682
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/h2h")
683
+ def api_head_to_head(subset: str, exp_name: str):
684
+ """Get head-to-head statistics between two models."""
685
+ model_a = request.args.get("model_a", None, type=str)
686
+ model_b = request.args.get("model_b", None, type=str)
687
+
688
+ if not model_a or not model_b:
689
+ return jsonify({"error": "model_a and model_b are required"}), 400
690
+
691
+ h2h = data_loader.get_head_to_head(subset, exp_name, model_a, model_b)
692
+ return jsonify(h2h)
693
+
694
+ @app.route("/api/subsets/<subset>/samples/<int:sample_index>/input_count")
695
+ def api_input_image_count(subset: str, sample_index: int):
696
+ """Get the number of input images for a sample."""
697
+ count = data_loader.get_input_image_count(subset, sample_index)
698
+ return jsonify({"count": count})
699
+
700
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/all_models")
701
+ def api_sample_all_models(subset: str, exp_name: str, sample_index: int):
702
+ """Get all model outputs for a specific sample."""
703
+ models_param = request.args.get("models", None, type=str)
704
+ filter_models = None
705
+ if models_param:
706
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
707
+
708
+ stats_scope = request.args.get("stats_scope", "filtered", type=str)
709
+
710
+ result = data_loader.get_sample_all_models(
711
+ subset, exp_name, sample_index, filter_models, stats_scope
712
+ )
713
+
714
+ if not result:
715
+ return jsonify({"error": "Sample not found"}), 404
716
+
717
+ return jsonify(result)
718
+
719
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/samples/<int:sample_index>/models/<path:model>/battles")
720
+ def api_model_battles_for_sample(subset: str, exp_name: str, sample_index: int, model: str):
721
+ """Get all battle records for a specific model on a specific sample."""
722
+ opponents_param = request.args.get("opponents", None, type=str)
723
+ opponent_models = None
724
+ if opponents_param:
725
+ opponent_models = [m.strip() for m in opponents_param.split(",") if m.strip()]
726
+
727
+ result = data_loader.get_model_battles_for_sample(
728
+ subset=subset,
729
+ exp_name=exp_name,
730
+ sample_index=sample_index,
731
+ model=model,
732
+ opponent_models=opponent_models,
733
+ )
734
+
735
+ return jsonify(result)
736
+
737
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/prompts")
738
+ def api_prompts(subset: str, exp_name: str):
739
+ """Get paginated list of prompts/samples."""
740
+ page = request.args.get("page", 1, type=int)
741
+ page_size = request.args.get("page_size", 10, type=int)
742
+ min_images = request.args.get("min_images", None, type=int)
743
+ max_images = request.args.get("max_images", None, type=int)
744
+ prompt_source = request.args.get("prompt_source", None, type=str)
745
+
746
+ models_param = request.args.get("models", None, type=str)
747
+ filter_models = None
748
+ if models_param:
749
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
750
+
751
+ prompts, total = data_loader.get_prompts(
752
+ subset=subset,
753
+ exp_name=exp_name,
754
+ page=page,
755
+ page_size=page_size,
756
+ min_images=min_images,
757
+ max_images=max_images,
758
+ prompt_source=prompt_source,
759
+ filter_models=filter_models,
760
+ )
761
+
762
+ return jsonify({
763
+ "prompts": prompts,
764
+ "total": total,
765
+ "page": page,
766
+ "page_size": page_size,
767
+ "total_pages": (total + page_size - 1) // page_size,
768
+ })
769
+
770
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/search")
771
+ def api_search(subset: str, exp_name: str):
772
+ """Search battles by text query."""
773
+ query = request.args.get("q", "", type=str)
774
+ page = request.args.get("page", 1, type=int)
775
+ page_size = request.args.get("page_size", 20, type=int)
776
+ consistency = request.args.get("consistent", None, type=str)
777
+
778
+ models_param = request.args.get("models", None, type=str)
779
+ models = None
780
+ if models_param:
781
+ models = [m.strip() for m in models_param.split(",") if m.strip()]
782
+
783
+ consistency_filter = None
784
+ if consistency == "true":
785
+ consistency_filter = True
786
+ elif consistency == "false":
787
+ consistency_filter = False
788
+
789
+ records, total = data_loader.search_battles(
790
+ subset=subset,
791
+ exp_name=exp_name,
792
+ query=query,
793
+ page=page,
794
+ page_size=page_size,
795
+ models=models,
796
+ consistency_filter=consistency_filter,
797
+ )
798
+
799
+ return jsonify({
800
+ "battles": [r.to_dict() for r in records],
801
+ "total": total,
802
+ "page": page,
803
+ "page_size": page_size,
804
+ "total_pages": (total + page_size - 1) // page_size,
805
+ "query": query,
806
+ })
807
+
808
+ @app.route("/api/subsets/<subset>/experiments/<exp_name>/search/prompts")
809
+ def api_search_prompts(subset: str, exp_name: str):
810
+ """Search prompts by text query."""
811
+ query = request.args.get("q", "", type=str)
812
+ page = request.args.get("page", 1, type=int)
813
+ page_size = request.args.get("page_size", 10, type=int)
814
+
815
+ models_param = request.args.get("models", None, type=str)
816
+ filter_models = None
817
+ if models_param:
818
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
819
+
820
+ prompts, total = data_loader.search_prompts(
821
+ subset=subset,
822
+ exp_name=exp_name,
823
+ query=query,
824
+ page=page,
825
+ page_size=page_size,
826
+ filter_models=filter_models,
827
+ )
828
+
829
+ return jsonify({
830
+ "prompts": prompts,
831
+ "total": total,
832
+ "page": page,
833
+ "page_size": page_size,
834
+ "total_pages": (total + page_size - 1) // page_size,
835
+ "query": query,
836
+ })
837
+
838
+ @app.route("/api/subsets/<subset>/matrix")
839
+ def api_win_rate_matrix(subset: str):
840
+ """Get win rate matrix for all model pairs."""
841
+ exp_name = request.args.get("exp_name", "__all__", type=str)
842
+
843
+ models_param = request.args.get("models", None, type=str)
844
+ filter_models = None
845
+ if models_param:
846
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
847
+
848
+ result = data_loader.get_win_rate_matrix(subset, exp_name, filter_models)
849
+ return jsonify(result)
850
+
851
+ @app.route("/api/subsets/<subset>/leaderboard/by-source")
852
+ def api_elo_by_source(subset: str):
853
+ """Get ELO rankings grouped by prompt source."""
854
+ exp_name = request.args.get("exp_name", "__all__", type=str)
855
+ result = data_loader.get_elo_by_source(subset, exp_name)
856
+ return jsonify(result)
857
+
858
+ @app.route("/api/subsets/<subset>/elo-history")
859
+ def api_elo_history(subset: str):
860
+ """Get ELO history over time."""
861
+ exp_name = request.args.get("exp_name", "__all__", type=str)
862
+ granularity = request.args.get("granularity", "day", type=str)
863
+
864
+ models_param = request.args.get("models", None, type=str)
865
+ filter_models = None
866
+ if models_param:
867
+ filter_models = [m.strip() for m in models_param.split(",") if m.strip()]
868
+
869
+ result = data_loader.get_elo_history(subset, exp_name, granularity, filter_models)
870
+ return jsonify(result)
871
+
872
+ @app.route("/api/overview/leaderboards")
873
+ def api_overview_leaderboards():
874
+ """Get leaderboard data for all subsets."""
875
+ result = data_loader.get_all_subsets_leaderboards()
876
+ return jsonify(result)
877
+
878
+ @app.route("/api/cross-subset/info")
879
+ def api_cross_subset_info():
880
+ """Get information about models across multiple subsets."""
881
+ subsets_param = request.args.get("subsets", "", type=str)
882
+ if not subsets_param:
883
+ return jsonify({"error": "subsets parameter is required"}), 400
884
+
885
+ subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
886
+ if len(subsets) < 1:
887
+ return jsonify({"error": "At least 1 subset required"}), 400
888
+
889
+ result = data_loader.get_cross_subset_info(subsets)
890
+ return jsonify(result)
891
+
892
+ @app.route("/api/cross-subset/elo")
893
+ def api_cross_subset_elo():
894
+ """Compute ELO rankings across multiple subsets."""
895
+ subsets_param = request.args.get("subsets", "", type=str)
896
+ if not subsets_param:
897
+ return jsonify({"error": "subsets parameter is required"}), 400
898
+
899
+ subsets = [s.strip() for s in subsets_param.split(",") if s.strip()]
900
+ if len(subsets) < 1:
901
+ return jsonify({"error": "At least 1 subset required"}), 400
902
+
903
+ exp_name = request.args.get("exp_name", "__all__", type=str)
904
+ model_scope = request.args.get("model_scope", "all", type=str)
905
+
906
+ result = data_loader.get_cross_subset_elo(subsets, exp_name, model_scope)
907
+ return jsonify(result)
908
+
909
+ # ========== Image Routes ==========
910
+
911
+ @app.route("/images/<subset>/<model>/<int:sample_index>")
912
+ def serve_model_image(subset: str, model: str, sample_index: int):
913
+ """Redirect to HF CDN for model output images."""
914
+ url = data_loader.get_model_image_url(subset, model, sample_index)
915
+ if url:
916
+ return redirect(url)
917
+ abort(404)
918
+
919
+ @app.route("/images/<subset>/input/<int:sample_index>")
920
+ @app.route("/images/<subset>/input/<int:sample_index>/<int:img_idx>")
921
+ def serve_input_image(subset: str, sample_index: int, img_idx: int = 0):
922
+ """Serve input image from parquet dataset."""
923
+ image_bytes = data_loader.get_input_image_by_idx(subset, sample_index, img_idx)
924
+
925
+ if not image_bytes:
926
+ abort(404)
927
+
928
+ return send_file(
929
+ io.BytesIO(image_bytes),
930
+ mimetype="image/png",
931
+ max_age=3600,
932
+ )
933
+
934
+ return app
genarena/visualize/data_loader.py ADDED
@@ -0,0 +1,2331 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Data loader for arena visualization with preloading support."""
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ import re
7
+ from dataclasses import dataclass, field
8
+ from typing import Any, Optional
9
+
10
+ from genarena.data import DataSample, ParquetDataset, discover_subsets
11
+ from genarena.models import GlobalModelOutputManager
12
+ from genarena.state import ArenaState, load_state
13
+
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
@dataclass
class BattleRecord:
    """A single battle record with all relevant information."""

    # Battle identification
    subset: str
    exp_name: str
    sample_index: int
    model_a: str
    model_b: str

    # Battle result
    final_winner: str  # model name or "tie"
    is_consistent: bool
    timestamp: str = ""

    # Raw VLM outputs (from audit logs, optional)
    original_call: Optional[dict[str, Any]] = None
    swapped_call: Optional[dict[str, Any]] = None

    # Sample data (loaded on demand)
    instruction: str = ""
    task_type: str = ""
    input_image_count: int = 1
    prompt_source: Optional[str] = None
    original_metadata: Optional[dict[str, Any]] = None

    @property
    def id(self) -> str:
        """Unique identifier for this battle."""
        pair = f"{self.model_a}_vs_{self.model_b}"
        return ":".join((self.subset, self.exp_name, pair, str(self.sample_index)))

    @property
    def winner_display(self) -> str:
        """Display-friendly winner string ("Tie" instead of the raw "tie")."""
        return "Tie" if self.final_winner == "tie" else self.final_winner

    @property
    def models(self) -> set[str]:
        """Set of models involved in this battle."""
        return {self.model_a, self.model_b}

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for JSON serialization (raw VLM outputs excluded)."""
        return {
            "id": self.id,
            "subset": self.subset,
            "exp_name": self.exp_name,
            "sample_index": self.sample_index,
            "model_a": self.model_a,
            "model_b": self.model_b,
            "final_winner": self.final_winner,
            "winner_display": self.winner_display,
            "is_consistent": self.is_consistent,
            "timestamp": self.timestamp,
            "instruction": self.instruction,
            "task_type": self.task_type,
            "input_image_count": self.input_image_count,
            "prompt_source": self.prompt_source,
            "original_metadata": self.original_metadata,
            "has_audit": self.original_call is not None,
        }

    def to_detail_dict(self) -> dict[str, Any]:
        """Like to_dict(), plus the raw original/swapped VLM call payloads."""
        detail = self.to_dict()
        detail["original_call"] = self.original_call
        detail["swapped_call"] = self.swapped_call
        return detail
89
+
90
+
91
@dataclass
class SubsetInfo:
    """Information about a subset.

    Aggregated once per subset by ArenaDataLoader.get_subset_info and cached.
    """

    name: str  # subset directory name
    models: list[str]  # models known to this subset's output manager
    experiments: list[str]  # experiment dirs under pk_logs/ that contain .jsonl logs
    total_battles: int  # taken from the subset's persisted arena state
    state: Optional[ArenaState] = None  # full arena state, if state.json was loadable
    min_input_images: int = 1  # smallest non-zero input-image count seen in the data
    max_input_images: int = 1  # largest input-image count seen in the data
    prompt_sources: list[str] = field(default_factory=list)  # sorted unique prompt_source values
103
+
104
+
105
+ class ArenaDataLoader:
106
+ """
107
+ Data loader for arena visualization.
108
+
109
+ Manages loading and querying battle records across multiple subsets.
110
+ Supports preloading for better performance with large datasets.
111
+ """
112
+
113
    def __init__(self, arena_dir: str, data_dir: str, preload: bool = True):
        """
        Initialize the data loader.

        Args:
            arena_dir: Path to arena directory containing subset folders
            data_dir: Path to data directory containing parquet files
            preload: If True, preload all data at initialization
        """
        self.arena_dir = arena_dir
        self.data_dir = data_dir

        # Cached data
        self._subsets: Optional[list[str]] = None  # memoized discover_subsets() result
        self._subset_info_cache: dict[str, SubsetInfo] = {}
        self._dataset_cache: dict[str, ParquetDataset] = {}
        self._model_manager_cache: dict[str, GlobalModelOutputManager] = {}

        # Battle records cache: (subset, exp_name) -> List[BattleRecord]
        self._battle_cache: dict[tuple[str, str], list[BattleRecord]] = {}

        # Index for faster lookups: (subset, exp_name) -> {model -> [record_indices]}
        self._model_index: dict[tuple[str, str], dict[str, list[int]]] = {}

        # Sample data cache: (subset, sample_index) -> SampleMetadata dict
        self._sample_cache: dict[tuple[str, int], dict[str, Any]] = {}

        # Sample to parquet file mapping: (subset, sample_index) -> parquet_file_path
        self._sample_file_map: dict[tuple[str, int], str] = {}

        # Input image count range per subset: subset -> (min_count, max_count)
        self._image_count_range: dict[str, tuple[int, int]] = {}

        # Prompt sources per subset: subset -> list of unique prompt_source values
        self._prompt_sources: dict[str, list[str]] = {}

        # Audit logs cache: (subset, exp_name, model_a, model_b, sample_index) -> audit data
        self._audit_cache: dict[tuple[str, str, str, str, int], dict[str, Any]] = {}

        # Cross-subset ELO cache: (sorted_subsets_tuple, exp_name, model_scope) -> result dict
        self._cross_subset_elo_cache: dict[tuple[tuple[str, ...], str, str], dict[str, Any]] = {}

        if preload:
            # Eagerly read parquet metadata and every battle log up front; can
            # be slow for large arenas but makes subsequent queries fast.
            self._preload_all()
157
+
158
+ def _preload_all(self) -> None:
159
+ """Preload all data at initialization for better performance."""
160
+ logger.info("Preloading arena data...")
161
+
162
+ subsets = self.discover_subsets()
163
+ logger.info(f"Found {len(subsets)} subsets: {subsets}")
164
+
165
+ for subset in subsets:
166
+ logger.info(f"Loading subset: {subset}")
167
+
168
+ # Preload parquet dataset
169
+ self._preload_dataset(subset)
170
+
171
+ # Load subset info (models, experiments)
172
+ info = self.get_subset_info(subset)
173
+ if info:
174
+ logger.info(f" - {len(info.models)} models, {len(info.experiments)} experiments")
175
+
176
+ # Preload battle logs for each experiment
177
+ for exp_name in info.experiments:
178
+ records = self._load_battle_logs(subset, exp_name)
179
+ logger.info(f" - Experiment '{exp_name}': {len(records)} battles")
180
+
181
+ logger.info("Preloading complete!")
182
+
183
+ def _preload_dataset(self, subset: str) -> None:
184
+ """
185
+ Preload sample text data (instruction, task_type) using pyarrow directly.
186
+
187
+ This is much faster than using HuggingFace datasets because we skip
188
+ decoding image columns. Images are loaded on-demand when requested.
189
+ """
190
+ import pyarrow.parquet as pq
191
+
192
+ subset_path = os.path.join(self.data_dir, subset)
193
+ if not os.path.isdir(subset_path):
194
+ return
195
+
196
+ # Find parquet files
197
+ parquet_files = sorted([
198
+ os.path.join(subset_path, f)
199
+ for f in os.listdir(subset_path)
200
+ if f.startswith("data-") and f.endswith(".parquet")
201
+ ])
202
+
203
+ if not parquet_files:
204
+ return
205
+
206
+ logger.info(f" - Loading metadata from parquet (fast mode)...")
207
+
208
+ # Read all metadata columns + input_images (only to count, not decode)
209
+ columns_to_read = ["index", "instruction", "task_type", "input_images", "prompt_source", "original_metadata"]
210
+
211
+ total_rows = 0
212
+ min_img_count = float('inf')
213
+ max_img_count = 0
214
+ prompt_sources_set: set[str] = set()
215
+
216
+ for pf in parquet_files:
217
+ try:
218
+ # Get available columns in this file
219
+ import pyarrow.parquet as pq_schema
220
+ schema = pq.read_schema(pf)
221
+ available_columns = [c for c in columns_to_read if c in schema.names]
222
+
223
+ # Read the columns we need
224
+ table = pq.read_table(pf, columns=available_columns)
225
+
226
+ # Extract columns with defaults
227
+ def get_column(name, default=None):
228
+ if name in table.column_names:
229
+ return table.column(name).to_pylist()
230
+ return [default] * table.num_rows
231
+
232
+ indices = get_column("index", 0)
233
+ instructions = get_column("instruction", "")
234
+ task_types = get_column("task_type", "")
235
+ prompt_sources = get_column("prompt_source", None)
236
+ original_metadatas = get_column("original_metadata", None)
237
+
238
+ # Handle input_images separately for counting
239
+ has_input_images = "input_images" in table.column_names
240
+ input_images_col = table.column("input_images") if has_input_images else None
241
+
242
+ for i, idx in enumerate(indices):
243
+ idx = int(idx) if idx is not None else i
244
+
245
+ # Count input images without decoding
246
+ img_count = 0
247
+ if input_images_col is not None:
248
+ img_list = input_images_col[i].as_py()
249
+ img_count = len(img_list) if img_list else 0
250
+
251
+ min_img_count = min(min_img_count, img_count) if img_count > 0 else min_img_count
252
+ max_img_count = max(max_img_count, img_count)
253
+
254
+ # Track prompt sources
255
+ ps = prompt_sources[i] if prompt_sources[i] else None
256
+ if ps:
257
+ prompt_sources_set.add(str(ps))
258
+
259
+ # Build metadata dict
260
+ metadata = {
261
+ "instruction": str(instructions[i]) if instructions[i] else "",
262
+ "task_type": str(task_types[i]) if task_types[i] else "",
263
+ "input_image_count": img_count,
264
+ "prompt_source": ps,
265
+ "original_metadata": original_metadatas[i] if original_metadatas[i] else None,
266
+ }
267
+
268
+ self._sample_cache[(subset, idx)] = metadata
269
+ self._sample_file_map[(subset, idx)] = pf
270
+ total_rows += 1
271
+
272
+ except Exception as e:
273
+ logger.warning(f"Failed to read {pf}: {e}")
274
+ continue
275
+
276
+ # Store image count range for this subset
277
+ if total_rows > 0:
278
+ self._image_count_range[subset] = (
279
+ min_img_count if min_img_count != float('inf') else 1,
280
+ max_img_count if max_img_count > 0 else 1
281
+ )
282
+
283
+ # Store prompt sources for this subset
284
+ self._prompt_sources[subset] = sorted(prompt_sources_set)
285
+
286
+ logger.info(f" - Cached {total_rows} samples (input images: {self._image_count_range.get(subset, (1,1))}, sources: {len(prompt_sources_set)})")
287
+
288
+ def discover_subsets(self) -> list[str]:
289
+ """
290
+ Discover all available subsets.
291
+
292
+ A valid subset must exist in both arena_dir (with pk_logs) and data_dir.
293
+
294
+ Returns:
295
+ List of subset names
296
+ """
297
+ if self._subsets is not None:
298
+ return self._subsets
299
+
300
+ # Get subsets from data_dir (have parquet files)
301
+ data_subsets = set(discover_subsets(self.data_dir))
302
+
303
+ # Get subsets from arena_dir (have pk_logs)
304
+ arena_subsets = set()
305
+ if os.path.isdir(self.arena_dir):
306
+ for name in os.listdir(self.arena_dir):
307
+ subset_path = os.path.join(self.arena_dir, name)
308
+ pk_logs_path = os.path.join(subset_path, "pk_logs")
309
+ if os.path.isdir(pk_logs_path):
310
+ # Check if there are any experiment directories with battle logs
311
+ for exp_name in os.listdir(pk_logs_path):
312
+ exp_path = os.path.join(pk_logs_path, exp_name)
313
+ if os.path.isdir(exp_path):
314
+ # Check for .jsonl files
315
+ has_logs = any(
316
+ f.endswith(".jsonl")
317
+ for f in os.listdir(exp_path)
318
+ if os.path.isfile(os.path.join(exp_path, f))
319
+ )
320
+ if has_logs:
321
+ arena_subsets.add(name)
322
+ break
323
+
324
+ # Intersection: must have both data and battle logs
325
+ valid_subsets = sorted(data_subsets & arena_subsets)
326
+ self._subsets = valid_subsets
327
+ return valid_subsets
328
+
329
+ def get_subset_info(self, subset: str) -> Optional[SubsetInfo]:
330
+ """
331
+ Get information about a subset.
332
+
333
+ Args:
334
+ subset: Subset name
335
+
336
+ Returns:
337
+ SubsetInfo or None if subset doesn't exist
338
+ """
339
+ if subset in self._subset_info_cache:
340
+ return self._subset_info_cache[subset]
341
+
342
+ subset_path = os.path.join(self.arena_dir, subset)
343
+ if not os.path.isdir(subset_path):
344
+ return None
345
+
346
+ # Get models
347
+ model_manager = self._get_model_manager(subset)
348
+ models = model_manager.models if model_manager else []
349
+
350
+ # Get experiments
351
+ pk_logs_dir = os.path.join(subset_path, "pk_logs")
352
+ experiments = []
353
+ if os.path.isdir(pk_logs_dir):
354
+ for name in os.listdir(pk_logs_dir):
355
+ exp_path = os.path.join(pk_logs_dir, name)
356
+ if os.path.isdir(exp_path):
357
+ # Check for battle logs
358
+ has_logs = any(
359
+ f.endswith(".jsonl")
360
+ for f in os.listdir(exp_path)
361
+ if os.path.isfile(os.path.join(exp_path, f))
362
+ )
363
+ if has_logs:
364
+ experiments.append(name)
365
+ experiments.sort()
366
+
367
+ # Load state
368
+ state_path = os.path.join(subset_path, "arena", "state.json")
369
+ state = load_state(state_path)
370
+
371
+ # Get image count range
372
+ img_range = self._image_count_range.get(subset, (1, 1))
373
+
374
+ # Get prompt sources
375
+ prompt_sources = self._prompt_sources.get(subset, [])
376
+
377
+ info = SubsetInfo(
378
+ name=subset,
379
+ models=models,
380
+ experiments=experiments,
381
+ total_battles=state.total_battles,
382
+ state=state,
383
+ min_input_images=img_range[0],
384
+ max_input_images=img_range[1],
385
+ prompt_sources=prompt_sources,
386
+ )
387
+
388
+ self._subset_info_cache[subset] = info
389
+ return info
390
+
391
+ def _get_dataset(self, subset: str) -> Optional[ParquetDataset]:
392
+ """Get or create ParquetDataset for a subset."""
393
+ if subset not in self._dataset_cache:
394
+ try:
395
+ self._dataset_cache[subset] = ParquetDataset(self.data_dir, subset)
396
+ except Exception:
397
+ return None
398
+ return self._dataset_cache[subset]
399
+
400
+ def _get_model_manager(self, subset: str) -> Optional[GlobalModelOutputManager]:
401
+ """Get or create GlobalModelOutputManager for a subset."""
402
+ if subset not in self._model_manager_cache:
403
+ models_dir = os.path.join(self.arena_dir, subset, "models")
404
+ if os.path.isdir(models_dir):
405
+ self._model_manager_cache[subset] = GlobalModelOutputManager(models_dir)
406
+ else:
407
+ return None
408
+ return self._model_manager_cache[subset]
409
+
410
+ def _get_sample_data(self, subset: str, sample_index: int) -> dict[str, Any]:
411
+ """Get cached sample metadata."""
412
+ cache_key = (subset, sample_index)
413
+ if cache_key in self._sample_cache:
414
+ return self._sample_cache[cache_key]
415
+
416
+ # Fallback - return defaults
417
+ return {
418
+ "instruction": "",
419
+ "task_type": "",
420
+ "input_image_count": 1,
421
+ "prompt_source": None,
422
+ "original_metadata": None,
423
+ }
424
+
425
+ def _load_battle_logs(self, subset: str, exp_name: str) -> list[BattleRecord]:
426
+ """
427
+ Load battle records from log files.
428
+
429
+ Args:
430
+ subset: Subset name
431
+ exp_name: Experiment name
432
+
433
+ Returns:
434
+ List of BattleRecord objects
435
+ """
436
+ cache_key = (subset, exp_name)
437
+ if cache_key in self._battle_cache:
438
+ return self._battle_cache[cache_key]
439
+
440
+ records: list[BattleRecord] = []
441
+ exp_dir = os.path.join(self.arena_dir, subset, "pk_logs", exp_name)
442
+
443
+ if not os.path.isdir(exp_dir):
444
+ return records
445
+
446
+ # Load slim battle logs
447
+ for filename in os.listdir(exp_dir):
448
+ if not filename.endswith(".jsonl"):
449
+ continue
450
+
451
+ filepath = os.path.join(exp_dir, filename)
452
+ if not os.path.isfile(filepath):
453
+ continue
454
+
455
+ try:
456
+ with open(filepath, "r", encoding="utf-8") as f:
457
+ for line in f:
458
+ line = line.strip()
459
+ if not line:
460
+ continue
461
+ try:
462
+ data = json.loads(line)
463
+ sample_index = data.get("sample_index", -1)
464
+
465
+ # Get cached sample data
466
+ sample_meta = self._get_sample_data(subset, sample_index)
467
+
468
+ record = BattleRecord(
469
+ subset=subset,
470
+ exp_name=exp_name,
471
+ sample_index=sample_index,
472
+ model_a=data.get("model_a", ""),
473
+ model_b=data.get("model_b", ""),
474
+ final_winner=data.get("final_winner", "tie"),
475
+ is_consistent=data.get("is_consistent", False),
476
+ timestamp=data.get("timestamp", ""),
477
+ instruction=sample_meta.get("instruction", ""),
478
+ task_type=sample_meta.get("task_type", ""),
479
+ input_image_count=sample_meta.get("input_image_count", 1),
480
+ prompt_source=sample_meta.get("prompt_source"),
481
+ original_metadata=sample_meta.get("original_metadata"),
482
+ )
483
+ if record.model_a and record.model_b:
484
+ records.append(record)
485
+ except json.JSONDecodeError:
486
+ continue
487
+ except Exception:
488
+ continue
489
+
490
+ # Sort by sample_index
491
+ records.sort(key=lambda r: r.sample_index)
492
+
493
+ # Cache records
494
+ self._battle_cache[cache_key] = records
495
+
496
+ # Build model index for fast filtering
497
+ self._build_model_index(cache_key, records)
498
+
499
+ return records
500
+
501
+ def _build_model_index(
502
+ self, cache_key: tuple[str, str], records: list[BattleRecord]
503
+ ) -> None:
504
+ """Build index for fast model-based filtering."""
505
+ model_index: dict[str, list[int]] = {}
506
+
507
+ for i, record in enumerate(records):
508
+ for model in [record.model_a, record.model_b]:
509
+ if model not in model_index:
510
+ model_index[model] = []
511
+ model_index[model].append(i)
512
+
513
+ self._model_index[cache_key] = model_index
514
+
515
+ def _load_all_experiments_battles(self, subset: str) -> list[BattleRecord]:
516
+ """
517
+ Load battle records from all experiments for a subset.
518
+
519
+ Args:
520
+ subset: Subset name
521
+
522
+ Returns:
523
+ Combined list of BattleRecord objects from all experiments
524
+ """
525
+ info = self.get_subset_info(subset)
526
+ if not info:
527
+ return []
528
+
529
+ all_records: list[BattleRecord] = []
530
+ for exp_name in info.experiments:
531
+ records = self._load_battle_logs(subset, exp_name)
532
+ all_records.extend(records)
533
+
534
+ # Sort by sample_index for consistent ordering
535
+ all_records.sort(key=lambda r: (r.sample_index, r.exp_name, r.model_a, r.model_b))
536
+ return all_records
537
+
538
+ def _load_audit_log(
539
+ self, subset: str, exp_name: str, model_a: str, model_b: str, sample_index: int
540
+ ) -> Optional[dict[str, Any]]:
541
+ """
542
+ Load audit log for a specific battle.
543
+
544
+ Args:
545
+ subset: Subset name
546
+ exp_name: Experiment name
547
+ model_a: First model name
548
+ model_b: Second model name
549
+ sample_index: Sample index
550
+
551
+ Returns:
552
+ Audit data dict or None
553
+ """
554
+ cache_key = (subset, exp_name, model_a, model_b, sample_index)
555
+ if cache_key in self._audit_cache:
556
+ return self._audit_cache[cache_key]
557
+
558
+ # Determine filename (models are sorted alphabetically)
559
+ from genarena.utils import sanitize_name
560
+
561
+ first, second = sorted([model_a, model_b])
562
+ filename = f"{sanitize_name(first)}_vs_{sanitize_name(second)}.jsonl"
563
+ filepath = os.path.join(
564
+ self.arena_dir, subset, "pk_logs", exp_name, "raw_outputs", filename
565
+ )
566
+
567
+ if not os.path.isfile(filepath):
568
+ return None
569
+
570
+ try:
571
+ with open(filepath, "r", encoding="utf-8") as f:
572
+ for line in f:
573
+ line = line.strip()
574
+ if not line:
575
+ continue
576
+ try:
577
+ data = json.loads(line)
578
+ if data.get("sample_index") == sample_index:
579
+ self._audit_cache[cache_key] = data
580
+ return data
581
+ except json.JSONDecodeError:
582
+ continue
583
+ except Exception:
584
+ pass
585
+
586
+ return None
587
+
588
+ def get_battles(
589
+ self,
590
+ subset: str,
591
+ exp_name: str,
592
+ page: int = 1,
593
+ page_size: int = 20,
594
+ models: Optional[list[str]] = None,
595
+ result_filter: Optional[str] = None, # "wins", "losses", "ties"
596
+ consistency_filter: Optional[bool] = None,
597
+ min_images: Optional[int] = None,
598
+ max_images: Optional[int] = None,
599
+ prompt_source: Optional[str] = None,
600
+ ) -> tuple[list[BattleRecord], int]:
601
+ """
602
+ Get paginated battle records with filtering.
603
+
604
+ Args:
605
+ subset: Subset name
606
+ exp_name: Experiment name (use "__all__" for all experiments)
607
+ page: Page number (1-indexed)
608
+ page_size: Number of records per page
609
+ models: Filter by models (show battles involving ANY of these models)
610
+ result_filter: Filter by result relative to models ("wins", "losses", "ties")
611
+ consistency_filter: Filter by consistency (True/False/None for all)
612
+
613
+ Returns:
614
+ Tuple of (records, total_count)
615
+ """
616
+ # Handle "__all__" experiment - combine all experiments
617
+ if exp_name == "__all__":
618
+ all_records = self._load_all_experiments_battles(subset)
619
+ # For __all__, we don't use the model index optimization
620
+ cache_key = None
621
+ else:
622
+ all_records = self._load_battle_logs(subset, exp_name)
623
+ cache_key = (subset, exp_name)
624
+
625
+ # Apply filters using index for better performance
626
+ if models and cache_key and cache_key in self._model_index:
627
+ model_set = set(models)
628
+ model_index = self._model_index[cache_key]
629
+
630
+ if len(models) == 1:
631
+ # Single model: show battles involving this model
632
+ candidate_indices = set(model_index.get(models[0], []))
633
+ filtered = [all_records[i] for i in sorted(candidate_indices)]
634
+ else:
635
+ # 2+ models: show only battles BETWEEN these models (both participants must be in selected models)
636
+ # Find union of all records involving any selected model first
637
+ candidate_indices: set[int] = set()
638
+ for model in models:
639
+ if model in model_index:
640
+ candidate_indices.update(model_index[model])
641
+ # Then filter to keep only battles where BOTH models are in the selected set
642
+ filtered = [
643
+ all_records[i] for i in sorted(candidate_indices)
644
+ if all_records[i].model_a in model_set and all_records[i].model_b in model_set
645
+ ]
646
+
647
+ # Apply result filter
648
+ if result_filter:
649
+ if len(models) == 1:
650
+ # Single model: filter by that model's wins/losses/ties
651
+ model = models[0]
652
+ if result_filter == "wins":
653
+ filtered = [r for r in filtered if r.final_winner == model]
654
+ elif result_filter == "losses":
655
+ filtered = [
656
+ r
657
+ for r in filtered
658
+ if r.final_winner != "tie" and r.final_winner != model
659
+ ]
660
+ elif result_filter == "ties":
661
+ filtered = [r for r in filtered if r.final_winner == "tie"]
662
+ elif len(models) == 2:
663
+ # Two models: filter by winner (result_filter is the winning model name or "tie")
664
+ if result_filter == "ties":
665
+ filtered = [r for r in filtered if r.final_winner == "tie"]
666
+ elif result_filter in models:
667
+ # Filter by specific model winning
668
+ filtered = [r for r in filtered if r.final_winner == result_filter]
669
+ elif models:
670
+ # Fallback for __all__ mode or when index is not available
671
+ model_set = set(models)
672
+ if len(models) == 1:
673
+ model = models[0]
674
+ filtered = [r for r in all_records if model in r.models]
675
+ # Apply result filter
676
+ if result_filter:
677
+ if result_filter == "wins":
678
+ filtered = [r for r in filtered if r.final_winner == model]
679
+ elif result_filter == "losses":
680
+ filtered = [
681
+ r
682
+ for r in filtered
683
+ if r.final_winner != "tie" and r.final_winner != model
684
+ ]
685
+ elif result_filter == "ties":
686
+ filtered = [r for r in filtered if r.final_winner == "tie"]
687
+ else:
688
+ # 2+ models: show battles between these models
689
+ filtered = [
690
+ r for r in all_records
691
+ if r.model_a in model_set and r.model_b in model_set
692
+ ]
693
+ # Apply result filter
694
+ if result_filter:
695
+ if result_filter == "ties":
696
+ filtered = [r for r in filtered if r.final_winner == "tie"]
697
+ elif result_filter in models:
698
+ filtered = [r for r in filtered if r.final_winner == result_filter]
699
+ else:
700
+ filtered = all_records
701
+
702
+ # Apply consistency filter
703
+ if consistency_filter is not None:
704
+ filtered = [r for r in filtered if r.is_consistent == consistency_filter]
705
+
706
+ # Apply input image count filter
707
+ if min_images is not None or max_images is not None:
708
+ min_img = min_images if min_images is not None else 0
709
+ max_img = max_images if max_images is not None else float('inf')
710
+ filtered = [r for r in filtered if min_img <= r.input_image_count <= max_img]
711
+
712
+ # Apply prompt_source filter
713
+ if prompt_source:
714
+ filtered = [r for r in filtered if r.prompt_source == prompt_source]
715
+
716
+ total_count = len(filtered)
717
+
718
+ # Paginate
719
+ start = (page - 1) * page_size
720
+ end = start + page_size
721
+ page_records = filtered[start:end]
722
+
723
+ return page_records, total_count
724
+
725
+ def search_battles(
726
+ self,
727
+ subset: str,
728
+ exp_name: str,
729
+ query: str,
730
+ page: int = 1,
731
+ page_size: int = 20,
732
+ models: Optional[list[str]] = None,
733
+ consistency_filter: Optional[bool] = None,
734
+ search_fields: Optional[list[str]] = None,
735
+ ) -> tuple[list[BattleRecord], int]:
736
+ """
737
+ Search battle records by text query (full-text search).
738
+
739
+ Searches across instruction, task_type, prompt_source, and original_metadata.
740
+
741
+ Args:
742
+ subset: Subset name
743
+ exp_name: Experiment name (use "__all__" for all experiments)
744
+ query: Search query string (case-insensitive)
745
+ page: Page number (1-indexed)
746
+ page_size: Number of records per page
747
+ models: Optional filter by models
748
+ consistency_filter: Optional filter by consistency
749
+ search_fields: Fields to search in (default: all searchable fields)
750
+
751
+ Returns:
752
+ Tuple of (matching_records, total_count)
753
+ """
754
+ if not query or not query.strip():
755
+ # Empty query - return regular filtered results
756
+ return self.get_battles(
757
+ subset, exp_name, page, page_size,
758
+ models=models, consistency_filter=consistency_filter
759
+ )
760
+
761
+ # Normalize query for case-insensitive search
762
+ query_lower = query.lower().strip()
763
+ # Create regex pattern for more flexible matching
764
+ query_pattern = re.compile(re.escape(query_lower), re.IGNORECASE)
765
+
766
+ # Determine which fields to search
767
+ all_searchable_fields = ["instruction", "task_type", "prompt_source", "original_metadata"]
768
+ fields_to_search = search_fields if search_fields else all_searchable_fields
769
+
770
+ # Load all records
771
+ if exp_name == "__all__":
772
+ all_records = self._load_all_experiments_battles(subset)
773
+ else:
774
+ all_records = self._load_battle_logs(subset, exp_name)
775
+
776
+ # Apply model filter first (for efficiency)
777
+ if models:
778
+ model_set = set(models)
779
+ if len(models) == 1:
780
+ all_records = [r for r in all_records if models[0] in r.models]
781
+ else:
782
+ all_records = [
783
+ r for r in all_records
784
+ if r.model_a in model_set and r.model_b in model_set
785
+ ]
786
+
787
+ # Apply consistency filter
788
+ if consistency_filter is not None:
789
+ all_records = [r for r in all_records if r.is_consistent == consistency_filter]
790
+
791
+ # Search filter
792
+ def matches_query(record: BattleRecord) -> bool:
793
+ """Check if record matches the search query."""
794
+ for field_name in fields_to_search:
795
+ value = getattr(record, field_name, None)
796
+ if value is None:
797
+ continue
798
+
799
+ # Handle different field types
800
+ if field_name == "original_metadata" and isinstance(value, dict):
801
+ # Search in JSON string representation of metadata
802
+ metadata_str = json.dumps(value, ensure_ascii=False).lower()
803
+ if query_pattern.search(metadata_str):
804
+ return True
805
+ elif isinstance(value, str):
806
+ if query_pattern.search(value):
807
+ return True
808
+
809
+ return False
810
+
811
+ # Apply search filter
812
+ filtered = [r for r in all_records if matches_query(r)]
813
+
814
+ total_count = len(filtered)
815
+
816
+ # Paginate
817
+ start = (page - 1) * page_size
818
+ end = start + page_size
819
+ page_records = filtered[start:end]
820
+
821
+ return page_records, total_count
822
+
823
+ def search_prompts(
824
+ self,
825
+ subset: str,
826
+ exp_name: str,
827
+ query: str,
828
+ page: int = 1,
829
+ page_size: int = 10,
830
+ filter_models: Optional[list[str]] = None,
831
+ ) -> tuple[list[dict[str, Any]], int]:
832
+ """
833
+ Search prompts/samples by text query.
834
+
835
+ Args:
836
+ subset: Subset name
837
+ exp_name: Experiment name (use "__all__" for all experiments)
838
+ query: Search query string
839
+ page: Page number
840
+ page_size: Records per page
841
+ filter_models: Optional filter by models
842
+
843
+ Returns:
844
+ Tuple of (matching_prompts, total_count)
845
+ """
846
+ if not query or not query.strip():
847
+ # Empty query - return regular results
848
+ return self.get_prompts(subset, exp_name, page, page_size, filter_models=filter_models)
849
+
850
+ # Normalize query
851
+ query_lower = query.lower().strip()
852
+ query_pattern = re.compile(re.escape(query_lower), re.IGNORECASE)
853
+
854
+ # Load records and group by sample
855
+ if exp_name == "__all__":
856
+ all_records = self._load_all_experiments_battles(subset)
857
+ else:
858
+ all_records = self._load_battle_logs(subset, exp_name)
859
+
860
+ # Group by sample_index
861
+ sample_records: dict[int, list[BattleRecord]] = {}
862
+ for record in all_records:
863
+ if record.sample_index not in sample_records:
864
+ sample_records[record.sample_index] = []
865
+ sample_records[record.sample_index].append(record)
866
+
867
+ # Filter samples by query
868
+ matching_samples = []
869
+ for sample_index, records in sample_records.items():
870
+ if not records:
871
+ continue
872
+
873
+ first_record = records[0]
874
+
875
+ # Search in instruction, task_type, prompt_source, original_metadata
876
+ match_found = False
877
+
878
+ if first_record.instruction and query_pattern.search(first_record.instruction):
879
+ match_found = True
880
+ elif first_record.task_type and query_pattern.search(first_record.task_type):
881
+ match_found = True
882
+ elif first_record.prompt_source and query_pattern.search(first_record.prompt_source):
883
+ match_found = True
884
+ elif first_record.original_metadata:
885
+ metadata_str = json.dumps(first_record.original_metadata, ensure_ascii=False).lower()
886
+ if query_pattern.search(metadata_str):
887
+ match_found = True
888
+
889
+ if match_found:
890
+ matching_samples.append(sample_index)
891
+
892
+ # Sort and paginate
893
+ matching_samples.sort()
894
+ total_count = len(matching_samples)
895
+
896
+ start = (page - 1) * page_size
897
+ end = start + page_size
898
+ page_samples = matching_samples[start:end]
899
+
900
+ # Build result for each sample using get_sample_all_models
901
+ results = []
902
+ for sample_index in page_samples:
903
+ prompt_data = self.get_sample_all_models(subset, exp_name, sample_index, filter_models)
904
+ results.append(prompt_data)
905
+
906
+ return results, total_count
907
+
908
+ def get_battle_detail(
909
+ self, subset: str, exp_name: str, model_a: str, model_b: str, sample_index: int
910
+ ) -> Optional[BattleRecord]:
911
+ """
912
+ Get detailed battle record including VLM outputs.
913
+
914
+ Args:
915
+ subset: Subset name
916
+ exp_name: Experiment name (use "__all__" for all experiments)
917
+ model_a: First model name
918
+ model_b: Second model name
919
+ sample_index: Sample index
920
+
921
+ Returns:
922
+ BattleRecord with audit data, or None
923
+ """
924
+ # Find the battle record
925
+ if exp_name == "__all__":
926
+ all_records = self._load_all_experiments_battles(subset)
927
+ else:
928
+ all_records = self._load_battle_logs(subset, exp_name)
929
+
930
+ record = None
931
+ for r in all_records:
932
+ if (
933
+ r.sample_index == sample_index
934
+ and set([r.model_a, r.model_b]) == set([model_a, model_b])
935
+ ):
936
+ record = r
937
+ break
938
+
939
+ if not record:
940
+ return None
941
+
942
+ # Load audit data (use the record's actual exp_name for audit log lookup)
943
+ actual_exp_name = record.exp_name
944
+ audit = self._load_audit_log(
945
+ subset, actual_exp_name, record.model_a, record.model_b, sample_index
946
+ )
947
+ if audit:
948
+ record.original_call = audit.get("original_call")
949
+ record.swapped_call = audit.get("swapped_call")
950
+
951
+ return record
952
+
953
+ def get_image_path(
954
+ self, subset: str, model: str, sample_index: int
955
+ ) -> Optional[str]:
956
+ """
957
+ Get path to model output image.
958
+
959
+ Args:
960
+ subset: Subset name
961
+ model: Model name
962
+ sample_index: Sample index
963
+
964
+ Returns:
965
+ Image file path or None
966
+ """
967
+ model_manager = self._get_model_manager(subset)
968
+ if model_manager:
969
+ return model_manager.get_output_path(model, sample_index)
970
+ return None
971
+
972
+ def get_input_image(self, subset: str, sample_index: int) -> Optional[bytes]:
973
+ """
974
+ Get input image bytes for a sample.
975
+
976
+ Uses pyarrow to read directly from parquet for better performance.
977
+ Uses cached file mapping for fast lookup.
978
+
979
+ Args:
980
+ subset: Subset name
981
+ sample_index: Sample index
982
+
983
+ Returns:
984
+ Image bytes or None
985
+ """
986
+ import pyarrow.parquet as pq
987
+
988
+ # Use cached file mapping if available (fast path)
989
+ cache_key = (subset, sample_index)
990
+ if cache_key in self._sample_file_map:
991
+ pf = self._sample_file_map[cache_key]
992
+ result = self._read_image_from_parquet(pf, sample_index)
993
+ if result is not None:
994
+ return result
995
+
996
+ # Fallback: search all parquet files (slow path)
997
+ subset_path = os.path.join(self.data_dir, subset)
998
+ if not os.path.isdir(subset_path):
999
+ return None
1000
+
1001
+ parquet_files = sorted([
1002
+ os.path.join(subset_path, f)
1003
+ for f in os.listdir(subset_path)
1004
+ if f.startswith("data-") and f.endswith(".parquet")
1005
+ ])
1006
+
1007
+ for pf in parquet_files:
1008
+ result = self._read_image_from_parquet(pf, sample_index)
1009
+ if result is not None:
1010
+ return result
1011
+
1012
+ return None
1013
+
1014
+ def _read_image_from_parquet(self, parquet_file: str, sample_index: int) -> Optional[bytes]:
1015
+ """Read a single image from a parquet file."""
1016
+ import pyarrow.parquet as pq
1017
+
1018
+ try:
1019
+ table = pq.read_table(parquet_file, columns=["index", "input_images"])
1020
+ indices = table.column("index").to_pylist()
1021
+
1022
+ if sample_index not in indices:
1023
+ return None
1024
+
1025
+ row_idx = indices.index(sample_index)
1026
+ input_images = table.column("input_images")[row_idx].as_py()
1027
+
1028
+ if not input_images or len(input_images) == 0:
1029
+ return None
1030
+
1031
+ img_data = input_images[0]
1032
+
1033
+ # Handle different formats
1034
+ if isinstance(img_data, bytes):
1035
+ return img_data
1036
+ elif isinstance(img_data, dict):
1037
+ # HuggingFace Image format: {"bytes": ..., "path": ...}
1038
+ if "bytes" in img_data and img_data["bytes"]:
1039
+ return img_data["bytes"]
1040
+ elif "path" in img_data and img_data["path"]:
1041
+ path = img_data["path"]
1042
+ if os.path.isfile(path):
1043
+ with open(path, "rb") as f:
1044
+ return f.read()
1045
+
1046
+ except Exception as e:
1047
+ logger.debug(f"Error reading image from {parquet_file}: {e}")
1048
+
1049
+ return None
1050
+
1051
+ def get_input_image_count(self, subset: str, sample_index: int) -> int:
1052
+ """Get the number of input images for a sample."""
1053
+ import pyarrow.parquet as pq
1054
+
1055
+ cache_key = (subset, sample_index)
1056
+ if cache_key in self._sample_file_map:
1057
+ pf = self._sample_file_map[cache_key]
1058
+ try:
1059
+ table = pq.read_table(pf, columns=["index", "input_images"])
1060
+ indices = table.column("index").to_pylist()
1061
+ if sample_index in indices:
1062
+ row_idx = indices.index(sample_index)
1063
+ input_images = table.column("input_images")[row_idx].as_py()
1064
+ return len(input_images) if input_images else 0
1065
+ except Exception:
1066
+ pass
1067
+ return 1 # Default to 1
1068
+
1069
+ def get_input_image_by_idx(self, subset: str, sample_index: int, img_idx: int = 0) -> Optional[bytes]:
1070
+ """Get a specific input image by index."""
1071
+ import pyarrow.parquet as pq
1072
+
1073
+ cache_key = (subset, sample_index)
1074
+ if cache_key not in self._sample_file_map:
1075
+ return None
1076
+
1077
+ pf = self._sample_file_map[cache_key]
1078
+ try:
1079
+ table = pq.read_table(pf, columns=["index", "input_images"])
1080
+ indices = table.column("index").to_pylist()
1081
+
1082
+ if sample_index not in indices:
1083
+ return None
1084
+
1085
+ row_idx = indices.index(sample_index)
1086
+ input_images = table.column("input_images")[row_idx].as_py()
1087
+
1088
+ if not input_images or img_idx >= len(input_images):
1089
+ return None
1090
+
1091
+ img_data = input_images[img_idx]
1092
+
1093
+ if isinstance(img_data, bytes):
1094
+ return img_data
1095
+ elif isinstance(img_data, dict):
1096
+ if "bytes" in img_data and img_data["bytes"]:
1097
+ return img_data["bytes"]
1098
+ elif "path" in img_data and img_data["path"]:
1099
+ path = img_data["path"]
1100
+ if os.path.isfile(path):
1101
+ with open(path, "rb") as f:
1102
+ return f.read()
1103
+ except Exception as e:
1104
+ logger.debug(f"Error reading image: {e}")
1105
+
1106
+ return None
1107
+
1108
+ def get_head_to_head(
1109
+ self, subset: str, exp_name: str, model_a: str, model_b: str
1110
+ ) -> dict[str, Any]:
1111
+ """
1112
+ Get head-to-head statistics between two models.
1113
+
1114
+ Returns:
1115
+ Dict with wins_a, wins_b, ties, total, win_rate_a, win_rate_b
1116
+ """
1117
+ if exp_name == "__all__":
1118
+ all_records = self._load_all_experiments_battles(subset)
1119
+ # For __all__, we need to filter manually
1120
+ h2h_records = [
1121
+ r for r in all_records
1122
+ if set([r.model_a, r.model_b]) == set([model_a, model_b])
1123
+ ]
1124
+ else:
1125
+ all_records = self._load_battle_logs(subset, exp_name)
1126
+ cache_key = (subset, exp_name)
1127
+ model_index = self._model_index.get(cache_key, {})
1128
+
1129
+ # Find battles between these two models
1130
+ indices_a = set(model_index.get(model_a, []))
1131
+ indices_b = set(model_index.get(model_b, []))
1132
+ h2h_indices = indices_a & indices_b
1133
+ h2h_records = [all_records[idx] for idx in h2h_indices]
1134
+
1135
+ wins_a = 0
1136
+ wins_b = 0
1137
+ ties = 0
1138
+
1139
+ for record in h2h_records:
1140
+ if record.final_winner == model_a:
1141
+ wins_a += 1
1142
+ elif record.final_winner == model_b:
1143
+ wins_b += 1
1144
+ else:
1145
+ ties += 1
1146
+
1147
+ total = wins_a + wins_b + ties
1148
+
1149
+ return {
1150
+ "model_a": model_a,
1151
+ "model_b": model_b,
1152
+ "wins_a": wins_a,
1153
+ "wins_b": wins_b,
1154
+ "ties": ties,
1155
+ "total": total,
1156
+ "win_rate_a": wins_a / total if total > 0 else 0,
1157
+ "win_rate_b": wins_b / total if total > 0 else 0,
1158
+ "tie_rate": ties / total if total > 0 else 0,
1159
+ }
1160
+
1161
+ def get_win_rate_matrix(
1162
+ self,
1163
+ subset: str,
1164
+ exp_name: str = "__all__",
1165
+ filter_models: Optional[list[str]] = None,
1166
+ ) -> dict[str, Any]:
1167
+ """
1168
+ Compute win rate matrix for all model pairs.
1169
+
1170
+ Args:
1171
+ subset: Subset name
1172
+ exp_name: Experiment name (use "__all__" for all experiments)
1173
+ filter_models: Optional list of models to include
1174
+
1175
+ Returns:
1176
+ Dict with:
1177
+ - models: List of model names (sorted by ELO)
1178
+ - matrix: 2D array where matrix[i][j] = win rate of model i vs model j
1179
+ - counts: 2D array where counts[i][j] = number of battles between i and j
1180
+ - wins: 2D array where wins[i][j] = wins of model i vs model j
1181
+ """
1182
+ # Load all records
1183
+ if exp_name == "__all__":
1184
+ all_records = self._load_all_experiments_battles(subset)
1185
+ else:
1186
+ all_records = self._load_battle_logs(subset, exp_name)
1187
+
1188
+ # Determine models to include
1189
+ info = self.get_subset_info(subset)
1190
+ if filter_models:
1191
+ models = [m for m in filter_models if m in info.models]
1192
+ else:
1193
+ models = list(info.models)
1194
+
1195
+ # Get ELO leaderboard to sort models by ELO
1196
+ leaderboard = self.get_elo_leaderboard(subset, models)
1197
+ models = [entry["model"] for entry in leaderboard]
1198
+
1199
+ n = len(models)
1200
+ model_to_idx = {m: i for i, m in enumerate(models)}
1201
+
1202
+ # Initialize matrices
1203
+ wins_matrix = [[0] * n for _ in range(n)]
1204
+ counts_matrix = [[0] * n for _ in range(n)]
1205
+
1206
+ # Count wins for each pair
1207
+ model_set = set(models)
1208
+ for record in all_records:
1209
+ if record.model_a not in model_set or record.model_b not in model_set:
1210
+ continue
1211
+
1212
+ i = model_to_idx[record.model_a]
1213
+ j = model_to_idx[record.model_b]
1214
+
1215
+ # Count total battles (symmetric)
1216
+ counts_matrix[i][j] += 1
1217
+ counts_matrix[j][i] += 1
1218
+
1219
+ # Count wins
1220
+ if record.final_winner == record.model_a:
1221
+ wins_matrix[i][j] += 1
1222
+ elif record.final_winner == record.model_b:
1223
+ wins_matrix[j][i] += 1
1224
+ else:
1225
+ # Tie counts as 0.5 win for each
1226
+ wins_matrix[i][j] += 0.5
1227
+ wins_matrix[j][i] += 0.5
1228
+
1229
+ # Compute win rate matrix
1230
+ win_rate_matrix = [[0.0] * n for _ in range(n)]
1231
+ for i in range(n):
1232
+ for j in range(n):
1233
+ if counts_matrix[i][j] > 0:
1234
+ win_rate_matrix[i][j] = wins_matrix[i][j] / counts_matrix[i][j]
1235
+ elif i == j:
1236
+ win_rate_matrix[i][j] = 0.5 # Self vs self
1237
+
1238
+ return {
1239
+ "models": models,
1240
+ "matrix": win_rate_matrix,
1241
+ "counts": counts_matrix,
1242
+ "wins": wins_matrix,
1243
+ }
1244
+
1245
+ def get_elo_by_source(
1246
+ self,
1247
+ subset: str,
1248
+ exp_name: str = "__all__",
1249
+ ) -> dict[str, Any]:
1250
+ """
1251
+ Compute ELO rankings grouped by prompt_source.
1252
+
1253
+ Args:
1254
+ subset: Subset name
1255
+ exp_name: Experiment name
1256
+
1257
+ Returns:
1258
+ Dict with:
1259
+ - sources: List of source names
1260
+ - leaderboards: Dict mapping source -> list of model ELO entries
1261
+ - sample_counts: Dict mapping source -> number of samples
1262
+ - battle_counts: Dict mapping source -> number of battles
1263
+ """
1264
+ from genarena.bt_elo import compute_bt_elo_ratings
1265
+
1266
+ # Load all records
1267
+ if exp_name == "__all__":
1268
+ all_records = self._load_all_experiments_battles(subset)
1269
+ else:
1270
+ all_records = self._load_battle_logs(subset, exp_name)
1271
+
1272
+ # Group battles by prompt_source
1273
+ battles_by_source: dict[str, list[tuple[str, str, str]]] = {}
1274
+ sample_counts: dict[str, set[int]] = {}
1275
+
1276
+ for record in all_records:
1277
+ source = record.prompt_source or "unknown"
1278
+ if source not in battles_by_source:
1279
+ battles_by_source[source] = []
1280
+ sample_counts[source] = set()
1281
+
1282
+ # Convert winner to bt_elo format
1283
+ if record.final_winner == record.model_a:
1284
+ winner = "model_a"
1285
+ elif record.final_winner == record.model_b:
1286
+ winner = "model_b"
1287
+ else:
1288
+ winner = "tie"
1289
+
1290
+ battles_by_source[source].append((record.model_a, record.model_b, winner))
1291
+ sample_counts[source].add(record.sample_index)
1292
+
1293
+ # Compute ELO for each source
1294
+ leaderboards: dict[str, list[dict[str, Any]]] = {}
1295
+ battle_counts: dict[str, int] = {}
1296
+
1297
+ for source, battles in battles_by_source.items():
1298
+ if not battles:
1299
+ continue
1300
+
1301
+ battle_counts[source] = len(battles)
1302
+
1303
+ try:
1304
+ ratings = compute_bt_elo_ratings(battles)
1305
+
1306
+ # Build leaderboard
1307
+ entries = []
1308
+ for model, elo in ratings.items():
1309
+ # Count wins/losses/ties for this model in this source
1310
+ wins = losses = ties = 0
1311
+ for ma, mb, w in battles:
1312
+ if model == ma:
1313
+ if w == "model_a":
1314
+ wins += 1
1315
+ elif w == "model_b":
1316
+ losses += 1
1317
+ else:
1318
+ ties += 1
1319
+ elif model == mb:
1320
+ if w == "model_b":
1321
+ wins += 1
1322
+ elif w == "model_a":
1323
+ losses += 1
1324
+ else:
1325
+ ties += 1
1326
+
1327
+ total = wins + losses + ties
1328
+ entries.append({
1329
+ "model": model,
1330
+ "elo": round(elo, 1),
1331
+ "wins": wins,
1332
+ "losses": losses,
1333
+ "ties": ties,
1334
+ "total": total,
1335
+ "win_rate": (wins + 0.5 * ties) / total if total > 0 else 0,
1336
+ })
1337
+
1338
+ # Sort by ELO descending
1339
+ entries.sort(key=lambda x: -x["elo"])
1340
+ leaderboards[source] = entries
1341
+
1342
+ except Exception as e:
1343
+ logger.warning(f"Failed to compute ELO for source {source}: {e}")
1344
+ continue
1345
+
1346
+ # Sort sources by battle count
1347
+ sources = sorted(battle_counts.keys(), key=lambda s: -battle_counts[s])
1348
+
1349
+ return {
1350
+ "sources": sources,
1351
+ "leaderboards": leaderboards,
1352
+ "sample_counts": {s: len(sample_counts[s]) for s in sources},
1353
+ "battle_counts": battle_counts,
1354
+ }
1355
+
1356
+ def _load_elo_snapshot(self, snapshot_path: str) -> Optional[dict[str, Any]]:
1357
+ """
1358
+ Load ELO snapshot from a JSON file.
1359
+
1360
+ Args:
1361
+ snapshot_path: Path to elo_snapshot.json
1362
+
1363
+ Returns:
1364
+ Dict with elo ratings and metadata, or None if not found
1365
+ """
1366
+ if not os.path.isfile(snapshot_path):
1367
+ return None
1368
+
1369
+ try:
1370
+ with open(snapshot_path, "r", encoding="utf-8") as f:
1371
+ data = json.load(f)
1372
+
1373
+ if not isinstance(data, dict):
1374
+ return None
1375
+
1376
+ # Extract ELO ratings (support both {"elo": {...}} and direct {model: elo} format)
1377
+ elo_data = data.get("elo") if isinstance(data.get("elo"), dict) else data
1378
+ if not isinstance(elo_data, dict):
1379
+ return None
1380
+
1381
+ return {
1382
+ "elo": {str(k): float(v) for k, v in elo_data.items()},
1383
+ "battle_count": data.get("battle_count", 0),
1384
+ "model_count": data.get("model_count", len(elo_data)),
1385
+ "exp_name": data.get("exp_name", ""),
1386
+ }
1387
+ except Exception as e:
1388
+ logger.debug(f"Failed to load ELO snapshot from {snapshot_path}: {e}")
1389
+ return None
1390
+
1391
    def get_elo_history(
        self,
        subset: str,
        exp_name: str = "__all__",
        granularity: str = "experiment",
        filter_models: Optional[list[str]] = None,
        max_points: int = 50,
    ) -> dict[str, Any]:
        """
        Get ELO history over experiments by reading pre-computed elo_snapshot.json files.

        Args:
            subset: Subset name
            exp_name: Experiment name (only "__all__" or "experiment" granularity supported)
            granularity: Grouping method ("experiment" reads from snapshots; time-based not supported)
            filter_models: Optional models to track
            max_points: Maximum number of time points to return

        Returns:
            Dict with:
                - timestamps: List of experiment names (only those with a snapshot)
                - models: Dict mapping model -> list of ELO values, aligned to
                  timestamps (None where the model has no rating at that point)
                - battle_counts: List of battle counts, one per timestamp
        """
        # Get subset info for experiment order
        info = self.get_subset_info(subset)
        if not info:
            return {"timestamps": [], "models": {}, "battle_counts": []}

        # Only support experiment-level granularity (reading from snapshots)
        # Time-based granularity would require real-time computation which we want to avoid
        if granularity != "experiment":
            logger.warning(
                f"Time-based granularity '{granularity}' is not supported for ELO history. "
                f"Falling back to 'experiment' granularity."
            )

        # Get ordered list of experiments
        experiments = info.experiments
        if not experiments:
            return {"timestamps": [], "models": {}, "battle_counts": []}

        # If too many experiments, sample them evenly (always keep the last one
        # so the most recent state is represented)
        if len(experiments) > max_points:
            step = len(experiments) // max_points
            sampled = [experiments[i] for i in range(0, len(experiments), step)]
            if sampled[-1] != experiments[-1]:
                sampled.append(experiments[-1])
            experiments = sampled

        # Load ELO snapshots for each experiment
        timestamps: list[str] = []
        model_elos: dict[str, list[Optional[float]]] = {}
        battle_counts: list[int] = []

        pk_logs_dir = os.path.join(self.arena_dir, subset, "pk_logs")

        for exp in experiments:
            snapshot_path = os.path.join(pk_logs_dir, exp, "elo_snapshot.json")
            snapshot = self._load_elo_snapshot(snapshot_path)

            if snapshot is None:
                # Skip experiments without snapshots
                continue

            elo_ratings = snapshot["elo"]
            battle_count = snapshot["battle_count"]

            timestamps.append(exp)
            battle_counts.append(battle_count)

            # Update model ELOs: iterate the union of previously seen models and
            # models in this snapshot so every known series gets exactly one new
            # entry per timestamp (None when a model is absent from the snapshot)
            all_models_so_far = set(model_elos.keys()) | set(elo_ratings.keys())
            for model in all_models_so_far:
                if model not in model_elos:
                    # New model: fill with None for previous timestamps
                    model_elos[model] = [None] * (len(timestamps) - 1)
                model_elos[model].append(elo_ratings.get(model))

        # Ensure all models have the same length
        # (safety net: pads a series that fell one entry short; normally a
        # no-op since the union loop above appends for every known model)
        for model in model_elos:
            if len(model_elos[model]) < len(timestamps):
                model_elos[model].append(None)

        # Filter to requested models if specified
        if filter_models:
            filter_set = set(filter_models)
            model_elos = {m: v for m, v in model_elos.items() if m in filter_set}

        return {
            "timestamps": timestamps,
            "models": model_elos,
            "battle_counts": battle_counts,
        }
1485
+
1486
def get_cross_subset_info(
    self,
    subsets: list[str],
) -> dict[str, Any]:
    """
    Get information about models across multiple subsets.

    Args:
        subsets: List of subset names

    Returns:
        Dict with:
            - common_models: Models present in all subsets (sorted)
            - all_models: Models present in any subset (sorted)
            - per_subset_models: Dict mapping subset -> sorted list of models
            - per_subset_battles: Dict mapping subset -> battle count
            - total_battles: Sum of battle counts across subsets
    """
    per_subset_models: dict[str, set[str]] = {}
    per_subset_battles: dict[str, int] = {}

    for subset in subsets:
        info = self.get_subset_info(subset)
        if info:
            per_subset_models[subset] = set(info.models)
            per_subset_battles[subset] = info.total_battles

    if not per_subset_models:
        # BUGFIX: the empty case previously omitted "total_battles",
        # making the payload shape inconsistent with the non-empty case.
        return {
            "common_models": [],
            "all_models": [],
            "per_subset_models": {},
            "per_subset_battles": {},
            "total_battles": 0,
        }

    # Intersection = models in every subset; union = models in any subset.
    model_sets = list(per_subset_models.values())
    common_models = set.intersection(*model_sets)
    all_models = set.union(*model_sets)

    return {
        "common_models": sorted(common_models),
        "all_models": sorted(all_models),
        "per_subset_models": {s: sorted(m) for s, m in per_subset_models.items()},
        "per_subset_battles": per_subset_battles,
        "total_battles": sum(per_subset_battles.values()),
    }
1532
+
1533
def get_cross_subset_elo(
    self,
    subsets: list[str],
    exp_name: str = "__all__",
    model_scope: str = "all",
) -> dict[str, Any]:
    """
    Compute ELO rankings across multiple subsets.

    Args:
        subsets: List of subset names
        exp_name: Experiment name (use "__all__" for all)
        model_scope: "common" = only models in all subsets, "all" = all models

    Returns:
        Dict with merged leaderboard and per-subset comparison
    """
    # Check cache first (sorted tuple so subset order does not matter).
    cache_key = (tuple(sorted(subsets)), exp_name, model_scope)
    if cache_key in self._cross_subset_elo_cache:
        return self._cross_subset_elo_cache[cache_key]

    from genarena.bt_elo import compute_bt_elo_ratings

    # Get cross-subset info
    cross_info = self.get_cross_subset_info(subsets)

    # Determine models to include
    if model_scope == "common":
        included_models = set(cross_info["common_models"])
    else:
        included_models = set(cross_info["all_models"])

    def _empty_result() -> dict[str, Any]:
        # Shared payload for the two "nothing to rank" early exits.
        # Deliberately not cached, matching the previous early returns.
        return {
            "subsets": subsets,
            "model_scope": model_scope,
            "common_models": cross_info["common_models"],
            "all_models": cross_info["all_models"],
            "total_battles": 0,
            "leaderboard": [],
            "per_subset_elo": {},
        }

    if not included_models:
        return _empty_result()

    # Collect all battles across the requested subsets
    all_battles = []
    model_presence: dict[str, set[str]] = {}  # model -> set of subsets it's in

    for subset in subsets:
        if exp_name == "__all__":
            records = self._load_all_experiments_battles(subset)
        else:
            records = self._load_battle_logs(subset, exp_name)

        for record in records:
            # Skip if either model is not in included set
            if model_scope == "common":
                if record.model_a not in included_models or record.model_b not in included_models:
                    continue

            # Convert to bt_elo format; anything that is neither model
            # (e.g. "tie") counts as a tie.
            if record.final_winner == record.model_a:
                winner = "model_a"
            elif record.final_winner == record.model_b:
                winner = "model_b"
            else:
                winner = "tie"

            all_battles.append((record.model_a, record.model_b, winner))

            # Track which subsets each model appeared in
            for m in [record.model_a, record.model_b]:
                if m not in model_presence:
                    model_presence[m] = set()
                model_presence[m].add(subset)

    if not all_battles:
        return _empty_result()

    # Compute merged ELO
    try:
        ratings = compute_bt_elo_ratings(all_battles)
    except Exception as e:
        logger.error(f"Failed to compute cross-subset ELO: {e}")
        return {
            "subsets": subsets,
            "model_scope": model_scope,
            "error": str(e),
            "total_battles": len(all_battles),
            "leaderboard": [],
        }

    # Count wins/losses/ties per model
    model_stats: dict[str, dict[str, int]] = {}
    for ma, mb, winner in all_battles:
        for m in [ma, mb]:
            if m not in model_stats:
                model_stats[m] = {"wins": 0, "losses": 0, "ties": 0}

        if winner == "model_a":
            model_stats[ma]["wins"] += 1
            model_stats[mb]["losses"] += 1
        elif winner == "model_b":
            model_stats[mb]["wins"] += 1
            model_stats[ma]["losses"] += 1
        else:
            model_stats[ma]["ties"] += 1
            model_stats[mb]["ties"] += 1

    # Build leaderboard (ties count as half a win for win_rate)
    leaderboard = []
    for model, elo in ratings.items():
        stats = model_stats.get(model, {"wins": 0, "losses": 0, "ties": 0})
        total = stats["wins"] + stats["losses"] + stats["ties"]
        leaderboard.append({
            "model": model,
            "elo": round(elo, 1),
            "wins": stats["wins"],
            "losses": stats["losses"],
            "ties": stats["ties"],
            "total": total,
            "win_rate": (stats["wins"] + 0.5 * stats["ties"]) / total if total > 0 else 0,
            "subset_presence": sorted(model_presence.get(model, set())),
        })

    leaderboard.sort(key=lambda x: -x["elo"])

    # Get per-subset ELO for side-by-side comparison
    per_subset_elo: dict[str, dict[str, float]] = {}
    for subset in subsets:
        subset_lb = self.get_elo_leaderboard(subset)
        per_subset_elo[subset] = {entry["model"]: entry["elo"] for entry in subset_lb}

    result = {
        "subsets": subsets,
        "model_scope": model_scope,
        "common_models": cross_info["common_models"],
        "all_models": cross_info["all_models"],
        "total_battles": len(all_battles),
        "leaderboard": leaderboard,
        "per_subset_elo": per_subset_elo,
    }

    # Cache only the fully-computed result
    self._cross_subset_elo_cache[cache_key] = result
    return result
1687
+
1688
def get_stats(self, subset: str, exp_name: Optional[str] = None) -> dict[str, Any]:
    """
    Get statistics for a subset.

    Args:
        subset: Subset name
        exp_name: Optional experiment name (falsy -> overall state;
            "__all__" -> combine every experiment)

    Returns:
        Statistics dictionary (empty dict if the subset is unknown)
    """
    info = self.get_subset_info(subset)
    if not info:
        return {}

    # Decide where the battle records come from; None means "use the
    # aggregate counters from the overall state instead of raw records".
    if exp_name == "__all__":
        records = self._load_all_experiments_battles(subset)
    elif exp_name:
        records = self._load_battle_logs(subset, exp_name)
    else:
        records = None

    if records is None:
        # Overall state: per-record consistency/tie flags are unavailable.
        total_battles = info.total_battles
        consistent = 0
        ties = 0
    else:
        total_battles = len(records)
        consistent = sum(1 for r in records if r.is_consistent)
        ties = sum(1 for r in records if r.final_winner == "tie")

    return {
        "subset": subset,
        "models": info.models,
        "experiments": info.experiments,
        "total_battles": total_battles,
        "consistent_battles": consistent,
        "tie_battles": ties,
        "consistency_rate": consistent / total_battles if total_battles > 0 else 0,
    }
1728
+
1729
def get_model_win_stats(
    self, subset: str, exp_name: str, sample_index: int,
    filter_models: Optional[list[str]] = None
) -> dict[str, dict[str, Any]]:
    """
    Get win/loss statistics for all models on a specific sample.

    Args:
        subset: Subset name
        exp_name: Experiment name (use "__all__" for all experiments)
        sample_index: Sample index
        filter_models: Optional list of models; when given, only battles
            fought between two filtered models are counted

    Returns:
        Dict mapping model name to stats (wins, losses, ties, total, win_rate)
    """
    if exp_name == "__all__":
        all_records = self._load_all_experiments_battles(subset)
    else:
        all_records = self._load_battle_logs(subset, exp_name)

    # Keep only battles on this sample.
    sample_records = [r for r in all_records if r.sample_index == sample_index]

    if filter_models:
        allowed = set(filter_models)
        sample_records = [
            r for r in sample_records
            if r.model_a in allowed and r.model_b in allowed
        ]

    tallies: dict[str, dict[str, int]] = {}

    def _tally(name: str) -> dict[str, int]:
        # Ensure both participants get an entry even if the winner field
        # matches neither model and is not "tie".
        if name not in tallies:
            tallies[name] = {"wins": 0, "losses": 0, "ties": 0}
        return tallies[name]

    for rec in sample_records:
        stats_a = _tally(rec.model_a)
        stats_b = _tally(rec.model_b)

        if rec.final_winner == "tie":
            stats_a["ties"] += 1
            stats_b["ties"] += 1
        elif rec.final_winner == rec.model_a:
            stats_a["wins"] += 1
            stats_b["losses"] += 1
        elif rec.final_winner == rec.model_b:
            stats_b["wins"] += 1
            stats_a["losses"] += 1

    # Derive totals and win rates.
    result: dict[str, dict[str, Any]] = {}
    for name, counts in tallies.items():
        total = counts["wins"] + counts["losses"] + counts["ties"]
        result[name] = {
            "wins": counts["wins"],
            "losses": counts["losses"],
            "ties": counts["ties"],
            "total": total,
            "win_rate": counts["wins"] / total if total > 0 else 0,
        }

    return result
1793
+
1794
def get_sample_all_models(
    self, subset: str, exp_name: str, sample_index: int,
    filter_models: Optional[list[str]] = None,
    stats_scope: str = "filtered"
) -> dict[str, Any]:
    """
    Get all model outputs for a specific sample, sorted by win rate.

    Args:
        subset: Subset name
        exp_name: Experiment name
        sample_index: Sample index
        filter_models: Optional list of models to display
        stats_scope: 'filtered' = only count battles between filtered models,
            'all' = count all battles (but still display only filtered models)

    Returns:
        Dict with sample info and all model outputs sorted by win rate
    """
    sample_meta = self._get_sample_data(subset, sample_index)

    # With 'all' scope, win rates also include battles against models
    # that are hidden from the display.
    stats_filter = filter_models if stats_scope == "filtered" else None
    model_stats = self.get_model_win_stats(subset, exp_name, sample_index, stats_filter)

    model_manager = self._get_model_manager(subset)
    available_models: list[dict[str, Any]] = []

    if model_manager:
        candidates = model_manager.models
        if filter_models:
            wanted = set(filter_models)
            candidates = [m for m in candidates if m in wanted]

        for candidate in candidates:
            # Only list models whose output file actually exists on disk.
            output_path = model_manager.get_output_path(candidate, sample_index)
            if not output_path or not os.path.isfile(output_path):
                continue
            stats = model_stats.get(candidate, {
                "wins": 0, "losses": 0, "ties": 0, "total": 0, "win_rate": 0
            })
            available_models.append({
                "model": candidate,
                "wins": stats["wins"],
                "losses": stats["losses"],
                "ties": stats["ties"],
                "total": stats["total"],
                "win_rate": stats["win_rate"],
            })

    # Best win rate first; ties broken by wins, then model name.
    available_models.sort(key=lambda e: (-e["win_rate"], -e["wins"], e["model"]))

    return {
        "subset": subset,
        "exp_name": exp_name,
        "sample_index": sample_index,
        "instruction": sample_meta.get("instruction", ""),
        "task_type": sample_meta.get("task_type", ""),
        "input_image_count": sample_meta.get("input_image_count", 1),
        "prompt_source": sample_meta.get("prompt_source"),
        "original_metadata": sample_meta.get("original_metadata"),
        "models": available_models,
    }
1861
+
1862
def get_model_battles_for_sample(
    self,
    subset: str,
    exp_name: str,
    sample_index: int,
    model: str,
    opponent_models: Optional[list[str]] = None,
) -> dict[str, Any]:
    """
    Get all battle records for a specific model on a specific sample.

    Args:
        subset: Subset name
        exp_name: Experiment name (use "__all__" for all experiments)
        sample_index: Sample index
        model: The model to get battles for
        opponent_models: Optional list of opponent models to filter by

    Returns:
        Dict with model info and list of battle records
    """
    if exp_name == "__all__":
        records = self._load_all_experiments_battles(subset)
    else:
        records = self._load_battle_logs(subset, exp_name)

    battles: list[dict[str, Any]] = []
    seen_opponents: set[str] = set()

    for rec in records:
        if rec.sample_index != sample_index:
            continue
        if model != rec.model_a and model != rec.model_b:
            continue

        rival = rec.model_b if rec.model_a == model else rec.model_a
        # Collected BEFORE the opponent filter so the caller always sees
        # the full set of available opponents.
        seen_opponents.add(rival)

        if opponent_models and rival not in opponent_models:
            continue

        # Outcome from this model's point of view.
        if rec.final_winner == "tie":
            outcome = "tie"
        elif rec.final_winner == model:
            outcome = "win"
        else:
            outcome = "loss"

        entry: dict[str, Any] = {
            "opponent": rival,
            "result": outcome,
            "is_consistent": rec.is_consistent,
            "model_a": rec.model_a,
            "model_b": rec.model_b,
            "final_winner": rec.final_winner,
            "exp_name": rec.exp_name,
        }

        if rec.original_call or rec.swapped_call:
            # Judge outputs already attached to the record.
            if rec.original_call:
                entry["original_call"] = rec.original_call
            if rec.swapped_call:
                entry["swapped_call"] = rec.swapped_call
        else:
            # Fall back to the on-disk audit log for this battle.
            audit = self._load_audit_log(
                subset, rec.exp_name, rec.model_a, rec.model_b, sample_index
            )
            if audit:
                entry["original_call"] = audit.get("original_call")
                entry["swapped_call"] = audit.get("swapped_call")

        battles.append(entry)

    battles.sort(key=lambda b: b["opponent"])

    # Overall per-sample stats for this model (unfiltered).
    overall = self.get_model_win_stats(subset, exp_name, sample_index)
    stats = overall.get(model, {
        "wins": 0, "losses": 0, "ties": 0, "total": 0, "win_rate": 0
    })

    return {
        "model": model,
        "sample_index": sample_index,
        "wins": stats["wins"],
        "losses": stats["losses"],
        "ties": stats["ties"],
        "total": stats["total"],
        "win_rate": stats["win_rate"],
        "battles": battles,
        "all_opponents": sorted(list(seen_opponents)),
    }
1963
+
1964
def get_elo_leaderboard(
    self,
    subset: str,
    filter_models: Optional[list[str]] = None,
) -> list[dict[str, Any]]:
    """
    Get ELO leaderboard for a subset from state.json.

    Args:
        subset: Subset name
        filter_models: Optional list of models to show (others are dropped)

    Returns:
        List of model stats sorted by ELO rating (descending), with a
        1-based "rank" field attached
    """
    info = self.get_subset_info(subset)
    if not info or not info.state:
        return []

    wanted = set(filter_models) if filter_models else None

    leaderboard = [
        {
            "model": name,
            "elo": stats.elo,
            "wins": stats.wins,
            "losses": stats.losses,
            "ties": stats.ties,
            "total_battles": stats.total_battles,
            "win_rate": stats.win_rate,
        }
        for name, stats in info.state.models.items()
        if wanted is None or name in wanted
    ]

    # Highest ELO first (stable for equal ratings).
    leaderboard.sort(key=lambda entry: entry["elo"], reverse=True)

    for rank, entry in enumerate(leaderboard, start=1):
        entry["rank"] = rank

    return leaderboard
2009
+
2010
def get_model_vs_stats(
    self,
    subset: str,
    model: str,
    exp_name: str = "__all__",
) -> dict[str, Any]:
    """
    Get win/loss/tie stats of a specific model against all other models.

    Args:
        subset: Subset name
        model: Target model name
        exp_name: Experiment name (default "__all__" for all experiments)

    Returns:
        Dict with the model's overall stats and head-to-head stats per
        opponent (empty dict if subset/model is unknown)
    """
    info = self.get_subset_info(subset)
    if not info or not info.state:
        return {}

    state = info.state
    if model not in state.models:
        return {}

    overall = state.models[model]

    if exp_name == "__all__":
        records = self._load_all_experiments_battles(subset)
    else:
        records = self._load_battle_logs(subset, exp_name)

    # Head-to-head tallies keyed by opponent.
    head_to_head: dict[str, dict[str, int]] = {}

    for rec in records:
        if model != rec.model_a and model != rec.model_b:
            continue

        rival = rec.model_b if rec.model_a == model else rec.model_a
        counts = head_to_head.setdefault(rival, {"wins": 0, "losses": 0, "ties": 0})

        if rec.final_winner == "tie":
            counts["ties"] += 1
        elif rec.final_winner == model:
            counts["wins"] += 1
        else:
            counts["losses"] += 1

    # Attach win rate and the opponent's ELO (1000.0 if unrated).
    vs_list = []
    for rival, counts in head_to_head.items():
        total = counts["wins"] + counts["losses"] + counts["ties"]
        rival_elo = state.models[rival].elo if rival in state.models else 1000.0
        vs_list.append({
            "opponent": rival,
            "opponent_elo": rival_elo,
            "wins": counts["wins"],
            "losses": counts["losses"],
            "ties": counts["ties"],
            "total": total,
            "win_rate": counts["wins"] / total if total > 0 else 0,
        })

    vs_list.sort(key=lambda entry: entry["opponent_elo"], reverse=True)

    return {
        "model": model,
        "elo": overall.elo,
        "wins": overall.wins,
        "losses": overall.losses,
        "ties": overall.ties,
        "total_battles": overall.total_battles,
        "win_rate": overall.win_rate,
        "vs_stats": vs_list,
    }
2091
+
2092
def get_all_subsets_leaderboards(self) -> dict[str, Any]:
    """
    Get leaderboard data for all subsets (for Overview page).

    Returns:
        Dict with:
            - subsets: List of subset names
            - models: All unique model names, ordered by mean ELO across
              subsets (ties: more subsets first, then name)
            - data: Dict mapping subset -> {model -> {elo, rank, wins, ...}}
            - subset_info: Dict mapping subset -> {total_battles, model_count}
    """
    subsets = self.discover_subsets()
    data: dict[str, dict[str, dict[str, Any]]] = {}
    subset_info: dict[str, dict[str, Any]] = {}
    # model -> ELO values collected per subset (for the mean-ELO ordering).
    elo_samples: dict[str, list[float]] = {}

    for subset in subsets:
        leaderboard = self.get_elo_leaderboard(subset)
        info = self.get_subset_info(subset)

        if not leaderboard:
            continue

        per_model: dict[str, dict[str, Any]] = {}
        for entry in leaderboard:
            name = entry["model"]
            per_model[name] = {
                "elo": entry["elo"],
                "rank": entry["rank"],
                "wins": entry["wins"],
                "losses": entry["losses"],
                "ties": entry["ties"],
                "total_battles": entry["total_battles"],
                "win_rate": entry["win_rate"],
            }
            elo_samples.setdefault(name, []).append(entry["elo"])

        data[subset] = per_model
        subset_info[subset] = {
            "total_battles": info.total_battles if info else 0,
            "model_count": len(leaderboard),
        }

    def _ordering(name: str) -> tuple[float, int, str]:
        # Descending mean ELO, then descending subset count, then name.
        elos = elo_samples[name]
        mean = sum(elos) / len(elos) if elos else 0.0
        return (-mean, -len(elos), name)

    ordered_models = sorted(elo_samples, key=_ordering)

    return {
        "subsets": subsets,
        "models": ordered_models,
        "data": data,
        "subset_info": subset_info,
    }
2161
+
2162
def get_prompts(
    self,
    subset: str,
    exp_name: str,
    page: int = 1,
    page_size: int = 10,
    min_images: Optional[int] = None,
    max_images: Optional[int] = None,
    prompt_source: Optional[str] = None,
    filter_models: Optional[list[str]] = None,
) -> tuple[list[dict[str, Any]], int]:
    """
    Get paginated list of prompts/samples with all model outputs.

    Args:
        subset: Subset name
        exp_name: Experiment name (use "__all__" for all experiments)
        page: Page number (1-indexed; values < 1 are clamped to 1)
        page_size: Number of records per page
        min_images: Minimum number of input images
        max_images: Maximum number of input images
        prompt_source: Filter by prompt source
        filter_models: Optional list of models to filter (show only these models)

    Returns:
        Tuple of (prompts_list, total_count)
    """
    # Collect unique sample indices from the battle logs
    if exp_name == "__all__":
        all_records = self._load_all_experiments_battles(subset)
    else:
        all_records = self._load_battle_logs(subset, exp_name)

    sorted_indices = sorted({record.sample_index for record in all_records})

    has_filters = (
        min_images is not None
        or max_images is not None
        or prompt_source is not None
    )
    if has_filters:
        filtered_indices = []
        for idx in sorted_indices:
            sample_meta = self._get_sample_data(subset, idx)
            img_count = sample_meta.get("input_image_count", 1)
            source = sample_meta.get("prompt_source")

            if min_images is not None and img_count < min_images:
                continue
            if max_images is not None and img_count > max_images:
                continue
            if prompt_source and source != prompt_source:
                continue

            filtered_indices.append(idx)
    else:
        # PERF: no filters requested, so skip loading per-sample metadata
        # for every index just to accept all of them.
        filtered_indices = sorted_indices

    total_count = len(filtered_indices)

    # BUGFIX: clamp page so page <= 0 no longer produces a negative slice
    # start, which silently returned items from the END of the list.
    page = max(page, 1)
    start = (page - 1) * page_size
    page_indices = filtered_indices[start:start + page_size]

    prompts = [
        self.get_sample_all_models(subset, exp_name, idx, filter_models)
        for idx in page_indices
    ]

    return prompts, total_count
2236
+
2237
+
2238
class HFArenaDataLoader(ArenaDataLoader):
    """
    Data loader for HuggingFace Spaces deployment.

    Extends ArenaDataLoader to:
    - Build an image URL index from the HF repo file listing
    - Serve HF CDN URLs for model output images instead of local paths
    """

    def __init__(
        self,
        arena_dir: str,
        data_dir: str,
        hf_repo: str,
        image_files: list[str],
        preload: bool = True,
    ):
        """
        Initialize the HF data loader.

        Args:
            arena_dir: Path to arena directory (metadata only, no images)
            data_dir: Path to data directory containing parquet files
            hf_repo: HuggingFace repo ID for image CDN URLs
            image_files: List of image file paths in the HF repo
            preload: If True, preload all data at initialization
        """
        # Build the URL index before the base class runs (it may preload).
        self.hf_repo = hf_repo
        self._image_url_index = self._build_image_index(image_files)
        super().__init__(arena_dir, data_dir, preload=preload)

    def _build_image_index(
        self, image_files: list[str]
    ) -> dict[tuple[str, str, int], str]:
        """
        Build index: (subset, model, sample_index) -> hf_file_path

        Expected path format: {subset}/models/{exp_name}/{model}/{index}.png

        Args:
            image_files: List of image file paths from HF repo

        Returns:
            Dict mapping (subset, model, sample_index) to HF file path
        """
        from genarena.models import parse_image_index

        index: dict[tuple[str, str, int], str] = {}

        for path in image_files:
            segments = path.split("/")
            # Expected layout: subset/models/exp_name/model/000000.png
            # (segments[2] is exp_name; not needed for the lookup key).
            if len(segments) < 5 or segments[1] != "models":
                continue
            sample_idx = parse_image_index(segments[4])
            if sample_idx is None:
                continue
            # Duplicate keys resolve to whichever path appears last.
            index[(segments[0], segments[3], sample_idx)] = path

        logger.info(f"Built image URL index with {len(index)} entries")
        return index

    def get_model_image_url(
        self, subset: str, model: str, sample_index: int
    ) -> Optional[str]:
        """
        Get HF CDN URL for model output image.

        Args:
            subset: Subset name
            model: Model name
            sample_index: Sample index

        Returns:
            HF CDN URL or None if not found
        """
        hf_path = self._image_url_index.get((subset, model, sample_index))
        if hf_path is None:
            return None
        return f"https://huggingface.co/datasets/{self.hf_repo}/resolve/main/{hf_path}"

    def get_image_path(
        self, subset: str, model: str, sample_index: int
    ) -> Optional[str]:
        """
        Override to return None since images are served via CDN.

        For HF deployment, use get_model_image_url() instead.
        """
        # No local file exists; callers must go through the CDN URL.
        return None
genarena/visualize/static/app.js ADDED
The diff for this file is too large to render. See raw diff
 
genarena/visualize/static/style.css ADDED
@@ -0,0 +1,4104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* ========== CSS Variables ========== */
:root {
    /* Surface colors (darkest to lightest) */
    --bg-primary: #0d1117;
    --bg-secondary: #161b22;
    --bg-tertiary: #21262d;
    --bg-hover: #30363d;
    --border-color: #30363d;
    --border-light: #484f58;

    /* Text colors */
    --text-primary: #e6edf3;
    --text-secondary: #8b949e;
    --text-muted: #6e7681;

    /* Accent palette */
    --accent-blue: #58a6ff;
    --accent-green: #3fb950;
    --accent-red: #f85149;
    --accent-yellow: #d29922;
    --accent-purple: #a371f7;

    /* Font stacks */
    --font-mono: 'JetBrains Mono', 'Fira Code', 'Cascadia Code', monospace;
    --font-sans: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;

    /* Corner radii */
    --radius-sm: 4px;
    --radius-md: 8px;
    --radius-lg: 12px;

    /* Elevation shadows */
    --shadow-sm: 0 1px 2px rgba(0, 0, 0, 0.3);
    --shadow-md: 0 4px 12px rgba(0, 0, 0, 0.4);
    --shadow-lg: 0 8px 24px rgba(0, 0, 0, 0.5);
}

/* ========== Reset & Base ========== */
* {
    margin: 0;
    padding: 0;
    box-sizing: border-box;
}

body {
    font-family: var(--font-sans);
    background: var(--bg-primary);
    color: var(--text-primary);
    line-height: 1.6;
    min-height: 100vh;
}
/* ========== Header ========== */
.header {
    display: flex;
    align-items: center;
    justify-content: space-between;
    padding: 12px 24px;
    background: var(--bg-secondary);
    border-bottom: 1px solid var(--border-color);
    position: sticky; /* keep header visible while the content scrolls */
    top: 0;
    z-index: 100;
}

.header-left {
    display: flex;
    align-items: center;
    gap: 32px;
    flex: 0 0 auto;
}

.logo {
    font-size: 1.25rem;
    font-weight: 700;
    color: var(--accent-blue);
    letter-spacing: -0.02em;
}

/* Merged: this selector was previously declared twice (flex sizing here and
   flex layout under "Favorites Button"); one rule yields the same computed
   style. */
.header-right {
    display: flex;
    align-items: center;
    gap: 12px;
    flex: 0 0 auto;
}

.selector-group {
    display: flex;
    align-items: center;
    gap: 8px;
}

.selector-group label {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

.selector {
    padding: 6px 12px;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    color: var(--text-primary);
    font-size: 0.875rem;
    cursor: pointer;
    min-width: 160px;
}

.selector:hover:not(:disabled) {
    border-color: var(--border-light);
}

.selector:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

.stats-badge {
    font-size: 0.75rem;
    padding: 4px 10px;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
    color: var(--text-secondary);
    font-family: var(--font-mono);
}

/* Favorites Button */
.btn-favorites {
    display: flex;
    align-items: center;
    gap: 6px;
    padding: 6px 12px;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    color: var(--accent-yellow);
    cursor: pointer;
    transition: all 0.15s ease;
}

.btn-favorites:hover {
    background: var(--bg-hover);
    border-color: var(--accent-yellow);
}

.favorites-icon {
    font-size: 1rem;
}

.favorites-count {
    font-size: 0.75rem;
    font-family: var(--font-mono);
    background: var(--bg-primary);
    padding: 2px 6px;
    border-radius: var(--radius-sm);
    color: var(--text-secondary);
}

/* View Toggle */
.view-toggle {
    display: flex;
    gap: 4px;
    background: var(--bg-tertiary);
    padding: 4px;
    border-radius: var(--radius-sm);
    border: 1px solid var(--border-color);
}

.view-btn {
    display: flex;
    align-items: center;
    gap: 6px;
    padding: 6px 12px;
    background: transparent;
    border: none;
    border-radius: var(--radius-sm);
    color: var(--text-secondary);
    font-size: 0.8125rem;
    cursor: pointer;
    transition: all 0.15s ease;
}

.view-btn:hover {
    color: var(--text-primary);
    background: var(--bg-hover);
}

.view-btn.active {
    background: var(--accent-blue);
    color: #fff;
}

.view-icon {
    font-size: 0.875rem;
}
/* ========== Main Layout ========== */
.main-container {
    display: flex;
    min-height: calc(100vh - 57px); /* viewport minus sticky header height */
}

/* ========== Sidebar ========== */
.sidebar {
    width: 280px;
    flex-shrink: 0;
    background: var(--bg-secondary);
    border-right: 1px solid var(--border-color);
    padding: 20px;
    display: flex;
    flex-direction: column;
    gap: 24px;
}

.filter-section h3,
.stats-section h3 {
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin-bottom: 12px;
}

.filter-group {
    margin-bottom: 12px;
}

.filter-group label {
    display: block;
    font-size: 0.8125rem;
    color: var(--text-secondary);
    margin-bottom: 4px;
}

.filter-select {
    width: 100%;
    padding: 8px 10px;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    color: var(--text-primary);
    font-size: 0.875rem;
}

/* Checkbox group for multi-select models */
.checkbox-group {
    max-height: 200px;
    overflow-y: auto;
    background: var(--bg-tertiary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    padding: 8px;
    margin-bottom: 8px;
}

.checkbox-item {
    display: flex;
    align-items: center;
    padding: 4px 0;
    cursor: pointer;
}

/* Negative margin lets the hover highlight bleed to the group's edges. */
.checkbox-item:hover {
    background: var(--bg-hover);
    margin: 0 -8px;
    padding: 4px 8px;
}

.checkbox-item input[type="checkbox"] {
    margin-right: 8px;
    accent-color: var(--accent-blue);
    cursor: pointer;
}

.checkbox-item label {
    font-size: 0.8125rem;
    color: var(--text-primary);
    cursor: pointer;
    flex: 1;
    white-space: nowrap;
    overflow: hidden;
    text-overflow: ellipsis;
}

.checkbox-actions {
    display: flex;
    gap: 4px;
}

.checkbox-actions .btn-small {
    flex: 1;
    font-size: 0.75rem;
}

/* Consolidated: these three id rules previously repeated identical
   declarations in two separate rulesets. */
#model-count,
#prompts-model-count,
#favorites-model-count {
    font-size: 0.75rem;
    color: var(--text-muted);
}

.filter-hint {
    font-size: 0.75rem;
    color: var(--text-muted);
    margin-top: 8px;
    font-style: italic;
}

/* Range Slider for image count filter */
.range-slider-container {
    position: relative;
    height: 30px;
    margin: 8px 0;
}

/* Two overlaid native sliders form a dual-handle range control; the track
   itself ignores pointer events so only the thumbs are draggable. */
.range-slider {
    position: absolute;
    width: 100%;
    height: 4px;
    background: transparent;
    -webkit-appearance: none;
    appearance: none;
    pointer-events: none;
    top: 50%;
    transform: translateY(-50%);
}

.range-slider::-webkit-slider-runnable-track {
    height: 4px;
    background: var(--bg-tertiary);
    border-radius: 2px;
}

.range-slider::-webkit-slider-thumb {
    -webkit-appearance: none;
    appearance: none;
    width: 16px;
    height: 16px;
    background: var(--accent-blue);
    border-radius: 50%;
    cursor: pointer;
    pointer-events: auto;
    margin-top: -6px; /* center the 16px thumb on the 4px track */
}

.range-slider::-moz-range-track {
    height: 4px;
    background: var(--bg-tertiary);
    border-radius: 2px;
}

.range-slider::-moz-range-thumb {
    width: 16px;
    height: 16px;
    background: var(--accent-blue);
    border: none;
    border-radius: 50%;
    cursor: pointer;
    pointer-events: auto;
}

.range-labels {
    display: flex;
    justify-content: space-between;
    font-size: 0.75rem;
    color: var(--text-muted);
}

#image-range-display {
    font-family: var(--font-mono);
    color: var(--accent-blue);
}
/* ========== Buttons ========== */
.btn {
    padding: 8px 16px;
    border: none;
    border-radius: var(--radius-sm);
    font-size: 0.875rem;
    font-weight: 500;
    cursor: pointer;
    transition: all 0.15s ease;
}

.btn-primary {
    background: var(--accent-blue);
    color: #fff;
}

.btn-primary:hover {
    background: #4c9aed;
}

.btn-secondary {
    background: var(--bg-tertiary);
    color: var(--text-secondary);
    border: 1px solid var(--border-color);
}

.btn-secondary:hover {
    background: var(--bg-hover);
    color: var(--text-primary);
}

.btn-small {
    padding: 4px 10px;
    font-size: 0.8125rem;
}

.btn:disabled {
    opacity: 0.5;
    cursor: not-allowed;
}

/* Sidebar buttons stretch to the full filter-section width. */
.filter-section .btn {
    width: 100%;
    margin-top: 8px;
}

.filter-section .btn-secondary {
    margin-top: 4px;
}

/* ========== Stats Panel ========== */
#stats-panel {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

#stats-panel .stat-item {
    display: flex;
    justify-content: space-between;
    padding: 6px 0;
    border-bottom: 1px solid var(--border-color);
}

#stats-panel .stat-item:last-child {
    border-bottom: none;
}

#stats-panel .stat-label {
    color: var(--text-muted);
}

#stats-panel .stat-value {
    color: var(--text-primary);
    font-family: var(--font-mono);
}

.placeholder {
    color: var(--text-muted);
    font-style: italic;
}
/* Head-to-Head Section */
.h2h-section {
    margin-top: 16px;
    padding-top: 16px;
    border-top: 1px solid var(--border-color);
}

.h2h-section h3 {
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin-bottom: 12px;
}

/* ELO Leaderboard Section */
.elo-section {
    margin-top: 16px;
    padding-top: 16px;
    border-top: 1px solid var(--border-color);
}

.elo-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 12px;
}

.elo-header h3 {
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin: 0;
}

.btn-link {
    background: transparent;
    border: none;
    color: var(--accent-blue);
    font-size: 0.75rem;
    cursor: pointer;
    padding: 2px 4px;
}

.btn-link:hover {
    text-decoration: underline;
}

#elo-panel {
    /* No height limit - show all models */
}

/* ELO Bar Chart Item */
.elo-item {
    display: flex;
    align-items: center;
    gap: 8px;
    margin-bottom: 8px;
    cursor: pointer;
    padding: 4px;
    border-radius: var(--radius-sm);
    transition: background 0.15s ease;
}

.elo-item:hover {
    background: var(--bg-hover);
}

.elo-rank {
    font-size: 0.75rem;
    font-weight: 600;
    color: var(--text-muted);
    min-width: 24px;
    text-align: center;
}

/* Medal colors: gold, silver, bronze */
.elo-rank.rank-1 {
    color: var(--accent-yellow);
}

.elo-rank.rank-2 {
    color: #c0c0c0;
}

.elo-rank.rank-3 {
    color: #cd7f32;
}

.elo-model-name {
    font-size: 0.75rem;
    color: var(--text-secondary);
    flex-shrink: 0;
    width: 80px;
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}

.elo-bar-container {
    flex: 1;
    height: 16px;
    background: var(--bg-tertiary);
    border-radius: 2px;
    overflow: hidden;
    position: relative;
}

.elo-bar {
    height: 100%;
    background: linear-gradient(90deg, var(--accent-blue), var(--accent-purple));
    border-radius: 2px;
    transition: width 0.3s ease;
}

.elo-value {
    font-size: 0.6875rem;
    font-family: var(--font-mono);
    color: var(--text-primary);
    min-width: 36px;
    text-align: right;
}
/* Leaderboard Modal */
.leaderboard-modal-header {
    display: flex;
    justify-content: space-between;
    align-items: center;
    padding: 20px;
    border-bottom: 1px solid var(--border-color);
}

.leaderboard-modal-header h2 {
    font-size: 1.25rem;
    color: var(--text-primary);
    margin: 0;
}

.subset-badge {
    font-size: 0.875rem;
    padding: 4px 12px;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
    color: var(--accent-blue);
}

#leaderboard-content {
    padding: 20px;
    max-height: calc(90vh - 80px); /* modal height minus its header */
    overflow-y: auto;
}

/* Leaderboard Table */
.leaderboard-table {
    width: 100%;
    border-collapse: collapse;
}

.leaderboard-table th,
.leaderboard-table td {
    padding: 12px 16px;
    text-align: left;
    border-bottom: 1px solid var(--border-color);
}

.leaderboard-table th {
    background: var(--bg-tertiary);
    color: var(--text-muted);
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    font-weight: 600;
    position: sticky; /* header row stays pinned while the table scrolls */
    top: 0;
}

.leaderboard-table td {
    font-size: 0.875rem;
    color: var(--text-primary);
}

.leaderboard-table tbody tr {
    cursor: pointer;
    transition: background 0.15s ease;
}

.leaderboard-table tbody tr:hover {
    background: var(--bg-hover);
}

.leaderboard-table .rank-cell {
    font-weight: 600;
    text-align: center;
    width: 60px;
}

/* Medal colors: gold, silver, bronze */
.leaderboard-table .rank-cell.rank-1 {
    color: var(--accent-yellow);
}

.leaderboard-table .rank-cell.rank-2 {
    color: #c0c0c0;
}

.leaderboard-table .rank-cell.rank-3 {
    color: #cd7f32;
}

.leaderboard-table .model-cell {
    font-family: var(--font-mono);
}

.leaderboard-table .elo-cell {
    font-family: var(--font-mono);
    font-weight: 600;
    color: var(--accent-blue);
}

.leaderboard-table .stat-cell {
    font-family: var(--font-mono);
    color: var(--text-secondary);
}

.leaderboard-table .win-rate-cell {
    width: 120px;
}

.win-rate-bar {
    display: flex;
    align-items: center;
    gap: 8px;
}

.win-rate-bar-bg {
    flex: 1;
    height: 6px;
    background: var(--bg-primary);
    border-radius: 3px;
    overflow: hidden;
}

.win-rate-bar-fill {
    height: 100%;
    background: var(--accent-green);
    border-radius: 3px;
}

.win-rate-text {
    font-family: var(--font-mono);
    font-size: 0.75rem;
    color: var(--accent-green);
    min-width: 40px;
    text-align: right;
}
/* Model Stats Modal */
#model-stats-content {
    padding: 20px;
}

.model-stats-header {
    margin-bottom: 24px;
    padding-bottom: 16px;
    border-bottom: 1px solid var(--border-color);
}

.model-stats-header h2 {
    font-size: 1.25rem;
    margin: 0 0 8px 0;
    font-family: var(--font-mono);
}

.model-stats-summary {
    display: flex;
    flex-wrap: wrap;
    gap: 16px;
}

.model-stat-item {
    background: var(--bg-tertiary);
    padding: 12px 16px;
    border-radius: var(--radius-sm);
    text-align: center;
    min-width: 100px;
}

.model-stat-item .stat-label {
    font-size: 0.6875rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    color: var(--text-muted);
    margin-bottom: 4px;
}

.model-stat-item .stat-value {
    font-size: 1.25rem;
    font-weight: 600;
    font-family: var(--font-mono);
    text-align: center;
}

/* Per-metric accent colors */
.model-stat-item .stat-value.elo-value {
    color: var(--accent-blue);
}

.model-stat-item .stat-value.wins-value {
    color: var(--accent-green);
}

.model-stat-item .stat-value.losses-value {
    color: var(--accent-red);
}

.model-stat-item .stat-value.ties-value {
    color: var(--accent-yellow);
}

.vs-stats-section h3 {
    font-size: 0.875rem;
    color: var(--text-muted);
    margin-bottom: 16px;
    text-transform: uppercase;
    letter-spacing: 0.05em;
}

.vs-stats-table {
    width: 100%;
    border-collapse: collapse;
}

.vs-stats-table th,
.vs-stats-table td {
    padding: 10px 12px;
    text-align: left;
    border-bottom: 1px solid var(--border-color);
}

.vs-stats-table th {
    background: var(--bg-tertiary);
    color: var(--text-muted);
    font-size: 0.75rem;
    text-transform: uppercase;
    letter-spacing: 0.05em;
    font-weight: 600;
}

.vs-stats-table td {
    font-size: 0.875rem;
    color: var(--text-primary);
}

.vs-stats-table tbody tr:hover {
    background: var(--bg-hover);
}

.vs-stats-table .opponent-cell {
    font-family: var(--font-mono);
}

.vs-stats-table .opponent-elo {
    font-size: 0.75rem;
    color: var(--text-muted);
    margin-left: 8px;
}

.vs-stats-table .wlt-cell {
    font-family: var(--font-mono);
}

/* Win/loss/tie counts use the shared accent palette */
.vs-stats-table .wlt-cell .wins {
    color: var(--accent-green);
}

.vs-stats-table .wlt-cell .losses {
    color: var(--accent-red);
}

.vs-stats-table .wlt-cell .ties {
    color: var(--accent-yellow);
}
/* Head-to-head stacked win/tie/loss bar */
.h2h-bar {
    display: flex;
    height: 24px;
    border-radius: var(--radius-sm);
    overflow: hidden;
    margin: 8px 0;
}

/* Segment for model A's wins */
.h2h-bar-a {
    background: var(--accent-green);
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 0.75rem;
    font-weight: 600;
    color: #000;
    min-width: 30px;
}

/* Segment for ties */
.h2h-bar-tie {
    background: var(--accent-yellow);
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 0.75rem;
    font-weight: 600;
    color: #000;
    min-width: 20px;
}

/* Segment for model B's wins */
.h2h-bar-b {
    background: var(--accent-red);
    display: flex;
    align-items: center;
    justify-content: center;
    font-size: 0.75rem;
    font-weight: 600;
    color: #fff;
    min-width: 30px;
}

.h2h-labels {
    display: flex;
    justify-content: space-between;
    font-size: 0.75rem;
    color: var(--text-secondary);
    margin-bottom: 4px;
}

.h2h-label {
    max-width: 45%;
    overflow: hidden;
    text-overflow: ellipsis;
    white-space: nowrap;
}

.h2h-stats-row {
    display: flex;
    justify-content: space-between;
    font-size: 0.8125rem;
    padding: 4px 0;
}

.h2h-stats-row .value {
    font-family: var(--font-mono);
    color: var(--text-primary);
}
/* ========== Content Area ========== */
.content {
    flex: 1;
    padding: 20px;
    overflow-y: auto;
}

.content-header,
.content-footer {
    display: flex;
    justify-content: space-between;
    align-items: center;
    margin-bottom: 16px;
}

.content-footer {
    margin-top: 16px;
    margin-bottom: 0;
    justify-content: flex-end;
}

.pagination-info {
    font-size: 0.875rem;
    color: var(--text-secondary);
}

.pagination-controls {
    display: flex;
    align-items: center;
    gap: 4px;
}

.page-numbers {
    display: flex;
    align-items: center;
    gap: 2px;
}

.page-number {
    min-width: 32px;
    height: 28px;
    padding: 0 8px;
    border: none;
    border-radius: var(--radius-sm);
    background: var(--bg-tertiary);
    color: var(--text-secondary);
    font-size: 0.8125rem;
    cursor: pointer;
    transition: all 0.15s ease;
}

.page-number:hover {
    background: var(--bg-hover);
    color: var(--text-primary);
}

.page-number.active {
    background: var(--accent-blue);
    color: #fff;
}

/* "..." gap markers are rendered as non-interactive page buttons. */
.page-number.ellipsis {
    background: transparent;
    cursor: default;
    color: var(--text-muted);
}

.page-number.ellipsis:hover {
    background: transparent;
    color: var(--text-muted);
}

.page-jump {
    display: flex;
    align-items: center;
    gap: 4px;
    margin-left: 8px;
}

.page-input {
    width: 60px;
    height: 28px;
    padding: 0 8px;
    border: 1px solid var(--border-color);
    border-radius: var(--radius-sm);
    background: var(--bg-tertiary);
    color: var(--text-primary);
    font-size: 0.8125rem;
    text-align: center;
}

.page-input:focus {
    outline: none;
    border-color: var(--accent-blue);
}

/* Hide the native number-input spinner buttons (WebKit). */
.page-input::-webkit-outer-spin-button,
.page-input::-webkit-inner-spin-button {
    -webkit-appearance: none;
    margin: 0;
}

/* Hide the spinner in Firefox. The standard `appearance: textfield` is
   added alongside the deprecated -moz- prefixed form for modern browsers. */
.page-input[type=number] {
    -moz-appearance: textfield;
    appearance: textfield;
}
/* ========== Battle List ========== */
.battle-list {
    display: flex;
    flex-direction: column;
    gap: 12px;
}

/* Shown when no battles match the active filters */
.empty-state {
    text-align: center;
    padding: 60px 20px;
    color: var(--text-muted);
}
/* ========== Battle Card ========== */
.battle-card {
    background: var(--bg-secondary);
    border: 1px solid var(--border-color);
    border-radius: var(--radius-md);
    padding: 16px;
    cursor: pointer;
    transition: all 0.15s ease;
}

.battle-card:hover {
    border-color: var(--border-light);
    background: var(--bg-tertiary);
}

.battle-card-header {
    display: flex;
    justify-content: space-between;
    align-items: flex-start;
    margin-bottom: 12px;
}

.battle-models {
    display: flex;
    align-items: center;
    gap: 8px;
    flex-wrap: wrap;
}

.model-name {
    font-family: var(--font-mono);
    font-size: 0.875rem;
    padding: 2px 8px;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
}

.model-name.winner {
    background: rgba(63, 185, 80, 0.2);
    color: var(--accent-green);
}

.model-name.loser {
    background: rgba(248, 81, 73, 0.15);
    color: var(--accent-red);
}

.vs-label {
    color: var(--text-muted);
    font-size: 0.75rem;
}

.battle-badges {
    display: flex;
    gap: 6px;
}

.badge {
    display: inline-flex;
    align-items: center;
    justify-content: center;
    font-size: 0.6875rem;
    padding: 2px 6px;
    border-radius: var(--radius-sm);
    text-transform: uppercase;
    font-weight: 600;
    letter-spacing: 0.02em;
    line-height: 1;
}

.badge-win {
    background: rgba(63, 185, 80, 0.2);
    color: var(--accent-green);
}

.badge-loss {
    background: rgba(248, 81, 73, 0.15);
    color: var(--accent-red);
}

.badge-tie {
    background: rgba(210, 153, 34, 0.2);
    color: var(--accent-yellow);
}

.badge-consistent {
    background: rgba(88, 166, 255, 0.15);
    color: var(--accent-blue);
}

.badge-inconsistent {
    background: rgba(163, 113, 247, 0.15);
    color: var(--accent-purple);
}

/* Instruction preview, clamped to two lines */
.battle-instruction {
    font-size: 0.875rem;
    color: var(--text-secondary);
    margin-bottom: 12px;
    display: -webkit-box;
    -webkit-line-clamp: 2;
    -webkit-box-orient: vertical;
    overflow: hidden;
}

.battle-images {
    display: grid;
    grid-template-columns: repeat(3, 1fr);
    gap: 8px;
}

.battle-image-container {
    position: relative;
    aspect-ratio: 1;
    background: var(--bg-tertiary);
    border-radius: var(--radius-sm);
    overflow: hidden;
}

.battle-image-container img {
    width: 100%;
    height: 100%;
    object-fit: cover;
}

/* Multiple input images grid */
.battle-image-container.multi-input {
    display: flex;
    flex-direction: column;
}

.input-thumbs-grid {
    flex: 1;
    display: grid;
    gap: 2px;
    padding: 2px;
    min-height: 0; /* Prevent grid from growing based on content */
}

/* Dynamic grid based on image count */
.battle-image-container.multi-input[data-count="2"] .input-thumbs-grid {
    grid-template-columns: repeat(2, 1fr);
}

.battle-image-container.multi-input[data-count="3"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="4"] .input-thumbs-grid {
    grid-template-columns: 1fr 1fr;
    grid-template-rows: 1fr 1fr;
}

.battle-image-container.multi-input[data-count="3"] .input-thumb,
.battle-image-container.multi-input[data-count="4"] .input-thumb {
    aspect-ratio: 1;
    min-height: 0;
    min-width: 0;
}

.battle-image-container.multi-input[data-count="5"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="6"] .input-thumbs-grid {
    grid-template-columns: repeat(3, 1fr);
    grid-template-rows: repeat(2, 1fr);
}

.battle-image-container.multi-input[data-count="7"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="8"] .input-thumbs-grid,
.battle-image-container.multi-input[data-count="9"] .input-thumbs-grid {
    grid-template-columns: repeat(3, 1fr);
    grid-template-rows: repeat(3, 1fr);
}

/* Fallback for 10+ images; the [data-count] rules above win on specificity
   for the explicit counts. */
.battle-image-container.multi-input .input-thumbs-grid {
    grid-template-columns: repeat(auto-fit, minmax(40px, 1fr));
}

.input-thumb {
    position: relative;
    overflow: hidden;
    border-radius: 2px;
    background: var(--bg-primary);
    aspect-ratio: 1;
}

.input-thumb img {
    width: 100%;
    height: 100%;
    object-fit: cover;
    object-position: center;
}

.image-label {
    position: absolute;
    bottom: 4px;
    left: 4px;
    font-size: 0.625rem;
    padding: 2px 4px;
    background: rgba(0, 0, 0, 0.7);
    border-radius: 2px;
    color: #fff;
    z-index: 1;
}

.battle-meta {
    display: flex;
    flex-wrap: wrap;
    gap: 8px;
    justify-content: space-between;
    margin-top: 8px;
    font-size: 0.75rem;
    color: var(--text-muted);
}

.battle-meta span {
    white-space: nowrap;
}
1237
+
1238
/* ========== Modal ========== */

/* Full-viewport overlay that centers its content */
.modal {
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  z-index: 1000;
  display: flex;
  align-items: center;
  justify-content: center;
}

.modal.hidden {
  display: none;
}

.modal-backdrop {
  position: absolute;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  background: rgba(0, 0, 0, 0.7);
}

.modal-content {
  position: relative;
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-lg);
  max-width: 1200px;
  max-height: 90vh;
  width: 95%;
  overflow-y: auto;
  box-shadow: var(--shadow-lg);
}

/* Variant that stretches to nearly the full viewport width */
.modal-content-wide {
  max-width: 95vw;
  width: 95vw;
}

.modal-close {
  position: absolute;
  top: 12px;
  right: 12px;
  width: 32px;
  height: 32px;
  border: none;
  background: var(--bg-tertiary);
  border-radius: var(--radius-sm);
  color: var(--text-secondary);
  font-size: 1.25rem;
  cursor: pointer;
  z-index: 10;
}

.modal-close:hover {
  background: var(--bg-hover);
  color: var(--text-primary);
}
/* ========== Detail View ========== */

.detail-header {
  padding: 20px;
  border-bottom: 1px solid var(--border-color);
}

.detail-header h2 {
  font-size: 1.125rem;
  margin-bottom: 8px;
}

.detail-meta-info {
  display: flex;
  flex-wrap: wrap;
  gap: 12px;
  margin-top: 8px;
  margin-bottom: 8px;
}

.meta-tag {
  font-size: 0.8125rem;
  color: var(--text-secondary);
  background: var(--bg-tertiary);
  padding: 4px 10px;
  border-radius: var(--radius-sm);
}

.meta-tag strong {
  color: var(--text-muted);
  font-weight: 500;
}

.detail-instruction {
  font-size: 0.9375rem;
  color: var(--text-secondary);
  background: var(--bg-tertiary);
  padding: 12px;
  border-radius: var(--radius-sm);
  margin-top: 12px;
}

.detail-metadata-section {
  margin-top: 16px;
}

.detail-metadata-section h4 {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 8px;
}

/* Raw JSON dump shown in a scrollable, monospace box */
.metadata-json {
  font-family: var(--font-mono);
  font-size: 0.75rem;
  line-height: 1.5;
  white-space: pre-wrap;
  word-break: break-word;
  color: var(--text-secondary);
  background: var(--bg-primary);
  padding: 12px;
  border-radius: var(--radius-sm);
  max-height: 200px;
  overflow-y: auto;
  border: 1px solid var(--border-color);
}

.detail-images {
  display: grid;
  grid-template-columns: repeat(3, 1fr);
  gap: 16px;
  padding: 20px;
  border-bottom: 1px solid var(--border-color);
}

.detail-image-box {
  text-align: center;
}

.detail-image-box.input-image {
  background: var(--bg-tertiary);
  border-radius: var(--radius-md);
  padding: 12px;
}

.detail-image-box.output-image {
  background: var(--bg-tertiary);
  border: 2px solid var(--border-light);
  border-radius: var(--radius-md);
  padding: 12px;
  position: relative;
}

/* Winner/loser accents on the output boxes */
.detail-image-box.output-image.winner {
  border-color: var(--accent-green);
  box-shadow: 0 0 12px rgba(63, 185, 80, 0.2);
}

.detail-image-box.output-image.loser {
  border-color: var(--accent-red);
  opacity: 0.85;
}

.detail-image-box h4 {
  font-size: 0.8125rem;
  color: var(--text-secondary);
  margin-bottom: 8px;
}

.detail-image-box.output-image.winner h4 {
  color: var(--accent-green);
}

.detail-image-box.output-image.loser h4 {
  color: var(--accent-red);
}

.detail-image-box img {
  width: 100%;
  max-height: 400px;
  object-fit: contain;
  background: var(--bg-primary);
  border-radius: var(--radius-sm);
  cursor: zoom-in;
  /* NOTE(review): transition targets `transform` but the :hover rule
     below changes `opacity`, so the fade is not animated — confirm intent */
  transition: transform 0.2s ease;
}

.detail-image-box img:hover {
  opacity: 0.9;
}
/* ===== Image Lightbox (full-screen zoom overlay) ===== */

.lightbox {
  position: fixed;
  top: 0;
  left: 0;
  right: 0;
  bottom: 0;
  z-index: 2000;
  background: rgba(0, 0, 0, 0.95);
  display: flex;
  align-items: center;
  justify-content: center;
  cursor: zoom-out;
  opacity: 0;
  visibility: hidden;
  transition: opacity 0.2s ease, visibility 0.2s ease;
}

.lightbox.active {
  opacity: 1;
  visibility: visible;
}

.lightbox img {
  max-width: 95vw;
  max-height: 95vh;
  object-fit: contain;
  border-radius: var(--radius-md);
  box-shadow: var(--shadow-lg);
}

.lightbox-close {
  position: absolute;
  top: 20px;
  right: 20px;
  width: 40px;
  height: 40px;
  border: none;
  background: var(--bg-tertiary);
  border-radius: 50%;
  color: var(--text-primary);
  font-size: 1.5rem;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
}

.lightbox-close:hover {
  background: var(--bg-hover);
}

.lightbox-label {
  position: absolute;
  bottom: 20px;
  left: 50%;
  transform: translateX(-50%);
  background: var(--bg-tertiary);
  padding: 8px 16px;
  border-radius: var(--radius-sm);
  color: var(--text-primary);
  font-size: 0.875rem;
}

/* ===== VLM judge output panels ===== */

.detail-vlm-outputs {
  padding: 20px;
}

.detail-vlm-outputs h3 {
  font-size: 0.875rem;
  color: var(--text-muted);
  margin-bottom: 12px;
  text-transform: uppercase;
  letter-spacing: 0.05em;
}

.vlm-call {
  background: var(--bg-tertiary);
  border-radius: var(--radius-md);
  padding: 16px;
  margin-bottom: 16px;
}

.vlm-call h4 {
  font-size: 0.8125rem;
  color: var(--accent-blue);
  margin-bottom: 8px;
}

.vlm-call-meta {
  font-size: 0.75rem;
  color: var(--text-muted);
  margin-bottom: 12px;
}

.vlm-response {
  font-family: var(--font-mono);
  font-size: 0.8125rem;
  line-height: 1.5;
  white-space: pre-wrap;
  word-break: break-word;
  color: var(--text-secondary);
  background: var(--bg-primary);
  padding: 12px;
  border-radius: var(--radius-sm);
  max-height: 300px;
  overflow-y: auto;
}

/* ========== Loading State ========== */

.loading {
  text-align: center;
  padding: 40px;
  color: var(--text-muted);
}

/* Inline spinner appended after the loading text */
.loading::after {
  content: '';
  display: inline-block;
  width: 20px;
  height: 20px;
  border: 2px solid var(--border-color);
  border-top-color: var(--accent-blue);
  border-radius: 50%;
  animation: spin 0.8s linear infinite;
  margin-left: 8px;
  vertical-align: middle;
}

@keyframes spin {
  to { transform: rotate(360deg); }
}
/* ========== Favorites Modal ========== */

.favorites-modal-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 20px;
  border-bottom: 1px solid var(--border-color);
}

.favorites-modal-header h2 {
  font-size: 1.25rem;
  color: var(--text-primary);
}

/* Scrollable body below the fixed modal header */
.favorites-scrollable {
  max-height: calc(90vh - 80px);
  overflow-y: auto;
  padding: 20px;
}

/* Model filter panel, laid out horizontally */
.favorites-model-filter {
  padding: 16px;
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  margin-bottom: 20px;
}

.favorites-model-filter label {
  display: block;
  font-size: 0.875rem;
  font-weight: 500;
  margin-bottom: 12px;
  color: var(--text-primary);
}

/* Checkboxes rendered as pill-style toggle chips */
.checkbox-group-horizontal {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  margin-bottom: 12px;
}

.checkbox-group-horizontal .checkbox-item {
  display: inline-flex;
  align-items: center;
  padding: 6px 12px;
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-sm);
  cursor: pointer;
  transition: all 0.15s ease;
  white-space: nowrap;
}

.checkbox-group-horizontal .checkbox-item:hover {
  background: var(--bg-hover);
  border-color: var(--accent-blue);
}

.checkbox-group-horizontal .checkbox-item.selected {
  background: var(--accent-blue);
  border-color: var(--accent-blue);
  color: white;
}

.checkbox-group-horizontal .checkbox-item.selected .checkbox-label {
  color: white;
}

/* Visually hide the native checkbox; the chip conveys selection state */
.checkbox-group-horizontal .checkbox-item input[type="checkbox"] {
  position: absolute;
  opacity: 0;
  width: 0;
  height: 0;
}

.checkbox-group-horizontal .checkbox-item .checkbox-label {
  font-size: 0.8125rem;
  color: var(--text-primary);
  cursor: pointer;
  margin: 0;
}

.checkbox-actions-inline {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}

.checkbox-actions-inline .btn-small {
  font-size: 0.75rem;
}

/* Buttons and the stats-scope toggle share one row */
.filter-controls-row {
  display: flex;
  justify-content: space-between;
  align-items: center;
  flex-wrap: wrap;
  gap: 12px;
}

.stats-scope-toggle {
  display: flex;
  align-items: center;
}

.stats-scope-toggle .toggle-label {
  display: flex;
  align-items: center;
  cursor: pointer;
  font-size: 0.8125rem;
  color: var(--text-secondary);
  gap: 6px;
}

.stats-scope-toggle .toggle-label input[type="checkbox"] {
  width: 16px;
  height: 16px;
  accent-color: var(--accent-blue);
  cursor: pointer;
}

.stats-scope-toggle .toggle-text {
  user-select: none;
}

.stats-scope-toggle .toggle-label:hover .toggle-text {
  color: var(--text-primary);
}

#favorites-content {
  /* padding intentionally omitted — handled by .favorites-scrollable */
}

.favorites-empty {
  text-align: center;
  padding: 60px 20px;
  color: var(--text-muted);
}
/* ===== Favorite prompt card ===== */

.favorite-prompt-card {
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  margin-bottom: 24px;
  overflow: hidden;
}

.favorite-prompt-header {
  display: flex;
  justify-content: space-between;
  align-items: flex-start;
  padding: 16px;
  border-bottom: 1px solid var(--border-color);
  background: var(--bg-secondary);
}

.favorite-prompt-info {
  flex: 1;
}

.favorite-prompt-instruction {
  font-size: 0.9375rem;
  color: var(--text-primary);
  margin-bottom: 8px;
  line-height: 1.5;
}

.favorite-prompt-meta {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  font-size: 0.75rem;
  color: var(--text-muted);
}

.favorite-prompt-meta span {
  background: var(--bg-primary);
  padding: 2px 8px;
  border-radius: var(--radius-sm);
}

.favorite-prompt-actions {
  display: flex;
  gap: 8px;
}

.btn-unfavorite {
  background: transparent;
  border: 1px solid var(--accent-red);
  color: var(--accent-red);
  padding: 4px 12px;
  border-radius: var(--radius-sm);
  font-size: 0.75rem;
  cursor: pointer;
  transition: all 0.15s ease;
}

.btn-unfavorite:hover {
  background: rgba(248, 81, 73, 0.15);
}

/* ===== Favorite input images ===== */

.favorite-input-section {
  padding: 12px 16px;
  background: var(--bg-primary);
  border-bottom: 1px solid var(--border-color);
}

.favorite-input-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 8px;
}

.favorite-input-images {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}

.favorite-input-image {
  width: 80px;
  height: 80px;
  border-radius: var(--radius-sm);
  overflow: hidden;
  cursor: zoom-in;
  border: 1px solid var(--border-color);
}

.favorite-input-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
}

/* ===== Favorite models grid ===== */

.favorite-models-section {
  padding: 16px;
}

.favorite-models-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 12px;
}

.favorite-models-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(180px, 1fr));
  gap: 12px;
}

.favorite-model-card {
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-sm);
  overflow: hidden;
  transition: all 0.15s ease;
}

.favorite-model-card:hover {
  border-color: var(--border-light);
}

/* Podium accents: gold / silver / bronze */
.favorite-model-card.rank-1 {
  border-color: var(--accent-yellow);
  box-shadow: 0 0 8px rgba(210, 153, 34, 0.2);
}

.favorite-model-card.rank-2 {
  border-color: var(--text-muted);
}

.favorite-model-card.rank-3 {
  border-color: #cd7f32;
}

.favorite-model-image {
  aspect-ratio: 1;
  width: 100%;
  overflow: hidden;
  cursor: zoom-in;
  background: var(--bg-primary);
}

.favorite-model-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
  transition: transform 0.2s ease;
}

.favorite-model-image:hover img {
  transform: scale(1.05);
}

.favorite-model-info {
  padding: 8px;
  border-top: 1px solid var(--border-color);
}

.favorite-model-name {
  font-family: var(--font-mono);
  font-size: 0.75rem;
  color: var(--text-primary);
  margin-bottom: 4px;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}

.favorite-model-rank {
  display: inline-block;
  margin-right: 4px;
  font-size: 0.6875rem;
}

.favorite-model-rank.rank-1 {
  color: var(--accent-yellow);
}

.favorite-model-rank.rank-2 {
  color: var(--text-muted);
}

.favorite-model-rank.rank-3 {
  color: #cd7f32;
}

.favorite-model-stats {
  font-size: 0.6875rem;
  color: var(--text-muted);
}

.favorite-model-stats .win-rate {
  color: var(--accent-green);
  font-weight: 600;
}

.favorite-model-stats .wins {
  color: var(--accent-green);
}

.favorite-model-stats .losses {
  color: var(--accent-red);
}

.favorite-model-stats .ties {
  color: var(--accent-yellow);
}

/* ===== Battle card favorite button ===== */

.battle-card-header {
  display: flex;
  justify-content: space-between;
  align-items: flex-start;
  margin-bottom: 12px;
}

.btn-favorite-toggle {
  background: transparent;
  border: none;
  font-size: 1.25rem;
  cursor: pointer;
  color: var(--text-muted);
  padding: 4px;
  line-height: 1;
  transition: all 0.15s ease;
}

.btn-favorite-toggle:hover {
  color: var(--accent-yellow);
  transform: scale(1.1);
}

.btn-favorite-toggle.favorited {
  color: var(--accent-yellow);
}

/* ===== Favorites loading state ===== */

.favorite-loading {
  display: flex;
  flex-direction: column;
  align-items: center;
  justify-content: center;
  padding: 40px;
  color: var(--text-muted);
}

/* Spinner below the loading text; reuses the `spin` keyframes */
.favorite-loading::after {
  content: '';
  display: block;
  width: 30px;
  height: 30px;
  border: 3px solid var(--border-color);
  border-top-color: var(--accent-blue);
  border-radius: 50%;
  animation: spin 0.8s linear infinite;
  margin-top: 12px;
}
/* ========== Prompts View ========== */

.prompts-list {
  display: flex;
  flex-direction: column;
  gap: 24px;
}

/* Prompt card — mirrors .favorite-prompt-card but stands alone */
.prompt-card {
  background: var(--bg-secondary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  overflow: hidden;
  transition: all 0.15s ease;
}

.prompt-card:hover {
  border-color: var(--border-light);
}

.prompt-card-header {
  display: flex;
  justify-content: space-between;
  align-items: flex-start;
  padding: 16px;
  border-bottom: 1px solid var(--border-color);
  background: var(--bg-tertiary);
}

.prompt-card-info {
  flex: 1;
}

.prompt-card-instruction {
  font-size: 0.9375rem;
  color: var(--text-primary);
  margin-bottom: 8px;
  line-height: 1.5;
}

.prompt-card-meta {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  font-size: 0.75rem;
  color: var(--text-muted);
}

.prompt-card-meta span {
  background: var(--bg-primary);
  padding: 2px 8px;
  border-radius: var(--radius-sm);
}

.prompt-card-actions {
  display: flex;
  gap: 8px;
}

/* ===== Prompt input images ===== */

.prompt-input-section {
  padding: 12px 16px;
  background: var(--bg-primary);
  border-bottom: 1px solid var(--border-color);
}

.prompt-input-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 8px;
}

.prompt-input-images {
  display: flex;
  gap: 8px;
  flex-wrap: wrap;
}

.prompt-input-image {
  width: 100px;
  height: 100px;
  border-radius: var(--radius-sm);
  overflow: hidden;
  cursor: zoom-in;
  border: 1px solid var(--border-color);
  background: var(--bg-secondary);
}

.prompt-input-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
}

/* ===== Prompt models grid ===== */

.prompt-models-section {
  padding: 16px;
}

.prompt-models-title {
  font-size: 0.75rem;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  color: var(--text-muted);
  margin-bottom: 12px;
}

.prompt-models-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
  gap: 12px;
}

.prompt-model-card {
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-sm);
  overflow: hidden;
  transition: all 0.15s ease;
}

.prompt-model-card:hover {
  border-color: var(--border-light);
}

/* Podium accents: gold / silver / bronze */
.prompt-model-card.rank-1 {
  border-color: var(--accent-yellow);
  box-shadow: 0 0 8px rgba(210, 153, 34, 0.2);
}

.prompt-model-card.rank-2 {
  border-color: var(--text-muted);
}

.prompt-model-card.rank-3 {
  border-color: #cd7f32;
}

.prompt-model-image {
  aspect-ratio: 1;
  width: 100%;
  overflow: hidden;
  cursor: zoom-in;
  background: var(--bg-primary);
}

.prompt-model-image img {
  width: 100%;
  height: 100%;
  object-fit: cover;
  transition: transform 0.2s ease;
}

.prompt-model-image:hover img {
  transform: scale(1.05);
}

.prompt-model-info {
  padding: 8px;
  border-top: 1px solid var(--border-color);
}

.prompt-model-name {
  font-family: var(--font-mono);
  font-size: 0.75rem;
  color: var(--text-primary);
  margin-bottom: 4px;
  white-space: nowrap;
  overflow: hidden;
  text-overflow: ellipsis;
}

.prompt-model-rank {
  display: inline-block;
  margin-right: 4px;
  font-size: 0.75rem;
}

.prompt-model-rank.rank-1 {
  color: var(--accent-yellow);
}

.prompt-model-rank.rank-2 {
  color: var(--text-muted);
}

.prompt-model-rank.rank-3 {
  color: #cd7f32;
}

.prompt-model-stats {
  font-size: 0.6875rem;
  color: var(--text-muted);
}

.prompt-model-stats .win-rate {
  color: var(--accent-green);
  font-weight: 600;
}

.prompt-model-stats .wins {
  color: var(--accent-green);
}

.prompt-model-stats .losses {
  color: var(--accent-red);
}

.prompt-model-stats .ties {
  color: var(--accent-yellow);
}
/* ========== Clickable Model Name ========== */

.prompt-model-name.clickable {
  cursor: pointer;
  transition: color 0.2s ease;
}

.prompt-model-name.clickable:hover {
  color: var(--accent-blue);
  text-decoration: underline;
}
/* ========== Model Battles Modal ========== */

.model-battles-modal {
  max-height: 80vh;
  overflow-y: auto;
  padding: 20px;
}

.model-battles-header {
  margin-bottom: 20px;
  padding-bottom: 12px;
  border-bottom: 1px solid var(--border-color);
}

.model-battles-header h2 {
  margin: 0 0 4px 0;
  font-size: 1.25rem;
  color: var(--text-primary);
}

.model-battles-subtitle {
  margin: 0 0 8px 0;
  font-size: 0.875rem;
  color: var(--text-muted);
}

.model-battles-stats {
  display: flex;
  flex-wrap: wrap;
  gap: 16px;
  font-size: 0.875rem;
}

.model-battles-stats .stat-item {
  color: var(--text-secondary);
}

/* ===== Opponent filter panel ===== */

.model-battles-filter {
  background: var(--bg-card);
  border-radius: 8px;
  padding: 12px;
  margin-bottom: 16px;
}

.model-battles-filter .filter-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  margin-bottom: 12px;
}

.model-battles-filter .filter-header h4 {
  margin: 0;
  font-size: 0.875rem;
  color: var(--text-primary);
}

.model-battles-filter .filter-actions {
  display: flex;
  gap: 8px;
}

.opponent-checkboxes {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  max-height: 150px;
  overflow-y: auto;
}

.opponent-checkbox {
  display: flex;
  align-items: center;
  gap: 4px;
  background: var(--bg-main);
  padding: 4px 8px;
  border-radius: 4px;
  font-size: 0.75rem;
  cursor: pointer;
  white-space: nowrap;
}

.opponent-checkbox input {
  cursor: pointer;
}

.opponent-checkbox span {
  color: var(--text-secondary);
}

.opponent-checkbox:hover {
  background: var(--bg-hover);
}

.model-battles-list {
  margin-top: 16px;
}

.model-battles-list h4 {
  margin: 0 0 12px 0;
  font-size: 0.875rem;
  color: var(--text-primary);
}

/* ===== Battle record cards (expandable) ===== */

.battle-records-container {
  display: flex;
  flex-direction: column;
  gap: 8px;
}

.battle-record-card {
  background: var(--bg-card);
  border-radius: 8px;
  border: 1px solid var(--border-color);
  overflow: hidden;
}

.battle-record-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 12px 16px;
  cursor: pointer;
  transition: background-color 0.2s ease;
}

.battle-record-header:hover {
  background: var(--bg-hover);
}

.battle-record-info {
  display: flex;
  align-items: center;
  gap: 12px;
  flex-wrap: wrap;
}

.battle-opponent {
  font-weight: 500;
  color: var(--text-primary);
  font-size: 0.875rem;
}

/* Chevron flips when the card is expanded */
.expand-icon {
  color: var(--text-muted);
  font-size: 0.75rem;
  transition: transform 0.2s ease;
}

.battle-record-card.expanded .expand-icon {
  transform: rotate(180deg);
}

/* ===== Judge outputs (revealed on expand) ===== */

.battle-card-judge-outputs {
  display: none;
  padding: 12px 16px 16px;
  border-top: 1px solid var(--border-color);
  background: var(--bg-main);
}

.battle-record-card.expanded .battle-card-judge-outputs {
  display: block;
}

.judge-outputs-title {
  font-size: 0.8125rem;
  font-weight: 600;
  color: var(--text-primary);
  margin-bottom: 12px;
}

.judge-call {
  background: var(--bg-card);
  border-radius: 6px;
  padding: 12px;
  margin-bottom: 10px;
}

.judge-call:last-child {
  margin-bottom: 0;
}

.judge-call-header {
  display: flex;
  align-items: center;
  gap: 8px;
  margin-bottom: 8px;
}

.judge-call-label {
  font-size: 0.75rem;
  font-weight: 600;
  color: var(--accent-blue);
  text-transform: uppercase;
}

.judge-call-order {
  font-size: 0.75rem;
  color: var(--text-muted);
}

.judge-call-meta {
  font-size: 0.75rem;
  color: var(--text-secondary);
  margin-bottom: 8px;
}

.judge-call-response {
  font-size: 0.75rem;
  color: var(--text-secondary);
  background: var(--bg-main);
  padding: 10px;
  border-radius: 4px;
  white-space: pre-wrap;
  word-break: break-word;
  max-height: 200px;
  overflow-y: auto;
  line-height: 1.5;
  font-family: var(--font-mono);
}

/* ===== Battles table ===== */

.battles-table {
  width: 100%;
  border-collapse: collapse;
  font-size: 0.8125rem;
}

.battles-table th,
.battles-table td {
  padding: 8px 12px;
  text-align: left;
  border-bottom: 1px solid var(--border-color);
}

.battles-table th {
  background: var(--bg-card);
  color: var(--text-primary);
  font-weight: 600;
}

.battles-table td {
  color: var(--text-secondary);
}

.battles-table tbody tr:hover {
  background: var(--bg-hover);
}

/* Result badges */
.badge.result-win {
  background: rgba(16, 185, 129, 0.2);
  color: var(--accent-green);
}

.badge.result-loss {
  background: rgba(239, 68, 68, 0.2);
  color: var(--accent-red);
}

.badge.result-tie {
  background: rgba(245, 158, 11, 0.2);
  color: var(--accent-yellow);
}
/* ========== Responsive ========== */

/* Tablet: narrower sidebar, two-column image grids */
@media (max-width: 1024px) {
  .sidebar {
    width: 240px;
  }

  .battle-images {
    grid-template-columns: repeat(2, 1fr);
  }

  .detail-images {
    grid-template-columns: repeat(2, 1fr);
  }
}

/* Phone: stack the header, sidebar, and image grids vertically */
@media (max-width: 768px) {
  .header {
    flex-direction: column;
    gap: 12px;
    padding: 12px 16px;
  }

  .header-center {
    flex-direction: column;
    width: 100%;
  }

  .selector-group {
    width: 100%;
  }

  .selector {
    flex: 1;
  }

  .main-container {
    flex-direction: column;
  }

  .sidebar {
    width: 100%;
    border-right: none;
    border-bottom: 1px solid var(--border-color);
  }

  .battle-images,
  .detail-images {
    grid-template-columns: 1fr;
  }
}
/* ========== Opponent Sections (Collapsible) ========== */

.model-battles-hint {
  font-size: 0.75rem;
  color: var(--text-muted);
  margin-bottom: 12px;
  font-style: italic;
}

.opponent-sections-container {
  display: flex;
  flex-direction: column;
  gap: 8px;
}

.opponent-section {
  background: var(--bg-tertiary);
  border: 1px solid var(--border-color);
  border-radius: var(--radius-md);
  overflow: hidden;
}

.opponent-section.hidden {
  display: none;
}

.opponent-section-header {
  display: flex;
  justify-content: space-between;
  align-items: center;
  padding: 12px 16px;
  cursor: pointer;
  transition: background-color 0.2s ease;
  user-select: none;
}

.opponent-section-header:hover {
  background: var(--bg-hover);
}

.opponent-section-info {
  display: flex;
  align-items: center;
  gap: 16px;
  flex-wrap: wrap;
}

.opponent-name {
  font-weight: 600;
  font-size: 0.9375rem;
  color: var(--text-primary);
}

/* Name tinted by overall result against this opponent */
.opponent-name.result-win {
  color: var(--accent-green);
}

.opponent-name.result-loss {
  color: var(--accent-red);
}

.opponent-name.result-tie {
  color: var(--accent-yellow);
}

.opponent-stats {
  font-size: 0.8125rem;
  color: var(--text-secondary);
  font-family: var(--font-mono);
}

.opponent-stats .wins {
  color: var(--accent-green);
}

.opponent-stats .losses {
  color: var(--accent-red);
}

.opponent-stats .ties {
  color: var(--accent-yellow);
}

.opponent-section .expand-icon {
  color: var(--text-muted);
  font-size: 0.875rem;
  transition: transform 0.2s ease;
}

.opponent-section.expanded .expand-icon {
  transform: rotate(180deg);
}

.opponent-section-content {
  display: none;
  padding: 0 16px 16px;
  border-top: 1px solid var(--border-color);
  background: var(--bg-secondary);
}

.opponent-section.expanded .opponent-section-content {
  display: block;
}

/* Single battle record inside an opponent section */
.battle-record-item {
  background: var(--bg-tertiary);
  border-radius: var(--radius-sm);
  padding: 12px;
  margin-top: 12px;
}
+
2632
+ .battle-record-item:first-child {
2633
+ margin-top: 16px;
2634
+ }
2635
+
2636
+ .battle-record-item-header {
2637
+ display: flex;
2638
+ align-items: center;
2639
+ gap: 8px;
2640
+ flex-wrap: wrap;
2641
+ margin-bottom: 12px;
2642
+ }
2643
+
2644
+ .battle-exp-name {
2645
+ font-size: 0.75rem;
2646
+ color: var(--text-muted);
2647
+ background: var(--bg-primary);
2648
+ padding: 2px 6px;
2649
+ border-radius: var(--radius-sm);
2650
+ }
2651
+
2652
+ /* Battle Judge Outputs (inside battle record item) */
2653
+ .battle-judge-outputs {
2654
+ display: flex;
2655
+ flex-direction: column;
2656
+ gap: 12px;
2657
+ }
2658
+
2659
+ .battle-judge-outputs .judge-call {
2660
+ background: var(--bg-primary);
2661
+ border-radius: var(--radius-sm);
2662
+ padding: 12px;
2663
+ margin-bottom: 0;
2664
+ }
2665
+
2666
+ .battle-judge-outputs .placeholder {
2667
+ font-size: 0.75rem;
2668
+ color: var(--text-muted);
2669
+ font-style: italic;
2670
+ padding: 8px;
2671
+ text-align: center;
2672
+ }
2673
+
2674
+
2675
+ /* ========== Search Box ========== */
2676
+ .search-box {
2677
+ display: flex;
2678
+ align-items: center;
2679
+ gap: 4px;
2680
+ }
2681
+
2682
+ .search-input {
2683
+ width: 200px;
2684
+ padding: 6px 12px;
2685
+ background: var(--bg-tertiary);
2686
+ border: 1px solid var(--border-color);
2687
+ border-radius: var(--radius-sm);
2688
+ color: var(--text-primary);
2689
+ font-size: 0.875rem;
2690
+ }
2691
+
2692
+ .search-input:focus {
2693
+ outline: none;
2694
+ border-color: var(--accent-blue);
2695
+ }
2696
+
2697
+ .search-input::placeholder {
2698
+ color: var(--text-muted);
2699
+ }
2700
+
2701
+ .search-btn, .clear-search-btn {
2702
+ padding: 6px 10px;
2703
+ min-width: auto;
2704
+ }
2705
+
2706
+ .clear-search-btn {
2707
+ color: var(--accent-red);
2708
+ }
2709
+
2710
+ /* Search results highlight */
2711
+ .search-highlight {
2712
+ background: rgba(88, 166, 255, 0.3);
2713
+ padding: 0 2px;
2714
+ border-radius: 2px;
2715
+ }
2716
+
2717
+ /* ========== Compare View ========== */
2718
+ .compare-list {
2719
+ display: flex;
2720
+ flex-direction: column;
2721
+ gap: 16px;
2722
+ }
2723
+
2724
+ .compare-controls {
2725
+ display: flex;
2726
+ justify-content: center;
2727
+ padding: 16px;
2728
+ background: var(--bg-secondary);
2729
+ border-radius: var(--radius-md);
2730
+ border: 1px solid var(--border-color);
2731
+ }
2732
+
2733
+ .compare-input-group {
2734
+ display: flex;
2735
+ align-items: center;
2736
+ gap: 12px;
2737
+ }
2738
+
2739
+ .compare-input-group label {
2740
+ color: var(--text-secondary);
2741
+ font-size: 0.875rem;
2742
+ }
2743
+
2744
+ .compare-sample-input {
2745
+ width: 120px;
2746
+ padding: 8px 12px;
2747
+ background: var(--bg-tertiary);
2748
+ border: 1px solid var(--border-color);
2749
+ border-radius: var(--radius-sm);
2750
+ color: var(--text-primary);
2751
+ font-size: 0.875rem;
2752
+ }
2753
+
2754
+ .compare-content {
2755
+ padding: 16px;
2756
+ }
2757
+
2758
+ .compare-grid {
2759
+ display: grid;
2760
+ grid-template-columns: repeat(auto-fill, minmax(250px, 1fr));
2761
+ gap: 16px;
2762
+ }
2763
+
2764
+ .compare-model-card {
2765
+ background: var(--bg-secondary);
2766
+ border: 2px solid var(--border-color);
2767
+ border-radius: var(--radius-md);
2768
+ overflow: hidden;
2769
+ transition: all 0.15s ease;
2770
+ cursor: pointer;
2771
+ }
2772
+
2773
+ .compare-model-card:hover {
2774
+ border-color: var(--accent-blue);
2775
+ }
2776
+
2777
+ .compare-model-card.rank-1 {
2778
+ border-color: var(--accent-yellow);
2779
+ box-shadow: 0 0 12px rgba(210, 153, 34, 0.3);
2780
+ }
2781
+
2782
+ .compare-model-card.rank-2 {
2783
+ border-color: #c0c0c0;
2784
+ }
2785
+
2786
+ .compare-model-card.rank-3 {
2787
+ border-color: #cd7f32;
2788
+ }
2789
+
2790
+ .compare-model-card.selected {
2791
+ border-color: var(--accent-purple);
2792
+ box-shadow: 0 0 12px rgba(163, 113, 247, 0.3);
2793
+ }
2794
+
2795
+ .compare-model-image {
2796
+ aspect-ratio: 1;
2797
+ width: 100%;
2798
+ overflow: hidden;
2799
+ background: var(--bg-primary);
2800
+ }
2801
+
2802
+ .compare-model-image img {
2803
+ width: 100%;
2804
+ height: 100%;
2805
+ object-fit: cover;
2806
+ }
2807
+
2808
+ .compare-model-info {
2809
+ padding: 12px;
2810
+ border-top: 1px solid var(--border-color);
2811
+ }
2812
+
2813
+ .compare-model-name {
2814
+ font-family: var(--font-mono);
2815
+ font-size: 0.875rem;
2816
+ color: var(--text-primary);
2817
+ margin-bottom: 4px;
2818
+ }
2819
+
2820
+ .compare-model-stats {
2821
+ font-size: 0.75rem;
2822
+ color: var(--text-muted);
2823
+ }
2824
+
2825
+ .compare-model-stats .win-rate {
2826
+ color: var(--accent-green);
2827
+ font-weight: 600;
2828
+ }
2829
+
2830
+ /* Compare header info */
2831
+ .compare-header-info {
2832
+ background: var(--bg-tertiary);
2833
+ padding: 16px;
2834
+ border-radius: var(--radius-md);
2835
+ margin-bottom: 16px;
2836
+ }
2837
+
2838
+ .compare-header-info h3 {
2839
+ font-size: 1rem;
2840
+ margin-bottom: 8px;
2841
+ }
2842
+
2843
+ .compare-header-info .instruction {
2844
+ color: var(--text-secondary);
2845
+ font-size: 0.875rem;
2846
+ line-height: 1.5;
2847
+ }
2848
+
2849
+ /* ========== ELO Actions ========== */
2850
+ .elo-actions {
2851
+ display: flex;
2852
+ gap: 4px;
2853
+ margin-top: 12px;
2854
+ flex-wrap: wrap;
2855
+ }
2856
+
2857
+ .elo-actions .btn {
2858
+ flex: 1;
2859
+ min-width: 60px;
2860
+ font-size: 0.6875rem;
2861
+ padding: 4px 6px;
2862
+ }
2863
+
2864
+ /* ========== Multi-Subset Modal ========== */
2865
+ .multi-subset-header {
2866
+ padding: 20px;
2867
+ border-bottom: 1px solid var(--border-color);
2868
+ }
2869
+
2870
+ .multi-subset-header h2 {
2871
+ margin: 0;
2872
+ font-size: 1.25rem;
2873
+ }
2874
+
2875
+ .multi-subset-body {
2876
+ padding: 20px;
2877
+ }
2878
+
2879
+ .multi-subset-selection {
2880
+ margin-bottom: 20px;
2881
+ }
2882
+
2883
+ .multi-subset-selection h4 {
2884
+ font-size: 0.875rem;
2885
+ color: var(--text-secondary);
2886
+ margin-bottom: 12px;
2887
+ }
2888
+
2889
+ .multi-subset-info {
2890
+ background: var(--bg-tertiary);
2891
+ padding: 12px 16px;
2892
+ border-radius: var(--radius-sm);
2893
+ margin-bottom: 16px;
2894
+ }
2895
+
2896
+ .multi-subset-info p {
2897
+ margin: 4px 0;
2898
+ font-size: 0.875rem;
2899
+ color: var(--text-secondary);
2900
+ }
2901
+
2902
+ .multi-subset-info span {
2903
+ font-family: var(--font-mono);
2904
+ color: var(--accent-blue);
2905
+ }
2906
+
2907
+ .multi-subset-options {
2908
+ margin-bottom: 16px;
2909
+ display: flex;
2910
+ align-items: center;
2911
+ gap: 16px;
2912
+ }
2913
+
2914
+ .multi-subset-options label:first-child {
2915
+ color: var(--text-secondary);
2916
+ font-size: 0.875rem;
2917
+ }
2918
+
2919
+ .radio-label {
2920
+ display: flex;
2921
+ align-items: center;
2922
+ gap: 4px;
2923
+ font-size: 0.875rem;
2924
+ color: var(--text-primary);
2925
+ cursor: pointer;
2926
+ }
2927
+
2928
+ .radio-label input[type="radio"] {
2929
+ accent-color: var(--accent-blue);
2930
+ }
2931
+
2932
+ .multi-subset-results {
2933
+ border-top: 1px solid var(--border-color);
2934
+ padding: 20px;
2935
+ margin-top: 20px;
2936
+ }
2937
+
2938
+ .multi-subset-results h3 {
2939
+ font-size: 0.875rem;
2940
+ color: var(--text-muted);
2941
+ text-transform: uppercase;
2942
+ letter-spacing: 0.05em;
2943
+ margin-bottom: 16px;
2944
+ }
2945
+
2946
+ /* Subset tag for showing model presence */
2947
+ .subset-tag {
2948
+ display: inline-block;
2949
+ font-size: 0.5625rem;
2950
+ padding: 2px 4px;
2951
+ background: var(--bg-primary);
2952
+ border-radius: 2px;
2953
+ margin-left: 4px;
2954
+ color: var(--text-muted);
2955
+ }
2956
+
2957
+ /* ========== Win Rate Matrix Modal ========== */
2958
+ .matrix-modal-header {
2959
+ display: flex;
2960
+ justify-content: space-between;
2961
+ align-items: center;
2962
+ padding: 20px;
2963
+ border-bottom: 1px solid var(--border-color);
2964
+ }
2965
+
2966
+ .matrix-modal-header h2 {
2967
+ margin: 0;
2968
+ font-size: 1.25rem;
2969
+ }
2970
+
2971
+ .matrix-content {
2972
+ padding: 20px;
2973
+ overflow-x: auto;
2974
+ }
2975
+
2976
+ .matrix-table-container {
2977
+ overflow-x: auto;
2978
+ max-width: 100%;
2979
+ }
2980
+
2981
+ .matrix-table {
2982
+ border-collapse: collapse;
2983
+ font-size: 0.75rem;
2984
+ }
2985
+
2986
+ .matrix-table th,
2987
+ .matrix-table td {
2988
+ padding: 8px;
2989
+ text-align: center;
2990
+ border: 1px solid var(--border-color);
2991
+ min-width: 60px;
2992
+ }
2993
+
2994
+ .matrix-table th {
2995
+ background: var(--bg-tertiary);
2996
+ color: var(--text-secondary);
2997
+ font-weight: 600;
2998
+ position: sticky;
2999
+ }
3000
+
3001
+ .matrix-table th:first-child {
3002
+ left: 0;
3003
+ z-index: 2;
3004
+ }
3005
+
3006
+ .matrix-table thead th {
3007
+ top: 0;
3008
+ z-index: 1;
3009
+ }
3010
+
3011
+ .matrix-table thead th:first-child {
3012
+ z-index: 3;
3013
+ }
3014
+
3015
+ .matrix-table tbody th {
3016
+ text-align: right;
3017
+ background: var(--bg-tertiary);
3018
+ left: 0;
3019
+ z-index: 1;
3020
+ }
3021
+
3022
+ .matrix-cell {
3023
+ font-family: var(--font-mono);
3024
+ font-size: 0.6875rem;
3025
+ cursor: pointer;
3026
+ transition: all 0.15s ease;
3027
+ }
3028
+
3029
+ .matrix-cell:hover {
3030
+ transform: scale(1.1);
3031
+ z-index: 5;
3032
+ position: relative;
3033
+ }
3034
+
3035
+ .matrix-cell-diagonal {
3036
+ background: var(--bg-primary) !important;
3037
+ color: var(--text-muted);
3038
+ }
3039
+
3040
+ /* Matrix tooltip */
3041
+ .matrix-tooltip {
3042
+ position: absolute;
3043
+ background: var(--bg-secondary);
3044
+ border: 1px solid var(--border-color);
3045
+ border-radius: var(--radius-sm);
3046
+ padding: 8px 12px;
3047
+ font-size: 0.75rem;
3048
+ color: var(--text-primary);
3049
+ z-index: 100;
3050
+ pointer-events: none;
3051
+ box-shadow: var(--shadow-md);
3052
+ white-space: nowrap;
3053
+ }
3054
+
3055
+ /* ========== ELO History Modal ========== */
3056
+ .elo-history-header {
3057
+ display: flex;
3058
+ justify-content: space-between;
3059
+ align-items: center;
3060
+ padding: 20px;
3061
+ border-bottom: 1px solid var(--border-color);
3062
+ }
3063
+
3064
+ .elo-history-header h2 {
3065
+ margin: 0;
3066
+ font-size: 1.25rem;
3067
+ }
3068
+
3069
+ .elo-history-controls {
3070
+ display: flex;
3071
+ align-items: center;
3072
+ gap: 8px;
3073
+ }
3074
+
3075
+ .elo-history-controls label {
3076
+ font-size: 0.875rem;
3077
+ color: var(--text-secondary);
3078
+ }
3079
+
3080
+ .elo-history-content {
3081
+ padding: 20px;
3082
+ min-height: 400px;
3083
+ }
3084
+
3085
+ .elo-history-chart {
3086
+ width: 100%;
3087
+ height: 400px;
3088
+ }
3089
+
3090
+ .elo-history-legend {
3091
+ padding: 0 20px 20px;
3092
+ display: flex;
3093
+ flex-wrap: wrap;
3094
+ gap: 12px;
3095
+ justify-content: center;
3096
+ }
3097
+
3098
+ .legend-item {
3099
+ display: flex;
3100
+ align-items: center;
3101
+ gap: 6px;
3102
+ font-size: 0.75rem;
3103
+ cursor: pointer;
3104
+ padding: 4px 8px;
3105
+ border-radius: var(--radius-sm);
3106
+ background: var(--bg-tertiary);
3107
+ transition: all 0.15s ease;
3108
+ }
3109
+
3110
+ .legend-item:hover {
3111
+ background: var(--bg-hover);
3112
+ }
3113
+
3114
+ .legend-item.hidden {
3115
+ opacity: 0.4;
3116
+ }
3117
+
3118
+ .legend-color {
3119
+ width: 12px;
3120
+ height: 3px;
3121
+ border-radius: 1px;
3122
+ }
3123
+
3124
+ .legend-label {
3125
+ color: var(--text-secondary);
3126
+ }
3127
+
3128
+ /* ========== ELO by Source Modal ========== */
3129
+ .elo-by-source-header {
3130
+ display: flex;
3131
+ justify-content: space-between;
3132
+ align-items: center;
3133
+ padding: 20px;
3134
+ border-bottom: 1px solid var(--border-color);
3135
+ }
3136
+
3137
+ .elo-by-source-header h2 {
3138
+ margin: 0;
3139
+ font-size: 1.25rem;
3140
+ }
3141
+
3142
+ .elo-by-source-content {
3143
+ padding: 20px;
3144
+ max-height: calc(90vh - 100px);
3145
+ overflow-y: auto;
3146
+ }
3147
+
3148
+ .source-section {
3149
+ margin-bottom: 24px;
3150
+ background: var(--bg-tertiary);
3151
+ border-radius: var(--radius-md);
3152
+ overflow: hidden;
3153
+ }
3154
+
3155
+ .source-section-header {
3156
+ display: flex;
3157
+ justify-content: space-between;
3158
+ align-items: center;
3159
+ padding: 12px 16px;
3160
+ background: var(--bg-secondary);
3161
+ cursor: pointer;
3162
+ }
3163
+
3164
+ .source-section-header:hover {
3165
+ background: var(--bg-hover);
3166
+ }
3167
+
3168
+ .source-name {
3169
+ font-weight: 600;
3170
+ color: var(--text-primary);
3171
+ }
3172
+
3173
+ .source-stats {
3174
+ font-size: 0.75rem;
3175
+ color: var(--text-muted);
3176
+ }
3177
+
3178
+ .source-leaderboard {
3179
+ padding: 12px 16px;
3180
+ }
3181
+
3182
+ .source-leaderboard-item {
3183
+ display: flex;
3184
+ align-items: center;
3185
+ gap: 12px;
3186
+ padding: 6px 0;
3187
+ border-bottom: 1px solid var(--border-color);
3188
+ }
3189
+
3190
+ .source-leaderboard-item:last-child {
3191
+ border-bottom: none;
3192
+ }
3193
+
3194
+ .source-rank {
3195
+ font-weight: 600;
3196
+ min-width: 24px;
3197
+ text-align: center;
3198
+ color: var(--text-muted);
3199
+ }
3200
+
3201
+ .source-rank.rank-1 { color: var(--accent-yellow); }
3202
+ .source-rank.rank-2 { color: #c0c0c0; }
3203
+ .source-rank.rank-3 { color: #cd7f32; }
3204
+
3205
+ .source-model-name {
3206
+ flex: 1;
3207
+ font-family: var(--font-mono);
3208
+ font-size: 0.8125rem;
3209
+ }
3210
+
3211
+ .source-elo {
3212
+ font-family: var(--font-mono);
3213
+ font-size: 0.8125rem;
3214
+ color: var(--accent-blue);
3215
+ min-width: 50px;
3216
+ text-align: right;
3217
+ }
3218
+
3219
+ /* ========== Missing CSS Variables Fix ========== */
3220
+ /* These were referenced but not defined - redeclare them properly */
3221
+
3222
+ /* ========== Win Rate Matrix Styles ========== */
3223
+ .matrix-scroll-container {
3224
+ overflow-x: auto;
3225
+ max-width: 100%;
3226
+ padding-bottom: 10px;
3227
+ }
3228
+
3229
+ .win-rate-matrix {
3230
+ border-collapse: collapse;
3231
+ font-size: 0.6875rem;
3232
+ margin: 0 auto;
3233
+ }
3234
+
3235
+ .win-rate-matrix th,
3236
+ .win-rate-matrix td {
3237
+ padding: 4px 6px;
3238
+ text-align: center;
3239
+ border: 1px solid var(--border-color);
3240
+ min-width: 50px;
3241
+ max-width: 80px;
3242
+ }
3243
+
3244
+ .win-rate-matrix .matrix-corner {
3245
+ background: var(--bg-primary);
3246
+ position: sticky;
3247
+ left: 0;
3248
+ top: 0;
3249
+ z-index: 3;
3250
+ }
3251
+
3252
+ .win-rate-matrix .matrix-header-cell {
3253
+ background: var(--bg-tertiary);
3254
+ color: var(--text-secondary);
3255
+ font-weight: 600;
3256
+ font-size: 0.625rem;
3257
+ writing-mode: vertical-rl;
3258
+ text-orientation: mixed;
3259
+ padding: 8px 4px;
3260
+ max-height: 100px;
3261
+ white-space: nowrap;
3262
+ overflow: hidden;
3263
+ text-overflow: ellipsis;
3264
+ }
3265
+
3266
+ .win-rate-matrix .matrix-row-header {
3267
+ background: var(--bg-tertiary);
3268
+ color: var(--text-secondary);
3269
+ font-weight: 600;
3270
+ font-size: 0.625rem;
3271
+ text-align: right;
3272
+ padding-right: 8px;
3273
+ position: sticky;
3274
+ left: 0;
3275
+ z-index: 1;
3276
+ white-space: nowrap;
3277
+ max-width: none;
3278
+ }
3279
+
3280
+ .win-rate-matrix .matrix-cell {
3281
+ font-family: var(--font-mono);
3282
+ font-size: 0.625rem;
3283
+ cursor: default;
3284
+ transition: transform 0.1s ease;
3285
+ }
3286
+
3287
+ .win-rate-matrix .matrix-cell:hover {
3288
+ transform: scale(1.15);
3289
+ position: relative;
3290
+ z-index: 2;
3291
+ box-shadow: 0 0 8px rgba(0,0,0,0.5);
3292
+ }
3293
+
3294
+ .win-rate-matrix .matrix-diagonal {
3295
+ background: var(--bg-primary) !important;
3296
+ color: var(--text-muted);
3297
+ }
3298
+
3299
+ .win-rate-matrix .matrix-no-data {
3300
+ background: var(--bg-primary) !important;
3301
+ color: var(--text-muted);
3302
+ }
3303
+
3304
+ .matrix-legend {
3305
+ display: flex;
3306
+ align-items: center;
3307
+ justify-content: center;
3308
+ gap: 12px;
3309
+ margin-top: 16px;
3310
+ padding: 12px;
3311
+ background: var(--bg-tertiary);
3312
+ border-radius: var(--radius-sm);
3313
+ }
3314
+
3315
+ .matrix-legend-label {
3316
+ font-size: 0.75rem;
3317
+ color: var(--text-secondary);
3318
+ }
3319
+
3320
+ .matrix-legend-gradient {
3321
+ display: flex;
3322
+ align-items: center;
3323
+ gap: 8px;
3324
+ }
3325
+
3326
+ .matrix-legend-gradient .legend-bar {
3327
+ width: 100px;
3328
+ height: 16px;
3329
+ background: linear-gradient(to right, rgb(255, 55, 55), rgb(255, 255, 255), rgb(102, 200, 102));
3330
+ border-radius: 2px;
3331
+ border: 1px solid var(--border-color);
3332
+ }
3333
+
3334
+ .matrix-legend-gradient .legend-low,
3335
+ .matrix-legend-gradient .legend-high {
3336
+ font-size: 0.6875rem;
3337
+ color: var(--text-muted);
3338
+ }
3339
+
3340
+ /* ========== ELO History SVG Styles ========== */
3341
+ .elo-history-chart {
3342
+ font-family: var(--font-sans);
3343
+ }
3344
+
3345
+ .elo-history-chart .axis-line {
3346
+ stroke: var(--border-color);
3347
+ stroke-width: 1;
3348
+ }
3349
+
3350
+ .elo-history-chart .grid-line {
3351
+ stroke: var(--border-color);
3352
+ stroke-width: 0.5;
3353
+ stroke-dasharray: 4 4;
3354
+ opacity: 0.5;
3355
+ }
3356
+
3357
+ .elo-history-chart .axis-label {
3358
+ font-size: 10px;
3359
+ fill: var(--text-muted);
3360
+ }
3361
+
3362
+ .elo-history-chart .axis-title {
3363
+ font-size: 11px;
3364
+ fill: var(--text-secondary);
3365
+ font-weight: 500;
3366
+ }
3367
+
3368
+ .elo-history-chart .elo-line {
3369
+ stroke-linecap: round;
3370
+ stroke-linejoin: round;
3371
+ }
3372
+
3373
+ .elo-history-chart .elo-line:hover {
3374
+ stroke-width: 4;
3375
+ }
3376
+
3377
+ .elo-history-chart .elo-point {
3378
+ cursor: pointer;
3379
+ opacity: 0;
3380
+ transition: opacity 0.15s ease, r 0.15s ease;
3381
+ }
3382
+
3383
+ .elo-history-chart:hover .elo-point {
3384
+ opacity: 1;
3385
+ }
3386
+
3387
+ .elo-history-chart .elo-point:hover {
3388
+ r: 6;
3389
+ stroke: var(--text-primary);
3390
+ stroke-width: 2;
3391
+ }
3392
+
3393
+ .elo-history-chart-container {
3394
+ position: relative;
3395
+ }
3396
+
3397
+ .elo-tooltip {
3398
+ position: absolute;
3399
+ background: var(--bg-primary);
3400
+ border: 1px solid var(--border-color);
3401
+ border-radius: var(--radius-sm);
3402
+ padding: 8px 12px;
3403
+ font-size: 0.75rem;
3404
+ line-height: 1.4;
3405
+ pointer-events: none;
3406
+ opacity: 0;
3407
+ transition: opacity 0.15s ease;
3408
+ z-index: 100;
3409
+ box-shadow: 0 4px 12px rgba(0, 0, 0, 0.3);
3410
+ white-space: nowrap;
3411
+ }
3412
+
3413
+ .elo-tooltip.visible {
3414
+ opacity: 1;
3415
+ }
3416
+
3417
+ .elo-tooltip strong {
3418
+ color: var(--text-primary);
3419
+ }
3420
+
3421
+ /* Legend hidden state */
3422
+ .legend-item.hidden-model {
3423
+ opacity: 0.4;
3424
+ }
3425
+
3426
+ .legend-item.hidden-model .legend-color {
3427
+ background: var(--text-muted) !important;
3428
+ }
3429
+
3430
+ /* ========== ELO by Source Expanded Styles ========== */
3431
+ .source-sections-container {
3432
+ display: flex;
3433
+ flex-direction: column;
3434
+ gap: 12px;
3435
+ }
3436
+
3437
+ .source-section {
3438
+ background: var(--bg-secondary);
3439
+ border: 1px solid var(--border-color);
3440
+ border-radius: var(--radius-md);
3441
+ overflow: hidden;
3442
+ }
3443
+
3444
+ .source-section-header {
3445
+ display: flex;
3446
+ justify-content: space-between;
3447
+ align-items: center;
3448
+ padding: 12px 16px;
3449
+ cursor: pointer;
3450
+ user-select: none;
3451
+ transition: background 0.15s ease;
3452
+ }
3453
+
3454
+ .source-section-header:hover {
3455
+ background: var(--bg-hover);
3456
+ }
3457
+
3458
+ .source-section-info {
3459
+ display: flex;
3460
+ align-items: center;
3461
+ gap: 12px;
3462
+ }
3463
+
3464
+ .source-name {
3465
+ font-weight: 600;
3466
+ color: var(--text-primary);
3467
+ }
3468
+
3469
+ .source-battles {
3470
+ font-size: 0.75rem;
3471
+ color: var(--text-muted);
3472
+ }
3473
+
3474
+ .source-section .expand-icon {
3475
+ color: var(--text-muted);
3476
+ font-size: 0.75rem;
3477
+ transition: transform 0.2s ease;
3478
+ }
3479
+
3480
+ .source-section.expanded .expand-icon {
3481
+ transform: rotate(180deg);
3482
+ }
3483
+
3484
+ .source-section-content {
3485
+ display: none;
3486
+ padding: 12px 16px;
3487
+ border-top: 1px solid var(--border-color);
3488
+ }
3489
+
3490
+ .source-section.expanded .source-section-content {
3491
+ display: block;
3492
+ }
3493
+
3494
+ .source-leaderboard {
3495
+ width: 100%;
3496
+ border-collapse: collapse;
3497
+ font-size: 0.8125rem;
3498
+ }
3499
+
3500
+ .source-leaderboard th,
3501
+ .source-leaderboard td {
3502
+ padding: 8px 12px;
3503
+ text-align: left;
3504
+ border-bottom: 1px solid var(--border-color);
3505
+ }
3506
+
3507
+ .source-leaderboard th {
3508
+ background: var(--bg-tertiary);
3509
+ color: var(--text-secondary);
3510
+ font-weight: 600;
3511
+ font-size: 0.75rem;
3512
+ }
3513
+
3514
+ .source-leaderboard tbody tr:hover {
3515
+ background: var(--bg-hover);
3516
+ }
3517
+
3518
+ .source-leaderboard .rank-cell {
3519
+ font-weight: 600;
3520
+ color: var(--text-muted);
3521
+ }
3522
+
3523
+ .source-leaderboard .rank-cell.rank-1 { color: var(--accent-yellow); }
3524
+ .source-leaderboard .rank-cell.rank-2 { color: #c0c0c0; }
3525
+ .source-leaderboard .rank-cell.rank-3 { color: #cd7f32; }
3526
+
3527
+ .source-leaderboard .model-cell {
3528
+ font-family: var(--font-mono);
3529
+ font-size: 0.75rem;
3530
+ }
3531
+
3532
+ .source-leaderboard .elo-cell {
3533
+ font-family: var(--font-mono);
3534
+ color: var(--accent-blue);
3535
+ }
3536
+
3537
+ .source-leaderboard .wins {
3538
+ color: var(--accent-green);
3539
+ }
3540
+
3541
+ .source-leaderboard .losses {
3542
+ color: var(--accent-red);
3543
+ }
3544
+
3545
+ .source-leaderboard .ties {
3546
+ color: var(--accent-yellow);
3547
+ }
3548
+
3549
+ .source-leaderboard .win-rate-cell {
3550
+ font-family: var(--font-mono);
3551
+ color: var(--accent-green);
3552
+ }
3553
+
3554
+ /* ========== Merged ELO Results ========== */
3555
+ .merged-elo-info {
3556
+ background: var(--bg-tertiary);
3557
+ padding: 12px 16px;
3558
+ border-radius: var(--radius-sm);
3559
+ margin-bottom: 16px;
3560
+ }
3561
+
3562
+ .merged-elo-info p {
3563
+ margin: 4px 0;
3564
+ font-size: 0.875rem;
3565
+ color: var(--text-secondary);
3566
+ }
3567
+
3568
+ .merged-leaderboard {
3569
+ width: 100%;
3570
+ border-collapse: collapse;
3571
+ font-size: 0.8125rem;
3572
+ }
3573
+
3574
+ .merged-leaderboard th,
3575
+ .merged-leaderboard td {
3576
+ padding: 8px 12px;
3577
+ text-align: left;
3578
+ border-bottom: 1px solid var(--border-color);
3579
+ }
3580
+
3581
+ .merged-leaderboard th {
3582
+ background: var(--bg-tertiary);
3583
+ color: var(--text-secondary);
3584
+ font-weight: 600;
3585
+ font-size: 0.75rem;
3586
+ }
3587
+
3588
+ .merged-leaderboard tbody tr:hover {
3589
+ background: var(--bg-hover);
3590
+ }
3591
+
3592
+ .merged-leaderboard .rank-cell {
3593
+ font-weight: 600;
3594
+ color: var(--text-muted);
3595
+ }
3596
+
3597
+ .merged-leaderboard .rank-cell.rank-1 { color: var(--accent-yellow); }
3598
+ .merged-leaderboard .rank-cell.rank-2 { color: #c0c0c0; }
3599
+ .merged-leaderboard .rank-cell.rank-3 { color: #cd7f32; }
3600
+
3601
+ .merged-leaderboard .model-cell {
3602
+ font-family: var(--font-mono);
3603
+ font-size: 0.75rem;
3604
+ }
3605
+
3606
+ .merged-leaderboard .elo-cell {
3607
+ font-family: var(--font-mono);
3608
+ color: var(--accent-blue);
3609
+ }
3610
+
3611
+ .merged-leaderboard .wins {
3612
+ color: var(--accent-green);
3613
+ }
3614
+
3615
+ .merged-leaderboard .losses {
3616
+ color: var(--accent-red);
3617
+ }
3618
+
3619
+ .merged-leaderboard .stat-cell.ties,
3620
+ .merged-leaderboard .ties {
3621
+ color: var(--accent-yellow);
3622
+ }
3623
+
3624
+ .merged-leaderboard .win-rate-cell {
3625
+ font-family: var(--font-mono);
3626
+ color: var(--accent-green);
3627
+ }
3628
+
3629
+ /* ========== Header Navigation ========== */
3630
+ .header-nav {
3631
+ display: flex;
3632
+ align-items: center;
3633
+ gap: 24px;
3634
+ }
3635
+
3636
+ .nav-link {
3637
+ color: var(--text-secondary);
3638
+ text-decoration: none;
3639
+ font-size: 1rem;
3640
+ cursor: pointer;
3641
+ transition: color 0.15s ease;
3642
+ }
3643
+
3644
+ .nav-link:hover {
3645
+ color: var(--text-primary);
3646
+ }
3647
+
3648
+ .nav-link.active {
3649
+ color: var(--accent-blue);
3650
+ }
3651
+
3652
+ .nav-separator {
3653
+ color: var(--border-light);
3654
+ font-size: 0.875rem;
3655
+ user-select: none;
3656
+ }
3657
+
3658
+ .nav-link.nav-external {
3659
+ font-size: 0.875rem;
3660
+ display: flex;
3661
+ align-items: center;
3662
+ gap: 4px;
3663
+ }
3664
+
3665
+ .nav-link.nav-external .external-icon {
3666
+ font-size: 0.75rem;
3667
+ opacity: 0.7;
3668
+ }
3669
+
3670
+ .nav-link.nav-external:hover .external-icon {
3671
+ opacity: 1;
3672
+ }
3673
+
3674
+ /* Header Action Buttons (unified style) */
3675
+ .btn-header-action {
3676
+ display: flex;
3677
+ align-items: center;
3678
+ gap: 6px;
3679
+ padding: 6px 12px;
3680
+ background: var(--bg-tertiary);
3681
+ border: 1px solid var(--border-color);
3682
+ border-radius: var(--radius-sm);
3683
+ color: var(--accent-yellow);
3684
+ font-size: 0.875rem;
3685
+ cursor: pointer;
3686
+ transition: all 0.15s ease;
3687
+ }
3688
+
3689
+ .btn-header-action:hover {
3690
+ background: var(--bg-hover);
3691
+ border-color: var(--accent-yellow);
3692
+ }
3693
+
3694
+ .header-action-icon {
3695
+ font-size: 1rem;
3696
+ }
3697
+
3698
+ .header-action-count {
3699
+ font-size: 0.75rem;
3700
+ font-family: var(--font-mono);
3701
+ background: var(--bg-primary);
3702
+ padding: 2px 6px;
3703
+ border-radius: var(--radius-sm);
3704
+ color: var(--text-secondary);
3705
+ }
3706
+
3707
+ /* Logo clickable */
3708
+ .logo {
3709
+ cursor: pointer;
3710
+ transition: color 0.15s ease;
3711
+ }
3712
+
3713
+ .logo:hover {
3714
+ color: var(--text-primary);
3715
+ }
3716
+
3717
+ /* ========== Full Page Layout (no sidebar) ========== */
3718
+ .full-page {
3719
+ flex: 1;
3720
+ padding: 32px 48px;
3721
+ overflow-y: auto;
3722
+ max-width: 1600px;
3723
+ margin: 0 auto;
3724
+ width: 100%;
3725
+ }
3726
+
3727
+ .page-header {
3728
+ display: flex;
3729
+ align-items: center;
3730
+ justify-content: space-between;
3731
+ margin-bottom: 12px;
3732
+ }
3733
+
3734
+ .page-header h2 {
3735
+ font-size: 1.5rem;
3736
+ font-weight: 600;
3737
+ color: var(--text-primary);
3738
+ margin: 0;
3739
+ }
3740
+
3741
+ .page-description {
3742
+ font-size: 0.9375rem;
3743
+ color: var(--text-secondary);
3744
+ line-height: 1.5;
3745
+ margin-bottom: 24px;
3746
+ }
3747
+
3748
+ /* Gallery Page Container (with sidebar) */
3749
+ .gallery-page-container {
3750
+ display: flex;
3751
+ width: 100%;
3752
+ }
3753
+
3754
+ /* Gallery Controls (top bar in content area) */
3755
+ .gallery-controls {
3756
+ display: flex;
3757
+ align-items: center;
3758
+ gap: 16px;
3759
+ padding: 12px 16px;
3760
+ background: var(--bg-tertiary);
3761
+ border-bottom: 1px solid var(--border-color);
3762
+ flex-wrap: wrap;
3763
+ margin-bottom: 16px;
3764
+ }
3765
+
3766
+ .gallery-controls .selector-group {
3767
+ display: flex;
3768
+ align-items: center;
3769
+ gap: 8px;
3770
+ }
3771
+
3772
+ .gallery-controls .selector-group label {
3773
+ font-size: 0.8125rem;
3774
+ color: var(--text-secondary);
3775
+ }
3776
+
3777
+ .gallery-controls .selector {
3778
+ min-width: 140px;
3779
+ }
3780
+
3781
+ .gallery-controls .view-toggle {
3782
+ margin-left: auto;
3783
+ }
3784
+
3785
+ .gallery-controls .search-box {
3786
+ display: flex;
3787
+ align-items: center;
3788
+ gap: 4px;
3789
+ }
3790
+
3791
+ /* ========== Overview Page Styles ========== */
3792
+ .overview-content {
3793
+ background: var(--bg-secondary);
3794
+ border: 1px solid var(--border-color);
3795
+ border-radius: var(--radius-md);
3796
+ overflow: hidden;
3797
+ }
3798
+
3799
+ .overview-table-container {
3800
+ overflow-x: auto;
3801
+ max-width: 100%;
3802
+ }
3803
+
3804
+ .overview-table {
3805
+ width: 100%;
3806
+ border-collapse: collapse;
3807
+ font-size: 0.875rem;
3808
+ }
3809
+
3810
+ .overview-table th,
3811
+ .overview-table td {
3812
+ padding: 12px 16px;
3813
+ text-align: left;
3814
+ border-bottom: 1px solid var(--border-color);
3815
+ white-space: nowrap;
3816
+ }
3817
+
3818
+ .overview-table th {
3819
+ background: var(--bg-tertiary);
3820
+ color: var(--text-secondary);
3821
+ font-size: 0.75rem;
3822
+ text-transform: uppercase;
3823
+ letter-spacing: 0.05em;
3824
+ font-weight: 600;
3825
+ position: sticky;
3826
+ top: 0;
3827
+ z-index: 10;
3828
+ }
3829
+
3830
+ .overview-table th.sortable {
3831
+ cursor: pointer;
3832
+ user-select: none;
3833
+ transition: background 0.15s ease;
3834
+ }
3835
+
3836
+ .overview-table th.sortable:hover {
3837
+ background: var(--bg-hover);
3838
+ color: var(--text-primary);
3839
+ }
3840
+
3841
+ .overview-table th.sorted-asc::after {
3842
+ content: ' ▲';
3843
+ font-size: 0.625rem;
3844
+ }
3845
+
3846
+ .overview-table th.sorted-desc::after {
3847
+ content: ' ▼';
3848
+ font-size: 0.625rem;
3849
+ }
3850
+
3851
+ .overview-table th.subset-header {
3852
+ cursor: pointer;
3853
+ transition: all 0.15s ease;
3854
+ }
3855
+
3856
+ .overview-table th.subset-header:hover {
3857
+ background: var(--accent-blue);
3858
+ color: #fff;
3859
+ }
3860
+
3861
+ .overview-table th.model-header {
3862
+ position: sticky;
3863
+ left: 0;
3864
+ z-index: 11;
3865
+ background: var(--bg-tertiary);
3866
+ }
3867
+
3868
+ .overview-table td.model-cell {
3869
+ font-family: var(--font-mono);
3870
+ font-weight: 500;
3871
+ position: sticky;
3872
+ left: 0;
3873
+ background: var(--bg-secondary);
3874
+ z-index: 1;
3875
+ cursor: pointer;
3876
+ transition: background 0.15s ease;
3877
+ }
3878
+
3879
+ .overview-table tr:hover td.model-cell {
3880
+ background: var(--bg-tertiary);
3881
+ }
3882
+
3883
+ .overview-table td.model-cell:hover {
3884
+ color: var(--accent-blue);
3885
+ }
3886
+
3887
+ .overview-table td.elo-cell {
3888
+ font-family: var(--font-mono);
3889
+ text-align: center;
3890
+ }
3891
+
3892
+ .overview-table td.elo-cell.no-data {
3893
+ color: var(--text-muted);
3894
+ }
3895
+
3896
+ .overview-table td.avg-elo-cell {
3897
+ font-family: var(--font-mono);
3898
+ font-weight: 600;
3899
+ color: var(--accent-blue);
3900
+ text-align: center;
3901
+ background: var(--bg-tertiary);
3902
+ }
3903
+
3904
+ .overview-table tbody tr {
3905
+ transition: background 0.15s ease;
3906
+ }
3907
+
3908
+ .overview-table tbody tr:hover {
3909
+ background: var(--bg-hover);
3910
+ }
3911
+
3912
+ /* ELO value coloring */
3913
+ .elo-high {
3914
+ color: var(--accent-green);
3915
+ }
3916
+
3917
+ .elo-mid {
3918
+ color: var(--text-primary);
3919
+ }
3920
+
3921
+ .elo-low {
3922
+ color: var(--accent-red);
3923
+ }
3924
+
3925
+ /* Rank badge in overview */
3926
+ .rank-badge {
3927
+ display: inline-block;
3928
+ width: 24px;
3929
+ height: 24px;
3930
+ line-height: 24px;
3931
+ text-align: center;
3932
+ border-radius: 50%;
3933
+ font-size: 0.6875rem;
3934
+ font-weight: 600;
3935
+ margin-right: 8px;
3936
+ }
3937
+
3938
+ .rank-badge.rank-1 {
3939
+ background: rgba(210, 153, 34, 0.2);
3940
+ color: var(--accent-yellow);
3941
+ }
3942
+
3943
+ .rank-badge.rank-2 {
3944
+ background: rgba(192, 192, 192, 0.2);
3945
+ color: #c0c0c0;
3946
+ }
3947
+
3948
+ .rank-badge.rank-3 {
3949
+ background: rgba(205, 127, 50, 0.2);
3950
+ color: #cd7f32;
3951
+ }
3952
+
3953
+ /* Subset info in header */
3954
+ .subset-header-info {
3955
+ display: block;
3956
+ font-size: 0.625rem;
3957
+ font-weight: 400;
3958
+ color: var(--text-muted);
3959
+ text-transform: none;
3960
+ letter-spacing: normal;
3961
+ margin-top: 2px;
3962
+ }
3963
+
3964
+ /* ========== Cross-Subset Modal Styles ========== */
3965
+ #cross-subset-modal .modal-content {
3966
+ padding: 32px;
3967
+ max-width: 800px;
3968
+ width: 90%;
3969
+ }
3970
+
3971
+ .cross-subset-modal-header {
3972
+ margin-bottom: 8px;
3973
+ }
3974
+
3975
+ .cross-subset-modal-header h2 {
3976
+ font-size: 1.25rem;
3977
+ font-weight: 600;
3978
+ color: var(--text-primary);
3979
+ margin: 0;
3980
+ }
3981
+
3982
+ .modal-description {
3983
+ font-size: 0.875rem;
3984
+ color: var(--text-secondary);
3985
+ margin-bottom: 24px;
3986
+ }
3987
+
3988
+ #cross-subset-modal .cross-subset-content {
3989
+ background: var(--bg-tertiary);
3990
+ border: 1px solid var(--border-color);
3991
+ border-radius: var(--radius-md);
3992
+ padding: 24px;
3993
+ margin-bottom: 24px;
3994
+ }
3995
+
3996
+ #cross-subset-modal .cross-subset-info {
3997
+ background: var(--bg-secondary);
3998
+ }
3999
+
4000
+ #cross-subset-modal .cross-subset-results {
4001
+ margin-top: 24px;
4002
+ }
4003
+
4004
+ #cross-subset-modal .cross-subset-results:empty {
4005
+ display: none;
4006
+ margin-top: 0;
4007
+ }
4008
+
4009
+ .cross-subset-selection {
4010
+ margin-bottom: 20px;
4011
+ }
4012
+
4013
+ .cross-subset-selection h4 {
4014
+ font-size: 0.875rem;
4015
+ color: var(--text-secondary);
4016
+ margin-bottom: 12px;
4017
+ }
4018
+
4019
+ .cross-subset-info {
4020
+ background: var(--bg-tertiary);
4021
+ padding: 12px 16px;
4022
+ border-radius: var(--radius-sm);
4023
+ margin-bottom: 16px;
4024
+ }
4025
+
4026
+ .cross-subset-info p {
4027
+ margin: 4px 0;
4028
+ font-size: 0.875rem;
4029
+ color: var(--text-secondary);
4030
+ }
4031
+
4032
+ .cross-subset-info span {
4033
+ font-family: var(--font-mono);
4034
+ color: var(--accent-blue);
4035
+ }
4036
+
4037
+ .cross-subset-options {
4038
+ margin-bottom: 16px;
4039
+ display: flex;
4040
+ align-items: center;
4041
+ gap: 16px;
4042
+ }
4043
+
4044
+ .cross-subset-options label:first-child {
4045
+ color: var(--text-secondary);
4046
+ font-size: 0.875rem;
4047
+ }
4048
+
4049
+ .cross-subset-results {
4050
+ background: var(--bg-tertiary);
4051
+ border: 1px solid var(--border-color);
4052
+ border-radius: var(--radius-md);
4053
+ padding: 24px;
4054
+ }
4055
+
4056
+ .cross-subset-results:empty {
4057
+ display: none;
4058
+ }
4059
+
4060
+ .cross-subset-results h3 {
4061
+ font-size: 0.875rem;
4062
+ color: var(--text-muted);
4063
+ text-transform: uppercase;
4064
+ letter-spacing: 0.05em;
4065
+ margin-bottom: 16px;
4066
+ }
4067
+
4068
+ /* ========== Responsive Adjustments ========== */
4069
+ @media (max-width: 1200px) {
4070
+ .full-page {
4071
+ padding: 24px 32px;
4072
+ }
4073
+ }
4074
+
4075
+ @media (max-width: 768px) {
4076
+ .header-center {
4077
+ flex-wrap: wrap;
4078
+ gap: 8px;
4079
+ }
4080
+
4081
+ .header-nav {
4082
+ order: -1;
4083
+ width: 100%;
4084
+ justify-content: center;
4085
+ }
4086
+
4087
+ .subset-controls {
4088
+ flex-wrap: wrap;
4089
+ justify-content: center;
4090
+ }
4091
+
4092
+ .full-page {
4093
+ padding: 16px;
4094
+ }
4095
+
4096
+ .overview-table {
4097
+ font-size: 0.75rem;
4098
+ }
4099
+
4100
+ .overview-table th,
4101
+ .overview-table td {
4102
+ padding: 8px 10px;
4103
+ }
4104
+ }
genarena/visualize/templates/index.html ADDED
@@ -0,0 +1,413 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>GenArena Explorer</title>
7
+ <link rel="stylesheet" href="static/style.css">
8
+ </head>
9
+ <body>
10
+ <div id="app">
11
+ <!-- Header -->
12
+ <header class="header">
13
+ <div class="header-left">
14
+ <h1 class="logo" id="logo-link" title="Back to Overview">GenArena</h1>
15
+ <nav class="header-nav">
16
+ <a id="nav-overview" class="nav-link active" title="All Subsets Overview">Overview</a>
17
+ <a id="nav-gallery" class="nav-link" title="Browse Battles">Gallery</a>
18
+ <span class="nav-separator">|</span>
19
+ <a href="#" class="nav-link nav-external" target="_blank" rel="noopener noreferrer" title="Project Page">Project Page <span class="external-icon">↗</span></a>
20
+ <a href="#" class="nav-link nav-external" target="_blank" rel="noopener noreferrer" title="arXiv Paper">arXiv <span class="external-icon">↗</span></a>
21
+ <a href="#" class="nav-link nav-external" target="_blank" rel="noopener noreferrer" title="GitHub Repository">GitHub <span class="external-icon">↗</span></a>
22
+ </nav>
23
+ </div>
24
+ <div class="header-right">
25
+ <button id="favorites-btn" class="btn-header-action" title="View Favorites">
26
+ <span class="header-action-icon">★</span>
27
+ <span id="favorites-count" class="header-action-count">0</span>
28
+ </button>
29
+ </div>
30
+ </header>
31
+
32
+ <!-- Main Content -->
33
+ <div class="main-container">
34
+ <!-- Overview Page (default landing) -->
35
+ <div id="overview-page" class="full-page">
36
+ <div class="page-header">
37
+ <h2>All Subsets Leaderboard</h2>
38
+ <button id="cross-subset-btn" class="btn btn-secondary">Cross-Subset</button>
39
+ </div>
40
+ <p class="page-description">ELO rankings across all evaluation subsets. Click on a subset column header to view details, or click a model row to see its performance.</p>
41
+ <div id="overview-content" class="overview-content">
42
+ <div class="loading">Loading leaderboards...</div>
43
+ </div>
44
+ </div>
45
+
46
+ <!-- Gallery Page (with sidebar) -->
47
+ <div id="gallery-page" class="gallery-page-container" style="display: none;">
48
+ <!-- Sidebar Filters -->
49
+ <aside class="sidebar">
50
+ <div class="filter-section">
51
+ <h3>Filters</h3>
52
+
53
+ <!-- Model filter - only visible in battles view -->
54
+ <div class="filter-group battles-only">
55
+ <label>Models: <span id="model-count">(0 selected)</span></label>
56
+ <div id="model-checkboxes" class="checkbox-group">
57
+ <!-- Populated by JavaScript -->
58
+ </div>
59
+ <div class="checkbox-actions">
60
+ <button id="select-all-models" class="btn btn-small">Select All</button>
61
+ <button id="clear-all-models" class="btn btn-small">Clear All</button>
62
+ </div>
63
+ </div>
64
+
65
+ <div class="filter-group battles-only" id="result-filter-group" style="display: none;">
66
+ <label for="result-filter">Result (single model only):</label>
67
+ <select id="result-filter" class="filter-select">
68
+ <option value="">All results</option>
69
+ <option value="wins">Wins</option>
70
+ <option value="losses">Losses</option>
71
+ <option value="ties">Ties</option>
72
+ </select>
73
+ </div>
74
+
75
+ <div class="filter-group battles-only">
76
+ <label for="consistency-filter">Consistency:</label>
77
+ <select id="consistency-filter" class="filter-select">
78
+ <option value="">All</option>
79
+ <option value="true">Consistent</option>
80
+ <option value="false">Inconsistent</option>
81
+ </select>
82
+ </div>
83
+
84
+ <div class="filter-group" id="prompt-source-filter-group" style="display: none;">
85
+ <label for="prompt-source-filter">Prompt Source:</label>
86
+ <select id="prompt-source-filter" class="filter-select">
87
+ <option value="">All sources</option>
88
+ </select>
89
+ </div>
90
+
91
+ <!-- Model filter for prompts view -->
92
+ <div class="filter-group prompts-only" id="prompts-model-filter-group" style="display: none;">
93
+ <label>Filter Models: <span id="prompts-model-count">(0 selected)</span></label>
94
+ <div id="prompts-model-checkboxes" class="checkbox-group">
95
+ <!-- Populated by JavaScript -->
96
+ </div>
97
+ <div class="checkbox-actions">
98
+ <button id="prompts-select-all-models" class="btn btn-small">Select All</button>
99
+ <button id="prompts-clear-all-models" class="btn btn-small">Clear All</button>
100
+ </div>
101
+ <button id="prompts-apply-model-filter" class="btn btn-primary btn-small" style="width: 100%; margin-top: 8px;">Apply Filter</button>
102
+ <p class="filter-hint">Show only selected models and their battles</p>
103
+ </div>
104
+
105
+ <div class="filter-group" id="image-count-filter-group" style="display: none;">
106
+ <label>Input Images: <span id="image-range-display">1-1</span></label>
107
+ <div class="range-slider-container">
108
+ <input type="range" id="min-images-slider" class="range-slider" min="1" max="10" value="1">
109
+ <input type="range" id="max-images-slider" class="range-slider" min="1" max="10" value="10">
110
+ </div>
111
+ <div class="range-labels">
112
+ <span id="min-images-label">1</span>
113
+ <span id="max-images-label">10</span>
114
+ </div>
115
+ </div>
116
+
117
+ <button id="apply-filters" class="btn btn-primary">Apply Filters</button>
118
+ <button id="clear-filters" class="btn btn-secondary">Clear</button>
119
+ </div>
120
+
121
+ <div class="stats-section">
122
+ <h3>Statistics</h3>
123
+ <div id="stats-panel">
124
+ <p class="placeholder">Select a subset and experiment</p>
125
+ </div>
126
+ </div>
127
+
128
+ <div class="h2h-section" id="h2h-section" style="display: none;">
129
+ <h3>Head-to-Head</h3>
130
+ <div id="h2h-panel">
131
+ <!-- Filled by JavaScript -->
132
+ </div>
133
+ </div>
134
+
135
+ <!-- ELO Leaderboard Section -->
136
+ <div class="elo-section" id="elo-section">
137
+ <div class="elo-header">
138
+ <h3>ELO Leaderboard</h3>
139
+ <button id="view-full-leaderboard" class="btn btn-small btn-link" title="View Full Leaderboard">
140
+ View All →
141
+ </button>
142
+ </div>
143
+ <div id="elo-panel">
144
+ <p class="placeholder">Select a subset to view rankings</p>
145
+ </div>
146
+ <div class="elo-actions">
147
+ <button id="view-matrix" class="btn btn-small btn-secondary" title="View Win Rate Matrix">Matrix</button>
148
+ <button id="view-elo-history" class="btn btn-small btn-secondary" title="View ELO History">History</button>
149
+ <button id="view-elo-by-source" class="btn btn-small btn-secondary" title="View ELO by Source">By Source</button>
150
+ </div>
151
+ </div>
152
+ </aside>
153
+
154
+ <!-- Battle List -->
155
+ <main class="content">
156
+ <!-- Gallery Controls -->
157
+ <div class="gallery-controls">
158
+ <div class="selector-group">
159
+ <label for="subset-select">Subset:</label>
160
+ <select id="subset-select" class="selector">
161
+ <option value="">Select subset...</option>
162
+ </select>
163
+ </div>
164
+ <div class="selector-group">
165
+ <label for="exp-select">Experiment:</label>
166
+ <select id="exp-select" class="selector" disabled>
167
+ <option value="">Select experiment...</option>
168
+ </select>
169
+ </div>
170
+ <div class="view-toggle">
171
+ <button id="view-battles" class="view-btn active" title="View Battles">
172
+ <span class="view-icon">⚔️</span>
173
+ <span>Battles</span>
174
+ </button>
175
+ <button id="view-prompts" class="view-btn" title="View by Prompt">
176
+ <span class="view-icon">📝</span>
177
+ <span>Prompts</span>
178
+ </button>
179
+ </div>
180
+ <div class="search-box">
181
+ <input type="text" id="search-input" class="search-input" placeholder="Search prompts..." title="Search by instruction, task type, or metadata">
182
+ <button id="search-btn" class="btn btn-small search-btn" title="Search">🔍</button>
183
+ <button id="clear-search-btn" class="btn btn-small clear-search-btn" title="Clear search" style="display: none;">✕</button>
184
+ </div>
185
+ </div>
186
+
187
+ <div class="content-header">
188
+ <div id="pagination-info" class="pagination-info"></div>
189
+ <div class="pagination-controls">
190
+ <button id="first-page" class="btn btn-small" disabled>&laquo;</button>
191
+ <button id="prev-page" class="btn btn-small" disabled>&lt;</button>
192
+ <div id="page-numbers" class="page-numbers">
193
+ <!-- Populated by JavaScript -->
194
+ </div>
195
+ <button id="next-page" class="btn btn-small" disabled>&gt;</button>
196
+ <button id="last-page" class="btn btn-small" disabled>&raquo;</button>
197
+ <div class="page-jump">
198
+ <input type="number" id="page-input" class="page-input" min="1" placeholder="Page">
199
+ <button id="page-go" class="btn btn-small">Go</button>
200
+ </div>
201
+ </div>
202
+ </div>
203
+
204
+ <div id="battle-list" class="battle-list">
205
+ <div class="empty-state">
206
+ <p>Select a subset and experiment to view battles</p>
207
+ </div>
208
+ </div>
209
+
210
+ <!-- Prompts View Container -->
211
+ <div id="prompts-list" class="prompts-list" style="display: none;">
212
+ <div class="empty-state">
213
+ <p>Select a subset and experiment to view prompts</p>
214
+ </div>
215
+ </div>
216
+
217
+ <div class="content-footer">
218
+ <div class="pagination-controls">
219
+ <button id="first-page-bottom" class="btn btn-small" disabled>&laquo;</button>
220
+ <button id="prev-page-bottom" class="btn btn-small" disabled>&lt;</button>
221
+ <div id="page-numbers-bottom" class="page-numbers">
222
+ <!-- Populated by JavaScript -->
223
+ </div>
224
+ <button id="next-page-bottom" class="btn btn-small" disabled>&gt;</button>
225
+ <button id="last-page-bottom" class="btn btn-small" disabled>&raquo;</button>
226
+ <div class="page-jump">
227
+ <input type="number" id="page-input-bottom" class="page-input" min="1" placeholder="Page">
228
+ <button id="page-go-bottom" class="btn btn-small">Go</button>
229
+ </div>
230
+ </div>
231
+ </div>
232
+ </main>
233
+ </div> <!-- End of gallery-page -->
234
+ </div>
235
+
236
+ <!-- Detail Modal -->
237
+ <div id="detail-modal" class="modal hidden">
238
+ <div class="modal-backdrop"></div>
239
+ <div class="modal-content">
240
+ <button class="modal-close">&times;</button>
241
+ <div id="detail-content">
242
+ <!-- Filled by JavaScript -->
243
+ </div>
244
+ </div>
245
+ </div>
246
+
247
+ <!-- Favorites Modal -->
248
+ <div id="favorites-modal" class="modal hidden">
249
+ <div class="modal-backdrop"></div>
250
+ <div class="modal-content modal-content-wide">
251
+ <button class="modal-close">&times;</button>
252
+ <div class="favorites-modal-header">
253
+ <h2>Favorite Prompts</h2>
254
+ <button id="clear-all-favorites" class="btn btn-secondary btn-small">Clear All</button>
255
+ </div>
256
+ <div id="favorites-scrollable" class="favorites-scrollable">
257
+ <!-- Favorites Model Filter - horizontal layout -->
258
+ <div class="favorites-model-filter" id="favorites-model-filter-group">
259
+ <label>Filter Models:</label>
260
+ <div id="favorites-model-checkboxes" class="checkbox-group-horizontal">
261
+ <!-- Populated by JavaScript -->
262
+ </div>
263
+ <div class="filter-controls-row">
264
+ <div class="checkbox-actions-inline">
265
+ <button id="favorites-select-all-models" class="btn btn-small">Select All</button>
266
+ <button id="favorites-clear-all-models" class="btn btn-small">Clear All</button>
267
+ <button id="favorites-apply-model-filter" class="btn btn-primary btn-small">Apply Filter</button>
268
+ </div>
269
+ <div class="stats-scope-toggle">
270
+ <label class="toggle-label">
271
+ <input type="checkbox" id="favorites-stats-scope-all">
272
+ <span class="toggle-text">Win rate includes all opponents</span>
273
+ </label>
274
+ </div>
275
+ </div>
276
+ </div>
277
+ <div id="favorites-content">
278
+ <!-- Filled by JavaScript -->
279
+ </div>
280
+ </div>
281
+ </div>
282
+ </div>
283
+
284
+ <!-- Image Lightbox -->
285
+ <div id="lightbox" class="lightbox">
286
+ <button class="lightbox-close">&times;</button>
287
+ <img id="lightbox-img" src="" alt="Enlarged image">
288
+ <div id="lightbox-label" class="lightbox-label"></div>
289
+ </div>
290
+
291
+ <!-- ELO Leaderboard Modal -->
292
+ <div id="leaderboard-modal" class="modal hidden">
293
+ <div class="modal-backdrop"></div>
294
+ <div class="modal-content modal-content-wide">
295
+ <button class="modal-close">&times;</button>
296
+ <div class="leaderboard-modal-header">
297
+ <h2>ELO Leaderboard</h2>
298
+ <span id="leaderboard-subset-name" class="subset-badge"></span>
299
+ </div>
300
+ <div id="leaderboard-content">
301
+ <!-- Filled by JavaScript -->
302
+ </div>
303
+ </div>
304
+ </div>
305
+
306
+ <!-- Model Stats Modal -->
307
+ <div id="model-stats-modal" class="modal hidden">
308
+ <div class="modal-backdrop"></div>
309
+ <div class="modal-content modal-content-wide">
310
+ <button class="modal-close">&times;</button>
311
+ <div id="model-stats-content">
312
+ <!-- Filled by JavaScript -->
313
+ </div>
314
+ </div>
315
+ </div>
316
+
317
+ <!-- Win Rate Matrix Modal -->
318
+ <div id="matrix-modal" class="modal hidden">
319
+ <div class="modal-backdrop"></div>
320
+ <div class="modal-content modal-content-wide">
321
+ <button class="modal-close">&times;</button>
322
+ <div class="matrix-modal-header">
323
+ <h2>Win Rate Matrix</h2>
324
+ <span id="matrix-subset-name" class="subset-badge"></span>
325
+ </div>
326
+ <div id="matrix-content" class="matrix-content">
327
+ <!-- Filled by JavaScript -->
328
+ </div>
329
+ </div>
330
+ </div>
331
+
332
+ <!-- ELO History Modal -->
333
+ <div id="elo-history-modal" class="modal hidden">
334
+ <div class="modal-backdrop"></div>
335
+ <div class="modal-content modal-content-wide">
336
+ <button class="modal-close">&times;</button>
337
+ <div class="elo-history-header">
338
+ <h2>ELO History</h2>
339
+ <div class="elo-history-controls">
340
+ <label for="elo-history-granularity">Group by:</label>
341
+ <select id="elo-history-granularity" class="selector">
342
+ <option value="experiment" selected>Experiment</option>
343
+ <option value="day">Day</option>
344
+ <option value="week">Week</option>
345
+ </select>
346
+ </div>
347
+ </div>
348
+ <div id="elo-history-content" class="elo-history-content">
349
+ <!-- Filled by JavaScript - SVG chart -->
350
+ </div>
351
+ <div id="elo-history-legend" class="elo-history-legend">
352
+ <!-- Filled by JavaScript -->
353
+ </div>
354
+ </div>
355
+ </div>
356
+
357
+ <!-- ELO by Source Modal -->
358
+ <div id="elo-by-source-modal" class="modal hidden">
359
+ <div class="modal-backdrop"></div>
360
+ <div class="modal-content modal-content-wide">
361
+ <button class="modal-close">&times;</button>
362
+ <div class="elo-by-source-header">
363
+ <h2>ELO Rankings by Prompt Source</h2>
364
+ <span id="elo-by-source-subset-name" class="subset-badge"></span>
365
+ </div>
366
+ <div id="elo-by-source-content" class="elo-by-source-content">
367
+ <!-- Filled by JavaScript -->
368
+ </div>
369
+ </div>
370
+ </div>
371
+
372
+ <!-- Cross-Subset Modal -->
373
+ <div id="cross-subset-modal" class="modal hidden">
374
+ <div class="modal-backdrop"></div>
375
+ <div class="modal-content modal-content-wide">
376
+ <button class="modal-close">&times;</button>
377
+ <div class="cross-subset-modal-header">
378
+ <h2>Cross-Subset ELO Analysis</h2>
379
+ </div>
380
+ <p class="modal-description">Merge battles from multiple subsets to compute combined ELO rankings.</p>
381
+ <div class="cross-subset-content">
382
+ <div class="cross-subset-selection">
383
+ <h4>Select subsets to merge:</h4>
384
+ <div id="cross-subset-checkboxes" class="checkbox-group">
385
+ <!-- Populated by JavaScript -->
386
+ </div>
387
+ <div class="checkbox-actions">
388
+ <button id="cross-subset-select-all" class="btn btn-small">Select All</button>
389
+ <button id="cross-subset-clear-all" class="btn btn-small">Clear All</button>
390
+ </div>
391
+ </div>
392
+ <div class="cross-subset-info">
393
+ <p>Common models (in all selected): <span id="common-model-count">-</span></p>
394
+ <p>Union models (in any selected): <span id="union-model-count">-</span></p>
395
+ <p>Total battles: <span id="total-battles-count">-</span></p>
396
+ </div>
397
+ <div class="cross-subset-options">
398
+ <label>Model scope:</label>
399
+ <label class="radio-label"><input type="radio" name="model-scope" value="all" checked> All models</label>
400
+ <label class="radio-label"><input type="radio" name="model-scope" value="common"> Common only</label>
401
+ </div>
402
+ <button id="calculate-merged-elo" class="btn btn-primary">Calculate Merged ELO</button>
403
+ </div>
404
+ <div id="cross-subset-results" class="cross-subset-results">
405
+ <!-- Filled by JavaScript -->
406
+ </div>
407
+ </div>
408
+ </div>
409
+ </div>
410
+
411
+ <script src="static/app.js"></script>
412
+ </body>
413
+ </html>