stellaathena committed
Commit 81afbdf · verified · 1 parent: c3a5000

Initial commit: MATH & PIQA Backend

README.md CHANGED
@@ -1,12 +1,101 @@
  ---
- title: Math Piqa Backend
- emoji: 📊
- colorFrom: yellow
- colorTo: yellow
+ title: LLM Evaluation Backend
+ emoji: ⚙️
+ colorFrom: purple
+ colorTo: blue
  sdk: gradio
- sdk_version: 6.4.0
+ sdk_version: 4.44.0
  app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # LLM Evaluation Backend
+
+ Automated evaluation backend using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
+
+ ## Features
+
+ - **Automatic evaluation** of submitted models
+ - **Queue management** with status tracking
+ - **Result upload** to HuggingFace datasets
+ - **Dashboard UI** for monitoring
+
+ ## Setup
+
+ 1. Create datasets in your HuggingFace organization (a sketch for doing this programmatically follows this list):
+    - `your-org/requests` - evaluation queue
+    - `your-org/results` - evaluation results
+ 2. Update `src/envs.py` with your configuration
+ 3. Set the `HF_TOKEN` environment variable
+ 4. Deploy with GPU compute for faster evaluations
+
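+ For step 1, the two datasets can also be created from Python. A minimal sketch using `huggingface_hub` (the repo names are placeholders; match them to `QUEUE_REPO` / `RESULTS_REPO` in `src/envs.py`):
+
+ ```python
+ from huggingface_hub import create_repo
+
+ # Create the two dataset repos the backend expects.
+ # "your-org/..." are placeholders for your own organization.
+ for repo_id in ["your-org/requests", "your-org/results"]:
+     create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
+ ```
+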
+ ## Configuration
+
+ Edit `src/envs.py` to configure:
+
+ ```python
+ # Your organization
+ OWNER = "your-org-name"
+
+ # Device ("cpu" or "cuda:0")
+ DEVICE = "cuda:0"
+
+ # Tasks to evaluate (this Space ships with MATH and PIQA)
+ TASKS_HARNESS = [
+     "minerva_math",
+     "piqa",
+ ]
+
+ # Few-shot examples
+ NUM_FEWSHOT = 0
+ ```
+
+ ## Running Locally
+
+ ```bash
+ # Install dependencies
+ pip install -r requirements.txt
+
+ # Set token
+ export HF_TOKEN=your_token
+
+ # Run the backend
+ python main_backend_harness.py
+
+ # Or run the dashboard
+ python app.py
+ ```
+
+ ## Architecture
+
+ ```
+ backend/
+ ├── app.py                      # Gradio dashboard
+ ├── main_backend_harness.py     # Main evaluation loop
+ ├── src/
+ │   ├── envs.py                 # Configuration
+ │   ├── logging.py              # Logging setup
+ │   ├── backend/
+ │   │   ├── manage_requests.py           # Queue management
+ │   │   ├── run_eval_suite_harness.py    # lm-eval integration
+ │   │   └── sort_queue.py                # Priority sorting
+ │   └── display/                # UI utilities
+ ```
+
+ ## Evaluation Flow
+
+ 1. **Sync**: Download pending requests from the Hub
+ 2. **Sort**: Order by priority (FIFO by default)
+ 3. **Evaluate**: Run lm-eval on the model
+ 4. **Upload**: Push results to the results dataset
+ 5. **Update**: Mark the request as FINISHED (an example request file is shown below)
+
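+ For reference, each entry in the request queue is a JSON file. A hypothetical example, with field names taken from `EvalRequest` in `src/backend/manage_requests.py` (the values and the output filename are illustrative; missing fields fall back to defaults):
+
+ ```python
+ import json
+
+ # Illustrative request payload; the backend reads files like this
+ # from the requests dataset.
+ request = {
+     "model": "your-org/your-model",  # Hub model ID (placeholder)
+     "revision": "main",
+     "precision": "float16",
+     "weight_type": "Original",
+     "model_type": "pretrained",
+     "status": "PENDING",
+     "submitted_time": "2025-01-01T00:00:00Z",
+ }
+
+ with open("eval_request.json", "w") as f:
+     json.dump(request, f, indent=2)
+ ```
+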
+ ## Related
+
+ - [Frontend Leaderboard](../frontend/) - Display results
+ - [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) - Evaluation framework
app.py ADDED
@@ -0,0 +1,130 @@
+ """
+ Backend Application - Evaluation Dashboard
+
+ A Gradio UI for monitoring and triggering evaluations.
+ """
+
+ import logging
+
+ import gradio as gr
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ from main_backend_harness import run_auto_eval, sync_data
+ from src.display.css_html_js import dark_mode_gradio_js
+ from src.display.log_visualizer import log_file_to_html_string
+ from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
+ from src.logging import configure_root_logger, log_file, setup_logger
+
+ # Configure logging (quiet noisy third-party loggers first)
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+ logging.getLogger("numexpr").setLevel(logging.WARNING)
+ logging.getLogger("absl").setLevel(logging.WARNING)
+
+ configure_root_logger()
+
+ logger = setup_logger(__name__)
+
+
+ # Markdown content
+ intro_md = """
+ # Evaluation Backend Dashboard
+
+ This dashboard monitors and controls the automatic evaluation pipeline.
+
+ Evaluations are run using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
+ """
+
+ links_md = f"""
+ # Important Links
+
+ | Description | Link |
+ |-------------|------|
+ | Leaderboard | [View Leaderboard](https://huggingface.co/spaces/{REPO_ID.replace('/backend', '/leaderboard')}) |
+ | Request Queue | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
+ | Results Dataset | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
+ """
+
+
+ def trigger_auto_eval():
+     """Trigger an evaluation run."""
+     logger.info("Manual evaluation triggered")
+     sync_data()
+     run_auto_eval()
+
+
+ def get_log_html(reverse: bool = True) -> str:
+     """Get the log file as HTML, optionally newest-first."""
+     return log_file_to_html_string(log_file, reverse=reverse)
+
+
+ # Build the Gradio interface
+ with gr.Blocks(js=dark_mode_gradio_js) as demo:
+     gr.Markdown(intro_md)
+
+     with gr.Tab("Dashboard"):
+         # Log display, refreshed every second
+         output_html = gr.HTML(lambda: get_log_html(True), every=1)
+
+         with gr.Row():
+             download_button = gr.DownloadButton("Download Log File", value=str(log_file))
+
+         with gr.Accordion("Log Settings", open=False):
+             reverse_checkbox = gr.Checkbox(
+                 label="Show newest first",
+                 value=True,
+             )
+             reverse_checkbox.change(
+                 fn=get_log_html,
+                 inputs=[reverse_checkbox],
+                 outputs=[output_html],
+             )
+
+         # Manual trigger button
+         with gr.Row():
+             trigger_button = gr.Button("Run Evaluation Now", variant="primary")
+             trigger_button.click(fn=trigger_auto_eval, inputs=[], outputs=[])
+
+         gr.Markdown(links_md)
+
+     with gr.Tab("Configuration"):
+         gr.Markdown("""
+ ## Current Configuration
+
+ The backend is configured to evaluate models on the following tasks:
+
+ | Task | Description |
+ |------|-------------|
+ | MATH (`minerva_math`) | Competition math problems |
+ | PIQA (`piqa`) | Physical commonsense reasoning |
+
+ ### Settings
+
+ - **Refresh Rate**: 10 minutes
+ - **Device**: GPU (cuda:0) when available
+ - **Batch Size**: Auto-detected
+
+ To modify tasks or settings, edit `src/envs.py`.
+ """)
+
+
+ # Background scheduler for automatic evaluations
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(sync_data, "interval", seconds=REFRESH_RATE, id="sync_data")
+ scheduler.add_job(run_auto_eval, "interval", seconds=REFRESH_RATE, id="auto_eval")
+ scheduler.start()
+
+ logger.info(f"Scheduler started. Refresh rate: {REFRESH_RATE} seconds")
+
+ # Launch
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=40).launch(
+         server_name="0.0.0.0",
+         show_error=True,
+         server_port=7860,
+     )
main_backend_harness.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Main backend script for running evaluations with lm-evaluation-harness.
+
+ This script:
+ 1. Downloads pending evaluation requests from the Hub
+ 2. Processes them one at a time
+ 3. Runs evaluations using lm-eval
+ 4. Uploads results back to the Hub
+ """
+
+ import logging
+ import pprint
+
+ from huggingface_hub import snapshot_download
+
+ from src.backend.manage_requests import (
+     FAILED_STATUS,
+     FINISHED_STATUS,
+     PENDING_STATUS,
+     RUNNING_STATUS,
+     check_completed_evals,
+     get_eval_requests,
+     set_eval_request,
+ )
+ from src.backend.run_eval_suite_harness import run_evaluation
+ from src.backend.sort_queue import sort_models_by_priority
+ from src.envs import (
+     API,
+     DEVICE,
+     EVAL_REQUESTS_PATH_BACKEND,
+     EVAL_RESULTS_PATH_BACKEND,
+     LIMIT,
+     NUM_FEWSHOT,
+     QUEUE_REPO,
+     RESULTS_REPO,
+     TASKS_HARNESS,
+     TOKEN,
+ )
+ from src.logging import setup_logger
+
+ # Suppress noisy loggers
+ logging.getLogger("openai").setLevel(logging.WARNING)
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+
+ logger = setup_logger(__name__)
+ pp = pprint.PrettyPrinter(width=80)
+
+
+ def sync_data():
+     """Download the latest queue and results data from the Hub."""
+     logger.info("Syncing data from Hub...")
+
+     # Download results
+     snapshot_download(
+         repo_id=RESULTS_REPO,
+         revision="main",
+         local_dir=EVAL_RESULTS_PATH_BACKEND,
+         repo_type="dataset",
+         max_workers=60,
+         token=TOKEN,
+     )
+
+     # Download requests
+     snapshot_download(
+         repo_id=QUEUE_REPO,
+         revision="main",
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         repo_type="dataset",
+         max_workers=60,
+         token=TOKEN,
+     )
+
+     logger.info("Data sync complete.")
+
+
+ def run_auto_eval():
+     """Process pending evaluation requests."""
+     logger.info("=" * 60)
+     logger.info("Starting auto evaluation run")
+     logger.info("=" * 60)
+
+     # Move RUNNING requests whose results have appeared to FINISHED
+     check_completed_evals(
+         api=API,
+         checked_status=RUNNING_STATUS,
+         completed_status=FINISHED_STATUS,
+         failed_status=FAILED_STATUS,
+         hf_repo=QUEUE_REPO,
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         hf_repo_results=RESULTS_REPO,
+         local_dir_results=EVAL_RESULTS_PATH_BACKEND,
+     )
+
+     # Get pending evaluation requests
+     eval_requests = get_eval_requests(
+         job_status=[PENDING_STATUS],
+         hf_repo=QUEUE_REPO,
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+     )
+
+     logger.info(f"Found {len(eval_requests)} pending requests")
+
+     if not eval_requests:
+         logger.info("No pending evaluations.")
+         return
+
+     # Sort by priority (FIFO by default)
+     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+
+     # Process the first request
+     eval_request = eval_requests[0]
+
+     logger.info("\n" + "-" * 40)
+     logger.info("Processing evaluation request:")
+     logger.info(pp.pformat(vars(eval_request)))
+     logger.info("-" * 40)
+
+     # Update status to RUNNING
+     set_eval_request(
+         api=API,
+         eval_request=eval_request,
+         set_to_status=RUNNING_STATUS,
+         hf_repo=QUEUE_REPO,
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+     )
+
+     try:
+         # Run the evaluation
+         run_evaluation(
+             eval_request=eval_request,
+             task_names=TASKS_HARNESS,
+             num_fewshot=NUM_FEWSHOT,
+             local_dir=EVAL_RESULTS_PATH_BACKEND,
+             results_repo=RESULTS_REPO,
+             batch_size="auto",
+             device=DEVICE,
+             limit=LIMIT,
+         )
+
+         # Mark as finished
+         set_eval_request(
+             api=API,
+             eval_request=eval_request,
+             set_to_status=FINISHED_STATUS,
+             hf_repo=QUEUE_REPO,
+             local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         )
+         logger.info(f"Evaluation completed for {eval_request.model}")
+
+     except Exception as e:
+         logger.error(f"Evaluation failed for {eval_request.model}: {e}")
+
+         # Mark as failed
+         set_eval_request(
+             api=API,
+             eval_request=eval_request,
+             set_to_status=FAILED_STATUS,
+             hf_repo=QUEUE_REPO,
+             local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         )
+
+
+ if __name__ == "__main__":
+     # Sync data and run a single evaluation pass
+     sync_data()
+     run_auto_eval()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio>=4.0.0
+ pandas>=2.0.0
+ huggingface_hub>=0.20.0
+ transformers>=4.36.0
+ apscheduler>=3.10.0
+ lm-eval>=0.4.0
+ accelerate>=0.25.0
+ torch>=2.0.0
src/__init__.py ADDED
@@ -0,0 +1 @@
+ # Backend source package
src/backend/__init__.py ADDED
@@ -0,0 +1 @@
+ # Backend evaluation package
src/backend/manage_requests.py ADDED
@@ -0,0 +1,192 @@
+ """Manage evaluation requests and their status."""
+
+ import json
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import List
+
+ from huggingface_hub import HfApi
+
+
+ # Status constants
+ PENDING_STATUS = "PENDING"
+ RUNNING_STATUS = "RUNNING"
+ FINISHED_STATUS = "FINISHED"
+ FAILED_STATUS = "FAILED"
+
+
+ @dataclass
+ class EvalRequest:
+     """Represents an evaluation request."""
+     model: str
+     revision: str
+     precision: str
+     weight_type: str
+     model_type: str
+     status: str
+     submitted_time: str
+     base_model: str = ""
+     likes: int = 0
+     params: float = 0.0
+     license: str = ""
+     private: bool = False
+     json_filepath: str = ""
+
+     def get_model_args(self) -> str:
+         """Get the model arguments string for lm-eval."""
+         args = f"pretrained={self.model}"
+
+         if self.revision and self.revision != "main":
+             args += f",revision={self.revision}"
+
+         if self.precision:
+             args += f",dtype={self.precision}"
+
+         # Allow models that ship custom code on the Hub.
+         # Note: this executes remote code, so only evaluate trusted submissions.
+         args += ",trust_remote_code=True"
+
+         return args
+
+
+ def get_eval_requests(
+     job_status: List[str],
+     hf_repo: str,
+     local_dir: str,
+ ) -> List[EvalRequest]:
+     """
+     Load evaluation requests with the specified status.
+
+     Args:
+         job_status: List of status values to filter by
+         hf_repo: HuggingFace dataset repo ID
+         local_dir: Local directory with cached requests
+
+     Returns:
+         List of EvalRequest objects
+     """
+     requests = []
+     requests_dir = Path(local_dir)
+
+     if not requests_dir.exists():
+         return requests
+
+     for json_file in requests_dir.rglob("*.json"):
+         try:
+             with open(json_file, "r") as f:
+                 data = json.load(f)
+
+             if data.get("status", PENDING_STATUS) in job_status:
+                 request = EvalRequest(
+                     model=data.get("model", ""),
+                     revision=data.get("revision", "main"),
+                     precision=data.get("precision", "float16"),
+                     weight_type=data.get("weight_type", "Original"),
+                     model_type=data.get("model_type", ""),
+                     status=data.get("status", PENDING_STATUS),
+                     submitted_time=data.get("submitted_time", ""),
+                     base_model=data.get("base_model", ""),
+                     likes=data.get("likes", 0),
+                     params=data.get("params", 0.0),
+                     license=data.get("license", ""),
+                     private=data.get("private", False),
+                     json_filepath=str(json_file),
+                 )
+                 requests.append(request)
+
+         except (json.JSONDecodeError, OSError) as e:
+             print(f"Error loading {json_file}: {e}")
+             continue
+
+     return requests
+
+
+ def set_eval_request(
+     api: HfApi,
+     eval_request: EvalRequest,
+     set_to_status: str,
+     hf_repo: str,
+     local_dir: str,
+ ) -> None:
+     """
+     Update the status of an evaluation request.
+
+     Args:
+         api: HuggingFace API client
+         eval_request: The request to update
+         set_to_status: New status value
+         hf_repo: HuggingFace dataset repo ID
+         local_dir: Local directory with cached requests
+     """
+     json_filepath = Path(eval_request.json_filepath)
+
+     if not json_filepath.exists():
+         print(f"Request file not found: {json_filepath}")
+         return
+
+     # Load current data
+     with open(json_filepath, "r") as f:
+         data = json.load(f)
+
+     # Update status
+     data["status"] = set_to_status
+
+     # Save locally
+     with open(json_filepath, "w") as f:
+         json.dump(data, f, indent=2)
+
+     # Upload to Hub
+     try:
+         repo_path = str(json_filepath).replace(local_dir + "/", "")
+         api.upload_file(
+             path_or_fileobj=str(json_filepath),
+             path_in_repo=repo_path,
+             repo_id=hf_repo,
+             repo_type="dataset",
+             commit_message=f"Update status to {set_to_status} for {eval_request.model}",
+         )
+     except Exception as e:
+         print(f"Failed to upload status update: {e}")
+
+
+ def check_completed_evals(
+     api: HfApi,
+     checked_status: str,
+     completed_status: str,
+     failed_status: str,
+     hf_repo: str,
+     local_dir: str,
+     hf_repo_results: str,
+     local_dir_results: str,
+ ) -> None:
+     """
+     Check for completed evaluations and update their status.
+
+     Args:
+         api: HuggingFace API client
+         checked_status: Status to check (e.g., RUNNING)
+         completed_status: Status to set if results exist
+         failed_status: Status to set if evaluation failed (currently only
+             applied by the main loop's exception handler)
+         hf_repo: Requests dataset repo ID
+         local_dir: Local requests directory
+         hf_repo_results: Results dataset repo ID
+         local_dir_results: Local results directory
+     """
+     running_requests = get_eval_requests([checked_status], hf_repo, local_dir)
+
+     for request in running_requests:
+         # Check if results exist
+         model_results_dir = Path(local_dir_results) / request.model
+
+         if model_results_dir.exists():
+             result_files = list(model_results_dir.rglob("results_*.json"))
+             if result_files:
+                 # Results found, mark as completed
+                 set_eval_request(
+                     api=api,
+                     eval_request=request,
+                     set_to_status=completed_status,
+                     hf_repo=hf_repo,
+                     local_dir=local_dir,
+                 )
+                 print(f"Marked {request.model} as {completed_status}")
+         # Requests without results remain in `checked_status`.
src/backend/run_eval_suite_harness.py ADDED
@@ -0,0 +1,113 @@
+ """Run evaluations using lm-evaluation-harness."""
+
+ import json
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Optional, Union
+
+ from lm_eval import evaluator, utils
+ from lm_eval.tasks import TaskManager
+
+ from src.backend.manage_requests import EvalRequest
+ from src.envs import API
+
+ logger = logging.getLogger(__name__)
+
+
+ def run_evaluation(
+     eval_request: EvalRequest,
+     task_names: List[str],
+     num_fewshot: int,
+     batch_size: Union[int, str],
+     device: str,
+     local_dir: str,
+     results_repo: str,
+     limit: Optional[int] = None,
+ ) -> dict:
+     """
+     Run evaluation for a model using lm-evaluation-harness.
+
+     Args:
+         eval_request: The evaluation request with model info
+         task_names: List of task names to evaluate
+         num_fewshot: Number of few-shot examples
+         batch_size: Batch size (int or "auto")
+         device: Device to run on ("cpu" or "cuda:0")
+         local_dir: Directory to save results locally
+         results_repo: HuggingFace dataset repo for results
+         limit: Limit samples per task (for testing only)
+
+     Returns:
+         Evaluation results dictionary
+     """
+     if limit:
+         logger.warning(
+             "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. "
+             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+         )
+
+     # Initialize task manager and expand task name patterns
+     task_manager = TaskManager()
+     all_tasks = task_manager.all_tasks
+     task_names = utils.pattern_match(task_names, all_tasks)
+
+     logger.info(f"Model: {eval_request.model}")
+     logger.info(f"Selected Tasks: {task_names}")
+     logger.info(f"Few-shot: {num_fewshot}")
+     logger.info(f"Device: {device}")
+
+     # Run evaluation
+     try:
+         results = evaluator.simple_evaluate(
+             model="hf",
+             model_args=eval_request.get_model_args(),
+             tasks=task_names,
+             num_fewshot=num_fewshot,
+             batch_size=batch_size,
+             device=device,
+             limit=limit,
+             write_out=True,
+             log_samples=True,  # Save per-sample results
+         )
+     except Exception as e:
+         logger.error(f"Evaluation failed: {e}")
+         raise
+
+     # Add model metadata to results
+     results["config"]["model_dtype"] = eval_request.precision
+     results["config"]["model_name"] = eval_request.model
+     results["config"]["model_sha"] = eval_request.revision
+     results["config"]["model_type"] = eval_request.model_type
+
+     # Log results summary
+     logger.info("\n" + "=" * 60)
+     logger.info("EVALUATION RESULTS")
+     logger.info("=" * 60)
+     logger.info(evaluator.make_table(results))
+
+     # Save results locally
+     timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")
+     results_path = Path(local_dir) / eval_request.model / f"results_{timestamp}.json"
+     results_path.parent.mkdir(exist_ok=True, parents=True)
+
+     dumped = json.dumps(results, indent=2, default=str)
+     results_path.write_text(dumped)
+
+     logger.info(f"Results saved to: {results_path}")
+
+     # Upload to HuggingFace Hub
+     try:
+         repo_path = results_path.relative_to(local_dir).as_posix()
+         API.upload_file(
+             path_or_fileobj=str(results_path),
+             path_in_repo=repo_path,
+             repo_id=results_repo,
+             repo_type="dataset",
+             commit_message=f"Add evaluation results for {eval_request.model}",
+         )
+         logger.info(f"Results uploaded to {results_repo}")
+     except Exception as e:
+         logger.error(f"Failed to upload results: {e}")
+
+     return results
src/backend/sort_queue.py ADDED
@@ -0,0 +1,52 @@
+ """Sort evaluation queue by priority."""
+
+ from typing import List
+
+ from huggingface_hub import HfApi
+
+ from src.backend.manage_requests import EvalRequest
+
+
+ def sort_models_by_priority(api: HfApi, models: List[EvalRequest]) -> List[EvalRequest]:
+     """
+     Sort models by priority for evaluation.
+
+     Current strategy: FIFO (first in, first out) based on submission time.
+     Can be extended to prioritize by model popularity, size, etc.
+
+     Args:
+         api: HuggingFace API client (unused; kept for a uniform interface)
+         models: List of evaluation requests
+
+     Returns:
+         Sorted list of evaluation requests
+     """
+     # Sort by submission time (oldest first)
+     return sorted(models, key=lambda x: x.submitted_time)
+
+
+ def sort_models_by_likes(api: HfApi, models: List[EvalRequest]) -> List[EvalRequest]:
+     """
+     Sort models by Hub likes (most popular first).
+
+     Args:
+         api: HuggingFace API client (unused; kept for a uniform interface)
+         models: List of evaluation requests
+
+     Returns:
+         Sorted list of evaluation requests
+     """
+     return sorted(models, key=lambda x: x.likes, reverse=True)
+
+
+ def sort_models_by_size(models: List[EvalRequest], ascending: bool = True) -> List[EvalRequest]:
+     """
+     Sort models by parameter count.
+
+     Args:
+         models: List of evaluation requests
+         ascending: If True, smallest models first
+
+     Returns:
+         Sorted list of evaluation requests
+     """
+     return sorted(models, key=lambda x: x.params, reverse=not ascending)
src/display/__init__.py ADDED
@@ -0,0 +1 @@
+ # Display utilities
src/display/css_html_js.py ADDED
@@ -0,0 +1,11 @@
+ """CSS and JavaScript for the backend UI."""
+
+ dark_mode_gradio_js = """
+ function refresh() {
+     const url = new URL(window.location);
+     if (url.searchParams.get('__theme') !== 'dark') {
+         url.searchParams.set('__theme', 'dark');
+         window.location.href = url.href;
+     }
+ }
+ """
src/display/log_visualizer.py ADDED
@@ -0,0 +1,45 @@
+ """Log visualization utilities for the backend UI."""
+
+ from pathlib import Path
+ from typing import Union
+
+ from src.envs import NUM_LINES_VISUALIZE
+ from src.logging import log_file
+
+
+ def log_file_to_html_string(log_path: Union[str, Path] = log_file, reverse: bool = True) -> str:
+     """
+     Convert log file contents to an HTML string for display.
+
+     Args:
+         log_path: Path to the log file
+         reverse: If True, show newest entries first
+
+     Returns:
+         HTML-formatted log contents
+     """
+     log_path = Path(log_path)
+
+     if not log_path.exists():
+         return "<pre>No logs yet.</pre>"
+
+     try:
+         with open(log_path, "r") as f:
+             lines = f.readlines()
+
+         # Limit number of lines
+         lines = lines[-NUM_LINES_VISUALIZE:]
+
+         if reverse:
+             lines = lines[::-1]
+
+         # Escape HTML and wrap in a pre tag
+         content = "".join(lines)
+         content = content.replace("&", "&amp;")
+         content = content.replace("<", "&lt;")
+         content = content.replace(">", "&gt;")
+
+         return f"<pre style='font-size: 12px; white-space: pre-wrap;'>{content}</pre>"
+
+     except Exception as e:
+         return f"<pre>Error reading log file: {e}</pre>"
src/envs.py ADDED
@@ -0,0 +1,49 @@
+ """Backend environment configuration."""
+
+ import os
+
+ from huggingface_hub import HfApi
+
+ # ----------------------------------
+ # Configuration
+ # ----------------------------------
+
+ TOKEN = os.environ.get("HF_TOKEN")  # Read/write token for your org
+ OWNER = "stellaathena"  # HuggingFace username/org
+
+ # Device configuration
+ DEVICE = os.environ.get("DEVICE", "cuda:0")  # "cpu" or "cuda:0"
+ LIMIT = None  # Set to an int for testing (e.g., 20); None for full evaluation
+
+ # Evaluation settings
+ NUM_FEWSHOT = 0  # Zero-shot evaluation
+
+ # Tasks to evaluate (lm-evaluation-harness task names)
+ TASKS_HARNESS = [
+     "minerva_math",
+     "piqa",
+ ]
+
+ # ----------------------------------
+ # Derived Configuration
+ # ----------------------------------
+
+ REPO_ID = f"{OWNER}/math-piqa-backend"
+ QUEUE_REPO = f"{OWNER}/math-piqa-requests"
+ RESULTS_REPO = f"{OWNER}/math-piqa-results"
+
+ # Cache paths
+ CACHE_PATH = os.getenv("HF_HOME", ".")
+
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+
+ # Refresh rate (seconds)
+ REFRESH_RATE = 10 * 60  # 10 minutes
+
+ # Log visualization
+ NUM_LINES_VISUALIZE = 300
+
+ # API client
+ API = HfApi(token=TOKEN)
src/logging.py ADDED
@@ -0,0 +1,35 @@
+ """Logging configuration for the backend."""
+
+ import logging
+ from pathlib import Path
+
+ # Log file location
+ log_file = Path("evaluation.log")
+
+
+ def configure_root_logger():
+     """Configure the root logger with file and console handlers."""
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+         handlers=[
+             logging.FileHandler(log_file),
+             logging.StreamHandler(),
+         ],
+     )
+
+
+ def setup_logger(name: str) -> logging.Logger:
+     """
+     Set up a logger with the given name.
+
+     Args:
+         name: Logger name (typically __name__)
+
+     Returns:
+         Configured logger
+     """
+     logger = logging.getLogger(name)
+     logger.setLevel(logging.INFO)
+     return logger