Initial commit: MATH & PIQA Backend
Files changed:
- README.md +95 -6
- app.py +130 -0
- main_backend_harness.py +165 -0
- requirements.txt +8 -0
- src/__init__.py +1 -0
- src/backend/__init__.py +1 -0
- src/backend/manage_requests.py +192 -0
- src/backend/run_eval_suite_harness.py +113 -0
- src/backend/sort_queue.py +52 -0
- src/display/__init__.py +1 -0
- src/display/css_html_js.py +11 -0
- src/display/log_visualizer.py +45 -0
- src/envs.py +49 -0
- src/logging.py +35 -0
README.md
CHANGED
@@ -1,12 +1,101 @@
---
title: LLM Evaluation Backend
emoji: ⚙️
colorFrom: purple
colorTo: blue
sdk: gradio
sdk_version: 4.44.0
app_file: app.py
pinned: false
license: apache-2.0
---

# LLM Evaluation Backend

Automated evaluation backend using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).

## Features

- **Automatic evaluation** of submitted models
- **Queue management** with status tracking
- **Result upload** to HuggingFace datasets
- **Dashboard UI** for monitoring

## Setup

1. Create datasets in your HuggingFace organization:
   - `your-org/requests` - evaluation queue
   - `your-org/results` - evaluation results
2. Update `src/envs.py` with your configuration
3. Set the `HF_TOKEN` environment variable
4. Deploy with GPU compute for faster evaluations

## Configuration

Edit `src/envs.py` to configure:

```python
# Your organization
OWNER = "your-org-name"

# Device (cpu or cuda:0)
DEVICE = "cuda:0"

# Tasks to evaluate
TASKS_HARNESS = [
    "mmlu",
    "hellaswag",
    "arc_challenge",
    "winogrande",
    "gsm8k",
    "truthfulqa_mc2",
]

# Few-shot examples
NUM_FEWSHOT = 0
```

## Running Locally

```bash
# Install dependencies
pip install -r requirements.txt

# Set token
export HF_TOKEN=your_token

# Run the backend
python main_backend_harness.py

# Or run the dashboard
python app.py
```

## Architecture

```
backend/
├── app.py                         # Gradio dashboard
├── main_backend_harness.py        # Main evaluation loop
├── src/
│   ├── envs.py                    # Configuration
│   ├── logging.py                 # Logging setup
│   ├── backend/
│   │   ├── manage_requests.py           # Queue management
│   │   ├── run_eval_suite_harness.py    # lm-eval integration
│   │   └── sort_queue.py                # Priority sorting
│   └── display/                   # UI utilities
```

## Evaluation Flow

1. **Sync**: Download pending requests from the Hub
2. **Sort**: Order by priority (FIFO by default)
3. **Evaluate**: Run lm-eval on the model
4. **Upload**: Push results to the results dataset
5. **Update**: Mark the request as FINISHED

## Related

- [Frontend Leaderboard](../frontend/) - Displays results
- [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) - Evaluation framework
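The Setup and Evaluation Flow sections above assume that request JSON files land in the queue dataset. As a rough sketch (the exact filename convention is not pinned down by this commit), a pending request whose fields mirror the `EvalRequest` dataclass in `src/backend/manage_requests.py` could be written like this:

```python
# Sketch of a pending request file for the queue dataset. Field names follow the
# EvalRequest dataclass in src/backend/manage_requests.py; the path, model ID and
# values below are hypothetical examples, not part of this commit.
import json
from pathlib import Path

request = {
    "model": "your-org/your-model",            # Hub model ID to evaluate
    "revision": "main",
    "precision": "float16",
    "weight_type": "Original",
    "model_type": "pretrained",
    "status": "PENDING",                       # picked up by the backend on its next run
    "submitted_time": "2024-01-01T00:00:00Z",
    "likes": 0,
    "params": 7.0,
    "license": "apache-2.0",
    "private": False,
}

path = Path("eval-queue/your-org/your-model_eval_request.json")  # assumed layout
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(request, indent=2))
```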
app.py
ADDED
@@ -0,0 +1,130 @@
"""
Backend Application - Evaluation Dashboard

A Gradio UI for monitoring and triggering evaluations.
"""

import logging
from functools import partial

import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

from main_backend_harness import run_auto_eval, sync_data
from src.display.css_html_js import dark_mode_gradio_js
from src.display.log_visualizer import log_file_to_html_string
from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
from src.logging import configure_root_logger, log_file, setup_logger

# Configure logging
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("numexpr").setLevel(logging.WARNING)
logging.getLogger("absl").setLevel(logging.WARNING)

configure_root_logger()
logging.basicConfig(level=logging.INFO)

logger = setup_logger(__name__)


# Markdown content
intro_md = """
# Evaluation Backend Dashboard

This dashboard monitors and controls the automatic evaluation pipeline.

Evaluations are run using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
"""

links_md = f"""
# Important Links

| Description | Link |
|-------------|------|
| Leaderboard | [View Leaderboard](https://huggingface.co/spaces/{REPO_ID.replace('/backend', '/leaderboard')}) |
| Request Queue | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
| Results Dataset | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
"""


def trigger_auto_eval():
    """Trigger an evaluation run."""
    logger.info("Manual evaluation triggered")
    sync_data()
    run_auto_eval()


def get_log_html(reverse: bool = True) -> str:
    """Get log file as HTML with optional reverse ordering."""
    return log_file_to_html_string(log_file, reverse=reverse)


# Build the Gradio interface
with gr.Blocks(js=dark_mode_gradio_js) as demo:
    gr.Markdown(intro_md)

    with gr.Tab("Dashboard"):
        # Log display
        output_html = gr.HTML(lambda: get_log_html(True), every=1)

        with gr.Row():
            download_button = gr.DownloadButton("Download Log File", value=str(log_file))

        with gr.Accordion("Log Settings", open=False):
            reverse_checkbox = gr.Checkbox(
                label="Show newest first",
                value=True,
            )
            reverse_checkbox.change(
                fn=get_log_html,
                inputs=[reverse_checkbox],
                outputs=[output_html],
            )

        # Manual trigger button
        with gr.Row():
            trigger_button = gr.Button("Run Evaluation Now", variant="primary")
            trigger_button.click(fn=trigger_auto_eval, inputs=[], outputs=[])

        gr.Markdown(links_md)

    with gr.Tab("Configuration"):
        gr.Markdown("""
## Current Configuration

The backend is configured to evaluate models on the following tasks:

| Task | Description |
|------|-------------|
| MMLU | Massive Multitask Language Understanding |
| HellaSwag | Commonsense reasoning |
| ARC-Challenge | Advanced reasoning |
| WinoGrande | Pronoun resolution |
| GSM8K | Math word problems |
| TruthfulQA | Truthfulness evaluation |

### Settings

- **Refresh Rate**: 10 minutes
- **Device**: GPU (cuda:0) when available
- **Batch Size**: Auto-detected

To modify tasks or settings, edit `src/envs.py`.
""")


# Background scheduler for automatic evaluations
scheduler = BackgroundScheduler()
scheduler.add_job(sync_data, "interval", seconds=REFRESH_RATE, id="sync_data")
scheduler.add_job(run_auto_eval, "interval", seconds=REFRESH_RATE, id="auto_eval")
scheduler.start()

logger.info(f"Scheduler started. Refresh rate: {REFRESH_RATE} seconds")

# Launch
if __name__ == "__main__":
    demo.queue(default_concurrency_limit=40).launch(
        server_name="0.0.0.0",
        show_error=True,
        server_port=7860,
    )
main_backend_harness.py
ADDED
@@ -0,0 +1,165 @@
"""
Main backend script for running evaluations with lm-evaluation-harness.

This script:
1. Downloads pending evaluation requests from the Hub
2. Processes them one at a time
3. Runs evaluations using lm-eval
4. Uploads results back to the Hub
"""

import logging
import pprint

from huggingface_hub import snapshot_download

from src.backend.manage_requests import (
    FAILED_STATUS,
    FINISHED_STATUS,
    PENDING_STATUS,
    RUNNING_STATUS,
    check_completed_evals,
    get_eval_requests,
    set_eval_request,
)
from src.backend.run_eval_suite_harness import run_evaluation
from src.backend.sort_queue import sort_models_by_priority
from src.envs import (
    API,
    DEVICE,
    EVAL_REQUESTS_PATH_BACKEND,
    EVAL_RESULTS_PATH_BACKEND,
    LIMIT,
    NUM_FEWSHOT,
    QUEUE_REPO,
    RESULTS_REPO,
    TASKS_HARNESS,
    TOKEN,
)
from src.logging import setup_logger

# Suppress noisy loggers
logging.getLogger("openai").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)

logger = setup_logger(__name__)
pp = pprint.PrettyPrinter(width=80)


def sync_data():
    """Download latest data from Hub."""
    logger.info("Syncing data from Hub...")

    # Download results
    snapshot_download(
        repo_id=RESULTS_REPO,
        revision="main",
        local_dir=EVAL_RESULTS_PATH_BACKEND,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )

    # Download requests
    snapshot_download(
        repo_id=QUEUE_REPO,
        revision="main",
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        repo_type="dataset",
        max_workers=60,
        token=TOKEN,
    )

    logger.info("Data sync complete.")


def run_auto_eval():
    """Process pending evaluation requests."""
    logger.info("=" * 60)
    logger.info("Starting auto evaluation run")
    logger.info("=" * 60)

    # Check for completed evaluations
    check_completed_evals(
        api=API,
        checked_status=RUNNING_STATUS,
        completed_status=FINISHED_STATUS,
        failed_status=FAILED_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
        hf_repo_results=RESULTS_REPO,
        local_dir_results=EVAL_RESULTS_PATH_BACKEND,
    )

    # Get pending evaluation requests
    eval_requests = get_eval_requests(
        job_status=[PENDING_STATUS],
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    logger.info(f"Found {len(eval_requests)} pending requests")

    if not eval_requests:
        logger.info("No pending evaluations. Exiting.")
        return

    # Sort by priority (FIFO by default)
    eval_requests = sort_models_by_priority(api=API, models=eval_requests)

    # Process the first request
    eval_request = eval_requests[0]

    logger.info("\n" + "-" * 40)
    logger.info("Processing evaluation request:")
    logger.info(pp.pformat(vars(eval_request)))
    logger.info("-" * 40)

    # Update status to RUNNING
    set_eval_request(
        api=API,
        eval_request=eval_request,
        set_to_status=RUNNING_STATUS,
        hf_repo=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH_BACKEND,
    )

    try:
        # Run the evaluation
        run_evaluation(
            eval_request=eval_request,
            task_names=TASKS_HARNESS,
            num_fewshot=NUM_FEWSHOT,
            local_dir=EVAL_RESULTS_PATH_BACKEND,
            results_repo=RESULTS_REPO,
            batch_size="auto",
            device=DEVICE,
            limit=LIMIT,
        )

        # Mark as finished
        set_eval_request(
            api=API,
            eval_request=eval_request,
            set_to_status=FINISHED_STATUS,
            hf_repo=QUEUE_REPO,
            local_dir=EVAL_REQUESTS_PATH_BACKEND,
        )
        logger.info(f"Evaluation completed for {eval_request.model}")

    except Exception as e:
        logger.error(f"Evaluation failed for {eval_request.model}: {e}")

        # Mark as failed
        set_eval_request(
            api=API,
            eval_request=eval_request,
            set_to_status=FAILED_STATUS,
            hf_repo=QUEUE_REPO,
            local_dir=EVAL_REQUESTS_PATH_BACKEND,
        )


if __name__ == "__main__":
    # Sync data and run evaluation
    sync_data()
    run_auto_eval()
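Note that `run_auto_eval()` evaluates at most one pending request per call; the dashboard's scheduler simply calls it again every refresh interval. If you wanted to drain the queue from a plain shell session instead, a minimal loop might look like the following sketch (not part of the commit):

```python
# Sketch: run the sync/evaluate cycle in a standalone loop instead of relying on
# the Gradio dashboard's BackgroundScheduler. Assumes the files from this commit
# are on the import path and HF_TOKEN is set.
import time

from main_backend_harness import run_auto_eval, sync_data
from src.envs import REFRESH_RATE

while True:
    sync_data()       # pull the latest requests and results from the Hub
    run_auto_eval()   # evaluate at most one pending request
    time.sleep(REFRESH_RATE)
```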
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio>=4.0.0
pandas>=2.0.0
huggingface_hub>=0.20.0
transformers>=4.36.0
apscheduler>=3.10.0
lm-eval>=0.4.0
accelerate>=0.25.0
torch>=2.0.0
src/__init__.py
ADDED
@@ -0,0 +1 @@
# Backend source package
src/backend/__init__.py
ADDED
@@ -0,0 +1 @@
# Backend evaluation package
src/backend/manage_requests.py
ADDED
@@ -0,0 +1,192 @@
"""Manage evaluation requests and their status."""

import json
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from huggingface_hub import HfApi


# Status constants
PENDING_STATUS = "PENDING"
RUNNING_STATUS = "RUNNING"
FINISHED_STATUS = "FINISHED"
FAILED_STATUS = "FAILED"


@dataclass
class EvalRequest:
    """Represents an evaluation request."""

    model: str
    revision: str
    precision: str
    weight_type: str
    model_type: str
    status: str
    submitted_time: str
    base_model: str = ""
    likes: int = 0
    params: float = 0.0
    license: str = ""
    private: bool = False
    json_filepath: str = ""

    def get_model_args(self) -> str:
        """Get model arguments string for lm-eval."""
        args = f"pretrained={self.model}"

        if self.revision and self.revision != "main":
            args += f",revision={self.revision}"

        if self.precision:
            args += f",dtype={self.precision}"

        # Add trust_remote_code so models that ship custom code can load
        args += ",trust_remote_code=True"

        return args


def get_eval_requests(
    job_status: List[str],
    hf_repo: str,
    local_dir: str,
) -> List[EvalRequest]:
    """
    Load evaluation requests with specified status.

    Args:
        job_status: List of status values to filter by
        hf_repo: HuggingFace dataset repo ID
        local_dir: Local directory with cached requests

    Returns:
        List of EvalRequest objects
    """
    requests = []
    requests_dir = Path(local_dir)

    if not requests_dir.exists():
        return requests

    for json_file in requests_dir.rglob("*.json"):
        try:
            with open(json_file, "r") as f:
                data = json.load(f)

            if data.get("status", PENDING_STATUS) in job_status:
                request = EvalRequest(
                    model=data.get("model", ""),
                    revision=data.get("revision", "main"),
                    precision=data.get("precision", "float16"),
                    weight_type=data.get("weight_type", "Original"),
                    model_type=data.get("model_type", ""),
                    status=data.get("status", PENDING_STATUS),
                    submitted_time=data.get("submitted_time", ""),
                    base_model=data.get("base_model", ""),
                    likes=data.get("likes", 0),
                    params=data.get("params", 0.0),
                    license=data.get("license", ""),
                    private=data.get("private", False),
                    json_filepath=str(json_file),
                )
                requests.append(request)

        except (json.JSONDecodeError, OSError) as e:
            print(f"Error loading {json_file}: {e}")
            continue

    return requests


def set_eval_request(
    api: HfApi,
    eval_request: EvalRequest,
    set_to_status: str,
    hf_repo: str,
    local_dir: str,
) -> None:
    """
    Update the status of an evaluation request.

    Args:
        api: HuggingFace API client
        eval_request: The request to update
        set_to_status: New status value
        hf_repo: HuggingFace dataset repo ID
        local_dir: Local directory with cached requests
    """
    json_filepath = Path(eval_request.json_filepath)

    if not json_filepath.exists():
        print(f"Request file not found: {json_filepath}")
        return

    # Load current data
    with open(json_filepath, "r") as f:
        data = json.load(f)

    # Update status
    data["status"] = set_to_status

    # Save locally
    with open(json_filepath, "w") as f:
        json.dump(data, f, indent=2)

    # Upload to Hub
    try:
        repo_path = str(json_filepath).replace(local_dir + "/", "")
        api.upload_file(
            path_or_fileobj=str(json_filepath),
            path_in_repo=repo_path,
            repo_id=hf_repo,
            repo_type="dataset",
            commit_message=f"Update status to {set_to_status} for {eval_request.model}",
        )
    except Exception as e:
        print(f"Failed to upload status update: {e}")


def check_completed_evals(
    api: HfApi,
    checked_status: str,
    completed_status: str,
    failed_status: str,
    hf_repo: str,
    local_dir: str,
    hf_repo_results: str,
    local_dir_results: str,
) -> None:
    """
    Check for completed evaluations and update their status.

    Args:
        api: HuggingFace API client
        checked_status: Status to check (e.g., RUNNING)
        completed_status: Status to set if results exist
        failed_status: Status to set if evaluation failed
        hf_repo: Requests dataset repo ID
        local_dir: Local requests directory
        hf_repo_results: Results dataset repo ID
        local_dir_results: Local results directory
    """
    running_requests = get_eval_requests([checked_status], hf_repo, local_dir)

    for request in running_requests:
        # Check if results exist for this model
        model_results_dir = Path(local_dir_results) / request.model

        if model_results_dir.exists():
            result_files = list(model_results_dir.rglob("results_*.json"))
            if result_files:
                # Results found, mark as completed
                set_eval_request(
                    api=api,
                    eval_request=request,
                    set_to_status=completed_status,
                    hf_repo=hf_repo,
                    local_dir=local_dir,
                )
                print(f"Marked {request.model} as {completed_status}")
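For a quick look at how a request file turns into lm-eval arguments, the sketch below loads pending requests from a local cache and prints the `model_args` string each would produce; the repo ID and directory are placeholders, not values from this commit.

```python
# Sketch: list pending requests and the lm-eval model_args they translate to.
# The queue repo and cache directory are placeholders; the backend itself uses
# QUEUE_REPO and EVAL_REQUESTS_PATH_BACKEND from src/envs.py.
from src.backend.manage_requests import PENDING_STATUS, get_eval_requests

pending = get_eval_requests(
    job_status=[PENDING_STATUS],
    hf_repo="your-org/requests",
    local_dir="eval-queue-bk",
)

for req in pending:
    # e.g. "pretrained=your-org/your-model,dtype=float16,trust_remote_code=True"
    print(req.model, "->", req.get_model_args())
```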
src/backend/run_eval_suite_harness.py
ADDED
@@ -0,0 +1,113 @@
"""Run evaluations using lm-evaluation-harness."""

import json
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Union

from lm_eval import evaluator, utils
from lm_eval.tasks import TaskManager

from src.backend.manage_requests import EvalRequest
from src.envs import API

logger = logging.getLogger(__name__)


def run_evaluation(
    eval_request: EvalRequest,
    task_names: List[str],
    num_fewshot: int,
    batch_size: Union[int, str],
    device: str,
    local_dir: str,
    results_repo: str,
    limit: Optional[int] = None,
) -> dict:
    """
    Run evaluation for a model using lm-evaluation-harness.

    Args:
        eval_request: The evaluation request with model info
        task_names: List of task names to evaluate
        num_fewshot: Number of few-shot examples
        batch_size: Batch size (int or "auto")
        device: Device to run on ("cpu" or "cuda:0")
        local_dir: Directory to save results locally
        results_repo: HuggingFace dataset repo for results
        limit: Limit samples per task (for testing)

    Returns:
        Evaluation results dictionary
    """
    if limit:
        logger.warning(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    # Initialize task manager and validate tasks
    task_manager = TaskManager()
    all_tasks = task_manager.all_tasks
    task_names = utils.pattern_match(task_names, all_tasks)

    logger.info(f"Model: {eval_request.model}")
    logger.info(f"Selected Tasks: {task_names}")
    logger.info(f"Few-shot: {num_fewshot}")
    logger.info(f"Device: {device}")

    # Run evaluation
    try:
        results = evaluator.simple_evaluate(
            model="hf",
            model_args=eval_request.get_model_args(),
            tasks=task_names,
            num_fewshot=num_fewshot,
            batch_size=batch_size,
            device=device,
            limit=limit,
            write_out=True,
            log_samples=True,  # Save per-sample results
        )
    except Exception as e:
        logger.error(f"Evaluation failed: {e}")
        raise

    # Add model metadata to results
    results["config"]["model_dtype"] = eval_request.precision
    results["config"]["model_name"] = eval_request.model
    results["config"]["model_sha"] = eval_request.revision
    results["config"]["model_type"] = eval_request.model_type

    # Log results summary
    logger.info("\n" + "=" * 60)
    logger.info("EVALUATION RESULTS")
    logger.info("=" * 60)
    logger.info(evaluator.make_table(results))

    # Save results locally
    timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    results_path = Path(local_dir) / eval_request.model / f"results_{timestamp}.json"
    results_path.parent.mkdir(exist_ok=True, parents=True)

    dumped = json.dumps(results, indent=2, default=str)
    results_path.write_text(dumped)

    logger.info(f"Results saved to: {results_path}")

    # Upload to HuggingFace Hub
    try:
        repo_path = results_path.relative_to(local_dir).as_posix()
        API.upload_file(
            path_or_fileobj=str(results_path),
            path_in_repo=repo_path,
            repo_id=results_repo,
            repo_type="dataset",
            commit_message=f"Add evaluation results for {eval_request.model}",
        )
        logger.info(f"Results uploaded to {results_repo}")
    except Exception as e:
        logger.error(f"Failed to upload results: {e}")

    return results
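A CPU smoke test of `run_evaluation` can be useful before deploying. The sketch below builds a throwaway `EvalRequest` by hand and caps each task at 20 samples; the model ID and repos are placeholders, and, as the warning inside the function notes, results obtained with `limit` are not real metrics.

```python
# Sketch: local smoke test of run_evaluation. All names below are placeholders;
# the upload step will simply log an error if you lack write access to the repo.
from src.backend.manage_requests import EvalRequest, PENDING_STATUS
from src.backend.run_eval_suite_harness import run_evaluation

request = EvalRequest(
    model="your-org/tiny-test-model",   # placeholder Hub model ID
    revision="main",
    precision="float32",                # safer than float16 on CPU
    weight_type="Original",
    model_type="pretrained",
    status=PENDING_STATUS,
    submitted_time="2024-01-01T00:00:00Z",
)

results = run_evaluation(
    eval_request=request,
    task_names=["piqa"],                # one of the tasks in TASKS_HARNESS
    num_fewshot=0,
    batch_size=1,
    device="cpu",
    local_dir="eval-results-bk",
    results_repo="your-org/results",    # placeholder; upload is best-effort
    limit=20,                           # testing only, not real metrics
)
print(results["results"])
```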
src/backend/sort_queue.py
ADDED
@@ -0,0 +1,52 @@
"""Sort evaluation queue by priority."""

from typing import List

from huggingface_hub import HfApi

from src.backend.manage_requests import EvalRequest


def sort_models_by_priority(api: HfApi, models: List[EvalRequest]) -> List[EvalRequest]:
    """
    Sort models by priority for evaluation.

    Current strategy: FIFO (first in, first out) based on submission time.
    Can be extended to prioritize by model popularity, size, etc.

    Args:
        api: HuggingFace API client
        models: List of evaluation requests

    Returns:
        Sorted list of evaluation requests
    """
    # Sort by submission time (oldest first)
    return sorted(models, key=lambda x: x.submitted_time)


def sort_models_by_likes(api: HfApi, models: List[EvalRequest]) -> List[EvalRequest]:
    """
    Sort models by Hub likes (most popular first).

    Args:
        api: HuggingFace API client
        models: List of evaluation requests

    Returns:
        Sorted list of evaluation requests
    """
    return sorted(models, key=lambda x: x.likes, reverse=True)


def sort_models_by_size(models: List[EvalRequest], ascending: bool = True) -> List[EvalRequest]:
    """
    Sort models by parameter count.

    Args:
        models: List of evaluation requests
        ascending: If True, smallest models first

    Returns:
        Sorted list of evaluation requests
    """
    return sorted(models, key=lambda x: x.params, reverse=not ascending)
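`main_backend_harness.py` calls the FIFO helper, but any of these sorters can be swapped in. A small self-contained sketch of the difference, using two hand-built requests (the model IDs are illustrative):

```python
# Sketch: comparing the FIFO and size-based orderings. The api argument is not
# used by sort_models_by_priority's current FIFO strategy, so None is passed here.
from src.backend.manage_requests import EvalRequest, PENDING_STATUS
from src.backend.sort_queue import sort_models_by_priority, sort_models_by_size

requests = [
    EvalRequest(model="org/model-7b", revision="main", precision="float16",
                weight_type="Original", model_type="pretrained",
                status=PENDING_STATUS, submitted_time="2024-02-01T00:00:00Z", params=7.0),
    EvalRequest(model="org/model-1b", revision="main", precision="float16",
                weight_type="Original", model_type="pretrained",
                status=PENDING_STATUS, submitted_time="2024-01-01T00:00:00Z", params=1.3),
]

print([r.model for r in sort_models_by_priority(api=None, models=requests)])  # oldest first
print([r.model for r in sort_models_by_size(requests, ascending=True)])       # smallest first
```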
src/display/__init__.py
ADDED
@@ -0,0 +1 @@
# Display utilities
src/display/css_html_js.py
ADDED
@@ -0,0 +1,11 @@
"""CSS and JavaScript for the backend UI."""

dark_mode_gradio_js = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""
src/display/log_visualizer.py
ADDED
@@ -0,0 +1,45 @@
"""Log visualization utilities for the backend UI."""

from pathlib import Path
from typing import Union

from src.envs import NUM_LINES_VISUALIZE
from src.logging import log_file


def log_file_to_html_string(log_path: Union[str, Path] = log_file, reverse: bool = True) -> str:
    """
    Convert log file contents to HTML string for display.

    Args:
        log_path: Path to the log file
        reverse: If True, show newest entries first

    Returns:
        HTML-formatted log contents
    """
    log_path = Path(log_path)

    if not log_path.exists():
        return "<pre>No logs yet.</pre>"

    try:
        with open(log_path, "r") as f:
            lines = f.readlines()

        # Limit number of lines
        lines = lines[-NUM_LINES_VISUALIZE:]

        if reverse:
            lines = lines[::-1]

        # Escape HTML and wrap in pre tag
        content = "".join(lines)
        content = content.replace("&", "&amp;")
        content = content.replace("<", "&lt;")
        content = content.replace(">", "&gt;")

        return f"<pre style='font-size: 12px; white-space: pre-wrap;'>{content}</pre>"

    except Exception as e:
        return f"<pre>Error reading log file: {e}</pre>"
src/envs.py
ADDED
@@ -0,0 +1,49 @@
"""Backend environment configuration."""

import os
from huggingface_hub import HfApi

# ----------------------------------
# Configuration
# ----------------------------------

TOKEN = os.environ.get("HF_TOKEN")  # Read/write token for your org
OWNER = "stellaathena"  # HuggingFace username/org

# Device configuration
DEVICE = os.environ.get("DEVICE", "cuda:0")  # "cpu" or "cuda:0"
LIMIT = None  # Set to int for testing (e.g., 20), None for full evaluation

# Evaluation settings
NUM_FEWSHOT = 0  # Zero-shot evaluation

# Tasks to evaluate (lm-evaluation-harness task names)
TASKS_HARNESS = [
    "minerva_math",
    "piqa",
]

# ----------------------------------
# Derived Configuration
# ----------------------------------

REPO_ID = f"{OWNER}/math-piqa-backend"
QUEUE_REPO = f"{OWNER}/math-piqa-requests"
RESULTS_REPO = f"{OWNER}/math-piqa-results"

# Cache paths
CACHE_PATH = os.getenv("HF_HOME", ".")

EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Refresh rate (seconds)
REFRESH_RATE = 10 * 60  # 10 minutes

# Log visualization
NUM_LINES_VISUALIZE = 300

# API client
API = HfApi(token=TOKEN)
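Because `DEVICE` and `TOKEN` are read from the environment at import time, a CPU-only test run can be configured without editing the file. A sketch (the token value is a placeholder):

```python
# Sketch: override the environment before importing src.envs so the backend
# runs on CPU. The token below is a placeholder, not a real credential.
import os

os.environ["DEVICE"] = "cpu"
os.environ["HF_TOKEN"] = "hf_xxx"   # placeholder; use a real read/write token

from src.envs import DEVICE, QUEUE_REPO, RESULTS_REPO

print(DEVICE)         # "cpu"
print(QUEUE_REPO)     # "stellaathena/math-piqa-requests"
print(RESULTS_REPO)   # "stellaathena/math-piqa-results"
```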
src/logging.py
ADDED
@@ -0,0 +1,35 @@
"""Logging configuration for the backend."""

import logging
import os
from pathlib import Path

# Log file location
log_file = Path("evaluation.log")


def configure_root_logger():
    """Configure the root logger with file and console handlers."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler(),
        ],
    )


def setup_logger(name: str) -> logging.Logger:
    """
    Set up a logger with the given name.

    Args:
        name: Logger name (typically __name__)

    Returns:
        Configured logger
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    return logger