stellaathena committed
Commit 81afbdf · verified · 1 parent: c3a5000

Initial commit: MATH & PIQA Backend

README.md CHANGED
@@ -1,12 +1,101 @@
  ---
- title: Math Piqa Backend
- emoji: 📊
- colorFrom: yellow
- colorTo: yellow
+ title: LLM Evaluation Backend
+ emoji: ⚙️
+ colorFrom: purple
+ colorTo: blue
  sdk: gradio
- sdk_version: 6.4.0
+ sdk_version: 4.44.0
  app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # LLM Evaluation Backend
+
+ Automated evaluation backend using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
+
+ ## Features
+
+ - **Automatic evaluation** of submitted models
+ - **Queue management** with status tracking
+ - **Result upload** to HuggingFace datasets
+ - **Dashboard UI** for monitoring
+
+ ## Setup
+
+ 1. Create datasets in your HuggingFace organization (a sketch for doing this programmatically follows this list):
+    - `your-org/requests` - evaluation queue
+    - `your-org/results` - evaluation results
+ 2. Update `src/envs.py` with your configuration
+ 3. Set the `HF_TOKEN` environment variable
+ 4. Deploy with GPU compute for faster evaluations
+
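+ For step 1, the two datasets can also be created from Python. A minimal sketch using `huggingface_hub` (the repo names are placeholders; match them to `QUEUE_REPO` / `RESULTS_REPO` in `src/envs.py`):
+
+ ```python
+ from huggingface_hub import create_repo
+
+ # Create the two dataset repos the backend expects.
+ # "your-org/..." are placeholders for your own organization.
+ for repo_id in ["your-org/requests", "your-org/results"]:
+     create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
+ ```
+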
+ ## Configuration
+
+ Edit `src/envs.py` to configure:
+
+ ```python
+ # Your organization
+ OWNER = "your-org-name"
+
+ # Device ("cpu" or "cuda:0")
+ DEVICE = "cuda:0"
+
+ # Tasks to evaluate (this Space ships with MATH and PIQA)
+ TASKS_HARNESS = [
+     "minerva_math",
+     "piqa",
+ ]
+
+ # Few-shot examples
+ NUM_FEWSHOT = 0
+ ```
+
+ ## Running Locally
+
+ ```bash
+ # Install dependencies
+ pip install -r requirements.txt
+
+ # Set token
+ export HF_TOKEN=your_token
+
+ # Run the backend
+ python main_backend_harness.py
+
+ # Or run the dashboard
+ python app.py
+ ```
+
+ ## Architecture
+
+ ```
+ backend/
+ ├── app.py                      # Gradio dashboard
+ ├── main_backend_harness.py     # Main evaluation loop
+ ├── src/
+ │   ├── envs.py                 # Configuration
+ │   ├── logging.py              # Logging setup
+ │   ├── backend/
+ │   │   ├── manage_requests.py           # Queue management
+ │   │   ├── run_eval_suite_harness.py    # lm-eval integration
+ │   │   └── sort_queue.py                # Priority sorting
+ │   └── display/                # UI utilities
+ ```
+
+ ## Evaluation Flow
+
+ 1. **Sync**: Download pending requests from the Hub
+ 2. **Sort**: Order by priority (FIFO by default)
+ 3. **Evaluate**: Run lm-eval on the model
+ 4. **Upload**: Push results to the results dataset
+ 5. **Update**: Mark the request as FINISHED (an example request file is shown below)
+
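+ For reference, each entry in the request queue is a JSON file. A hypothetical example, with field names taken from `EvalRequest` in `src/backend/manage_requests.py` (the values and the output filename are illustrative; missing fields fall back to defaults):
+
+ ```python
+ import json
+
+ # Illustrative request payload; the backend reads files like this
+ # from the requests dataset.
+ request = {
+     "model": "your-org/your-model",  # Hub model ID (placeholder)
+     "revision": "main",
+     "precision": "float16",
+     "weight_type": "Original",
+     "model_type": "pretrained",
+     "status": "PENDING",
+     "submitted_time": "2025-01-01T00:00:00Z",
+ }
+
+ with open("eval_request.json", "w") as f:
+     json.dump(request, f, indent=2)
+ ```
+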
+ ## Related
+
+ - [Frontend Leaderboard](../frontend/) - Display results
+ - [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) - Evaluation framework
app.py ADDED
@@ -0,0 +1,130 @@
+ """
+ Backend Application - Evaluation Dashboard
+
+ A Gradio UI for monitoring and triggering evaluations.
+ """
+
+ import logging
+
+ import gradio as gr
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ from main_backend_harness import run_auto_eval, sync_data
+ from src.display.css_html_js import dark_mode_gradio_js
+ from src.display.log_visualizer import log_file_to_html_string
+ from src.envs import QUEUE_REPO, REFRESH_RATE, REPO_ID, RESULTS_REPO
+ from src.logging import configure_root_logger, log_file, setup_logger
+
+ # Configure logging (quiet noisy third-party loggers first)
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+ logging.getLogger("numexpr").setLevel(logging.WARNING)
+ logging.getLogger("absl").setLevel(logging.WARNING)
+
+ configure_root_logger()
+
+ logger = setup_logger(__name__)
+
+
+ # Markdown content
+ intro_md = """
+ # Evaluation Backend Dashboard
+
+ This dashboard monitors and controls the automatic evaluation pipeline.
+
+ Evaluations are run using [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
+ """
+
+ links_md = f"""
+ # Important Links
+
+ | Description | Link |
+ |-------------|------|
+ | Leaderboard | [View Leaderboard](https://huggingface.co/spaces/{REPO_ID.replace('/backend', '/leaderboard')}) |
+ | Request Queue | [{QUEUE_REPO}](https://huggingface.co/datasets/{QUEUE_REPO}) |
+ | Results Dataset | [{RESULTS_REPO}](https://huggingface.co/datasets/{RESULTS_REPO}) |
+ """
+
+
+ def trigger_auto_eval():
+     """Trigger an evaluation run."""
+     logger.info("Manual evaluation triggered")
+     sync_data()
+     run_auto_eval()
+
+
+ def get_log_html(reverse: bool = True) -> str:
+     """Get the log file as HTML, optionally newest-first."""
+     return log_file_to_html_string(log_file, reverse=reverse)
+
+
+ # Build the Gradio interface
+ with gr.Blocks(js=dark_mode_gradio_js) as demo:
+     gr.Markdown(intro_md)
+
+     with gr.Tab("Dashboard"):
+         # Log display, refreshed every second
+         output_html = gr.HTML(lambda: get_log_html(True), every=1)
+
+         with gr.Row():
+             download_button = gr.DownloadButton("Download Log File", value=str(log_file))
+
+         with gr.Accordion("Log Settings", open=False):
+             reverse_checkbox = gr.Checkbox(
+                 label="Show newest first",
+                 value=True,
+             )
+             reverse_checkbox.change(
+                 fn=get_log_html,
+                 inputs=[reverse_checkbox],
+                 outputs=[output_html],
+             )
+
+         # Manual trigger button
+         with gr.Row():
+             trigger_button = gr.Button("Run Evaluation Now", variant="primary")
+             trigger_button.click(fn=trigger_auto_eval, inputs=[], outputs=[])
+
+         gr.Markdown(links_md)
+
+     with gr.Tab("Configuration"):
+         gr.Markdown("""
+ ## Current Configuration
+
+ The backend is configured to evaluate models on the following tasks:
+
+ | Task | Description |
+ |------|-------------|
+ | MATH (`minerva_math`) | Competition math problems |
+ | PIQA (`piqa`) | Physical commonsense reasoning |
+
+ ### Settings
+
+ - **Refresh Rate**: 10 minutes
+ - **Device**: GPU (cuda:0) when available
+ - **Batch Size**: Auto-detected
+
+ To modify tasks or settings, edit `src/envs.py`.
+ """)
+
+
+ # Background scheduler for automatic evaluations
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(sync_data, "interval", seconds=REFRESH_RATE, id="sync_data")
+ scheduler.add_job(run_auto_eval, "interval", seconds=REFRESH_RATE, id="auto_eval")
+ scheduler.start()
+
+ logger.info(f"Scheduler started. Refresh rate: {REFRESH_RATE} seconds")
+
+ # Launch
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=40).launch(
+         server_name="0.0.0.0",
+         show_error=True,
+         server_port=7860,
+     )
main_backend_harness.py ADDED
@@ -0,0 +1,165 @@
+ """
+ Main backend script for running evaluations with lm-evaluation-harness.
+
+ This script:
+ 1. Downloads pending evaluation requests from the Hub
+ 2. Processes them one at a time
+ 3. Runs evaluations using lm-eval
+ 4. Uploads results back to the Hub
+ """
+
+ import logging
+ import pprint
+
+ from huggingface_hub import snapshot_download
+
+ from src.backend.manage_requests import (
+     FAILED_STATUS,
+     FINISHED_STATUS,
+     PENDING_STATUS,
+     RUNNING_STATUS,
+     check_completed_evals,
+     get_eval_requests,
+     set_eval_request,
+ )
+ from src.backend.run_eval_suite_harness import run_evaluation
+ from src.backend.sort_queue import sort_models_by_priority
+ from src.envs import (
+     API,
+     DEVICE,
+     EVAL_REQUESTS_PATH_BACKEND,
+     EVAL_RESULTS_PATH_BACKEND,
+     LIMIT,
+     NUM_FEWSHOT,
+     QUEUE_REPO,
+     RESULTS_REPO,
+     TASKS_HARNESS,
+     TOKEN,
+ )
+ from src.logging import setup_logger
+
+ # Suppress noisy loggers
+ logging.getLogger("openai").setLevel(logging.WARNING)
+ logging.getLogger("httpx").setLevel(logging.WARNING)
+
+ logger = setup_logger(__name__)
+ pp = pprint.PrettyPrinter(width=80)
+
+
+ def sync_data():
+     """Download the latest queue and results data from the Hub."""
+     logger.info("Syncing data from Hub...")
+
+     # Download results
+     snapshot_download(
+         repo_id=RESULTS_REPO,
+         revision="main",
+         local_dir=EVAL_RESULTS_PATH_BACKEND,
+         repo_type="dataset",
+         max_workers=60,
+         token=TOKEN,
+     )
+
+     # Download requests
+     snapshot_download(
+         repo_id=QUEUE_REPO,
+         revision="main",
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         repo_type="dataset",
+         max_workers=60,
+         token=TOKEN,
+     )
+
+     logger.info("Data sync complete.")
+
+
+ def run_auto_eval():
+     """Process pending evaluation requests."""
+     logger.info("=" * 60)
+     logger.info("Starting auto evaluation run")
+     logger.info("=" * 60)
+
+     # Move RUNNING requests whose results have appeared to FINISHED
+     check_completed_evals(
+         api=API,
+         checked_status=RUNNING_STATUS,
+         completed_status=FINISHED_STATUS,
+         failed_status=FAILED_STATUS,
+         hf_repo=QUEUE_REPO,
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         hf_repo_results=RESULTS_REPO,
+         local_dir_results=EVAL_RESULTS_PATH_BACKEND,
+     )
+
+     # Get pending evaluation requests
+     eval_requests = get_eval_requests(
+         job_status=[PENDING_STATUS],
+         hf_repo=QUEUE_REPO,
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+     )
+
+     logger.info(f"Found {len(eval_requests)} pending requests")
+
+     if not eval_requests:
+         logger.info("No pending evaluations.")
+         return
+
+     # Sort by priority (FIFO by default)
+     eval_requests = sort_models_by_priority(api=API, models=eval_requests)
+
+     # Process the first request
+     eval_request = eval_requests[0]
+
+     logger.info("\n" + "-" * 40)
+     logger.info("Processing evaluation request:")
+     logger.info(pp.pformat(vars(eval_request)))
+     logger.info("-" * 40)
+
+     # Update status to RUNNING
+     set_eval_request(
+         api=API,
+         eval_request=eval_request,
+         set_to_status=RUNNING_STATUS,
+         hf_repo=QUEUE_REPO,
+         local_dir=EVAL_REQUESTS_PATH_BACKEND,
+     )
+
+     try:
+         # Run the evaluation
+         run_evaluation(
+             eval_request=eval_request,
+             task_names=TASKS_HARNESS,
+             num_fewshot=NUM_FEWSHOT,
+             local_dir=EVAL_RESULTS_PATH_BACKEND,
+             results_repo=RESULTS_REPO,
+             batch_size="auto",
+             device=DEVICE,
+             limit=LIMIT,
+         )
+
+         # Mark as finished
+         set_eval_request(
+             api=API,
+             eval_request=eval_request,
+             set_to_status=FINISHED_STATUS,
+             hf_repo=QUEUE_REPO,
+             local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         )
+         logger.info(f"Evaluation completed for {eval_request.model}")
+
+     except Exception as e:
+         logger.error(f"Evaluation failed for {eval_request.model}: {e}")
+
+         # Mark as failed
+         set_eval_request(
+             api=API,
+             eval_request=eval_request,
+             set_to_status=FAILED_STATUS,
+             hf_repo=QUEUE_REPO,
+             local_dir=EVAL_REQUESTS_PATH_BACKEND,
+         )
+
+
+ if __name__ == "__main__":
+     # Sync data and run a single evaluation pass
+     sync_data()
+     run_auto_eval()
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio>=4.0.0
+ pandas>=2.0.0
+ huggingface_hub>=0.20.0
+ transformers>=4.36.0
+ apscheduler>=3.10.0
+ lm-eval>=0.4.0
+ accelerate>=0.25.0
+ torch>=2.0.0
src/__init__.py ADDED
@@ -0,0 +1 @@
+ # Backend source package
src/backend/__init__.py ADDED
@@ -0,0 +1 @@
+ # Backend evaluation package
src/backend/manage_requests.py ADDED
@@ -0,0 +1,192 @@
+ """Manage evaluation requests and their status."""
+
+ import json
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import List
+
+ from huggingface_hub import HfApi
+
+
+ # Status constants
+ PENDING_STATUS = "PENDING"
+ RUNNING_STATUS = "RUNNING"
+ FINISHED_STATUS = "FINISHED"
+ FAILED_STATUS = "FAILED"
+
+
+ @dataclass
+ class EvalRequest:
+     """Represents an evaluation request."""
+     model: str
+     revision: str
+     precision: str
+     weight_type: str
+     model_type: str
+     status: str
+     submitted_time: str
+     base_model: str = ""
+     likes: int = 0
+     params: float = 0.0
+     license: str = ""
+     private: bool = False
+     json_filepath: str = ""
+
+     def get_model_args(self) -> str:
+         """Get the model arguments string for lm-eval."""
+         args = f"pretrained={self.model}"
+
+         if self.revision and self.revision != "main":
+             args += f",revision={self.revision}"
+
+         if self.precision:
+             args += f",dtype={self.precision}"
+
+         # Allow models that ship custom code on the Hub.
+         # Note: this executes remote code, so only evaluate trusted submissions.
+         args += ",trust_remote_code=True"
+
+         return args
+
+
+ def get_eval_requests(
+     job_status: List[str],
+     hf_repo: str,
+     local_dir: str,
+ ) -> List[EvalRequest]:
+     """
+     Load evaluation requests with the specified status.
+
+     Args:
+         job_status: List of status values to filter by
+         hf_repo: HuggingFace dataset repo ID
+         local_dir: Local directory with cached requests
+
+     Returns:
+         List of EvalRequest objects
+     """
+     requests = []
+     requests_dir = Path(local_dir)
+
+     if not requests_dir.exists():
+         return requests
+
+     for json_file in requests_dir.rglob("*.json"):
+         try:
+             with open(json_file, "r") as f:
+                 data = json.load(f)
+
+             if data.get("status", PENDING_STATUS) in job_status:
+                 request = EvalRequest(
+                     model=data.get("model", ""),
+                     revision=data.get("revision", "main"),
+                     precision=data.get("precision", "float16"),
+                     weight_type=data.get("weight_type", "Original"),
+                     model_type=data.get("model_type", ""),
+                     status=data.get("status", PENDING_STATUS),
+                     submitted_time=data.get("submitted_time", ""),
+                     base_model=data.get("base_model", ""),
+                     likes=data.get("likes", 0),
+                     params=data.get("params", 0.0),
+                     license=data.get("license", ""),
+                     private=data.get("private", False),
+                     json_filepath=str(json_file),
+                 )
+                 requests.append(request)
+
+         except (json.JSONDecodeError, OSError) as e:
+             print(f"Error loading {json_file}: {e}")
+             continue
+
+     return requests
+
+
+ def set_eval_request(
+     api: HfApi,
+     eval_request: EvalRequest,
+     set_to_status: str,
+     hf_repo: str,
+     local_dir: str,
+ ) -> None:
+     """
+     Update the status of an evaluation request.
+
+     Args:
+         api: HuggingFace API client
+         eval_request: The request to update
+         set_to_status: New status value
+         hf_repo: HuggingFace dataset repo ID
+         local_dir: Local directory with cached requests
+     """
+     json_filepath = Path(eval_request.json_filepath)
+
+     if not json_filepath.exists():
+         print(f"Request file not found: {json_filepath}")
+         return
+
+     # Load current data
+     with open(json_filepath, "r") as f:
+         data = json.load(f)
+
+     # Update status
+     data["status"] = set_to_status
+
+     # Save locally
+     with open(json_filepath, "w") as f:
+         json.dump(data, f, indent=2)
+
+     # Upload to Hub
+     try:
+         repo_path = str(json_filepath).replace(local_dir + "/", "")
+         api.upload_file(
+             path_or_fileobj=str(json_filepath),
+             path_in_repo=repo_path,
+             repo_id=hf_repo,
+             repo_type="dataset",
+             commit_message=f"Update status to {set_to_status} for {eval_request.model}",
+         )
+     except Exception as e:
+         print(f"Failed to upload status update: {e}")
+
+
+ def check_completed_evals(
+     api: HfApi,
+     checked_status: str,
+     completed_status: str,
+     failed_status: str,
+     hf_repo: str,
+     local_dir: str,
+     hf_repo_results: str,
+     local_dir_results: str,
+ ) -> None:
+     """
+     Check for completed evaluations and update their status.
+
+     Args:
+         api: HuggingFace API client
+         checked_status: Status to check (e.g., RUNNING)
+         completed_status: Status to set if results exist
+         failed_status: Status to set if evaluation failed (currently only
+             applied by the main loop's exception handler)
+         hf_repo: Requests dataset repo ID
+         local_dir: Local requests directory
+         hf_repo_results: Results dataset repo ID
+         local_dir_results: Local results directory
+     """
+     running_requests = get_eval_requests([checked_status], hf_repo, local_dir)
+
+     for request in running_requests:
+         # Check if results exist
+         model_results_dir = Path(local_dir_results) / request.model
+
+         if model_results_dir.exists():
+             result_files = list(model_results_dir.rglob("results_*.json"))
+             if result_files:
+                 # Results found, mark as completed
+                 set_eval_request(
+                     api=api,
+                     eval_request=request,
+                     set_to_status=completed_status,
+                     hf_repo=hf_repo,
+                     local_dir=local_dir,
+                 )
+                 print(f"Marked {request.model} as {completed_status}")
+         # Requests without results remain in `checked_status`.
src/backend/run_eval_suite_harness.py ADDED
@@ -0,0 +1,113 @@
+ """Run evaluations using lm-evaluation-harness."""
+
+ import json
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Optional, Union
+
+ from lm_eval import evaluator, utils
+ from lm_eval.tasks import TaskManager
+
+ from src.backend.manage_requests import EvalRequest
+ from src.envs import API
+
+ logger = logging.getLogger(__name__)
+
+
+ def run_evaluation(
+     eval_request: EvalRequest,
+     task_names: List[str],
+     num_fewshot: int,
+     batch_size: Union[int, str],
+     device: str,
+     local_dir: str,
+     results_repo: str,
+     limit: Optional[int] = None,
+ ) -> dict:
+     """
+     Run evaluation for a model using lm-evaluation-harness.
+
+     Args:
+         eval_request: The evaluation request with model info
+         task_names: List of task names to evaluate
+         num_fewshot: Number of few-shot examples
+         batch_size: Batch size (int or "auto")
+         device: Device to run on ("cpu" or "cuda:0")
+         local_dir: Directory to save results locally
+         results_repo: HuggingFace dataset repo for results
+         limit: Limit samples per task (for testing only)
+
+     Returns:
+         Evaluation results dictionary
+     """
+     if limit:
+         logger.warning(
+             "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. "
+             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
+         )
+
+     # Initialize task manager and expand task name patterns
+     task_manager = TaskManager()
+     all_tasks = task_manager.all_tasks
+     task_names = utils.pattern_match(task_names, all_tasks)
+
+     logger.info(f"Model: {eval_request.model}")
+     logger.info(f"Selected Tasks: {task_names}")
+     logger.info(f"Few-shot: {num_fewshot}")
+     logger.info(f"Device: {device}")
+
+     # Run evaluation
+     try:
+         results = evaluator.simple_evaluate(
+             model="hf",
+             model_args=eval_request.get_model_args(),
+             tasks=task_names,
+             num_fewshot=num_fewshot,
+             batch_size=batch_size,
+             device=device,
+             limit=limit,
+             write_out=True,
+             log_samples=True,  # Save per-sample results
+         )
+     except Exception as e:
+         logger.error(f"Evaluation failed: {e}")
+         raise
+
+     # Add model metadata to results
+     results["config"]["model_dtype"] = eval_request.precision
+     results["config"]["model_name"] = eval_request.model
+     results["config"]["model_sha"] = eval_request.revision
+     results["config"]["model_type"] = eval_request.model_type
+
+     # Log results summary
+     logger.info("\n" + "=" * 60)
+     logger.info("EVALUATION RESULTS")
+     logger.info("=" * 60)
+     logger.info(evaluator.make_table(results))
+
+     # Save results locally
+     timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")
+     results_path = Path(local_dir) / eval_request.model / f"results_{timestamp}.json"
+     results_path.parent.mkdir(exist_ok=True, parents=True)
+
+     dumped = json.dumps(results, indent=2, default=str)
+     results_path.write_text(dumped)
+
+     logger.info(f"Results saved to: {results_path}")
+
+     # Upload to HuggingFace Hub
+     try:
+         repo_path = results_path.relative_to(local_dir).as_posix()
+         API.upload_file(
+             path_or_fileobj=str(results_path),
+             path_in_repo=repo_path,
+             repo_id=results_repo,
+             repo_type="dataset",
+             commit_message=f"Add evaluation results for {eval_request.model}",
+         )
+         logger.info(f"Results uploaded to {results_repo}")
+     except Exception as e:
+         logger.error(f"Failed to upload results: {e}")
+
+     return results
src/backend/sort_queue.py ADDED
@@ -0,0 +1,52 @@
+ """Sort evaluation queue by priority."""
+
+ from typing import List
+
+ from huggingface_hub import HfApi
+
+ from src.backend.manage_requests import EvalRequest
+
+
+ def sort_models_by_priority(api: HfApi, models: List[EvalRequest]) -> List[EvalRequest]:
+     """
+     Sort models by priority for evaluation.
+
+     Current strategy: FIFO (first in, first out) based on submission time.
+     Can be extended to prioritize by model popularity, size, etc.
+
+     Args:
+         api: HuggingFace API client (unused; kept for a uniform interface)
+         models: List of evaluation requests
+
+     Returns:
+         Sorted list of evaluation requests
+     """
+     # Sort by submission time (oldest first)
+     return sorted(models, key=lambda x: x.submitted_time)
+
+
+ def sort_models_by_likes(api: HfApi, models: List[EvalRequest]) -> List[EvalRequest]:
+     """
+     Sort models by Hub likes (most popular first).
+
+     Args:
+         api: HuggingFace API client (unused; kept for a uniform interface)
+         models: List of evaluation requests
+
+     Returns:
+         Sorted list of evaluation requests
+     """
+     return sorted(models, key=lambda x: x.likes, reverse=True)
+
+
+ def sort_models_by_size(models: List[EvalRequest], ascending: bool = True) -> List[EvalRequest]:
+     """
+     Sort models by parameter count.
+
+     Args:
+         models: List of evaluation requests
+         ascending: If True, smallest models first
+
+     Returns:
+         Sorted list of evaluation requests
+     """
+     return sorted(models, key=lambda x: x.params, reverse=not ascending)
src/display/__init__.py ADDED
@@ -0,0 +1 @@
+ # Display utilities
src/display/css_html_js.py ADDED
@@ -0,0 +1,11 @@
+ """CSS and JavaScript for the backend UI."""
+
+ dark_mode_gradio_js = """
+ function refresh() {
+     const url = new URL(window.location);
+     if (url.searchParams.get('__theme') !== 'dark') {
+         url.searchParams.set('__theme', 'dark');
+         window.location.href = url.href;
+     }
+ }
+ """
src/display/log_visualizer.py ADDED
@@ -0,0 +1,45 @@
+ """Log visualization utilities for the backend UI."""
+
+ from pathlib import Path
+ from typing import Union
+
+ from src.envs import NUM_LINES_VISUALIZE
+ from src.logging import log_file
+
+
+ def log_file_to_html_string(log_path: Union[str, Path] = log_file, reverse: bool = True) -> str:
+     """
+     Convert log file contents to an HTML string for display.
+
+     Args:
+         log_path: Path to the log file
+         reverse: If True, show newest entries first
+
+     Returns:
+         HTML-formatted log contents
+     """
+     log_path = Path(log_path)
+
+     if not log_path.exists():
+         return "<pre>No logs yet.</pre>"
+
+     try:
+         with open(log_path, "r") as f:
+             lines = f.readlines()
+
+         # Limit number of lines
+         lines = lines[-NUM_LINES_VISUALIZE:]
+
+         if reverse:
+             lines = lines[::-1]
+
+         # Escape HTML and wrap in a pre tag
+         content = "".join(lines)
+         content = content.replace("&", "&amp;")
+         content = content.replace("<", "&lt;")
+         content = content.replace(">", "&gt;")
+
+         return f"<pre style='font-size: 12px; white-space: pre-wrap;'>{content}</pre>"
+
+     except Exception as e:
+         return f"<pre>Error reading log file: {e}</pre>"
src/envs.py ADDED
@@ -0,0 +1,49 @@
+ """Backend environment configuration."""
+
+ import os
+
+ from huggingface_hub import HfApi
+
+ # ----------------------------------
+ # Configuration
+ # ----------------------------------
+
+ TOKEN = os.environ.get("HF_TOKEN")  # Read/write token for your org
+ OWNER = "stellaathena"  # HuggingFace username/org
+
+ # Device configuration
+ DEVICE = os.environ.get("DEVICE", "cuda:0")  # "cpu" or "cuda:0"
+ LIMIT = None  # Set to an int for testing (e.g., 20); None for full evaluation
+
+ # Evaluation settings
+ NUM_FEWSHOT = 0  # Zero-shot evaluation
+
+ # Tasks to evaluate (lm-evaluation-harness task names)
+ TASKS_HARNESS = [
+     "minerva_math",
+     "piqa",
+ ]
+
+ # ----------------------------------
+ # Derived Configuration
+ # ----------------------------------
+
+ REPO_ID = f"{OWNER}/math-piqa-backend"
+ QUEUE_REPO = f"{OWNER}/math-piqa-requests"
+ RESULTS_REPO = f"{OWNER}/math-piqa-results"
+
+ # Cache paths
+ CACHE_PATH = os.getenv("HF_HOME", ".")
+
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
+ EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
+
+ # Refresh rate (seconds)
+ REFRESH_RATE = 10 * 60  # 10 minutes
+
+ # Log visualization
+ NUM_LINES_VISUALIZE = 300
+
+ # API client
+ API = HfApi(token=TOKEN)
src/logging.py ADDED
@@ -0,0 +1,35 @@
+ """Logging configuration for the backend."""
+
+ import logging
+ from pathlib import Path
+
+ # Log file location
+ log_file = Path("evaluation.log")
+
+
+ def configure_root_logger():
+     """Configure the root logger with file and console handlers."""
+     logging.basicConfig(
+         level=logging.INFO,
+         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+         handlers=[
+             logging.FileHandler(log_file),
+             logging.StreamHandler(),
+         ],
+     )
+
+
+ def setup_logger(name: str) -> logging.Logger:
+     """
+     Set up a logger with the given name.
+
+     Args:
+         name: Logger name (typically __name__)
+
+     Returns:
+         Configured logger
+     """
+     logger = logging.getLogger(name)
+     logger.setLevel(logging.INFO)
+     return logger