File size: 2,211 Bytes
16038fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from __future__ import annotations

from dataclasses import dataclass
from typing import List

from fsds_cleaning_env.dataset_generators import EVAL_SEEDS, SIZE_MEDIUM


@dataclass(frozen=True)
class EvaluationTask:
    """Specification for a single evaluation scenario.

    These tasks are meant to be used by evaluation / benchmarking scripts and
    SHOULD NOT be used for on-policy RL training so that evaluation remains
    held-out.

    eval_index selects which fixed seed from EVAL_SEEDS to use, so the same
    table is produced every time for reproducible evaluation.

    Instances are immutable (the dataclass is declared with frozen=True).
    """

    # Label for this task; tasks generated in this module are named
    # "<scenario name>_seed<eval_index>".
    name: str
    # Scenario identifier; also the key used to look up seeds in EVAL_SEEDS.
    task_id: str
    # Human-readable summary of the scenario.
    description: str
    # NOTE(review): presumably the per-episode step budget -- confirm against
    # the code that consumes these tasks.
    max_steps: int
    # Position of the seed within EVAL_SEEDS[task_id], not the seed value itself.
    eval_index: int = 0
    # Defaults to SIZE_MEDIUM. NOTE(review): presumably the number of rows in
    # the generated table -- confirm against dataset_generators.
    n_rows: int = SIZE_MEDIUM


def _expand_eval_tasks() -> List[EvaluationTask]:
    """Create one EvaluationTask per (base scenario, fixed eval seed) pair.

    Seeds for each scenario are looked up in EVAL_SEEDS by task id. A scenario
    without an entry falls back to a single-seed list, so it still yields one
    task. Only the position of each seed is recorded (as ``eval_index``); the
    seed value itself is not stored on the task.

    Returns:
        Tasks ordered by scenario (in declaration order) and then by seed
        index, each with ``max_steps=18`` and ``n_rows=SIZE_MEDIUM``.
    """
    # (name prefix, task id, description) for every base scenario.
    scenarios = (
        ("ecommerce_mobile_baseline", "ecommerce_mobile", "Canonical mobile conversion cleaning task."),
        ("subscription_churn_baseline", "subscription_churn", "Subscription churn table cleaning for churn modeling."),
        ("delivery_eta_baseline", "delivery_eta", "Last-mile delivery ETA cleaning task."),
    )

    expanded: List[EvaluationTask] = []
    for prefix, scenario_id, summary in scenarios:
        seed_list = EVAL_SEEDS.get(scenario_id, [42])
        # One task per seed position; the seed value is resolved later via
        # eval_index, so it is ignored here.
        expanded.extend(
            EvaluationTask(
                name=f"{prefix}_seed{position}",
                task_id=scenario_id,
                description=summary,
                max_steps=18,
                eval_index=position,
                n_rows=SIZE_MEDIUM,
            )
            for position, _ in enumerate(seed_list)
        )
    return expanded


# Built once when this module is imported; holds every task returned by
# _expand_eval_tasks().
EVAL_TASKS: List[EvaluationTask] = _expand_eval_tasks()

"""
NOTE FOR FUTURE AGENTS:

- Each EvaluationTask has eval_index pointing to a fixed seed in EVAL_SEEDS.
  Use get_eval_dataset(task_id, eval_index) to get the held-out table.
- For evaluation, call env.reset(task_id=..., seed=EVAL_SEEDS[task_id][eval_index])
  so the environment produces the same table each run.
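- To add more seeds for an existing task: extend EVAL_SEEDS in
  dataset_generators.py; EVAL_TASKS is rebuilt from it by
  _expand_eval_tasks() when this module is imported.
- To add a new scenario: add it to the base list in _expand_eval_tasks(),
  or append manual EvaluationTask entries to EVAL_TASKS.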
"""

# Names exported by `from <module> import *`; the underscore-prefixed
# _expand_eval_tasks helper is not listed.
__all__ = ["EvaluationTask", "EVAL_TASKS"]