Spaces:

meetkai
/

modelchorus-evals

Runtime error

App Files Files Community

brycemeetkai committed on 11 days ago

Commit

caf09eb

verified ·

1 Parent(s): a540a5c

Mirror evals/ from c1978f83e59e

Browse files

Files changed (22) hide show

evals/albanian/README.md +21 -30
evals/albanian/albanian.yaml +0 -2
evals/albanian/summarization/_default_summarization_yaml +0 -18
evals/albanian/summarization/albanian_massivesumm_long.yaml +0 -14
evals/albanian/summarization/albanian_massivesumm_short.yaml +0 -9
evals/albanian/summarization/albanian_summarization.yaml +0 -11
evals/albanian/summarization/utils.py +0 -111
evals/eval_config.toml +11 -9
evals/portuguese/README.md +15 -27
evals/portuguese/nli/portuguese_assin2_rte.yaml +0 -28
evals/portuguese/nli/portuguese_assin2_sts.yaml +0 -22
evals/portuguese/nli/portuguese_faquad_nli.yaml +0 -28
evals/portuguese/nli/portuguese_nli.yaml +0 -18
evals/portuguese/nli/utils.py +0 -102
evals/portuguese/portuguese.yaml +0 -2
evals/run_eval.py +20 -2
evals/ukrainian/README.md +17 -33
evals/ukrainian/summarization/_default_summarization_yaml +0 -18
evals/ukrainian/summarization/ukrainian_massivesumm_long.yaml +0 -14
evals/ukrainian/summarization/ukrainian_summarization.yaml +0 -10
evals/ukrainian/summarization/utils.py +0 -111
evals/ukrainian/ukrainian.yaml +0 -2

evals/albanian/README.md CHANGED Viewed

@@ -7,24 +7,21 @@ Albanian (Tosk, `als_Latn` / macro `sq`) evaluation suite for the
 ### Custom Tasks (require `--include_path`)
-| #   | Task Name                    | Category        | Dataset (HuggingFace)                                                                   | Metric             |
-| --- | ---------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------------ |
-| 1   | `albanian_sib200`            | Classification  | `Davlan/sib200` (`als_Latn`)                                                            | f1_macro           |
-| 2   | `albanian_belebele`          | MCQ             | `facebook/belebele` (`als_Latn`)                                                        | f1_macro           |
-| 3   | `albanian_global_mmlu`       | MCQ             | `CohereLabs/Global-MMLU-Lite` (`sq`, v2)                                                | f1_macro           |
-| 4   | `albanian_massivesumm_short` | Summarization   | `MaLA-LM/MassiveSumm_short` (filtered `language=sqi`)                                   | rouge_l            |
-| 5   | `albanian_massivesumm_long`  | Summarization   | `MaLA-LM/MassiveSumm_long` (filtered `language=sqi`)                                    | rouge_l            |
-| 6   | `albanian_aya`               | Open generation | `CohereLabs/aya_evaluation_suite` (`dolly_machine_translated`, filtered `language=sqi`) | llm_judge_score    |
-| 7   | `albanian_polywrite`         | Open generation | `MaLA-LM/PolyWrite` (filtered `lang_script=sqi_Latn`)                                   | open_quality_score |
 #### Subgroups
-| Group                      | Tasks                               |
-| -------------------------- | ----------------------------------- |
-| `albanian_classification`  | sib200                              |
-| `albanian_mcq`             | belebele, global_mmlu               |
-| `albanian_summarization`   | massivesumm_short, massivesumm_long |
-| `albanian_open_generation` | aya, polywrite                      |
 ## Setup
@@ -40,7 +37,7 @@ All commands must be run from the `multilingual_bench/` directory:
 cd /path/to/functionary_internal/evaluation/multilingual_bench
 ```
-### Run the Entire Albanian Suite (all 7 tasks)
 ```bash
 OPENAI_API_KEY="$OPENROUTER_API_KEY" \
@@ -68,7 +65,6 @@ python run_eval.py --models gpt-5-mini --tasks albanian
 ```bash
 lm_eval --include_path lm_eval_tasks --tasks albanian_classification ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_mcq ...
-lm_eval --include_path lm_eval_tasks --tasks albanian_summarization ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_open_generation ...
 ```
@@ -78,8 +74,6 @@ lm_eval --include_path lm_eval_tasks --tasks albanian_open_generation ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_sib200 ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_belebele ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_global_mmlu ...
-lm_eval --include_path lm_eval_tasks --tasks albanian_massivesumm_short ...
-lm_eval --include_path lm_eval_tasks --tasks albanian_massivesumm_long ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_aya ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_polywrite ...
 ```
@@ -93,22 +87,19 @@ With `--log_samples`, the output directory contains:
 ## Dataset Sources
-| Dataset           | Source                            | Config                                             | Notes                                                                |
-| ----------------- | --------------------------------- | -------------------------------------------------- | -------------------------------------------------------------------- |
-| SIB-200           | `Davlan/sib200`                   | `als_Latn`                                         | text + ClassLabel `category` (7 topics)                              |
-| Belebele          | `facebook/belebele`               | `als_Latn`                                         | flores_passage + question + 4 mc_answers, `correct_answer_num` 1-4   |
-| Global-MMLU-Lite  | `CohereLabs/Global-MMLU-Lite`     | `sq`                                               | question + `option_a..d` + `answer` letter (400 samples, CS+CA)      |
-| MassiveSumm short | `MaLA-LM/MassiveSumm_short`       | — (filter `language=sqi`)                          | `text`, `summary`, `language`; gated                                 |
-| MassiveSumm long  | `MaLA-LM/MassiveSumm_long`        | — (filter `language=sqi`)                          | same schema; longer articles                                         |
-| Aya Eval          | `CohereLabs/aya_evaluation_suite` | `dolly_machine_translated` (filter `language=sqi`) | `inputs`, `targets`, `language`, `script`                            |
-| PolyWrite         | `MaLA-LM/PolyWrite`               | — (filter `lang_script=sqi_Latn`)                  | `prompt_translated`, `category`, `lang_script` (no reference answer) |
 ### Gated datasets
-Several upstream datasets are gated on Hugging Face. Accept the terms (once) and export an HF token before running:
 - Aya Eval: <https://huggingface.co/datasets/CohereLabs/aya_evaluation_suite>
-- MassiveSumm short / long: <https://huggingface.co/datasets/MaLA-LM/MassiveSumm_short> and <https://huggingface.co/datasets/MaLA-LM/MassiveSumm_long>
 ```bash
 export HF_TOKEN="hf_..."

 ### Custom Tasks (require `--include_path`)
+| #   | Task Name              | Category        | Dataset (HuggingFace)                                                                   | Metric             |
+| --- | ---------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------------ |
+| 1   | `albanian_sib200`      | Classification  | `Davlan/sib200` (`als_Latn`)                                                            | f1_macro           |
+| 2   | `albanian_belebele`    | MCQ             | `facebook/belebele` (`als_Latn`)                                                        | f1_macro           |
+| 3   | `albanian_global_mmlu` | MCQ             | `CohereLabs/Global-MMLU-Lite` (`sq`, v2)                                                | f1_macro           |
+| 4   | `albanian_aya`         | Open generation | `CohereLabs/aya_evaluation_suite` (`dolly_machine_translated`, filtered `language=sqi`) | llm_judge_score    |
+| 5   | `albanian_polywrite`   | Open generation | `MaLA-LM/PolyWrite` (filtered `lang_script=sqi_Latn`)                                   | open_quality_score |
 #### Subgroups
+| Group                      | Tasks                 |
+| -------------------------- | --------------------- |
+| `albanian_classification`  | sib200                |
+| `albanian_mcq`             | belebele, global_mmlu |
+| `albanian_open_generation` | aya, polywrite        |
 ## Setup
 cd /path/to/functionary_internal/evaluation/multilingual_bench
 ```
+### Run the Entire Albanian Suite (all 5 tasks)
 ```bash
 OPENAI_API_KEY="$OPENROUTER_API_KEY" \
 ```bash
 lm_eval --include_path lm_eval_tasks --tasks albanian_classification ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_mcq ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_open_generation ...
 ```
 lm_eval --include_path lm_eval_tasks --tasks albanian_sib200 ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_belebele ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_global_mmlu ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_aya ...
 lm_eval --include_path lm_eval_tasks --tasks albanian_polywrite ...
 ```
 ## Dataset Sources
+| Dataset          | Source                            | Config                                             | Notes                                                                |
+| ---------------- | --------------------------------- | -------------------------------------------------- | -------------------------------------------------------------------- |
+| SIB-200          | `Davlan/sib200`                   | `als_Latn`                                         | text + ClassLabel `category` (7 topics)                              |
+| Belebele         | `facebook/belebele`               | `als_Latn`                                         | flores_passage + question + 4 mc_answers, `correct_answer_num` 1-4   |
+| Global-MMLU-Lite | `CohereLabs/Global-MMLU-Lite`     | `sq`                                               | question + `option_a..d` + `answer` letter (400 samples, CS+CA)      |
+| Aya Eval         | `CohereLabs/aya_evaluation_suite` | `dolly_machine_translated` (filter `language=sqi`) | `inputs`, `targets`, `language`, `script`                            |
+| PolyWrite        | `MaLA-LM/PolyWrite`               | — (filter `lang_script=sqi_Latn`)                  | `prompt_translated`, `category`, `lang_script` (no reference answer) |
 ### Gated datasets
+The Aya Eval dataset is gated on Hugging Face. Accept the terms (once) and export an HF token before running:
 - Aya Eval: <https://huggingface.co/datasets/CohereLabs/aya_evaluation_suite>
 ```bash
 export HF_TOKEN="hf_..."

evals/albanian/albanian.yaml CHANGED Viewed

@@ -9,13 +9,11 @@
 #
 # Metrics:
 #   classification & mcq → f1_macro                            (per sub-group)
-#   summarization        → rouge_l                             (per sub-group)
 #   open_generation      → llm_judge_score / open_quality_score (per sub-group)
 group: albanian
 task:
   - albanian_classification
   - albanian_mcq
-  - albanian_summarization
   - albanian_open_generation
 metadata:
   version: 1.0

 #
 # Metrics:
 #   classification & mcq → f1_macro                            (per sub-group)
 #   open_generation      → llm_judge_score / open_quality_score (per sub-group)
 group: albanian
 task:
   - albanian_classification
   - albanian_mcq
   - albanian_open_generation
 metadata:
   version: 1.0

evals/albanian/summarization/_default_summarization_yaml DELETED Viewed

@@ -1,18 +0,0 @@
-# Shared config for Albanian summarization tasks (MassiveSumm).
-# The Albanian-only filter is applied in process_docs (the upstream
-# datasets are highly multilingual single-table dumps, no per-language
-# config). Scoring is sentence-level ROUGE-L F1.
-output_type: generate_until
-test_split: train
-generation_kwargs:
-  do_sample: false
-  max_gen_toks: 8192
-  until:
-    - "<|endoftext|>"
-process_results: !function utils.process_results
-metric_list:
-  - metric: rouge_l
-    aggregation: !function utils.rouge_l_agg
-    higher_is_better: true
-metadata:
-  version: 1.0

evals/albanian/summarization/albanian_massivesumm_long.yaml DELETED Viewed

@@ -1,14 +0,0 @@
-task: albanian_massivesumm_long
-task_alias: massivesumm_long
-# Gated dataset: accept terms at
-# https://huggingface.co/datasets/MaLA-LM/MassiveSumm_long and export HF_TOKEN.
-dataset_path: MaLA-LM/MassiveSumm_long
-include: _default_summarization_yaml
-process_docs: !function utils.process_docs
-generation_kwargs:
-  do_sample: false
-  max_gen_toks: 8192
-  until:
-    - "<|endoftext|>"
-doc_to_text: "Ti je një sistem përmbledhjeje lajmesh.\nPërmblidh artikullin e mëposhtëm në një paragraf të shkurtër (3-5 fjali) në gjuhën shqipe. Mos shto komente.\n\nArtikulli:\n{{text}}\n\nPërmbledhja:"
-doc_to_target: "{{summary}}"

evals/albanian/summarization/albanian_massivesumm_short.yaml DELETED Viewed

@@ -1,9 +0,0 @@
-task: albanian_massivesumm_short
-task_alias: massivesumm_short
-# Gated dataset: accept terms at
-# https://huggingface.co/datasets/MaLA-LM/MassiveSumm_short and export HF_TOKEN.
-dataset_path: MaLA-LM/MassiveSumm_short
-include: _default_summarization_yaml
-process_docs: !function utils.process_docs
-doc_to_text: "Ti je një sistem përmbledhjeje lajmesh.\nPërmblidh artikullin e mëposhtëm në një ose dy fjali të shkurtra në gjuhën shqipe. Mos shto komente.\n\nArtikulli:\n{{text}}\n\nPërmbledhja:"
-doc_to_target: "{{summary}}"

evals/albanian/summarization/albanian_summarization.yaml DELETED Viewed

@@ -1,11 +0,0 @@
-# Summarization subgroup (MassiveSumm short + long)
-group: albanian_summarization
-task:
-  - albanian_massivesumm_short
-  - albanian_massivesumm_long
-aggregate_metric_list:
-  - metric: rouge_l
-    aggregation: mean
-    weight_by_size: true
-metadata:
-  version: 1.0

evals/albanian/summarization/utils.py DELETED Viewed

@@ -1,111 +0,0 @@
-"""Utility helpers for Albanian summarization tasks (MassiveSumm short/long).
-Both MassiveSumm subsets are highly multilingual single-table datasets
-(one ``train`` split, no language configs). We filter to Albanian rows
-inside ``process_docs``. The HF dataset is **gated** — accept the terms
-on the dataset page once and export ``HF_TOKEN`` before running.
-Scoring uses ROUGE-L F1 via the ``rouge_score`` package, which is
-already a transitive dependency of lm-evaluation-harness.
-"""
-from __future__ import annotations
-import re
-import string
-import datasets
-_ALBANIAN_LANG_CODES = {"sqi", "als", "aln"}
-def _strip_think_tags(text: str) -> str:
-    """Strip <think>...</think> reasoning wrapper (e.g. Qwen thinking models)."""
-    if "</think>" in text:
-        return text.split("</think>")[-1].strip()
-    return text
-def _filter_albanian(dataset: datasets.Dataset) -> datasets.Dataset:
-    """Keep rows whose ``language`` field is one of Albanian variants."""
-    if "language" not in dataset.column_names:
-        return dataset
-    return dataset.filter(lambda row: str(row.get("language", "")).lower() in _ALBANIAN_LANG_CODES)
-def _normalise_doc(doc):
-    """Project the columns we actually need."""
-    text = (doc.get("text") or "").strip()
-    summary = (doc.get("summary") or "").strip()
-    title = (doc.get("title") or "").strip()
-    return {
-        "text": text,
-        "summary": summary,
-        "title": title,
-    }
-def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
-    filtered = _filter_albanian(dataset)
-    return filtered.map(_normalise_doc, remove_columns=[
-        c for c in filtered.column_names if c not in ("text", "summary", "title")
-    ])
-# ── ROUGE-L scoring ──────────────────────────────────────────────────
-_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
-_WHITESPACE_RE = re.compile(r"\s+")
-def _normalise(text: str) -> str:
-    text = text.translate(_PUNCT_TABLE)
-    text = _WHITESPACE_RE.sub(" ", text)
-    return text.strip().lower()
-def _lcs_length(a, b):
-    m, n = len(a), len(b)
-    if m == 0 or n == 0:
-        return 0
-    dp = [0] * (n + 1)
-    for i in range(1, m + 1):
-        prev = 0
-        for j in range(1, n + 1):
-            tmp = dp[j]
-            if a[i - 1] == b[j - 1]:
-                dp[j] = prev + 1
-            else:
-                dp[j] = max(dp[j], dp[j - 1])
-            prev = tmp
-    return dp[n]
-def _rouge_l_f1(pred: str, gold: str) -> float:
-    """Compute sentence-level ROUGE-L F1 (no stemming) between pred and gold."""
-    pred_tokens = _normalise(pred).split()
-    gold_tokens = _normalise(gold).split()
-    if not pred_tokens or not gold_tokens:
-        return 0.0
-    lcs = _lcs_length(pred_tokens, gold_tokens)
-    if lcs == 0:
-        return 0.0
-    precision = lcs / len(pred_tokens)
-    recall = lcs / len(gold_tokens)
-    return 2 * precision * recall / (precision + recall)
-def process_results(doc, results):
-    raw_response = results[0].strip() if results and results[0] else ""
-    pred = _strip_think_tags(raw_response)
-    gold = (doc.get("summary") or "").strip()
-    return {"rouge_l": (gold, pred)}
-def rouge_l_agg(items):
-    if not items:
-        return 0.0
-    scores = [_rouge_l_f1(pred, gold) for gold, pred in items]
-    return sum(scores) / len(scores)

evals/eval_config.toml CHANGED Viewed

@@ -29,6 +29,17 @@ apply_chat_template = true
 log_samples = true
 output_path = "output/results"
 # ── Hugging Face Hub ─────────────────────────────────────────────────
 # Token: export HF_TOKEN="hf_..."  (https://huggingface.co/settings/tokens)
 #
@@ -161,9 +172,6 @@ name = "albanian_classification"
 [[tasks]]
 name = "albanian_mcq"
-[[tasks]]
-name = "albanian_summarization"
 [[tasks]]
 name = "albanian_open_generation"
@@ -174,9 +182,6 @@ name = "portuguese_mcq"
 [[tasks]]
 name = "portuguese_classification"
-[[tasks]]
-name = "portuguese_nli"
 # Ukrainian
 [[tasks]]
 name = "ukrainian_classification"
@@ -187,9 +192,6 @@ name = "ukrainian_mcq"
 [[tasks]]
 name = "ukrainian_qa"
-[[tasks]]
-name = "ukrainian_summarization"
 [[tasks]]
 name = "ukrainian_open_generation"

 log_samples = true
 output_path = "output/results"
+# Resilience knobs forwarded to lm_eval's TemplateAPI (local-chat-completions):
+#   max_retries → tenacity stop_after_attempt(N) per request.
+#   timeout     → aiohttp ClientTimeout(total=SEC) per request.
+# Default lm_eval values (3 / 300) are too low for long CoT tasks against an
+# overloaded SGLang / Functionary endpoint — slow tail requests or server
+# hiccups (ServerDisconnectedError, ConnectionReset, Cloudflare 524) exhaust
+# retries and abort the run. Mirrors functionary_internal's working config:
+# https://github.com/MeetKai/functionary_internal/blob/main/functionary_internal/evaluation/multilingual_bench/lm_eval_tasks/eval_config.toml
+max_retries = 5
+timeout = 1800
 # ── Hugging Face Hub ─────────────────────────────────────────────────
 # Token: export HF_TOKEN="hf_..."  (https://huggingface.co/settings/tokens)
 #
 [[tasks]]
 name = "albanian_mcq"
 [[tasks]]
 name = "albanian_open_generation"
 [[tasks]]
 name = "portuguese_classification"
 # Ukrainian
 [[tasks]]
 name = "ukrainian_classification"
 [[tasks]]
 name = "ukrainian_qa"
 [[tasks]]
 name = "ukrainian_open_generation"

evals/portuguese/README.md CHANGED Viewed

@@ -12,17 +12,13 @@ Portuguese (PT-BR) evaluation suite for the `lm-evaluation-harness` framework.
 | 4   | `portuguese_hatebr`      | Classification | `eduagarcia/portuguese_benchmark` (HateBR)      | f1_macro    |
 | 5   | `portuguese_hate_speech` | Classification | `eduagarcia/portuguese_benchmark` (Hate Speech) | f1_macro    |
 | 6   | `portuguese_tweetsentbr` | Classification | `eduagarcia/tweetsentbr_fewshot`                | f1_macro    |
-| 7   | `portuguese_assin2_rte`  | NLI            | `assin2`                                        | f1_macro    |
-| 8   | `portuguese_faquad_nli`  | NLI            | `ruanchaves/faquad-nli`                         | f1_macro    |
-| 9   | `portuguese_assin2_sts`  | NLI            | `assin2`                                        | pearson     |
 ### Subgroups
-| Group                       | Tasks                              |
-| --------------------------- | ---------------------------------- |
-| `portuguese_mcq`            | enem, bluex, oab_exams             |
-| `portuguese_classification` | hatebr, hate_speech, tweetsentbr   |
-| `portuguese_nli`            | assin2_rte, faquad_nli, assin2_sts |
 ## Setup
@@ -39,7 +35,7 @@ cd functionary_internal/evaluation/multilingual_bench/lm_eval_tasks
 export INCLUDE_PATH="$(pwd)"
 ```
-### Run the Entire Portuguese (all 9 tasks)
 ```bash
 OPENAI_API_KEY="your-key" \
@@ -63,9 +59,6 @@ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_mcq ...
 # Classification (hate speech, sentiment)
 lm_eval --include_path $INCLUDE_PATH --tasks portuguese_classification ...
-# Natural Language Inference (ASSIN2 RTE + FaQuAD NLI + ASSIN2 STS)
-lm_eval --include_path $INCLUDE_PATH --tasks portuguese_nli ...
 ```
 ### Run a Single Task
@@ -85,9 +78,6 @@ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_hatebr ...
 # Sentiment analysis
 lm_eval --include_path $INCLUDE_PATH --tasks portuguese_tweetsentbr ...
-# Textual entailment
-lm_eval --include_path $INCLUDE_PATH --tasks portuguese_assin2_rte ...
 ```
 ### Run with a Local HuggingFace Model
@@ -106,8 +96,8 @@ lm_eval \
 ### Mix and Match
 ```bash
-# Run ENEM + ASSIN2 RTE only
-lm_eval --include_path $INCLUDE_PATH --tasks portuguese_enem,portuguese_assin2_rte ...
 ```
 ## Output
@@ -119,13 +109,11 @@ With `--log_samples`, the output directory contains:
 ## Dataset Sources
-| Dataset                | Source                                 | Config                          | Fields                                                      |
-| ---------------------- | -------------------------------------- | ------------------------------- | ----------------------------------------------------------- |
-| ENEM                   | `eduagarcia/enem_challenge`            | —                               | question, choices, answerKey                                |
-| BLUEX                  | `eduagarcia-temp/BLUEX_without_images` | —                               | question, choices, answerKey                                |
-| OAB Exams              | `eduagarcia/oab_exams`                 | —                               | question, choices, answerKey                                |
-| HateBR                 | `eduagarcia/portuguese_benchmark`      | `HateBR_offensive_binary`       | sentence, label                                             |
-| Portuguese Hate Speech | `eduagarcia/portuguese_benchmark`      | `Portuguese_Hate_Speech_binary` | sentence, label                                             |
-| TweetSentBR            | `eduagarcia/tweetsentbr_fewshot`       | —                               | sentence, label                                             |
-| ASSIN2                 | `assin2`                               | —                               | premise, hypothesis, entailment_judgment, relatedness_score |
-| FaQuAD-NLI             | `ruanchaves/faquad-nli`                | —                               | question, answer, label                                     |

 | 4   | `portuguese_hatebr`      | Classification | `eduagarcia/portuguese_benchmark` (HateBR)      | f1_macro    |
 | 5   | `portuguese_hate_speech` | Classification | `eduagarcia/portuguese_benchmark` (Hate Speech) | f1_macro    |
 | 6   | `portuguese_tweetsentbr` | Classification | `eduagarcia/tweetsentbr_fewshot`                | f1_macro    |
 ### Subgroups
+| Group                       | Tasks                            |
+| --------------------------- | -------------------------------- |
+| `portuguese_mcq`            | enem, bluex, oab_exams           |
+| `portuguese_classification` | hatebr, hate_speech, tweetsentbr |
 ## Setup
 export INCLUDE_PATH="$(pwd)"
 ```
+### Run the Entire Portuguese Suite (all 6 tasks)
 ```bash
 OPENAI_API_KEY="your-key" \
 # Classification (hate speech, sentiment)
 lm_eval --include_path $INCLUDE_PATH --tasks portuguese_classification ...
 ```
 ### Run a Single Task
 # Sentiment analysis
 lm_eval --include_path $INCLUDE_PATH --tasks portuguese_tweetsentbr ...
 ```
 ### Run with a Local HuggingFace Model
 ### Mix and Match
 ```bash
+# Run ENEM + HateBR only
+lm_eval --include_path $INCLUDE_PATH --tasks portuguese_enem,portuguese_hatebr ...
 ```
 ## Output
 ## Dataset Sources
+| Dataset                | Source                                 | Config                          | Fields                       |
+| ---------------------- | -------------------------------------- | ------------------------------- | ---------------------------- |
+| ENEM                   | `eduagarcia/enem_challenge`            | —                               | question, choices, answerKey |
+| BLUEX                  | `eduagarcia-temp/BLUEX_without_images` | —                               | question, choices, answerKey |
+| OAB Exams              | `eduagarcia/oab_exams`                 | —                               | question, choices, answerKey |
+| HateBR                 | `eduagarcia/portuguese_benchmark`      | `HateBR_offensive_binary`       | sentence, label              |
+| Portuguese Hate Speech | `eduagarcia/portuguese_benchmark`      | `Portuguese_Hate_Speech_binary` | sentence, label              |
+| TweetSentBR            | `eduagarcia/tweetsentbr_fewshot`       | —                               | sentence, label              |

evals/portuguese/nli/portuguese_assin2_rte.yaml DELETED Viewed

@@ -1,28 +0,0 @@
-task: portuguese_assin2_rte
-task_alias: assin2_rte
-dataset_path: assin2
-test_split: test
-output_type: generate_until
-generation_kwargs:
-  do_sample: false
-  max_gen_toks: 8192
-  until:
-    - "<|endoftext|>"
-process_docs: !function utils.process_assin2_rte_docs
-doc_to_text: "Indique se a hipótese pode ser inferida a partir da premissa. Responda apenas com \"Sim\" ou \"Não\".\n\nPremissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:"
-doc_to_target: "{{target}}"
-filter_list:
-  - name: "get_label"
-    filter:
-      - function: "strip_think_recover"
-      - function: "regex"
-        regex_pattern: "(Sim|Não)"
-        group_select: 0
-      - function: "take_first"
-process_results: !function utils.process_nli_results
-metric_list:
-  - metric: f1_macro
-    aggregation: !function utils.macro_f1_agg
-    higher_is_better: true
-metadata:
-  version: 1.0

evals/portuguese/nli/portuguese_assin2_sts.yaml DELETED Viewed

@@ -1,22 +0,0 @@
-task: portuguese_assin2_sts
-task_alias: assin2_sts
-dataset_path: assin2
-test_split: test
-output_type: generate_until
-generation_kwargs:
-  do_sample: false
-  max_gen_toks: 8192
-  until:
-    - "<|endoftext|>"
-doc_to_text: "Avalie o grau de similaridade entre as duas frases abaixo. Dê uma pontuação entre 1,0 e 5,0 (1,0 = pouco similar, 5,0 = muito similar).\n\nFrase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:"
-doc_to_target: !function utils.assin2_float_to_pt_str
-process_results: !function utils.process_sts_results
-metric_list:
-  - metric: pearson
-    aggregation: !function utils.pearson_agg
-    higher_is_better: true
-  - metric: mse
-    aggregation: !function utils.mse_agg
-    higher_is_better: false
-metadata:
-  version: 1.0

evals/portuguese/nli/portuguese_faquad_nli.yaml DELETED Viewed

@@ -1,28 +0,0 @@
-task: portuguese_faquad_nli
-task_alias: faquad_nli
-dataset_path: ruanchaves/faquad-nli
-test_split: test
-output_type: generate_until
-generation_kwargs:
-  do_sample: false
-  max_gen_toks: 8192
-  until:
-    - "<|endoftext|>"
-process_docs: !function utils.process_faquad_nli_docs
-doc_to_text: "Julgue se a resposta satisfaz à pergunta de maneira satisfatória. Responda apenas com \"Sim\" ou \"Não\".\n\nPergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?"
-doc_to_target: "{{target}}"
-filter_list:
-  - name: "get_label"
-    filter:
-      - function: "strip_think_recover"
-      - function: "regex"
-        regex_pattern: "(Sim|Não)"
-        group_select: 0
-      - function: "take_first"
-process_results: !function utils.process_nli_results
-metric_list:
-  - metric: f1_macro
-    aggregation: !function utils.macro_f1_agg
-    higher_is_better: true
-metadata:
-  version: 1.0

evals/portuguese/nli/portuguese_nli.yaml DELETED Viewed

@@ -1,18 +0,0 @@
-# Natural Language Inference subgroup (ASSIN2 RTE + FaQuAD NLI + ASSIN2 STS)
-group: portuguese_nli
-task:
-  - portuguese_assin2_rte
-  - portuguese_faquad_nli
-  - portuguese_assin2_sts
-aggregate_metric_list:
-  - metric: f1_macro
-    aggregation: mean
-    weight_by_size: true
-  - metric: pearson
-    aggregation: mean
-    weight_by_size: true
-  - metric: mse
-    aggregation: mean
-    weight_by_size: true
-metadata:
-  version: 1.0

evals/portuguese/nli/utils.py DELETED Viewed

@@ -1,102 +0,0 @@
-"""Utility helpers for Portuguese NLI / STS tasks (generative mode).
-Covers ASSIN2 (RTE & STS) and FaQuAD-NLI.
-"""
-import os as _os, sys as _sys  # noqa: E401
-_sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..","..",)))
-import math
-import re
-from f1_utils import macro_f1_agg, process_results_f1  # noqa: F401
-# ── Document pre-processing ─────────────────────────────────────────
-def process_assin2_rte_docs(dataset):
-    """Map entailment_judgment 0/1 → Não/Sim."""
-    def _map(doc):
-        doc["target"] = "Sim" if doc["entailment_judgment"] == 1 else "Não"
-        return doc
-    return dataset.map(_map)
-def process_faquad_nli_docs(dataset):
-    """Map label 0/1 → Não/Sim."""
-    def _map(doc):
-        doc["target"] = "Sim" if doc["label"] == 1 else "Não"
-        return doc
-    return dataset.map(_map)
-# ── NLI result processing ────────────────────────────────────────────
-def process_nli_results(doc, results):
-    """Return (pred, gold) tuple for macro-F1 aggregation."""
-    return process_results_f1(doc, results)
-# ── STS helpers ──────────────────────────────────────────────────────
-def assin2_float_to_pt_str(doc):
-    """Format relatedness_score as a Portuguese-style decimal string (comma)."""
-    return "{:.1f}".format(doc["relatedness_score"]).replace(".", ",")
-def _extract_float(text):
-    """Extract a float value from text, handling Portuguese comma notation."""
-    text = text.strip()
-    # Try to find a number pattern (comma or dot as decimal separator)
-    match = re.search(r"(\d+[,.]?\d*)", text)
-    if match:
-        num_str = match.group(1).replace(",", ".")
-        try:
-            val = float(num_str)
-            return max(1.0, min(5.0, val))  # Clip to [1.0, 5.0]
-        except ValueError:
-            pass
-    return 5.0  # Default fallback (same as original)
-def process_sts_results(doc, results):
-    """Extract predicted float and pair with gold for pearson/mse."""
-    pred_text = results[0].strip() if results[0] else ""
-    pred_val = _extract_float(pred_text)
-    gold_val = doc["relatedness_score"]
-    return {
-        "pearson": (pred_val, gold_val),
-        "mse": (pred_val, gold_val),
-    }
-def pearson_agg(items):
-    """Compute Pearson correlation coefficient."""
-    preds = [item[0] for item in items]
-    golds = [item[1] for item in items]
-    n = len(preds)
-    if n < 2:
-        return 0.0
-    mean_p = sum(preds) / n
-    mean_g = sum(golds) / n
-    cov = sum((p - mean_p) * (g - mean_g) for p, g in zip(preds, golds)) / n
-    std_p = math.sqrt(sum((p - mean_p) ** 2 for p in preds) / n)
-    std_g = math.sqrt(sum((g - mean_g) ** 2 for g in golds) / n)
-    if std_p * std_g == 0:
-        return 0.0
-    return cov / (std_p * std_g)
-def mse_agg(items):
-    """Compute mean squared error."""
-    preds = [item[0] for item in items]
-    golds = [item[1] for item in items]
-    return sum((p - g) ** 2 for p, g in zip(preds, golds)) / len(preds)

evals/portuguese/portuguese.yaml CHANGED Viewed

@@ -8,11 +8,9 @@
 # Sub-groups:
 #   mcq            → exam multiple-choice (enem, bluex, oab_exams)
 #   classification → sentiment / hate / emotion classification
-#   nli            → natural language inference & textual similarity
 group: portuguese
 task:
   - portuguese_mcq
   - portuguese_classification
-  - portuguese_nli
 metadata:
   version: 1.0

 # Sub-groups:
 #   mcq            → exam multiple-choice (enem, bluex, oab_exams)
 #   classification → sentiment / hate / emotion classification
 group: portuguese
 task:
   - portuguese_mcq
   - portuguese_classification
 metadata:
   version: 1.0

evals/run_eval.py CHANGED Viewed

@@ -456,10 +456,21 @@ def run_single_eval(
     gen_kwargs: str | None = None,
     hf_hub: dict | None = None,
     endpoint_kind: str = "chat_completions",
 ) -> dict | None:
-    """Run a single lm_eval evaluation via the Python API and return results."""
     _ensure_lm_eval_api_key()
-    model_args = f"model={model_id},base_url={base_url},num_concurrent={num_concurrent}"
     tracker = _build_evaluation_tracker(output_path, hf_hub)
@@ -957,6 +968,11 @@ def main():
                 if args.num_concurrent is not None
                 else defaults.get("num_concurrent", 5)
             )
             eval_kwargs = dict(
                 include_path=SCRIPT_DIR,
@@ -976,6 +992,8 @@ def main():
                 gen_kwargs=gen_kwargs,
                 hf_hub=hf_hub,
                 endpoint_kind=model.get("endpoint_kind", "chat_completions"),
             )
             header = f"[{run_idx}/{total}] {model['name']} x {task['name']}"

     gen_kwargs: str | None = None,
     hf_hub: dict | None = None,
     endpoint_kind: str = "chat_completions",
+    max_retries: int = 3,
+    timeout: int = 300,
 ) -> dict | None:
+    """Run a single lm_eval evaluation via the Python API and return results.
+    ``max_retries`` and ``timeout`` are forwarded into lm-eval's TemplateAPI
+    via ``model_args``. Defaults match lm-eval's own stock values; the TOML
+    ``[defaults]`` block typically overrides them for Functionary endpoints
+    (see eval_config.toml for the rationale).
+    """
     _ensure_lm_eval_api_key()
+    model_args = (
+        f"model={model_id},base_url={base_url},num_concurrent={num_concurrent},"
+        f"max_retries={max_retries},timeout={timeout}"
+    )
     tracker = _build_evaluation_tracker(output_path, hf_hub)
                 if args.num_concurrent is not None
                 else defaults.get("num_concurrent", 5)
             )
+            # Resilience knobs read from [defaults]; see eval_config.toml.
+            # Stock lm-eval defaults (3 / 300) are kept as fallbacks so the
+            # behavior is unchanged when the TOML doesn't override them.
+            max_retries = int(defaults.get("max_retries", 3))
+            timeout = int(defaults.get("timeout", 300))
             eval_kwargs = dict(
                 include_path=SCRIPT_DIR,
                 gen_kwargs=gen_kwargs,
                 hf_hub=hf_hub,
                 endpoint_kind=model.get("endpoint_kind", "chat_completions"),
+                max_retries=max_retries,
+                timeout=timeout,
             )
             header = f"[{run_idx}/{total}] {model['name']} x {task['name']}"

evals/ukrainian/README.md CHANGED Viewed

@@ -7,15 +7,14 @@ Ukrainian (`ukr_Cyrl` / macro `uk`) evaluation suite for the
 ### Custom Tasks (require `--include_path`)
-| #   | Task Name                    | Category          | Dataset (HuggingFace)                                          | Metric             |
-| --- | ---------------------------- | ----------------- | -------------------------------------------------------------- | ------------------ |
-| 1   | `ukrainian_sib200`           | Classification    | `Davlan/sib200` (`ukr_Cyrl`)                                   | f1_macro           |
-| 2   | `ukrainian_belebele`         | MCQ               | `facebook/belebele` (`ukr_Cyrl`)                               | f1_macro           |
-| 3   | `ukrainian_global_mmlu`      | MCQ               | `CohereLabs/Global-MMLU` (`uk`, filtered to CS+CA, ~400 items) | f1_macro           |
-| 4   | `ukrainian_zno`              | MCQ (native exam) | `osyvokon/zno`                                                 | f1_macro           |
-| 5   | `ukrainian_squad`            | Extractive QA     | `HPLT/ua-squad` (revised UA-SQuAD)                             | exact_match + f1   |
-| 6   | `ukrainian_massivesumm_long` | Summarization     | `MaLA-LM/MassiveSumm_long` (filtered `language=ukr`)           | rouge_l            |
-| 7   | `ukrainian_polywrite`        | Open generation   | `MaLA-LM/PolyWrite` (filtered `lang_script=ukr_Cyrl`)          | open_quality_score |
 #### Subgroups
@@ -24,7 +23,6 @@ Ukrainian (`ukr_Cyrl` / macro `uk`) evaluation suite for the
 | `ukrainian_classification`  | sib200                     |
 | `ukrainian_mcq`             | belebele, global_mmlu, zno |
 | `ukrainian_qa`              | squad                      |
-| `ukrainian_summarization`   | massivesumm_long           |
 | `ukrainian_open_generation` | polywrite                  |
 ## Setup
@@ -41,7 +39,7 @@ All commands must be run from the `multilingual_bench/` directory:
 cd /path/to/functionary_internal/evaluation/multilingual_bench
 ```
-### Run the Entire Ukrainian Suite (all 7 tasks)
 ```bash
 OPENAI_API_KEY="$OPENROUTER_API_KEY" \
@@ -70,7 +68,6 @@ python run_eval.py --models gpt-5-mini --tasks ukrainian
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_classification ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_mcq ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_qa ...
-lm_eval --include_path lm_eval_tasks --tasks ukrainian_summarization ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_open_generation ...
 ```
@@ -82,7 +79,6 @@ lm_eval --include_path lm_eval_tasks --tasks ukrainian_belebele ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_global_mmlu ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_zno ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_squad ...
-lm_eval --include_path lm_eval_tasks --tasks ukrainian_massivesumm_long ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_polywrite ...
 ```
@@ -95,26 +91,14 @@ With `--log_samples`, the output directory contains:
 ## Dataset Sources
-| Dataset          | Source                     | Config                            | Notes                                                                                                                                                                                                                                                                                                                                      |
-| ---------------- | -------------------------- | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| SIB-200          | `Davlan/sib200`            | `ukr_Cyrl`                        | text + ClassLabel `category` (7 topics)                                                                                                                                                                                                                                                                                                    |
-| Belebele         | `facebook/belebele`        | `ukr_Cyrl`                        | flores_passage + question + 4 mc_answers, `correct_answer_num` 1-4                                                                                                                                                                                                                                                                         |
-| Global-MMLU      | `CohereLabs/Global-MMLU`   | `uk`                              | Loaded from the **full** Global-MMLU (Lite doesn't ship `uk`) and filtered in `process_global_mmlu_docs` to `cultural_sensitivity_label ∈ {CS, CA}`, giving ~400 culturally-annotated items — the same subset Lite ships for other languages. Schema: `question`, `option_a..d`, `answer` letter, `subject`, `cultural_sensitivity_label`. |
-| ZNO              | `osyvokon/zno`             | —                                 | Native Ukrainian high-school exam (Ukrainian language & literature + history of Ukraine). 751 test items (2020-2023). Cyrillic markers А/Б/В/Г/Д, gold in `correct_answers[0]`                                                                                                                                                             |
-| UA-SQuAD         | `HPLT/ua-squad`            | —                                 | Native Ukrainian extractive QA (revised UA-SQuAD). 7,729 rows, ~50/50 `train`/`test` (we use `test`). Standard SQuAD schema: `id`, `context`, `question`, `answers.{text, answer_start}`. MIT license.                                                                                                                                     |
-| MassiveSumm long | `MaLA-LM/MassiveSumm_long` | — (filter `language=ukr`)         | `text`, `summary`, `language`; longer articles; gated                                                                                                                                                                                                                                                                                      |
-| PolyWrite        | `MaLA-LM/PolyWrite`        | — (filter `lang_script=ukr_Cyrl`) | `prompt_translated`, `category`, `lang_script` (no reference answer)                                                                                                                                                                                                                                                                       |
-### Gated datasets
-The MassiveSumm dataset is gated on Hugging Face. Accept the terms (once) and export an HF token before running:
-- MassiveSumm long: <https://huggingface.co/datasets/MaLA-LM/MassiveSumm_long>
-```bash
-export HF_TOKEN="hf_..."
-huggingface-cli login   # one-time, optional if HF_TOKEN is exported
-```
 ### LLM-judge tasks

 ### Custom Tasks (require `--include_path`)
+| #   | Task Name               | Category          | Dataset (HuggingFace)                                          | Metric             |
+| --- | ----------------------- | ----------------- | -------------------------------------------------------------- | ------------------ |
+| 1   | `ukrainian_sib200`      | Classification    | `Davlan/sib200` (`ukr_Cyrl`)                                   | f1_macro           |
+| 2   | `ukrainian_belebele`    | MCQ               | `facebook/belebele` (`ukr_Cyrl`)                               | f1_macro           |
+| 3   | `ukrainian_global_mmlu` | MCQ               | `CohereLabs/Global-MMLU` (`uk`, filtered to CS+CA, ~400 items) | f1_macro           |
+| 4   | `ukrainian_zno`         | MCQ (native exam) | `osyvokon/zno`                                                 | f1_macro           |
+| 5   | `ukrainian_squad`       | Extractive QA     | `HPLT/ua-squad` (revised UA-SQuAD)                             | exact_match + f1   |
+| 6   | `ukrainian_polywrite`   | Open generation   | `MaLA-LM/PolyWrite` (filtered `lang_script=ukr_Cyrl`)          | open_quality_score |
 #### Subgroups
 | `ukrainian_classification`  | sib200                     |
 | `ukrainian_mcq`             | belebele, global_mmlu, zno |
 | `ukrainian_qa`              | squad                      |
 | `ukrainian_open_generation` | polywrite                  |
 ## Setup
 cd /path/to/functionary_internal/evaluation/multilingual_bench
 ```
+### Run the Entire Ukrainian Suite (all 6 tasks)
 ```bash
 OPENAI_API_KEY="$OPENROUTER_API_KEY" \
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_classification ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_mcq ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_qa ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_open_generation ...
 ```
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_global_mmlu ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_zno ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_squad ...
 lm_eval --include_path lm_eval_tasks --tasks ukrainian_polywrite ...
 ```
 ## Dataset Sources
+| Dataset     | Source                   | Config                            | Notes                                                                                                                                                                                                                                                                                                                                      |
+| ----------- | ------------------------ | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| SIB-200     | `Davlan/sib200`          | `ukr_Cyrl`                        | text + ClassLabel `category` (7 topics)                                                                                                                                                                                                                                                                                                    |
+| Belebele    | `facebook/belebele`      | `ukr_Cyrl`                        | flores_passage + question + 4 mc_answers, `correct_answer_num` 1-4                                                                                                                                                                                                                                                                         |
+| Global-MMLU | `CohereLabs/Global-MMLU` | `uk`                              | Loaded from the **full** Global-MMLU (Lite doesn't ship `uk`) and filtered in `process_global_mmlu_docs` to `cultural_sensitivity_label ∈ {CS, CA}`, giving ~400 culturally-annotated items — the same subset Lite ships for other languages. Schema: `question`, `option_a..d`, `answer` letter, `subject`, `cultural_sensitivity_label`. |
+| ZNO         | `osyvokon/zno`           | —                                 | Native Ukrainian high-school exam (Ukrainian language & literature + history of Ukraine). 751 test items (2020-2023). Cyrillic markers А/Б/В/Г/Д, gold in `correct_answers[0]`                                                                                                                                                             |
+| UA-SQuAD    | `HPLT/ua-squad`          | —                                 | Native Ukrainian extractive QA (revised UA-SQuAD). 7,729 rows, ~50/50 `train`/`test` (we use `test`). Standard SQuAD schema: `id`, `context`, `question`, `answers.{text, answer_start}`. MIT license.                                                                                                                                     |
+| PolyWrite   | `MaLA-LM/PolyWrite`      | — (filter `lang_script=ukr_Cyrl`) | `prompt_translated`, `category`, `lang_script` (no reference answer)                                                                                                                                                                                                                                                                       |
 ### LLM-judge tasks

evals/ukrainian/summarization/_default_summarization_yaml DELETED Viewed

@@ -1,18 +0,0 @@
-# Shared config for Ukrainian summarization tasks (MassiveSumm).
-# The Ukrainian-only filter is applied in process_docs (the upstream
-# datasets are highly multilingual single-table dumps, no per-language
-# config). Scoring is sentence-level ROUGE-L F1.
-output_type: generate_until
-test_split: train
-generation_kwargs:
-  do_sample: false
-  max_gen_toks: 8192
-  until:
-    - "<|endoftext|>"
-process_results: !function utils.process_results
-metric_list:
-  - metric: rouge_l
-    aggregation: !function utils.rouge_l_agg
-    higher_is_better: true
-metadata:
-  version: 1.0

evals/ukrainian/summarization/ukrainian_massivesumm_long.yaml DELETED Viewed

@@ -1,14 +0,0 @@
-task: ukrainian_massivesumm_long
-task_alias: massivesumm_long
-# Gated dataset: accept terms at
-# https://huggingface.co/datasets/MaLA-LM/MassiveSumm_long and export HF_TOKEN.
-dataset_path: MaLA-LM/MassiveSumm_long
-include: _default_summarization_yaml
-process_docs: !function utils.process_docs
-generation_kwargs:
-  do_sample: false
-  max_gen_toks: 8192
-  until:
-    - "<|endoftext|>"
-doc_to_text: "Ти — система реферування новин.\nСтисни наведену нижче статтю в короткий абзац (3-5 речень) українською мовою. Не додавай коментарів.\n\nСтаття:\n{{text}}\n\nСтислий виклад:"
-doc_to_target: "{{summary}}"

evals/ukrainian/summarization/ukrainian_summarization.yaml DELETED Viewed

@@ -1,10 +0,0 @@
-# Summarization subgroup (MassiveSumm long)
-group: ukrainian_summarization
-task:
-  - ukrainian_massivesumm_long
-aggregate_metric_list:
-  - metric: rouge_l
-    aggregation: mean
-    weight_by_size: true
-metadata:
-  version: 1.0

evals/ukrainian/summarization/utils.py DELETED Viewed

@@ -1,111 +0,0 @@
-"""Utility helpers for Ukrainian summarization tasks (MassiveSumm short/long).
-Both MassiveSumm subsets are highly multilingual single-table datasets
-(one ``train`` split, no language configs). We filter to Ukrainian rows
-inside ``process_docs``. The HF dataset is **gated** — accept the terms
-on the dataset page once and export ``HF_TOKEN`` before running.
-Scoring uses a self-contained, LCS-based ROUGE-L F1 (no stemming) implemented
-below; the ``rouge_score`` package (a transitive dependency of
-lm-evaluation-harness) is not imported by this module.
-"""
-from __future__ import annotations
-import re
-import string
-import datasets
-_UKRAINIAN_LANG_CODES = {"ukr", "uk"}
-def _strip_think_tags(text: str) -> str:
-    """Strip <think>...</think> reasoning wrapper (e.g. Qwen thinking models)."""
-    if "</think>" in text:
-        return text.split("</think>")[-1].strip()
-    return text
-def _filter_ukrainian(dataset: datasets.Dataset) -> datasets.Dataset:
-    """Keep rows whose ``language`` field is one of the Ukrainian language codes (``ukr`` / ``uk``)."""
-    if "language" not in dataset.column_names:
-        return dataset
-    return dataset.filter(lambda row: str(row.get("language", "")).lower() in _UKRAINIAN_LANG_CODES)
-def _normalise_doc(doc):
-    """Project the columns we actually need."""
-    text = (doc.get("text") or "").strip()
-    summary = (doc.get("summary") or "").strip()
-    title = (doc.get("title") or "").strip()
-    return {
-        "text": text,
-        "summary": summary,
-        "title": title,
-    }
-def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
-    filtered = _filter_ukrainian(dataset)
-    return filtered.map(_normalise_doc, remove_columns=[
-        c for c in filtered.column_names if c not in ("text", "summary", "title")
-    ])
-# ── ROUGE-L scoring ──────────────────────────────────────────────────
-_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
-_WHITESPACE_RE = re.compile(r"\s+")
-def _normalise(text: str) -> str:
-    text = text.translate(_PUNCT_TABLE)
-    text = _WHITESPACE_RE.sub(" ", text)
-    return text.strip().lower()
-def _lcs_length(a, b):
-    m, n = len(a), len(b)
-    if m == 0 or n == 0:
-        return 0
-    dp = [0] * (n + 1)
-    for i in range(1, m + 1):
-        prev = 0
-        for j in range(1, n + 1):
-            tmp = dp[j]
-            if a[i - 1] == b[j - 1]:
-                dp[j] = prev + 1
-            else:
-                dp[j] = max(dp[j], dp[j - 1])
-            prev = tmp
-    return dp[n]
-def _rouge_l_f1(pred: str, gold: str) -> float:
-    """Compute sentence-level ROUGE-L F1 (no stemming) between pred and gold."""
-    pred_tokens = _normalise(pred).split()
-    gold_tokens = _normalise(gold).split()
-    if not pred_tokens or not gold_tokens:
-        return 0.0
-    lcs = _lcs_length(pred_tokens, gold_tokens)
-    if lcs == 0:
-        return 0.0
-    precision = lcs / len(pred_tokens)
-    recall = lcs / len(gold_tokens)
-    return 2 * precision * recall / (precision + recall)
-def process_results(doc, results):
-    raw_response = results[0].strip() if results and results[0] else ""
-    pred = _strip_think_tags(raw_response)
-    gold = (doc.get("summary") or "").strip()
-    return {"rouge_l": (gold, pred)}
-def rouge_l_agg(items):
-    if not items:
-        return 0.0
-    scores = [_rouge_l_f1(pred, gold) for gold, pred in items]
-    return sum(scores) / len(scores)

evals/ukrainian/ukrainian.yaml CHANGED Viewed

@@ -10,14 +10,12 @@
 # Metrics:
 #   classification & mcq → f1_macro                            (per sub-group)
 #   qa                   → exact_match + f1                    (per sub-group)
-#   summarization        → rouge_l                             (per sub-group)
 #   open_generation      → llm_judge_score / open_quality_score (per sub-group)
 group: ukrainian
 task:
   - ukrainian_classification
   - ukrainian_mcq
   - ukrainian_qa
-  - ukrainian_summarization
   - ukrainian_open_generation
 metadata:
   version: 1.0

 # Metrics:
 #   classification & mcq → f1_macro                            (per sub-group)
 #   qa                   → exact_match + f1                    (per sub-group)
 #   open_generation      → llm_judge_score / open_quality_score (per sub-group)
 group: ukrainian
 task:
   - ukrainian_classification
   - ukrainian_mcq
   - ukrainian_qa
   - ukrainian_open_generation
 metadata:
   version: 1.0