ThomasTheMaker's picture
Upload folder using huggingface_hub
feba2ad verified
raw
history blame
1.63 kB
"""
Paloma is a comprehensive evaluation benchmark for large language models (LLMs) that focuses
on measuring perplexity across diverse text domains.
To evaluate on Paloma, we use the huggingface evaluation framework.
For more details, see: https://huggingface.co/datasets/allenai/paloma
"""
import evaluate
from datasets import load_dataset
from datasets.utils.logging import disable_progress_bar, enable_progress_bar
from src.config.evaluation_config import PalomaEvaluationConfig
def run_paloma_evaluation(
    model_path: str,
    paloma_config: PalomaEvaluationConfig,
) -> float:
    """Run perplexity evaluation on the Paloma evaluation dataset.

    We use the HuggingFace ``evaluate`` library to load in and compute the
    perplexity metric.

    Args:
        model_path (str): Path to the model checkpoint to be evaluated.
        paloma_config (PalomaEvaluationConfig): Configuration for Paloma
            evaluation (dataset name/split, max_length, batch_size).

    Returns:
        float: Mean perplexity of the model over the Paloma dataset.
    """
    disable_progress_bar()
    try:
        # load custom evaluation space, see https://huggingface.co/spaces/pico-lm/perplexity
        perplexity = evaluate.load("pico-lm/perplexity")

        dataset = load_dataset(
            paloma_config.dataset_name, split=paloma_config.dataset_split
        )["text"]

        # compute perplexity score on Paloma dataset
        perplexity_result = perplexity.compute(
            model_id=model_path,
            predictions=dataset,
            add_start_token=False,
            max_length=paloma_config.max_length,
            batch_size=paloma_config.batch_size,
            trust_remote_code=True,
        )
        return perplexity_result["mean_perplexity"]
    finally:
        # Always restore the global datasets progress-bar state, even when
        # loading or evaluation raises — otherwise it would stay disabled
        # for every subsequent caller in the process.
        enable_progress_bar()