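"""Evaluate generated model answers against the reference dataset and compute leaderboard metrics.

``evaluate`` grades a JSONL file of generated answers and writes a ``*.eval.jsonl`` file next to it;
``metrics`` aggregates the graded rows into pass@1 scores and writes a ``*.metrics.json`` file.
"""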
import json
import pathlib
from copy import deepcopy

import click
import pandas as pd
import pandera.pandas as pa
from tqdm.auto import tqdm

from src.common.data import load_dataset
from src.common.schema import DatasetSchema, LeaderBoardSchema
from src.eval.matchers import build_check_function
from src.eval.metrics import grade_to_weight
from src.eval.schema import DatasetEvalSchema
from src.generate.generators import GenerationAnswer
from src.generate.schema import GeneratedDatasetSchema
def _evaluate_single_answer(
    row: pd.Series,
) -> bool:
    """Grade one joined row by running its check function on the predicted answer."""
    if pd.isna(row[GeneratedDatasetSchema.generated_answer]):
        return False
    if not isinstance(row[GeneratedDatasetSchema.generated_answer], GenerationAnswer):
        raise ValueError(
            f"Expected GenerationAnswer, got {type(row[GeneratedDatasetSchema.generated_answer])} "
            f"for id {row[DatasetSchema.id_]}",
        )
    y_pred = row[GeneratedDatasetSchema.generated_answer].answer
    if not y_pred:
        return False
    y_true = row[DatasetSchema.correct_answer]

    # Build the matcher declared for this row (check_type and check_function come from the dataset).
    check_function = build_check_function(
        row[DatasetSchema.check_type],
        row[DatasetSchema.check_function],
    )
    try:
        # Deep-copy both sides so a matcher cannot mutate the dataframe contents.
        result = check_function(
            y_true=deepcopy(y_true),
            y_pred=deepcopy(y_pred),
        )
    except Exception as e:
        print(e)
        print(
            f"Error evaluating row {row[DatasetSchema.id_]} "
            f"with check type {row[DatasetSchema.check_type]}: {y_true} vs {y_pred}"
        )
        raise SystemExit(1) from e
    return result
def _evaluate(
    generated_df: pd.DataFrame,
) -> pd.DataFrame:
    """Join generated answers onto the reference dataset and grade each row."""
    tqdm.pandas()
    # Re-hydrate raw JSON payloads into GenerationAnswer objects.
    generated_df[GeneratedDatasetSchema.generated_answer] = generated_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: GenerationAnswer.model_validate(deepcopy(x)) if x else None,
    )
    dataset_df = load_dataset()
    predictions_df = dataset_df.join(
        generated_df.set_index(GeneratedDatasetSchema.id_),
        on=DatasetSchema.id_,
    )
    predictions_df[DatasetEvalSchema.is_correct] = predictions_df.progress_apply(
        _evaluate_single_answer,
        axis=1,
    )
    predictions_df[DatasetEvalSchema.predicted_answer] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.answer if not pd.isna(x) else None,
    )
    predictions_df[DatasetEvalSchema.context] = predictions_df[
        GeneratedDatasetSchema.generated_answer
    ].apply(
        lambda x: x.context if not pd.isna(x) else None,
    )
    # Keep only the columns declared in the evaluation schema, in schema order.
    predictions_df = predictions_df[list(DatasetEvalSchema._collect_fields().keys())]
    return predictions_df
def evaluate(
    file: pathlib.Path = pathlib.Path("./gemma3:4b.jsonl"),
):
    """Grade a JSONL file of generated answers and write the results as ``*.eval.jsonl``."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    evaluated_df = _evaluate(df)
    evaluated_df.to_json(
        file.with_suffix(".eval.jsonl"), orient="records", lines=True, force_ascii=False
    )
def _metrics(
    df: pd.DataFrame,
    model_name: str,
    model_size: float,
    model_url: str,
    model_config: str,
) -> pd.DataFrame:
    """Aggregate graded rows into a single leaderboard record."""
    # Plain accuracy over all tasks.
    pass1 = df[DatasetEvalSchema.is_correct].mean()
    # Accuracy weighted per row by grade_to_weight(grade).
    w = df[DatasetEvalSchema.grade].apply(grade_to_weight)
    weighted_accuracy = (
        df[DatasetEvalSchema.is_correct].astype(int) * w
    ).sum() / w.sum()
    # Per-task-type accuracies.
    arith_pass1 = df[df[DatasetEvalSchema.task_type] == "arith"][
        DatasetEvalSchema.is_correct
    ].mean()
    geometry_pass1 = df[df[DatasetEvalSchema.task_type] == "geometry"][
        DatasetEvalSchema.is_correct
    ].mean()
    logic_pass1 = df[df[DatasetEvalSchema.task_type] == "logic"][
        DatasetEvalSchema.is_correct
    ].mean()
    result = {
        LeaderBoardSchema.model_name: model_name,
        LeaderBoardSchema.model_size: model_size,
        LeaderBoardSchema.model_url: model_url,
        LeaderBoardSchema.config: str(model_config),
        LeaderBoardSchema.pass1: pass1,
        LeaderBoardSchema.weighted_pass1: weighted_accuracy,
        LeaderBoardSchema.arith_pass1: arith_pass1,
        LeaderBoardSchema.geometry_pass1: geometry_pass1,
        LeaderBoardSchema.logic_pass1: logic_pass1,
    }
    result_df = pd.DataFrame([result])
    # Keep only the leaderboard columns, in schema order.
    result_df = result_df[list(LeaderBoardSchema._collect_fields().keys())]
    return result_df
def metrics(
    model_name: str,
    file: pathlib.Path = pathlib.Path("./gemma3:4b_eval.jsonl"),
    model_size: float | None = None,
    model_url: str | None = None,
    model_config: str | None = None,
):
    """Compute leaderboard metrics from a graded ``*.eval.jsonl`` file and write ``*.metrics.json``."""
    file = pathlib.Path(file)
    df = pd.read_json(file, lines=True)
    metrics_df = _metrics(
        df,
        model_name=model_name,
        model_size=model_size,
        model_url=model_url,
        model_config=model_config or "",
    )
    record = metrics_df.to_dict(orient="records")[0]
    print(f"Metrics for {model_name}:")
    for key, value in record.items():
        print(f"{key}: {value}")
    with open(file.with_suffix(".metrics.json"), "w") as f:
        json.dump(
            metrics_df.to_dict(orient="records"),
            f,
            ensure_ascii=False,
        )
if __name__ == "__main__":
    evaluate()
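# Example usage (the paths below are illustrative, not part of the repository):
#
#   evaluate(pathlib.Path("outputs/gemma3:4b.jsonl"))
#   # -> writes outputs/gemma3:4b.eval.jsonl
#   metrics("gemma3:4b", file=pathlib.Path("outputs/gemma3:4b.eval.jsonl"))
#   # -> prints the metrics and writes outputs/gemma3:4b.eval.metrics.json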