import glob
import json
import logging
import os
import traceback
from dataclasses import dataclass
from typing import Optional

import dateutil
import numpy as np

from huggingface_hub import ModelCard

from src.display.formatting import make_clickable_model
from src.display.utils import (
    auto_eval_cols,
    ModelType,
    Tasks,
    Precision,
    WeightType,
    QuantType,
    WeightDtype,
    ComputeDtype,
)

logger = logging.getLogger(__name__)

@dataclass
class EvalResult:
    """One evaluation run, built from its result file and later enriched with
    request-file and dynamic-file metadata."""

    eval_name: str
    full_model: str
    org: str
    model: str
    revision: str
    results: dict
    quant_type: QuantType = QuantType.Unknown
    precision: Precision = Precision.Unknown
    weight_dtype: WeightDtype = WeightDtype.Unknown
    compute_dtype: ComputeDtype = ComputeDtype.Unknown
    double_quant: bool = False
    model_type: ModelType = ModelType.Unknown
    weight_type: WeightType = WeightType.Original
    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
    model_size: int = 0
    group_size: int = -1
    date: str = ""
    still_on_hub: bool = True
    is_merge: bool = False
    flagged: bool = False
    status: str = "Finished"
    tags: Optional[list] = None
    result_file: str = ""

    @classmethod
    def init_from_json_file(cls, json_filepath):
        """Inits the result from the specific model result file."""
        result_file = "/".join(json_filepath.split("/")[2:])
        with open(json_filepath) as fp:
            data = json.load(fp)

        config = data.get("config_general")
        if not isinstance(config, dict):
            raise ValueError(f"Missing or invalid config_general in {json_filepath}")

        # Precision and quantization metadata
        precision = Precision.from_str(config.get("precision", "4bit"))
        quant_type = QuantType.from_str(str(config.get("quant_type", "GPTQ")))
        weight_dtype = WeightDtype.from_str(data["task_info"].get("weight_dtype", "int4"))
        compute_dtype = ComputeDtype.from_str(data["task_info"].get("compute_dtype", "bfloat16"))

        model_params = round(float(config["model_params"]), 2)
        model_size = round(float(config["model_size"]), 2)

        if data.get("quantization_config"):
            double_quant = data["quantization_config"].get("bnb_4bit_use_double_quant", False)
            group_size = data["quantization_config"].get("group_size", -1)
        else:
            double_quant = False
            group_size = -1

        local = config.get("local", False)
        if not local:
            local = data["task_info"].get("local", False)

        # Get the model and org
        org_and_model = config.get("model_name")
        org_and_model = org_and_model.split("/", 1)

        if local and org_and_model[0] != "Intel":
            org_and_model = config.get("model_name").split("/")
            org_and_model = ["local", org_and_model[-1]]
            quant_type = QuantType.autoround

        if len(org_and_model) == 1:
            org = None
            model = org_and_model[0]
            result_key = f"{model}_{precision.value.name}"
        else:
            org = org_and_model[0]
            model = org_and_model[1]
            result_key = f"{org}_{model}_{precision.value.name}"
        full_model = "/".join(org_and_model)

        # Extract the results available in this file (some benchmarks may be missing)
        results = {}
        for task in Tasks:
            task = task.value
            if task.benchmark == "mmlu":
                mmlu_data = data["results"].get("harness|mmlu|0")
                if mmlu_data is None:
                    continue
                accs = np.array([mmlu_data.get(task.metric)])
            else:
                accs = np.array(
                    [v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k]
                )
            if accs.size == 0 or any(acc is None for acc in accs):
                continue

            mean_acc = round(np.mean(accs) * 100.0, 2)
            results[task.benchmark] = mean_acc

        return cls(
            eval_name=result_key,
            full_model=full_model,
            org=org,
            model=model,
            results=results,
            precision=precision,
            quant_type=quant_type,
            weight_dtype=weight_dtype,
            compute_dtype=compute_dtype,
            double_quant=double_quant,
            revision=config.get("model_sha", "main"),
            num_params=model_params,
            model_size=model_size,
            group_size=group_size,
            result_file=result_file,
        )
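
    # Illustrative sketch of the result-file layout this parser expects, inferred
    # from the keys read in init_from_json_file above. The values (and the "acc"
    # metric name) are placeholders, not real data.
    #
    # {
    #   "config_general": {"model_name": "org/model", "model_sha": "main",
    #                      "precision": "4bit", "quant_type": "GPTQ",
    #                      "model_params": 7.0, "model_size": 3.5, "local": false},
    #   "task_info": {"weight_dtype": "int4", "compute_dtype": "bfloat16"},
    #   "quantization_config": {"bnb_4bit_use_double_quant": false, "group_size": 128},
    #   "results": {"harness|mmlu|0": {"acc": 0.5}, ...}
    # }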

    def update_with_request_file(self, requests_path):
        """Finds the relevant request file for the current model and updates info with it"""
        request_file = get_request_file_for_model(
            requests_path,
            self.full_model,
            self.quant_type.value.name,
            self.precision.value.name,
            self.weight_dtype.value.name,
            self.compute_dtype.value.name,
        )

        try:
            with open(request_file, "r") as f:
                request = json.load(f)
            self.date = request.get("submitted_time", "")
            self.architecture = request.get("architectures", "Unknown")
            self.status = request.get("status", "Failed")
        except Exception as e:
            self.status = "Failed"
            logger.warning("Could not find request file for %s/%s: %s", self.org, self.model, e)

    def update_with_dynamic_file_dict(self, file_dict):
        """Updates license, likes, availability, and tag info from the dynamic info file."""
        self.license = file_dict.get("license", "?")
        self.likes = file_dict.get("likes", 0)
        self.still_on_hub = file_dict.get("still_on_hub", False)
        self.tags = file_dict.get("tags", []) or []
        self.flagged = any("flagged" in tag for tag in self.tags)

    def to_dict(self):
        """Converts the Eval Result to a dict compatible with our dataframe display"""
        valid_results = [v for v in self.results.values() if v is not None]
        average = sum(valid_results) / len(valid_results) if valid_results else 0

        data_dict = {
            "eval_name": self.eval_name,
            "date": self.date,
            auto_eval_cols.precision.name: self.precision.value.name,
            auto_eval_cols.quant_type.name: self.quant_type.value.name,
            auto_eval_cols.model_type_symbol.name: self.quant_type.value.symbol,
            auto_eval_cols.weight_dtype.name: self.weight_dtype.value.name,
            auto_eval_cols.compute_dtype.name: self.compute_dtype.value.name,
            auto_eval_cols.model.name: make_clickable_model(self.full_model, self.result_file),
            auto_eval_cols.revision.name: self.revision,
            auto_eval_cols.average.name: average,
            auto_eval_cols.model_size.name: self.model_size,
            auto_eval_cols.dummy.name: self.full_model,
        }

        data_dict[auto_eval_cols.still_on_hub.name] = self.still_on_hub
        data_dict[auto_eval_cols.flagged.name] = self.flagged

        if hasattr(auto_eval_cols, "double_quant"):
            data_dict[auto_eval_cols.double_quant.name] = self.double_quant
        if hasattr(auto_eval_cols, "architecture"):
            data_dict[auto_eval_cols.architecture.name] = self.architecture
        if hasattr(auto_eval_cols, "params"):
            data_dict[auto_eval_cols.params.name] = self.num_params
        if hasattr(auto_eval_cols, "license"):
            data_dict[auto_eval_cols.license.name] = self.license
        if hasattr(auto_eval_cols, "likes"):
            data_dict[auto_eval_cols.likes.name] = self.likes
        if hasattr(auto_eval_cols, "group_size"):
            data_dict[auto_eval_cols.group_size.name] = self.group_size

        if hasattr(auto_eval_cols, "merged"):
            data_dict[auto_eval_cols.merged.name] = "merge" in (self.tags if self.tags else [])
        if hasattr(auto_eval_cols, "moe"):
            data_dict[auto_eval_cols.moe.name] = (
                "moe" in (self.tags if self.tags else []) or "moe" in self.full_model.lower()
            )

        for task in Tasks:
            data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)

        return data_dict


def get_request_file_for_model(requests_path, model_name, quant_type, precision, weight_dtype, compute_dtype):
    """Selects the correct request file for a given model. Only keeps runs marked as Finished."""
    request_files = os.path.join(
        requests_path,
        f"{model_name}_eval_request_*.json",
    )
    request_files = glob.glob(request_files)

    request_file = ""
    request_files = sorted(request_files, reverse=True)
    for tmp_request_file in request_files:
        with open(tmp_request_file, "r") as f:
            req_content = json.load(f)
        if (
            req_content["status"] in ["Finished"]
            and req_content["precision"] == precision.split(".")[-1]
            and str(req_content["quant_type"]) == quant_type
            and req_content["weight_dtype"] == weight_dtype.split(".")[-1]
            and req_content["compute_dtype"] == compute_dtype.split(".")[-1]
        ):
            request_file = tmp_request_file
        elif (
            req_content["status"] in ["Finished"]
            and req_content["precision"] == precision.split(".")[-1]
            and quant_type == "AutoRound"
            and req_content["weight_dtype"] == weight_dtype.split(".")[-1]
            and req_content["compute_dtype"] == compute_dtype.split(".")[-1]
        ):
            request_file = tmp_request_file
    return request_file
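

# Illustrative sketch (an assumption, not a file from this repo) of the request-file
# fields consumed above and in EvalResult.update_with_request_file; all values are
# placeholders only.
#
# {
#   "status": "Finished",
#   "submitted_time": "2024-01-01T00:00:00Z",
#   "architectures": "LlamaForCausalLM",
#   "precision": "4bit",
#   "quant_type": "GPTQ",
#   "weight_dtype": "int4",
#   "compute_dtype": "bfloat16"
# }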


def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: str) -> list[EvalResult]:
    """Walks the results folder root and extracts an EvalResult for every finished run."""
    model_result_filepaths = []

    for root, _, files in os.walk(results_path):
        result_files = [f for f in files if f.startswith("results_") and f.endswith(".json")]
        if len(result_files) == 0:
            continue

        # Sort by the timestamp embedded in the file name; fall back to the last file
        try:
            result_files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
        except Exception:
            result_files = [result_files[-1]]

        for file in result_files:
            model_result_filepaths.append(os.path.join(root, file))

    dynamic_data = {}
    if os.path.exists(dynamic_path):
        with open(dynamic_path) as f:
            dynamic_data = json.load(f)

    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = EvalResult.init_from_json_file(model_result_filepath)
        except Exception as e:
            logger.warning("Skipping malformed eval result %s: %s", model_result_filepath, e)
            continue

        eval_result.update_with_request_file(requests_path)

        if eval_result.full_model in dynamic_data:
            eval_result.update_with_dynamic_file_dict(dynamic_data[eval_result.full_model])
        if "meta-llama" in eval_result.full_model:
            eval_result.still_on_hub = True

        # Store results of the same eval together
        eval_name = eval_result.eval_name
        if eval_name in eval_results:
            eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
        else:
            eval_results[eval_name] = eval_result

    results = []
    for v in eval_results.values():
        try:
            if v.status == "Finished":
                v.to_dict()  # we test if the dict version is complete
                results.append(v)
        except Exception as e:
            logger.warning("Error processing %s: %s", v.eval_name, e)
            continue

    return results
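

# Minimal usage sketch. The directory and file names below are assumptions for
# illustration only; point them at your own results, requests, and dynamic files.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    raw_results = get_raw_eval_results(
        results_path="eval-results",             # hypothetical results folder
        requests_path="eval-requests",           # hypothetical requests folder
        dynamic_path="dynamic_model_info.json",  # hypothetical dynamic metadata file
    )
    logger.info("Loaded %d finished eval results", len(raw_results))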