Spaces:

Intel
/

low_bit_open_llm_leaderboard

Runtime error

App Files Files Community

low_bit_open_llm_leaderboard / src /submission /submit.py

n1ck-guo

support for autoround type

d56aebd over 1 year ago

raw

history blame contribute delete

10.5 kB

	import json
	import os
	from datetime import datetime, timezone
	import time

	from huggingface_hub import ModelCard, snapshot_download

	from src.display.formatting import styled_error, styled_message, styled_warning
	from src.envs import API, EVAL_REQUESTS_PATH, DYNAMIC_INFO_PATH, DYNAMIC_INFO_FILE_PATH, DYNAMIC_INFO_REPO, H4_TOKEN, QUEUE_REPO, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA, REPO, GIT_REQUESTS_PATH, GIT_STATUS_PATH, GLOBAL_COND
	from src.leaderboard.filter_models import DO_NOT_SUBMIT_MODELS
	from src.submission.check_validity import (
	already_submitted_models,
	check_model_card,
	get_model_size,
	get_quantized_model_parameters_memory,
	is_model_on_hub,
	is_gguf_on_hub,
	user_submission_permission,
	get_model_tags
	)

	REQUESTED_MODELS = None
	USERS_TO_SUBMISSION_DATES = None

	def add_new_eval(
	model: str,
	revision: str,
	private: bool,
	compute_dtype: str="float16",
	precision: str="4bit",
	weight_dtype: str="int4",
	gguf_ftype: str="*Q4_0.gguf",
	):
	global REQUESTED_MODELS
	global USERS_TO_SUBMISSION_DATES
	if not REQUESTED_MODELS:
	REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(GIT_STATUS_PATH)

	quant_type = None
	user_name = ""
	model_path = model
	if "/" in model:
	user_name = model.split("/")[0]
	model_path = model.split("/")[1]

	precision = precision.split(" ")[0]
	current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

	# Is the user rate limited?
	if user_name != "":
	user_can_submit, error_msg = user_submission_permission(
	user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
	)
	if not user_can_submit:
	return styled_error(error_msg)

	# Did the model authors forbid its submission to the leaderboard?
	if model in DO_NOT_SUBMIT_MODELS:
	return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")

	# Does the model actually exist?
	if revision == "":
	revision = "main"

	architecture = "?"
	downloads = 0
	created_at = ""
	gguf_on_hub, error, gguf_files, new_gguf_ftype = is_gguf_on_hub(repo_id=model, filename=gguf_ftype)
	if new_gguf_ftype is not None:
	gguf_ftype = new_gguf_ftype

	model_on_hub, error, model_config = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)

	# Is the model on the hub?
	if (not model_on_hub or model_config is None) and (not gguf_on_hub or gguf_files is None):
	return styled_error(f'Model "{model}" {error}')

	if model_config is not None:
	architectures = getattr(model_config, "architectures", None)
	if architectures:
	architecture = ";".join(architectures)
	downloads = getattr(model_config, 'downloads', 0)
	created_at = getattr(model_config, 'created_at', '')
	quantization_config = getattr(model_config, 'quantization_config', None)

	if gguf_files is not None:
	architectures = ""
	downloads = 0
	created_at = ""
	quantization_config = None
	quant_type = "llama.cpp"


	# Is the model info correctly filled?
	try:
	model_info = API.model_info(repo_id=model, revision=revision)
	except Exception:
	return styled_error("Could not get your model information. Please fill it up properly.")

	# Were the model card and license filled?
	try:
	if model_info.cardData is None:
	license = "unknown"
	else:
	license = model_info.cardData.get("license", "unknown")
	except Exception:
	return styled_error("Please select a license for your model")

	modelcard_OK, error_msg, model_card = check_model_card(model)

	# maybe don't have model card
	"""
	if not modelcard_OK:
	return styled_error(error_msg)
	"""

	tags = get_model_tags(model_card, model)

	# Seems good, creating the eval
	print("Adding new eval")

	script = "ITREX"
	hardware = "cpu"
	precision = "4bit"
	if quantization_config is not None:
	quant_method = quantization_config.get("quant_method", None)
	if "bnb_4bit_quant_type" in quantization_config:
	quant_method = "bitsandbytes"
	quant_type = "bitsandbytes"
	hardware = "gpu"
	if quantization_config.get("load_in_4bit", True):
	precision = "4bit"
	if quantization_config.get("load_in_8bit", True):
	precision = "8bit"
	if quant_method == "gptq":
	hardware = "cpu"
	quant_type = "GPTQ"
	precision = f"{quantization_config.get('bits', '4bit')}bit"
	if quant_method == "awq":
	hardware = "gpu"
	quant_type = "AWQ"
	precision = f"{quantization_config.get('bits', '4bit')}bit"
	if quant_method == "aqlm":
	hardware = "gpu"
	quant_type = "AQLM"
	nbits_per_codebook = quantization_config.get('nbits_per_codebook')
	num_codebooks = quantization_config.get('num_codebooks')
	in_group_size = quantization_config.get('in_group_size')
	bits = int(nbits_per_codebook * num_codebooks / in_group_size)
	precision = f"{bits}bit"
	if "auto-round" in quant_method:
	hardware = "gpu"
	quant_type = "AutoRound"
	precision = f"{quantization_config.get('bits', '4bit')}bit"

	if precision == "4bit":
	weight_dtype = "int4"
	elif precision == "3bit":
	weight_dtype = "int3"
	elif precision == "2bit":
	weight_dtype = "int2"

	if quant_type is None or quant_type == "":
	# return styled_error("Please select a quantization model like GPTQ, AWQ etc.")
	# for eval fp32/fp16/bf16
	quant_type = None

	if quant_type is None:
	weight_dtype = str(getattr(model_config, "torch_dtype", "float16"))
	if weight_dtype in ["torch.float16", "float16"]:
	weight_dtype = "float16"
	precision = "16bit"
	elif weight_dtype in ["torch.bfloat16", "bfloat16"]:
	weight_dtype = "bfloat16"
	precision = "16bit"
	elif weight_dtype in ["torch.float32", "float32"]:
	weight_dtype = "float32"
	precision = "32bit"
	else:
	weight_dtype = "float32"
	precision = "32bit"
	model_type = "original"
	model_params, model_size = get_model_size(model_info=model_info, precision=precision)
	else:
	model_params, model_size = get_quantized_model_parameters_memory(model_info,
	quant_method=quant_type.lower(),
	bits=precision)
	model_type = "quantization"

	if quant_type == "llama.cpp":
	hardware = "cpu"
	script = "llama_cpp"
	tags = "llama.cpp"
	else:
	hardware = "gpu"

	if compute_dtype == "?":
	compute_dtype = "float16"

	eval_entry = {
	"model": model,
	"revision": revision,
	"private": private,
	"params": model_size,
	"architectures": architecture,
	"quant_type": quant_type,
	"precision": precision,
	"model_params": model_params,
	"model_size": model_size,
	"precision": precision,
	"weight_dtype": weight_dtype,
	"compute_dtype": compute_dtype,
	"gguf_ftype": gguf_ftype,
	"hardware": hardware,
	"status": "Pending",
	"submitted_time": current_time,
	"model_type": model_type,
	"job_id": -1,
	"job_start_time": None,
	"scripts": script
	}

	supplementary_info = {
	"likes": model_info.likes,
	"license": license,
	"still_on_hub": True,
	"tags": tags,
	"downloads": downloads,
	"created_at": created_at
	}
	print(eval_entry)

	# ToDo: need open
	# Check for duplicate submission
	if f"{model}_{revision}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}" in REQUESTED_MODELS:
	return styled_warning("This model has been already submitted.")

	print("Creating huggingface/dataset eval file")
	OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
	os.makedirs(OUT_DIR, exist_ok=True)
	out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}.json"

	with open(out_path, "w") as f:
	f.write(json.dumps(eval_entry))

	print("Uploading eval file")
	try:
	API.upload_file(
	path_or_fileobj=out_path,
	path_in_repo=out_path.split("eval-queue/")[1],
	repo_id=QUEUE_REPO,
	repo_type="dataset",
	commit_message=f"Add {model} to eval queue",
	)
	except Exception as e:
	print(str(e))
	print("upload error........")

	print("Creating git eval file")
	OUT_DIR = f"{GIT_REQUESTS_PATH}/{user_name}"
	os.makedirs(OUT_DIR, exist_ok=True)
	req_out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}.json"
	req_git_path = "/".join(req_out_path.split('/')[1:])

	print("Creating status file")
	OUT_DIR = f"{GIT_STATUS_PATH}/{user_name}"
	os.makedirs(OUT_DIR, exist_ok=True)
	sta_out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{quant_type}_{precision}_{weight_dtype}_{compute_dtype}.json"
	sta_git_path = "/".join(sta_out_path.split('/')[1:])

	print("Uploading eval file")
	try:
	print("git-push get lock..............")
	GLOBAL_COND.acquire()
	branch = REPO.active_branch.name
	REPO.remotes.origin.pull(branch)

	REPO.index.remove("requests", False, r=True)

	with open(req_out_path, "w") as f:
	f.write(json.dumps(eval_entry, indent=4))
	with open(sta_out_path, "w") as f:
	f.write(json.dumps(eval_entry, indent=4))

	REPO.index.add([req_git_path, sta_git_path])
	commit = REPO.index.commit(f"Add {model} to eval requests/status.")
	REPO.remotes.origin.push(branch)
	time.sleep(10)

	print("git-push release lock..............")
	GLOBAL_COND.release()
	except Exception as e:
	print(str(e))
	print("git-push error........")
	GLOBAL_COND.release()

	return styled_message(
	"Your request has been submitted to the evaluation queue!\nPlease wait for up to 3 hours for the model to show in the PENDING list."
	)