import datetime
import json
import os
import re
import sys
import time
from pathlib import Path

import requests
import yaml
from loguru import logger as eval_logger

import lmms_eval.tasks._task_utils.file_utils as file_utils
from lmms_eval.filters.extraction import ExtendedRegexFilter


def clotho_aqa_doc_to_audio(doc):
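    """Return the audio clip of a ClothoAQA example as a single-element list."""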
    return [doc["audio"]]


def clotho_aqa_doc_to_text(doc, lmms_eval_specific_kwargs):
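    """Wrap the question with the pre/post prompts configured for the task."""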
    question = doc["question"]
    pre_prompt = lmms_eval_specific_kwargs["pre_prompt"]
    post_prompt = lmms_eval_specific_kwargs["post_prompt"]
    return f"{pre_prompt}{question}{post_prompt}"


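# Load the default task template. Lines containing lmms_eval's custom !function tag are
# dropped first, since yaml.safe_load cannot resolve that tag.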
with open(Path(__file__).parent / "_default_template_yaml", "r") as f:
    raw_data = f.readlines()
    safe_data = []
    for i, line in enumerate(raw_data):
        if "!function" not in line:
            safe_data.append(line)

    config = yaml.safe_load("".join(safe_data))


GPT_EVAL_MODEL_NAME = os.getenv("MODEL_VERSION", "gpt-4o-2024-11-20")
API_TYPE = os.getenv("API_TYPE", "azure")

if API_TYPE == "openai":
    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
elif API_TYPE == "azure":
    API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
    API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
    headers = {
        "api-key": API_KEY,
        "Content-Type": "application/json",
    }

eval_prompt = """ |
|
|
[Question] |
|
|
{question} |
|
|
|
|
|
[Reference Answer] |
|
|
{ground_truth} |
|
|
|
|
|
[Model Answer] |
|
|
{model_response} |
|
|
|
|
|
[Task] |
|
|
Rate the model's answer based on its alignment with the reference answer, focusing on accuracy and relevance to the reference provided. Please be critical on the details. |
|
|
Criteria: Assess if the model's response mirrors the reference in terms of content, accuracy, and relevance. |
|
|
Score0: The answer is completely misaligned, providing incorrect or irrelevant information compared to the reference. |
|
|
Score1: The answer shows minimal alignment, often misunderstanding or providing irrelevant details unrelated to the reference. |
|
|
Score2: The answer recognizes the topic but diverges significantly from the reference in accuracy or relevance. |
|
|
Score3: The answer aligns with the reference generally but lacks detail or precise accuracy in some aspects. |
|
|
Score4: The answer is mostly accurate and relevant, closely following the reference but could be clearer or more detailed. |
|
|
Score5: The answer is highly accurate, detailed, and matches the reference answer perfectly, capturing its essence and detail. |
|
|
|
|
|
Your response should be formatted as follows: |
|
|
Explanation: (Provide a concise explanation of your rating, comparing the reference answer with the model's response. "The reference answer is [XXX], while the model's answer is [YYY]. I think ...") |
|
|
Rating: (int)""" |


retries = 3
NUM_SECONDS_TO_SLEEP = 5


def get_eval(max_tokens: int, content: str, retries: int = retries):
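    """Send `content` to the judge model and return (answer, judge_model_name).

    Retries on failure and returns ("", "") if every attempt fails.
    """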
    global headers

    messages = [
        {"role": "user", "content": content},
    ]

    payload = {"model": GPT_EVAL_MODEL_NAME, "messages": messages, "temperature": 0.7, "max_tokens": max_tokens, "top_p": 0.95, "frequency_penalty": 0, "presence_penalty": 0, "stop": None}

    for attempt in range(retries):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            response_data = response.json()

            content = response_data["choices"][0]["message"]["content"].strip()
            if content != "":
                return content, response_data["model"]
            break  # the API answered with an empty message; stop retrying

        except Exception as e:
            eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
            if attempt < retries - 1:
                time.sleep(NUM_SECONDS_TO_SLEEP)
            else:
                eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
                return "", ""
    return "", ""


def clotho_aqa_v2_process_results(doc, result):
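    """Run the judge model on one prediction and return its raw answer and model name."""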
    pred = result[0]
    ground_truth_str = doc["answer"]
    content = eval_prompt.format(model_response=pred, ground_truth=ground_truth_str, question=doc["question"])
    eval_answer, model_name = get_eval(max_tokens=1024, content=content)
    return {
        "gpt_eval": {"eval_answer": eval_answer, "model_name": model_name},
    }


def clotho_aqa_v2_aggregate_results(results):
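    """Average the 0-5 judge ratings over all examples and rescale to a 0-100 score."""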
    score = 0
    for result in results:
        eval_answer = result["eval_answer"]
        try:
            # The judge is asked for "Rating: (int)"; take the first digit 0-5 in its reply.
            eval_score = float(re.search(r"([0-5])", eval_answer).group(1))
        except Exception as e:
            eval_logger.error(f"Error parsing eval_score: {e}")
            eval_score = 0.0
        score += eval_score

    return score / len(results) * 20