# Source artifact header (Hugging Face upload metadata, kept as a comment so the file parses):
# Imaginethat's picture — Upload 68 files — 8a11f7f verified
import ast
import json
import multiprocessing
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path

import openai
from tqdm import tqdm
# Load the OpenAI API key from a file next to the working directory.
# Fall back to an empty key so the module can still be imported without it
# (API calls will then fail with auth errors instead of an import crash).
try:
    with open("apikey.txt", "r") as f:
        api_key = f.read().strip()  # strip the trailing newline editors add — it would corrupt the key
except OSError:  # file missing/unreadable; was a bare `except:` that hid real bugs
    api_key = ''
def call_gpt4o(msg):
    """Send a chat request to GPT-4o, retrying forever on failure.

    Parameters
    ----------
    msg : list[dict]
        Chat messages in OpenAI format ({"role": ..., "content": ...}).

    Returns
    -------
    str
        The assistant's reply text.

    NOTE(review): retries are unbounded; a persistent auth/quota error will
    loop forever. The per-attempt timeout is 5 seconds.
    """
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=msg,
                api_key=api_key,
                request_timeout=5)
            break
        # Was a bare `except:` printing "Timeout, retrying..." for *every*
        # failure — catch Exception and report the actual error instead.
        except Exception as e:
            print(f"Error calling GPT-4o ({e}), retrying...")
            time.sleep(5)  # back off before retrying
    return response['choices'][0]['message']['content']
# Command-line interface: predictions file and results destination.
INPUT_JSONL = sys.argv[1]  # one JSON object per line with "video_id" and "caption" keys
OUTPUT_JSON = sys.argv[2]  # cumulative results; rewritten after every evaluated sample
script_dir = Path(__file__).resolve().parent
GT_PATH = script_dir / "final_caption_qa.json"  # ground-truth annotations shipped next to this script
NUM_PROCESSES = 10  # worker processes for the evaluation pool
def safe_parse_evaluation(response_str):
    """Extract the first ``{...}`` span from a model reply and parse it to a dict.

    The judge model is asked to answer with a Python-style dict such as
    ``{'visual': 4, 'audio': 3, 'details': 5}``. We first try
    :func:`ast.literal_eval`, which handles single quotes correctly, and only
    then fall back to the old naive quote-swap + ``json.loads`` (which breaks
    on apostrophes inside values).

    Returns an empty dict when no parseable dict is found.
    """
    try:
        match = re.search(r"\{.*\}", response_str, re.DOTALL)
        if match:
            snippet = match.group(0)
            try:
                parsed = ast.literal_eval(snippet)
                if isinstance(parsed, dict):
                    return parsed
            except (ValueError, SyntaxError):
                pass  # not a Python literal; try the JSON fallback below
            return json.loads(snippet.replace("'", '"'))
    except Exception as e:
        print(f"Error parsing evaluation output: {e}")
    return {}
def evaluate_caption(sample_id, pred_caption, true_caption):
    """Score a predicted caption against the ground truth using GPT-4o as judge.

    Asks the model for integer 1-5 scores on three dimensions (visual, audio,
    details), retrying up to 20 times when the reply cannot be parsed into
    valid integers. Returns zeros on every dimension if all attempts fail, so
    the pipeline keeps running.

    Parameters
    ----------
    sample_id : str
        Identifier included in the prompt and in error logs.
    pred_caption, true_caption : str
        Predicted and ground-truth descriptions.

    Returns
    -------
    dict
        {'visual': int, 'audio': int, 'details': int}
    """
    system_msg = (
        "You are an assistant that compares a ground truth video description and a predicted video description. "
        "Evaluate the predicted description against the ground truth on the following three dimensions:\n"
        "1. **visual**: the accuracy and completeness of visual content including the scene setting, background, characters or objects, their actions or interactions, and any OCR text.\n"
        "2. **audio**: how well it captures voices, background music, sound effects, and their emotional tone.\n"
        "3. **details**: the completeness, thematic consistency, purpose, coherence, and integration of multimodal content.\n\n"
        "For each dimension, assign an integer score from 1 to 5, following these detailed grading criteria:\n\n"
        "**Score 1:** The description is mostly irrelevant or misleading. It misrepresents or omits most key information. "
        "At least 3 important elements are missing or incorrect. Severe hallucinations may be present.\n\n"
        "**Score 2:** The description captures a few elements (1-2 aspects) but is vague or inaccurate for the rest. "
        "It is poorly structured or confusing, with major omissions or incorrect details.\n\n"
        "**Score 3:** The description aligns with the video on most elements (3 or more), but lacks depth or specificity. "
        "Some key details are missing, or minor factual errors exist. It's generally correct but too generic or incomplete.\n\n"
        "**Score 4:** A mostly accurate and complete description. Captures nearly all key information (4+ aspects), "
        "with clear structure and appropriate level of detail. Minor omissions or simplifications are acceptable.\n\n"
        "**Score 5:** Exceptionally accurate and detailed. Covers all relevant aspects thoroughly, with well-integrated information. "
        "Captures subtle nuances (e.g., emotion, scene dynamics, audio-visual interplay) and reads like it was written by a domain expert.\n\n"
        "Respond only with a valid Python dictionary in this format:\n"
        "{'visual': int, 'audio': int, 'details': int}"
    )
    user_msg = (
        f"Sample ID: {sample_id}\n"
        f"Predicted Description: {pred_caption}\n"
        f"Ground Truth Description: {true_caption}\n"
    )
    # The prompt does not change between retries — build the message list once
    # instead of rebuilding it on every loop iteration.
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]
    for _ in range(20):
        try:
            content = call_gpt4o(messages)
            print(content)
            eval_dict = safe_parse_evaluation(content)
            # int() + key lookups raise on a malformed/partial reply,
            # which triggers another retry below.
            return {
                'visual': int(eval_dict["visual"]),
                'audio': int(eval_dict["audio"]),
                'details': int(eval_dict["details"]),
            }
        except Exception as e:
            print(f"Error evaluating sample {sample_id}: {e}")
    # All retries exhausted: report zero scores rather than crashing the run.
    return {'visual': 0, 'audio': 0, 'details': 0}
def process_sample(args):
    """Pool worker: evaluate one sample and package its scores.

    `args` is a (video_id, predicted_caption, ground_truth_caption) triple;
    returns the video id alongside a record of the three dimension scores
    and their mean.
    """
    video_id, predicted, reference = args
    scores = evaluate_caption(video_id, predicted, reference)
    mean_score = sum(scores.values()) / len(scores)
    return video_id, {
        'visual_score': scores['visual'],
        'audio_score': scores['audio'],
        'details_score': scores['details'],
        'average_score': mean_score,
    }
def run_evaluation():
    """End-to-end driver: load GT, resume prior results, score new samples in parallel.

    Reads predictions from INPUT_JSONL, compares each against the ground truth
    in GT_PATH via GPT-4o workers, and rewrites OUTPUT_JSON (per-sample scores
    plus overall percentage averages) after every finished sample so an
    interrupted run can be resumed without losing work.
    """
    # --- ground truth ---
    try:
        with open(GT_PATH, 'r', encoding='utf-8') as f_gt:
            gt_anno = json.load(f_gt)["samples"]
        gt = {anno["video_id"]: anno["answer"] for anno in gt_anno}
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return

    # --- resume from a previous (partial) run, if any ---
    results = {}
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
                existing = json.load(f)
            results = existing.get('evaluations', {})
        except Exception as e:
            print(f"Warning: Failed to read previous results: {e}")
            results = {}

    # --- collect pending samples ---
    tasks = []
    with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines
            video_id = sample.get('video_id')
            pred_caption = sample.get('caption', '').strip()
            if not video_id or not pred_caption:
                continue
            if video_id in results:
                continue  # already scored in a previous run
            if video_id not in gt:
                # No ground truth for this sample (e.g. the known
                # "PI-0abb79ae-18ae-40d9-a66f-3ad7744a6095" id). The old
                # code hardcoded that one id and crashed with KeyError on
                # any other missing id — skip them all instead.
                continue
            tasks.append((video_id, pred_caption, gt[video_id]))

    if not tasks:
        print("No new samples to evaluate.")
        return
    print(f"Found {len(tasks)} new samples to evaluate.")

    # --- evaluate in parallel, checkpointing after every sample ---
    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        with tqdm(total=len(tasks), desc="Evaluating samples") as pbar:
            for video_id, result_data in pool.imap_unordered(process_sample, tasks):
                if result_data:
                    results[video_id] = result_data
                all_scores = list(results.values())
                count = len(all_scores)
                sum_visual = sum(s['visual_score'] for s in all_scores)
                sum_audio = sum(s['audio_score'] for s in all_scores)
                sum_details = sum(s['details_score'] for s in all_scores)
                sum_avg = sum(s['average_score'] for s in all_scores)
                current_output = {
                    'evaluations': results,
                    # scores are on a 1-5 scale; report averages as a percentage of the max
                    'overall_visual_average': round(sum_visual / count / 5.0 * 100, 2),
                    'overall_audio_average': round(sum_audio / count / 5.0 * 100, 2),
                    'overall_details_average': round(sum_details / count / 5.0 * 100, 2),
                    'overall_average_percent': round((sum_avg / count) / 5.0 * 100, 2)
                }
                # Rewrite the whole output file each time: crash-safe resumption.
                with open(OUTPUT_JSON, 'w', encoding='utf-8') as out_f:
                    json.dump(current_output, out_f, indent=2, ensure_ascii=False)
                pbar.update(1)

    print("\nEvaluation complete.")
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            final_output = json.load(f)
        print(f"Overall visual avg: {final_output.get('overall_visual_average', 0):.2f}, "
              f"audio avg: {final_output.get('overall_audio_average', 0):.2f}, "
              f"details avg: {final_output.get('overall_details_average', 0):.2f}, "
              f"overall%: {final_output.get('overall_average_percent', 0):.2f}")
if __name__ == '__main__':
    # Entry-point guard is required: multiprocessing workers re-import this
    # module, and without the guard they would recursively start the pool.
    run_evaluation()