# Source artifact header (Hugging Face upload metadata, kept as a comment so the file parses):
# Imaginethat's picture — Upload 68 files — 8a11f7f verified
import ast
import json
import multiprocessing
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path

import openai
from tqdm import tqdm
# Load the OpenAI API key from a file next to the working directory.
# Fall back to an empty key so the module can still be imported without it
# (API calls will then fail with auth errors instead of an import crash).
try:
    with open("apikey.txt", "r") as f:
        api_key = f.read().strip()  # strip the trailing newline editors add — it would corrupt the key
except OSError:  # file missing/unreadable; was a bare `except:` that hid real bugs
    api_key = ''
def call_gpt4o(msg):
    """Send a chat request to GPT-4o, retrying forever on failure.

    Parameters
    ----------
    msg : list[dict]
        Chat messages in OpenAI format ({"role": ..., "content": ...}).

    Returns
    -------
    str
        The assistant's reply text.

    NOTE(review): retries are unbounded; a persistent auth/quota error will
    loop forever. The per-attempt timeout is 5 seconds.
    """
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=msg,
                api_key=api_key,
                request_timeout=5)
            break
        # Was a bare `except:` printing "Timeout, retrying..." for *every*
        # failure — catch Exception and report the actual error instead.
        except Exception as e:
            print(f"Error calling GPT-4o ({e}), retrying...")
            time.sleep(5)  # back off before retrying
    return response['choices'][0]['message']['content']
# Command-line interface: predictions file and results destination.
INPUT_JSONL = sys.argv[1]  # one JSON object per line with "video_id" and "caption" keys
OUTPUT_JSON = sys.argv[2]  # cumulative results; rewritten after every evaluated sample
script_dir = Path(__file__).resolve().parent
GT_PATH = script_dir / "final_caption_qa.json"  # ground-truth annotations shipped next to this script
NUM_PROCESSES = 10  # worker processes for the evaluation pool
def safe_parse_evaluation(response_str):
    """Extract the first ``{...}`` span from a model reply and parse it to a dict.

    The judge model is asked to answer with a Python-style dict such as
    ``{'visual': 4, 'audio': 3, 'details': 5}``. We first try
    :func:`ast.literal_eval`, which handles single quotes correctly, and only
    then fall back to the old naive quote-swap + ``json.loads`` (which breaks
    on apostrophes inside values).

    Returns an empty dict when no parseable dict is found.
    """
    try:
        match = re.search(r"\{.*\}", response_str, re.DOTALL)
        if match:
            snippet = match.group(0)
            try:
                parsed = ast.literal_eval(snippet)
                if isinstance(parsed, dict):
                    return parsed
            except (ValueError, SyntaxError):
                pass  # not a Python literal; try the JSON fallback below
            return json.loads(snippet.replace("'", '"'))
    except Exception as e:
        print(f"Error parsing evaluation output: {e}")
    return {}
def evaluate_caption(sample_id, pred_caption, true_caption):
    """Score a predicted caption against the ground truth using GPT-4o as judge.

    Asks the model for integer 1-5 scores on three dimensions (visual, audio,
    details), retrying up to 20 times when the reply cannot be parsed into
    valid integers. Returns zeros on every dimension if all attempts fail, so
    the pipeline keeps running.

    Parameters
    ----------
    sample_id : str
        Identifier included in the prompt and in error logs.
    pred_caption, true_caption : str
        Predicted and ground-truth descriptions.

    Returns
    -------
    dict
        {'visual': int, 'audio': int, 'details': int}
    """
    system_msg = (
        "You are an assistant that compares a ground truth video description and a predicted video description. "
        "Evaluate the predicted description against the ground truth on the following three dimensions:\n"
        "1. **visual**: the accuracy and completeness of visual content including the scene setting, background, characters or objects, their actions or interactions, and any OCR text.\n"
        "2. **audio**: how well it captures voices, background music, sound effects, and their emotional tone.\n"
        "3. **details**: the completeness, thematic consistency, purpose, coherence, and integration of multimodal content.\n\n"
        "For each dimension, assign an integer score from 1 to 5, following these detailed grading criteria:\n\n"
        "**Score 1:** The description is mostly irrelevant or misleading. It misrepresents or omits most key information. "
        "At least 3 important elements are missing or incorrect. Severe hallucinations may be present.\n\n"
        "**Score 2:** The description captures a few elements (1-2 aspects) but is vague or inaccurate for the rest. "
        "It is poorly structured or confusing, with major omissions or incorrect details.\n\n"
        "**Score 3:** The description aligns with the video on most elements (3 or more), but lacks depth or specificity. "
        "Some key details are missing, or minor factual errors exist. It's generally correct but too generic or incomplete.\n\n"
        "**Score 4:** A mostly accurate and complete description. Captures nearly all key information (4+ aspects), "
        "with clear structure and appropriate level of detail. Minor omissions or simplifications are acceptable.\n\n"
        "**Score 5:** Exceptionally accurate and detailed. Covers all relevant aspects thoroughly, with well-integrated information. "
        "Captures subtle nuances (e.g., emotion, scene dynamics, audio-visual interplay) and reads like it was written by a domain expert.\n\n"
        "Respond only with a valid Python dictionary in this format:\n"
        "{'visual': int, 'audio': int, 'details': int}"
    )
    user_msg = (
        f"Sample ID: {sample_id}\n"
        f"Predicted Description: {pred_caption}\n"
        f"Ground Truth Description: {true_caption}\n"
    )
    # The prompt does not change between retries — build the message list once
    # instead of rebuilding it on every loop iteration.
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]
    for _ in range(20):
        try:
            content = call_gpt4o(messages)
            print(content)
            eval_dict = safe_parse_evaluation(content)
            # int() + key lookups raise on a malformed/partial reply,
            # which triggers another retry below.
            return {
                'visual': int(eval_dict["visual"]),
                'audio': int(eval_dict["audio"]),
                'details': int(eval_dict["details"]),
            }
        except Exception as e:
            print(f"Error evaluating sample {sample_id}: {e}")
    # All retries exhausted: report zero scores rather than crashing the run.
    return {'visual': 0, 'audio': 0, 'details': 0}
def process_sample(args):
    """Pool worker: evaluate one sample and package its scores.

    `args` is a (video_id, predicted_caption, ground_truth_caption) triple;
    returns the video id alongside a record of the three dimension scores
    and their mean.
    """
    video_id, predicted, reference = args
    scores = evaluate_caption(video_id, predicted, reference)
    mean_score = sum(scores.values()) / len(scores)
    return video_id, {
        'visual_score': scores['visual'],
        'audio_score': scores['audio'],
        'details_score': scores['details'],
        'average_score': mean_score,
    }
def run_evaluation():
    """End-to-end driver: load GT, resume prior results, score new samples in parallel.

    Reads predictions from INPUT_JSONL, compares each against the ground truth
    in GT_PATH via GPT-4o workers, and rewrites OUTPUT_JSON (per-sample scores
    plus overall percentage averages) after every finished sample so an
    interrupted run can be resumed without losing work.
    """
    # --- ground truth ---
    try:
        with open(GT_PATH, 'r', encoding='utf-8') as f_gt:
            gt_anno = json.load(f_gt)["samples"]
        gt = {anno["video_id"]: anno["answer"] for anno in gt_anno}
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return

    # --- resume from a previous (partial) run, if any ---
    results = {}
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
                existing = json.load(f)
            results = existing.get('evaluations', {})
        except Exception as e:
            print(f"Warning: Failed to read previous results: {e}")
            results = {}

    # --- collect pending samples ---
    tasks = []
    with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                continue  # skip malformed lines
            video_id = sample.get('video_id')
            pred_caption = sample.get('caption', '').strip()
            if not video_id or not pred_caption:
                continue
            if video_id in results:
                continue  # already scored in a previous run
            if video_id not in gt:
                # No ground truth for this sample (e.g. the known
                # "PI-0abb79ae-18ae-40d9-a66f-3ad7744a6095" id). The old
                # code hardcoded that one id and crashed with KeyError on
                # any other missing id — skip them all instead.
                continue
            tasks.append((video_id, pred_caption, gt[video_id]))

    if not tasks:
        print("No new samples to evaluate.")
        return
    print(f"Found {len(tasks)} new samples to evaluate.")

    # --- evaluate in parallel, checkpointing after every sample ---
    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        with tqdm(total=len(tasks), desc="Evaluating samples") as pbar:
            for video_id, result_data in pool.imap_unordered(process_sample, tasks):
                if result_data:
                    results[video_id] = result_data
                all_scores = list(results.values())
                count = len(all_scores)
                sum_visual = sum(s['visual_score'] for s in all_scores)
                sum_audio = sum(s['audio_score'] for s in all_scores)
                sum_details = sum(s['details_score'] for s in all_scores)
                sum_avg = sum(s['average_score'] for s in all_scores)
                current_output = {
                    'evaluations': results,
                    # scores are on a 1-5 scale; report averages as a percentage of the max
                    'overall_visual_average': round(sum_visual / count / 5.0 * 100, 2),
                    'overall_audio_average': round(sum_audio / count / 5.0 * 100, 2),
                    'overall_details_average': round(sum_details / count / 5.0 * 100, 2),
                    'overall_average_percent': round((sum_avg / count) / 5.0 * 100, 2)
                }
                # Rewrite the whole output file each time: crash-safe resumption.
                with open(OUTPUT_JSON, 'w', encoding='utf-8') as out_f:
                    json.dump(current_output, out_f, indent=2, ensure_ascii=False)
                pbar.update(1)

    print("\nEvaluation complete.")
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            final_output = json.load(f)
        print(f"Overall visual avg: {final_output.get('overall_visual_average', 0):.2f}, "
              f"audio avg: {final_output.get('overall_audio_average', 0):.2f}, "
              f"details avg: {final_output.get('overall_details_average', 0):.2f}, "
              f"overall%: {final_output.get('overall_average_percent', 0):.2f}")
if __name__ == '__main__':
    # Entry-point guard is required: multiprocessing workers re-import this
    # module, and without the guard they would recursively start the pool.
    run_evaluation()