Spaces:
Paused
Paused
File size: 8,105 Bytes
8a11f7f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 |
import os
import re
import json
from datetime import datetime
from tqdm import tqdm
import multiprocessing
import sys
from pathlib import Path
# Load the OpenAI API key from "apikey.txt" in the current working directory.
# Falling back to an empty string keeps the module importable when the file
# is absent or unreadable.
try:
    with open("apikey.txt", "r", encoding="utf-8") as f:
        # Strip surrounding whitespace: a trailing newline left by an editor
        # would otherwise be sent as part of the key.
        api_key = f.read().strip()
except (OSError, UnicodeDecodeError):
    api_key = ''
def call_gpt4o(msg, max_retries=None, retry_delay=5):
    """Send a chat conversation to the "gpt-4o" model and return the reply text.

    Args:
        msg: List of chat messages (dicts with "role" and "content") passed
            unchanged as ``messages`` to ``openai.ChatCompletion.create``.
        max_retries: Number of retries allowed after the first failed attempt.
            ``None`` (the default) retries indefinitely, which is the
            original behaviour.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        Exception: The last request error, only when ``max_retries`` is set
            and has been exhausted.
    """
    # Imported locally because the file's visible import block brings neither
    # name into scope; without them every call failed with NameError.
    import time

    import openai

    attempt = 0
    while True:
        attempt += 1
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=msg,
                api_key=api_key,
                request_timeout=5,
            )
            break
        except Exception as e:
            # `Exception` rather than a bare except so Ctrl-C / SystemExit
            # still stop the process instead of being retried forever.
            if max_retries is not None and attempt > max_retries:
                raise
            # Report the real cause: not every failure here is a timeout.
            print(f"Request failed ({type(e).__name__}: {e}), retrying...")
            time.sleep(retry_delay)
    return response['choices'][0]['message']['content']
# Positional command-line arguments: the first is read by run_evaluation() as
# the predictions JSONL file, the second is the results JSON it reads/writes.
INPUT_JSONL, OUTPUT_JSON = sys.argv[1], sys.argv[2]

# The ground-truth annotation file sits in the same directory as this script.
script_dir = Path(__file__).resolve().parent
GT_PATH = script_dir.joinpath("final_caption_qa.json")

# Size of the multiprocessing pool used to score samples.
NUM_PROCESSES = 10
def safe_parse_evaluation(response_str):
    """Extract the score dictionary embedded in a model reply.

    The outermost ``{...}`` span of ``response_str`` is located and parsed,
    trying in order: strict JSON, a Python literal (the prompt asks the model
    for a Python dictionary, so single-quoted keys are expected), and finally
    the legacy "swap single quotes for double quotes" JSON fallback.

    Args:
        response_str: Raw text returned by the model.

    Returns:
        The parsed dictionary, or an empty dict when no dictionary can be
        recovered. Parsing problems are printed, never raised.
    """
    import ast  # local import: only this function needs it

    try:
        match = re.search(r"\{.*\}", response_str, re.DOTALL)
    except TypeError as e:
        # Non-string input (e.g. None) cannot be searched.
        print(f"Error parsing evaluation output: {e}")
        return {}
    if not match:
        return {}

    candidate = match.group(0)
    parsers = (
        json.loads,
        # literal_eval only accepts literals, so it is safe on model output
        # and copes with apostrophes inside values, unlike quote swapping.
        ast.literal_eval,
        lambda text: json.loads(text.replace("'", '"')),
    )
    last_error = None
    for parse in parsers:
        try:
            parsed = parse(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
            last_error = e
            continue
        if isinstance(parsed, dict):
            return parsed
        # e.g. "{1, 2}" evaluates to a set, which callers cannot index by key.
        last_error = TypeError(f"expected a dict, got {type(parsed).__name__}")

    print(f"Error parsing evaluation output: {last_error}")
    return {}
def evaluate_caption(sample_id, pred_caption, true_caption, max_attempts=20):
    """Score a predicted caption against its ground truth using GPT-4o.

    The model is asked for integer scores (1-5) on three dimensions. A reply
    that cannot be parsed, lacks a dimension, or contains a score outside the
    1-5 rubric counts as a failed attempt and the request is repeated.

    Args:
        sample_id: Identifier of the sample; included in the prompt and in
            error messages.
        pred_caption: The predicted description to evaluate.
        true_caption: The ground-truth description.
        max_attempts: How many times to query the model before giving up.

    Returns:
        A dict ``{'visual': int, 'audio': int, 'details': int}``. All three
        values are 0 when every attempt failed.
    """
    system_msg = (
        "You are an assistant that compares a ground truth video description and a predicted video description. "
        "Evaluate the predicted description against the ground truth on the following three dimensions:\n"
        "1. **visual**: the accuracy and completeness of visual content including the scene setting, background, characters or objects, their actions or interactions, and any OCR text.\n"
        "2. **audio**: how well it captures voices, background music, sound effects, and their emotional tone.\n"
        "3. **details**: the completeness, thematic consistency, purpose, coherence, and integration of multimodal content.\n\n"
        "For each dimension, assign an integer score from 1 to 5, following these detailed grading criteria:\n\n"
        "**Score 1:** The description is mostly irrelevant or misleading. It misrepresents or omits most key information. "
        "At least 3 important elements are missing or incorrect. Severe hallucinations may be present.\n\n"
        "**Score 2:** The description captures a few elements (1-2 aspects) but is vague or inaccurate for the rest. "
        "It is poorly structured or confusing, with major omissions or incorrect details.\n\n"
        "**Score 3:** The description aligns with the video on most elements (3 or more), but lacks depth or specificity. "
        "Some key details are missing, or minor factual errors exist. It's generally correct but too generic or incomplete.\n\n"
        "**Score 4:** A mostly accurate and complete description. Captures nearly all key information (4+ aspects), "
        "with clear structure and appropriate level of detail. Minor omissions or simplifications are acceptable.\n\n"
        "**Score 5:** Exceptionally accurate and detailed. Covers all relevant aspects thoroughly, with well-integrated information. "
        "Captures subtle nuances (e.g., emotion, scene dynamics, audio-visual interplay) and reads like it was written by a domain expert.\n\n"
        "Respond only with a valid Python dictionary in this format:\n"
        "{'visual': int, 'audio': int, 'details': int}"
    )
    user_msg = (
        f"Sample ID: {sample_id}\n"
        f"Predicted Description: {pred_caption}\n"
        f"Ground Truth Description: {true_caption}\n"
    )
    # The conversation is identical on every attempt, so build it once.
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]
    dimensions = ("visual", "audio", "details")

    for _ in range(max_attempts):
        try:
            content = call_gpt4o(messages)
            print(content)
            eval_dict = safe_parse_evaluation(content)
            scores = {dim: int(eval_dict[dim]) for dim in dimensions}
            # Anything outside the rubric would skew the percentages computed
            # downstream (which divide by 5), so treat it as a bad reply.
            out_of_range = {k: v for k, v in scores.items() if not 1 <= v <= 5}
            if out_of_range:
                raise ValueError(f"scores outside 1-5: {out_of_range}")
            return scores
        except Exception as e:
            # Best effort: report the failure and ask the model again.
            print(f"Error evaluating sample {sample_id}: {e}")

    # Every attempt failed; 0 marks the sample as unscored.
    return {dim: 0 for dim in dimensions}
def process_sample(args):
    """Score one sample; the worker function mapped over the process pool.

    Args:
        args: A ``(video_id, pred_caption, true_caption)`` tuple.

    Returns:
        ``(video_id, result)`` where ``result`` holds the three per-dimension
        scores plus their arithmetic mean under ``'average_score'``.
    """
    sample_key, predicted, reference = args
    dim_scores = evaluate_caption(sample_key, predicted, reference)

    summary = {}
    summary['visual_score'] = dim_scores['visual']
    summary['audio_score'] = dim_scores['audio']
    summary['details_score'] = dim_scores['details']
    # Mean over every dimension returned by the evaluator.
    summary['average_score'] = sum(dim_scores.values()) / len(dim_scores)
    return sample_key, summary
def _summarize_results(results):
    """Return the output payload: per-sample results plus overall percentages."""
    all_scores = list(results.values())
    count = len(all_scores)

    def percent(key):
        # Mean score on the 5-point scale expressed as a percentage of 5.
        if not count:
            return 0.0
        return round(sum(s[key] for s in all_scores) / count / 5.0 * 100, 2)

    return {
        'evaluations': results,
        'overall_visual_average': percent('visual_score'),
        'overall_audio_average': percent('audio_score'),
        'overall_details_average': percent('details_score'),
        'overall_average_percent': percent('average_score'),
    }


def _write_json_atomically(path, payload):
    """Write ``payload`` as JSON to ``path`` via a temp file and os.replace."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as out_f:
        json.dump(payload, out_f, indent=2, ensure_ascii=False)
    # The rename only happens after a complete write, so an interruption
    # cannot leave a truncated checkpoint behind.
    os.replace(tmp_path, path)


def _collect_pending_tasks(gt, results):
    """Read INPUT_JSONL and return (video_id, prediction, ground truth) tuples
    for samples that still need scoring."""
    tasks = []
    with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(sample, dict):
                continue
            video_id = sample.get('video_id')
            pred_caption = sample.get('caption')
            # A missing, null or non-string caption cannot be evaluated.
            if not isinstance(pred_caption, str):
                continue
            pred_caption = pred_caption.strip()
            if not video_id or not pred_caption:
                continue
            if video_id in results:
                continue
            if video_id == "PI-0abb79ae-18ae-40d9-a66f-3ad7744a6095":  # no GT
                continue
            if video_id not in gt:
                # Skip instead of aborting the whole run with a KeyError.
                print(f"Warning: no ground truth for {video_id}, skipping.")
                continue
            tasks.append((video_id, pred_caption, gt[video_id]))
    return tasks


def run_evaluation():
    """Score every not-yet-evaluated prediction and checkpoint the results.

    Reads ground truth from ``GT_PATH``, predictions from ``INPUT_JSONL`` and
    any earlier results from ``OUTPUT_JSON``. Samples already present in the
    earlier results are skipped. New samples are scored by ``process_sample``
    in a pool of ``NUM_PROCESSES`` workers, and ``OUTPUT_JSON`` is rewritten
    after each completed sample so an interrupted run can resume.

    Returns:
        None. Progress and the final averages are printed.
    """
    try:
        with open(GT_PATH, 'r', encoding='utf-8') as f_gt:
            gt_anno = json.load(f_gt)["samples"]
        gt = {anno["video_id"]: anno["answer"] for anno in gt_anno}
    except (OSError, ValueError, KeyError, TypeError) as e:
        print(f"Error loading ground truth file: {e}")
        return

    results = {}
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
                existing = json.load(f)
            results = existing.get('evaluations', {})
        except (OSError, ValueError, AttributeError) as e:
            print(f"Warning: Failed to read previous results: {e}")
            results = {}
        if not isinstance(results, dict):
            results = {}

    tasks = _collect_pending_tasks(gt, results)
    if not tasks:
        print("No new samples to evaluate.")
        return
    print(f"Found {len(tasks)} new samples to evaluate.")

    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        with tqdm(total=len(tasks), desc="Evaluating samples") as pbar:
            for video_id, result_data in pool.imap_unordered(process_sample, tasks):
                if result_data:
                    results[video_id] = result_data
                    # Checkpoint after every sample: API calls are slow and
                    # costly, so finished work must survive a crash.
                    _write_json_atomically(OUTPUT_JSON, _summarize_results(results))
                pbar.update(1)

    print("\nEvaluation complete.")
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            final_output = json.load(f)
        print(f"Overall visual avg: {final_output.get('overall_visual_average', 0):.2f}, "
              f"audio avg: {final_output.get('overall_audio_average', 0):.2f}, "
              f"details avg: {final_output.get('overall_details_average', 0):.2f}, "
              f"overall%: {final_output.get('overall_average_percent', 0):.2f}")
# Script entry point. The guard also keeps multiprocessing workers that
# re-import this module from starting the evaluation again.
if __name__ == '__main__':
    run_evaluation()