"""Evaluate predicted video captions against ground-truth captions using GPT-4o.

Usage:
    python evaluate_captions.py <predictions.jsonl> <output.json>

Ground truth is read from final_caption_qa.json next to this script. The output
JSON is checkpointed after every sample so an interrupted run can resume.
"""

import json
import multiprocessing
import os
import re
import sys
import time
from pathlib import Path

import openai
from tqdm import tqdm

# Read the OpenAI API key from a local file; fall back to an empty key.
try:
    with open("apikey.txt", "r") as f:
        api_key = f.read().strip()
except OSError:
    api_key = ''


def call_gpt4o(msg):
    """Send a chat request to GPT-4o (legacy openai<1.0 SDK), retrying until it succeeds."""
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=msg,
                api_key=api_key,
                request_timeout=5)
            break
        except Exception:
            print("Timeout, retrying...")
            time.sleep(5)
    return response['choices'][0]['message']['content']


INPUT_JSONL = sys.argv[1]
OUTPUT_JSON = sys.argv[2]
script_dir = Path(__file__).resolve().parent
GT_PATH = script_dir / "final_caption_qa.json"
NUM_PROCESSES = 10


def safe_parse_evaluation(response_str):
    """Extract the first {...} block from the model output and parse it as JSON.

    The model is prompted to return a Python-style dict, so single quotes are
    swapped for double quotes before parsing. Returns {} on any failure.
    """
    try:
        match = re.search(r"\{.*\}", response_str, re.DOTALL)
        if match:
            return json.loads(match.group(0).replace("'", '"'))
    except Exception as e:
        print(f"Error parsing evaluation output: {e}")
    return {}


def evaluate_caption(sample_id, pred_caption, true_caption):
    """Score a predicted caption against the ground truth on three dimensions."""
    system_msg = (
        "You are an assistant that compares a ground truth video description and a predicted video description. "
        "Evaluate the predicted description against the ground truth on the following three dimensions:\n"
        "1. **visual**: the accuracy and completeness of visual content including the scene setting, background, characters or objects, their actions or interactions, and any OCR text.\n"
        "2. **audio**: how well it captures voices, background music, sound effects, and their emotional tone.\n"
        "3. **details**: the completeness, thematic consistency, purpose, coherence, and integration of multimodal content.\n\n"
        "For each dimension, assign an integer score from 1 to 5, following these detailed grading criteria:\n\n"
        "**Score 1:** The description is mostly irrelevant or misleading. It misrepresents or omits most key information. "
        "At least 3 important elements are missing or incorrect. Severe hallucinations may be present.\n\n"
        "**Score 2:** The description captures a few elements (1-2 aspects) but is vague or inaccurate for the rest. "
        "It is poorly structured or confusing, with major omissions or incorrect details.\n\n"
        "**Score 3:** The description aligns with the video on most elements (3 or more), but lacks depth or specificity. "
        "Some key details are missing, or minor factual errors exist. It's generally correct but too generic or incomplete.\n\n"
        "**Score 4:** A mostly accurate and complete description. Captures nearly all key information (4+ aspects), "
        "with clear structure and appropriate level of detail. Minor omissions or simplifications are acceptable.\n\n"
        "**Score 5:** Exceptionally accurate and detailed. Covers all relevant aspects thoroughly, with well-integrated information. "
        "Captures subtle nuances (e.g., emotion, scene dynamics, audio-visual interplay) and reads like it was written by a domain expert.\n\n"
        "Respond only with a valid Python dictionary in this format:\n"
        "{'visual': int, 'audio': int, 'details': int}"
    )
    user_msg = (
        f"Sample ID: {sample_id}\n"
        f"Predicted Description: {pred_caption}\n"
        f"Ground Truth Description: {true_caption}\n"
    )
    # Retry up to 20 times in case the model returns an unparseable response.
    for _ in range(20):
        try:
            messages = [
                {"role": "system", "content": system_msg},
                {"role": "user", "content": user_msg},
            ]
            content = call_gpt4o(messages)
            print(content)
            eval_dict = safe_parse_evaluation(content)
            v = int(eval_dict["visual"])
            a = int(eval_dict["audio"])
            d = int(eval_dict["details"])
            return {'visual': v, 'audio': a, 'details': d}
        except Exception as e:
            print(f"Error evaluating sample {sample_id}: {e}")
    # All retries failed: fall back to zero scores.
    return {'visual': 0, 'audio': 0, 'details': 0}


def process_sample(args):
    """Pool worker: evaluate one (video_id, prediction, ground truth) triple."""
    video_id, pred_caption, true_caption = args
    scores = evaluate_caption(video_id, pred_caption, true_caption)
    avg_score = sum(scores.values()) / len(scores)
    result_data = {
        'visual_score': scores['visual'],
        'audio_score': scores['audio'],
        'details_score': scores['details'],
        'average_score': avg_score,
    }
    return video_id, result_data


def run_evaluation():
    # Load ground-truth captions keyed by video_id.
    try:
        with open(GT_PATH, 'r', encoding='utf-8') as f_gt:
            gt_anno = json.load(f_gt)["samples"]
        gt = {anno["video_id"]: anno["answer"] for anno in gt_anno}
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return

    # Resume from a previous run if the output file already exists.
    results = {}
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
                existing = json.load(f)
            results = existing.get('evaluations', {})
        except Exception as e:
            print(f"Warning: Failed to read previous results: {e}")
            results = {}

    # Collect the predictions that still need to be evaluated.
    tasks = []
    with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                sample = json.loads(line)
                video_id = sample.get('video_id')
                pred_caption = sample.get('caption', '').strip()
                if not video_id or not pred_caption:
                    continue
                if video_id in results:
                    continue
                if video_id == "PI-0abb79ae-18ae-40d9-a66f-3ad7744a6095":  # no GT
                    continue
                if video_id not in gt:  # skip predictions without a ground-truth entry
                    continue
                true_caption = gt[video_id]
                tasks.append((video_id, pred_caption, true_caption))
            except json.JSONDecodeError:
                continue

    if not tasks:
        print("No new samples to evaluate.")
        return
    print(f"Found {len(tasks)} new samples to evaluate.")

    # Evaluate in parallel; rewrite the output file after every sample so the
    # run can be interrupted and resumed without losing completed work.
    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        with tqdm(total=len(tasks), desc="Evaluating samples") as pbar:
            for video_id, result_data in pool.imap_unordered(process_sample, tasks):
                if result_data:
                    results[video_id] = result_data

                all_scores = list(results.values())
                count = len(all_scores)
                sum_visual = sum(s['visual_score'] for s in all_scores)
                sum_audio = sum(s['audio_score'] for s in all_scores)
                sum_details = sum(s['details_score'] for s in all_scores)
                sum_avg = sum(s['average_score'] for s in all_scores)

                # Scores are on a 1-5 scale; report the averages as percentages.
                current_output = {
                    'evaluations': results,
                    'overall_visual_average': round(sum_visual / count / 5.0 * 100, 2),
                    'overall_audio_average': round(sum_audio / count / 5.0 * 100, 2),
                    'overall_details_average': round(sum_details / count / 5.0 * 100, 2),
                    'overall_average_percent': round((sum_avg / count) / 5.0 * 100, 2),
                }
                with open(OUTPUT_JSON, 'w', encoding='utf-8') as out_f:
                    json.dump(current_output, out_f, indent=2, ensure_ascii=False)
                pbar.update(1)

    print("\nEvaluation complete.")
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            final_output = json.load(f)
        print(f"Overall visual avg: {final_output.get('overall_visual_average', 0):.2f}, "
              f"audio avg: {final_output.get('overall_audio_average', 0):.2f}, "
              f"details avg: {final_output.get('overall_details_average', 0):.2f}, "
              f"overall%: {final_output.get('overall_average_percent', 0):.2f}")


if __name__ == '__main__':
    run_evaluation()