Spaces:

Imaginethat
/

aOt

Paused

aOt

File size: 8,105 Bytes

8a11f7f

import os
import re
import json
from datetime import datetime
from tqdm import tqdm
import multiprocessing
import sys
from pathlib import Path

# Load the OpenAI API key from ``apikey.txt`` in the current working directory.
# The value is stripped because key files usually end with a newline, and any
# surrounding whitespace would become part of the credential sent to the API.
# A missing or unreadable file is tolerated: the key falls back to an empty
# string so the module can still be imported.
try:
    with open("apikey.txt", "r") as f:
        api_key = f.read().strip()
except (OSError, UnicodeDecodeError):
    api_key = ''

def call_gpt4o(msg, max_retries=None):
    """Send a chat request to the ``gpt-4o`` model and return the reply text.

    Args:
        msg: List of chat messages (``{"role": ..., "content": ...}`` dicts)
            passed through unchanged as the ``messages`` argument.
        max_retries: Maximum number of retries after the first failed attempt.
            ``None`` (the default) retries indefinitely, which is the
            historical behaviour of this function.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        Exception: The last error from the API call, re-raised only when
            ``max_retries`` is given and has been exhausted.
    """
    # Function-scope imports: this function needs ``time`` and ``openai`` but
    # the module's import block provides neither, so the first call used to
    # fail with NameError.
    import time

    import openai

    # NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 SDK
    # interface -- confirm the pinned ``openai`` version.
    failures = 0
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=msg,
                api_key=api_key,
                request_timeout=5)
            break
        except Exception as e:
            # ``except Exception`` (not a bare ``except``) lets Ctrl-C and
            # SystemExit stop the retry loop.
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise
            # Report the real cause: not every failure is a timeout.
            print(f"Request failed ({type(e).__name__}: {e}), retrying...")
            time.sleep(5)

    output_text = response['choices'][0]['message']['content']
    return output_text


# Module-level configuration, resolved at import time.
# NOTE: indexing sys.argv here raises IndexError when the script is started
# with fewer than two command-line arguments.

# First CLI argument: path of the predictions file; run_evaluation() reads it
# line by line, parsing each line as a JSON record with 'video_id' and 'caption'.
INPUT_JSONL = sys.argv[1]
# Second CLI argument: path of the results JSON; read back (if it exists) to
# skip already-scored samples and rewritten as new results arrive.
OUTPUT_JSON = sys.argv[2]
# Directory containing this script, used to locate the ground-truth file.
script_dir = Path(__file__).resolve().parent
GT_PATH = script_dir / "final_caption_qa.json"
# Number of worker processes in the multiprocessing pool.
NUM_PROCESSES = 10

def safe_parse_evaluation(response_str):
    """Extract the score dictionary embedded in a model reply.

    The text from the first ``{`` to the last ``}`` is taken as the candidate
    and parsed with three strategies, in order:

    1. strict JSON, so valid JSON containing an apostrophe inside a string is
       not corrupted;
    2. ``ast.literal_eval``, because the prompt asks for a *Python*
       dictionary (single quotes, trailing commas);
    3. JSON after replacing single quotes with double quotes, the original
       behaviour, which still covers mixes such as ``{'ok': true}``.

    Args:
        response_str: Raw reply text from the model.

    Returns:
        The parsed ``dict``, or ``{}`` when no braces are found or no
        strategy yields a dictionary. This function never raises.
    """
    import ast  # function-scope: the module's import block does not include it

    try:
        match = re.search(r"\{.*\}", response_str, re.DOTALL)
    except TypeError as e:
        # Non-string input (e.g. ``None``) is reported, not propagated.
        print(f"Error parsing evaluation output: {e}")
        return {}
    if not match:
        return {}

    candidate = match.group(0)
    strategies = (
        json.loads,
        ast.literal_eval,
        lambda text: json.loads(text.replace("'", '"')),
    )
    last_error = None
    for parse in strategies:
        try:
            parsed = parse(candidate)
        except (ValueError, SyntaxError, TypeError,
                RecursionError, MemoryError) as e:
            last_error = e
            continue
        # ``literal_eval`` can yield a set for ``{1, 2}``; only accept dicts.
        if isinstance(parsed, dict):
            return parsed
        last_error = ValueError(
            f"expected a dictionary, got {type(parsed).__name__}")

    print(f"Error parsing evaluation output: {last_error}")
    return {}

def evaluate_caption(sample_id, pred_caption, true_caption):
    """Score a predicted caption against its ground truth using GPT-4o.

    The model is asked to grade three dimensions (``visual``, ``audio``,
    ``details``) with an integer from 1 to 5 each. The request is attempted
    up to 20 times; an attempt fails when the call raises, the reply cannot
    be parsed, a dimension is missing or non-integer, or a score falls
    outside 1-5 (which would otherwise distort the percentage averages
    computed from these scores).

    Args:
        sample_id: Identifier of the sample, included in the prompt and in
            error messages.
        pred_caption: The predicted description to grade.
        true_caption: The ground-truth description.

    Returns:
        ``{'visual': int, 'audio': int, 'details': int}``. All three values
        are ``0`` when every attempt failed.
    """

    system_msg = (
        "You are an assistant that compares a ground truth video description and a predicted video description. "
        "Evaluate the predicted description against the ground truth on the following three dimensions:\n"
        "1. **visual**: the accuracy and completeness of visual content including the scene setting, background, characters or objects, their actions or interactions, and any OCR text.\n"
        "2. **audio**: how well it captures voices, background music, sound effects, and their emotional tone.\n"
        "3. **details**: the completeness, thematic consistency, purpose, coherence, and integration of multimodal content.\n\n"

        "For each dimension, assign an integer score from 1 to 5, following these detailed grading criteria:\n\n"

        "**Score 1:** The description is mostly irrelevant or misleading. It misrepresents or omits most key information. "
        "At least 3 important elements are missing or incorrect. Severe hallucinations may be present.\n\n"

        "**Score 2:** The description captures a few elements (1-2 aspects) but is vague or inaccurate for the rest. "
        "It is poorly structured or confusing, with major omissions or incorrect details.\n\n"

        "**Score 3:** The description aligns with the video on most elements (3 or more), but lacks depth or specificity. "
        "Some key details are missing, or minor factual errors exist. It's generally correct but too generic or incomplete.\n\n"

        "**Score 4:** A mostly accurate and complete description. Captures nearly all key information (4+ aspects), "
        "with clear structure and appropriate level of detail. Minor omissions or simplifications are acceptable.\n\n"

        "**Score 5:** Exceptionally accurate and detailed. Covers all relevant aspects thoroughly, with well-integrated information. "
        "Captures subtle nuances (e.g., emotion, scene dynamics, audio-visual interplay) and reads like it was written by a domain expert.\n\n"


        "Respond only with a valid Python dictionary in this format:\n"
        "{'visual': int, 'audio': int, 'details': int}"
    )

    user_msg = (
        f"Sample ID: {sample_id}\n"
        f"Predicted Description: {pred_caption}\n"
        f"Ground Truth Description: {true_caption}\n"
    )

    # The prompt is identical for every attempt, so build it once.
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg}
    ]
    dimensions = ('visual', 'audio', 'details')

    for _ in range(20):
        try:
            content = call_gpt4o(messages)
            print(content)
            eval_dict = safe_parse_evaluation(content)

            # KeyError / ValueError here (missing or non-integer score) is
            # handled below and triggers another attempt.
            scores = {name: int(eval_dict[name]) for name in dimensions}

            out_of_range = {
                name: value for name, value in scores.items()
                if not 1 <= value <= 5
            }
            if out_of_range:
                raise ValueError(f"scores outside the 1-5 scale: {out_of_range}")
            return scores
        except Exception as e:
            # Broad on purpose: any failure of a single attempt should lead
            # to a retry rather than abort the worker.
            print(f"Error evaluating sample {sample_id}: {e}")

    # Every attempt failed: fall back to all-zero scores.
    return {name: 0 for name in dimensions}

def process_sample(args):
    """Evaluate one sample and package its scores for the results file.

    Args:
        args: A ``(video_id, pred_caption, true_caption)`` tuple; a single
            argument so the function can be mapped over a task list.

    Returns:
        ``(video_id, result)`` where ``result`` holds the three per-dimension
        scores plus their arithmetic mean under ``'average_score'``.
    """
    sample_key, predicted, reference = args
    per_dimension = evaluate_caption(sample_key, predicted, reference)

    total = sum(per_dimension.values())
    mean_score = total / len(per_dimension)

    packaged = {}
    packaged['visual_score'] = per_dimension['visual']
    packaged['audio_score'] = per_dimension['audio']
    packaged['details_score'] = per_dimension['details']
    packaged['average_score'] = mean_score
    return sample_key, packaged

def _load_ground_truth(gt_path):
    """Return a ``{video_id: answer}`` mapping read from the ground-truth JSON."""
    with open(gt_path, 'r', encoding='utf-8') as f_gt:
        gt_anno = json.load(f_gt)["samples"]
    return {anno["video_id"]: anno["answer"] for anno in gt_anno}


def _collect_tasks(input_path, gt, results):
    """Build the list of ``(video_id, pred_caption, true_caption)`` still to score.

    Lines are skipped when they are not valid JSON objects, lack a video id
    or a non-empty string caption, are already in ``results``, repeat an id
    queued earlier in the same file, or have no ground-truth entry.
    """
    tasks = []
    queued = set()
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(sample, dict):
                continue

            video_id = sample.get('video_id')
            pred_caption = sample.get('caption', '')
            # A null / non-string caption would crash on ``.strip()``.
            if not video_id or not isinstance(pred_caption, str):
                continue
            pred_caption = pred_caption.strip()
            if not pred_caption:
                continue

            # Skip finished samples and duplicates, which would otherwise be
            # sent to the API twice.
            if video_id in results or video_id in queued:
                continue
            if video_id == "PI-0abb79ae-18ae-40d9-a66f-3ad7744a6095": # no GT
                continue
            if video_id not in gt:
                # Previously an uncaught KeyError that aborted the whole run.
                print(f"Warning: no ground truth for {video_id}, skipping.")
                continue

            queued.add(video_id)
            tasks.append((video_id, pred_caption, gt[video_id]))
    return tasks


def _build_output(results):
    """Combine per-sample results with overall averages as percentages of 5."""
    all_scores = list(results.values())
    count = len(all_scores)
    sum_visual = sum(s['visual_score'] for s in all_scores)
    sum_audio = sum(s['audio_score'] for s in all_scores)
    sum_details = sum(s['details_score'] for s in all_scores)
    sum_avg = sum(s['average_score'] for s in all_scores)
    return {
        'evaluations': results,
        'overall_visual_average': round(sum_visual / count / 5.0 * 100, 2),
        'overall_audio_average': round(sum_audio / count / 5.0 * 100, 2),
        'overall_details_average': round(sum_details / count / 5.0 * 100, 2),
        'overall_average_percent': round((sum_avg / count) / 5.0 * 100, 2)
    }


def _write_output(output_path, output):
    """Write ``output`` as JSON via a temporary file and ``os.replace``.

    Writing in place would leave a truncated file if the process is killed
    mid-write, and the next run would then discard all previous results.
    """
    tmp_path = f"{output_path}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as out_f:
        json.dump(output, out_f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, output_path)


def run_evaluation():
    """Score every not-yet-evaluated prediction and checkpoint the results.

    Loads the ground truth from ``GT_PATH`` and any earlier results from
    ``OUTPUT_JSON``, then evaluates the remaining samples of ``INPUT_JSONL``
    in a pool of ``NUM_PROCESSES`` workers. After each finished sample the
    full results and the overall averages are rewritten to ``OUTPUT_JSON``,
    so an interrupted run can be resumed. Prints a summary at the end.

    Returns:
        None. Returns early, after printing a message, when the ground truth
        cannot be loaded or there is nothing new to evaluate.
    """
    try:
        gt = _load_ground_truth(GT_PATH)
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return

    results = {}
    if os.path.exists(OUTPUT_JSON):
        try:
            with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
                existing = json.load(f)
                results = existing.get('evaluations', {})
        except Exception as e:
            print(f"Warning: Failed to read previous results: {e}")
            results = {}

    tasks = _collect_tasks(INPUT_JSONL, gt, results)

    if not tasks:
        print("No new samples to evaluate.")
        return

    print(f"Found {len(tasks)} new samples to evaluate.")

    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        with tqdm(total=len(tasks), desc="Evaluating samples") as pbar:
            for video_id, result_data in pool.imap_unordered(process_sample, tasks):
                if result_data:
                    results[video_id] = result_data
                    # Checkpoint after every sample.
                    _write_output(OUTPUT_JSON, _build_output(results))

                pbar.update(1)

    print("\nEvaluation complete.")
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            final_output = json.load(f)
            print(f"Overall visual avg: {final_output.get('overall_visual_average', 0):.2f}, "
                  f"audio avg: {final_output.get('overall_audio_average', 0):.2f}, "
                  f"details avg: {final_output.get('overall_details_average', 0):.2f}, "
                  f"overall%: {final_output.get('overall_average_percent', 0):.2f}")

# Script entry point: run the evaluation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    run_evaluation()