File size: 8,105 Bytes
8a11f7f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
import ast
import json
import multiprocessing
import os
import re
import sys
import time
from datetime import datetime
from pathlib import Path

# NOTE(review): this module calls openai.ChatCompletion (legacy <1.0 API) but
# never imports openai — add `import openai` here for the script to run.
from tqdm import tqdm

# Load the OpenAI API key from a local file. Missing/unreadable file falls
# back to an empty key (requests then fail with an auth error instead of
# crashing at import time). Was a bare `except:`; narrowed to OSError so
# unrelated errors are not silently swallowed. `.strip()` removes the
# trailing newline most editors leave, which would otherwise break auth.
try:
    with open("apikey.txt", "r") as f:
        api_key = f.read().strip()
except OSError:
    api_key = ''

def call_gpt4o(msg):
    """Send a chat request to GPT-4o and return the assistant's reply text.

    Retries indefinitely with a 5-second back-off on request failures
    (timeouts, rate limits, transient network errors).

    NOTE(review): uses the legacy ``openai.ChatCompletion`` API (openai<1.0)
    and requires ``openai`` and ``time`` to be imported at module level.

    Args:
        msg: Chat messages list, e.g. ``[{"role": ..., "content": ...}]``.

    Returns:
        The content string of the first choice in the response.
    """
    while True:
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=msg,
                api_key=api_key,
                request_timeout=5)
            break
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit, making the retry loop impossible to interrupt. The old
        # message claimed "Timeout" for every failure; report the real cause.
        except Exception as e:
            print(f"Request failed ({e}), retrying...")
            time.sleep(5)

    return response['choices'][0]['message']['content']


# Runtime configuration.
# Usage: python <script> <predictions.jsonl> <output.json>
INPUT_JSONL = sys.argv[1]  # predictions: one JSON object per line with "video_id" and "caption"
OUTPUT_JSON = sys.argv[2]  # destination for incrementally checkpointed evaluation results
script_dir = Path(__file__).resolve().parent
# Ground-truth captions file shipped alongside this script.
GT_PATH = script_dir / "final_caption_qa.json"
NUM_PROCESSES = 10  # worker processes for parallel GPT-judged evaluation

def safe_parse_evaluation(response_str):
    """Extract the first ``{...}`` span from a model reply and parse it.

    The judge is asked to answer with a Python-style dict such as
    ``{'visual': 4, 'audio': 3, 'details': 5}``. Strict JSON is tried first,
    then ``ast.literal_eval``, which handles single-quoted keys safely. (The
    previous ``.replace("'", '"')`` trick corrupted any payload containing an
    apostrophe, e.g. a quoted caption fragment like "It's".)

    Args:
        response_str: Raw model output, possibly with surrounding prose.

    Returns:
        The parsed dict, or ``{}`` when no parseable dict is found.
    """
    match = re.search(r"\{.*\}", response_str, re.DOTALL)
    if not match:
        return {}
    candidate = match.group(0)
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass
    try:
        parsed = ast.literal_eval(candidate)
        return parsed if isinstance(parsed, dict) else {}
    except (ValueError, SyntaxError) as e:
        print(f"Error parsing evaluation output: {e}")
    return {}

def evaluate_caption(sample_id, pred_caption, true_caption):
    """Score a predicted video caption against its ground truth via GPT-4o.

    The judge model returns integer scores (1-5) for three axes: visual,
    audio, and details. Malformed replies are retried up to 20 times; if
    every attempt fails, all three scores fall back to 0.

    Args:
        sample_id: Identifier included in the prompt for traceability.
        pred_caption: Model-predicted description.
        true_caption: Ground-truth description.

    Returns:
        Dict with integer keys 'visual', 'audio', 'details'.
    """
    system_msg = (
        "You are an assistant that compares a ground truth video description and a predicted video description. "
        "Evaluate the predicted description against the ground truth on the following three dimensions:\n"
        "1. **visual**: the accuracy and completeness of visual content including the scene setting, background, characters or objects, their actions or interactions, and any OCR text.\n"
        "2. **audio**: how well it captures voices, background music, sound effects, and their emotional tone.\n"
        "3. **details**: the completeness, thematic consistency, purpose, coherence, and integration of multimodal content.\n\n"

        "For each dimension, assign an integer score from 1 to 5, following these detailed grading criteria:\n\n"

        "**Score 1:** The description is mostly irrelevant or misleading. It misrepresents or omits most key information. "
        "At least 3 important elements are missing or incorrect. Severe hallucinations may be present.\n\n"

        "**Score 2:** The description captures a few elements (1-2 aspects) but is vague or inaccurate for the rest. "
        "It is poorly structured or confusing, with major omissions or incorrect details.\n\n"

        "**Score 3:** The description aligns with the video on most elements (3 or more), but lacks depth or specificity. "
        "Some key details are missing, or minor factual errors exist. It's generally correct but too generic or incomplete.\n\n"

        "**Score 4:** A mostly accurate and complete description. Captures nearly all key information (4+ aspects), "
        "with clear structure and appropriate level of detail. Minor omissions or simplifications are acceptable.\n\n"

        "**Score 5:** Exceptionally accurate and detailed. Covers all relevant aspects thoroughly, with well-integrated information. "
        "Captures subtle nuances (e.g., emotion, scene dynamics, audio-visual interplay) and reads like it was written by a domain expert.\n\n"


        "Respond only with a valid Python dictionary in this format:\n"
        "{'visual': int, 'audio': int, 'details': int}"
    )

    user_msg = (
        f"Sample ID: {sample_id}\n"
        f"Predicted Description: {pred_caption}\n"
        f"Ground Truth Description: {true_caption}\n"
    )

    # The prompt never changes between attempts, so build it once up front.
    chat = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": user_msg},
    ]

    for _attempt in range(20):
        try:
            content = call_gpt4o(chat)
            print(content)
            parsed = safe_parse_evaluation(content)
            # int() coercion raises (and retries) on missing/non-numeric keys.
            return {axis: int(parsed[axis]) for axis in ('visual', 'audio', 'details')}
        except Exception as e:
            print(f"Error evaluating sample {sample_id}: {e}")

    # Every attempt produced an unusable reply: score the sample as all zeros.
    return {'visual': 0, 'audio': 0, 'details': 0}

def process_sample(args):
    """Worker entry point: score one sample and package the result.

    Args:
        args: Tuple of (video_id, predicted caption, ground-truth caption) —
            packed into one argument for ``Pool.imap_unordered``.

    Returns:
        Tuple of (video_id, dict with per-axis scores and their mean).
    """
    video_id, predicted, reference = args
    scores = evaluate_caption(video_id, predicted, reference)
    mean_score = sum(scores.values()) / len(scores)
    return video_id, {
        'visual_score': scores['visual'],
        'audio_score': scores['audio'],
        'details_score': scores['details'],
        'average_score': mean_score,
    }

def _load_ground_truth():
    """Load ground truth as {video_id: answer} from GT_PATH; None on failure."""
    try:
        with open(GT_PATH, 'r', encoding='utf-8') as f_gt:
            gt_anno = json.load(f_gt)["samples"]
        return {anno["video_id"]: anno["answer"] for anno in gt_anno}
    except Exception as e:
        print(f"Error loading ground truth file: {e}")
        return None

def _load_existing_results():
    """Read previously saved evaluations so reruns resume instead of redoing work."""
    if not os.path.exists(OUTPUT_JSON):
        return {}
    try:
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            return json.load(f).get('evaluations', {})
    except Exception as e:
        print(f"Warning: Failed to read previous results: {e}")
        return {}

def _collect_tasks(gt, results):
    """Build the list of (video_id, prediction, ground_truth) still to score."""
    tasks = []
    with open(INPUT_JSONL, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                sample = json.loads(line)
            except json.JSONDecodeError:
                continue
            video_id = sample.get('video_id')
            pred_caption = sample.get('caption', '').strip()

            if not video_id or not pred_caption:
                continue
            if video_id in results:
                continue  # already evaluated in a previous run
            if video_id == "PI-0abb79ae-18ae-40d9-a66f-3ad7744a6095": # no GT
                continue
            # BUGFIX: gt[video_id] used to raise KeyError for ANY prediction
            # without a ground-truth entry, aborting the whole run (the
            # hard-coded ID above was a one-off patch for exactly this).
            if video_id not in gt:
                print(f"Warning: no ground truth for {video_id}, skipping.")
                continue

            tasks.append((video_id, pred_caption, gt[video_id]))
    return tasks

def _write_checkpoint(results):
    """Write all results plus running averages (percent of the max score 5)."""
    all_scores = list(results.values())
    count = len(all_scores)
    current_output = {
        'evaluations': results,
        'overall_visual_average': round(sum(s['visual_score'] for s in all_scores) / count / 5.0 * 100, 2),
        'overall_audio_average': round(sum(s['audio_score'] for s in all_scores) / count / 5.0 * 100, 2),
        'overall_details_average': round(sum(s['details_score'] for s in all_scores) / count / 5.0 * 100, 2),
        'overall_average_percent': round(sum(s['average_score'] for s in all_scores) / count / 5.0 * 100, 2),
    }
    with open(OUTPUT_JSON, 'w', encoding='utf-8') as out_f:
        json.dump(current_output, out_f, indent=2, ensure_ascii=False)

def run_evaluation():
    """Evaluate every new prediction in INPUT_JSONL against the ground truth.

    Results are checkpointed to OUTPUT_JSON after every completed sample, so
    an interrupted run loses at most one in-flight evaluation and resumes
    where it left off.
    """
    gt = _load_ground_truth()
    if gt is None:
        return

    results = _load_existing_results()
    tasks = _collect_tasks(gt, results)
    if not tasks:
        print("No new samples to evaluate.")
        return

    print(f"Found {len(tasks)} new samples to evaluate.")

    with multiprocessing.Pool(processes=NUM_PROCESSES) as pool:
        with tqdm(total=len(tasks), desc="Evaluating samples") as pbar:
            for video_id, result_data in pool.imap_unordered(process_sample, tasks):
                if result_data:
                    results[video_id] = result_data
                    _write_checkpoint(results)  # crash-safe incremental save
                pbar.update(1)

    print("\nEvaluation complete.")
    if os.path.exists(OUTPUT_JSON):
        with open(OUTPUT_JSON, 'r', encoding='utf-8') as f:
            final_output = json.load(f)
        print(f"Overall visual avg: {final_output.get('overall_visual_average', 0):.2f}, "
              f"audio avg: {final_output.get('overall_audio_average', 0):.2f}, "
              f"details avg: {final_output.get('overall_details_average', 0):.2f}, "
              f"overall%: {final_output.get('overall_average_percent', 0):.2f}")

# Script entry point; the guard keeps multiprocessing workers (which re-import
# this module) from re-running the evaluation.
if __name__ == '__main__':
    run_evaluation()